Merge "Update docs/hooks.txt for ShowSearchHitTitle"
[mediawiki.git] / includes / tidy / Balancer.php
blob95cbe09fe2973c0ec7b3b69e087eba9a4e675779
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
26 namespace MediaWiki\Tidy;
28 use Wikimedia\Assert\Assert;
29 use Wikimedia\Assert\ParameterAssertionException;
30 use \ExplodeIterator;
31 use \IteratorAggregate;
32 use \ReverseArrayIterator;
33 use \Sanitizer;
35 // A note for future librarization[1] -- this file is a good candidate
36 // for splitting into an independent library, except that it is currently
37 // highly optimized for MediaWiki use. It only implements the portions
38 // of the HTML5 tree builder used by tags supported by MediaWiki, and
39 // does not contain a true tokenizer pass, instead relying on
40 // comment stripping, attribute normalization, and escaping done by
41 // the MediaWiki Sanitizer. It also deliberately avoids building
42 // a true DOM in memory, instead serializing elements to an output string
43 // as soon as possible (usually as soon as the tag is closed) to reduce
44 // its memory footprint.
46 // We've been gradually lifting some of these restrictions to handle
47 // non-sanitized output generated by extensions, but we shortcut the tokenizer
48 // for speed (primarily by splitting on `<`) and so rely on syntactic
49 // well-formedness.
51 // On the other hand, I've been pretty careful to note with comments in the
52 // code the places where this implementation omits features of the spec or
53 // depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
54 // implement the missing pieces and make this a standalone PHP HTML5 parser.
55 // In order to do so, some sort of MediaWiki-specific API will need
56 // to be added to (a) allow the Balancer to bypass the tokenizer,
57 // and (b) support on-the-fly flattening instead of DOM node creation.
59 // [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
/**
 * Static lookup tables used by the HTML5 tree building algorithm.
 *
 * Each set is an associative array keyed first by namespace URI and then
 * by lower-cased tag name; the values are always `true`, so membership is
 * tested with isset().
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true, 'head' => true, 'body' => true,
			'frameset' => true, 'frame' => true, 'plaintext' => true,
			'isindex' => true, 'xmp' => true, 'iframe' => true,
			'noembed' => true, 'noscript' => true, 'script' => true,
			'title' => true,
		],
	];

	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true,
			'bgsound' => true, 'br' => true, 'col' => true,
			'command' => true, 'embed' => true, 'frame' => true,
			'hr' => true, 'img' => true, 'input' => true,
			'keygen' => true, 'link' => true, 'meta' => true,
			'param' => true, 'source' => true, 'track' => true,
			'wbr' => true,
		],
	];

	public static $extraLinefeedSet = [
		self::HTML_NAMESPACE => [
			'pre' => true,
			'textarea' => true,
			'listing' => true,
		],
	];

	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true,
			'h5' => true, 'h6' => true,
		],
	];

	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true,
			'article' => true, 'aside' => true, 'base' => true,
			'basefont' => true, 'bgsound' => true, 'blockquote' => true,
			'body' => true, 'br' => true, 'button' => true,
			'caption' => true, 'center' => true, 'col' => true,
			'colgroup' => true, 'dd' => true, 'details' => true,
			'dir' => true, 'div' => true, 'dl' => true,
			'dt' => true, 'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true, 'footer' => true,
			'form' => true, 'frame' => true, 'frameset' => true,
			'h1' => true, 'h2' => true, 'h3' => true,
			'h4' => true, 'h5' => true, 'h6' => true,
			'head' => true, 'header' => true, 'hgroup' => true,
			'hr' => true, 'html' => true, 'iframe' => true,
			'img' => true, 'input' => true, 'isindex' => true,
			'li' => true, 'link' => true, 'listing' => true,
			'main' => true, 'marquee' => true, 'menu' => true,
			'menuitem' => true, 'meta' => true, 'nav' => true,
			'noembed' => true, 'noframes' => true, 'noscript' => true,
			'object' => true, 'ol' => true, 'p' => true,
			'param' => true, 'plaintext' => true, 'pre' => true,
			'script' => true, 'section' => true, 'select' => true,
			'source' => true, 'style' => true, 'summary' => true,
			'table' => true, 'tbody' => true, 'td' => true,
			'template' => true, 'textarea' => true, 'tfoot' => true,
			'th' => true, 'thead' => true, 'title' => true,
			'tr' => true, 'track' => true, 'ul' => true,
			'wbr' => true, 'xmp' => true,
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true,
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true,
		],
	];

	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true,
			'div' => true,
			'p' => true,
		],
	];

	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true, 'thead' => true, 'tbody' => true,
			'tfoot' => true, 'tr' => true,
		],
	];

	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true,
			'optgroup' => true, 'option' => true, 'p' => true,
			'rb' => true, 'rp' => true, 'rt' => true,
			'rtc' => true,
		],
	];

	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true, 'dd' => true,
			'dt' => true, 'li' => true, 'optgroup' => true,
			'option' => true, 'p' => true, 'rb' => true,
			'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true,
			'th' => true, 'thead' => true, 'tr' => true,
		],
	];

	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true,
			'th' => true,
		],
	];
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'template' => true,
			'html' => true,
		],
	];

	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true, 'tfoot' => true, 'thead' => true,
			'template' => true, 'html' => true,
		],
	];

	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true,
			'template' => true,
			'html' => true,
		],
	];

	// Form-associated elements; see
	// https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
	public static $formAssociatedSet = [
		self::HTML_NAMESPACE => [
			'button' => true, 'fieldset' => true, 'input' => true,
			'keygen' => true, 'object' => true, 'output' => true,
			'select' => true, 'textarea' => true, 'img' => true,
		],
	];

	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true,
			'marquee' => true, 'object' => true, 'table' => true,
			'td' => true, 'template' => true, 'th' => true,
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true,
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true,
		],
	];

	/** Cache for inListItemScopeSet(); null until first requested. */
	private static $inListItemScopeSet = null;

	/**
	 * Return the list-item scope set: $inScopeSet plus HTML 'ol' and 'ul'.
	 * The derived set is built once and cached.
	 *
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		$derived = self::$inScopeSet;
		$derived[self::HTML_NAMESPACE]['ol'] = true;
		$derived[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $derived;
		return self::$inListItemScopeSet;
	}

	/** Cache for inButtonScopeSet(); null until first requested. */
	private static $inButtonScopeSet = null;

	/**
	 * Return the button scope set: $inScopeSet plus HTML 'button'.
	 * The derived set is built once and cached.
	 *
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		$derived = self::$inScopeSet;
		$derived[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $derived;
		return self::$inButtonScopeSet;
	}

	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'table' => true,
			'template' => true,
		],
	];

	public static $inInvertedSelectScopeSet = [
		self::HTML_NAMESPACE => [
			'option' => true,
			'optgroup' => true,
		],
	];

	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true,
		],
	];

	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true,
		],
	];

	// The remaining sets exist for tidy compatibility.
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true,
			'blockquote' => true,
			// Parsing uses <body> as the fragment context, yet the
			// element at the bottom of the stack is really <html>.
			// Listing <html> here is simpler than consulting the
			// "adjusted current node" everywhere.
			'html' => true,
		],
	];
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true,
			'applet' => true, 'b' => true, 'basefont' => true,
			'bdo' => true, 'big' => true, 'br' => true,
			'button' => true, 'cite' => true, 'code' => true,
			'dfn' => true, 'em' => true, 'font' => true,
			'i' => true, 'iframe' => true, 'img' => true,
			'input' => true, 'kbd' => true, 'label' => true,
			'legend' => true, 'map' => true, 'object' => true,
			'param' => true, 'q' => true, 'rb' => true,
			'rbc' => true, 'rp' => true, 'rt' => true,
			'rtc' => true, 'ruby' => true, 's' => true,
			'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true,
			'sub' => true, 'sup' => true, 'textarea' => true,
			'tt' => true, 'u' => true, 'var' => true,
		],
	];
}
/**
 * A BalanceElement is a simplified version of a DOM Node. The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements. As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Initialised to the empty string by the constructor. Declared
	 * explicitly so that the constructor does not create a dynamic
	 * property. Nothing in this class reads it.
	 * @var string $contents
	 */
	public $contents;

	/**
	 * Parent of this element, or the string "flat" if this element has
	 * already been flattened into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element. Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var BalanceElement[]|string[] $children
	 */
	public $children;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes. The new
	 * element has no parent and no children.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->contents = '';
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element and clear its parent link.
	 * @param BalanceElement $elt
	 */
	private function removeChild( BalanceElement $elt ) {
		// The message is a constant on purpose: interpolating $this here
		// would serialize the whole subtree via __toString() on every
		// call, even when the precondition holds.
		Assert::precondition(
			$this->parent !== 'flat', "Can't removeChild after flattening."
		);
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it.
	 * If $b is an element that already has a parent, it is first
	 * detached from that parent.
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( BalanceElement $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( is_string( $b ) ) {
			array_splice( $this->children, $idx, 0, [ $b ] );
		} else {
			Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
			if ( $b->parent !== null ) {
				$b->parent->removeChild( $b );
			}
			array_splice( $this->children, $idx, 0, [ $b ] );
			$b->parent = $this;
		}
	}

	/**
	 * Append $elt to the end of the list of children.
	 * If $elt is an element that already has a parent, it is first
	 * detached from that parent.
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this, leaving $elt
	 * with no children.
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( BalanceElement $elt ) {
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations: with the parent link cleared,
				// appendChild() does not call removeChild().
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string. Afterwards $this->parent is the
	 * string 'flat'.
	 *
	 * When $config['tidyCompat'] is set, element children are flattened
	 * first, an 'mw:p-wrap' element is renamed to 'p', and a p-wrap whose
	 * children contain only whitespace flattens to the empty string.
	 * Other whitespace-only elements are still serialized; an
	 * attribute-less 'tr' or 'li' among them gets class "mw-empty-elt".
	 *
	 * @param array $config Balancer configuration; see Balancer::__construct().
	 * @return string
	 *
	 * @see __toString()
	 */
	public function flatten( array $config ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		$tidyCompat = $config['tidyCompat'];
		if ( $tidyCompat ) {
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					$elt = $elt->flatten( $config );
				}
				// Any character other than tab, LF, FF, CR or space
				// makes this element non-blank.
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				// Only blank p-wrap elements are dropped entirely.
				$blank = false;
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; // for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.
	 *
	 * Attribute values are encoded with Sanitizer::encodeAttribute();
	 * attribute names are emitted as-is.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// Offset of the first character after the start tag.
			$len = strlen( $out );
			// flatten children
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
			if (
				$this->isA( BalanceSets::$extraLinefeedSet ) &&
				$out[$len] === "\n"
			) {
				// Double the linefeed after pre/listing/textarea
				// according to the HTML5 fragment serialization algorithm.
				$out = substr( $out, 0, $len + 1 ) .
					substr( $out, $len );
			}
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	// Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			// assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification.
	 *
	 * That is either a member of BalanceSets::$htmlIntegrationPointSet, or
	 * a MathML 'annotation-xml' element whose 'encoding' attribute equals
	 * 'text/html' or 'application/xhtml+xml', compared case-insensitively.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] ) &&
			( strcasecmp( $this->attribs['encoding'], 'text/html' ) == 0 ||
			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) == 0 )
		) {
			return true;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm. The key is built
	 * from the namespace, the local name and the attributes sorted by
	 * name, and is cached after the first call.
	 *
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}
640 * The "stack of open elements" as defined in the HTML5 tree builder
641 * spec. This contains methods to ensure that content (start tags, text)
642 * are inserted at the correct place in the output string, and to
643 * flatten BalanceElements as they are closed to avoid holding onto
644 * a complete DOM tree for the document in memory.
646 * The stack defines a PHP iterator to traverse it in "reverse order",
647 * that is, the most-recently-added element is visited first in a
648 * foreach loop.
650 * @ingroup Parser
651 * @since 1.27
652 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
654 class BalanceStack implements IteratorAggregate {
656 * Backing storage for the stack.
657 * @var BalanceElement[] $elements
659 private $elements = [];
661 * Foster parent mode determines how nodes are inserted into the
662 * stack.
663 * @var bool $fosterParentMode
664 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
666 public $fosterParentMode = false;
668 * Configuration options governing flattening.
669 * @var array $config
670 * @see Balancer::__construct()
672 private $config;
674 * Reference to the current element
676 public $currentNode;
/**
 * Create a new BalanceStack whose only entry is a BalanceElement
 * representing the root &lt;html&gt; node; that element is also the
 * initial current node.
 * @param array $config Balancer configuration; see Balancer::__construct().
 */
public function __construct( array $config ) {
	$this->config = $config;
	// The bottom of the stack is always a root <html> element.
	$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
	$this->elements[] = $root;
	$this->currentNode = $this->elements[0];
}
/**
 * Return the output of the tree builder as a string: the
 * concatenation of all children of the root &lt;html&gt; node, without
 * the enclosing html tags. Children that are still elements are
 * flattened in the process.
 * @return string
 */
public function getOutput() {
	$pieces = [];
	foreach ( $this->elements[0]->children as $child ) {
		if ( is_string( $child ) ) {
			$pieces[] = $child;
		} else {
			$pieces[] = $child->flatten( $this->config );
		}
	}
	return implode( '', $pieces );
}
/**
 * Insert a comment at the appropriate place for inserting a node.
 * The comment is handled as text wrapped in comment delimiters and
 * flagged so that it does not trigger tidy p-wrapping.
 * @param string $value Content of the comment.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
 */
public function insertComment( $value ) {
	$comment = '<!--' . $value . '-->';
	return $this->insertText( $comment, true );
}
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * In foster parent mode with a table/section/row current node the text
 * is foster-parented. Otherwise, in tidy compatibility mode, text that
 * is not a comment and would land directly inside a member of the
 * p-wrap set is first wrapped in a new 'mw:p-wrap' element. In all
 * other cases the text is appended to the current node.
 *
 * @param string $value
 * @param bool $isComment
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value, $isComment = false ) {
	$target = $this->currentNode;

	if ( $this->fosterParentMode && $target->isA( BalanceSets::$tableSectionRowSet ) ) {
		$this->fosterParent( $value );
		return;
	}

	$needsWrapper = $this->config['tidyCompat'] && !$isComment &&
		$target->isA( BalanceSets::$tidyPWrapSet );
	if ( !$needsWrapper ) {
		$target->appendChild( $value );
		return;
	}

	// Open the wrapper, then retry; the wrapper is now the current node.
	$this->insertHTMLElement( 'mw:p-wrap', [] );
	return $this->insertText( $value );
}
/**
 * Create a BalanceElement in the given namespace, insert it at the
 * appropriate place, and push it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element; passed unchanged
 *   to the BalanceElement constructor, which requires an array.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$elt = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $elt );
}
/**
 * Create an element in the HTML namespace, insert it at the
 * appropriate place, and push it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element; forwarded to
 *   insertForeignElement().
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$htmlNs = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $htmlNs, $tag, $attribs );
}
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack, making it the current node.
 *
 * If the current node is a tidy 'mw:p-wrap' element and $elt is not in
 * the tidy inline set, the wrapper is popped first. In foster parent
 * mode with a table/section/row current node the element is
 * foster-parented; otherwise it is appended to the current node.
 *
 * @param BalanceElement $elt
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
	if (
		$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// The messages are constants on purpose: interpolating $elt would
	// serialize the element via __toString() on every insertion, even
	// when the invariants hold.
	Assert::invariant( $elt->parent !== null, '$elt must be in tree' );
	Assert::invariant(
		$elt->parent !== 'flat',
		'$elt must not have been previously flattened'
	);
	array_push( $this->elements, $elt );
	$this->currentNode = $elt;
	return $elt;
}
/**
 * Determine if the stack has $tag in scope, using
 * BalanceSets::$inScopeSet as the set of scope boundaries.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundaries = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundaries );
}
/**
 * Determine if the stack has $tag in button scope, using
 * BalanceSets::inButtonScopeSet() as the set of scope boundaries.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundaries = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundaries );
}
/**
 * Determine if the stack has $tag in list item scope, using
 * BalanceSets::inListItemScopeSet() as the set of scope boundaries.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundaries = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundaries );
}
/**
 * Determine if the stack has $tag in table scope, using
 * BalanceSets::$inTableScopeSet as the set of scope boundaries.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundaries = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundaries );
}
/**
 * Determine if the stack has $tag in select scope.
 *
 * Select scope is defined by an *inverted* set: every element that is
 * NOT an option/optgroup acts as a scope boundary, so this cannot be
 * expressed through inSpecificScope().
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
 */
public function inSelectScope( $tag ) {
	$found = false;
	// Walk from the current node towards the root.
	foreach ( $this as $node ) {
		if ( $node->isA( $tag ) ) {
			$found = true;
			break;
		}
		$transparent = $node->isA( BalanceSets::$inInvertedSelectScopeSet );
		if ( !$transparent ) {
			break;
		}
	}
	return $found;
}
/**
 * Determine if the stack has $tag in a specific scope, $set.
 *
 * The stack is walked from the current node towards the root; the
 * search succeeds at the first element matching $tag and fails at the
 * first element matching $set, or when the stack is exhausted.
 *
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	$found = false;
	foreach ( $this as $node ) {
		if ( $node->isA( $tag ) ) {
			$found = true;
			break;
		}
		if ( $node->isA( $set ) ) {
			// Hit a scope boundary before finding $tag.
			break;
		}
	}
	return $found;
}
/**
 * Generate implied end tags: pop the current node for as long as it
 * belongs to the implied-end-tag set and is not the excluded tag.
 * @param string|null $butnot HTML tag name at which to stop popping,
 *   or null for no exclusion.
 * @param bool $thorough True if we should generate end tags thoroughly.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	if ( $thorough ) {
		$impliable = BalanceSets::$thoroughImpliedEndTagsSet;
	} else {
		$impliable = BalanceSets::$impliedEndTagsSet;
	}
	while ( $this->currentNode ) {
		$excluded = $butnot !== null &&
			$this->currentNode->isHtmlNamed( $butnot );
		if ( $excluded || !$this->currentNode->isA( $impliable ) ) {
			return;
		}
		$this->pop();
	}
}
/**
 * Return the adjusted current node: $fragmentContext when it is truthy
 * and the stack holds exactly one element, otherwise the current node.
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && count( $this->elements ) === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
/**
 * Return an iterator over this stack which visits the current node
 * first, and the root node last.
 * @return \Iterator
 */
public function getIterator() {
	$topDown = new ReverseArrayIterator( $this->elements );
	return $topDown;
}
/**
 * Return the BalanceElement stored at stack position $idx; position 0
 * is the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$entry = $this->elements[ $idx ];
	return $entry;
}
/**
 * Replace the element at position $idx in the BalanceStack with $elt.
 * If $idx is the top of the stack, $elt also becomes the current node.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
	$previous = $this->elements[$idx];
	Assert::precondition(
		$previous->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$top = count( $this->elements ) - 1;
	if ( $idx === $top ) {
		$this->currentNode = $elt;
	}
}
/**
 * Return the position of the topmost stack entry matching the given
 * BalanceElement, set, or HTML tag name string, or -1 if nothing
 * on the stack matches.
 * @param BalanceElement|array|string $tag
 * @return int
 */
public function indexOf( $tag ) {
	// Scan from the top of the stack down to the root.
	$pos = count( $this->elements );
	while ( $pos-- > 0 ) {
		if ( $this->elements[$pos]->isA( $tag ) ) {
			return $pos;
		}
	}
	return -1;
}
/**
 * Return how many elements are currently on the BalanceStack.
 * @return int
 */
public function length() {
	$size = count( $this->elements );
	return $size;
}
/**
 * Remove the current node from the BalanceStack and update the
 * current node (null once the stack is empty). The removed element
 * is flattened unless it is an 'mw:p-wrap' element.
 */
public function pop() {
	$popped = array_pop( $this->elements );
	$remaining = count( $this->elements );
	$this->currentNode = $remaining
		? $this->elements[ $remaining - 1 ]
		: null;
	if ( $popped->isHtmlNamed( 'mw:p-wrap' ) ) {
		return;
	}
	$popped->flatten( $this->config );
}
/**
 * Pop every node from the top of the BalanceStack down to and
 * including position $idx, via pop().
 * @param int $idx
 */
public function popTo( $idx ) {
	$excess = count( $this->elements ) - $idx;
	while ( $excess-- > 0 ) {
		$this->pop();
	}
}
/**
 * Pop elements off the stack up to and including the first
 * element with the specified HTML tagname (or matching the given
 * set). If nothing matches, the whole stack is popped.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
	while ( $this->currentNode ) {
		// Test before popping: pop() replaces the current node.
		$matched = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $matched ) {
			break;
		}
	}
}
/**
 * Pop elements off the stack until the current node is in the
 * specified set; that element itself is *not* popped.
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
	// The bottom entry (the root <html> element) is never popped, so
	// at most count - 1 elements can be removed.
	$poppable = count( $this->elements ) - 1;
	while ( $poppable > 0 && !$this->currentNode->isA( $set ) ) {
		$this->pop();
		$poppable--;
	}
}
/**
 * Remove the given $elt from the BalanceStack, optionally
 * flattening it in the process. If $elt was the top of the stack, the
 * current node becomes the new top, or null when the stack is now
 * empty (as in pop()).
 * @param BalanceElement $elt The element to remove.
 * @param bool $flatten Whether to flatten the removed element.
 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	Assert::parameter(
		$elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);
	$idx = array_search( $elt, $this->elements, true );
	Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $idx, 1 );
	if ( $idx === count( $this->elements ) ) {
		// $elt was the top of the stack. Guard the empty-stack case so
		// that offset -1 is never read.
		$this->currentNode = $idx > 0 ? $this->elements[$idx - 1] : null;
	}
	if ( $flatten ) {
		// serialize $elt into its parent
		// otherwise, it will eventually serialize when the parent
		// is serialized, we just hold onto the memory for its
		// tree of objects a little longer.
		$elt->flatten( $this->config );
	}
	Assert::postcondition(
		array_search( $elt, $this->elements, true ) === false,
		'$elt should no longer be in open elements stack'
	);
}
/**
 * Find $a in the BalanceStack and insert $b after it.
 *
 * If $a is the top of the stack, $b becomes the new current node.
 *
 * @param BalanceElement $a Element that must already be on the stack.
 * @param BalanceElement $b Element to insert immediately after $a.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	$idx = $this->indexOf( $a );
	// indexOf() reports a miss as -1 (never false), so test the sign;
	// otherwise a missing $a would silently splice $b in at position 0.
	Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		array_push( $this->elements, $b );
		$this->currentNode = $b;
	} else {
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1080 // Fostering and adoption.
/**
 * Foster parent the given $elt in the stack of open elements.
 *
 * The foster parent is the last `template` on the stack when it sits
 * above the last `table` (or there is no table); otherwise it is the
 * parent of the last `table`, with $elt inserted just before that
 * table; otherwise it is the bottom-most stack entry.
 *
 * @param BalanceElement|string $elt
 * @return BalanceElement|string The node that was fostered, or an
 *   existing `mw:p-wrap` sibling that is re-used in its place.
 *
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
private function fosterParent( $elt ) {
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );
	$before = null;

	$templateWins = $templateIdx >= 0 &&
		( $tableIdx < 0 || $templateIdx > $tableIdx );
	if ( $templateWins ) {
		$parent = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		$parent = $this->elements[$tableIdx]->parent;
		// Assume all tables have parents, since we're not running scripts!
		Assert::invariant(
			$parent !== null, "All tables should have parents"
		);
		$before = $this->elements[$tableIdx];
	} else {
		$parent = $this->elements[0]; // the `html` element.
	}

	if ( $this->config['tidyCompat'] ) {
		if ( is_string( $elt ) ) {
			// Fostered text may need a p-wrapper around it.
			if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) {
				$this->insertHTMLElement( 'mw:p-wrap', [] );
				$this->insertText( $elt );
				return $elt;
			}
		} elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
			// Fostered p-wrapper: merge with a p-wrapper that already
			// sits right before the insertion point, if there is one.
			if ( $before ) {
				$idx = array_search( $before, $parent->children, true );
			} else {
				$idx = count( $parent->children );
			}
			$after = $idx > 0 ? $parent->children[$idx - 1] : '';
			if (
				$after instanceof BalanceElement &&
				$after->isHtmlNamed( 'mw:p-wrap' )
			) {
				return $after; // Re-use existing p-wrapper.
			}
		}
	}

	if ( $before ) {
		$parent->insertBefore( $before, $elt );
	} else {
		$parent->appendChild( $elt );
	}
	return $elt;
}
/**
 * Run the "adoption agency algorithm" (AAA) for the given subject
 * tag name.
 * @param string $tag The subject tag name.
 * @param BalanceActiveFormattingElements $afe The current
 *   active formatting elements list.
 * @return bool True if the adoption agency algorithm "did something",
 *   false if more processing is required by the caller.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
 */
public function adoptionAgency( $tag, $afe ) {
	// Fast path: the current node is an HTML element named after the
	// subject and is not an active formatting element. Just pop it.
	$current = $this->currentNode;
	if ( $current->isHtmlNamed( $tag ) && !$afe->isInList( $current ) ) {
		$this->pop();
		return true; // no more handling required
	}

	// Outer loop: the spec caps this at eight iterations.
	for ( $pass = 0; $pass < 8; $pass++ ) {
		// The formatting element is the last entry with the subject's
		// tag name found between the end of the active formatting
		// elements list and its last marker (or the list start).
		$formatting = $afe->findElementByTag( $tag );

		// No such entry: let the caller fall back to the
		// "any other end tag" handling.
		if ( !$formatting ) {
			return false; // false means handle by the default case
		}

		// Listed but no longer on the stack of open elements: parse
		// error; drop it from the list and stop.
		$fmtIndex = $this->indexOf( $formatting );
		if ( $fmtIndex < 0 ) {
			$afe->remove( $formatting );
			return true; // true means no more handling required
		}

		// On the stack but not in scope: parse error; ignore the token.
		if ( !$this->inScope( $formatting ) ) {
			return true;
		}

		// The furthest block is the topmost node that is lower in the
		// stack than the formatting element and belongs to the special
		// category. There might not be one.
		$block = null;
		$blockIndex = -1;
		$depth = $this->length();
		for ( $i = $fmtIndex + 1; $i < $depth; $i++ ) {
			$candidate = $this->node( $i );
			if ( $candidate->isA( BalanceSets::$specialSet ) ) {
				$block = $candidate;
				$blockIndex = $i;
				break;
			}
		}

		// Without a furthest block, pop everything from the current
		// node up to and including the formatting element, remove the
		// formatting element from the list, and finish.
		if ( !$block ) {
			$this->popTag( $formatting );
			$afe->remove( $formatting );
			return true;
		}

		// The common ancestor is the element immediately above the
		// formatting element in the stack of open elements.
		$commonAncestor = $this->node( $fmtIndex - 1 );

		// The bookmark remembers the formatting element's position in
		// the list of active formatting elements.
		$bookmark = new BalanceElement( '[bookmark]', '[bookmark]', [] );
		$afe->insertAfter( $formatting, $bookmark );

		// Node and last node both start out as the furthest block.
		$cursor = $block;
		$last = $block;
		$cursorIndex = $blockIndex;

		// Inner loop
		for ( $inner = 1; ; $inner++ ) {
			// Step to the element immediately above the previous node
			// in the stack. Entries removed below shift later entries
			// down, so decrementing the index keeps us on the element
			// that used to sit above.
			$cursorIndex--;
			$cursor = $this->node( $cursorIndex );

			// Reached the formatting element: inner loop is done.
			if ( $cursor === $formatting ) {
				break;
			}

			// After more than three inner iterations, nodes are also
			// evicted from the list of active formatting elements.
			$listed = $afe->isInList( $cursor );
			if ( $inner > 3 && $listed ) {
				$afe->remove( $cursor );
				$listed = false;
			}

			if ( !$listed ) {
				// Not an active formatting element: take it off the
				// stack of open elements and continue upward. Don't
				// flatten here, since we're about to relocate parts
				// of this node.
				$this->removeElement( $cursor, false );
			} else {
				// Create a fresh element for the token the node was
				// created from, and swap it in for the node both in
				// the list of active formatting elements and in the
				// stack of open elements.
				$clone = new BalanceElement(
					$cursor->namespaceURI, $cursor->localName, $cursor->attribs );
				$afe->replace( $cursor, $clone );
				$this->replaceAt( $cursorIndex, $clone );
				$cursor = $clone;

				// If last node is still the furthest block, move the
				// bookmark to sit immediately after the new node.
				if ( $last === $block ) {
					$afe->remove( $bookmark );
					$afe->insertAfter( $clone, $bookmark );
				}

				// Re-parent last node under node, then let last node
				// be node.
				$cursor->appendChild( $last );
				$last = $cursor;
			}
		}

		// If the common ancestor is a table, tbody, tfoot, thead, or tr
		// element, foster parent whatever last node ended up being;
		// otherwise append it to the common ancestor.
		if (
			$this->fosterParentMode &&
			$commonAncestor->isA( BalanceSets::$tableSectionRowSet )
		) {
			$this->fosterParent( $last );
		} else {
			$commonAncestor->appendChild( $last );
		}

		// Create an element for the token the formatting element was
		// created from, with the furthest block as intended parent.
		$replacement = new BalanceElement(
			$formatting->namespaceURI, $formatting->localName, $formatting->attribs );

		// Move all children of the furthest block into the new element,
		// then append the new element to the furthest block.
		$replacement->adoptChildren( $block );
		$block->appendChild( $replacement );

		// In the list of active formatting elements: drop the
		// formatting element and put the new element where the
		// bookmark is.
		$afe->remove( $formatting );
		$afe->replace( $bookmark, $replacement );

		// In the stack of open elements: drop the formatting element
		// and insert the new element immediately below the furthest
		// block.
		$this->removeElement( $formatting );
		$this->insertAfter( $block, $replacement );
	}

	return true;
}
/**
 * Return the contents of the open elements stack as a string for
 * debugging: the local names of the elements, bottom of the stack
 * first, separated by single spaces.
 * @return string
 */
public function __toString() {
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// Separator first: the legacy ( array, separator ) argument order is
	// rejected by PHP 8.
	return implode( ' ', $names );
}
/**
 * A pseudo-element used as a marker in the list of active formatting elements
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceMarker {
	/** Next (more recent) entry in the active formatting elements list */
	public $nextAFE = null;

	/** Previous (less recent) entry in the active formatting elements list */
	public $prevAFE = null;
}
/**
 * The list of active formatting elements, which is used to handle
 * mis-nested formatting element tags in the HTML5 tree builder
 * specification.
 *
 * The list is kept as a doubly-linked list threaded through the
 * `prevAFE` / `nextAFE` properties of its entries.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Break all links between list entries so that nothing in the list
	 * keeps anything else alive.
	 */
	public function __destruct() {
		$next = null;
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// Markers carry no nextNoah link; don't create one on them.
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the list and open a fresh Noah table for the
	 * list segment that follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is already in the list
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if the current segment already holds
		// three elements with this Noah key, drop the earliest one.
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$count = 1;
			$head = $tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
				$count++;
			}
			if ( $count >= 3 ) {
				$this->remove( $head );
			}
			$tail->nextNoah = $elt;
		}
		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing <a> "in body mode".
	 * @param string $tag Local name to look for.
	 * @return BalanceElement|null The matching element, or null if none.
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		// Every listed entry except the head has a predecessor.
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing <a> in body mode.
	 *
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is not in the list
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the end of its bucket in the Noah table of the
	 * current (last) list segment, creating the bucket if needed.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its bucket in the Noah table of the current (last)
	 * list segment. Does nothing if $elt is not found there.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		if ( !isset( $table[$key] ) ) {
			// No bucket for this key in the current segment: nothing to
			// unlink, and nothing to dereference.
			return;
		}
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
		} else {
			do {
				$prevNoahElt = $noahElt;
				$noahElt = $prevNoahElt->nextNoah;
				if ( $noahElt === $elt ) {
					// Found it, unlink
					$prevNoahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					break;
				}
			} while ( $noahElt );
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundIt = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundIt = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundIt ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging
	 * @return string One line per list entry, with notes appended when
	 *   the list's links are found to be inconsistent.
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1743 * An implementation of the tree building portion of the HTML5 parsing
1744 * spec.
1746 * This is used to balance and tidy output so that the result can
1747 * always be cleanly serialized/deserialized by an HTML5 parser. It
1748 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1749 * a number of constraints which are not enforced by the HTML5 parsing
1750 * process. But the result will be free of gross errors: misnested or
1751 * unclosed tags, for example, and will be unchanged by spec-compliant
1752 * parsing followed by serialization.
1754 * The tree building stage is structured as a state machine.
1755 * When comparing the implementation to
1756 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1757 * note that each state is implemented as a function with a
1758 * name ending in `Mode` (because the HTML spec refers to them
1759 * as insertion modes). The current insertion mode is held by
1760 * the $parseMode property.
1762 * The following simplifications have been made:
1763 * - We handle body content only (ie, we start `in body`.)
1764 * - The document is never in "quirks mode".
1765 * - All occurrences of < and > have been entity escaped, so we
1766 * can parse tags by simply splitting on those two characters.
1767 * (This also simplifies the handling of < inside <textarea>.)
1768 * The character < must not appear inside comments.
1769 * Similarly, all attributes have been "cleaned" and are double-quoted
1770 * and escaped.
1771 * - All null characters are assumed to have been removed.
1772 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1773 * <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
1774 * <noembed>, <noscript>, <script>, <title>. As a result,
1775 * further simplifications can be made:
1776 * - `frameset-ok` is not tracked.
1777 * - `head element pointer` is not tracked (but presumed non-null)
1778 * - Tokenizer has only a single mode. (<textarea> wants RCDATA and
1779 * <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
1781 * We generally mark places where we omit cases from the spec due to
1782 * disallowed elements with a comment: `// OMITTED: <element-name>`.
1784 * The HTML spec keeps a flag during the parsing process to track
1785 * whether or not a "parse error" has been encountered. We don't
1786 * bother to track that flag, we just implement the error-handling
1787 * process as specified.
1789 * @ingroup Parser
1790 * @since 1.27
1791 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1793 class Balancer {
1794 private $parseMode;
1795 /** @var \Iterator */
1796 private $bitsIterator;
1797 private $allowedHtmlElements;
1798 /** @var BalanceActiveFormattingElements */
1799 private $afe;
1800 /** @var BalanceStack */
1801 private $stack;
1802 private $strict;
1803 private $allowComments;
1804 private $config;
1806 private $textIntegrationMode;
1807 private $pendingTableText;
1808 private $originalInsertionMode;
1809 private $fragmentContext;
1810 private $formElementPointer;
1811 private $ignoreLinefeed;
1812 private $inRCDATA;
1813 private $inRAWTEXT;
1815 /** @var callable|null */
1816 private $processingCallback;
1817 /** @var array */
1818 private $processingArgs;
1821 * Valid HTML5 comments.
1822 * Regex borrowed from Tim Starling's "remex-html" project.
1824 const VALID_COMMENT_REGEX = "~ !--
1825 ( # 1. Comment match detector
1826 > | -> | # Invalid short close
1827 ( # 2. Comment contents
1829 (?! --> )
1830 (?! --!> )
1831 (?! --! \z )
1832 (?! -- \z )
1833 (?! - \z )
1837 ( # 3. Comment close
1838 --> | # Normal close
1839 --!> | # Comment end bang
1840 ( # 4. Indicate matches requiring EOF
1841 --! | # EOF in comment end bang state
1842 -- | # EOF in comment end state
1843 - | # EOF in comment end dash state
1844 # EOF in comment state
1848 ([^<]*) \z # 5. Non-tag text after the comment
1849 ~xs";
/**
 * Create a new Balancer.
 * @param array $config Balancer configuration.  Includes:
 *     'strict' : boolean, defaults to false.
 *         When true, enforces syntactic constraints on input:
 *         all non-tag '<' must be escaped, all attributes must be
 *         separated by a single space and double-quoted.  This is
 *         consistent with the output of the Sanitizer.
 *     'allowedHtmlElements' : array, defaults to null.
 *         When present, the keys of this associative array give
 *         the acceptable HTML tag names.  When not present, no
 *         tag sanitization is done.
 *     'tidyCompat' : boolean, defaults to false.
 *         When true, the serialization algorithm is tweaked to
 *         provide historical compatibility with the old "tidy"
 *         program: <p>-wrapping is done to the children of
 *         <body> and <blockquote> elements, and empty elements
 *         are removed.
 *     'allowComments': boolean, defaults to true.
 *         When true, allows HTML comments in the input.
 *         The Sanitizer generally strips all comments, so if you
 *         are running on sanitized output you can set this to
 *         false to get a bit more performance.
 * @throws ParameterAssertionException if 'allowedHtmlElements' names
 *   an element the balancer does not support.
 */
public function __construct( array $config = [] ) {
	$this->config = $config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
		'allowComments' => true,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->allowComments = $config['allowComments'];
	if ( $this->allowedHtmlElements !== null ) {
		// Sanity check! Only the keys (tag names) matter, so intersect
		// on keys and ignore the values.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the legacy ( array, separator ) argument
			// order is rejected by PHP 8.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
/**
 * Return a balanced HTML string for the HTML fragment given by $text,
 * subject to the caveats listed in the class description.  The result
 * will typically be idempotent -- that is, rebalancing the output
 * would result in no change.
 *
 * The per-run parser state is released when this method exits, whether
 * it returns normally or an exception propagates out of it.
 *
 * @param string $text The markup to be balanced
 * @param callable $processingCallback Callback to do any variable or
 *   parameter replacements in HTML attributes values
 * @param array|bool $processingArgs Arguments for the processing callback
 * @return string The balanced markup
 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack( $this->config );
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	$this->textIntegrationMode =
		$this->ignoreLinefeed =
		$this->inRCDATA =
		$this->inRAWTEXT = false;

	try {
		// The stack is constructed with an <html> element already on it.
		// Set this up as a fragment parsed with <body> as the context.
		$this->fragmentContext =
			new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
		$this->resetInsertionMode();
		$this->formElementPointer = null;
		for ( $e = $this->fragmentContext; $e != null; $e = $e->parent ) {
			if ( $e->isHtmlNamed( 'form' ) ) {
				$this->formElementPointer = $e;
				break;
			}
		}

		// First element is text not tag
		$x = $this->bitsIterator->current();
		$this->bitsIterator->next();
		$this->insertToken( 'text', str_replace( '>', '&gt;', $x ) );
		// Now process each tag.
		while ( $this->bitsIterator->valid() ) {
			$this->advance();
		}
		$this->insertToken( 'eof', null );
		return $this->stack->getOutput();
	} finally {
		// Free memory before leaving, even when a token handler throws
		// (e.g. an assertion failure in strict mode).
		$this->bitsIterator = null;
		$this->afe = null;
		$this->stack = null;
		$this->fragmentContext = null;
		$this->formElementPointer = null;
	}
}
/**
 * Pass a token to the tree builder.  The $token will be one of the
 * strings "tag", "endtag", or "text"; an "eof" token is also sent
 * once the input is exhausted.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name for tag tokens, text content for
 *   text tokens.
 * @param array|null $attribs Attributes, passed through to the handler.
 * @param bool $selfClose Self-closing flag, passed through to the handler.
 * @return bool False when the tag is unsupported and was dropped, true
 *   when an empty text token was skipped; otherwise the result of the
 *   handler the token was dispatched to.
 */
private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
	$isTagToken = ( $token === 'tag' || $token === 'endtag' );
	if ( $isTagToken ) {
		// validate tags against $unsupportedSet
		if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
			// As described in "simplifications" above, these tags are
			// not supported in the balancer.
			Assert::invariant(
				!$this->strict,
				"Unsupported $token <$value> found."
			);
			return false;
		}
	} elseif ( $token === 'text' && $value === '' ) {
		// Don't actually inject the empty string as a text token.
		return true;
	}

	// Support pre/listing/textarea by suppressing initial linefeed.
	// The flag only ever applies to the very next token.
	if ( $this->ignoreLinefeed ) {
		$this->ignoreLinefeed = false;
		if ( $token === 'text' && $value[0] === "\n" ) {
			if ( $value === "\n" ) {
				// Nothing would be left, don't inject the empty string.
				return true;
			}
			$value = substr( $value, 1 );
		}
	}

	// Some hoops we have to jump through
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

	// Decide whether the current insertion mode handles this token, or
	// whether it has to be treated as foreign content.
	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useInsertionMode = true;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		$useInsertionMode = $token === 'text' || (
			$token === 'tag' &&
			$value !== 'mglyph' && $value !== 'malignmark'
		);
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$token === 'tag' && $value === 'svg'
	) {
		$useInsertionMode = true;
	} else {
		$useInsertionMode = $adjusted->isHtmlIntegrationPoint() &&
			( $token === 'tag' || $token === 'text' );
	}

	if ( !$useInsertionMode ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
	}
	// Each insertion mode is a method named by $this->parseMode.
	$handler = $this->parseMode;
	return $this->$handler( $token, $value, $attribs, $selfClose );
}
/**
 * Process a token using the rules for foreign (MathML/SVG) content.
 *
 * @param string $token One of 'text', 'comment', 'tag' or 'endtag'.
 * @param string $value Text, comment contents or tag name.
 * @param array|null $attribs Attributes of a tag token.
 * @param bool $selfClose Whether a start tag was written as self-closing.
 * @return bool|null True when the token was consumed here; otherwise the
 *   result of reprocessing the token under the HTML rules.
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'comment' ) {
		// Comments are valid inside foreign content; without this branch
		// they would match nothing below and be silently dropped.
		$this->stack->insertComment( $value );
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'font':
				// Only <font> carrying a color, face or size attribute is
				// treated as an HTML element that breaks out of foreign
				// content; a plain <font> is "any other start tag".
				if ( !(
					isset( $attribs['color'] )
					|| isset( $attribs['face'] )
					|| isset( $attribs['size'] )
				) ) {
					break;
				}
				// otherwise, fall through
			case 'b':
			case 'big':
			case 'blockquote':
			case 'body':
			case 'br':
			case 'center':
			case 'code':
			case 'dd':
			case 'div':
			case 'dl':
			case 'dt':
			case 'em':
			case 'embed':
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
			case 'head':
			case 'hr':
			case 'i':
			case 'img':
			case 'li':
			case 'listing':
			case 'menu':
			case 'meta':
			case 'nobr':
			case 'ol':
			case 'p':
			case 'pre':
			case 'ruby':
			case 's':
			case 'small':
			case 'span':
			case 'strong':
			case 'strike':
			case 'sub':
			case 'sup':
			case 'table':
			case 'tt':
			case 'u':
			case 'ul':
			case 'var':
				if ( $this->fragmentContext ) {
					// Fragment case: handle as "any other start tag".
					break;
				}
				// Pop foreign elements until we are back at an HTML
				// element or an integration point, then reprocess the
				// token under the HTML rules.
				while ( true ) {
					$this->stack->pop();
					$node = $this->stack->currentNode;
					if (
						$node->isMathmlTextIntegrationPoint() ||
						$node->isHtmlIntegrationPoint() ||
						$node->isHtml()
					) {
						break;
					}
				}
				return $this->insertToken( $token, $value, $attribs, $selfClose );
		}
		// "Any other start tag"
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfClose ) {
			$this->stack->pop();
		}
		return true;
	} elseif ( $token === 'endtag' ) {
		$first = true;
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfClose );
			} elseif ( $i === 0 ) {
				// Reached index 0 without a match: ignore the end tag.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}
}
/**
 * Grab the next "token" from $bitsIterator.  This is either an open/close
 * tag, text, or a comment, depending on whether the Sanitizer approves.
 *
 * Each chunk from the iterator is the text that followed a '<' in the
 * input.  A chunk that is not accepted as a comment or tag is re-emitted
 * as text with the leading '<' restored (escaped, except in RAWTEXT mode).
 */
private function advance() {
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];
	// Handle comments.  These won't be generated by mediawiki (they
	// are stripped in the Sanitizer) but may be generated by extensions.
	if (
		$this->allowComments &&
		!( $this->inRCDATA || $this->inRAWTEXT ) &&
		preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
		// verify EOF condition where necessary
		( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
	) {
		$contents = $regs[2][0];
		$rest = $regs[5][0];
		$this->insertToken( 'comment', $contents );
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
		return;
	}
	// $slash: Does the current element start with a '/'?
	// $t: Current element name
	// $attribStr: String between element name and >
	// $brace: Ending '>' or '/>'
	// $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			// Verify that attributes are all properly double-quoted
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}
	$goodTag = $t;
	if ( $this->inRCDATA ) {
		if ( $slash && $t === $this->inRCDATA ) {
			$this->inRCDATA = false;
		} else {
			// No tags allowed; this emulates the "rcdata" tokenizer mode.
			$goodTag = false;
		}
	}
	if ( $this->inRAWTEXT ) {
		if ( $slash && $t === $this->inRAWTEXT ) {
			$this->inRAWTEXT = false;
		} else {
			// No tags allowed, no entity-escaping done.
			$goodTag = false;
		}
	}
	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		// Combine with the existing verdict rather than overwriting it, so
		// that a tag suppressed by RCDATA/RAWTEXT mode above stays
		// suppressed even if its name is on the allowed list.
		$goodTag = $goodTag && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodTag ) {
		if ( is_callable( $this->processingCallback ) ) {
			// The callback receives $attribStr by reference and may edit it.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodTag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodTag ) {
		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		if ( $sanitize ) {
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		}
		// The tree builder may still reject the tag (e.g. unsupported).
		$goodTag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}
	if ( $goodTag ) {
		// Escape stray '>' characters in the trailing text (once is enough).
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} elseif ( $this->inRAWTEXT ) {
		// Raw text is passed through verbatim, restoring the '<'.
		$this->insertToken( 'text', "<$x" );
	} else {
		// bad tag; serialize entire thing as text.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
/**
 * Select a new insertion mode.
 *
 * @param string $mode Name of the handler method; must end in "Mode".
 * @return string The insertion mode that was active before the switch.
 */
private function switchMode( $mode ) {
	$hasModeSuffix = ( substr( $mode, -4 ) === 'Mode' );
	Assert::parameter( $hasModeSuffix, '$mode', 'should end in Mode' );

	$previous = $this->parseMode;
	$this->parseMode = $mode;
	return $previous;
}
/**
 * Change the insertion mode, then hand the same token back to the tree
 * builder so that it is handled under the new mode.
 *
 * @param string $mode Name of the new insertion mode.
 * @param string $token Token type.
 * @param string|null $value Token value.
 * @param array|null $attribs Attributes of a tag token.
 * @param bool $selfClose Whether a start tag was written as self-closing.
 * @return bool|null Result of reprocessing the token.
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
	$this->switchMode( $mode );
	$result = $this->insertToken( $token, $value, $attribs, $selfClose );
	return $result;
}
/**
 * Choose the insertion mode appropriate for the current stack of open
 * elements.
 *
 * The stack is walked in iteration order; the first element that implies
 * a mode wins.  The element at index 0 is replaced by the fragment
 * context (if any), and if nothing else matched by then the mode becomes
 * inBodyMode.
 */
private function resetInsertionMode() {
	// Elements whose local name alone determines the insertion mode.
	// OMITTED: <frameset>, <html>, <head>
	static $modeByName = [
		'tr' => 'inRowMode',
		'tbody' => 'inTableBodyMode',
		'tfoot' => 'inTableBodyMode',
		'thead' => 'inTableBodyMode',
		'caption' => 'inCaptionMode',
		'colgroup' => 'inColumnGroupMode',
		'table' => 'inTableMode',
		'body' => 'inBodyMode',
	];

	$atBottom = false;
	foreach ( $this->stack as $pos => $element ) {
		if ( $pos === 0 ) {
			$atBottom = true;
			if ( $this->fragmentContext ) {
				$element = $this->fragmentContext;
			}
		}

		if ( $element->isHtml() ) {
			$name = $element->localName;
			if ( $name === 'select' ) {
				// Inspect ancestors: a <table> seen before any <template>
				// selects the "in select in table" variant.
				$mode = 'inSelectMode';
				$depth = $this->stack->length();
				for ( $k = $pos + 1; $k < $depth - 1; $k++ ) {
					$ancestor = $this->stack->node( $depth - $k - 1 );
					if ( $ancestor->isHtmlNamed( 'template' ) ) {
						break;
					}
					if ( $ancestor->isHtmlNamed( 'table' ) ) {
						$mode = 'inSelectInTableMode';
						break;
					}
				}
				$this->switchMode( $mode );
				return;
			} elseif ( $name === 'template' ) {
				// Use the most recently pushed template insertion mode.
				$this->switchMode(
					array_slice( $this->templateInsertionModes, -1 )[0]
				);
				return;
			} elseif ( isset( $modeByName[$name] ) ) {
				$this->switchMode( $modeByName[$name] );
				return;
			} elseif ( !$atBottom && $element->isA( BalanceSets::$tableCellSet ) ) {
				$this->switchMode( 'inCellMode' );
				return;
			}
		}

		if ( $atBottom ) {
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
/**
 * Finish parsing.
 *
 * Of the spec's "stop parsing" steps only step 2 applies here: popping
 * the nodes off the stack of open elements.  The stack is popped down to
 * a single remaining entry (the top-most <html> element).
 */
private function stopParsing() {
	// Drop the active formatting elements list first; otherwise the
	// element objects would stay live during serialization, potentially
	// using O(N^2) memory.  Popping the stack never reconstructs the
	// active formatting elements, so this is safe.
	$this->afe = null;

	$keep = 1;
	$this->stack->popTo( $keep );
}
/**
 * Open a raw-text element and switch to text mode.
 *
 * The element is inserted, its name is recorded in $inRAWTEXT, and the
 * insertion mode in effect before the switch is remembered in
 * $originalInsertionMode.
 *
 * @param string $value Tag name of the raw-text element.
 * @param array|null $attribs Attributes of the start tag.
 * @return bool Always true.
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	$this->inRAWTEXT = $value;

	$modeBefore = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $modeBefore;
	return true;
}
/**
 * Handler for the "text" insertion mode (contents of a raw-text element).
 *
 * @param string $token Token type.
 * @param string|null $value Token value.
 * @param array|null $attribs Attributes of a tag token.
 * @param bool $selfClose Whether a start tag was written as self-closing.
 * @return bool|null True, or the result of reprocessing an 'eof' token
 *   under the original insertion mode.
 */
private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}

	if ( $token === 'eof' ) {
		// Close the open element, then let the previous mode see the EOF.
		$this->stack->pop();
		return $this->switchModeAndReprocess(
			$this->originalInsertionMode, $token, $value, $attribs, $selfClose
		);
	}

	if ( $token === 'endtag' ) {
		// Any end tag closes the element and restores the previous mode.
		$this->stack->pop();
		$this->switchMode( $this->originalInsertionMode );
	}

	// Every other token is consumed without effect.
	return true;
}
/**
 * Handler for the "in head" insertion mode.
 *
 * Tokens not handled here are reprocessed through insertToken() after a
 * synthetic </head> end tag.
 *
 * @param string $token Token type.
 * @param string|null $value Token value.
 * @param array|null $attribs Attributes of a tag token.
 * @param bool $selfClose Whether a start tag was written as self-closing.
 * @return bool|null True when handled here, otherwise the result of
 *   reprocessing the token.
 */
private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'text' ) {
		// Leading whitespace is inserted here; anything after it is
		// handled at the bottom of the function.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
			$leading = $matches[0];
			$this->stack->insertText( $leading );
			$value = substr( $value, strlen( $leading ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			// OMITTED: <html>
			case 'meta':
				// OMITTED: in a full HTML parser, this might change the encoding.
				// falls through
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
				// Void elements: insert and immediately close.
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			// OMITTED: <title>
			// OMITTED: <noscript>
			case 'noframes':
			case 'style':
				return $this->parseRawText( $value, $attribs );

			// OMITTED: <script>
			case 'template':
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->afe->insertMarker();
				// OMITTED: frameset_ok
				$this->switchMode( 'inTemplateMode' );
				$this->templateInsertionModes[] = $this->parseMode;
				return true;
			// OMITTED: <head>
		}
	} elseif ( $token === 'endtag' ) {
		// OMITTED: </head>, </body>, </html>
		if ( $value === 'template' ) {
			if ( $this->stack->indexOf( $value ) < 0 ) {
				return true; // Ignore the token.
			}
			$this->stack->generateImpliedEndTags( null, true /* thorough */ );
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			return true;
		} elseif ( $value !== 'br' ) {
			// ignore any other end tag
			return true;
		}
		// </br> is handled at the bottom of the function.
	}

	// Not handled above: emit a synthetic </head> ...
	$this->inHeadMode( 'endtag', 'head' );
	// ... then redo this token.
	return $this->insertToken( $token, $value, $attribs, $selfClose );
}
/**
 * Handler for the "in body" insertion mode.
 *
 * Text, EOF and comment tokens are handled inline; start and end tags
 * are delegated to inBodyModeStartTag() and inBodyModeEndTag().
 *
 * @param string $token One of 'text', 'eof', 'tag', 'endtag', 'comment'.
 * @param string|null $value Token value.
 * @param array|null $attribs Attributes of a tag token.
 * @param bool $selfClose Whether a start tag was written as self-closing.
 * @return bool|null True when the token was consumed, or the result of
 *   the handler the token was forwarded to.
 */
private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'eof' ) {
		if ( !empty( $this->templateInsertionModes ) ) {
			return $this->inTemplateMode( $token, $value, $attribs, $selfClose );
		}
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		return $this->inBodyModeStartTag( $value, $attribs, $selfClose );
	} elseif ( $token === 'endtag' ) {
		return $this->inBodyModeEndTag( $value, $attribs, $selfClose );
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	} else {
		Assert::invariant( false, "Bad token type: $token" );
	}
}

/**
 * Start-tag half of inBodyMode().
 *
 * @param string $value Tag name.
 * @param array|null $attribs Attributes of the start tag.
 * @param bool $selfClose Whether the tag was written as self-closing.
 * @return bool|null True when the tag was consumed, or the result of the
 *   handler the tag was forwarded to.
 */
private function inBodyModeStartTag( $value, $attribs, $selfClose ) {
	switch ( $value ) {
		// OMITTED: <html>
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
		case 'meta':
		case 'noframes':
		// OMITTED: <script>
		case 'style':
		case 'template':
		// OMITTED: <title>
			return $this->inHeadMode( 'tag', $value, $attribs, $selfClose );
		// OMITTED: <body>
		// OMITTED: <frameset>

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'p':
		case 'section':
		case 'summary':
		case 'ul':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			// Headings do not nest: close one that is currently open.
			if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'pre':
		case 'listing':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->ignoreLinefeed = true;
			// OMITTED: frameset_ok
			return true;

		case 'form':
			if (
				$this->formElementPointer &&
				$this->stack->indexOf( 'template' ) < 0
			) {
				return true; // in a form, not in a template.
			}
			if ( $this->stack->inButtonScope( "p" ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$elt = $this->stack->insertHTMLElement( $value, $attribs );
			if ( $this->stack->indexOf( 'template' ) < 0 ) {
				$this->formElementPointer = $elt;
			}
			return true;

		case 'li':
			// OMITTED: frameset_ok
			foreach ( $this->stack as $node ) {
				if ( $node->isHtmlNamed( 'li' ) ) {
					$this->inBodyMode( 'endtag', 'li' );
					break;
				}
				if (
					$node->isA( BalanceSets::$specialSet ) &&
					!$node->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'dd':
		case 'dt':
			// OMITTED: frameset_ok
			foreach ( $this->stack as $node ) {
				if ( $node->isHtmlNamed( 'dd' ) ) {
					$this->inBodyMode( 'endtag', 'dd' );
					break;
				}
				if ( $node->isHtmlNamed( 'dt' ) ) {
					$this->inBodyMode( 'endtag', 'dt' );
					break;
				}
				if (
					$node->isA( BalanceSets::$specialSet ) &&
					!$node->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		// OMITTED: <plaintext>

		case 'button':
			if ( $this->stack->inScope( 'button' ) ) {
				// Close the open button, then reprocess this start tag.
				$this->inBodyMode( 'endtag', 'button' );
				return $this->insertToken( 'tag', $value, $attribs, $selfClose );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'a':
			$activeElement = $this->afe->findElementByTag( 'a' );
			if ( $activeElement ) {
				$this->inBodyMode( 'endtag', 'a' );
				if ( $this->afe->isInList( $activeElement ) ) {
					$this->afe->remove( $activeElement );
					// Don't flatten here, since when we fall
					// through below we might foster parent
					// the new <a> tag inside this one.
					$this->stack->removeElement( $activeElement, false );
				}
			}
			// Falls through
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			$this->afe->reconstruct( $this->stack );
			$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
			return true;

		case 'nobr':
			$this->afe->reconstruct( $this->stack );
			if ( $this->stack->inScope( 'nobr' ) ) {
				$this->inBodyMode( 'endtag', 'nobr' );
				$this->afe->reconstruct( $this->stack );
			}
			$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
			return true;

		case 'applet':
		case 'marquee':
		case 'object':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			// OMITTED: frameset_ok
			return true;

		case 'table':
			// The document is never in "quirks mode"; see simplifications
			// above.
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			// OMITTED: frameset_ok
			$this->switchMode( 'inTableMode' );
			return true;

		case 'area':
		case 'br':
		case 'embed':
		case 'img':
		case 'keygen':
		case 'wbr':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			// OMITTED: frameset_ok
			return true;

		case 'input':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			// OMITTED: frameset_ok
			// (hence we don't need to examine the tag's "type" attribute)
			return true;

		case 'menuitem':
		case 'param':
		case 'source':
		case 'track':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'hr':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'image':
			// warts!
			return $this->inBodyMode( 'tag', 'img', $attribs, $selfClose );

		// OMITTED: <isindex>

		case 'textarea':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->ignoreLinefeed = true;
			$this->inRCDATA = $value; // emulate rcdata tokenizer mode
			// OMITTED: frameset_ok
			return true;

		// OMITTED: <xmp>
		// OMITTED: <iframe>
		// OMITTED: <noembed>
		// OMITTED: <noscript>

		case 'select':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			switch ( $this->parseMode ) {
				case 'inTableMode':
				case 'inCaptionMode':
				case 'inTableBodyMode':
				case 'inRowMode':
				case 'inCellMode':
					$this->switchMode( 'inSelectInTableMode' );
					return true;
				default:
					$this->switchMode( 'inSelectMode' );
					return true;
			}

		case 'optgroup':
		case 'option':
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->inBodyMode( 'endtag', 'option' );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rb':
		case 'rtc':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rp':
		case 'rt':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags( 'rtc' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'math':
			$this->afe->reconstruct( $this->stack );
			// We skip the spec's "adjust MathML attributes" and
			// "adjust foreign attributes" steps, since the browser will
			// do this later when it parses the output and it doesn't affect
			// balancing.
			$this->stack->insertForeignElement(
				BalanceSets::MATHML_NAMESPACE, $value, $attribs
			);
			if ( $selfClose ) {
				// emit explicit </math> tag.
				$this->stack->pop();
			}
			return true;

		case 'svg':
			$this->afe->reconstruct( $this->stack );
			// We skip the spec's "adjust SVG attributes" and
			// "adjust foreign attributes" steps, since the browser will
			// do this later when it parses the output and it doesn't affect
			// balancing.
			$this->stack->insertForeignElement(
				BalanceSets::SVG_NAMESPACE, $value, $attribs
			);
			if ( $selfClose ) {
				// emit explicit </svg> tag.
				$this->stack->pop();
			}
			return true;

		case 'caption':
		case 'col':
		case 'colgroup':
		// OMITTED: <frame>
		case 'head':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Ignore table tags if we're not inTableMode
			return true;
	}

	// Handle any other start tag here
	$this->afe->reconstruct( $this->stack );
	$this->stack->insertHTMLElement( $value, $attribs );
	return true;
}

/**
 * End-tag half of inBodyMode().
 *
 * @param string $value Tag name.
 * @param array|null $attribs Attributes passed along with the token.
 * @param bool $selfClose Self-closing flag passed along with the token.
 * @return bool|null True when the tag was consumed, or the result of the
 *   handler the tag was forwarded to.
 */
private function inBodyModeEndTag( $value, $attribs, $selfClose ) {
	switch ( $value ) {
		// </body>,</html> are unsupported.

		case 'template':
			return $this->inHeadMode( 'endtag', $value, $attribs, $selfClose );

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'button':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'listing':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'pre':
		case 'section':
		case 'summary':
		case 'ul':
			// Ignore if there is not a matching open tag
			if ( !$this->stack->inScope( $value ) ) {
				return true;
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			return true;

		case 'form':
			if ( $this->stack->indexOf( 'template' ) < 0 ) {
				$openform = $this->formElementPointer;
				$this->formElementPointer = null;
				if ( !$openform || !$this->stack->inScope( $openform ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				// Don't flatten yet if we're removing a <form> element
				// out-of-order. (eg. `<form><div></form>`)
				$flatten = ( $this->stack->currentNode === $openform );
				$this->stack->removeElement( $openform, $flatten );
			} else {
				if ( !$this->stack->inScope( 'form' ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( 'form' );
			}
			return true;

		case 'p':
			if ( !$this->stack->inButtonScope( 'p' ) ) {
				// No open <p>: synthesize one, then reprocess the </p>.
				$this->inBodyMode( 'tag', 'p', [] );
				return $this->insertToken( 'endtag', $value, $attribs, $selfClose );
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'li':
			if ( !$this->stack->inListItemScope( $value ) ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'dd':
		case 'dt':
			if ( !$this->stack->inScope( $value ) ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			// Any open heading is closed, whatever its level.
			if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( BalanceSets::$headingSet );
			return true;

		case 'sarcasm':
			// Take a deep breath, then:
			break;

		case 'a':
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 'nobr':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
				return true; // If we did something, we're done.
			}
			break; // Go to the "any other end tag" case.

		case 'applet':
		case 'marquee':
		case 'object':
			if ( !$this->stack->inScope( $value ) ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			return true;

		case 'br':
			// Turn </br> into <br>
			return $this->inBodyMode( 'tag', $value, [] );
	}

	// Any other end tag goes here
	foreach ( $this->stack as $i => $node ) {
		if ( $node->isHtmlNamed( $value ) ) {
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTo( $i ); // including $i
			break;
		} elseif ( $node->isA( BalanceSets::$specialSet ) ) {
			return true; // ignore this close token.
		}
	}
	return true;
}
/**
 * Handler for the "in table" insertion mode.
 *
 * Tokens with no table-specific rule are processed with the "in body"
 * rules while foster parenting is enabled on the stack.
 *
 * @param string $token Token type.
 * @param string|null $value Token value.
 * @param array|null $attribs Attributes of a tag token.
 * @param bool $selfClose Whether a start tag was written as self-closing.
 * @return bool|null True when the token was consumed, or the result of
 *   the handler the token was forwarded to.
 */
private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfClose );
		} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Collect the text in "in table text" mode first.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess( 'inTableTextMode',
				$token, $value, $attribs, $selfClose );
		}
		// fall through to default case.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
				// Like the other table-structure tags below, first pop
				// back to the table context; otherwise a caption that
				// follows foster-parented content would be nested inside
				// that content instead of the table.
				$this->stack->clearToContext( BalanceSets::$tableContextSet );
				$this->afe->insertMarker();
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inCaptionMode' );
				return true;
			case 'colgroup':
				$this->stack->clearToContext( BalanceSets::$tableContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inColumnGroupMode' );
				return true;
			case 'col':
				// Imply a <colgroup>, then reprocess the <col>.
				$this->inTableMode( 'tag', 'colgroup', [] );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$this->stack->clearToContext( BalanceSets::$tableContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inTableBodyMode' );
				return true;
			case 'td':
			case 'th':
			case 'tr':
				// Imply a <tbody>, then reprocess the token.
				$this->inTableMode( 'tag', 'tbody', [] );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			case 'table':
				if ( !$this->stack->inTableScope( $value ) ) {
					return true; // Ignore this tag.
				}
				// Close the open table, then reprocess the start tag.
				$this->inTableMode( 'endtag', $value );
				return $this->insertToken( $token, $value, $attribs, $selfClose );

			case 'style':
			// OMITTED: <script>
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );

			case 'input':
				if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
					break; // Handle this as "everything else"
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			case 'form':
				if (
					$this->formElementPointer ||
					$this->stack->indexOf( 'template' ) >= 0
				) {
					return true; // ignore this token
				}
				$this->formElementPointer =
					$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->popTag( $this->formElementPointer );
				return true;
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'table':
				if ( !$this->stack->inTableScope( $value ) ) {
					return true; // Ignore.
				}
				$this->stack->popTag( $value );
				$this->resetInsertionMode();
				return true;
			// OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			// OMITTED: <html>
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				return true; // Ignore the token.
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	// This is the "anything else" case:
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfClose );
	$this->stack->fosterParentMode = false;
	return true;
}
/**
 * Handler for the "in table text" insertion mode.
 *
 * Text tokens are accumulated in $pendingTableText.  The first non-text
 * token flushes the accumulated text and is then reprocessed under the
 * original insertion mode.
 *
 * @param string $token Token type.
 * @param string|null $value Token value.
 * @param array|null $attribs Attributes of a tag token.
 * @param bool $selfClose Whether a start tag was written as self-closing.
 * @return bool|null True for text, otherwise the result of reprocessing.
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->pendingTableText .= $value;
		return true;
	}

	// Non-text token: flush what has been collected so far.
	$collected = $this->pendingTableText;
	$this->pendingTableText = '';

	$whitespaceOnly = !preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $collected );
	if ( $whitespaceOnly ) {
		// Pending text is just whitespace.
		$this->stack->insertText( $collected );
	} else {
		// This should match the "anything else" case inTableMode
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $collected );
		$this->stack->fosterParentMode = false;
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfClose
	);
}
/**
 * Helper for inCaptionMode: close the open <caption>, if any.
 *
 * @return bool False if no caption is in table scope (nothing is done);
 *   true once the caption has been closed and the mode set to inTableMode.
 */
private function endCaption() {
	$captionOpen = $this->stack->inTableScope( 'caption' );
	if ( $captionOpen ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
/**
 * Handler for the "in caption" insertion mode.
 *
 * @param string $token Token type.
 * @param string|null $value Token value.
 * @param array|null $attribs Attributes of a tag token.
 * @param bool $selfClose Whether a start tag was written as self-closing.
 * @return bool|null True when handled here, otherwise the result of the
 *   "in body" rules.
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
	// Table-structure tag names that get special treatment in a caption.
	static $structural = [
		'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
	];

	if ( $token === 'tag' ) {
		if ( $value === 'caption' || in_array( $value, $structural, true ) ) {
			// Close the caption, then reprocess the tag; if no caption
			// is open the tag is ignored.
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		// Fall through to "anything else" case.
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'caption' ) {
			$this->endCaption();
			return true;
		}
		if ( $value === 'table' ) {
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		// OMITTED: <html>
		if ( $value === 'body' || in_array( $value, $structural, true ) ) {
			// Ignore the token
			return true;
		}
		// Fall through to "anything else" case.
	}

	// The Anything Else case
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
/**
 * Handler for the "in column group" insertion mode.
 *
 * @param string $token Token type.
 * @param string|null $value Token value.
 * @param array|null $attribs Attributes of a tag token.
 * @param bool $selfClose Whether a start tag was written as self-closing.
 * @return bool|null True when handled or ignored here, otherwise the
 *   result of the handler the token was forwarded to.
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'text' ) {
		// Leading whitespace is inserted as-is; the remainder (if any)
		// goes through the "anything else" path below.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
			$leading = $matches[0];
			$this->stack->insertText( $leading );
			$value = substr( $value, strlen( $leading ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
	} elseif ( $token === 'tag' ) {
		// OMITTED: <html>
		if ( $value === 'col' ) {
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Fall through for "anything else".
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'colgroup' ) {
			if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
				return true; // Ignore the token.
			}
			$this->stack->pop();
			$this->switchMode( 'inTableMode' );
			return true;
		}
		if ( $value === 'col' ) {
			return true; // Ignore the token.
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Fall through for "anything else".
	}

	// Anything else: close the colgroup (if it is the current node) and
	// reprocess the token.
	if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		return true; // Ignore the token.
	}
	$this->inColumnGroupMode( 'endtag', 'colgroup' );
	return $this->insertToken( $token, $value, $attribs, $selfClose );
}
/**
 * Helper function for inTableBodyMode: close the open table section.
 *
 * Does nothing unless a <tbody>, <thead> or <tfoot> is found by
 * $this->stack->inTableScope(). Otherwise the stack is cleared to the
 * table body context, the current node is popped, and the mode is
 * switched to inTableMode.
 *
 * @return bool False if no table section was in table scope (nothing
 *   was changed); true if a section was closed.
 */
private function endSection() {
	if ( !(
		$this->stack->inTableScope( 'tbody' ) ||
		$this->stack->inTableScope( 'thead' ) ||
		$this->stack->inTableScope( 'tfoot' )
	) ) {
		return false;
	}
	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Token handler for the "in table body" insertion mode.
 *
 * Summary of what is handled here:
 *  - <tr>: clears the stack to the table body context, inserts the row
 *    and switches to inRowMode.
 *  - <th> / <td>: first processes a <tr> start tag with no attributes,
 *    then hands the cell token back to insertToken().
 *  - <caption>, <col>, <colgroup>, <tbody>, <tfoot>, <thead> and
 *    </table>: close the current section via endSection(); the token is
 *    handed back to insertToken() only if a section was closed.
 *  - </tbody>, </tfoot>, </thead>: close the section if an element of
 *    that name is in table scope; otherwise ignored.
 *  - </caption>, </col>, </colgroup>, </td>, </th>, </tr>: ignored.
 *  - everything else: delegated to inTableMode().
 *
 * @param string $token Token type; 'tag' and 'endtag' are examined
 *   here, all other types go to inTableMode().
 * @param string $value Tag name (or other payload for non-tag tokens).
 * @param array|null $attribs Attributes, passed on when an element is
 *   inserted.
 * @param bool $selfClose Forwarded unchanged to delegated handlers.
 * @return bool True for every branch decided here; otherwise whatever
 *   inTableMode() returns.
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'tr':
				$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inRowMode' );
				return true;
			case 'th':
			case 'td':
				// A cell outside a row: open an attribute-less <tr> first.
				$this->inTableBodyMode( 'tag', 'tr', [] );
				$this->insertToken( $token, $value, $attribs, $selfClose );
				return true;
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				if ( $this->endSection() ) {
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'table':
				if ( $this->endSection() ) {
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true;
			case 'tbody':
			case 'tfoot':
			case 'thead':
				if ( $this->stack->inTableScope( $value ) ) {
					$this->endSection();
				}
				return true;
			// OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			// OMITTED: <html>
			case 'td':
			case 'th':
			case 'tr':
				return true; // Ignore the token.
		}
	}
	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
/**
 * Helper function for inRowMode: close the open table row.
 *
 * Does nothing unless $this->stack->inTableScope( 'tr' ) is true.
 * Otherwise the stack is cleared to the table row context, the current
 * node is popped, and the mode is switched to inTableBodyMode.
 *
 * @return bool False if no <tr> was in table scope (nothing was
 *   changed); true if the row was closed.
 */
private function endRow() {
	if ( !$this->stack->inTableScope( 'tr' ) ) {
		return false;
	}
	$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableBodyMode' );
	return true;
}
/**
 * Token handler for the "in row" insertion mode.
 *
 * Summary of what is handled here:
 *  - <th> / <td>: clears the stack to the table row context, inserts
 *    the cell, switches to inCellMode and adds a marker to the list of
 *    active formatting elements.
 *  - <caption>, <col>, <colgroup>, <tbody>, <tfoot>, <thead>, <tr> and
 *    </table>: close the row via endRow(); the token is handed back to
 *    insertToken() only if a row was closed.
 *  - </tr>: closes the row if one is in table scope.
 *  - </tbody>, </tfoot>, </thead>: if an element of that name is in
 *    table scope and the row could be closed, the token is handed back
 *    to insertToken(); otherwise ignored.
 *  - </caption>, </col>, </colgroup>, </td>, </th>: ignored.
 *  - everything else: delegated to inTableMode().
 *
 * @param string $token Token type; 'tag' and 'endtag' are examined
 *   here, all other types go to inTableMode().
 * @param string $value Tag name (or other payload for non-tag tokens).
 * @param array|null $attribs Attributes, passed on when an element is
 *   inserted.
 * @param bool $selfClose Forwarded unchanged to delegated handlers.
 * @return bool True for every branch decided here; otherwise whatever
 *   inTableMode() returns.
 */
private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'th':
			case 'td':
				$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inCellMode' );
				$this->afe->insertMarker();
				return true;
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
				if ( $this->endRow() ) {
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'tr':
				$this->endRow();
				return true;
			case 'table':
				if ( $this->endRow() ) {
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true;
			case 'tbody':
			case 'tfoot':
			case 'thead':
				// Both conditions must hold; endRow() is only attempted
				// when the named section is actually in table scope.
				if (
					$this->stack->inTableScope( $value ) &&
					$this->endRow()
				) {
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true;
			// OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			// OMITTED: <html>
			case 'td':
			case 'th':
				return true; // Ignore the token.
		}
	}
	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
/**
 * Helper for inCellMode: close the open table cell.
 *
 * Checks for a <td> in table scope first, then for a <th>, and feeds
 * the matching end tag through inCellMode().
 *
 * @return bool True if a </td> or </th> was processed; false if neither
 *   a <td> nor a <th> was in table scope.
 */
private function endCell() {
	if ( $this->stack->inTableScope( 'td' ) ) {
		$this->inCellMode( 'endtag', 'td' );
		return true;
	} elseif ( $this->stack->inTableScope( 'th' ) ) {
		$this->inCellMode( 'endtag', 'th' );
		return true;
	} else {
		return false;
	}
}
/**
 * Token handler for the "in cell" insertion mode.
 *
 * Summary of what is handled here:
 *  - <caption>, <col>, <colgroup>, <tbody>, <td>, <tfoot>, <th>,
 *    <thead>, <tr>: close the cell via endCell(); the token is handed
 *    back to insertToken() only if a cell was closed.
 *  - </td> / </th>: if an element of that name is in table scope,
 *    generate implied end tags, pop up to that element, clear the
 *    active formatting elements to the last marker and switch to
 *    inRowMode. Otherwise ignored.
 *  - </caption>, </col>, </colgroup>: ignored.
 *  - </table>, </tbody>, </tfoot>, </thead>, </tr>: if an element of
 *    that name is in table scope, close the cell (as above, but popping
 *    up to whichever of the table cell set is found) and hand the token
 *    back to insertToken(). Otherwise ignored.
 *  - everything else: delegated to inBodyMode().
 *
 * @param string $token Token type; 'tag' and 'endtag' are examined
 *   here, all other types go to inBodyMode().
 * @param string $value Tag name (or other payload for non-tag tokens).
 * @param array|null $attribs Attributes, forwarded unchanged.
 * @param bool $selfClose Forwarded unchanged to delegated handlers.
 * @return bool True for every branch decided here; otherwise whatever
 *   inBodyMode() returns.
 */
private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				if ( $this->endCell() ) {
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'td':
			case 'th':
				if ( $this->stack->inTableScope( $value ) ) {
					$this->stack->generateImpliedEndTags();
					$this->stack->popTag( $value );
					$this->afe->clearToMarker();
					$this->switchMode( 'inRowMode' );
				}
				return true;
			// OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			// OMITTED: <html>
				return true;

			case 'table':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
				if ( $this->stack->inTableScope( $value ) ) {
					// Close whichever cell is open, then let the token be
					// handled again now that the mode is inRowMode.
					$this->stack->generateImpliedEndTags();
					$this->stack->popTag( BalanceSets::$tableCellSet );
					$this->afe->clearToMarker();
					$this->switchMode( 'inRowMode' );
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true;
		}
	}
	// Anything else:
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
/**
 * Token handler for the "in select" insertion mode.
 *
 * Summary of what is handled here:
 *  - text: inserted.
 *  - eof: delegated to inBodyMode().
 *  - <option>: pops an open <option> (if it is the current node), then
 *    inserts the new one.
 *  - <optgroup>: pops an open <option> and then an open <optgroup> (each
 *    only if it is the current node), then inserts the new one.
 *  - <select>: treated like </select>.
 *  - <input>, <keygen>, <textarea>: ignored if no <select> is in select
 *    scope; otherwise the <select> is closed and the token is handed
 *    back to insertToken().
 *  - <script>, <template>, </template>: delegated to inHeadMode().
 *  - </optgroup>: pops an <option> sitting directly on top of an
 *    <optgroup>, then pops the <optgroup> if it is the current node.
 *  - </option>: pops the current node if it is an <option>.
 *  - </select>: ignored if no <select> is in select scope; otherwise
 *    pops up to the <select> and resets the insertion mode.
 *  - comment: inserted.
 *  - everything else: ignored.
 *
 * @param string $token Token type; this method distinguishes 'text',
 *   'eof', 'tag', 'endtag' and 'comment'.
 * @param string $value Text or comment content, or the tag name,
 *   depending on $token.
 * @param array|null $attribs Attributes, passed on when an element is
 *   inserted.
 * @param bool $selfClose Forwarded unchanged to delegated handlers.
 * @return bool True for every branch decided here; otherwise whatever
 *   the delegated handler returns.
 */
private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			// OMITTED: <html>
			case 'option':
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;
			case 'optgroup':
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;
			case 'select':
				$this->inSelectMode( 'endtag', $value ); // treat it like endtag
				return true;
			case 'input':
			case 'keygen':
			case 'textarea':
				if ( !$this->stack->inSelectScope( 'select' ) ) {
					return true; // ignore token (fragment case)
				}
				$this->inSelectMode( 'endtag', 'select' );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			case 'script':
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'optgroup':
				// An <option> directly inside the <optgroup> is closed first.
				if (
					$this->stack->currentNode->isHtmlNamed( 'option' ) &&
					$this->stack->length() >= 2 &&
					$this->stack->node( $this->stack->length() - 2 )->isHtmlNamed( 'optgroup' )
				) {
					$this->stack->pop();
				}
				if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
					$this->stack->pop();
				}
				return true;
			case 'option':
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				return true;
			case 'select':
				if ( !$this->stack->inSelectScope( $value ) ) {
					return true; // fragment case
				}
				$this->stack->popTag( $value );
				$this->resetInsertionMode();
				return true;
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	// anything else: just ignore the token
	return true;
}
/**
 * Token handler for the "in select in table" insertion mode.
 *
 * Only the table-structure names <caption>, <table>, <tbody>, <tfoot>,
 * <thead>, <tr>, <td> and <th> get special treatment:
 *  - as a start tag, the open <select> is closed (by processing a
 *    </select> in this mode) and the token is handed back to
 *    insertToken();
 *  - as an end tag, the same happens, but only if an element of that
 *    name is in table scope; otherwise the token is ignored.
 * Every other token is delegated to inSelectMode().
 *
 * Note that the switch is on $value regardless of token type; a
 * non-tag token whose payload happens to equal one of these names
 * matches a case label but none of the inner conditions, and so still
 * reaches the inSelectMode() call at the end.
 *
 * @param string $token Token type; 'tag' and 'endtag' are examined
 *   here.
 * @param string $value Tag name (or other payload for non-tag tokens).
 * @param array|null $attribs Attributes, forwarded unchanged.
 * @param bool $selfClose Forwarded unchanged to delegated handlers.
 * @return bool True when an end tag is ignored here; otherwise whatever
 *   the delegated handler returns.
 */
private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	switch ( $value ) {
		case 'caption':
		case 'table':
		case 'tbody':
		case 'tfoot':
		case 'thead':
		case 'tr':
		case 'td':
		case 'th':
			if ( $token === 'tag' ) {
				$this->inSelectInTableMode( 'endtag', 'select' );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			} elseif ( $token === 'endtag' ) {
				if ( $this->stack->inTableScope( $value ) ) {
					$this->inSelectInTableMode( 'endtag', 'select' );
					return $this->insertToken( $token, $value, $attribs, $selfClose );
				}
				return true;
			}
	}
	// anything else
	return $this->inSelectMode( $token, $value, $attribs, $selfClose );
}
/**
 * Token handler for the "in template" insertion mode.
 *
 * Summary of what is handled here:
 *  - text and comment: delegated to inBodyMode().
 *  - eof: if no <template> is on the stack, parsing stops. Otherwise
 *    the stack is popped up to the <template>, the active formatting
 *    elements are cleared to the last marker, one entry is popped off
 *    $this->templateInsertionModes, the insertion mode is reset and the
 *    token is handed back to insertToken().
 *  - <base>, <basefont>, <bgsound>, <link>, <meta>, <noframes>, <style>,
 *    <template>, </template>: delegated to inHeadMode().
 *  - <caption>, <colgroup>, <tbody>, <tfoot>, <thead>: reprocessed in
 *    inTableMode.
 *  - <col>: reprocessed in inColumnGroupMode.
 *  - <tr>: reprocessed in inTableBodyMode.
 *  - <td>, <th>: reprocessed in inRowMode.
 *  - any other start tag: reprocessed in inBodyMode.
 *  - any other end tag: ignored.
 *  - any other token type: fails an Assert::invariant() check.
 *
 * @param string $token Token type; this method distinguishes 'text',
 *   'comment', 'eof', 'tag' and 'endtag'.
 * @param string $value Text or comment content, or the tag name,
 *   depending on $token.
 * @param array|null $attribs Attributes, forwarded unchanged.
 * @param bool $selfClose Forwarded unchanged to delegated handlers.
 * @return bool True for every branch decided here; otherwise whatever
 *   the delegated handler returns. Nothing is returned after the
 *   invariant check in the final branch.
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' || $token === 'comment' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	} elseif ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) < 0 ) {
			$this->stopParsing();
		} else {
			// Unwind the open template, then let the eof token be
			// processed again in whatever mode results.
			$this->stack->popTag( 'template' );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			$this->insertToken( $token, $value, $attribs, $selfClose );
		}
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
			case 'meta':
			case 'noframes':
			// OMITTED: <script>
			case 'style':
			case 'template':
			// OMITTED: <title>
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );

			case 'caption':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				return $this->switchModeAndReprocess(
					'inTableMode', $token, $value, $attribs, $selfClose
				);

			case 'col':
				return $this->switchModeAndReprocess(
					'inColumnGroupMode', $token, $value, $attribs, $selfClose
				);

			case 'tr':
				return $this->switchModeAndReprocess(
					'inTableBodyMode', $token, $value, $attribs, $selfClose
				);

			case 'td':
			case 'th':
				return $this->switchModeAndReprocess(
					'inRowMode', $token, $value, $attribs, $selfClose
				);
		}
		// Any other start tag.
		return $this->switchModeAndReprocess(
			'inBodyMode', $token, $value, $attribs, $selfClose
		);
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Any other end tag is ignored.
		return true;
	} else {
		Assert::invariant( false, "Bad token type: $token" );
	}
}