ApiPageSet: Use processTitlesArray() in getRedirectTargets()
[mediawiki.git] / includes / tidy / Balancer.php
blob3467b49cae839c0d7498ee2745154bcd91530dbe
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
26 namespace MediaWiki\Tidy;
28 use Wikimedia\Assert\Assert;
29 use Wikimedia\Assert\ParameterAssertionException;
30 use \ExplodeIterator;
31 use \IteratorAggregate;
32 use \ReverseArrayIterator;
33 use \Sanitizer;
35 // A note for future librarization[1] -- this file is a good candidate
36 // for splitting into an independent library, except that it is currently
37 // highly optimized for MediaWiki use. It only implements the portions
38 // of the HTML5 tree builder used by tags supported by MediaWiki, and
39 // does not contain a true tokenizer pass, instead relying on
40 // comment stripping, attribute normalization, and escaping done by
41 // the MediaWiki Sanitizer. It also deliberately avoids building
42 // a true DOM in memory, instead serializing elements to an output string
43 // as soon as possible (usually as soon as the tag is closed) to reduce
44 // its memory footprint.
46 // We've been gradually lifting some of these restrictions to handle
47 // non-sanitized output generated by extensions, but we shortcut the tokenizer
48 // for speed (primarily by splitting on `<`) and so rely on syntactic
49 // well-formedness.
51 // On the other hand, I've been pretty careful to note with comments in the
52 // code the places where this implementation omits features of the spec or
53 // depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
54 // implement the missing pieces and make this a standalone PHP HTML5 parser.
55 // In order to do so, some sort of MediaWiki-specific API will need
56 // to be added to (a) allow the Balancer to bypass the tokenizer,
57 // and (b) support on-the-fly flattening instead of DOM node creation.
59 // [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
61 /**
62 * Utility constants and sets for the HTML5 tree building algorithm.
63 * Sets are associative arrays indexed first by namespace and then by
64 * lower-cased tag name.
66 * @ingroup Parser
67 * @since 1.27
69 class BalanceSets {
70 const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
71 const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
72 const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
74 public static $unsupportedSet = [
75 self::HTML_NAMESPACE => [
76 'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
77 'frame' => true,
78 'plaintext' => true,
79 'xmp' => true, 'iframe' => true, 'noembed' => true,
80 'noscript' => true, 'script' => true,
81 'title' => true
85 public static $emptyElementSet = [
86 self::HTML_NAMESPACE => [
87 'area' => true, 'base' => true, 'basefont' => true,
88 'bgsound' => true, 'br' => true, 'col' => true, 'command' => true,
89 'embed' => true, 'frame' => true, 'hr' => true, 'img' => true,
90 'input' => true, 'keygen' => true, 'link' => true, 'meta' => true,
91 'param' => true, 'source' => true, 'track' => true, 'wbr' => true
95 public static $extraLinefeedSet = [
96 self::HTML_NAMESPACE => [
97 'pre' => true, 'textarea' => true, 'listing' => true,
101 public static $headingSet = [
102 self::HTML_NAMESPACE => [
103 'h1' => true, 'h2' => true, 'h3' => true,
104 'h4' => true, 'h5' => true, 'h6' => true
108 public static $specialSet = [
109 self::HTML_NAMESPACE => [
110 'address' => true, 'applet' => true, 'area' => true,
111 'article' => true, 'aside' => true, 'base' => true,
112 'basefont' => true, 'bgsound' => true, 'blockquote' => true,
113 'body' => true, 'br' => true, 'button' => true, 'caption' => true,
114 'center' => true, 'col' => true, 'colgroup' => true, 'dd' => true,
115 'details' => true, 'dir' => true, 'div' => true, 'dl' => true,
116 'dt' => true, 'embed' => true, 'fieldset' => true,
117 'figcaption' => true, 'figure' => true, 'footer' => true,
118 'form' => true, 'frame' => true, 'frameset' => true, 'h1' => true,
119 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
120 'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
121 'hr' => true, 'html' => true, 'iframe' => true, 'img' => true,
122 'input' => true, 'li' => true, 'link' => true,
123 'listing' => true, 'main' => true, 'marquee' => true,
124 'menu' => true, 'meta' => true, 'nav' => true,
125 'noembed' => true, 'noframes' => true, 'noscript' => true,
126 'object' => true, 'ol' => true, 'p' => true, 'param' => true,
127 'plaintext' => true, 'pre' => true, 'script' => true,
128 'section' => true, 'select' => true, 'source' => true,
129 'style' => true, 'summary' => true, 'table' => true,
130 'tbody' => true, 'td' => true, 'template' => true,
131 'textarea' => true, 'tfoot' => true, 'th' => true, 'thead' => true,
132 'title' => true, 'tr' => true, 'track' => true, 'ul' => true,
133 'wbr' => true, 'xmp' => true
135 self::SVG_NAMESPACE => [
136 'foreignobject' => true, 'desc' => true, 'title' => true
138 self::MATHML_NAMESPACE => [
139 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
140 'mtext' => true, 'annotation-xml' => true
144 public static $addressDivPSet = [
145 self::HTML_NAMESPACE => [
146 'address' => true, 'div' => true, 'p' => true
150 public static $tableSectionRowSet = [
151 self::HTML_NAMESPACE => [
152 'table' => true, 'thead' => true, 'tbody' => true,
153 'tfoot' => true, 'tr' => true
157 public static $impliedEndTagsSet = [
158 self::HTML_NAMESPACE => [
159 'dd' => true, 'dt' => true, 'li' => true,
160 'menuitem' => true, 'optgroup' => true,
161 'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
162 'rt' => true, 'rtc' => true
166 public static $thoroughImpliedEndTagsSet = [
167 self::HTML_NAMESPACE => [
168 'caption' => true, 'colgroup' => true, 'dd' => true, 'dt' => true,
169 'li' => true, 'optgroup' => true, 'option' => true, 'p' => true,
170 'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true,
171 'tbody' => true, 'td' => true, 'tfoot' => true, 'th' => true,
172 'thead' => true, 'tr' => true
176 public static $tableCellSet = [
177 self::HTML_NAMESPACE => [
178 'td' => true, 'th' => true
181 public static $tableContextSet = [
182 self::HTML_NAMESPACE => [
183 'table' => true, 'template' => true, 'html' => true
187 public static $tableBodyContextSet = [
188 self::HTML_NAMESPACE => [
189 'tbody' => true, 'tfoot' => true, 'thead' => true,
190 'template' => true, 'html' => true
194 public static $tableRowContextSet = [
195 self::HTML_NAMESPACE => [
196 'tr' => true, 'template' => true, 'html' => true
200 // See https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
201 public static $formAssociatedSet = [
202 self::HTML_NAMESPACE => [
203 'button' => true, 'fieldset' => true, 'input' => true,
204 'keygen' => true, 'object' => true, 'output' => true,
205 'select' => true, 'textarea' => true, 'img' => true
209 public static $inScopeSet = [
210 self::HTML_NAMESPACE => [
211 'applet' => true, 'caption' => true, 'html' => true,
212 'marquee' => true, 'object' => true,
213 'table' => true, 'td' => true, 'template' => true,
214 'th' => true
216 self::SVG_NAMESPACE => [
217 'foreignobject' => true, 'desc' => true, 'title' => true
219 self::MATHML_NAMESPACE => [
220 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
221 'mtext' => true, 'annotation-xml' => true
private static $inListItemScopeSet = null;

/**
 * Return the "list item scope" boundary set: a copy of $inScopeSet
 * with the HTML `ol` and `ul` elements added. The set is built on
 * first use and cached in a private static property.
 *
 * @return array Set indexed by namespace, then by lower-cased tag name.
 */
public static function inListItemScopeSet() {
	if ( self::$inListItemScopeSet !== null ) {
		return self::$inListItemScopeSet;
	}
	$scope = self::$inScopeSet;
	$scope[self::HTML_NAMESPACE]['ol'] = true;
	$scope[self::HTML_NAMESPACE]['ul'] = true;
	self::$inListItemScopeSet = $scope;
	return self::$inListItemScopeSet;
}
private static $inButtonScopeSet = null;

/**
 * Return the "button scope" boundary set: a copy of $inScopeSet with
 * the HTML `button` element added. The set is built on first use and
 * cached in a private static property.
 *
 * @return array Set indexed by namespace, then by lower-cased tag name.
 */
public static function inButtonScopeSet() {
	if ( self::$inButtonScopeSet !== null ) {
		return self::$inButtonScopeSet;
	}
	$scope = self::$inScopeSet;
	$scope[self::HTML_NAMESPACE]['button'] = true;
	self::$inButtonScopeSet = $scope;
	return self::$inButtonScopeSet;
}
244 public static $inTableScopeSet = [
245 self::HTML_NAMESPACE => [
246 'html' => true, 'table' => true, 'template' => true
250 public static $inInvertedSelectScopeSet = [
251 self::HTML_NAMESPACE => [
252 'option' => true, 'optgroup' => true
256 public static $mathmlTextIntegrationPointSet = [
257 self::MATHML_NAMESPACE => [
258 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
259 'mtext' => true
263 public static $htmlIntegrationPointSet = [
264 self::SVG_NAMESPACE => [
265 'foreignobject' => true,
266 'desc' => true,
267 'title' => true
271 // For tidy compatibility.
272 public static $tidyPWrapSet = [
273 self::HTML_NAMESPACE => [
274 'body' => true, 'blockquote' => true,
275 // We parse with <body> as the fragment context, but the top-level
276 // element on the stack is actually <html>. We could use the
277 // "adjusted current node" everywhere to work around this, but it's
278 // easier just to add <html> to the p-wrap set.
279 'html' => true,
282 public static $tidyInlineSet = [
283 self::HTML_NAMESPACE => [
284 'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true,
285 'b' => true, 'basefont' => true, 'bdo' => true, 'big' => true,
286 'br' => true, 'button' => true, 'cite' => true, 'code' => true,
287 'dfn' => true, 'em' => true, 'font' => true, 'i' => true,
288 'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true,
289 'label' => true, 'legend' => true, 'map' => true, 'object' => true,
290 'param' => true, 'q' => true, 'rb' => true, 'rbc' => true,
291 'rp' => true, 'rt' => true, 'rtc' => true, 'ruby' => true,
292 's' => true, 'samp' => true, 'select' => true, 'small' => true,
293 'span' => true, 'strike' => true, 'strong' => true, 'sub' => true,
294 'sup' => true, 'textarea' => true, 'tt' => true, 'u' => true,
295 'var' => true,
301 * A BalanceElement is a simplified version of a DOM Node. The main
302 * difference is that we only keep BalanceElements around for nodes
303 * currently on the BalanceStack of open elements. As soon as an
304 * element is closed, with some minor exceptions relating to the
305 * tree builder "adoption agency algorithm", the element and all its
306 * children are serialized to a string using the flatten() method.
307 * This keeps our memory usage low.
309 * @ingroup Parser
310 * @since 1.27
312 class BalanceElement {
314 * The namespace of the element.
315 * @var string $namespaceURI
317 public $namespaceURI;
319 * The lower-cased name of the element.
320 * @var string $localName
322 public $localName;
324 * Attributes for the element, in array form
325 * @var array $attribs
327 public $attribs;
330 * Parent of this element, or the string "flat" if this element has
331 * already been flattened into its parent.
332 * @var BalanceElement|string|null $parent
334 public $parent;
337 * An array of children of this element. Typically only the last
338 * child will be an actual BalanceElement object; the rest will
339 * be strings, representing either text nodes or flattened
340 * BalanceElement objects.
341 * @var BalanceElement[]|string[] $children
343 public $children;
346 * A unique string identifier for Noah's Ark purposes, lazy initialized
348 private $noahKey;
351 * The next active formatting element in the list, or null if this is the
352 * end of the AFE list or if the element is not in the AFE list.
354 public $nextAFE;
357 * The previous active formatting element in the list, or null if this is
358 * the start of the list or if the element is not in the AFE list.
360 public $prevAFE;
363 * The next element in the Noah's Ark species bucket.
365 public $nextNoah;
/**
 * Make a new BalanceElement corresponding to the HTML DOM Element
 * with the given localname, namespace, and attributes. The new
 * element starts out detached (no parent) and without children.
 *
 * @param string $namespaceURI The namespace of the element.
 * @param string $localName The lowercased name of the tag.
 * @param array $attribs Attributes of the element
 */
public function __construct( $namespaceURI, $localName, array $attribs ) {
	$this->namespaceURI = $namespaceURI;
	$this->localName = $localName;
	$this->attribs = $attribs;
	// Only declared properties are initialized here. An undeclared
	// `contents` property used to be created as well; no method of this
	// class reads it, and creating dynamic properties is deprecated as
	// of PHP 8.2.
	$this->parent = null;
	$this->children = [];
}
/**
 * Remove the given child from this element. The child's parent
 * pointer is reset to null and it is spliced out of the child list.
 *
 * @param BalanceElement $elt The child to detach; its parent must be
 *   $this and it must be present in the child list.
 */
private function removeChild( BalanceElement $elt ) {
	// The message is deliberately a constant string: interpolating $this
	// would invoke __toString() and serialize this element together with
	// all of its children on every call, even when the check passes.
	Assert::precondition(
		$this->parent !== 'flat', "Can't removeChild after flattening."
	);
	Assert::parameter(
		$elt->parent === $this, '$elt', 'must have $this as a parent'
	);
	$idx = array_search( $elt, $this->children, true );
	Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
	$elt->parent = null;
	array_splice( $this->children, $idx, 1 );
}
/**
 * Find $a in the list of children and insert $b before it.
 *
 * If $b is an element that currently has a parent, it is first removed
 * from that parent and then re-parented to $this.
 *
 * @param BalanceElement $a Existing child of $this marking the position.
 * @param BalanceElement|string $b Element or text to insert.
 */
public function insertBefore( BalanceElement $a, $b ) {
	Assert::precondition(
		$this->parent !== 'flat', "Can't insertBefore after flattening."
	);
	$idx = array_search( $a, $this->children, true );
	Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
	if ( !is_string( $b ) ) {
		Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
		if ( $b === $a ) {
			// Inserting a node before itself leaves the tree unchanged.
			return;
		}
		if ( $b->parent !== null ) {
			$b->parent->removeChild( $b );
			// If $b was one of our own children located before $a,
			// detaching it shifted $a down by one slot. Look $a up again
			// so that $b really lands immediately before it.
			$idx = array_search( $a, $this->children, true );
		}
		$b->parent = $this;
	}
	array_splice( $this->children, $idx, 0, [ $b ] );
}
/**
 * Append $elt to the end of the list of children. An element (as
 * opposed to a string) is first detached from its previous parent, if
 * it had one, and then re-parented to $this.
 *
 * @param BalanceElement|string $elt
 */
public function appendChild( $elt ) {
	Assert::precondition(
		$this->parent !== 'flat', "Can't appendChild after flattening."
	);
	if ( !is_string( $elt ) ) {
		$oldParent = $elt->parent;
		if ( $oldParent !== null ) {
			$oldParent->removeChild( $elt );
		}
		$elt->parent = $this;
	}
	$this->children[] = $elt;
}
/**
 * Transfer all of the children of $elt to $this, leaving $elt with an
 * empty child list.
 *
 * @param BalanceElement $elt The element giving up its children.
 */
public function adoptChildren( BalanceElement $elt ) {
	Assert::precondition(
		$elt->parent !== 'flat', "Can't adoptChildren after flattening."
	);
	$moving = $elt->children;
	foreach ( $moving as $node ) {
		if ( !is_string( $node ) ) {
			// Clearing the parent pointer up front stops appendChild()
			// from calling removeChild() for each node, which would be
			// an O(n^2) series of array_splice operations.
			$node->parent = null;
		}
		$this->appendChild( $node );
	}
	$elt->children = [];
}
/**
 * Flatten this node and all of its children into a string, as specified
 * by the HTML serialization specification, and replace this node
 * in its parent by that string. Afterwards the parent pointer of this
 * node is the marker string 'flat'.
 *
 * When $config['tidyCompat'] is set, child elements are flattened
 * first, `mw:p-wrap` elements are renamed to `p` (and dropped entirely
 * when they hold only whitespace), attribute-less blank `tr`/`li`
 * elements get the `mw-empty-elt` class, and a leading linefeed in
 * pre/listing/textarea content is doubled.
 *
 * @param array $config Balancer configuration; see Balancer::__construct().
 * @return string
 *
 * @see __toString()
 */
public function flatten( array $config ) {
	Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
	Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
	$slot = array_search( $this, $this->parent->children, true );
	Assert::parameter(
		$slot !== false, '$this', 'must be a child of its parent'
	);
	if ( !$config['tidyCompat'] ) {
		$serialized = "{$this}";
	} else {
		// Flatten element children and find out whether the content is
		// nothing but whitespace.
		$allBlank = true;
		foreach ( $this->children as $child ) {
			$text = is_string( $child ) ? $child : $child->flatten( $config );
			if ( $allBlank && preg_match( '/[^\t\n\f\r ]/', $text ) ) {
				$allBlank = false;
			}
		}
		if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
			$this->localName = 'p';
		} elseif ( $allBlank ) {
			// Add 'mw-empty-elt' class so elements can be hidden via CSS
			// for compatibility with legacy tidy.
			$hasNoAttribs = !count( $this->attribs );
			$isRowOrItem = $this->localName === 'tr' || $this->localName === 'li';
			if ( $hasNoAttribs && $isRowOrItem ) {
				$this->attribs = [ 'class' => "mw-empty-elt" ];
			}
			// Only a blank mw:p-wrap is dropped; other blank elements
			// are still serialized.
			$allBlank = false;
		} elseif (
			$this->isA( BalanceSets::$extraLinefeedSet ) &&
			count( $this->children ) > 0 &&
			substr( $this->children[0], 0, 1 ) == "\n"
		) {
			// Double the linefeed after pre/listing/textarea
			// according to the (old) HTML5 fragment serialization
			// algorithm (see https://github.com/whatwg/html/issues/944)
			// to ensure this will round-trip.
			array_unshift( $this->children, "\n" );
		}
		$serialized = $allBlank ? '' : "{$this}";
	}
	$this->parent->children[$slot] = $serialized;
	$this->parent = 'flat'; // for assertion checking
	return $serialized;
}
/**
 * Serialize this node and all of its children to a string, as specified
 * by the HTML serialization specification.
 *
 * Attribute values are encoded with Sanitizer::encodeAttribute();
 * attribute names are emitted as-is. Elements in the empty element set
 * are written in self-closing form and must not have children.
 *
 * @return string The serialization of the BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
 */
public function __toString() {
	$encAttribs = '';
	foreach ( $this->attribs as $name => $value ) {
		$encValue = Sanitizer::encodeAttribute( $value );
		$encAttribs .= " $name=\"$encValue\"";
	}
	if ( $this->isA( BalanceSets::$emptyElementSet ) ) {
		Assert::invariant(
			count( $this->children ) === 0,
			"Empty elements shouldn't have children."
		);
		return "<{$this->localName}{$encAttribs} />";
	}
	$out = "<{$this->localName}{$encAttribs}>";
	// Children are strings or BalanceElements; interpolation serializes
	// the latter through their own __toString().
	foreach ( $this->children as $elt ) {
		$out .= "{$elt}";
	}
	$out .= "</{$this->localName}>";
	return $out;
}
553 // Utility functions on BalanceElements.
/**
 * Determine if $this represents a specific HTML tag, is a member of
 * a tag set, or is equal to another BalanceElement.
 *
 * @param BalanceElement|array|string $set The target BalanceElement
 *   (compared by identity), set (from the BalanceSets class, indexed by
 *   namespace and then tag name), or string (HTML tag name).
 * @return bool
 */
public function isA( $set ) {
	if ( $set instanceof BalanceElement ) {
		return $set === $this;
	}
	if ( is_array( $set ) ) {
		return isset( $set[$this->namespaceURI][$this->localName] );
	}
	// Anything else is taken to be an HTML element name.
	return $this->isHtml() && $set === $this->localName;
}
/**
 * Determine if this element is in the HTML namespace and has exactly
 * the given local name.
 *
 * @param string $tagName
 * @return bool
 */
public function isHtmlNamed( $tagName ) {
	return $this->isHtml() && $tagName === $this->localName;
}
/**
 * Determine if the namespace of $this is the HTML namespace.
 *
 * @return bool
 */
public function isHtml() {
	return BalanceSets::HTML_NAMESPACE === $this->namespaceURI;
}
/**
 * Determine if $this represents a MathML text integration point,
 * as defined in the HTML5 specification; that is, whether it is a
 * member of BalanceSets::$mathmlTextIntegrationPointSet.
 *
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
 */
public function isMathmlTextIntegrationPoint() {
	$integrationPoints = BalanceSets::$mathmlTextIntegrationPointSet;
	return $this->isA( $integrationPoints );
}
/**
 * Determine if $this represents an HTML integration point,
 * as defined in the HTML5 specification.
 *
 * True for members of BalanceSets::$htmlIntegrationPointSet, and for a
 * MathML `annotation-xml` element whose `encoding` attribute equals
 * `text/html` or `application/xhtml+xml`, compared case-insensitively.
 *
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
 */
public function isHtmlIntegrationPoint() {
	if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
		return true;
	}
	$isAnnotationXml =
		$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$this->localName === 'annotation-xml';
	if ( !$isAnnotationXml || !isset( $this->attribs['encoding'] ) ) {
		return false;
	}
	$encoding = $this->attribs['encoding'];
	return strcasecmp( $encoding, 'text/html' ) == 0 ||
		strcasecmp( $encoding, 'application/xhtml+xml' ) == 0;
}
/**
 * Get a string key for the Noah's Ark algorithm. The key is the
 * serialization of the namespace, the local name and the attributes
 * sorted by attribute name; it is computed once and then cached.
 *
 * @return string
 */
public function getNoahKey() {
	if ( $this->noahKey !== null ) {
		return $this->noahKey;
	}
	// Sort a copy so that attribute order does not influence the key.
	$sorted = $this->attribs;
	ksort( $sorted );
	$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $sorted ] );
	return $this->noahKey;
}
642 * The "stack of open elements" as defined in the HTML5 tree builder
643 * spec. This contains methods to ensure that content (start tags, text)
644 * are inserted at the correct place in the output string, and to
645 * flatten BalanceElements as they are closed to avoid holding onto
646 * a complete DOM tree for the document in memory.
648 * The stack defines a PHP iterator to traverse it in "reverse order",
649 * that is, the most-recently-added element is visited first in a
650 * foreach loop.
652 * @ingroup Parser
653 * @since 1.27
654 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
656 class BalanceStack implements IteratorAggregate {
658 * Backing storage for the stack.
659 * @var BalanceElement[] $elements
661 private $elements = [];
663 * Foster parent mode determines how nodes are inserted into the
664 * stack.
665 * @var bool $fosterParentMode
666 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
668 public $fosterParentMode = false;
670 * Configuration options governing flattening.
671 * @var array $config
672 * @see Balancer::__construct()
674 private $config;
676 * Reference to the current element
678 public $currentNode;
/**
 * Create a new BalanceStack with a single BalanceElement on it,
 * representing the root &lt;html&gt; node, which also becomes the
 * current node.
 *
 * @param array $config Balancer configuration; see Balancer::__construct().
 */
public function __construct( array $config ) {
	// always a root <html> element on the stack
	$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
	$this->elements[] = $root;
	$this->currentNode = $root;
	$this->config = $config;
}
/**
 * Return a string representing the output of the tree builder:
 * all the children of the root &lt;html&gt; node. Children that are
 * still elements are flattened in the process.
 *
 * @return string
 */
public function getOutput() {
	// Don't include the outer '<html>....</html>'
	$pieces = [];
	foreach ( $this->elements[0]->children as $child ) {
		$pieces[] = is_string( $child )
			? $child
			: $child->flatten( $this->config );
	}
	return implode( '', $pieces );
}
/**
 * Insert a comment at the appropriate place for inserting a node.
 * The value is wrapped in comment delimiters and handed to
 * insertText() with the comment flag set.
 *
 * @param string $value Content of the comment.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
 */
public function insertComment( $value ) {
	// Just another type of text node, except for tidy p-wrapping.
	$comment = '<!--' . $value . '-->';
	return $this->insertText( $comment, true );
}
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * In foster parent mode with a table/section/row as current node the
 * text is foster parented. In tidy-compatibility mode, non-comment text
 * directly inside an element of the p-wrap set is first wrapped in a
 * new `mw:p-wrap` element. Otherwise the text is appended to the
 * current node.
 *
 * @param string $value
 * @param bool $isComment
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value, $isComment = false ) {
	$node = $this->currentNode;
	if (
		$this->fosterParentMode &&
		$node->isA( BalanceSets::$tableSectionRowSet )
	) {
		$this->fosterParent( $value );
		return;
	}
	$needsPWrap =
		$this->config['tidyCompat'] && !$isComment &&
		$node->isA( BalanceSets::$tidyPWrapSet );
	if ( $needsPWrap ) {
		$this->insertHTMLElement( 'mw:p-wrap', [] );
		return $this->insertText( $value );
	}
	$node->appendChild( $value );
}
/**
 * Create a BalanceElement in the given namespace and insert it at the
 * appropriate place, pushing it on to the open elements stack.
 *
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element; passed on to the
 *   BalanceElement constructor, which requires an array.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$elt = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $elt );
}
/**
 * Insert an HTML element at the appropriate place, pushing it on to
 * the open elements stack. Shorthand for insertForeignElement() with
 * the HTML namespace.
 *
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element; they end up in the
 *   BalanceElement constructor, which requires an array.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$htmlNamespace = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $htmlNamespace, $tag, $attribs );
}
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack, making it the current node.
 *
 * If the current node is an `mw:p-wrap` element and $elt is not in the
 * tidy inline set, the wrapper is popped first. In foster parent mode
 * with a table/section/row as current node, $elt is foster parented;
 * otherwise it is appended to the current node.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element that was pushed on to the stack.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
	if (
		$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// The messages are constant strings on purpose: interpolating $elt
	// would run BalanceElement::__toString() (including attribute
	// encoding) twice for every inserted element, even though the
	// result is only needed when an invariant is violated.
	Assert::invariant(
		$elt->parent !== null,
		'Inserted element must be in tree'
	);
	Assert::invariant(
		$elt->parent !== 'flat',
		'Inserted element must not have been previously flattened'
	);
	array_push( $this->elements, $elt );
	$this->currentNode = $elt;
	return $elt;
}
/**
 * Determine if the stack has $tag in scope, using
 * BalanceSets::$inScopeSet as the scope boundary.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in button scope, using
 * BalanceSets::inButtonScopeSet() as the scope boundary.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in list item scope, using
 * BalanceSets::inListItemScopeSet() as the scope boundary.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in table scope, using
 * BalanceSets::$inTableScopeSet as the scope boundary.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Determine if the stack has $tag in select scope.
 *
 * Walks the stack from the current node towards the root. The search
 * succeeds at the first element matching $tag and fails at the first
 * other element that is not in BalanceSets::$inInvertedSelectScopeSet.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
 */
public function inSelectScope( $tag ) {
	// Can't use inSpecificScope to implement this, since it involves
	// *inverting* a set of tags. Implement manually.
	$transparent = BalanceSets::$inInvertedSelectScopeSet;
	foreach ( $this as $node ) {
		$found = $node->isA( $tag );
		if ( $found || !$node->isA( $transparent ) ) {
			return $found;
		}
	}
	return false;
}
/**
 * Determine if the stack has $tag in a specific scope, $set.
 *
 * Walks the stack from the current node towards the root. The search
 * succeeds at the first element matching $tag and fails at the first
 * other element matching $set.
 *
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	foreach ( $this as $node ) {
		$found = $node->isA( $tag );
		if ( $found || $node->isA( $set ) ) {
			return $found;
		}
	}
	return false;
}
/**
 * Generate implied end tags: pop the current node for as long as it is
 * a member of the (thorough) implied end tag set.
 *
 * @param string|null $butnot HTML tag name at which to stop popping,
 *   or null for no such exception.
 * @param bool $thorough True if we should generate end tags thoroughly.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	if ( $thorough ) {
		$closable = BalanceSets::$thoroughImpliedEndTagsSet;
	} else {
		$closable = BalanceSets::$impliedEndTagsSet;
	}
	while ( $this->currentNode ) {
		$node = $this->currentNode;
		$excluded = $butnot !== null && $node->isHtmlNamed( $butnot );
		if ( $excluded || !$node->isA( $closable ) ) {
			return;
		}
		$this->pop();
	}
}
/**
 * Return the adjusted current node: the fragment context if one is
 * given and only a single element is on the stack, otherwise the
 * current node.
 *
 * @param mixed $fragmentContext Returned as-is when it is truthy and
 *   the stack holds exactly one element.
 * @return mixed
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && count( $this->elements ) === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
/**
 * Return an iterator over this stack which visits the current node
 * first, and the root node last.
 *
 * @return \Iterator
 */
public function getIterator() {
	$topDown = new ReverseArrayIterator( $this->elements );
	return $topDown;
}
/**
 * Return the BalanceElement at the given position $idx, where
 * position 0 represents the root element.
 *
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$elt = $this->elements[ $idx ];
	return $elt;
}
/**
 * Replace the element at position $idx in the BalanceStack with $elt.
 * When the top of the stack is replaced, $elt also becomes the current
 * node.
 *
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
	$replaced = $this->elements[$idx];
	Assert::precondition(
		$replaced->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$top = count( $this->elements ) - 1;
	if ( $idx === $top ) {
		$this->currentNode = $elt;
	}
}
/**
 * Return the position of the given BalanceElement, set, or
 * HTML tag name string in the BalanceStack. The stack is searched
 * from the current node towards the root, so the match closest to the
 * top wins.
 *
 * @param BalanceElement|array|string $tag
 * @return int Position of the match, or -1 if nothing matches.
 */
public function indexOf( $tag ) {
	$i = count( $this->elements );
	while ( $i-- > 0 ) {
		if ( $this->elements[$i]->isA( $tag ) ) {
			return $i;
		}
	}
	return -1;
}
/**
 * Return the number of elements currently in the BalanceStack.
 *
 * @return int
 */
public function length() {
	$depth = count( $this->elements );
	return $depth;
}
/**
 * Remove the current node from the BalanceStack, flattening it
 * in the process. The new top of the stack (or null when the stack
 * becomes empty) becomes the current node. `mw:p-wrap` elements are
 * removed from the stack without being flattened.
 */
public function pop() {
	$popped = array_pop( $this->elements );
	$remaining = count( $this->elements );
	$this->currentNode = $remaining
		? $this->elements[ $remaining - 1 ]
		: null;
	if ( $popped->isHtmlNamed( 'mw:p-wrap' ) ) {
		return;
	}
	$popped->flatten( $this->config );
}
/**
 * Remove all nodes up to and including position $idx from the
 * BalanceStack, flattening them in the process. Afterwards the stack
 * holds $idx elements.
 *
 * @param int $idx
 */
public function popTo( $idx ) {
	$excess = count( $this->elements ) - $idx;
	while ( $excess-- > 0 ) {
		$this->pop();
	}
}
/**
 * Pop elements off the stack up to and including the first
 * element with the specified HTML tagname (or matching the given
 * set). If nothing matches, popping continues until there is no
 * current node left.
 *
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
	while ( $this->currentNode ) {
		$matched = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $matched ) {
			break;
		}
	}
}
/**
 * Pop elements off the stack *not including* the first element
 * in the specified set. The bottom element of the stack is never
 * popped.
 *
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
	// Note that we don't loop to 0. Never pop the <html> elt off.
	$remaining = count( $this->elements );
	while ( $remaining > 1 && !$this->currentNode->isA( $set ) ) {
		$this->pop();
		$remaining--;
	}
}
/**
 * Remove the given $elt from the BalanceStack, optionally
 * flattening it in the process. If $elt was the top of the stack, the
 * element below it becomes the current node.
 *
 * @param BalanceElement $elt The element to remove.
 * @param bool $flatten Whether to flatten the removed element.
 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	Assert::parameter(
		$elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);
	$position = array_search( $elt, $this->elements, true );
	Assert::parameter( $position !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $position, 1 );
	$wasTop = ( $position === count( $this->elements ) );
	if ( $wasTop ) {
		$this->currentNode = $this->elements[$position - 1];
	}
	if ( $flatten ) {
		// Serialize $elt into its parent right away. Without this it
		// would still be serialized when the parent is, but the memory
		// for its tree of objects would be held a little longer.
		$elt->flatten( $this->config );
	}
	$stillOnStack = array_search( $elt, $this->elements, true );
	Assert::postcondition(
		$stillOnStack === false,
		'$elt should no longer be in open elements stack'
	);
}
/**
 * Find $a in the BalanceStack and insert $b after it.
 *
 * When $a is the top of the stack, $b becomes the new current node.
 *
 * @param BalanceElement $a Element that must already be on the stack.
 * @param BalanceElement $b Element to insert directly after $a.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	$idx = $this->indexOf( $a );
	// indexOf() reports "not found" as -1 (never false), so test the
	// sign; otherwise a missing $a would silently splice $b in at the
	// bottom of the stack.
	Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		array_push( $this->elements, $b );
		$this->currentNode = $b;
	} else {
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1082 // Fostering and adoption.
/**
 * Foster parent the given $elt in the stack of open elements.
 *
 * The insertion point is the last `template` on the stack when it sits
 * above the last `table`; otherwise just before the last `table` inside
 * that table's parent; otherwise the end of the bottom-most stack entry.
 *
 * @param BalanceElement|string $elt Element or text to foster.
 * @return BalanceElement|string The inserted $elt, or (in tidyCompat
 *  mode) an existing `mw:p-wrap` sibling that is re-used instead.
 *
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
private function fosterParent( $elt ) {
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );
	$sibling = null;

	if ( $templateIdx >= 0 && $templateIdx > $tableIdx ) {
		// A template above any table: foster into the template.
		$target = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		// Foster into the table's parent, just before the table.
		$sibling = $this->elements[$tableIdx];
		$target = $sibling->parent;
		// Assume all tables have parents, since we're not running scripts!
		Assert::invariant(
			$target !== null, "All tables should have parents"
		);
	} else {
		$target = $this->elements[0]; // the `html` element.
	}

	if ( $this->config['tidyCompat'] ) {
		if ( is_string( $elt ) ) {
			// Fostered text inside a p-wrapping container goes through
			// a fresh p-wrapper instead of being inserted directly.
			if ( $target->isA( BalanceSets::$tidyPWrapSet ) ) {
				$this->insertHTMLElement( 'mw:p-wrap', [] );
				$this->insertText( $elt );
				return $elt;
			}
		} elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
			// Fostered p-wrapper: merge with a p-wrapper that already
			// sits immediately before the insertion point, if any.
			if ( $sibling ) {
				$pos = array_search( $sibling, $target->children, true );
			} else {
				$pos = count( $target->children );
			}
			$preceding = $pos > 0 ? $target->children[$pos - 1] : '';
			if (
				$preceding instanceof BalanceElement &&
				$preceding->isHtmlNamed( 'mw:p-wrap' )
			) {
				return $preceding; // Re-use existing p-wrapper.
			}
		}
	}

	if ( $sibling ) {
		$target->insertBefore( $sibling, $elt );
	} else {
		$target->appendChild( $elt );
	}
	return $elt;
}
/**
 * Run the "adoption agency algorithm" (AAA) for the given subject
 * tag name.
 * @param string $tag The subject tag name.
 * @param BalanceActiveFormattingElements $afe The current
 *  active formatting elements list.
 * @return bool True if the adoption agency algorithm "did something",
 *  false if more processing is required by the caller.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
 */
public function adoptionAgency( $tag, $afe ) {
	// Fast path: the current node is the subject element and is not an
	// active formatting element, so simply pop it.
	$current = $this->currentNode;
	if ( $current->isHtmlNamed( $tag ) && !$afe->isInList( $current ) ) {
		$this->pop();
		return true; // no more handling required
	}

	// Outer loop: the spec caps this at eight iterations.
	for ( $round = 0; $round < 8; $round++ ) {
		// The formatting element is the last entry for $tag in the
		// active formatting list after the last marker (if any).
		$formatting = $afe->findElementByTag( $tag );

		// No such entry: let the caller apply its "any other end tag"
		// handling.
		if ( !$formatting ) {
			return false; // false means handle by the default case
		}

		// Listed but no longer open: parse error; drop the stale list
		// entry and stop.
		$formattingIdx = $this->indexOf( $formatting );
		if ( $formattingIdx < 0 ) {
			$afe->remove( $formatting );
			return true; // true means no more handling required
		}

		// Open but not in scope: parse error; ignore the token.
		if ( !$this->inScope( $formatting ) ) {
			return true;
		}

		// The furthest block is the first "special" element found
		// when walking the stack from just past the formatting
		// element toward the current node. There may be none.
		$block = null;
		$blockIdx = -1;
		$depth = $this->length();
		for ( $j = $formattingIdx + 1; $j < $depth; $j++ ) {
			$candidate = $this->node( $j );
			if ( $candidate->isA( BalanceSets::$specialSet ) ) {
				$block = $candidate;
				$blockIdx = $j;
				break;
			}
		}

		// Without a furthest block, pop everything from the current
		// node up to and including the formatting element, remove
		// it from the active formatting list, and finish.
		if ( !$block ) {
			$this->popTag( $formatting );
			$afe->remove( $formatting );
			return true;
		}

		// The common ancestor is the stack entry directly before the
		// formatting element.
		$commonAncestor = $this->node( $formattingIdx - 1 );

		// Drop a placeholder right after the formatting element in the
		// active formatting list, to remember where its replacement
		// must eventually go.
		$bookmark = new BalanceElement( '[bookmark]', '[bookmark]', [] );
		$afe->insertAfter( $formatting, $bookmark );

		// Both the cursor and the "previous" node start at the
		// furthest block.
		$cursor = $block;
		$previous = $block;
		$cursorIdx = $blockIdx;

		// Inner loop: walk from the furthest block back to the
		// formatting element, one stack entry at a time.
		for ( $step = 1; ; $step++ ) {
			// Step to the stack entry before the cursor. Because removed
			// entries shift later ones down, decrementing the index
			// also works after a removal.
			$cursor = $this->node( --$cursorIdx );

			// Reached the formatting element: inner loop is done.
			if ( $cursor === $formatting ) {
				break;
			}

			// After three steps, listed entries are evicted from the
			// active formatting list.
			$listed = $afe->isInList( $cursor );
			if ( $listed && $step > 3 ) {
				$afe->remove( $cursor );
				$listed = false;
			}

			// Entries that are not (or no longer) listed are taken off
			// the stack and skipped.
			if ( !$listed ) {
				// Don't flatten here, since we're about to relocate
				// parts of this node.
				$this->removeElement( $cursor, false );
				continue;
			}

			// Clone the listed entry, and substitute the clone for it
			// in both the active formatting list and the stack.
			$clone = new BalanceElement(
				$cursor->namespaceURI, $cursor->localName, $cursor->attribs );
			$afe->replace( $cursor, $clone );
			$this->replaceAt( $cursorIdx, $clone );
			$cursor = $clone;

			// On the first clone (previous node is still the furthest
			// block), move the bookmark to just after the clone.
			if ( $previous === $block ) {
				$afe->remove( $bookmark );
				$afe->insertAfter( $clone, $bookmark );
			}

			// Re-parent the previous node under the clone, then carry
			// the clone forward as the new previous node.
			$cursor->appendChild( $previous );
			$previous = $cursor;
		}

		// Attach whatever the previous node ended up being: foster
		// parent it when foster parenting is active and the common
		// ancestor is a table section/row element, otherwise append it
		// to the common ancestor.
		if (
			$this->fosterParentMode &&
			$commonAncestor->isA( BalanceSets::$tableSectionRowSet )
		) {
			$this->fosterParent( $previous );
		} else {
			$commonAncestor->appendChild( $previous );
		}

		// Build a fresh copy of the formatting element, move all of
		// the furthest block's children into it, and make it the
		// furthest block's child.
		$replacement = new BalanceElement(
			$formatting->namespaceURI, $formatting->localName, $formatting->attribs );
		$replacement->adoptChildren( $block );
		$block->appendChild( $replacement );

		// In the active formatting list: remove the formatting
		// element and put the copy where the bookmark is.
		$afe->remove( $formatting );
		$afe->replace( $bookmark, $replacement );

		// On the stack: remove the formatting element and place the
		// copy directly after the furthest block.
		$this->removeElement( $formatting );
		$this->insertAfter( $block, $replacement );
	}

	return true;
}
/**
 * Return the contents of the open elements stack as a string for
 * debugging: the local names of the elements, bottom of the stack
 * first, separated by single spaces.
 * @return string
 */
public function __toString() {
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// Separator first: the reversed (array, separator) argument order
	// is no longer accepted by PHP 8.
	return implode( ' ', $names );
}
/**
 * A pseudo-element used as a marker in the list of active formatting
 * elements. It carries only the two links that chain it into that
 * doubly-linked list.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceMarker {
	/** @var object|null The entry that follows this marker in the list, if any */
	public $nextAFE = null;

	/** @var object|null The entry that precedes this marker in the list, if any */
	public $prevAFE = null;
}
/**
 * The list of active formatting elements, which is used to handle
 * mis-nested formatting element tags in the HTML5 tree builder
 * specification.
 *
 * The list is kept as a doubly-linked list threaded through the
 * prevAFE/nextAFE properties of its entries (BalanceElement and
 * BalanceMarker objects).
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Sever every link between list entries, and drop the list's own
	 * references, when the list object is destroyed.
	 */
	public function __destruct() {
		$next = null;
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// Markers do not declare nextNoah; assigning it would create
			// a dynamic property on them.
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the end of the list and start a new, empty
	 * Noah table for the segment that follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is already in the list
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if there are already three copies of
		// this element before we encounter a marker, then drop the
		// least recently inserted one.
		$noahKey = $elt->getNoahKey();
		$table = $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( isset( $table[$noahKey] ) ) {
			$count = 1;
			$head = $table[$noahKey];
			for ( $member = $head; $member->nextNoah; $member = $member->nextNoah ) {
				$count++;
			}
			if ( $count >= 3 ) {
				$this->remove( $head );
			}
		}
		// Record $elt at the end of its bucket (creating the bucket if
		// it is the first member).
		$this->addToNoahList( $elt );

		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing <a> "in body mode".
	 * @param string $tag Local name to look for.
	 * @return BalanceElement|null The matching element, or null if there
	 *  is none after the last marker.
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * An element counts as listed when it is the head or has a
	 * predecessor link.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing <a> in body mode.
	 *
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is not in the list
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the end of its bucket in the top-most Noah table,
	 * creating the bucket when $elt is its first member.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its bucket in the top-most Noah table.
	 *
	 * NOTE(review): only the top-most table is consulted, so this
	 * presumes $elt belongs to the segment after the last marker --
	 * confirm against callers.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			// $elt is the first member: promote its successor, or drop
			// the bucket when it was the only member.
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
		} else {
			do {
				$prevNoahElt = $noahElt;
				$noahElt = $prevNoahElt->nextNoah;
				if ( $noahElt === $elt ) {
					// Found it, unlink
					$prevNoahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					break;
				}
			} while ( $noahElt );
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 *
	 * Starting after the last marker or still-open entry, each list
	 * entry is re-created on the stack via insertHTMLElement() and the
	 * list entry is replaced by the newly created element.
	 *
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundIt = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundIt = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundIt ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging.
	 * One line per entry; inconsistent links are called out inline.
	 * @return string
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1745 * An implementation of the tree building portion of the HTML5 parsing
1746 * spec.
1748 * This is used to balance and tidy output so that the result can
1749 * always be cleanly serialized/deserialized by an HTML5 parser. It
1750 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1751 * a number of constraints which are not enforced by the HTML5 parsing
1752 * process. But the result will be free of gross errors: misnested or
1753 * unclosed tags, for example, and will be unchanged by spec-compliant
1754 * parsing followed by serialization.
1756 * The tree building stage is structured as a state machine.
1757 * When comparing the implementation to
1758 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1759 * note that each state is implemented as a function with a
1760 * name ending in `Mode` (because the HTML spec refers to them
1761 * as insertion modes). The current insertion mode is held by
1762 * the $parseMode property.
1764 * The following simplifications have been made:
1765 * - We handle body content only (ie, we start `in body`.)
1766 * - The document is never in "quirks mode".
1767 * - All occurrences of < and > have been entity escaped, so we
1768 * can parse tags by simply splitting on those two characters.
1769 * (This also simplifies the handling of < inside <textarea>.)
1770 * The character < must not appear inside comments.
1771 * Similarly, all attributes have been "cleaned" and are double-quoted
1772 * and escaped.
1773 * - All null characters are assumed to have been removed.
1774 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1775 * <frame>, <plaintext>, <xmp>, <iframe>,
1776 * <noembed>, <noscript>, <script>, <title>. As a result,
1777 * further simplifications can be made:
1778 * - `frameset-ok` is not tracked.
1779 * - `head element pointer` is not tracked (but presumed non-null)
1780 * - Tokenizer has only a single mode. (<textarea> wants RCDATA and
1781 * <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
1783 * We generally mark places where we omit cases from the spec due to
1784 * disallowed elements with a comment: `// OMITTED: <element-name>`.
1786 * The HTML spec keeps a flag during the parsing process to track
1787 * whether or not a "parse error" has been encountered. We don't
1788 * bother to track that flag, we just implement the error-handling
1789 * process as specified.
1791 * @ingroup Parser
1792 * @since 1.27
1793 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1795 class Balancer {
1796 private $parseMode;
1797 /** @var \Iterator */
1798 private $bitsIterator;
1799 private $allowedHtmlElements;
1800 /** @var BalanceActiveFormattingElements */
1801 private $afe;
1802 /** @var BalanceStack */
1803 private $stack;
1804 private $strict;
1805 private $allowComments;
1806 private $config;
1808 private $textIntegrationMode;
1809 private $pendingTableText;
1810 private $originalInsertionMode;
1811 private $fragmentContext;
1812 private $formElementPointer;
1813 private $ignoreLinefeed;
1814 private $inRCDATA;
1815 private $inRAWTEXT;
1817 /** @var callable|null */
1818 private $processingCallback;
1819 /** @var array */
1820 private $processingArgs;
1823 * Valid HTML5 comments.
1824 * Regex borrowed from Tim Starling's "remex-html" project.
1826 const VALID_COMMENT_REGEX = "~ !--
1827 ( # 1. Comment match detector
1828 > | -> | # Invalid short close
1829 ( # 2. Comment contents
1831 (?! --> )
1832 (?! --!> )
1833 (?! --! \z )
1834 (?! -- \z )
1835 (?! - \z )
1839 ( # 3. Comment close
1840 --> | # Normal close
1841 --!> | # Comment end bang
1842 ( # 4. Indicate matches requiring EOF
1843 --! | # EOF in comment end bang state
1844 -- | # EOF in comment end state
1845 - | # EOF in comment end dash state
1846 (?#nothing) # EOF in comment state
1850 ([^<]*) \z # 5. Non-tag text after the comment
1851 ~xs";
/**
 * Create a new Balancer.
 * @param array $config Balancer configuration.  Includes:
 *     'strict' : boolean, defaults to false.
 *         When true, enforces syntactic constraints on input:
 *         all non-tag '<' must be escaped, all attributes must be
 *         separated by a single space and double-quoted.  This is
 *         consistent with the output of the Sanitizer.
 *     'allowedHtmlElements' : array, defaults to null.
 *         When present, the keys of this associative array give
 *         the acceptable HTML tag names.  When not present, no
 *         tag sanitization is done.
 *     'tidyCompat' : boolean, defaults to false.
 *         When true, the serialization algorithm is tweaked to
 *         provide historical compatibility with the old "tidy"
 *         program: <p>-wrapping is done to the children of
 *         <body> and <blockquote> elements, and empty elements
 *         are removed.  The <pre>/<listing>/<textarea> serialization
 *         is also tweaked to allow lossless round trips.
 *         (See: https://github.com/whatwg/html/issues/944)
 *     'allowComments': boolean, defaults to true.
 *         When true, allows HTML comments in the input.
 *         The Sanitizer generally strips all comments, so if you
 *         are running on sanitized output you can set this to
 *         false to get a bit more performance.
 * @throws ParameterAssertionException if 'allowedHtmlElements' names
 *  an element the balancer does not support.
 */
public function __construct( array $config = [] ) {
	// Caller-supplied keys win; anything missing gets its default.
	$this->config = $config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
		'allowComments' => true,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->allowComments = $config['allowComments'];
	if ( $this->allowedHtmlElements !== null ) {
		// Sanity check: no allowed tag name may be one of the
		// unsupported elements. Only the keys matter here.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the reversed argument order is no
			// longer accepted by PHP 8.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
/**
 * Return a balanced HTML string for the HTML fragment given by $text,
 * subject to the caveats listed in the class description.  The result
 * will typically be idempotent -- that is, rebalancing the output
 * would result in no change.
 *
 * @param string $text The markup to be balanced
 * @param callable $processingCallback Callback to do any variable or
 *   parameter replacements in HTML attributes values
 * @param array|bool $processingArgs Arguments for the processing callback
 * @return string The balanced markup
 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	// Fresh per-run parser state.
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack( $this->config );
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	$this->inRAWTEXT = false;
	$this->inRCDATA = false;
	$this->ignoreLinefeed = false;
	$this->textIntegrationMode = false;

	// The stack is constructed with an <html> element already on it.
	// Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// Point the form element pointer at the nearest <form> among the
	// context element and its ancestors, if there is one.
	$this->formElementPointer = null;
	$ancestor = $this->fragmentContext;
	while ( $ancestor != null ) {
		if ( $ancestor->isHtmlNamed( 'form' ) ) {
			$this->formElementPointer = $ancestor;
			break;
		}
		$ancestor = $ancestor->parent;
	}

	// The piece before the first '<' is always text, never a tag.
	$leadingText = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $leadingText ) );

	// Every remaining piece starts with a tag.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$balanced = $this->stack->getOutput();

	// Free memory before returning.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	$this->formElementPointer = null;
	return $balanced;
}
/**
 * Pass a token to the tree builder.  The $token is a type string:
 * "tag", "endtag", "text" and "eof" are handled here, and "comment"
 * is handled by the foreign-content path.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes, passed through to the handler.
 * @param bool $selfClose Passed through to the handler.
 * @return bool False when an unsupported tag was dropped, true when
 *  an empty text token was skipped; otherwise whatever the selected
 *  handler returns.
 */
private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
	// Don't actually inject the empty string as a text token.
	if ( $token === 'text' && $value === '' ) {
		return true;
	}
	// validate tags against $unsupportedSet
	if (
		( $token === 'tag' || $token === 'endtag' ) &&
		isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] )
	) {
		// As described in "simplifications" above, these tags are
		// not supported in the balancer.
		Assert::invariant(
			!$this->strict,
			"Unsupported $token <$value> found."
		);
		return false;
	}

	// Support pre/listing/textarea by suppressing initial linefeed.
	// The flag is one-shot: it is cleared by whatever token comes next.
	if ( $this->ignoreLinefeed ) {
		$this->ignoreLinefeed = false;
		if ( $token === 'text' && $value[0] === "\n" ) {
			if ( $value === "\n" ) {
				// Nothing would be left, don't inject the empty string.
				return true;
			}
			$value = substr( $value, 1 );
		}
	}

	// Some hoops we have to jump through
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

	// The spec calls this the "tree construction dispatcher": decide
	// whether the current insertion mode or the foreign-content rules
	// should process the token.
	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useHtmlRules = true;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		$useHtmlRules = $token === 'text' || (
			$token === 'tag' &&
			$value !== 'mglyph' && $value !== 'malignmark'
		);
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$token === 'tag' && $value === 'svg'
	) {
		$useHtmlRules = true;
	} else {
		$useHtmlRules = $adjusted->isHtmlIntegrationPoint() &&
			( $token === 'tag' || $token === 'text' );
	}

	if ( !$useHtmlRules ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
	}
	// The insertion mode is stored as the name of the handling method.
	$handler = $this->parseMode;
	return $this->$handler( $token, $value, $attribs, $selfClose );
}
/**
 * Handle a token with the rules for parsing tokens in foreign content,
 * i.e. when insertToken() decided that the adjusted current node is not
 * handled by the HTML rules.
 *
 * @param string $token One of "text", "comment", "tag" or "endtag".
 * @param string $value Text/comment contents, or the tag name.
 * @param array|null $attribs Attributes accompanying a "tag" token.
 * @param bool $selfClose Whether a "tag" token was written self-closed.
 * @return bool|null True when the token was consumed here; otherwise the
 *   result of reprocessing it with the HTML rules. Null is returned
 *   implicitly for any token type not listed above.
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'font':
			// Per the HTML spec, <font> only breaks out of foreign content
			// when it carries one of the HTML presentational attributes;
			// a plain <font> is an ordinary foreign element.
			if (
				!isset( $attribs['color'] ) &&
				!isset( $attribs['face'] ) &&
				!isset( $attribs['size'] )
			) {
				break;
			}
			// otherwise, fall through
		case 'b':
		case 'big':
		case 'blockquote':
		case 'body':
		case 'br':
		case 'center':
		case 'code':
		case 'dd':
		case 'div':
		case 'dl':
		case 'dt':
		case 'em':
		case 'embed':
		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
		case 'head':
		case 'hr':
		case 'i':
		case 'img':
		case 'li':
		case 'listing':
		case 'menu':
		case 'meta':
		case 'nobr':
		case 'ol':
		case 'p':
		case 'pre':
		case 'ruby':
		case 's':
		case 'small':
		case 'span':
		case 'strong':
		case 'strike':
		case 'sub':
		case 'sup':
		case 'table':
		case 'tt':
		case 'u':
		case 'ul':
		case 'var':
			if ( $this->fragmentContext ) {
				// When parsing a fragment, treat as "any other start tag".
				break;
			}
			// Pop foreign elements until we are back at an integration
			// point or an HTML element, then reprocess the tag as HTML.
			while ( true ) {
				$this->stack->pop();
				$node = $this->stack->currentNode;
				if (
					$node->isMathmlTextIntegrationPoint() ||
					$node->isHtmlIntegrationPoint() ||
					$node->isHtml()
				) {
					break;
				}
			}
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		}
		// "Any other start tag"
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfClose ) {
			$this->stack->pop();
		}
		return true;
	} elseif ( $token === 'endtag' ) {
		$first = true;
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfClose );
			} elseif ( $i === 0 ) {
				// Reached the bottom of the stack without a match: ignore.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}
}
/**
 * Grab the next "token" from $bitsIterator and feed it to insertToken().
 * The bit is treated as a comment, as an open/close tag followed by
 * text, or as plain text, depending on the current tokenizer emulation
 * state and (when an allowed-element list is configured) on whether the
 * Sanitizer approves of the tag.
 */
private function advance() {
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];
	// Handle comments.  These won't be generated by mediawiki (they
	// are stripped in the Sanitizer) but may be generated by extensions.
	if (
		$this->allowComments &&
		!( $this->inRCDATA || $this->inRAWTEXT ) &&
		preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
		// verify EOF condition where necessary
		( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
	) {
		$contents = $regs[2][0];
		$rest = $regs[5][0];
		$this->insertToken( 'comment', $contents );
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
		return;
	}
	// $slash: Does the current element start with a '/'?
	// $t: Current element name
	// $attribStr: String between element name and >
	// $brace: Ending '>' or '/>'
	// $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			// Verify that attributes are all properly double-quoted
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}
	$goodTag = $t;
	if ( $this->inRCDATA ) {
		if ( $slash && $t === $this->inRCDATA ) {
			$this->inRCDATA = false;
		} else {
			// No tags allowed; this emulates the "rcdata" tokenizer mode.
			$goodTag = false;
		}
	}
	if ( $this->inRAWTEXT ) {
		if ( $slash && $t === $this->inRAWTEXT ) {
			$this->inRAWTEXT = false;
		} else {
			// No tags allowed, no entity-escaping done.
			$goodTag = false;
		}
	}
	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		// Build on the existing verdict: a tag already rejected by the
		// RCDATA/RAWTEXT emulation above must stay rejected even if its
		// name is in the allowed list.
		$goodTag = $goodTag && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodTag ) {
		if ( is_callable( $this->processingCallback ) ) {
			// The callback receives the attribute string by reference.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodTag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodTag ) {
		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		if ( $sanitize ) {
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		}
		// insertToken() returns false for tags the balancer rejects.
		$goodTag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}
	if ( $goodTag ) {
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} elseif ( $this->inRAWTEXT ) {
		// Raw text: re-emit the bit verbatim, without escaping.
		$this->insertToken( 'text', "<$x" );
	} else {
		// bad tag; serialize entire thing as text.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
/**
 * Change the current parse mode.
 *
 * @param string $mode Name of the handler method for the new mode; must
 *   end in "Mode".
 * @return string The parse mode that was active before the switch.
 */
private function switchMode( $mode ) {
	$hasModeSuffix = ( substr( $mode, -4 ) === 'Mode' );
	Assert::parameter( $hasModeSuffix, '$mode', 'should end in Mode' );

	$previous = $this->parseMode;
	$this->parseMode = $mode;
	return $previous;
}
/**
 * Switch to another parse mode, then run the given token through
 * insertToken() again so that it is handled under the new mode.
 *
 * @param string $mode Name of the new mode (see switchMode()).
 * @param string $token
 * @param string|null $value
 * @param array|null $attribs
 * @param bool $selfClose
 * @return bool|null Result of insertToken() for the reprocessed token.
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
	$this->switchMode( $mode );
	$result = $this->insertToken( $token, $value, $attribs, $selfClose );
	return $result;
}
/**
 * Pick the parse mode that matches the current contents of the stack of
 * open elements, walking the stack in iteration order until an element
 * that determines the mode is found.
 */
private function resetInsertionMode() {
	// HTML elements that map directly onto a parse mode.
	static $modeFor = [
		'tr' => 'inRowMode',
		'tbody' => 'inTableBodyMode',
		'tfoot' => 'inTableBodyMode',
		'thead' => 'inTableBodyMode',
		'caption' => 'inCaptionMode',
		'colgroup' => 'inColumnGroupMode',
		'table' => 'inTableMode',
		'body' => 'inBodyMode',
		// OMITTED: <frameset>
		// OMITTED: <html>
		// OMITTED: <head>
	];

	$last = false;
	foreach ( $this->stack as $i => $node ) {
		if ( $i === 0 ) {
			$last = true;
			if ( $this->fragmentContext ) {
				// For fragments, the context element stands in for
				// the final stack entry.
				$node = $this->fragmentContext;
			}
		}
		if ( $node->isHtml() ) {
			$name = $node->localName;
			if ( isset( $modeFor[$name] ) ) {
				$this->switchMode( $modeFor[$name] );
				return;
			}
			if ( $name === 'select' ) {
				// Look for an enclosing <table>, stopping at a <template>.
				// NOTE(review): the index arithmetic depends on how the
				// stack iterator keys its entries, which is not visible
				// here -- confirm it really visits the ancestors.
				$stackLength = $this->stack->length();
				for ( $j = $i + 1; $j < $stackLength - 1; $j++ ) {
					$ancestor = $this->stack->node( $stackLength - $j - 1 );
					if ( $ancestor->isHtmlNamed( 'template' ) ) {
						break;
					}
					if ( $ancestor->isHtmlNamed( 'table' ) ) {
						$this->switchMode( 'inSelectInTableMode' );
						return;
					}
				}
				$this->switchMode( 'inSelectMode' );
				return;
			}
			if ( $name === 'template' ) {
				// Use the most recently pushed template insertion mode.
				$this->switchMode(
					array_slice( $this->templateInsertionModes, -1 )[0]
				);
				return;
			}
			// OMITTED: <head>
			if ( !$last && $node->isA( BalanceSets::$tableCellSet ) ) {
				$this->switchMode( 'inCellMode' );
				return;
			}
		}
		if ( $last ) {
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
/**
 * Finish parsing. Of the spec's "stop parsing" steps only step 2 ("pop
 * all the nodes off the stack of open elements") applies here, and even
 * then the top-most <html> element is kept on the stack.
 */
private function stopParsing() {
	// Drop the list of active formatting elements before popping:
	// otherwise the element objects would stay live during
	// serialization, potentially using O(N^2) memory. Popping the stack
	// never causes the active formatting elements to be reconstructed.
	$this->afe = null;

	$this->stack->popTo( 1 );
}
/**
 * Insert an HTML element whose contents are raw text and switch to
 * inTextMode until it is closed, remembering the mode to return to.
 *
 * @param string $value Tag name of the raw text element.
 * @param array|null $attribs Attributes of the element.
 * @return bool Always true.
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	// advance() consults inRAWTEXT to stop recognizing tags other than
	// the matching end tag.
	$this->inRAWTEXT = $value;
	$previousMode = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $previousMode;
	return true;
}
/**
 * Parse mode used for the contents of a raw text element.
 *
 * @param string $token
 * @param string|null $value
 * @param array|null $attribs
 * @param bool $selfClose
 * @return bool|null True, or the result of reprocessing an "eof" token in
 *   the original insertion mode.
 */
private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	switch ( $token ) {
	case 'text':
		$this->stack->insertText( $value );
		return true;
	case 'eof':
		// Close the element, then let the original mode see the EOF.
		$this->stack->pop();
		return $this->switchModeAndReprocess(
			$this->originalInsertionMode, $token, $value, $attribs, $selfClose
		);
	case 'endtag':
		$this->stack->pop();
		$this->switchMode( $this->originalInsertionMode );
		return true;
	default:
		// Any other token type is dropped.
		return true;
	}
}
/**
 * Token handler with the "in head" rules. Within the visible code it is
 * reached by delegation from other modes for tags such as <meta>,
 * <style> and <template>.
 *
 * @param string $token
 * @param string|null $value
 * @param array|null $attribs
 * @param bool $selfClose
 * @return bool|null True when the token was handled or ignored; otherwise
 *   the result of reprocessing it after a synthetic </head>.
 */
private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'text' ) {
		// Leading whitespace is inserted as-is; anything that remains
		// is handled at the bottom of the function.
		$leading = strspn( $value, "\x09\x0A\x0C\x0D\x20" );
		if ( $leading > 0 ) {
			$this->stack->insertText( substr( $value, 0, $leading ) );
			$value = substr( $value, $leading );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		// OMITTED: <html>
		case 'meta':
			// OMITTED: in a full HTML parser, this might change the encoding.
			// falls through
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
			// Void elements: insert and close immediately.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		// OMITTED: <title>
		// OMITTED: <noscript>
		// OMITTED: <script>
		case 'noframes':
		case 'style':
			return $this->parseRawText( $value, $attribs );
		case 'template':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			// OMITTED: frameset_ok
			$this->switchMode( 'inTemplateMode' );
			$this->templateInsertionModes[] = $this->parseMode;
			return true;
		// OMITTED: <head>
		}
	} elseif ( $token === 'endtag' ) {
		// OMITTED: <head>
		// OMITTED: <body>
		// OMITTED: <html>
		if ( $value === 'template' ) {
			if ( $this->stack->indexOf( $value ) < 0 ) {
				return true; // Ignore the token.
			}
			$this->stack->generateImpliedEndTags( null, true /* thorough */ );
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			return true;
		}
		if ( $value !== 'br' ) {
			// ignore any other end tag
			return true;
		}
		// </br> is handled at the bottom of the function.
	}

	// If not handled above
	$this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
	// Then redo this one
	return $this->insertToken( $token, $value, $attribs, $selfClose );
}
/**
 * Token handler with the "in body" rules.
 *
 * @param string $token One of "text", "eof", "tag", "endtag", "comment".
 * @param string|null $value Tag name, or text/comment contents.
 * @param array|null $attribs Attributes accompanying a "tag" token.
 * @param bool $selfClose Whether a "tag" token was written self-closed.
 * @return bool|null True when handled here; otherwise the result of the
 *   handler the token was delegated to or reprocessed by.
 */
private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertText( $value );
		return true;
	}

	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'eof' ) {
		if ( !empty( $this->templateInsertionModes ) ) {
			return $this->inTemplateMode( $token, $value, $attribs, $selfClose );
		}
		$this->stopParsing();
		return true;
	}

	if ( $token === 'tag' ) {
		switch ( $value ) {
		// OMITTED: <html>
		// OMITTED: <script>
		// OMITTED: <title>
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
		case 'meta':
		case 'noframes':
		case 'style':
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );

		// OMITTED: <body>
		// OMITTED: <frameset>

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'main':
		case 'nav':
		case 'ol':
		case 'p':
		case 'section':
		case 'summary':
		case 'ul':
			// Block-level elements implicitly close an open <p>.
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'menu':
		case 'hr':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			if ( $value === 'hr' ) {
				// <hr> is void: close it right away.
				$this->stack->pop();
			}
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			// Headings do not nest directly inside one another.
			if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'pre':
		case 'listing':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			// insertToken() will drop a linefeed that comes next.
			$this->ignoreLinefeed = true;
			// OMITTED: frameset_ok
			return true;

		case 'form':
			if (
				$this->formElementPointer &&
				$this->stack->indexOf( 'template' ) < 0
			) {
				return true; // in a form, not in a template.
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$formElement = $this->stack->insertHTMLElement( $value, $attribs );
			if ( $this->stack->indexOf( 'template' ) < 0 ) {
				$this->formElementPointer = $formElement;
			}
			return true;

		case 'li':
		case 'dd':
		case 'dt':
			// OMITTED: frameset_ok
			// An <li> closes an open <li>; a <dd>/<dt> closes an open
			// <dd> or <dt>. The search stops at the first "special"
			// element other than address/div/p.
			$closes = ( $value === 'li' ) ? [ 'li' ] : [ 'dd', 'dt' ];
			foreach ( $this->stack as $node ) {
				foreach ( $closes as $name ) {
					if ( $node->isHtmlNamed( $name ) ) {
						$this->inBodyMode( 'endtag', $name );
						break 2;
					}
				}
				if (
					$node->isA( BalanceSets::$specialSet ) &&
					!$node->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		// OMITTED: <plaintext>

		case 'button':
			if ( $this->stack->inScope( 'button' ) ) {
				// Close the open button, then reprocess this one.
				$this->inBodyMode( 'endtag', 'button' );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'a':
			$openAnchor = $this->afe->findElementByTag( 'a' );
			if ( $openAnchor ) {
				$this->inBodyMode( 'endtag', 'a' );
				if ( $this->afe->isInList( $openAnchor ) ) {
					$this->afe->remove( $openAnchor );
					// Don't flatten here, since when we fall
					// through below we might foster parent
					// the new <a> tag inside this one.
					$this->stack->removeElement( $openAnchor, false );
				}
			}
			// Falls through
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			$this->afe->reconstruct( $this->stack );
			$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
			return true;

		case 'nobr':
			$this->afe->reconstruct( $this->stack );
			if ( $this->stack->inScope( 'nobr' ) ) {
				$this->inBodyMode( 'endtag', 'nobr' );
				$this->afe->reconstruct( $this->stack );
			}
			$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
			return true;

		case 'applet':
		case 'marquee':
		case 'object':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			// OMITTED: frameset_ok
			return true;

		case 'table':
			// The document is never in "quirks mode"; see simplifications
			// above.
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			// OMITTED: frameset_ok
			$this->switchMode( 'inTableMode' );
			return true;

		case 'area':
		case 'br':
		case 'embed':
		case 'img':
		case 'keygen':
		case 'wbr':
		case 'input':
			// Void elements. frameset_ok is OMITTED, hence there is no
			// need to examine the "type" attribute of <input>.
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'param':
		case 'source':
		case 'track':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'image':
			// warts!
			return $this->inBodyMode( $token, 'img', $attribs, $selfClose );

		case 'textarea':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->ignoreLinefeed = true;
			$this->inRCDATA = $value; // emulate rcdata tokenizer mode
			// OMITTED: frameset_ok
			return true;

		// OMITTED: <xmp>
		// OMITTED: <iframe>
		// OMITTED: <noembed>
		// OMITTED: <noscript>

		case 'select':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$tableModes = [
				'inTableMode',
				'inCaptionMode',
				'inTableBodyMode',
				'inRowMode',
				'inCellMode',
			];
			$this->switchMode(
				in_array( $this->parseMode, $tableModes, true )
					? 'inSelectInTableMode'
					: 'inSelectMode'
			);
			return true;

		case 'optgroup':
		case 'option':
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->inBodyMode( 'endtag', 'option' );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'menuitem':
			if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
				$this->stack->pop();
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rb':
		case 'rtc':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rp':
		case 'rt':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags( 'rtc' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'math':
		case 'svg':
			$this->afe->reconstruct( $this->stack );
			// We skip the spec's "adjust MathML/SVG attributes" and
			// "adjust foreign attributes" steps, since the browser will
			// do this later when it parses the output and it doesn't
			// affect balancing.
			$namespace = ( $value === 'math' )
				? BalanceSets::MATHML_NAMESPACE
				: BalanceSets::SVG_NAMESPACE;
			$this->stack->insertForeignElement( $namespace, $value, $attribs );
			if ( $selfClose ) {
				// emit explicit </math> or </svg> tag.
				$this->stack->pop();
			}
			return true;

		case 'caption':
		case 'col':
		case 'colgroup':
		// OMITTED: <frame>
		case 'head':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Ignore table tags if we're not inTableMode
			return true;
		}

		// Handle any other start tag here
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertHTMLElement( $value, $attribs );
		return true;
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
		// </body>,</html> are unsupported.

		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'button':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'listing':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'pre':
		case 'section':
		case 'summary':
		case 'ul':
			// Ignore if there is not a matching open tag
			if ( !$this->stack->inScope( $value ) ) {
				return true;
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			return true;

		case 'form':
			if ( $this->stack->indexOf( 'template' ) >= 0 ) {
				// Inside a template the form element pointer is not used.
				if ( !$this->stack->inScope( 'form' ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( 'form' );
				return true;
			}
			$openform = $this->formElementPointer;
			$this->formElementPointer = null;
			if ( !$openform || !$this->stack->inScope( $openform ) ) {
				return true;
			}
			$this->stack->generateImpliedEndTags();
			// Don't flatten yet if we're removing a <form> element
			// out-of-order. (eg. `<form><div></form>`)
			$flatten = ( $this->stack->currentNode === $openform );
			$this->stack->removeElement( $openform, $flatten );
			return true;

		case 'p':
			if ( !$this->stack->inButtonScope( 'p' ) ) {
				// No open <p>: synthesize one, then reprocess the </p>.
				$this->inBodyMode( 'tag', 'p', [] );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'li':
		case 'dd':
		case 'dt':
			$isOpen = ( $value === 'li' )
				? $this->stack->inListItemScope( $value )
				: $this->stack->inScope( $value );
			if ( !$isOpen ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			// Any heading end tag closes whichever heading is open.
			if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( BalanceSets::$headingSet );
			return true;

		case 'sarcasm':
			// Take a deep breath, then:
			break;

		case 'a':
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 'nobr':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
				return true; // If we did something, we're done.
			}
			break; // Go to the "any other end tag" case.

		case 'applet':
		case 'marquee':
		case 'object':
			if ( !$this->stack->inScope( $value ) ) {
				return true; // ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			return true;

		case 'br':
			// Turn </br> into <br>
			return $this->inBodyMode( 'tag', $value, [] );
		}

		// Any other end tag goes here
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtmlNamed( $value ) ) {
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTo( $i ); // including $i
				break;
			}
			if ( $node->isA( BalanceSets::$specialSet ) ) {
				return true; // ignore this close token.
			}
		}
		return true;
	}

	Assert::invariant( false, "Bad token type: $token" );
}
/**
 * Token handler with the "in table" rules.
 *
 * Tokens that are not table structure are processed with the "in body"
 * rules while foster parenting is switched on for the stack.
 *
 * @param string $token One of "text", "eof", "tag", "endtag", "comment".
 * @param string|null $value Tag name, or text/comment contents.
 * @param array|null $attribs Attributes accompanying a "tag" token.
 * @param bool $selfClose Whether a "tag" token was written self-closed.
 * @return bool|null True when handled here; otherwise the result of the
 *   handler the token was delegated to or reprocessed by.
 */
private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfClose );
		} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Collect the text in inTableTextMode; it decides what to
			// do with it once a non-text token arrives.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess( 'inTableTextMode',
				$token, $value, $attribs, $selfClose );
		}
		// fall through to default case.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
			// As for <colgroup> and the table sections below, the spec
			// first clears the stack back to a table context; otherwise
			// an element left open by foster parenting would become the
			// parent of the caption.
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->afe->insertMarker();
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCaptionMode' );
			return true;
		case 'colgroup':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inColumnGroupMode' );
			return true;
		case 'col':
			// Synthesize the missing <colgroup>, then reprocess.
			$this->inTableMode( 'tag', 'colgroup', [] );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'tbody':
		case 'tfoot':
		case 'thead':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inTableBodyMode' );
			return true;
		case 'td':
		case 'th':
		case 'tr':
			// Synthesize the missing <tbody>, then reprocess.
			$this->inTableMode( 'tag', 'tbody', [] );
			return $this->insertToken( $token, $value, $attribs, $selfClose );
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore this tag.
			}
			// Close the open table, then reprocess the new <table>.
			$this->inTableMode( 'endtag', $value );
			return $this->insertToken( $token, $value, $attribs, $selfClose );

		case 'style':
		// OMITTED: <script>
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );

		case 'input':
			if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
				break; // Handle this as "everything else"
			}
			// Hidden inputs are allowed directly inside the table.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'form':
			if (
				$this->formElementPointer ||
				$this->stack->indexOf( 'template' ) >= 0
			) {
				return true; // ignore this token
			}
			// The form is inserted and immediately closed again.
			$this->formElementPointer =
				$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->popTag( $this->formElementPointer );
			return true;
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore.
			}
			$this->stack->popTag( $value );
			$this->resetInsertionMode();
			return true;
		// OMITTED: <body>
		case 'caption':
		case 'col':
		case 'colgroup':
		// OMITTED: <html>
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			return true; // Ignore the token.
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	// This is the "anything else" case:
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfClose );
	$this->stack->fosterParentMode = false;
	return true;
}
/**
 * Token handler with the "in table text" rules: text tokens are
 * buffered, and the buffer is flushed when any other token arrives.
 *
 * @param string $token
 * @param string|null $value
 * @param array|null $attribs
 * @param bool $selfClose
 * @return bool|null True for buffered text; otherwise the result of
 *   reprocessing the token in the original insertion mode.
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' ) {
		$this->pendingTableText .= $value;
		return true;
	}

	// Non-text token: flush what has been collected so far.
	$buffered = $this->pendingTableText;
	$this->pendingTableText = '';
	if ( !preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $buffered ) ) {
		// Pending text is just whitespace.
		$this->stack->insertText( $buffered );
	} else {
		// This should match the "anything else" case inTableMode
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $buffered );
		$this->stack->fosterParentMode = false;
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfClose
	);
}
/**
 * Helper for inCaptionMode: close the open <caption>, if there is one in
 * table scope, and go back to inTableMode.
 *
 * @return bool False if no <caption> is in table scope (nothing was
 *   done); true once the caption has been closed.
 */
private function endCaption() {
	$captionOpen = $this->stack->inTableScope( 'caption' );
	if ( $captionOpen ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
/**
 * Token handler with the "in caption" rules.
 *
 * @param string $token
 * @param string|null $value
 * @param array|null $attribs
 * @param bool $selfClose
 * @return bool|null True when handled here; otherwise the result of
 *   inBodyMode().
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
	// Set when the token implies the end of the caption and must then be
	// reprocessed under the mode endCaption() switches to.
	$closesCaption = false;

	if ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			$closesCaption = true;
			break;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'caption':
			$this->endCaption();
			return true;
		case 'table':
			$closesCaption = true;
			break;
		case 'body':
		case 'col':
		case 'colgroup':
		// OMITTED: <html>
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Ignore the token
			return true;
		}
	}

	if ( $closesCaption ) {
		if ( $this->endCaption() ) {
			$this->insertToken( $token, $value, $attribs, $selfClose );
		}
		return true;
	}

	// The Anything Else case
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
/**
 * Token handler with the "in column group" rules.
 *
 * @param string $token
 * @param string|null $value
 * @param array|null $attribs
 * @param bool $selfClose
 * @return bool|null True when handled or ignored here; otherwise the
 *   result of the handler the token was delegated to or reprocessed by.
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
	switch ( $token ) {
	case 'text':
		// Leading whitespace is inserted as-is; whatever remains is
		// handled by the "anything else" rule below.
		$leading = strspn( $value, "\x09\x0A\x0C\x0D\x20" );
		if ( $leading > 0 ) {
			$this->stack->insertText( substr( $value, 0, $leading ) );
			$value = substr( $value, $leading );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
		break;

	case 'tag':
		// OMITTED: <html>
		if ( $value === 'col' ) {
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		break;

	case 'endtag':
		if ( $value === 'colgroup' ) {
			if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
				return true; // Ignore the token.
			}
			$this->stack->pop();
			$this->switchMode( 'inTableMode' );
			return true;
		}
		if ( $value === 'col' ) {
			return true; // Ignore the token.
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		break;

	case 'eof':
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );

	case 'comment':
		$this->stack->insertComment( $value );
		return true;
	}

	// Anything else: close the column group and reprocess the token,
	// unless the current node is not a <colgroup>.
	if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		return true; // Ignore the token.
	}
	$this->inColumnGroupMode( 'endtag', 'colgroup' );
	return $this->insertToken( $token, $value, $attribs, $selfClose );
}
/**
 * Helper function for inTableBodyMode: leave the current table section.
 *
 * @return bool False, with nothing modified, when none of 'tbody',
 *   'thead' or 'tfoot' is in table scope. True after the stack was
 *   cleared to BalanceSets::$tableBodyContextSet, one node was popped
 *   and the mode was switched to 'inTableMode'.
 */
private function endSection() {
	// Probe in the order tbody, thead, tfoot and stop at the first hit.
	$sectionInScope = false;
	foreach ( [ 'tbody', 'thead', 'tfoot' ] as $sectionName ) {
		if ( $this->stack->inTableScope( $sectionName ) ) {
			$sectionInScope = true;
			break;
		}
	}
	if ( !$sectionInScope ) {
		return false;
	}

	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Token handler used while the mode is 'inTableBodyMode'.
 *
 * @param string $token Token type. Only 'tag' and 'endtag' are treated
 *   specially here; every other type is forwarded to inTableMode().
 * @param string $value Tag name for 'tag' / 'endtag' tokens.
 * @param array|null $attribs Attributes given to insertHTMLElement()
 *   for <tr>; otherwise forwarded unchanged.
 * @param bool $selfClose Forwarded unchanged.
 * @return bool True when the token was handled or ignored here,
 *   otherwise the result of inTableMode().
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		if ( $value === 'tr' ) {
			$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inRowMode' );
			return true;
		}
		if ( $value === 'th' || $value === 'td' ) {
			// Act as if an attribute-less <tr> had been seen first.
			$this->inTableBodyMode( 'tag', 'tr', [] );
			$this->insertToken( $token, $value, $attribs, $selfClose );
			return true;
		}
		$sectionEnders = [ 'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead' ];
		if ( in_array( $value, $sectionEnders, true ) ) {
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'table' ) {
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		if ( in_array( $value, [ 'tbody', 'tfoot', 'thead' ], true ) ) {
			// Only close the section if this particular one is open.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->endSection();
			}
			return true;
		}
		// OMITTED: <body>
		// OMITTED: <html>
		$ignoredEndTags = [ 'caption', 'col', 'colgroup', 'td', 'th', 'tr' ];
		if ( in_array( $value, $ignoredEndTags, true ) ) {
			return true; // Ignore the token.
		}
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
/**
 * Helper function for inRowMode: leave the current table row.
 *
 * @return bool False, with nothing modified, when no 'tr' is in table
 *   scope. True after the stack was cleared to
 *   BalanceSets::$tableRowContextSet, one node was popped and the mode
 *   was switched to 'inTableBodyMode'.
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	// No row to close.
	return false;
}
/**
 * Token handler used while the mode is 'inRowMode'.
 *
 * @param string $token Token type. Only 'tag' and 'endtag' are treated
 *   specially here; every other type is forwarded to inTableMode().
 * @param string $value Tag name for 'tag' / 'endtag' tokens.
 * @param array|null $attribs Attributes given to insertHTMLElement()
 *   for <td> / <th>; otherwise forwarded unchanged.
 * @param bool $selfClose Forwarded unchanged.
 * @return bool True when the token was handled or ignored here,
 *   otherwise the result of inTableMode().
 */
private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		if ( $value === 'th' || $value === 'td' ) {
			// Open the cell, switch to 'inCellMode' and add a marker
			// to $this->afe.
			$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCellMode' );
			$this->afe->insertMarker();
			return true;
		}
		$rowEnders = [
			'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr',
		];
		if ( in_array( $value, $rowEnders, true ) ) {
			if ( $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'tr' ) {
			$this->endRow();
			return true;
		}
		if ( $value === 'table' ) {
			if ( $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		if ( in_array( $value, [ 'tbody', 'tfoot', 'thead' ], true ) ) {
			// The named section must be open AND a row must be closable
			// before the token is handed on.
			if ( $this->stack->inTableScope( $value ) && $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
		// OMITTED: <body>
		// OMITTED: <html>
		$ignoredEndTags = [ 'caption', 'col', 'colgroup', 'td', 'th' ];
		if ( in_array( $value, $ignoredEndTags, true ) ) {
			return true; // Ignore the token.
		}
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
/**
 * Helper for inCellMode: close the open table cell by feeding the
 * matching end tag to inCellMode().
 *
 * 'td' is checked before 'th'; only the first one found in table scope
 * is closed.
 *
 * @return bool True if a 'td' or 'th' was in table scope (and its end
 *   tag was dispatched), false otherwise.
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}
/**
 * Token handler used while the mode is 'inCellMode'.
 *
 * @param string $token Token type. Only 'tag' and 'endtag' are treated
 *   specially here; every other type is forwarded to inBodyMode().
 * @param string $value Tag name for 'tag' / 'endtag' tokens.
 * @param array|null $attribs Forwarded unchanged.
 * @param bool $selfClose Forwarded unchanged.
 * @return bool True when the token was handled or ignored here,
 *   otherwise the result of inBodyMode().
 */
private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'tag' ) {
		$cellEnders = [
			'caption', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr',
		];
		if ( in_array( $value, $cellEnders, true ) ) {
			if ( $this->endCell() ) {
				$this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		$endsThisCell = ( $value === 'td' || $value === 'th' );
		$endsOuterElement = in_array(
			$value, [ 'table', 'tbody', 'tfoot', 'thead', 'tr' ], true
		);
		if ( $endsThisCell || $endsOuterElement ) {
			if ( $this->stack->inTableScope( $value ) ) {
				$this->stack->generateImpliedEndTags();
				// </td> and </th> pop their own name; the outer end
				// tags pop using the table cell set instead.
				$this->stack->popTag(
					$endsThisCell ? $value : BalanceSets::$tableCellSet
				);
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
				if ( $endsOuterElement ) {
					$this->insertToken( $token, $value, $attribs, $selfClose );
				}
			}
			return true;
		}
		// OMITTED: <body>
		// OMITTED: <html>
		if ( in_array( $value, [ 'caption', 'col', 'colgroup' ], true ) ) {
			// Ignore the token.
			return true;
		}
	}

	// Anything else:
	return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
/**
 * Token handler used while the mode is 'inSelectMode'.
 *
 * @param string $token Token type: 'text', 'eof', 'tag', 'endtag' or
 *   'comment' are distinguished here; anything else is ignored.
 * @param string $value Text for 'text' / 'comment' tokens, tag name
 *   for 'tag' / 'endtag' tokens.
 * @param array|null $attribs Attributes given to insertHTMLElement()
 *   for <option> / <optgroup>; otherwise forwarded unchanged.
 * @param bool $selfClose Forwarded unchanged.
 * @return bool True when the token was handled or ignored here,
 *   otherwise the result of the handler it was forwarded to.
 */
private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'tag' ) {
		// OMITTED: <html>
		if ( $value === 'option' || $value === 'optgroup' ) {
			// A new option/optgroup closes an open option; a new
			// optgroup additionally closes an open optgroup.
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			if (
				$value === 'optgroup' &&
				$this->stack->currentNode->isHtmlNamed( 'optgroup' )
			) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;
		}
		if ( $value === 'select' ) {
			$this->inSelectMode( 'endtag', $value ); // treat it like endtag
			return true;
		}
		if ( in_array( $value, [ 'input', 'keygen', 'textarea' ], true ) ) {
			if ( $this->stack->inSelectScope( 'select' ) ) {
				// Close the select first, then hand the token on.
				$this->inSelectMode( 'endtag', 'select' );
				return $this->insertToken( $token, $value, $attribs, $selfClose );
			}
			return true; // ignore token (fragment case)
		}
		if ( $value === 'script' || $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'optgroup' ) {
			// An option sitting directly on top of an optgroup is
			// closed first.
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$depth = $this->stack->length();
				if (
					$depth >= 2 &&
					$this->stack->node( $depth - 2 )->isHtmlNamed( 'optgroup' )
				) {
					$this->stack->pop();
				}
			}
			if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
				$this->stack->pop();
			}
			return true;
		}
		if ( $value === 'option' ) {
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			return true;
		}
		if ( $value === 'select' ) {
			if ( $this->stack->inSelectScope( $value ) ) {
				$this->stack->popTag( $value );
				$this->resetInsertionMode();
			}
			// Not in select scope: fragment case, nothing to do.
			return true;
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
	}

	// anything else: just ignore the token
	return true;
}
/**
 * Token handler used while the mode is 'inSelectInTableMode'.
 *
 * Start and end tags for table structure close the select (by
 * dispatching an 'endtag' for 'select') and are then passed to
 * insertToken(); end tags do so only when their element is in table
 * scope and are ignored otherwise. Everything else is handled by
 * inSelectMode().
 *
 * @param string $token Token type.
 * @param string $value Tag name for 'tag' / 'endtag' tokens.
 * @param array|null $attribs Forwarded unchanged.
 * @param bool $selfClose Forwarded unchanged.
 * @return bool True when an end tag was ignored here, otherwise the
 *   result of insertToken() or inSelectMode().
 */
private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) {
	$tableTagNames = [
		'caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th',
	];
	$isTagToken = ( $token === 'tag' || $token === 'endtag' );

	if ( $isTagToken && in_array( $value, $tableTagNames, true ) ) {
		if ( $token === 'endtag' && !$this->stack->inTableScope( $value ) ) {
			// End tag for an element that is not open: ignore it.
			return true;
		}
		$this->inSelectInTableMode( 'endtag', 'select' );
		return $this->insertToken( $token, $value, $attribs, $selfClose );
	}

	// anything else
	return $this->inSelectMode( $token, $value, $attribs, $selfClose );
}
/**
 * Token handler used while the mode is 'inTemplateMode'.
 *
 * @param string $token Token type: 'text', 'comment', 'eof', 'tag' or
 *   'endtag'. Any other value trips an Assert::invariant() failure.
 * @param string $value Tag name for 'tag' / 'endtag' tokens.
 * @param array|null $attribs Forwarded unchanged.
 * @param bool $selfClose Forwarded unchanged.
 * @return bool True when the token was handled or ignored here,
 *   otherwise the result of the handler it was forwarded to.
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) {
	if ( $token === 'text' || $token === 'comment' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfClose );
	}

	if ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) < 0 ) {
			// No template on the stack.
			$this->stopParsing();
			return true;
		}
		// Close the template, drop its entry from
		// $this->templateInsertionModes, recompute the mode and hand
		// the EOF token to insertToken().
		$this->stack->popTag( 'template' );
		$this->afe->clearToMarker();
		array_pop( $this->templateInsertionModes );
		$this->resetInsertionMode();
		$this->insertToken( $token, $value, $attribs, $selfClose );
		return true;
	}

	if ( $token === 'tag' ) {
		// OMITTED: <script>
		// OMITTED: <title>
		$headTags = [
			'base', 'basefont', 'bgsound', 'link',
			'meta', 'noframes', 'style', 'template',
		];
		if ( in_array( $value, $headTags, true ) ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}

		// Mode to switch to for each table-related start tag; every
		// other start tag switches to 'inBodyMode'.
		$modeForTag = [
			'caption' => 'inTableMode',
			'colgroup' => 'inTableMode',
			'tbody' => 'inTableMode',
			'tfoot' => 'inTableMode',
			'thead' => 'inTableMode',
			'col' => 'inColumnGroupMode',
			'tr' => 'inTableBodyMode',
			'td' => 'inRowMode',
			'th' => 'inRowMode',
		];
		$nextMode = isset( $modeForTag[$value] ) ? $modeForTag[$value] : 'inBodyMode';
		return $this->switchModeAndReprocess(
			$nextMode, $token, $value, $attribs, $selfClose
		);
	}

	if ( $token === 'endtag' ) {
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfClose );
		}
		// Every other end tag is ignored.
		return true;
	}

	Assert::invariant( false, "Bad token type: $token" );
}