3 namespace MediaWiki\Parser
;
5 use Wikimedia\RemexHtml\Tokenizer\Attributes
;
6 use Wikimedia\RemexHtml\Tokenizer\PlainAttributes
;
7 use Wikimedia\RemexHtml\Tokenizer\RelayTokenHandler
;
8 use Wikimedia\RemexHtml\Tokenizer\TokenHandler
;
/**
 * Helper class for Sanitizer::removeSomeTags().
 *
 * Token handler that relays allowed HTML tags to the next handler and
 * re-emits every other tag as plain text taken from the original source.
 * Comments are dropped.
 */
class RemexRemoveTagHandler extends RelayTokenHandler {
	/**
	 * @var string The original HTML source string (used for fallback text
	 * when rejecting an HTML tag).
	 */
	private $source;

	/**
	 * @var array<string,true> Set of HTML tags which can be self-closed.
	 */
	private $htmlsingle;

	/**
	 * @var array<string,true> Self-closed tags which are on $htmlsingle
	 * but not on $htmlsingleonly will be emitted as an empty element.
	 */
	private $htmlsingleonly;

	/**
	 * @var array<string,true> Set of allowed HTML open/close tags.
	 */
	private $htmlelements;

	/**
	 * @var ?callable(Attributes,mixed...):Attributes Callback to mutate or
	 * sanitize attributes.
	 */
	private $attrCallback;

	/**
	 * @var array Optional extra arguments to provide to the
	 * $attrCallback, after the Attributes object. Empty when none were
	 * supplied.
	 */
	private $callbackArgs;

/**
 * Remember the source text, the tag tables and the attribute callback,
 * and chain to the next token handler.
 *
 * @param TokenHandler $nextHandler Handler to relay accepted tokens.
 * @param string $source Input source string.
 * @param array $tagData Information about allowed/rejected tags. The
 *    keys 'htmlsingle', 'htmlsingleonly' and 'htmlelements' are read.
 * @param ?callable $attrCallback Attribute handler callback.
 *    The full signature is ?callable(Attributes,mixed...):Attributes
 * @param ?array $callbackArgs Optional arguments to attribute handler.
 *    Null is stored as an empty array.
 */
public function __construct(
	TokenHandler $nextHandler,
	string $source,
	array $tagData,
	?callable $attrCallback,
	?array $callbackArgs = null
) {
	parent::__construct( $nextHandler );

	$this->source = $source;

	// Lookup tables keyed by lower-case tag name.
	$this->htmlsingle = $tagData['htmlsingle'];
	$this->htmlsingleonly = $tagData['htmlsingleonly'];
	$this->htmlelements = $tagData['htmlelements'];

	$this->attrCallback = $attrCallback;
	// Normalise to an array so it can always be spread into the callback.
	$this->callbackArgs = $callbackArgs ?? [];
}

/**
 * Discard a comment token: nothing is forwarded to the next handler.
 *
 * @param mixed $text Unused.
 * @param mixed $sourceStart Unused.
 * @param mixed $sourceLength Unused.
 */
public function comment( $text, $sourceStart, $sourceLength ) {
	// Don't relay comments.
}

/**
 * Takes attribute names and values for a tag and the tag name and
 * validates that the tag is allowed to be present.
 * This DOES NOT validate the attributes, nor does it validate the
 * tags themselves. This method only handles the special circumstances
 * where we may want to allow a tag within content but ONLY when it has
 * specific attributes set.
 *
 * @param string $element Tag name, compared against the literals
 *    'meta' and 'link'.
 * @param Attributes $attrs Attributes of the tag.
 * @return bool False when a <meta> or <link> lacks a required attribute,
 *    true in every other case.
 *
 * @see Sanitizer::validateTag()
 */
private static function validateTag( string $element, Attributes $attrs ): bool {
	// Only <meta> and <link> carry extra requirements.
	if ( $element !== 'meta' && $element !== 'link' ) {
		return true;
	}

	$params = $attrs->getValues();
	if ( !isset( $params['itemprop'] ) ) {
		// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
		return false;
	}
	if ( $element === 'meta' && !isset( $params['content'] ) ) {
		// <meta> must have a content="" for the itemprop
		return false;
	}
	if ( $element === 'link' && !isset( $params['href'] ) ) {
		// <link> must have an associated href=""
		return false;
	}

	return true;
}

/**
 * Handle a start tag from the tokenizer: either relay it to the next
 * stage, or re-emit it as raw text.
 *
 * A tag is relayed when its lower-cased name is in $htmlelements and
 * validateTag() accepts it; its attributes are first passed through the
 * attribute callback (if any) and Sanitizer::validateTagAttributes().
 * Any other tag is forwarded as characters taken from the source string.
 *
 * @param string $name Tag name as seen by the tokenizer.
 * @param Attributes $attrs Attributes of the tag.
 * @param bool $selfClose Whether the tag was written self-closing.
 * @param int $sourceStart Start of the tag in the source string
 *    (forwarded unchanged to the next handler).
 * @param int $sourceLength Length of the tag in the source string
 *    (forwarded unchanged to the next handler).
 */
public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
	$badtag = false;
	$t = strtolower( $name );
	if ( isset( $this->htmlelements[$t] ) ) {
		if ( $this->attrCallback ) {
			$attrs = ( $this->attrCallback )( $attrs, ...$this->callbackArgs );
		}
		if ( $selfClose && !( isset( $this->htmlsingle[$t] ) || isset( $this->htmlsingleonly[$t] ) ) ) {
			// Remove the self-closing slash, to be consistent with
			// HTML5 semantics. T134423
			$selfClose = false;
		}
		if ( !self::validateTag( $t, $attrs ) ) {
			$badtag = true;
		}
		$fixedAttrs = Sanitizer::validateTagAttributes( $attrs->getValues(), $t );
		$attrs = new PlainAttributes( $fixedAttrs );
		if ( !$badtag ) {
			if ( $selfClose && !isset( $this->htmlsingleonly[$t] ) ) {
				// Interpret self-closing tags as empty tags even when
				// HTML5 would interpret them as start tags. Such input
				// is commonly seen on Wikimedia wikis with this intention.
				$this->nextHandler->startTag( $name, $attrs, false, $sourceStart, $sourceLength );
				// Synthesised end tag: zero-length, placed right after the start tag.
				$this->nextHandler->endTag( $name, $sourceStart + $sourceLength, 0 );
			} else {
				$this->nextHandler->startTag( $name, $attrs, $selfClose, $sourceStart, $sourceLength );
			}
			return;
		}
	}
	// Emit this as a text node instead.
	$this->nextHandler->characters( $this->source, $sourceStart, $sourceLength, $sourceStart, $sourceLength );
}

/**
 * Handle an end tag from the tokenizer: either relay it to the next
 * stage, or re-emit it as raw text.
 *
 * @param string $name Tag name as seen by the tokenizer.
 * @param int $sourceStart Start of the tag in the source string
 *    (forwarded unchanged to the next handler).
 * @param int $sourceLength Length of the tag in the source string
 *    (forwarded unchanged to the next handler).
 */
public function endTag( $name, $sourceStart, $sourceLength ) {
	$t = strtolower( $name );
	if ( isset( $this->htmlelements[$t] ) ) {
		// This is a good tag, relay it.
		$this->nextHandler->endTag( $name, $sourceStart, $sourceLength );
	} else {
		// Emit this as a text node instead.
		$this->nextHandler->characters( $this->source, $sourceStart, $sourceLength, $sourceStart, $sourceLength );
	}
}