Merge "mediawiki.api: Remove console warning for legacy token type"
[mediawiki.git] / includes / parser / Sanitizer.php
blob0174d94863ceb6c4ba8e841ea7808b8f1d1599f5
1 <?php
2 /**
3 * HTML sanitizer for %MediaWiki.
5 * Copyright © 2002-2005 Brooke Vibber <bvibber@wikimedia.org> et al
6 * https://www.mediawiki.org/
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
23 * @file
24 * @ingroup Parser
27 namespace MediaWiki\Parser;
29 use InvalidArgumentException;
30 use LogicException;
31 use MediaWiki\HookContainer\HookRunner;
32 use MediaWiki\MediaWikiServices;
33 use MediaWiki\Tidy\RemexCompatFormatter;
34 use StringUtils;
35 use UnexpectedValueException;
36 use Wikimedia\RemexHtml\HTMLData;
37 use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
38 use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
39 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
40 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
42 /**
43 * HTML sanitizer for MediaWiki
44 * @ingroup Parser
46 class Sanitizer {
47 /**
48 * Regular expression to match various types of character references in
49 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
50 * Note that HTML5 allows some named entities to omit the trailing
51 * semicolon; wikitext entities *must* have a trailing semicolon.
53 private const CHAR_REFS_REGEX =
54 '/&([A-Za-z0-9\x80-\xff]+;)
55 |&\#([0-9]+);
56 |&\#[xX]([0-9A-Fa-f]+);
57 |&/x';
59 /**
60 * Acceptable tag name charset from HTML5 parsing spec
61 * https://www.w3.org/TR/html5/syntax.html#tag-open-state
63 private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
65 /**
66 * Pattern matching evil uris like javascript:
67 * WARNING: DO NOT use this in any place that actually requires denying
68 * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
69 * pattern-based deny lists; the only way to be secure from javascript:
70 * uri based xss vectors is to allow only things that you know are safe
71 * and deny everything else.
72 * [1]: http://ha.ckers.org/xss.html
74 private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
75 private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
77 /**
78 * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
80 * @since 1.30
82 public const ID_PRIMARY = 0;
84 /**
85 * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
86 * if no fallback is configured.
88 * @since 1.30
90 public const ID_FALLBACK = 1;
92 /**
93 * Character entity aliases accepted by MediaWiki in wikitext.
94 * These are not part of the HTML standard.
96 private const MW_ENTITY_ALIASES = [
97 'רלמ;' => 'rlm;',
98 'رلم;' => 'rlm;',
102 * Lazy-initialised attributes regex, see getAttribsRegex()
104 private static ?string $attribsRegex = null;
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 * Used in Sanitizer::decodeTagAttributes
 *
 * The pattern is assembled on the first call and then kept in
 * self::$attribsRegex.
 *
 * @return string
 */
private static function getAttribsRegex(): string {
	if ( self::$attribsRegex !== null ) {
		// Already assembled by an earlier call.
		return self::$attribsRegex;
	}

	$spaceChars = '\x09\x0a\x0c\x0d\x20';
	$space = "[{$spaceChars}]";
	// Any character that is not whitespace, "/", ">" or "="
	$attrib = "[^{$spaceChars}\/>=]";
	// The first character of a name may additionally be "="
	$attribFirst = "(?:{$attrib}|=)";
	self::$attribsRegex =
		"/({$attribFirst}{$attrib}*)
		($space*=$space*
		(?:
			# The attribute value: quoted or alone
			\"([^\"]*)(?:\"|\$)
			| '([^']*)(?:'|\$)
			| (((?!$space|>).)*)
		)
		)?/sxu";

	return self::$attribsRegex;
}
132 * Lazy-initialised attribute name regex, see getAttribNameRegex()
134 private static ?string $attribNameRegex = null;
/**
 * Used in Sanitizer::decodeTagAttributes to filter attributes.
 *
 * A name must start with a colon, an underscore, a letter or a number;
 * the remaining characters may also be dots and hyphens. The pattern is
 * assembled on the first call and then kept in self::$attribNameRegex.
 *
 * @return string
 */
private static function getAttribNameRegex(): string {
	if ( self::$attribNameRegex !== null ) {
		// Already assembled by an earlier call.
		return self::$attribNameRegex;
	}

	$attribFirst = "[:_\p{L}\p{N}]";
	$attrib = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";

	return self::$attribNameRegex;
}
/**
 * Return the various lists of recognized tags
 *
 * Every list is returned as a hashtable (tag name => true) for faster
 * lookup. The default lists are built on the first call and kept in static
 * storage; the complete result for a call without extra or removed tags is
 * kept as well.
 *
 * @param string[] $extratags For any extra tags to include
 * @param string[] $removetags For any tags (default or extra) to exclude;
 *   only the 'htmlelements' list is reduced by these
 * @return array Map with the keys 'htmlpairs', 'htmlsingle',
 *   'htmlsingleonly', 'htmlnest', 'tabletags', 'htmllist', 'listtags',
 *   'htmlsingleallowed' and 'htmlelements'
 * @internal
 */
public static function getRecognizedTagData( array $extratags = [], array $removetags = [] ): array {
	// Default lists as hashtables, filled by the first call.
	static $defaults = null;
	// Finished result for a call without arguments, filled by the first such call.
	static $commonCase = null;

	$isCommonCase = ( $extratags === [] && $removetags === [] );
	if ( $isCommonCase && $commonCase !== null ) {
		return $commonCase;
	}

	if ( $defaults === null ) {
		# Tags that must be closed
		$pairs = [
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		# These tags can be self-closed. For tags not also on
		# the "single only" list, a self-closed tag will be emitted as
		# an empty element (open-tag/close-tag pair).
		$single = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];
		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$singleOnly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];
		# Tags that can be nested--??
		$nest = [
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		# Can only appear inside table, we will close them
		$table = [
			'td', 'th', 'tr',
		];
		# Tags used by list
		$list = [
			'ul', 'ol',
		];
		# Tags that can appear in a list
		$listItems = [
			'li',
		];

		$singleAllowed = array_unique( array_merge( $single, $table ) );
		$elements = array_unique( array_merge( $single, $pairs, $nest ) );

		# Convert them all to hashtables for faster lookup
		$defaults = [
			'htmlpairs' => array_fill_keys( $pairs, true ),
			'htmlsingle' => array_fill_keys( $single, true ),
			'htmlsingleonly' => array_fill_keys( $singleOnly, true ),
			'htmlnest' => array_fill_keys( $nest, true ),
			'tabletags' => array_fill_keys( $table, true ),
			'htmllist' => array_fill_keys( $list, true ),
			'listtags' => array_fill_keys( $listItems, true ),
			'htmlsingleallowed' => array_fill_keys( $singleAllowed, true ),
			'htmlelements' => array_fill_keys( $elements, true ),
		];
	}

	# Apply $extratags and $removetags to the two lists that depend on them.
	# Overwriting existing keys keeps the key order of the result stable.
	$extraSet = array_fill_keys( $extratags, true );
	$removeSet = array_fill_keys( $removetags, true );

	$result = $defaults;
	$result['htmlpairs'] = array_merge( $extraSet, $defaults['htmlpairs'] );
	$result['htmlelements'] = array_diff_key(
		array_merge( $extraSet, $defaults['htmlelements'] ),
		$removeSet
	);

	if ( $isCommonCase ) {
		$commonCase = $result;
	}
	return $result;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments; BEWARE there may be unmatched HTML
 * tags in the result.
 *
 * The text is split at every "<". A piece that parses as a recognized tag
 * which passes validateTag() is re-emitted with sanitized attributes; any
 * other piece is emitted with its "<" and ">" escaped.
 *
 * @note Callers are recommended to use `::removeSomeTags()` instead
 * of this method. `Sanitizer::removeSomeTags()` is safer and will
 * always return well-formed HTML; however, it is significantly
 * slower (especially for short strings where setup costs
 * predominate). This method is for internal use by the legacy parser
 * where we know the result will be cleaned up in a subsequent tidy pass.
 *
 * @param string $text Original string; see T268353 for why untainted.
 * @param-taint $text none
 * @param callable|null $processCallback Callback to do any variable or
 *   parameter replacements in HTML attribute values.
 *   This argument should be considered @internal.
 * @param-taint $processCallback exec_shell
 * @param array|bool $args Arguments for the processing callback
 * @param-taint $args none
 * @param array $extratags For any extra tags to include
 * @param-taint $extratags tainted
 * @param array $removetags For any tags (default or extra) to exclude
 * @param-taint $removetags none
 * @return string
 * @return-taint escaped
 * @internal
 */
public static function internalRemoveHtmlTags( string $text, ?callable $processCallback = null,
	$args = [], array $extratags = [], array $removetags = []
): string {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$selfClosable = $tagData['htmlsingle'];
	$voidOnly = $tagData['htmlsingleonly'];
	$recognized = $tagData['htmlelements'];

	# Remove HTML comments
	$text = self::removeHTMLcomments( $text );
	$pieces = explode( '<', $text );
	// Everything before the first "<" cannot contain a tag.
	$text = str_replace( '>', '&gt;', array_shift( $pieces ) );

	# this might be possible using remex tidy itself
	foreach ( $pieces as $piece ) {
		if ( !preg_match( self::ELEMENT_BITS_REGEX, $piece, $bits ) ) {
			// Not tag-shaped at all: output as literal text.
			$text .= '&lt;' . str_replace( '>', '&gt;', $piece );
			continue;
		}

		[ , $slash, $name, $params, $brace, $rest ] = $bits;
		$name = strtolower( $name );
		if ( !isset( $recognized[$name] ) ) {
			// Unknown or excluded tag: output as literal text.
			$text .= '&lt;' . str_replace( '>', '&gt;', $piece );
			continue;
		}

		if ( is_callable( $processCallback ) ) {
			// The callback receives $params by reference and may rewrite it.
			call_user_func_array( $processCallback, [ &$params, $args ] );
		}

		if ( $brace === '/>' && !isset( $selfClosable[$name] ) && !isset( $voidOnly[$name] ) ) {
			// Remove the self-closing slash, to be consistent
			// with HTML5 semantics. T134423
			$brace = '>';
		}

		$tagIsValid = self::validateTag( $params, $name );
		$newparams = self::fixTagAttributes( $params, $name );
		if ( !$tagIsValid ) {
			// Recognized tag without its required attributes: output as literal text.
			$text .= '&lt;' . str_replace( '>', '&gt;', $piece );
			continue;
		}

		if ( $brace === '/>' && !isset( $voidOnly[$name] ) ) {
			# Interpret self-closing tags as empty tags even when
			# HTML 5 would interpret them as start tags. Such input
			# is commonly seen on Wikimedia wikis with this intention.
			$brace = '></' . $name . '>';
		}

		$rest = str_replace( '>', '&gt;', $rest );
		$text .= '<' . $slash . $name . $newparams . $brace . $rest;
	}

	return $text;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments; the result will always be balanced and
 * tidy HTML.
 *
 * @param string $text Source string; see T268353 for why untainted
 * @param-taint $text none
 * @param array $options Options controlling the cleanup:
 *   string[] $options['extraTags'] Any extra tags to allow
 *     (This property taints the whole array.)
 *   string[] $options['removeTags'] Any tags (default or extra) to exclude
 *   callable(Attributes,...):Attributes $options['attrCallback'] Callback
 *     to do any variable or parameter replacements in HTML attribute
 *     values before further cleanup; should be considered @internal
 *     and not for external use.
 *   array $options['attrCallbackArgs'] Additional arguments for the
 *     attribute callback
 * @param-taint $options tainted
 * @return string The cleaned up HTML
 * @return-taint escaped
 * @since 1.38
 */
public static function removeSomeTags(
	string $text, array $options = []
): string {
	$extraTags = $options['extraTags'] ?? [];
	$removeTags = $options['removeTags'] ?? [];
	// These options are @internal:
	$attrCallback = $options['attrCallback'] ?? null;
	$attrCallbackArgs = $options['attrCallbackArgs'] ?? [];

	// This disallows HTML5-style "missing trailing semicolon" attributes
	// In wikitext "clean&copy" does *not* contain an entity.
	$text = self::normalizeCharReferences( $text );

	$tagData = self::getRecognizedTagData( $extraTags, $removeTags );

	// Use RemexHtml to tokenize $text and remove the barred tags.
	// The pipeline runs tokenizer -> tag remover -> dispatcher ->
	// tree builder -> serializer, so it is assembled back to front.
	$serializer = new RemexSerializer( new RemexCompatFormatter() );
	$treeBuilder = new RemexTreeBuilder( $serializer, [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
	] );
	$dispatcher = new RemexDispatcher( $treeBuilder );
	$remover = new RemexRemoveTagHandler(
		$dispatcher, $text, $tagData,
		$attrCallback, $attrCallbackArgs
	);
	$tokenizer = new RemexTokenizer( $remover, $text, [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	] );

	// Parse as a fragment in the context of an HTML <body> element.
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'body',
	] );

	return $serializer->getResult();
}
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * An unterminated comment is left in the text untouched.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( string $text ): string {
	for ( ; ; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			// No comment left.
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		// Offset of the first byte after the closing '-->'
		$after = $close + 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline.
		# $left walks backwards over the spaces before the comment,
		# $right walks forwards over the spaces after it.
		$left = max( $open - 1, 0 );
		$right = $after;
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
		}
		while ( substr( $text, $right, 1 ) === ' ' ) {
			$right++;
		}

		$onOwnLine = substr( $text, $left, 1 ) === "\n"
			&& substr( $text, $right, 1 ) === "\n";
		if ( $onOwnLine ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $left, $right - $left + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $open, $after - $open );
		}
	}

	return $text;
}
/**
 * Takes attribute names and values for a tag and the tag name and
 * validates that the tag is allowed to be present.
 * This DOES NOT validate the attributes, nor does it validate the
 * tags themselves. This method only handles the special circumstances
 * where we may want to allow a tag within content but ONLY when it has
 * specific attributes set.
 *
 * @see RemexRemoveTagHandler::validateTag()
 * @param string $params Raw attribute text of the tag
 * @param string $element Tag name
 * @return bool
 */
private static function validateTag( string $params, string $element ): bool {
	$attribs = self::decodeTagAttributes( $params );

	if ( $element !== 'meta' && $element !== 'link' ) {
		// No special requirements for any other element.
		return true;
	}

	if ( !isset( $attribs['itemprop'] ) ) {
		// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
		return false;
	}

	// <meta> must have a content="" for the itemprop,
	// <link> must have an associated href=""
	$required = ( $element === 'meta' ) ? 'content' : 'href';
	return isset( $attribs[$required] );
}
/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given element type.
 *
 * - Discards attributes not allowed for the given element
 * - Unsafe style attributes are discarded
 * - Invalid id attributes are re-encoded
 *
 * @param array $attribs
 * @param string $element
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
public static function validateTagAttributes( array $attribs, string $element ): array {
	// Look up what this element may carry, then apply the generic filter.
	$allowed = self::attributesAllowedInternal( $element );

	return self::validateAttributes( $attribs, $allowed );
}
/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values.
 *
 * - Discards attributes not on the given list
 * - Unsafe style attributes are discarded
 * - Invalid id attributes are re-encoded
 *
 * @param array $attribs
 * @param array $allowed List of allowed attribute names,
 *   as an associative array where keys give valid attribute names
 *   (since 1.34). Before 1.35, passing a sequential array of
 *   valid attribute names was permitted but that is now deprecated.
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
public static function validateAttributes( array $attribs, array $allowed ): array {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_fill_keys( $allowed, true );
	}

	$protocols = MediaWikiServices::getInstance()->getUrlUtils()->validProtocols();
	$hrefExp = '/^(' . $protocols . ')[^\s]+$/';

	// Attributes whose value is a list of HTML id references
	$idRefListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];
	// Attributes that must use one of the allowed protocols
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$clean = [];
	foreach ( $attribs as $attribute => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$clean[$attribute] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isDataAttribute = (bool)preg_match( '/^data-[^:]*$/i', $attribute );
		$isPermitted = $isDataAttribute || array_key_exists( $attribute, $allowed );
		if ( !$isPermitted || self::isReservedDataAttribute( $attribute ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $attribute === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
			if ( $value === false || $value === '' ) {
				continue;
			}
		}

		# Escape HTML id reference lists
		if ( in_array( $attribute, $idRefListAttribs, true ) ) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $uriLikeAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( in_array( $attribute, $urlAttribs, true ) && !preg_match( $hrefExp, $value ) ) {
			// drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		if ( $attribute === 'tabindex' && $value !== '0' ) {
			// Only allow tabindex of 0, which is useful for accessibility.
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$clean[$attribute] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $clean ) ) {
		unset( $clean['itemtype'], $clean['itemid'], $clean['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $clean;
}
/**
 * Given an attribute name, checks whether it is a reserved data attribute
 * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
 * core and extension code can safely use it to communicate with frontend code.
 *
 * The check is a case-insensitive prefix match, so any name beginning with
 * data-ooui, data-mw or data-parsoid counts as reserved.
 *
 * @param string $attr Attribute name.
 * @return bool
 */
public static function isReservedDataAttribute( string $attr ): bool {
	// data-ooui is reserved for ooui.
	// data-mw and data-parsoid are reserved for parsoid.
	// data-mw-<name here> is reserved for extensions (or core) if
	// they need to communicate some data to the client and want to be
	// sure that it isn't coming from an untrusted user.
	// We ignore the possibility of namespaces since user-generated HTML
	// can't use them anymore.
	$reservedPrefix = '/^data-(ooui|mw|parsoid)/i';

	return preg_match( $reservedPrefix, $attr ) === 1;
}
/**
 * Merge two sets of HTML attributes. Conflicting items in the second set
 * will override those in the first, except for 'class' attributes which
 * will be combined (if they're both strings).
 *
 * @todo implement merging for other attributes such as style
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( array $a, array $b ): array {
	$merged = array_merge( $a, $b );

	$classA = $a['class'] ?? null;
	$classB = $b['class'] ?? null;
	if ( !is_string( $classA ) || !is_string( $classB ) || $classA === $classB ) {
		// Nothing to combine: plain override semantics apply.
		return $merged;
	}

	// Split both values on whitespace and keep each class name once,
	// in order of first appearance.
	$names = preg_split( '/\s+/', $classA . ' ' . $classB, -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
/**
 * Normalize CSS into a format we can easily search for hostile input
 *  - decode character references
 *  - decode escape sequences
 *  - remove comments, unless the entire value is one single comment
 *
 * @param string $value the css string
 * @return string normalized css
 */
public static function normalizeCss( string $value ): string {
	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex = null;
	if ( $decodeRegex === null ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback(
		$decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		return $value;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$danglingComment = strpos( $value, '/*' );
	if ( $danglingComment !== false ) {
		$value = substr( $value, 0, $danglingComment );
	}

	return $value;
}
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string. This sanitized string will have
 * character references and escape sequences decoded and comments
 * stripped (unless it is itself one valid comment, in which case the value
 * will be passed through). If the input is just too evil, only a comment
 * complaining about evilness will be returned.
 *
 * Currently forbidden are: control characters and the UTF-8 replacement
 * character, 'expression', the 'accelerator', '-o-link', '-o-link-source'
 * and '-o-replace' properties, the url(), src(), image() and image-set()
 * functions, and attr() used together with 'url'.
 *
 * NOTE: Despite the fact that character references are decoded, the
 * returned string may contain character references given certain
 * clever input strings. These character references must
 * be escaped before the return value is embedded in HTML.
 *
 * @warning This method is intended to sanitize style attributes on
 * html tags only. It is not safe to use on full CSS files.
 * @param string $value
 * @return string
 */
public static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject control characters
	$hasControlChar = preg_match( '/[\000-\010\013\016-\037\177]/', $value )
		|| strpos( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) !== false;
	if ( $hasControlChar ) {
		return '/* invalid control char */';
	}

	// Reject problematic keywords
	$forbidden = '! expression
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| src\s*\(
		| image\s*\(
		| image-set\s*\(
		| attr\s*\([^)]+[\s,]+url
	!ix';
	if ( preg_match( $forbidden, $value ) ) {
		return '/* insecure input */';
	}

	return $value;
}
/**
 * Callback for the escape-sequence regex in normalizeCss(): turn one
 * backslash sequence into the text it stands for.
 *
 * @param array $matches Match groups: 1 = line continuation,
 *   2 = hexadecimal character number, 3 = single escaped character;
 *   all empty means a backslash at the end of the string
 * @return string
 */
private static function cssDecodeCallback( array $matches ): string {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$char = \UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$char = $matches[3];
	} else {
		$char = '\\';
	}

	if ( in_array( $char, [ "\n", '"', "'", '\\' ], true ) ) {
		// These characters need to be escaped in strings
		// Clean up the escape sequence to avoid parsing errors by clients
		return '\\' . dechex( ord( $char ) ) . ' ';
	}

	// Decode unnecessary escape
	return $char;
}
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 * Output is safe for further wikitext processing, with escaping of
 * values that could trigger problems.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not allowed for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 * - (Optionally) Sorts attributes by name.
 *
 * @param string $text
 * @param string $element
 * @param bool $sorted Whether to sort the attributes (default: false)
 * @return string
 */
public static function fixTagAttributes( string $text, string $element, bool $sorted = false ): string {
	if ( trim( $text ) === '' ) {
		// Nothing but whitespace: no attributes at all.
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);
	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
/**
 * Encode an attribute value for HTML output.
 *
 * @param string $text
 * @param-taint $text escapes_html
 * @return string HTML-encoded text fragment
 * @return-taint escaped
 */
public static function encodeAttribute( string $text ): string {
	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$whitespaceRefs = [
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	];

	return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
}
/**
 * Armor French spaces with a replacement character
 *
 * @since 1.32
 * @param string $text Text to armor
 * @param string $space Space character for the French spaces, defaults to '&#160;'
 * @return string Armored text
 */
public static function armorFrenchSpaces( string $text, string $space = '&#160;' ): string {
	// Replace $ with \$ and \ with \\
	$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );

	$patterns = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u',
		# French spaces, Guillemet-right
		'/([«‹]) /u',
	];
	$replacements = [
		$space,
		'\\1' . $space,
	];

	return preg_replace( $patterns, $replacements, $text );
}
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * @param string $text
 * @param-taint $text escapes_html
 * @return string HTML-encoded text fragment
 * @return-taint escaped
 */
public static function safeEncodeAttribute( string $text ): string {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextArmor = [
		// '<', '>', and '"' should never happen, as they indicate that we've received invalid input which should
		// have been escaped.
		'<' => '&lt;',
		'>' => '&gt;',
		'"' => '&quot;',
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$encValue = strtr( self::encodeAttribute( $text ), $wikitextArmor );

	# Stupid hack
	// Encode the colon of every recognised protocol prefix.
	$protocols = MediaWikiServices::getInstance()->getUrlUtils()->validProtocols();

	return preg_replace_callback(
		'/((?i)' . $protocols . ')/',
		static fn ( $matches ) => str_replace( ':', '&#58;', $matches[1] ),
		$encValue
	);
}
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid HTML id attribute.
 *
 * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 * proper escaping.
 *
 * @param string $id String to escape
 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 *     should be used.
 * @return string|false Escaped ID or false if fallback encoding is requested but it's not
 *     configured.
 * @throws UnexpectedValueException If $wgFragmentMode has no primary mode
 *
 * @since 1.30
 */
public static function escapeIdForAttribute( string $id, int $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	// The requested mode is not configured.
	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment.
 *
 * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 * proper escaping.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 * @throws UnexpectedValueException If $wgFragmentMode has no primary mode
 *
 * @since 1.30
 */
public static function escapeIdForLink( string $id ): string {
	global $wgFragmentMode;

	$primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
	if ( $primaryMode === null ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternalUrl( $id, $primaryMode );
}
/**
 * Escape an arbitrary, untrusted string (for instance a section name) so that
 * it can be used as a URL fragment for external interwikis. The encoding is
 * the one named by $wgExternalInterwikiFragmentMode.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForExternalInterwiki( string $id ): string {
	global $wgExternalInterwikiFragmentMode;

	return self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );
}
/**
 * Escape an ID for use in an href (but not id) attribute: on top of the
 * regular escaping, percent signs are themselves percent-encoded.
 *
 * In 'html5' mode every '%' that is followed by two hexadecimal digits is
 * rewritten to '%25'; other modes return the regular escaping unchanged.
 *
 * @since 1.35
 * @see https://phabricator.wikimedia.org/T238385
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 */
private static function escapeIdInternalUrl( string $id, string $mode ): string {
	$escaped = self::escapeIdInternal( $id, $mode );

	if ( $mode !== 'html5' ) {
		return $escaped;
	}

	return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
/**
 * Shared worker behind the escapeIdFor*() functions; does most of the actual
 * escaping.
 *
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode ('html5' or 'legacy')
 * @return string
 * @throws InvalidArgumentException If $mode is neither 'html5' nor 'legacy'.
 */
private static function escapeIdInternal( string $id, string $mode ): string {
	// Cap the length at 1024 characters. HTML imposes no such limit; this is
	// purely griefer protection. [T251506]
	$truncated = mb_substr( $id, 0, 1024 );

	if ( $mode === 'html5' ) {
		// The html5 spec forbids U+0009 TAB, U+000A LF, U+000C FF, U+000D CR
		// and U+0020 SPACE in ids. In practice only tab, LF, CR (and SPACE)
		// can be produced in wikitext, via Lua or html entities.
		return str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $truncated );
	}

	if ( $mode === 'legacy' ) {
		// Equivalent of the 'noninitial' mode of the former escapeId():
		// URL-encode, then restore ':' and use '.' as the escape character.
		$encoded = urlencode( str_replace( ' ', '_', $truncated ) );
		return strtr( $encoded, [
			'%3A' => ':',
			'%' => '.',
		] );
	}

	throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
}
/**
 * Escape every id in a whitespace-delimited list of ids the same way
 * escapeIdForAttribute() escapes a single id.
 *
 * @param string $referenceString Space delimited list of ids
 * @return string The escaped ids joined by single spaces; an empty string
 *   if the input contained no ids.
 */
private static function escapeIdReferenceListInternal( string $referenceString ): string {
	# Break the list into its non-empty tokens
	$tokens = preg_split( '/\s+/', $referenceString, -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = array_map(
		static fn ( string $token ) => self::escapeIdForAttribute( $token ),
		$tokens
	);

	# Join the tokens back into a space delimited list
	return implode( ' ', $escaped );
}
/**
 * Escape a value so that it can be used as a CSS class, and return it.
 *
 * A leading digit or hyphen, control characters, spaces, ASCII punctuation
 * and the byte pair C2 A0 are replaced by underscores; runs of underscores
 * are then collapsed and trailing underscores removed.
 *
 * @todo For extra validity, input should be validated UTF-8.
 *
 * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 */
public static function escapeClass( string $class ): string {
	// Pass 1: turn every unwanted character into an underscore
	$underscored = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);

	// Pass 2: squeeze consecutive underscores into one
	$squeezed = preg_replace( '/_+/', '_', $underscored );

	// Finally drop underscores left dangling at the end
	return rtrim( $squeezed, '_' );
}
/**
 * Escape HTML input with htmlspecialchars, but decode character references
 * first so that (generally harmless) entities like &#160; survive as the
 * characters they stand for instead of being double-escaped.
 *
 * @param string $html HTML to escape
 * @param-taint $html escapes_htmlnoent
 * @return string Escaped input
 * @return-taint escaped
 */
public static function escapeHtmlAllowEntities( string $html ): string {
	$decoded = self::decodeCharReferences( $html );

	# ENT_QUOTES escapes ' as well as ", which can't hurt. ENT_SUBSTITUTE keeps
	# an incorrectly truncated multibyte character from making the whole
	# string disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
/**
 * Parse a partial tag string into an associative array of attribute names
 * and values. Attribute names are forced to lowercase, whitespace in values
 * is normalized, and character references are decoded to UTF-8 text.
 *
 * @param string $text Partial tag string containing the attributes
 * @return array Map of lowercased attribute name to decoded value; empty if
 *   the text is blank or contains no attributes.
 */
public static function decodeTagAttributes( string $text ): array {
	if ( trim( $text ) === '' ) {
		return [];
	}

	$matchSets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matchSets, PREG_SET_ORDER );
	if ( !$found ) {
		return [];
	}

	$decoded = [];
	foreach ( $matchSets as $matchSet ) {
		$name = strtolower( $matchSet[1] );

		// Only keep attributes whose name consists of acceptable characters
		if ( preg_match( self::getAttribNameRegex(), $name ) ) {
			$rawValue = self::getTagAttributeCallback( $matchSet );

			// Normalize whitespace
			$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $rawValue ) );

			// Decode character references
			$decoded[$name] = self::decodeCharReferences( $collapsed );
		}
	}

	return $decoded;
}
/**
 * Turn an associative array of attribute names and values, as returned by
 * decodeTagAttributes, back into a partial tag string.
 *
 * @param array $assoc_array Map of attribute name to value
 * @return string The name="value" pairs, each preceded by a space, or an
 *   empty string when there are no attributes.
 */
public static function safeEncodeTagAttributes( array $assoc_array ): string {
	$pairs = [];
	foreach ( $assoc_array as $name => $rawValue ) {
		$pairs[] = htmlspecialchars( $name, ENT_COMPAT )
			. '="' . self::safeEncodeAttribute( $rawValue ) . '"';
	}

	if ( $pairs === [] ) {
		return '';
	}

	return ' ' . implode( ' ', $pairs );
}
/**
 * Select the attribute value out of one match set produced by the attribs
 * regex.
 *
 * @param array $set One match set; index 5 holds an unquoted value, index 4
 *   a single-quoted value and index 3 a double-quoted value.
 * @return string The value, or an empty string for a value-less attribute.
 * @throws LogicException If group 2 is set but none of the value groups is.
 */
private static function getTagAttributeCallback( array $set ): string {
	// Probe the value groups from the highest index down:
	// 5 = no quotes, 4 = single-quoted, 3 = double-quoted.
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new LogicException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value so return an empty string.
	# See "Empty attribute syntax",
	# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
	return "";
}
/**
 * Collapse every run of spaces, tabs, carriage returns and line feeds into a
 * single space, then trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( string $text ): string {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
/**
 * Normalize whitespace in a section name, such as might be returned by
 * Parser::stripSectionName(), for use in the ids that are used for section
 * links: runs of spaces and underscores become one space and the result is
 * trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( string $section ): string {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $collapsed );
}
/**
 * Make sure that all entities and character references in the text are legal
 * for XML and XHTML specifically. Any stray bits are &amp;-escaped so that
 * the result is a valid text fragment.
 *
 *  a. named char refs can only be &lt; &gt; &amp; &quot;, others are
 *     numericized (this way we're well-formed even without a DTD)
 *  b. any numeric char refs must be legal chars, not invalid or forbidden
 *  c. use lower cased "&#x", not "&#X"
 *  d. fix or reject non-valid attributes
 *
 * @internal
 * @param string $text
 * @return string
 */
public static function normalizeCharReferences( string $text ): string {
	// PREG_UNMATCHED_AS_NULL lets the callback tell the alternatives apart
	// with isset(); the replacement count itself is not used.
	$normalized = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'normalizeCharReferencesCallback' ],
		$text,
		-1,
		$unusedCount,
		PREG_UNMATCHED_AS_NULL
	);

	return $normalized;
}
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * @param array $matches Match data: group 1 is handled as a named entity,
 *   group 2 as a decimal reference and group 3 as a hexadecimal reference.
 * @return string The normalized reference, or the HTML-escaped matched text
 *   when no group matched or the numeric reference was rejected.
 */
private static function normalizeCharReferencesCallback( array $matches ): string {
	$named = $matches[1] ?? null;
	$decimal = $matches[2] ?? null;
	$hex = $matches[3] ?? null;

	if ( $named !== null ) {
		$normalized = self::normalizeEntity( $named );
	} elseif ( $decimal !== null ) {
		$normalized = self::decCharReference( $decimal );
	} elseif ( $hex !== null ) {
		$normalized = self::hexCharReference( $hex );
	} else {
		$normalized = null;
	}

	// Anything that could not be normalized is escaped verbatim
	return $normalized ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
/**
 * Normalize one named entity.
 *
 * If the entity is a MediaWiki-specific alias, the HTML equivalent is
 * returned. The core &lt; &gt; &amp; &quot; are kept in word form. Any other
 * entity defined in HTML5 is replaced by the equivalent numeric reference(s).
 * Otherwise the HTML-escaped text of the pseudo-entity source is returned
 * (eg &amp;foo;).
 *
 * @param string $name Semicolon-terminated name
 * @return string
 */
private static function normalizeEntity( string $name ): string {
	// Non-standard MediaWiki-specific entities
	$alias = self::MW_ENTITY_ALIASES[$name] ?? null;
	if ( $alias !== null ) {
		return '&' . $alias;
	}

	// Keep these in word form
	if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
		return "&$name";
	}

	$expansion = HTMLData::$namedEntityTranslations[$name] ?? null;
	if ( $expansion === null ) {
		return "&amp;$name";
	}

	// Beware: some entities expand to more than 1 codepoint, so emit one
	// numeric reference per character of the expansion.
	return preg_replace_callback(
		'/./Ssu',
		static fn ( $char ) => '&#' . \UtfNormal\Utils::utf8ToCodepoint( $char[0] ) . ';',
		$expansion
	);
}
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint The digits of the reference
 * @return string|null The reference as "&#N;", or null if the codepoint is
 *   rejected by validateCodepoint().
 */
private static function decCharReference( string $codepoint ): ?string {
	# intval() will (safely) saturate at the maximum signed integer
	# value if $codepoint is too many digits
	$point = intval( $codepoint );

	return self::validateCodepoint( $point ) ? "&#$point;" : null;
}
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint The hexadecimal digits of the reference
 * @return string|null The reference as "&#x...;" with lower-case digits, or
 *   null if the value is out of integer range or rejected by
 *   validateCodepoint().
 */
private static function hexCharReference( string $codepoint ): ?string {
	$point = hexdec( $codepoint );

	// hexdec() might return a float if the string is too long
	if ( !is_int( $point ) || !self::validateCodepoint( $point ) ) {
		return null;
	}

	return sprintf( '&#x%x;', $point );
}
/**
 * Tell whether a Unicode codepoint is a valid character in both HTML5 and
 * XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( int $codepoint ): bool {
	# Of the low control characters only TAB and LF are accepted:
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	if ( $codepoint === 0x09 || $codepoint === 0x0a ) {
		return true;
	}

	# U+007F - U+009F are disallowed in HTML5 (control characters), hence the
	# gap between the first two ranges.
	$acceptedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];

	foreach ( $acceptedRanges as [ $low, $high ] ) {
		if ( $codepoint >= $low && $codepoint <= $high ) {
			return true;
		}
	}

	return false;
}
/**
 * Decode all character references in the text, numeric or named entities
 * alike, and return a UTF-8 string.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( string $text ): string {
	// PREG_UNMATCHED_AS_NULL lets the callback tell the alternatives apart
	// with isset(); the replacement count itself is not used.
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1,
		$unusedCount,
		PREG_UNMATCHED_AS_NULL
	);

	return $decoded;
}
/**
 * Decode all character references in the text, numeric or named entities
 * alike, and normalize the resulting string. (T16952)
 *
 * This is useful for page titles, not for text to be displayed;
 * MediaWiki allows HTML entities to escape normalization as a feature.
 *
 * @param string $text Already normalized, containing entities
 * @return string Still normalized, without entities
 */
public static function decodeCharReferencesAndNormalize( string $text ): string {
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1,
		$replacements,
		PREG_UNMATCHED_AS_NULL
	);

	// Nothing was replaced, so there is nothing to re-normalize
	if ( !$replacements ) {
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
/**
 * preg_replace_callback() handler for decodeCharReferences() and
 * decodeCharReferencesAndNormalize().
 *
 * @param array $matches Match data: group 1 is handled as a named entity,
 *   group 2 as a decimal reference and group 3 as a hexadecimal reference.
 * @return string The decoded text, or the matched text itself when none of
 *   the groups matched.
 */
private static function decodeCharReferencesCallback( array $matches ): string {
	if ( isset( $matches[1] ) ) {
		return self::decodeEntity( $matches[1] );
	}

	if ( isset( $matches[2] ) ) {
		return self::decodeChar( intval( $matches[2] ) );
	}

	if ( !isset( $matches[3] ) ) {
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	$point = hexdec( $matches[3] );

	// hexdec() might return a float if the string is too long; treat that
	// as an invalid character reference.
	return is_int( $point )
		? self::decodeChar( $point )
		: \UtfNormal\Constants::UTF8_REPLACEMENT;
}
/**
 * Return the UTF-8 string for a codepoint if that is a valid character
 * reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @internal
 * @param int $codepoint
 * @return string
 */
private static function decodeChar( int $codepoint ): string {
	return self::validateCodepoint( $codepoint )
		? \UtfNormal\Utils::codepointToUtf8( $codepoint )
		: \UtfNormal\Constants::UTF8_REPLACEMENT;
}
/**
 * Decode one named entity.
 *
 * If the entity is defined in HTML5, the UTF-8 encoding of that character is
 * returned. Otherwise the pseudo-entity source is returned (eg "&foo;").
 * MediaWiki-specific aliases are resolved to their standard name first.
 *
 * @param string $name Semicolon-terminated entity name
 * @return string
 */
private static function decodeEntity( string $name ): string {
	// These are MediaWiki-specific entities, not in the HTML standard
	$canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;

	return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
/**
 * Look up the acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array An associative array where keys are acceptable attribute
 *   names; empty if the element is not in the list.
 */
private static function attributesAllowedInternal( string $element ): array {
	$allowedByElement = self::setupAttributesAllowedInternal();

	if ( isset( $allowedByElement[$element] ) ) {
		return $allowedByElement[$element];
	}

	return [];
}
/**
 * Build the table of allowed attributes per allowed HTML element. The table
 * is computed on the first call and kept in a function-static variable.
 *
 * @return array An associative array: keys are HTML element names;
 *   values are associative arrays where the keys are allowed attribute
 *   names.
 */
private static function setupAttributesAllowedInternal(): array {
	static $cache = null;

	if ( $cache !== null ) {
		return $cache;
	}

	// Attribute lists are flipped into name => true maps so that a lookup is
	// a single key test. $extend adds one or two plain lists to such a map.
	$extend = static fn ( array $base, array $names, array $moreNames = [] ): array =>
		array_merge(
			$base,
			array_fill_keys( $names, true ),
			array_fill_keys( $moreNames, true )
		);

	$common = array_fill_keys( [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',
		'tabindex',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-hidden',
		'aria-label',
		'aria-labelledby',
		'aria-level',
		'aria-owns',
		'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	], true );

	$tablealign = [ 'align', 'valign' ];
	$tablecell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];

	// Maps shared by several elements
	$block = $extend( $common, [ 'align' ] );
	$quotation = $extend( $common, [ 'cite' ] );
	$revision = $extend( $common, [ 'cite', 'datetime' ] );
	$columns = $extend( $common, [ 'span' ] );
	$cell = $extend( $common, $tablecell, $tablealign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$cache = [
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => $quotation,
		'q' => $quotation,

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => $extend( $common, [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => $extend( $common, [ 'width' ] ),

		# 9.4
		'ins' => $revision,
		'del' => $revision,

		# 10.2
		'ul' => $extend( $common, [ 'type' ] ),
		'ol' => $extend( $common, [ 'type', 'start', 'reversed' ] ),
		'li' => $extend( $common, [ 'type', 'value' ] ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => $extend( $common, [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $columns,
		'col' => $columns,

		# 11.2.5
		'tr' => $extend( $common, [ 'bgcolor' ], $tablealign ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but this list of allowed
		# attributes is used from the Parser object
		'a' => $extend( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized
		'img' => $extend( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
		# Attributes for A/V tags added in T163583 / T133673
		'audio' => $extend( $common, [ 'controls', 'preload', 'width', 'height' ] ),
		'video' => $extend( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $extend( $common, [ 'type', 'src' ] ),
		'track' => $extend( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => $extend( $common, [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $extend( $common, [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # $extend( $common, [ 'rbspan' ] ),
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

		// HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $extend( $common, [ 'value' ] ),
		'time' => $extend( $common, [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by internalRemoveHtmlTags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => $extend( [], [ 'itemprop', 'content' ] ),
		'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),

		# HTML 5 section 4.3.5
		'aside' => $common,
	];

	return $cache;
}
/**
 * Take a fragment of (potentially invalid) HTML and return a version with
 * all tags removed, encoded as plain text with normalized whitespace.
 *
 * Warning: this return value must be further escaped for literal
 * inclusion in HTML output as of 1.10!
 *
 * @param string $html HTML fragment
 * @return string
 * @return-taint tainted
 */
public static function stripAllTags( string $html ): string {
	// Let RemexHtml tokenize $html; the handler collects the text
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];
	$handler = new RemexStripTagHandler;
	$tokenizer = new RemexTokenizer( $handler, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $handler->getResult() );
}
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * Every semicolon-terminated HTML5 named entity is declared with the
 * expansion produced by normalizeEntity(). Entities that normalizeEntity()
 * keeps in word form (&lt; &gt; &amp; &quot;) are not declared, since a
 * declaration such as <!ENTITY lt "&lt;"> would refer to itself.
 *
 * @deprecated since 1.36; will be made private or removed in a future
 *   release.
 * @return string The DOCTYPE declaration, terminated by a newline
 */
public static function hackDocType(): string {
	$out = "<!DOCTYPE html [\n";
	foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
		if ( substr( $entity, -1 ) !== ';' ) {
			// Some HTML entities omit the trailing semicolon;
			// wikitext does not permit these.
			continue;
		}
		$name = substr( $entity, 0, -1 );
		$expansion = self::normalizeEntity( $entity );
		// normalizeEntity() always returns text starting with '&', so the
		// "kept in word form" case has to be detected by comparing against
		// "&$entity"; comparing against the bare $entity never matches.
		if ( "&$entity" === $expansion ) {
			// Skip &lt; &gt; etc
			continue;
		}
		$out .= "<!ENTITY $name \"$expansion\">";
	}
	$out .= "]>\n";
	return $out;
}
/**
 * Clean up a URL: decode character references, percent-encode brackets,
 * angle brackets, double quotes, pipes, control characters and spaces, strip
 * characters that are ignored in IDNs from the host part, and restore the
 * brackets around an IPv6 host.
 *
 * @param string $url
 * @return string The cleaned URL; if no protocol/host/rest structure is
 *   recognised, the decoded and percent-encoded URL is returned as is.
 */
public static function cleanUrl( string $url ): string {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$decoded = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$escaped = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]+/',
		static function ( $run ) {
			return urlencode( $run[0] );
		},
		$decoded
	);

	# Validate hostname portion
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $escaped, $parts ) ) {
		return $escaped;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
	// https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	// Strip them before further processing so deny lists and such work.
	$ignorable = "/
		\\s| # general whitespace
		\u{00AD}| # SOFT HYPHEN
		\u{034F}| # COMBINING GRAPHEME JOINER
		\u{061C}| # ARABIC LETTER MARK
		[\u{115F}-\u{1160}]| # HANGUL CHOSEONG FILLER..
			# HANGUL JUNGSEONG FILLER
		[\u{17B4}-\u{17B5}]| # KHMER VOWEL INHERENT AQ..
			# KHMER VOWEL INHERENT AA
		[\u{180B}-\u{180D}]| # MONGOLIAN FREE VARIATION SELECTOR ONE..
			# MONGOLIAN FREE VARIATION SELECTOR THREE
		\u{180E}| # MONGOLIAN VOWEL SEPARATOR
		[\u{200B}-\u{200F}]| # ZERO WIDTH SPACE..
			# RIGHT-TO-LEFT MARK
		[\u{202A}-\u{202E}]| # LEFT-TO-RIGHT EMBEDDING..
			# RIGHT-TO-LEFT OVERRIDE
		[\u{2060}-\u{2064}]| # WORD JOINER..
			# INVISIBLE PLUS
		\u{2065}| # <reserved-2065>
		[\u{2066}-\u{206F}]| # LEFT-TO-RIGHT ISOLATE..
			# NOMINAL DIGIT SHAPES
		\u{3164}| # HANGUL FILLER
		[\u{FE00}-\u{FE0F}]| # VARIATION SELECTOR-1..
			# VARIATION SELECTOR-16
		\u{FEFF}| # ZERO WIDTH NO-BREAK SPACE
		\u{FFA0}| # HALFWIDTH HANGUL FILLER
		[\u{FFF0}-\u{FFF8}]| # <reserved-FFF0>..
			# <reserved-FFF8>
		[\u{1BCA0}-\u{1BCA3}]| # SHORTHAND FORMAT LETTER OVERLAP..
			# SHORTHAND FORMAT UP STEP
		[\u{1D173}-\u{1D17A}]| # MUSICAL SYMBOL BEGIN BEAM..
			# MUSICAL SYMBOL END PHRASE
		\u{E0000}| # <reserved-E0000>
		\u{E0001}| # LANGUAGE TAG
		[\u{E0002}-\u{E001F}]| # <reserved-E0002>..
			# <reserved-E001F>
		[\u{E0020}-\u{E007F}]| # TAG SPACE..
			# CANCEL TAG
		[\u{E0080}-\u{E00FF}]| # <reserved-E0080>..
			# <reserved-E00FF>
		[\u{E0100}-\u{E01EF}]| # VARIATION SELECTOR-17..
			# VARIATION SELECTOR-256
		[\u{E01F0}-\u{E0FFF}]| # <reserved-E01F0>..
			# <reserved-E0FFF>
		/xuD";

	$host = preg_replace( $ignorable, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	$ipv6 = [];
	if ( str_starts_with( $host, "//%5B" ) &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
/**
 * Does a string look like an e-mail address?
 *
 * This validates an email address using an HTML5 specification found at:
 * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
 * Which as of 2011-01-24 says:
 *
 *   A valid e-mail address is a string that matches the ABNF production
 *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
 *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
 *   3.5.
 *
 * This function is an implementation of the specification as requested in
 * T24449.
 *
 * Client-side forms will use the same standard validation rules via JS or
 * HTML 5 validation; additional restrictions can be enforced server-side
 * by extensions via the 'isValidEmailAddr' hook.
 *
 * Note that this validation doesn't 100% match RFC 2822, but is believed
 * to be liberal enough for wide use. Some invalid addresses will still
 * pass validation here.
 *
 * The whole string has to match: the pattern is anchored with the D
 * (dollar-end-only) modifier, so an address followed by a trailing newline
 * is rejected.
 *
 * @since 1.18
 *
 * @param string $addr E-mail address
 * @return bool
 */
public static function validateEmail( string $addr ): bool {
	$result = null;
	// TODO This method should be non-static, and have a HookRunner injected
	$hookRunner = new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
	if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
		// A hook handler took over the decision and stored it in $result
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// Without the D modifier, "$" would also match just before a newline
	// that ends the subject, letting "user@example.org\n" pass as valid.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # 'apostrophe'
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar-end-only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
/**
 * Registers the un-namespaced name 'Sanitizer' as an alias of the namespaced
 * Sanitizer class.
 *
 * @deprecated class alias since 1.41
 */
class_alias( Sanitizer::class, 'Sanitizer' );