Update git submodules
[mediawiki.git] / includes / parser / Sanitizer.php
blobefe4cd1a22fc1512cd6ca58169581807017eb48d
1 <?php
2 /**
3 * HTML sanitizer for %MediaWiki.
5 * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * https://www.mediawiki.org/
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
23 * @file
24 * @ingroup Parser
27 namespace MediaWiki\Parser;
29 use InvalidArgumentException;
30 use LogicException;
31 use MediaWiki\HookContainer\HookRunner;
32 use MediaWiki\MediaWikiServices;
33 use MediaWiki\Tidy\RemexCompatFormatter;
34 use StringUtils;
35 use UnexpectedValueException;
36 use Wikimedia\RemexHtml\HTMLData;
37 use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
38 use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
39 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
40 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
42 /**
43 * HTML sanitizer for MediaWiki
44 * @ingroup Parser
46 class Sanitizer {
47 /**
48 * Regular expression to match various types of character references in
49 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
50 * Note that HTML5 allows some named entities to omit the trailing
51 * semicolon; wikitext entities *must* have a trailing semicolon.
53 private const CHAR_REFS_REGEX =
54 '/&([A-Za-z0-9\x80-\xff]+;)
55 |&\#([0-9]+);
56 |&\#[xX]([0-9A-Fa-f]+);
57 |(&)/x';
59 /**
60 * Acceptable tag name charset from HTML5 parsing spec
61 * https://www.w3.org/TR/html5/syntax.html#tag-open-state
63 private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
65 /**
66 * Pattern matching evil uris like javascript:
67 * WARNING: DO NOT use this in any place that actually requires denying
68 * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
69 * pattern-based deny lists; the only way to be secure from javascript:
70 * uri based xss vectors is to allow only things that you know are safe
71 * and deny everything else.
72 * [1]: http://ha.ckers.org/xss.html
74 private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
75 private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
77 /**
78 * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
80 * @since 1.30
82 public const ID_PRIMARY = 0;
84 /**
85 * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
86 * if no fallback is configured.
88 * @since 1.30
90 public const ID_FALLBACK = 1;
92 /**
93 * Character entity aliases accepted by MediaWiki in wikitext.
94 * These are not part of the HTML standard.
96 private const MW_ENTITY_ALIASES = [
97 'רלמ;' => 'rlm;',
98 'رلم;' => 'rlm;',
/**
 * Cached pattern produced by getAttribsRegex(); stays null until first use.
 */
private static $attribsRegex;

/**
 * Build (once) and return the regular expression that matches a single
 * HTML/XML attribute, with or without a value, inside a tag.
 *
 * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 * Used in Sanitizer::decodeTagAttributes
 *
 * Capture groups: 1 = attribute name, 2 = the whole "= value" part,
 * 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value.
 *
 * @return string
 */
private static function getAttribsRegex() {
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$wsChars = '\x09\x0a\x0c\x0d\x20';
	$ws = "[{$wsChars}]";
	$nameChar = "[^{$wsChars}\/>=]";
	// A leading "=" is accepted as part of the attribute name.
	$nameStart = "(?:{$nameChar}|=)";

	self::$attribsRegex =
		"/({$nameStart}{$nameChar}*)
			({$ws}*={$ws}*
			(?:
				# The attribute value: quoted or alone
				\"([^\"]*)(?:\"|\$)
				| '([^']*)(?:'|\$)
				| (((?!{$ws}|>).)*)
			)
		)?/sxu";

	return self::$attribsRegex;
}
/**
 * Cached pattern produced by getAttribNameRegex(); stays null until first use.
 */
private static $attribNameRegex;

/**
 * Build (once) and return the regular expression an attribute name must
 * match in full to be kept.
 *
 * Used in Sanitizer::decodeTagAttributes to filter attributes.
 *
 * @return string
 */
private static function getAttribNameRegex() {
	if ( self::$attribNameRegex !== null ) {
		return self::$attribNameRegex;
	}

	// First character: colon, underscore, letter or number.
	$nameStart = "[:_\p{L}\p{N}]";
	// Later characters may additionally be "." or "-".
	$nameChar = "[:_\.\-\p{L}\p{N}]";
	self::$attribNameRegex = "/^({$nameStart}{$nameChar}*)$/sxu";

	return self::$attribNameRegex;
}
/**
 * Return the various lists of recognized tags.
 *
 * Every list is returned as a hashtable (tag name => true) under these
 * keys: 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest',
 * 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed' and
 * 'htmlelements'.
 *
 * The base tables are built once and kept in static variables; they are
 * rebuilt only when $wgAllowImageTag differs from the value they were
 * built with. The complete result for the no-extras/no-removals call is
 * cached separately.
 *
 * @param string[] $extratags For any extra tags to include
 * @param string[] $removetags For any tags (default or extra) to exclude
 * @return array
 * @internal
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;
	static $commonCase, $staticInitialised;
	$isCommonCase = ( $extratags === [] && $removetags === [] );
	if ( $staticInitialised === $wgAllowImageTag && $isCommonCase && $commonCase ) {
		return $commonCase;
	}

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;

	// Base our staticInitialised variable off of the global config state so that if the globals
	// are changed (like in the screwed up test system) we will re-initialise the settings.
	$globalContext = $wgAllowImageTag;
	// "Never built" is detected from the tables themselves. $staticInitialised holds the
	// config value, which is normally false, so testing its truthiness would force a
	// rebuild on every call that misses the common-case cache.
	if ( $htmlpairsStatic === null || $staticInitialised !== $globalContext ) {
		$htmlpairsStatic = [ # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		# These tags can be self-closed. For tags not also on
		# $htmlsingleonly, a self-closed tag will be emitted as
		# an empty element (open-tag/close-tag pair).
		$htmlsingle = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$htmlsingleonly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		$htmlnest = [ # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		$tabletags = [ # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		];
		$htmllist = [ # Tags used by list
			'ul', 'ol',
		];
		$listtags = [ # Tags that can appear in a list
			'li',
		];

		if ( $wgAllowImageTag ) {
			wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
				'is deprecated since MediaWiki 1.35', '1.35', false, false );
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
		foreach ( $vars as $var ) {
			$$var = array_fill_keys( $$var, true );
		}
		$staticInitialised = $globalContext;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_fill_keys( $extratags, true );
	$removetags = array_fill_keys( $removetags, true );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	$result = [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
	if ( $isCommonCase ) {
		// Remember the full result so the next plain call can return early.
		$commonCase = $result;
	}
	return $result;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments; BEWARE there may be unmatched HTML
 * tags in the result.
 *
 * Deprecated public entry point: it emits a deprecation notice and then
 * hands all of its arguments, unchanged, to internalRemoveHtmlTags().
 *
 * @note Callers are recommended to use `::removeSomeTags()`
 * instead of this method. `Sanitizer::removeSomeTags()` is safer
 * and will always return well-formed HTML; however, it is
 * significantly slower (especially for short strings where setup
 * costs predominate). This method, although faster, should only
 * be used where we know the result will be cleaned up in a subsequent
 * tidy pass.
 *
 * @param string $text Original string; see T268353 for why untainted.
 * @param-taint $text none
 * @param callable|null $processCallback Callback to do any variable or
 *   parameter replacements in HTML attribute values.
 *   This argument should be considered @internal.
 * @param-taint $processCallback exec_shell
 * @param array|bool $args Arguments for the processing callback
 * @param-taint $args none
 * @param array $extratags For any extra tags to include
 * @param-taint $extratags tainted
 * @param array $removetags For any tags (default or extra) to exclude
 * @param-taint $removetags none
 * @return string
 * @return-taint escaped
 * @deprecated since 1.38. Use ::removeSomeTags(), which always gives
 *   balanced/tidy HTML.
 */
public static function removeHTMLtags(
	$text,
	$processCallback = null,
	$args = [],
	$extratags = [],
	$removetags = []
) {
	wfDeprecated( __METHOD__, '1.38' );

	$cleaned = self::internalRemoveHtmlTags(
		$text, $processCallback, $args, $extratags, $removetags
	);
	return $cleaned;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments; BEWARE there may be unmatched HTML
 * tags in the result.
 *
 * The text is split on '<'. A piece that parses as a recognized tag is
 * re-emitted with its attributes run through fixTagAttributes(); any
 * other piece is emitted as text, with its '<' and '>' escaped.
 *
 * @note Callers are recommended to use `::removeSomeTags()` instead
 * of this method. `Sanitizer::removeSomeTags()` is safer and will
 * always return well-formed HTML; however, it is significantly
 * slower (especially for short strings where setup costs
 * predominate). This method is for internal use by the legacy parser
 * where we know the result will be cleaned up in a subsequent tidy pass.
 *
 * @param string $text Original string; see T268353 for why untainted.
 * @param-taint $text none
 * @param callable|null $processCallback Callback to do any variable or
 *   parameter replacements in HTML attribute values.
 *   This argument should be considered @internal.
 * @param-taint $processCallback exec_shell
 * @param array|bool $args Arguments for the processing callback
 * @param-taint $args none
 * @param array $extratags For any extra tags to include
 * @param-taint $extratags tainted
 * @param array $removetags For any tags (default or extra) to exclude
 * @param-taint $removetags none
 * @return string
 * @return-taint escaped
 * @internal
 */
public static function internalRemoveHtmlTags(
	$text,
	$processCallback = null,
	$args = [],
	$extratags = [],
	$removetags = []
) {
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$htmlsingle = $tagData['htmlsingle'];
	$htmlsingleonly = $tagData['htmlsingleonly'];
	$htmlelements = $tagData['htmlelements'];

	# Remove HTML comments
	$chunks = explode( '<', self::removeHTMLcomments( $text ) );
	// Everything before the first '<' is plain text.
	$out = str_replace( '>', '&gt;', array_shift( $chunks ) );

	# this might be possible using remex tidy itself
	foreach ( $chunks as $chunk ) {
		if ( !preg_match( self::ELEMENT_BITS_REGEX, $chunk, $parts ) ) {
			// Not tag-shaped at all: keep it as literal text.
			$out .= '&lt;' . str_replace( '>', '&gt;', $chunk );
			continue;
		}

		[ /* whole match */, $slash, $tagName, $params, $brace, $rest ] = $parts;
		$tagName = strtolower( $tagName );

		if ( !isset( $htmlelements[$tagName] ) ) {
			// Unrecognized element: keep it as literal text.
			$out .= '&lt;' . str_replace( '>', '&gt;', $chunk );
			continue;
		}

		if ( is_callable( $processCallback ) ) {
			// $params is passed by reference so the callback can rewrite it.
			call_user_func_array( $processCallback, [ &$params, $args ] );
		}

		if ( $brace == '/>' && !isset( $htmlsingle[$tagName] ) && !isset( $htmlsingleonly[$tagName] ) ) {
			// Remove the self-closing slash, to be consistent
			// with HTML5 semantics. T134423
			$brace = '>';
		}

		$tagIsValid = self::validateTag( $params, $tagName );
		$newparams = self::fixTagAttributes( $params, $tagName );

		if ( !$tagIsValid ) {
			// Recognized element that fails the attribute requirements.
			$out .= '&lt;' . str_replace( '>', '&gt;', $chunk );
			continue;
		}

		if ( $brace === '/>' && !isset( $htmlsingleonly[$tagName] ) ) {
			# Interpret self-closing tags as empty tags even when
			# HTML 5 would interpret them as start tags. Such input
			# is commonly seen on Wikimedia wikis with this intention.
			$brace = "></$tagName>";
		}

		$rest = str_replace( '>', '&gt;', $rest );
		$out .= "<$slash$tagName$newparams$brace$rest";
	}

	return $out;
}
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments; the result will always be balanced and
 * tidy HTML.
 *
 * The text is tokenized with RemexHtml as a fragment of a <body>
 * element; a RemexRemoveTagHandler sits between the tokenizer and the
 * tree builder, and the serializer's output is returned.
 *
 * @param string $text Source string; see T268353 for why untainted
 * @param-taint $text none
 * @param array $options Options controlling the cleanup:
 *   string[] $options['extraTags'] Any extra tags to allow
 *     (This property taints the whole array.)
 *   string[] $options['removeTags'] Any tags (default or extra) to exclude
 *   callable(Attributes,...):Attributes $options['attrCallback'] Callback
 *     to do any variable or parameter replacements in HTML attribute
 *     values before further cleanup; should be considered @internal
 *     and not for external use.
 *   array $options['attrCallbackArgs'] Additional arguments for the
 *     attribute callback
 * @param-taint $options tainted
 * @return string The cleaned up HTML
 * @return-taint escaped
 * @since 1.38
 */
public static function removeSomeTags(
	string $text, array $options = []
): string {
	// This disallows HTML5-style "missing trailing semicolon" attributes
	// In wikitext "clean&copy" does *not* contain an entity.
	$text = self::normalizeCharReferences( $text );

	$tagData = self::getRecognizedTagData(
		$options['extraTags'] ?? [],
		$options['removeTags'] ?? []
	);

	// Use RemexHtml to tokenize $text and remove the barred tags
	$serializer = new RemexSerializer( new RemexCompatFormatter );
	$treeBuilder = new RemexTreeBuilder( $serializer, [
		'ignoreErrors' => true,
		'ignoreNulls' => true,
	] );
	$dispatcher = new RemexDispatcher( $treeBuilder );

	$remover = new RemexRemoveTagHandler(
		$dispatcher,
		$text,
		$tagData,
		// These options are @internal:
		$options['attrCallback'] ?? null,
		$options['attrCallbackArgs'] ?? []
	);

	$tokenizer = new RemexTokenizer( $remover, $text, [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	] );
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'body',
	] );

	return $serializer->getResult();
}
/**
 * Remove '<!--', '-->', and everything between.
 *
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * An unterminated comment, and everything after it, is left untouched.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	$open = strpos( $text, '<!--' );
	while ( $open !== false ) {
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		// Move past the closing '-->'.
		$close += 3;

		# Widen the span over the spaces on both sides of the comment,
		# to see whether the comment sits on a line of its own.
		$left = max( $open - 1, 0 );
		$span = $close - $left;
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
			$span++;
		}
		while ( substr( $text, $left + $span, 1 ) === ' ' ) {
			$span++;
		}

		$onOwnLine = substr( $text, $left, 1 ) === "\n"
			&& substr( $text, $left + $span, 1 ) === "\n";

		$text = $onOwnLine
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			? substr_replace( $text, "\n", $left, $span + 1 )
			# Remove just the comment.
			: substr_replace( $text, '', $open, $close - $open );

		$open = strpos( $text, '<!--' );
	}

	return $text;
}
/**
 * Decide whether a tag may be present at all, given its raw attributes.
 *
 * This DOES NOT validate the attributes, nor does it validate the
 * tags themselves. It only covers the special cases where a tag is
 * allowed within content ONLY when it has specific attributes set:
 * <meta> needs itemprop and content, <link> needs itemprop and href.
 * Every other element passes.
 *
 * @param string $params
 * @param string $element
 * @return bool
 *
 * @see RemexRemoveTagHandler::validateTag()
 */
private static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	if ( !isset( $attribs['itemprop'] ) ) {
		// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
		return false;
	}
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		// <meta> must have a content="" for the itemprop
		return false;
	}
	if ( $isLink && !isset( $attribs['href'] ) ) {
		// <link> must have an associated href=""
		return false;
	}

	return true;
}
/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given element type.
 *
 * Looks up the attributes allowed for $element and delegates to
 * validateAttributes(), which:
 * - Discards attributes not allowed for the given element
 * - Discards unsafe style attributes
 * - Re-encodes invalid id attributes
 *
 * @param array $attribs
 * @param string $element
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
public static function validateTagAttributes( $attribs, $element ) {
	$allowedForElement = self::attributesAllowedInternal( $element );

	return self::validateAttributes( $attribs, $allowedForElement );
}
/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values.
 *
 * - Discards attributes not on the given list (xmlns:* and non-reserved
 *   data-* attributes are accepted without being listed)
 * - Unsafe style attributes are discarded
 * - Invalid id attributes are re-encoded
 * - href/src/poster values must start with an allowed URL protocol
 * - tabindex is kept only when it is '0'
 *
 * @param array $attribs
 * @param array $allowed List of allowed attribute names,
 *   as an associative array where keys give valid attribute names
 *   (since 1.34). Before 1.35, passing a sequential array of
 *   valid attribute names was permitted but that is now deprecated.
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
public static function validateAttributes( $attribs, $allowed ) {
	if ( isset( $allowed[0] ) ) {
		// Calling this function with a sequential array is
		// deprecated. For now just convert it.
		wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
		$allowed = array_fill_keys( $allowed, true );
	}
	$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	// Attributes whose value is a list of HTML ids.
	$idListAttribs = [
		'aria-describedby' => true,
		'aria-flowto' => true,
		'aria-labelledby' => true,
		'aria-owns' => true,
	];
	// RDFa and microdata properties allow URLs, URIs and/or CURIs.
	$uriLikeAttribs = [
		'rel' => true, 'rev' => true,
		# RDFa
		'about' => true, 'property' => true,
		'resource' => true, 'datatype' => true,
		'typeof' => true,
		# HTML5 microdata
		'itemid' => true, 'itemprop' => true,
		'itemref' => true, 'itemscope' => true,
		'itemtype' => true,
	];

	$out = [];
	foreach ( $attribs as $name => $value ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$out[$name] = $value;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * Disallow data attributes used by MediaWiki code
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isPlainDataAttr = preg_match( '/^data-[^:]*$/i', $name );
		if ( !$isPlainDataAttr && !array_key_exists( $name, $allowed ) ) {
			continue;
		}
		if ( self::isReservedDataAttribute( $name ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $name == 'style' ) {
			$value = self::checkCss( $value );
		}

		# Escape HTML id attributes
		if ( $name === 'id' ) {
			$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( isset( $idListAttribs[$name] ) ) {
			$value = self::escapeIdReferenceListInternal( $value );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( isset( $uriLikeAttribs[$name] ) && preg_match( self::EVIL_URI_PATTERN, $value ) ) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc
		$isUrlAttr = ( $name === 'href' || $name === 'src' || $name === 'poster' );
		if ( $isUrlAttr && !preg_match( $hrefExp, $value ) ) {
			// drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// Only allow tabindex of 0, which is useful for accessibility.
		if ( $name === 'tabindex' && $value !== '0' ) {
			continue;
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$name] = $value;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
/**
 * Tell whether an attribute name is a reserved data attribute
 * (such as data-mw-foo). Reserved names are unavailable to user-generated
 * HTML so MediaWiki core and extension code can safely use them to
 * communicate with frontend code.
 *
 * The check is case-insensitive and is a prefix match on
 * "data-ooui", "data-mw" and "data-parsoid".
 *
 * @param string $attr Attribute name.
 * @return bool
 */
public static function isReservedDataAttribute( $attr ) {
	// data-ooui is reserved for ooui.
	// data-mw and data-parsoid are reserved for parsoid.
	// data-mw-<name here> is reserved for extensions (or core) if
	// they need to communicate some data to the client and want to be
	// sure that it isn't coming from an untrusted user.
	// We ignore the possibility of namespaces since user-generated HTML
	// can't use them anymore.
	$reservedPrefix = '/^data-(ooui|mw|parsoid)/i';

	return preg_match( $reservedPrefix, $attr ) === 1;
}
/**
 * Merge two sets of HTML attributes. Conflicting items in the second set
 * will override those in the first, except for 'class' attributes which
 * will be combined (if they're both strings).
 *
 * When the two class strings differ, the result holds each class name
 * once, in first-seen order, separated by single spaces.
 *
 * @todo implement merging for other attributes such as style
 * @param array $a
 * @param array $b
 * @return array
 */
public static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$classA = $a['class'] ?? null;
	$classB = $b['class'] ?? null;
	if ( !is_string( $classA ) || !is_string( $classB ) || $classA === $classB ) {
		// Nothing to combine: the plain merge already has the right value.
		return $merged;
	}

	$names = preg_split( '/\s+/', "{$classA} {$classB}", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
/**
 * Normalize CSS into a format we can easily search for hostile input
 *  - decode character references
 *  - decode escape sequences
 *  - remove comments, unless the entire value is one single comment
 *
 * @param string $value the css string
 * @return string normalized css
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$css = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$css = preg_replace_callback(
		$decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ],
		$css
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $css ) ) {
		return $css;
	}

	// Remove any comments; IE gets token splitting wrong
	// This must be done AFTER decoding character references and
	// escape sequences, because those steps can introduce comments
	// This step cannot introduce character references or escape
	// sequences, because it replaces comments with spaces rather
	// than removing them completely.
	$css = StringUtils::delimiterReplace( '/*', '*/', ' ', $css );

	// Remove anything after a comment-start token, to guard against
	// incorrect client implementations.
	$danglingComment = strpos( $css, '/*' );
	if ( $danglingComment !== false ) {
		$css = substr( $css, 0, $danglingComment );
	}

	return $css;
}
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string. This sanitized string will have
 * character references and escape sequences decoded and comments
 * stripped (unless it is itself one valid comment, in which case the value
 * will be passed through). If the input is just too evil, only a comment
 * complaining about evilness will be returned.
 *
 * Rejected outright: control characters, the UTF-8 replacement
 * character, 'expression', the 'filter', 'accelerator', '-o-link',
 * '-o-link-source' and '-o-replace' properties, 'url(', 'image(',
 * 'image-set(' and 'attr(' calls that mention 'url'.
 *
 * NOTE: Despite the fact that character references are decoded, the
 * returned string may contain character references given certain
 * clever input strings. These character references must
 * be escaped before the return value is embedded in HTML.
 *
 * @param string $value
 * @return string
 */
public static function checkCss( $value ) {
	$css = self::normalizeCss( $value );

	// Reject problematic keywords and control characters
	$hasControlChar = preg_match( '/[\000-\010\013\016-\037\177]/', $css )
		|| strpos( $css, \UtfNormal\Constants::UTF8_REPLACEMENT ) !== false;
	if ( $hasControlChar ) {
		return '/* invalid control char */';
	}

	$insecurePattern =
		'! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
			| attr\s*\([^)]+[\s,]+url
		!ix';
	if ( preg_match( $insecurePattern, $css ) ) {
		return '/* insecure input */';
	}

	return $css;
}
/**
 * preg_replace_callback() handler for the escape-sequence pattern built
 * in normalizeCss(): turns one CSS backslash escape into its decoded form.
 *
 * A line continuation decodes to nothing. A newline, quote or backslash
 * is re-emitted as a hex escape followed by a space; any other character
 * is returned as-is.
 *
 * @param array $matches
 * @return string
 */
private static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation
		return '';
	}

	if ( $matches[2] !== '' ) {
		# hexdec could return a float if the match is too long, but the
		# regexp in question limits the string length to 6.
		$decoded = \UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// These characters need to be escaped in strings
	$mustStayEscaped = [ "\n", '"', "'", '\\' ];
	if ( in_array( $decoded, $mustStayEscaped ) ) {
		// Clean up the escape sequence to avoid parsing errors by clients
		return '\\' . dechex( ord( $decoded ) ) . ' ';
	}

	// Decode unnecessary escape
	return $decoded;
}
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 * Output is safe for further wikitext processing, with escaping of
 * values that could trigger problems.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not allowed for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 * - (Optionally) Sorts attributes by name.
 *
 * Blank input yields an empty string.
 *
 * @param string $text
 * @param string $element
 * @param bool $sorted Whether to sort the attributes (default: false)
 * @return string
 */
public static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) === '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);

	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
/**
 * Encode an attribute value for HTML output.
 *
 * HTML special characters and both quote styles are escaped, then
 * newline, carriage return and tab are turned into numeric character
 * references.
 *
 * @param string $text
 * @param-taint $text escapes_html
 * @return string HTML-encoded text fragment
 * @return-taint escaped
 */
public static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		$escaped
	);
}
/**
 * Armor French spaces with a replacement character
 *
 * A space is replaced when it comes before one of ? : ; ! % » › that is
 * not itself followed by a word character, or when it comes directly
 * after « or ‹.
 *
 * @since 1.32
 * @param string $text Text to armor
 * @param string $space Space character for the French spaces, defaults to '&#160;'
 * @return string Armored text
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\
	$replacement = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );

	$patterns = [
		# French spaces, last one Guillemet-left
		# only if it isn't followed by a word character.
		'/ (?=[?:;!%»›](?!\w))/u',
		# French spaces, Guillemet-right
		'/([«‹]) /u',
	];
	$replacements = [
		"$replacement",
		"\\1$replacement",
	];

	return preg_replace( $patterns, $replacements, $text );
}
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * After the plain HTML encoding, characters and keywords that wikitext
 * processing reacts to are turned into character references, and the
 * colon of any recognized URL protocol is encoded as well.
 *
 * @param string $text
 * @param-taint $text escapes_html
 * @return string HTML-encoded text fragment
 * @return-taint escaped
 */
public static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextArmor = [
		'<'    => '&lt;', // This should never happen,
		'>'    => '&gt;', // we've received invalid input
		'"'    => '&quot;', // which should have been escaped.
		'{'    => '&#123;',
		'}'    => '&#125;', // prevent unpaired language conversion syntax
		'['    => '&#91;',
		']'    => '&#93;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	];
	$armored = strtr( self::encodeAttribute( $text ), $wikitextArmor );

	# Stupid hack
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	$encodeColon = static function ( $hit ) {
		return str_replace( ':', '&#58;', $hit[1] );
	};

	return preg_replace_callback( $protocolPattern, $encodeColon, $armored );
}
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid HTML id attribute.
 *
 * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 * proper escaping.
 *
 * @param string $id String to escape
 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 *     should be used.
 * @return string|false Escaped ID or false if fallback encoding is requested but it's not
 *     configured.
 * @throws UnexpectedValueException If $wgFragmentMode has no entry for the primary mode
 *
 * @since 1.30
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
    global $wgFragmentMode;

    if ( isset( $wgFragmentMode[$mode] ) ) {
        return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
    }

    // Only the primary mode is mandatory; any other missing mode
    // is reported to the caller as false.
    if ( $mode !== self::ID_PRIMARY ) {
        return false;
    }

    throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
}
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment.
 *
 * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 * proper escaping.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 * @throws UnexpectedValueException If $wgFragmentMode has no entry for the primary mode
 *
 * @since 1.30
 */
public static function escapeIdForLink( $id ) {
    global $wgFragmentMode;

    $primaryMode = $wgFragmentMode[self::ID_PRIMARY] ?? null;
    if ( $primaryMode === null ) {
        throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
    }

    return self::escapeIdInternalUrl( $id, $primaryMode );
}
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment for external interwikis.
 *
 * The encoding mode is read from $wgExternalInterwikiFragmentMode.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForExternalInterwiki( $id ) {
    global $wgExternalInterwikiFragmentMode;

    return self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );
}
/**
 * Do percent encoding of percent signs for href (but not id) attributes
 *
 * @since 1.35
 * @see https://phabricator.wikimedia.org/T238385
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 */
private static function escapeIdInternalUrl( $id, $mode ) {
    $escaped = self::escapeIdInternal( $id, $mode );
    if ( $mode !== 'html5' ) {
        return $escaped;
    }

    // In html5 mode, anything that looks like a percent-escape (% followed
    // by two hex digits) gets its percent sign encoded as %25.
    return preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $escaped );
}
/**
 * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
 *
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 * @throws InvalidArgumentException If $mode is neither 'html5' nor 'legacy'
 */
private static function escapeIdInternal( $id, $mode ) {
    // Truncate overly-long IDs. This isn't an HTML limit, it's just
    // griefer protection. [T251506]
    $id = mb_substr( $id, 0, 1024 );

    switch ( $mode ) {
        case 'html5':
            // html5 spec says ids must not have any of the following:
            // U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
            // In practice, in wikitext, only tab, LF, CR (and SPACE) are
            // possible using either Lua or html entities.
            $id = str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
            break;
        case 'legacy':
            // This corresponds to 'noninitial' mode of the former escapeId()
            static $replace = [
                '%3A' => ':',
                '%' => '.'
            ];

            $id = urlencode( str_replace( ' ', '_', $id ) );
            $id = strtr( $id, $replace );
            break;
        default:
            // The quote opened before the method name is now closed again.
            throw new InvalidArgumentException(
                "Invalid mode '$mode' passed to '" . __METHOD__ . "'"
            );
    }

    return $id;
}
/**
 * Given a string containing a space delimited list of ids, escape each id
 * to match ids escaped by the escapeIdForAttribute() function.
 *
 * @param string $referenceString Space delimited list of ids
 * @return string Space delimited list of escaped ids; '' if the input
 *     contains no tokens
 */
private static function escapeIdReferenceListInternal( $referenceString ) {
    # Explode the space delimited list string into an array of tokens
    $references = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
    if ( !$references ) {
        # Nothing to escape (no tokens, or the split failed)
        return '';
    }

    # Escape each token as an id. array_map() is used instead of a
    # by-reference foreach so that no dangling reference is left behind.
    $escaped = array_map(
        static fn ( $ref ) => self::escapeIdForAttribute( $ref ),
        $references
    );

    # Merge the array back to a space delimited list string
    return implode( ' ', $escaped );
}
/**
 * Given a value, escape it so that it can be used as a CSS class and
 * return it.
 *
 * @todo For extra validity, input should be validated UTF-8.
 *
 * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 *
 * @param string $class
 * @return string
 */
public static function escapeClass( $class ) {
    $patterns = [
        // A leading digit or hyphen, control characters and space,
        // ASCII punctuation, and the byte pair C2 A0
        '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
        // Runs of underscores (including those just introduced)
        '/_+/',
    ];

    // Convert ugly stuff to underscores and kill underscores in ugly places
    $underscored = preg_replace( $patterns, '_', $class );

    return rtrim( $underscored, '_' );
}
/**
 * Given HTML input, escape with htmlspecialchars but un-escape entities.
 * This allows (generally harmless) entities like &#160; to survive.
 *
 * @param string $html HTML to escape
 * @param-taint $html escapes_htmlnoent
 * @return string Escaped input
 * @return-taint escaped
 */
public static function escapeHtmlAllowEntities( $html ) {
    $decoded = self::decodeCharReferences( $html );

    # It seems wise to escape ' as well as ", as a matter of course. Can't
    # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
    # don't cause the entire string to disappear.
    return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * character references are decoded to UTF-8 text.
 *
 * @param string $text
 * @return array
 */
public static function decodeTagAttributes( $text ) {
    $attribs = [];
    if ( trim( $text ) == '' ) {
        return $attribs;
    }

    $sets = [];
    $found = preg_match_all( self::getAttribsRegex(), $text, $sets, PREG_SET_ORDER );
    if ( !$found ) {
        return $attribs;
    }

    foreach ( $sets as $set ) {
        $name = strtolower( $set[1] );

        // Only keep attribute names made of acceptable characters
        if ( preg_match( self::getAttribNameRegex(), $name ) ) {
            $raw = self::getTagAttributeCallback( $set );

            // Normalize whitespace, then decode character references
            $collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );
            $attribs[$name] = self::decodeCharReferences( $collapsed );
        }
    }

    return $attribs;
}
/**
 * Build a partial tag string from an associative array of attribute
 * names and values as returned by decodeTagAttributes.
 *
 * @param array $assoc_array
 * @return string Either '' or a string of name="value" pairs, each
 *     preceded by a single space
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
    $pairs = [];
    foreach ( $assoc_array as $attribute => $value ) {
        $pairs[] = htmlspecialchars( $attribute, ENT_COMPAT )
            . '="' . self::safeEncodeAttribute( $value ) . '"';
    }

    if ( !$pairs ) {
        return '';
    }

    return ' ' . implode( ' ', $pairs );
}
/**
 * Pick the appropriate attribute value from a match set from the
 * attribs regex matches.
 *
 * @param array $set
 * @return string
 * @throws LogicException If group 2 is set but none of groups 3-5 is
 */
private static function getTagAttributeCallback( $set ) {
    // Highest group wins: 5 = no quotes, 4 = single-quoted, 3 = double-quoted
    foreach ( [ 5, 4, 3 ] as $group ) {
        if ( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if ( isset( $set[2] ) ) {
        throw new LogicException( "Tag conditions not met. This should never happen and is a bug." );
    }

    # In XHTML, attributes must have a value so return an empty string.
    # See "Empty attribute syntax",
    # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
    return "";
}
/**
 * Collapse every run of spaces, tabs, carriage returns and line feeds
 * into a single space, then trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
    $collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

    return trim( $collapsed );
}
/**
 * Normalizes whitespace in a section name, such as might be returned
 * by Parser::stripSectionName(), for use in the id's that are used for
 * section links.
 *
 * Runs of spaces and underscores become a single space and the result
 * is trimmed.
 *
 * @param string $section
 * @return string
 */
public static function normalizeSectionNameWhitespace( $section ) {
    $singleSpaced = preg_replace( '/[ _]+/', ' ', $section );

    return trim( $singleSpaced );
}
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
 *   numericized (this way we're well-formed even without a DTD)
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use lower cased "&#x", not "&#X"
 * d. fix or reject non-valid attributes
 *
 * @param string $text
 * @return string
 * @internal
 */
public static function normalizeCharReferences( $text ) {
    $normalizer = [ self::class, 'normalizeCharReferencesCallback' ];

    return preg_replace_callback( self::CHAR_REFS_REGEX, $normalizer, $text );
}
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Dispatches on which capture group is non-empty: 1 is passed to
 * normalizeEntity(), 2 to decCharReference(), 3 to hexCharReference().
 *
 * @param array $matches Match array for self::CHAR_REFS_REGEX
 * @return string The normalized reference, or the HTML-escaped original
 *     match when no group matched or the helper returned null
 */
private static function normalizeCharReferencesCallback( $matches ) {
    $ret = null;
    if ( $matches[1] != '' ) {
        $ret = self::normalizeEntity( $matches[1] );
    } elseif ( $matches[2] != '' ) {
        $ret = self::decCharReference( $matches[2] );
    } elseif ( $matches[3] != '' ) {
        $ret = self::hexCharReference( $matches[3] );
    }

    // A null result means the reference could not be normalized.
    return $ret ?? htmlspecialchars( $matches[0], ENT_COMPAT );
}
/**
 * If the named entity is defined in HTML5
 * return the equivalent numeric entity reference (except for the core &lt;
 * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
 * the HTML equivalent. Otherwise, returns HTML-escaped text of
 * pseudo-entity source (eg &amp;foo;)
 *
 * @param string $name Semicolon-terminated name
 * @return string
 */
private static function normalizeEntity( $name ) {
    // Non-standard MediaWiki-specific entities
    if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
        return '&' . self::MW_ENTITY_ALIASES[$name];
    }

    // Keep these in word form
    if ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
        return "&$name";
    }

    if ( !isset( HTMLData::$namedEntityTranslations[$name] ) ) {
        return "&amp;$name";
    }

    // Beware: some entities expand to more than 1 codepoint, so every
    // character of the translation gets its own numeric reference.
    $toNumericRef = static function ( $m ) {
        return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
    };

    return preg_replace_callback( '/./Ssu', $toNumericRef, HTMLData::$namedEntityTranslations[$name] );
}
/**
 * Normalize a decimal character reference.
 *
 * @param int|string $codepoint
 * @return null|string "&#N;" for a codepoint accepted by
 *     validateCodepoint(), null otherwise
 */
private static function decCharReference( $codepoint ) {
    # intval() will (safely) saturate at the maximum signed integer
    # value if $codepoint is too many digits
    $point = intval( $codepoint );

    return self::validateCodepoint( $point ) ? sprintf( '&#%d;', $point ) : null;
}
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return null|string "&#xN;" (lower case hex) for a codepoint accepted
 *     by validateCodepoint(), null otherwise
 */
private static function hexCharReference( $codepoint ) {
    # hexdec() will return a float (not an int) if $codepoint is too
    # long, so protect against that. The largest valid codepoint is
    # 0x10FFFF.
    $significantDigits = strlen( ltrim( $codepoint, '0' ) );
    if ( $significantDigits > 6 ) {
        return null;
    }

    $point = hexdec( $codepoint );

    return self::validateCodepoint( $point ) ? sprintf( '&#x%x;', $point ) : null;
}
/**
 * Returns true if a given Unicode codepoint is a valid character in
 * both HTML5 and XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
    # U+000C is valid in HTML5 but not allowed in XML.
    # U+000D is valid in XML but not allowed in HTML5.
    # U+007F - U+009F are disallowed in HTML5 (control characters).
    if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
        return true;
    }

    // Inclusive [low, high] ranges of acceptable codepoints
    $validRanges = [
        [ 0x20, 0x7e ],
        [ 0xa0, 0xd7ff ],
        [ 0xe000, 0xfffd ],
        [ 0x10000, 0x10ffff ],
    ];
    foreach ( $validRanges as [ $low, $high ] ) {
        if ( $codepoint >= $low && $codepoint <= $high ) {
            return true;
        }
    }

    return false;
}
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
    $decoder = [ self::class, 'decodeCharReferencesCallback' ];

    return preg_replace_callback( self::CHAR_REFS_REGEX, $decoder, $text );
}
/**
 * Decode any character references, numeric or named entities,
 * in the text and normalize the resulting string. (T16952)
 *
 * This is useful for page titles, not for text to be displayed,
 * MediaWiki allows HTML entities to escape normalization as a feature.
 *
 * @param string $text Already normalized, containing entities
 * @return string Still normalized, without entities
 */
public static function decodeCharReferencesAndNormalize( $text ) {
    $count = 0;
    $decoded = preg_replace_callback(
        self::CHAR_REFS_REGEX,
        [ self::class, 'decodeCharReferencesCallback' ],
        $text,
        -1, // limit
        $count
    );

    // The content language normalizer is only invoked when at least
    // one replacement was actually performed.
    if ( !$count ) {
        return $decoded;
    }

    return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
/**
 * preg_replace_callback() handler for decodeCharReferences() and
 * decodeCharReferencesAndNormalize().
 *
 * @param array $matches Match array for self::CHAR_REFS_REGEX
 * @return string Result of decodeEntity() for group 1, of decodeChar()
 *     for groups 2 and 3, the UTF-8 replacement character for an
 *     over-long hex reference, or the unchanged match otherwise
 */
private static function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        return self::decodeEntity( $matches[1] );
    }

    if ( $matches[2] != '' ) {
        return self::decodeChar( intval( $matches[2] ) );
    }

    if ( $matches[3] != '' ) {
        # hexdec will return a float if the string is too long (!) so
        # check the length of the string first.
        if ( strlen( ltrim( $matches[3], '0' ) ) > 6 ) {
            // Invalid character reference.
            return \UtfNormal\Constants::UTF8_REPLACEMENT;
        }
        return self::decodeChar( hexdec( $matches[3] ) );
    }

    # Last case should be an ampersand by itself
    return $matches[0];
}
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 * @internal
 */
private static function decodeChar( $codepoint ) {
    if ( !self::validateCodepoint( $codepoint ) ) {
        return \UtfNormal\Constants::UTF8_REPLACEMENT;
    }

    return \UtfNormal\Utils::codepointToUtf8( $codepoint );
}
/**
 * If the named entity is defined in HTML5
 * return the UTF-8 encoding of that character. Otherwise, returns
 * pseudo-entity source (eg "&foo;")
 *
 * @param string $name Semicolon-terminated entity name
 * @return string
 */
private static function decodeEntity( $name ) {
    // MediaWiki-specific entities (not in the HTML standard) are first
    // mapped to the name they alias.
    $canonical = self::MW_ENTITY_ALIASES[$name] ?? $name;

    return HTMLData::$namedEntityTranslations[$canonical] ?? "&$canonical";
}
/**
 * Fetch the list of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array An associative array where keys are acceptable attribute
 *     names; empty when the element has no entry
 */
private static function attributesAllowedInternal( $element ) {
    $allowedByElement = self::setupAttributesAllowedInternal();

    return $allowedByElement[$element] ?? [];
}
/**
 * Foreach array key (an allowed HTML element), return an array
 * of allowed attributes.
 *
 * The table is built on the first call and kept in a function-static
 * variable; later calls return that same table.
 *
 * @return array An associative array: keys are HTML element names;
 *   values are associative arrays where the keys are allowed attribute
 *   names.
 */
private static function setupAttributesAllowedInternal() {
    static $allowed;

    if ( $allowed !== null ) {
        return $allowed;
    }

    // For lookup efficiency flip each attributes array so the keys are
    // the valid attributes. $a is an already-flipped map that must be
    // carried over; $b and $c are plain lists of attribute names.
    $merge = static function ( $a, $b, $c = [] ) {
        return array_merge(
            $a,
            array_fill_keys( $b, true ),
            array_fill_keys( $c, true ) );
    };
    $common = $merge( [], [
        # HTML
        'id',
        'class',
        'style',
        'lang',
        'dir',
        'title',
        'tabindex',

        # WAI-ARIA
        'aria-describedby',
        'aria-flowto',
        'aria-hidden',
        'aria-label',
        'aria-labelledby',
        'aria-level',
        'aria-owns',
        'role',

        # RDFa
        # These attributes are specified in section 9 of
        # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
        'about',
        'property',
        'resource',
        'datatype',
        'typeof',

        # Microdata. These are specified by
        # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
        'itemid',
        'itemprop',
        'itemref',
        'itemscope',
        'itemtype',
    ] );

    $block = $merge( $common, [ 'align' ] );

    $tablealign = [ 'align', 'valign' ];
    $tablecell = [
        'abbr',
        'axis',
        'headers',
        'scope',
        'rowspan',
        'colspan',
        'nowrap', # deprecated
        'width', # deprecated
        'height', # deprecated
        'bgcolor', # deprecated
    ];

    # Numbers refer to sections in HTML 4.01 standard describing the element.
    # See: https://www.w3.org/TR/html4/
    $allowed = [
        # 7.5.4
        'div' => $block,
        'center' => $common, # deprecated
        'span' => $common,

        # 7.5.5
        'h1' => $block,
        'h2' => $block,
        'h3' => $block,
        'h4' => $block,
        'h5' => $block,
        'h6' => $block,

        # 7.5.6
        # address

        # 8.2.4
        'bdo' => $common,

        # 9.2.1
        'em' => $common,
        'strong' => $common,
        'cite' => $common,
        'dfn' => $common,
        'code' => $common,
        'samp' => $common,
        'kbd' => $common,
        'var' => $common,
        'abbr' => $common,
        # acronym

        # 9.2.2
        'blockquote' => $merge( $common, [ 'cite' ] ),
        'q' => $merge( $common, [ 'cite' ] ),

        # 9.2.3
        'sub' => $common,
        'sup' => $common,

        # 9.3.1
        'p' => $block,

        # 9.3.2
        'br' => $merge( $common, [ 'clear' ] ),

        # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
        'wbr' => $common,

        # 9.3.4
        'pre' => $merge( $common, [ 'width' ] ),

        # 9.4
        'ins' => $merge( $common, [ 'cite', 'datetime' ] ),
        'del' => $merge( $common, [ 'cite', 'datetime' ] ),

        # 10.2
        'ul' => $merge( $common, [ 'type' ] ),
        'ol' => $merge( $common, [ 'type', 'start', 'reversed' ] ),
        'li' => $merge( $common, [ 'type', 'value' ] ),

        # 10.3
        'dl' => $common,
        'dd' => $common,
        'dt' => $common,

        # 11.2.1
        'table' => $merge( $common,
            [ 'summary', 'width', 'border', 'frame',
                'rules', 'cellspacing', 'cellpadding',
                'align', 'bgcolor',
            ] ),

        # 11.2.2
        'caption' => $block,

        # 11.2.3
        'thead' => $common,
        'tfoot' => $common,
        'tbody' => $common,

        # 11.2.4
        'colgroup' => $merge( $common, [ 'span' ] ),
        'col' => $merge( $common, [ 'span' ] ),

        # 11.2.5
        'tr' => $merge( $common, [ 'bgcolor' ], $tablealign ),

        # 11.2.6
        'td' => $merge( $common, $tablecell, $tablealign ),
        'th' => $merge( $common, $tablecell, $tablealign ),

        # 12.2
        # NOTE: <a> is not allowed directly, but this list of allowed
        # attributes is used from the Parser object
        'a' => $merge( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

        # 13.2
        # Not usually allowed, but may be used for extension-style hooks
        # such as <math> when it is rasterized, or if $wgAllowImageTag is
        # true
        'img' => $merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
        # Attributes for A/V tags added in T163583 / T133673
        'audio' => $merge( $common, [ 'controls', 'preload', 'width', 'height' ] ),
        'video' => $merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
        'source' => $merge( $common, [ 'type', 'src' ] ),
        'track' => $merge( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

        # 15.2.1
        'tt' => $common,
        'b' => $common,
        'i' => $common,
        'big' => $common,
        'small' => $common,
        'strike' => $common,
        's' => $common,
        'u' => $common,

        # 15.2.2
        'font' => $merge( $common, [ 'size', 'color', 'face' ] ),
        # basefont

        # 15.3
        'hr' => $merge( $common, [ 'width' ] ),

        # HTML Ruby annotation text module, simple ruby only.
        # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
        'ruby' => $common,
        # rbc
        'rb' => $common,
        'rp' => $common,
        'rt' => $common, # $merge( $common, [ 'rbspan' ] ),
        'rtc' => $common,

        # MathML root element, where used for extensions
        # 'title' may not be 100% valid here; it's XHTML
        # https://www.w3.org/TR/REC-MathML/
        'math' => $merge( [], [ 'class', 'style', 'id', 'title' ] ),

        // HTML 5 section 4.5
        'figure' => $common,
        'figcaption' => $common,

        # HTML 5 section 4.6
        'bdi' => $common,

        # HTML5 elements, defined by:
        # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
        'data' => $merge( $common, [ 'value' ] ),
        'time' => $merge( $common, [ 'datetime' ] ),
        'mark' => $common,

        // meta and link are only permitted by internalRemoveHtmlTags when Microdata
        // is enabled so we don't bother adding a conditional to hide these
        // Also meta and link are only valid in WikiText as Microdata elements
        // (ie: validateTag rejects tags missing the attributes needed for Microdata)
        // So we don't bother including $common attributes that have no purpose.
        'meta' => $merge( [], [ 'itemprop', 'content' ] ),
        'link' => $merge( [], [ 'itemprop', 'href', 'title' ] ),

        # HTML 5 section 4.3.5
        'aside' => $common,
    ];

    return $allowed;
}
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded as plain text.
 *
 * Warning: this return value must be further escaped for literal
 * inclusion in HTML output as of 1.10!
 *
 * @param string $html HTML fragment
 * @return string
 * @return-taint tainted
 */
public static function stripAllTags( $html ) {
    // Use RemexHtml to tokenize $html and extract the text
    $handler = new RemexStripTagHandler;
    $tokenizerOptions = [
        'ignoreErrors' => true,
        // don't ignore char refs, we want them to be decoded
        'ignoreNulls' => true,
        'skipPreprocess' => true,
    ];
    ( new RemexTokenizer( $handler, $html, $tokenizerOptions ) )->execute();

    return self::normalizeWhitespace( $handler->getResult() );
}
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * @return string
 * @deprecated since 1.36; will be made private or removed in a future
 *   release.
 */
public static function hackDocType() {
    $out = "<!DOCTYPE html [\n";
    foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
        if ( substr( $entity, -1 ) !== ';' ) {
            // Some HTML entities omit the trailing semicolon;
            // wikitext does not permit these.
            continue;
        }
        $name = substr( $entity, 0, -1 );
        $expansion = self::normalizeEntity( $entity );
        // normalizeEntity() returns entities it keeps in word form as
        // "&name;", whereas $entity has no leading ampersand. Compare
        // against the ampersand-prefixed form so the skip can take effect.
        if ( $expansion === "&$entity" ) {
            // Skip &lt; &gt; etc
            continue;
        }
        $out .= "<!ENTITY $name \"$expansion\">";
    }
    $out .= "]>\n";
    return $out;
}
/**
 * Clean up a URL: decode character references, percent-encode a set of
 * unsafe characters, and strip characters that are ignored in IDNs
 * from the host part.
 *
 * @param string $url
 * @return string The cleaned URL; when the URL does not match the
 *   protocol/host/rest pattern it is returned after the first two
 *   steps only
 */
public static function cleanUrl( $url ) {
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().
    $url = self::decodeCharReferences( $url );

    # Escape any control characters introduced by the above step
    $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
        [ self::class, 'cleanUrlCallback' ], $url );

    # Validate hostname portion
    $matches = [];
    if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
        return $url;
    }
    [ /* $whole */, $protocol, $host, $rest ] = $matches;

    // Characters that will be ignored in IDNs.
    // https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
    // https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
    // Strip them before further processing so deny lists and such work.
    // Note: the last alternative must not be followed by "|", otherwise
    // the pattern gains an empty alternative that matches everywhere.
    $strip = "/
        \\s|      # general whitespace
        \u{00AD}|               # SOFT HYPHEN
        \u{034F}|               # COMBINING GRAPHEME JOINER
        \u{061C}|               # ARABIC LETTER MARK
        [\u{115F}-\u{1160}]|    # HANGUL CHOSEONG FILLER..
                                # HANGUL JUNGSEONG FILLER
        [\u{17B4}-\u{17B5}]|    # KHMER VOWEL INHERENT AQ..
                                # KHMER VOWEL INHERENT AA
        [\u{180B}-\u{180D}]|    # MONGOLIAN FREE VARIATION SELECTOR ONE..
                                # MONGOLIAN FREE VARIATION SELECTOR THREE
        \u{180E}|               # MONGOLIAN VOWEL SEPARATOR
        [\u{200B}-\u{200F}]|    # ZERO WIDTH SPACE..
                                # RIGHT-TO-LEFT MARK
        [\u{202A}-\u{202E}]|    # LEFT-TO-RIGHT EMBEDDING..
                                # RIGHT-TO-LEFT OVERRIDE
        [\u{2060}-\u{2064}]|    # WORD JOINER..
                                # INVISIBLE PLUS
        \u{2065}|               # <reserved-2065>
        [\u{2066}-\u{206F}]|    # LEFT-TO-RIGHT ISOLATE..
                                # NOMINAL DIGIT SHAPES
        \u{3164}|               # HANGUL FILLER
        [\u{FE00}-\u{FE0F}]|    # VARIATION SELECTOR-1..
                                # VARIATION SELECTOR-16
        \u{FEFF}|               # ZERO WIDTH NO-BREAK SPACE
        \u{FFA0}|               # HALFWIDTH HANGUL FILLER
        [\u{FFF0}-\u{FFF8}]|    # <reserved-FFF0>..
                                # <reserved-FFF8>
        [\u{1BCA0}-\u{1BCA3}]|  # SHORTHAND FORMAT LETTER OVERLAP..
                                # SHORTHAND FORMAT UP STEP
        [\u{1D173}-\u{1D17A}]|  # MUSICAL SYMBOL BEGIN BEAM..
                                # MUSICAL SYMBOL END PHRASE
        \u{E0000}|              # <reserved-E0000>
        \u{E0001}|              # LANGUAGE TAG
        [\u{E0002}-\u{E001F}]|  # <reserved-E0002>..
                                # <reserved-E001F>
        [\u{E0020}-\u{E007F}]|  # TAG SPACE..
                                # CANCEL TAG
        [\u{E0080}-\u{E00FF}]|  # <reserved-E0080>..
                                # <reserved-E00FF>
        [\u{E0100}-\u{E01EF}]|  # VARIATION SELECTOR-17..
                                # VARIATION SELECTOR-256
        [\u{E01F0}-\u{E0FFF}]   # <reserved-E01F0>..
                                # <reserved-E0FFF>
        /xuD";

    $host = preg_replace( $strip, '', $host );

    // IPv6 host names are bracketed with []. Url-decode these.
    if ( str_starts_with( $host, "//%5B" ) &&
        preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches )
    ) {
        $host = '//[' . $matches[1] . ']' . $matches[2];
    }

    // @todo FIXME: Validate hostnames here

    return $protocol . $host . $rest;
}
/**
 * preg_replace_callback() handler used by cleanUrl(): URL-encodes the
 * matched text.
 *
 * @param array $matches
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
    $matchedText = $matches[0];

    return urlencode( $matchedText );
}
/**
 * Does a string look like an e-mail address?
 *
 * This validates an email address using an HTML5 specification found at:
 * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
 * Which as of 2011-01-24 says:
 *
 *   A valid e-mail address is a string that matches the ABNF production
 *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
 *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
 *   3.5.
 *
 * This function is an implementation of the specification as requested in
 * T24449.
 *
 * Client-side forms will use the same standard validation rules via JS or
 * HTML 5 validation; additional restrictions can be enforced server-side
 * by extensions via the 'isValidEmailAddr' hook.
 *
 * Note that this validation doesn't 100% match RFC 2822, but is believed
 * to be liberal enough for wide use. Some invalid addresses will still
 * pass validation here.
 *
 * @since 1.18
 *
 * @param string $addr E-mail address
 * @return bool The value a hook handler stored in its result argument
 *   when the hook aborts; otherwise whether $addr matches the pattern
 */
public static function validateEmail( $addr ) {
    $result = null;
    // TODO This method should be non-static, and have a HookRunner injected
    $hookRunner = new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
    if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
        return $result;
    }

    // Please note strings below are enclosed in brackets [], this make the
    // hyphen "-" a range indicator. Hence it is double backslashed below.
    // See T28948
    $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
    $rfc1034_ldh_str = "a-z0-9\\-";

    // The D modifier makes "$" match only at the very end of the string;
    // without it an address followed by a newline would be accepted.
    $html5_email_regexp = "/
    ^                      # start of string
    [$rfc5322_atext\\.]+    # user part which is liberal :p
    @                      # at sign
    [$rfc1034_ldh_str]+       # First domain part
    (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
    $                      # End of string
    /ixD"; // case Insensitive, eXtended, Dollar matches at end only

    return (bool)preg_match( $html5_email_regexp, $addr );
}
/**
 * Retain the old class name for backwards compatibility.
 * @deprecated since 1.41
 */
class_alias( Sanitizer::class, 'Sanitizer' );