includes/parser/Sanitizer.php

   1 <?php
   2 /**
   3  * HTML sanitizer for %MediaWiki.
   4  *
   5  * Copyright © 2002-2005 Brooke Vibber <bvibber@wikimedia.org> et al
   6  * https://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  27 namespace MediaWiki\Parser;
  28
  29 use InvalidArgumentException;
  30 use LogicException;
  31 use MediaWiki\HookContainer\HookRunner;
  32 use MediaWiki\MediaWikiServices;
  33 use MediaWiki\Tidy\RemexCompatFormatter;
  34 use StringUtils;
  35 use UnexpectedValueException;
  36 use Wikimedia\RemexHtml\HTMLData;
  37 use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
  38 use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
  39 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
  40 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
  41
  42 /**
  43  * HTML sanitizer for MediaWiki
  44  * @ingroup Parser
  45  */
  46 class Sanitizer {
  47         /**
  48          * Regular expression to match various types of character references in
  49          * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
  50          * Note that HTML5 allows some named entities to omit the trailing
  51          * semicolon; wikitext entities *must* have a trailing semicolon.
  52          */
  53         private const CHAR_REFS_REGEX =
  54                 '/&([A-Za-z0-9\x80-\xff]+;)
  55                 |&\#([0-9]+);
  56                 |&\#[xX]([0-9A-Fa-f]+);
  57                 |&/x';
  58
  59         /**
  60          * Acceptable tag name charset from HTML5 parsing spec
  61          * https://www.w3.org/TR/html5/syntax.html#tag-open-state
  62          */
  63         private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
  64
  65         /**
  66          * Pattern matching evil uris like javascript:
  67          * WARNING: DO NOT use this in any place that actually requires denying
  68          * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
  69          * pattern-based deny lists; the only way to be secure from javascript:
  70          * uri based xss vectors is to allow only things that you know are safe
  71          * and deny everything else.
  72          * [1]: http://ha.ckers.org/xss.html
  73          */
  74         private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
  75         private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
  76
  77         /**
   78          * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
  79          *
  80          * @since 1.30
  81          */
  82         public const ID_PRIMARY = 0;
  83
  84         /**
   85          * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
  86          * if no fallback is configured.
  87          *
  88          * @since 1.30
  89          */
  90         public const ID_FALLBACK = 1;
  91
  92         /**
  93          * Character entity aliases accepted by MediaWiki in wikitext.
  94          * These are not part of the HTML standard.
  95          */
  96         private const MW_ENTITY_ALIASES = [
  97                 'רלמ;' => 'rlm;',
  98                 'رلم;' => 'rlm;',
  99         ];
 100
 101         /**
 102          * Lazy-initialised attributes regex, see getAttribsRegex()
 103          */
 104         private static ?string $attribsRegex = null;
 105
 106         /**
 107          * Regular expression to match HTML/XML attribute pairs within a tag.
 108          * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 109          * Used in Sanitizer::decodeTagAttributes
 110          */
 111         private static function getAttribsRegex(): string {
 112                 if ( self::$attribsRegex === null ) {
 113                         $spaceChars = '\x09\x0a\x0c\x0d\x20';
 114                         $space = "[{$spaceChars}]";
 115                         $attrib = "[^{$spaceChars}\/>=]";
 116                         $attribFirst = "(?:{$attrib}|=)";
 117                         self::$attribsRegex =
 118                                 "/({$attribFirst}{$attrib}*)
 119                                         ($space*=$space*
 120                                         (?:
 121                                                 # The attribute value: quoted or alone
 122                                                 \"([^\"]*)(?:\"|\$)
 123                                                 | '([^']*)(?:'|\$)
 124                                                 | (((?!$space|>).)*)
 125                                         )
 126                                 )?/sxu";
 127                 }
 128                 return self::$attribsRegex;
 129         }
 130
 131         /**
 132          * Lazy-initialised attribute name regex, see getAttribNameRegex()
 133          */
 134         private static ?string $attribNameRegex = null;
 135
 136         /**
 137          * Used in Sanitizer::decodeTagAttributes to filter attributes.
 138          */
 139         private static function getAttribNameRegex(): string {
 140                 if ( self::$attribNameRegex === null ) {
 141                         $attribFirst = "[:_\p{L}\p{N}]";
 142                         $attrib = "[:_\.\-\p{L}\p{N}]";
 143                         self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
 144                 }
 145                 return self::$attribNameRegex;
 146         }
 147
 148         /**
 149          * Return the various lists of recognized tags
 150          * @param string[] $extratags For any extra tags to include
 151          * @param string[] $removetags For any tags (default or extra) to exclude
 152          * @return array
 153          * @internal
 154          */
 155         public static function getRecognizedTagData( array $extratags = [], array $removetags = [] ): array {
 156                 static $commonCase, $staticInitialised = false;
 157                 $isCommonCase = ( $extratags === [] && $removetags === [] );
 158                 if ( $staticInitialised && $isCommonCase && $commonCase ) {
 159                         return $commonCase;
 160                 }
 161
 162                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 163                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
 164
 165                 if ( !$staticInitialised ) {
 166                         $htmlpairsStatic = [ # Tags that must be closed
 167                                 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 168                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 169                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 170                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 171                                 'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
 172                                 'kbd', 'samp', 'data', 'time', 'mark'
 173                         ];
 174                         # These tags can be self-closed. For tags not also on
 175                         # $htmlsingleonly, a self-closed tag will be emitted as
 176                         # an empty element (open-tag/close-tag pair).
 177                         $htmlsingle = [
 178                                 'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
 179                         ];
 180
 181                         # Elements that cannot have close tags. This is (not coincidentally)
 182                         # also the list of tags for which the HTML 5 parsing algorithm
 183                         # requires you to "acknowledge the token's self-closing flag", i.e.
 184                         # a self-closing tag like <br/> is not an HTML 5 parse error only
 185                         # for this list.
 186                         $htmlsingleonly = [
 187                                 'br', 'wbr', 'hr', 'meta', 'link'
 188                         ];
 189
 190                         $htmlnest = [ # Tags that can be nested--??
 191                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 192                                 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
 193                                 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
 194                         ];
 195                         $tabletags = [ # Can only appear inside table, we will close them
 196                                 'td', 'th', 'tr',
 197                         ];
 198                         $htmllist = [ # Tags used by list
 199                                 'ul', 'ol',
 200                         ];
 201                         $listtags = [ # Tags that can appear in a list
 202                                 'li',
 203                         ];
 204
 205                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 206                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 207
 208                         # Convert them all to hashtables for faster lookup
 209                         $vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 210                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
 211                         foreach ( $vars as $var ) {
 212                                 $$var = array_fill_keys( $$var, true );
 213                         }
 214                         $staticInitialised = true;
 215                 }
 216
 217                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
 218                 $extratags = array_fill_keys( $extratags, true );
 219                 $removetags = array_fill_keys( $removetags, true );
 220                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
 221                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
 222
 223                 $result = [
 224                         'htmlpairs' => $htmlpairs,
 225                         'htmlsingle' => $htmlsingle,
 226                         'htmlsingleonly' => $htmlsingleonly,
 227                         'htmlnest' => $htmlnest,
 228                         'tabletags' => $tabletags,
 229                         'htmllist' => $htmllist,
 230                         'listtags' => $listtags,
 231                         'htmlsingleallowed' => $htmlsingleallowed,
 232                         'htmlelements' => $htmlelements,
 233                 ];
 234                 if ( $isCommonCase ) {
 235                         $commonCase = $result;
 236                 }
 237                 return $result;
 238         }
 239
 240         /**
 241          * Cleans up HTML, removes dangerous tags and attributes, and
 242          * removes HTML comments; BEWARE there may be unmatched HTML
 243          * tags in the result.
 244          *
 245          * @note Callers are recommended to use `::removeSomeTags()` instead
 246          * of this method.  `Sanitizer::removeSomeTags()` is safer and will
 247          * always return well-formed HTML; however, it is significantly
 248          * slower (especially for short strings where setup costs
 249          * predominate).  This method is for internal use by the legacy parser
 250          * where we know the result will be cleaned up in a subsequent tidy pass.
 251          *
 252          * @param string $text Original string; see T268353 for why untainted.
 253          * @param-taint $text none
 254          * @param callable|null $processCallback Callback to do any variable or
 255          *   parameter replacements in HTML attribute values.
 256          *   This argument should be considered @internal.
 257          * @param-taint $processCallback exec_shell
 258          * @param array|bool $args Arguments for the processing callback
 259          * @param-taint $args none
 260          * @param array $extratags For any extra tags to include
 261          * @param-taint $extratags tainted
 262          * @param array $removetags For any tags (default or extra) to exclude
 263          * @param-taint $removetags none
 264          * @return string
 265          * @return-taint escaped
 266          * @internal
 267          */
 268         public static function internalRemoveHtmlTags( string $text, ?callable $processCallback = null,
 269                 $args = [], array $extratags = [], array $removetags = []
 270         ): string {
 271                 $tagData = self::getRecognizedTagData( $extratags, $removetags );
 272                 $htmlsingle = $tagData['htmlsingle'];
 273                 $htmlsingleonly = $tagData['htmlsingleonly'];
 274                 $htmlelements = $tagData['htmlelements'];
 275
 276                 # Remove HTML comments
 277                 $text = self::removeHTMLcomments( $text );
 278                 $bits = explode( '<', $text );
 279                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 280
 281                 # this might be possible using remex tidy itself
 282                 foreach ( $bits as $x ) {
 283                         if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
 284                                 [ /* $qbar */, $slash, $t, $params, $brace, $rest ] = $regs;
 285
 286                                 $badtag = false;
 287                                 $t = strtolower( $t );
 288                                 if ( isset( $htmlelements[$t] ) ) {
 289                                         if ( is_callable( $processCallback ) ) {
 290                                                 call_user_func_array( $processCallback, [ &$params, $args ] );
 291                                         }
 292
 293                                         if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
 294                                                 // Remove the self-closing slash, to be consistent
 295                                                 // with HTML5 semantics. T134423
 296                                                 $brace = '>';
 297                                         }
 298                                         if ( !self::validateTag( $params, $t ) ) {
 299                                                 $badtag = true;
 300                                         }
 301
 302                                         $newparams = self::fixTagAttributes( $params, $t );
 303                                         if ( !$badtag ) {
 304                                                 if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
 305                                                         # Interpret self-closing tags as empty tags even when
 306                                                         # HTML 5 would interpret them as start tags. Such input
 307                                                         # is commonly seen on Wikimedia wikis with this intention.
 308                                                         $brace = "></$t>";
 309                                                 }
 310
 311                                                 $rest = str_replace( '>', '&gt;', $rest );
 312                                                 $text .= "<$slash$t$newparams$brace$rest";
 313                                                 continue;
 314                                         }
 315                                 }
 316                         }
 317                         $text .= '&lt;' . str_replace( '>', '&gt;', $x );
 318                 }
 319                 return $text;
 320         }
 321
 322         /**
 323          * Cleans up HTML, removes dangerous tags and attributes, and
 324          * removes HTML comments; the result will always be balanced and
 325          * tidy HTML.
 326          * @param string $text Source string; see T268353 for why untainted
 327          * @param-taint  $text none
 328          * @param array $options Options controlling the cleanup:
 329          *    string[] $options['extraTags'] Any extra tags to allow
 330          *      (This property taints the whole array.)
 331          *    string[] $options['removeTags'] Any tags (default or extra) to exclude
 332          *    callable(Attributes,...):Attributes $options['attrCallback'] Callback
 333          *      to do any variable or parameter replacements in HTML attribute
 334          *      values before further cleanup; should be considered @internal
 335          *      and not for external use.
 336          *    array $options['attrCallbackArgs'] Additional arguments for the
 337          *      attribute callback
 338          * @param-taint $options tainted
 339          * @return string The cleaned up HTML
 340          * @return-taint escaped
 341          * @since 1.38
 342          */
 343         public static function removeSomeTags(
 344                 string $text, array $options = []
 345         ): string {
 346                 $extraTags = $options['extraTags'] ?? [];
 347                 $removeTags = $options['removeTags'] ?? [];
 348                 // These options are @internal:
 349                 $attrCallback = $options['attrCallback'] ?? null;
 350                 $attrCallbackArgs = $options['attrCallbackArgs'] ?? [];
 351
 352                 // This disallows HTML5-style "missing trailing semicolon" attributes
 353                 // In wikitext "clean&copy" does *not* contain an entity.
 354                 $text = self::normalizeCharReferences( $text );
 355
 356                 $tagData = self::getRecognizedTagData( $extraTags, $removeTags );
 357                 // Use RemexHtml to tokenize $text and remove the barred tags
 358                 $formatter = new RemexCompatFormatter;
 359                 $serializer = new RemexSerializer( $formatter );
 360                 $treeBuilder = new RemexTreeBuilder( $serializer, [
 361                         'ignoreErrors' => true,
 362                         'ignoreNulls' => true,
 363                 ] );
 364                 $dispatcher = new RemexDispatcher( $treeBuilder );
 365                 $tokenHandler = $dispatcher;
 366                 $remover = new RemexRemoveTagHandler(
 367                         $tokenHandler, $text, $tagData,
 368                         $attrCallback, $attrCallbackArgs
 369                 );
 370                 $tokenizer = new RemexTokenizer( $remover, $text, [
 371                         'ignoreErrors' => true,
 372                         // don't ignore char refs, we want them to be decoded
 373                         'ignoreNulls' => true,
 374                         'skipPreprocess' => true,
 375                 ] );
 376                 $tokenizer->execute( [
 377                         'fragmentNamespace' => HTMLData::NS_HTML,
 378                         'fragmentName' => 'body',
 379                 ] );
 380                 return $serializer->getResult();
 381         }
 382
 383         /**
 384          * Remove '<!--', '-->', and everything between.
 385          * To avoid leaving blank lines, when a comment is both preceded
 386          * and followed by a newline (ignoring spaces), trim leading and
 387          * trailing spaces and one of the newlines.
 388          */
 389         public static function removeHTMLcomments( string $text ): string {
 390                 // phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
 391                 while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
 392                         $end = strpos( $text, '-->', $start + 4 );
 393                         if ( $end === false ) {
 394                                 # Unterminated comment; bail out
 395                                 break;
 396                         }
 397
 398                         $end += 3;
 399
 400                         # Trim space and newline if the comment is both
 401                         # preceded and followed by a newline
 402                         $spaceStart = max( $start - 1, 0 );
 403                         $spaceLen = $end - $spaceStart;
 404                         while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
 405                                 $spaceStart--;
 406                                 $spaceLen++;
 407                         }
 408                         while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
 409                                 $spaceLen++;
 410                         }
 411                         if ( substr( $text, $spaceStart, 1 ) === "\n"
 412                                 && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
 413                                 # Remove the comment, leading and trailing
 414                                 # spaces, and leave only one newline.
 415                                 $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
 416                         } else {
 417                                 # Remove just the comment.
 418                                 $text = substr_replace( $text, '', $start, $end - $start );
 419                         }
 420                 }
 421                 return $text;
 422         }
 423
 424         /**
 425          * Takes attribute names and values for a tag and the tag name and
 426          * validates that the tag is allowed to be present.
 427          * This DOES NOT validate the attributes, nor does it validate the
 428          * tags themselves. This method only handles the special circumstances
 429          * where we may want to allow a tag within content but ONLY when it has
 430          * specific attributes set.
 431          *
 432          * @see RemexRemoveTagHandler::validateTag()
 433          */
 434         private static function validateTag( string $params, string $element ): bool {
 435                 $params = self::decodeTagAttributes( $params );
 436
 437                 if ( $element == 'meta' || $element == 'link' ) {
 438                         if ( !isset( $params['itemprop'] ) ) {
 439                                 // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
 440                                 return false;
 441                         }
 442                         if ( $element == 'meta' && !isset( $params['content'] ) ) {
 443                                 // <meta> must have a content="" for the itemprop
 444                                 return false;
 445                         }
 446                         if ( $element == 'link' && !isset( $params['href'] ) ) {
 447                                 // <link> must have an associated href=""
 448                                 return false;
 449                         }
 450                 }
 451
 452                 return true;
 453         }
 454
 455         /**
 456          * Take an array of attribute names and values and normalize or discard
 457          * illegal values for the given element type.
 458          *
 459          * - Discards attributes not allowed for the given element
 460          * - Unsafe style attributes are discarded
 461          * - Invalid id attributes are re-encoded
 462          *
 463          * @todo Check for legal values where the DTD limits things.
 464          * @todo Check for unique id attribute :P
 465          */
 466         public static function validateTagAttributes( array $attribs, string $element ): array {
 467                 return self::validateAttributes( $attribs,
 468                         self::attributesAllowedInternal( $element ) );
 469         }
 470
 471         /**
 472          * Take an array of attribute names and values and normalize or discard
 473          * illegal values.
 474          *
 475          * - Discards attributes not on the given list
 476          * - Unsafe style attributes are discarded
 477          * - Invalid id attributes are re-encoded
 478          *
 479          * @param array $attribs
 480          * @param array $allowed List of allowed attribute names,
 481          *   as an associative array where keys give valid attribute names
 482          *   (since 1.34).  Before 1.35, passing a sequential array of
 483          *   valid attribute names was permitted but that is now deprecated.
 484          * @return array
 485          *
 486          * @todo Check for legal values where the DTD limits things.
 487          * @todo Check for unique id attribute :P
 488          */
 489         public static function validateAttributes( array $attribs, array $allowed ): array {
 490                 if ( isset( $allowed[0] ) ) {
 491                         // Calling this function with a sequential array is
 492                         // deprecated.  For now just convert it.
 493                         wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
 494                         $allowed = array_fill_keys( $allowed, true );
 495                 }
 496                 $validProtocols = MediaWikiServices::getInstance()->getUrlUtils()->validProtocols();
 497                 $hrefExp = '/^(' . $validProtocols . ')[^\s]+$/';
 498
 499                 $out = [];
 500                 foreach ( $attribs as $attribute => $value ) {
 501                         # Allow XML namespace declaration to allow RDFa
 502                         if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
 503                                 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 504                                         $out[$attribute] = $value;
 505                                 }
 506
 507                                 continue;
 508                         }
 509
 510                         # Allow any attribute beginning with "data-"
 511                         # However:
 512                         # * Disallow data attributes used by MediaWiki code
 513                         # * Ensure that the attribute is not namespaced by banning
 514                         #   colons.
 515                         if ( (
 516                                 !preg_match( '/^data-[^:]*$/i', $attribute ) &&
 517                                 !array_key_exists( $attribute, $allowed )
 518                         ) || self::isReservedDataAttribute( $attribute ) ) {
 519                                 continue;
 520                         }
 521
 522                         # Strip javascript "expression" from stylesheets.
 523                         # https://msdn.microsoft.com/en-us/library/ms537634.aspx
 524                         if ( $attribute == 'style' ) {
 525                                 $value = self::checkCss( $value );
 526                         }
 527
 528                         # Escape HTML id attributes
 529                         if ( $attribute === 'id' ) {
 530                                 $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
 531                                 if ( $value === false || $value === '' ) {
 532                                         continue;
 533                                 }
 534                         }
 535
 536                         # Escape HTML id reference lists
 537                         if ( $attribute === 'aria-describedby'
 538                                 || $attribute === 'aria-flowto'
 539                                 || $attribute === 'aria-labelledby'
 540                                 || $attribute === 'aria-owns'
 541                         ) {
 542                                 $value = self::escapeIdReferenceListInternal( $value );
 543                         }
 544
 545                         // RDFa and microdata properties allow URLs, URIs and/or CURIs.
 546                         if ( $attribute === 'rel' || $attribute === 'rev'
 547                                 # RDFa
 548                                 || $attribute === 'about' || $attribute === 'property'
 549                                 || $attribute === 'resource' || $attribute === 'datatype'
 550                                 || $attribute === 'typeof'
 551                                 # HTML5 microdata
 552                                 || $attribute === 'itemid' || $attribute === 'itemprop'
 553                                 || $attribute === 'itemref' || $attribute === 'itemscope'
 554                                 || $attribute === 'itemtype'
 555                         ) {
 556                                 // Paranoia. Allow "simple" values but suppress javascript
 557                                 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 558                                         continue;
 559                                 }
 560                         }
 561
 562                         # NOTE: even though elements using href/src are not allowed directly, supply
 563                         #       validation code that can be used by tag hook handlers, etc
 564                         if ( $attribute === 'href' || $attribute === 'src' || $attribute === 'poster' ) {
 565                                 if ( !preg_match( $hrefExp, $value ) ) {
 566                                         continue; // drop any href or src attributes not using an allowed protocol.
 567                                         // NOTE: this also drops all relative URLs
 568                                 }
 569                         }
 570
 571                         if ( $attribute === 'tabindex' && $value !== '0' ) {
 572                                 // Only allow tabindex of 0, which is useful for accessibility.
 573                                 continue;
 574                         }
 575
 576                         // If this attribute was previously set, override it.
 577                         // Output should only have one attribute of each name.
 578                         $out[$attribute] = $value;
 579                 }
 580
 581                 # itemtype, itemid, itemref don't make sense without itemscope
 582                 if ( !array_key_exists( 'itemscope', $out ) ) {
 583                         unset( $out['itemtype'] );
 584                         unset( $out['itemid'] );
 585                         unset( $out['itemref'] );
 586                 }
 587                 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
 588
 589                 return $out;
 590         }
 591
 592         /**
 593          * Given an attribute name, checks whether it is a reserved data attribute
 594          * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
 595          * core and extension code can safely use it to communicate with frontend code.
 596          * @param string $attr Attribute name.
 597          * @return bool
 598          */
 599         public static function isReservedDataAttribute( string $attr ): bool {
 600                 // data-ooui is reserved for ooui.
 601                 // data-mw and data-parsoid are reserved for parsoid.
 602                 // data-mw-<name here> is reserved for extensions (or core) if
 603                 // they need to communicate some data to the client and want to be
 604                 // sure that it isn't coming from an untrusted user.
 605                 // We ignore the possibility of namespaces since user-generated HTML
 606                 // can't use them anymore.
 607                 return (bool)preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
 608         }
 609
 610         /**
 611          * Merge two sets of HTML attributes.  Conflicting items in the second set
 612          * will override those in the first, except for 'class' attributes which
 613          * will be combined (if they're both strings).
 614          *
 615          * @todo implement merging for other attributes such as style
 616          */
 617         public static function mergeAttributes( array $a, array $b ): array {
 618                 $out = array_merge( $a, $b );
 619                 if ( isset( $a['class'] ) && isset( $b['class'] )
 620                         && is_string( $a['class'] ) && is_string( $b['class'] )
 621                         && $a['class'] !== $b['class']
 622                 ) {
 623                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 624                                 -1, PREG_SPLIT_NO_EMPTY );
 625                         $out['class'] = implode( ' ', array_unique( $classes ) );
 626                 }
 627                 return $out;
 628         }
 629
 630         /**
 631          * Normalize CSS into a format we can easily search for hostile input
 632          *  - decode character references
 633          *  - decode escape sequences
 634          *  - remove comments, unless the entire value is one single comment
 635          * @param string $value the css string
 636          * @return string normalized css
 637          */
 638         public static function normalizeCss( string $value ): string {
 639                 // Decode character references like &#123;
 640                 $value = self::decodeCharReferences( $value );
 641
 642                 // Decode escape sequences and line continuation
 643                 // See the grammar in the CSS 2 spec, appendix D.
 644                 // This has to be done AFTER decoding character references.
 645                 // This means it isn't possible for this function to return
 646                 // unsanitized escape sequences. It is possible to manufacture
 647                 // input that contains character references that decode to
 648                 // escape sequences that decode to character references, but
 649                 // it's OK for the return value to contain character references
 650                 // because the caller is supposed to escape those anyway.
 651                 static $decodeRegex;
 652                 if ( !$decodeRegex ) {
 653                         $space = '[\\x20\\t\\r\\n\\f]';
 654                         $nl = '(?:\\n|\\r\\n|\\r|\\f)';
 655                         $backslash = '\\\\';
 656                         $decodeRegex = "/ $backslash
 657                                 (?:
 658                                         ($nl) |  # 1. Line continuation
 659                                         ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
 660                                         (.) | # 3. backslash cancelling special meaning
 661                                         () | # 4. backslash at end of string
 662                                 )/xu";
 663                 }
 664                 $value = preg_replace_callback( $decodeRegex,
 665                         [ __CLASS__, 'cssDecodeCallback' ], $value );
 666
 667                 // Let the value through if it's nothing but a single comment, to
 668                 // allow other functions which may reject it to pass some error
 669                 // message through.
 670                 if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
 671                         // Remove any comments; IE gets token splitting wrong
 672                         // This must be done AFTER decoding character references and
 673                         // escape sequences, because those steps can introduce comments
 674                         // This step cannot introduce character references or escape
 675                         // sequences, because it replaces comments with spaces rather
 676                         // than removing them completely.
 677                         $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
 678
 679                         // Remove anything after a comment-start token, to guard against
 680                         // incorrect client implementations.
 681                         $commentPos = strpos( $value, '/*' );
 682                         if ( $commentPos !== false ) {
 683                                 $value = substr( $value, 0, $commentPos );
 684                         }
 685                 }
 686
 687                 return $value;
 688         }
 689
 690         /**
 691          * Pick apart some CSS and check it for forbidden or unsafe structures.
 692          * Returns a sanitized string. This sanitized string will have
 693          * character references and escape sequences decoded and comments
 694          * stripped (unless it is itself one valid comment, in which case the value
 695          * will be passed through). If the input is just too evil, only a comment
 696          * complaining about evilness will be returned.
 697          *
 698          * Currently URL references, 'expression', 'tps' are forbidden.
 699          *
 700          * NOTE: Despite the fact that character references are decoded, the
 701          * returned string may contain character references given certain
 702          * clever input strings. These character references must
 703          * be escaped before the return value is embedded in HTML.
 704          *
 705          * @warning This method is intended to sanitize style attributes on
 706          *  html tags only. It is not safe to use on full CSS files.
 707          * @param string $value
 708          * @return string
 709          */
 710         public static function checkCss( $value ) {
 711                 $value = self::normalizeCss( $value );
 712
 713                 // Reject problematic keywords and control characters
 714                 if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
 715                         strpos( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
 716                         return '/* invalid control char */';
 717                 } elseif ( preg_match(
 718                         '! expression
 719                                 | accelerator\s*:
 720                                 | -o-link\s*:
 721                                 | -o-link-source\s*:
 722                                 | -o-replace\s*:
 723                                 | url\s*\(
 724                                 | src\s*\(
 725                                 | image\s*\(
 726                                 | image-set\s*\(
 727                                 | attr\s*\([^)]+[\s,]+url
 728                         !ix', $value ) ) {
 729                         return '/* insecure input */';
 730                 }
 731                 return $value;
 732         }
 733
 734         private static function cssDecodeCallback( array $matches ): string {
 735                 if ( $matches[1] !== '' ) {
 736                         // Line continuation
 737                         return '';
 738                 } elseif ( $matches[2] !== '' ) {
 739                         # hexdec could return a float if the match is too long, but the
 740                         # regexp in question limits the string length to 6.
 741                         $char = \UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
 742                 } elseif ( $matches[3] !== '' ) {
 743                         $char = $matches[3];
 744                 } else {
 745                         $char = '\\';
 746                 }
 747                 if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
 748                         // These characters need to be escaped in strings
 749                         // Clean up the escape sequence to avoid parsing errors by clients
 750                         return '\\' . dechex( ord( $char ) ) . ' ';
 751                 } else {
 752                         // Decode unnecessary escape
 753                         return $char;
 754                 }
 755         }
 756
 757         /**
 758          * Take a tag soup fragment listing an HTML element's attributes
 759          * and normalize it to well-formed XML, discarding unwanted attributes.
 760          * Output is safe for further wikitext processing, with escaping of
 761          * values that could trigger problems.
 762          *
 763          * - Normalizes attribute names to lowercase
 764          * - Discards attributes not allowed for the given element
 765          * - Turns broken or invalid entities into plaintext
 766          * - Double-quotes all attribute values
 767          * - Attributes without values are given the name as attribute
 768          * - Double attributes are discarded
 769          * - Unsafe style attributes are discarded
 770          * - Prepends space if there are attributes.
 771          * - (Optionally) Sorts attributes by name.
 772          *
 773          * @param string $text
 774          * @param string $element
 775          * @param bool $sorted Whether to sort the attributes (default: false)
 776          * @return string
 777          */
 778         public static function fixTagAttributes( string $text, string $element, bool $sorted = false ): string {
 779                 if ( trim( $text ) == '' ) {
 780                         return '';
 781                 }
 782
 783                 $decoded = self::decodeTagAttributes( $text );
 784                 $stripped = self::validateTagAttributes( $decoded, $element );
 785
 786                 if ( $sorted ) {
 787                         ksort( $stripped );
 788                 }
 789
 790                 return self::safeEncodeTagAttributes( $stripped );
 791         }
 792
 793         /**
 794          * Encode an attribute value for HTML output.
 795          * @param string $text
 796          * @param-taint $text escapes_html
 797          * @return string HTML-encoded text fragment
 798          * @return-taint escaped
 799          */
 800         public static function encodeAttribute( string $text ): string {
 801                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
 802
 803                 // Whitespace is normalized during attribute decoding,
 804                 // so if we've been passed non-spaces we must encode them
 805                 // ahead of time or they won't be preserved.
 806                 $encValue = strtr( $encValue, [
 807                         "\n" => '&#10;',
 808                         "\r" => '&#13;',
 809                         "\t" => '&#9;',
 810                 ] );
 811
 812                 return $encValue;
 813         }
 814
 815         /**
 816          * Armor French spaces with a replacement character
 817          *
 818          * @since 1.32
 819          * @param string $text Text to armor
 820          * @param string $space Space character for the French spaces, defaults to '&#160;'
 821          * @return string Armored text
 822          */
 823         public static function armorFrenchSpaces( string $text, string $space = '&#160;' ): string {
 824                 // Replace $ with \$ and \ with \\
 825                 $space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
 826                 $fixtags = [
 827                         # French spaces, last one Guillemet-left
 828                         # only if it isn't followed by a word character.
 829                         '/ (?=[?:;!%»›](?!\w))/u' => "$space",
 830                         # French spaces, Guillemet-right
 831                         '/([«‹]) /u' => "\\1$space",
 832                 ];
 833                 return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
 834         }
 835
 836         /**
 837          * Encode an attribute value for HTML tags, with extra armoring
 838          * against further wiki processing.
 839          * @param string $text
 840          * @param-taint $text escapes_html
 841          * @return string HTML-encoded text fragment
 842          * @return-taint escaped
 843          */
 844         public static function safeEncodeAttribute( string $text ): string {
 845                 $encValue = self::encodeAttribute( $text );
 846
 847                 # Templates and links may be expanded in later parsing,
 848                 # creating invalid or dangerous output. Suppress this.
 849                 $encValue = strtr( $encValue, [
 850                         // '<', '>', and '"' should never happen, as they indicate that we've received invalid input which should
 851                         // have been escaped.
 852                         '<'    => '&lt;',
 853                         '>'    => '&gt;',
 854                         '"'    => '&quot;',
 855                         '{'    => '&#123;',
 856                         '}'    => '&#125;', // prevent unpaired language conversion syntax
 857                         '['    => '&#91;',
 858                         ']'    => '&#93;',
 859                         "''"   => '&#39;&#39;',
 860                         'ISBN' => '&#73;SBN',
 861                         'RFC'  => '&#82;FC',
 862                         'PMID' => '&#80;MID',
 863                         '|'    => '&#124;',
 864                         '__'   => '&#95;_',
 865                 ] );
 866
 867                 # Stupid hack
 868                 $validProtocols = MediaWikiServices::getInstance()->getUrlUtils()->validProtocols();
 869                 $encValue = preg_replace_callback(
 870                         '/((?i)' . $validProtocols . ')/',
 871                         static function ( $matches ) {
 872                                 return str_replace( ':', '&#58;', $matches[1] );
 873                         },
 874                         $encValue );
 875                 return $encValue;
 876         }
 877
 878         /**
 879          * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 880          * a valid HTML id attribute.
 881          *
 882          * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 883          * proper escaping.
 884          *
 885          * @param string $id String to escape
 886          * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 887          *     should be used.
 888          * @return string|false Escaped ID or false if fallback encoding is requested but it's not
 889          *     configured.
 890          *
 891          * @since 1.30
 892          */
 893         public static function escapeIdForAttribute( string $id, int $mode = self::ID_PRIMARY ) {
 894                 global $wgFragmentMode;
 895
 896                 if ( !isset( $wgFragmentMode[$mode] ) ) {
 897                         if ( $mode === self::ID_PRIMARY ) {
 898                                 throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
 899                         }
 900                         return false;
 901                 }
 902
 903                 $internalMode = $wgFragmentMode[$mode];
 904
 905                 return self::escapeIdInternal( $id, $internalMode );
 906         }
 907
 908         /**
 909          * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 910          * a valid URL fragment.
 911          *
 912          * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 913          * proper escaping.
 914          *
 915          * @param string $id String to escape
 916          * @return string Escaped ID
 917          *
 918          * @since 1.30
 919          */
 920         public static function escapeIdForLink( string $id ): string {
 921                 global $wgFragmentMode;
 922
 923                 if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
 924                         throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
 925                 }
 926
 927                 $mode = $wgFragmentMode[self::ID_PRIMARY];
 928
 929                 $id = self::escapeIdInternalUrl( $id, $mode );
 930
 931                 return $id;
 932         }
 933
 934         /**
 935          * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 936          * a valid URL fragment for external interwikis.
 937          *
 938          * @param string $id String to escape
 939          * @return string Escaped ID
 940          *
 941          * @since 1.30
 942          */
 943         public static function escapeIdForExternalInterwiki( string $id ): string {
 944                 global $wgExternalInterwikiFragmentMode;
 945
 946                 $id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );
 947
 948                 return $id;
 949         }
 950
 951         /**
 952          * Do percent encoding of percent signs for href (but not id) attributes
 953          *
 954          * @since 1.35
 955          * @see https://phabricator.wikimedia.org/T238385
 956          * @param string $id String to escape
 957          * @param string $mode One of modes from $wgFragmentMode
 958          * @return string
 959          */
 960         private static function escapeIdInternalUrl( string $id, string $mode ): string {
 961                 $id = self::escapeIdInternal( $id, $mode );
 962                 if ( $mode === 'html5' ) {
 963                         $id = preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $id );
 964                 }
 965                 return $id;
 966         }
 967
 968         /**
 969          * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
 970          *
 971          * @param string $id String to escape
 972          * @param string $mode One of modes from $wgFragmentMode
 973          * @return string
 974          */
 975         private static function escapeIdInternal( string $id, string $mode ): string {
 976                 // Truncate overly-long IDs.  This isn't an HTML limit, it's just
 977                 // griefer protection. [T251506]
 978                 $id = mb_substr( $id, 0, 1024 );
 979
 980                 switch ( $mode ) {
 981                         case 'html5':
 982                                 // html5 spec says ids must not have any of the following:
 983                                 // U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
 984                                 // In practice, in wikitext, only tab, LF, CR (and SPACE) are
 985                                 // possible using either Lua or html entities.
 986                                 $id = str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
 987                                 break;
 988                         case 'legacy':
 989                                 // This corresponds to 'noninitial' mode of the former escapeId()
 990                                 static $replace = [
 991                                         '%3A' => ':',
 992                                         '%' => '.'
 993                                 ];
 994
 995                                 $id = urlencode( str_replace( ' ', '_', $id ) );
 996                                 $id = strtr( $id, $replace );
 997                                 break;
 998                         default:
 999                                 throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
1000                 }
1001
1002                 return $id;
1003         }
1004
1005         /**
1006          * Given a string containing a space delimited list of ids, escape each id
1007          * to match ids escaped by the escapeIdForAttribute() function.
1008          *
1009          * @param string $referenceString Space delimited list of ids
1010          * @return string
1011          */
1012         private static function escapeIdReferenceListInternal( string $referenceString ): string {
1013                 # Explode the space delimited list string into an array of tokens
1014                 $references = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1015
1016                 # Escape each token as an id
1017                 foreach ( $references as &$ref ) {
1018                         $ref = self::escapeIdForAttribute( $ref );
1019                 }
1020
1021                 # Merge the array back to a space delimited list string
1022                 # If the array is empty, the result will be an empty string ('')
1023                 $referenceString = implode( ' ', $references );
1024
1025                 return $referenceString;
1026         }
1027
1028         /**
1029          * Given a value, escape it so that it can be used as a CSS class and
1030          * return it.
1031          *
1032          * @todo For extra validity, input should be validated UTF-8.
1033          *
1034          * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
1035          */
1036         public static function escapeClass( string $class ): string {
1037                 // Convert ugly stuff to underscores and kill underscores in ugly places
1038                 return rtrim( preg_replace(
1039                         [ '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ],
1040                         '_',
1041                         $class ), '_' );
1042         }
1043
1044         /**
1045          * Given HTML input, escape with htmlspecialchars but un-escape entities.
1046          * This allows (generally harmless) entities like &#160; to survive.
1047          *
1048          * @param string $html HTML to escape
1049          * @param-taint $html escapes_htmlnoent
1050          * @return string Escaped input
1051          * @return-taint escaped
1052          */
1053         public static function escapeHtmlAllowEntities( string $html ): string {
1054                 $html = self::decodeCharReferences( $html );
1055                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
1056                 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1057                 # don't cause the entire string to disappear.
1058                 $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
1059                 return $html;
1060         }
1061
1062         /**
1063          * Return an associative array of attribute names and values from
1064          * a partial tag string. Attribute names are forced to lowercase,
1065          * character references are decoded to UTF-8 text.
1066          */
1067         public static function decodeTagAttributes( string $text ): array {
1068                 if ( trim( $text ) == '' ) {
1069                         return [];
1070                 }
1071
1072                 $pairs = [];
1073                 if ( !preg_match_all(
1074                         self::getAttribsRegex(),
1075                         $text,
1076                         $pairs,
1077                         PREG_SET_ORDER ) ) {
1078                         return [];
1079                 }
1080
1081                 $attribs = [];
1082                 foreach ( $pairs as $set ) {
1083                         $attribute = strtolower( $set[1] );
1084
1085                         // Filter attribute names with unacceptable characters
1086                         if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
1087                                 continue;
1088                         }
1089
1090                         $value = self::getTagAttributeCallback( $set );
1091
1092                         // Normalize whitespace
1093                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
1094                         $value = trim( $value );
1095
1096                         // Decode character references
1097                         $attribs[$attribute] = self::decodeCharReferences( $value );
1098                 }
1099                 return $attribs;
1100         }
1101
1102         /**
1103          * Build a partial tag string from an associative array of attribute
1104          * names and values as returned by decodeTagAttributes.
1105          */
1106         public static function safeEncodeTagAttributes( array $assoc_array ): string {
1107                 $attribs = [];
1108                 foreach ( $assoc_array as $attribute => $value ) {
1109                         $encAttribute = htmlspecialchars( $attribute, ENT_COMPAT );
1110                         $encValue = self::safeEncodeAttribute( $value );
1111
1112                         $attribs[] = "$encAttribute=\"$encValue\"";
1113                 }
1114                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
1115         }
1116
1117         /**
1118          * Pick the appropriate attribute value from a match set from the
1119          * attribs regex matches.
1120          */
1121         private static function getTagAttributeCallback( array $set ): string {
1122                 if ( isset( $set[5] ) ) {
1123                         # No quotes.
1124                         return $set[5];
1125                 } elseif ( isset( $set[4] ) ) {
1126                         # Single-quoted
1127                         return $set[4];
1128                 } elseif ( isset( $set[3] ) ) {
1129                         # Double-quoted
1130                         return $set[3];
1131                 } elseif ( !isset( $set[2] ) ) {
1132                         # In XHTML, attributes must have a value so return an empty string.
1133                         # See "Empty attribute syntax",
1134                         # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
1135                         return "";
1136                 } else {
1137                         throw new LogicException( "Tag conditions not met. This should never happen and is a bug." );
1138                 }
1139         }
1140
1141         private static function normalizeWhitespace( string $text ): string {
1142                 return trim( preg_replace(
1143                         '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
1144                         ' ',
1145                         $text ) );
1146         }
1147
1148         /**
1149          * Normalizes whitespace in a section name, such as might be returned
1150          * by Parser::stripSectionName(), for use in the id's that are used for
1151          * section links.
1152          */
1153         public static function normalizeSectionNameWhitespace( string $section ): string {
1154                 return trim( preg_replace( '/[ _]+/', ' ', $section ) );
1155         }
1156
1157         /**
1158          * Ensure that any entities and character references are legal
1159          * for XML and XHTML specifically. Any stray bits will be
1160          * &amp;-escaped to result in a valid text fragment.
1161          *
1162          * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
1163          *   numericized (this way we're well-formed even without a DTD)
1164          * b. any numeric char refs must be legal chars, not invalid or forbidden
1165          * c. use lower cased "&#x", not "&#X"
1166          * d. fix or reject non-valid attributes
1167          *
1168          * @internal
1169          */
1170         public static function normalizeCharReferences( string $text ): string {
1171                 return preg_replace_callback(
1172                         self::CHAR_REFS_REGEX,
1173                         [ self::class, 'normalizeCharReferencesCallback' ],
1174                         $text, -1, $count, PREG_UNMATCHED_AS_NULL
1175                 );
1176         }
1177
1178         private static function normalizeCharReferencesCallback( array $matches ): string {
1179                 $ret = null;
1180                 if ( isset( $matches[1] ) ) {
1181                         $ret = self::normalizeEntity( $matches[1] );
1182                 } elseif ( isset( $matches[2] ) ) {
1183                         $ret = self::decCharReference( $matches[2] );
1184                 } elseif ( isset( $matches[3] ) ) {
1185                         $ret = self::hexCharReference( $matches[3] );
1186                 }
1187                 if ( $ret === null ) {
1188                         return htmlspecialchars( $matches[0], ENT_COMPAT );
1189                 } else {
1190                         return $ret;
1191                 }
1192         }
1193
1194         /**
1195          * If the named entity is defined in HTML5
1196          * return the equivalent numeric entity reference (except for the core &lt;
1197          * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
1198          * the HTML equivalent. Otherwise, returns HTML-escaped text of
1199          * pseudo-entity source (eg &amp;foo;)
1200          *
1201          * @param string $name Semicolon-terminated name
1202          * @return string
1203          */
1204         private static function normalizeEntity( string $name ): string {
1205                 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1206                         // Non-standard MediaWiki-specific entities
1207                         return '&' . self::MW_ENTITY_ALIASES[$name];
1208                 } elseif ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
1209                         // Keep these in word form
1210                         return "&$name";
1211                 } elseif ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
1212                         // Beware: some entities expand to more than 1 codepoint
1213                         return preg_replace_callback( '/./Ssu', static function ( $m ) {
1214                                 return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
1215                         }, HTMLData::$namedEntityTranslations[$name] );
1216                 } else {
1217                         return "&amp;$name";
1218                 }
1219         }
1220
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint Digits of the reference
 * @return string|null The reference in "&#N;" form, or null when the
 *   codepoint does not pass validateCodepoint()
 */
private static function decCharReference( string $codepoint ): ?string {
        // An over-long digit string (safely) saturates at the maximum
        // signed integer value when converted.
        $point = (int)$codepoint;

        return self::validateCodepoint( $point ) ? "&#$point;" : null;
}
1231
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return string|null The reference in lower-case "&#xN;" form, or null
 *   when the codepoint does not pass validateCodepoint()
 */
private static function hexCharReference( string $codepoint ): ?string {
        $point = hexdec( $codepoint );

        // hexdec() might return a float if the string is too long; such a
        // value is rejected outright.
        if ( !is_int( $point ) || !self::validateCodepoint( $point ) ) {
                return null;
        }

        return sprintf( '&#x%x;', $point );
}
1241
/**
 * Tell whether a Unicode codepoint is a valid character in both HTML5
 * and XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( int $codepoint ): bool {
        // Tab and line feed are the only accepted characters below U+0020.
        // U+000C is valid in HTML5 but not allowed in XML.
        // U+000D is valid in XML but not allowed in HTML5.
        if ( $codepoint === 0x09 || $codepoint === 0x0a ) {
                return true;
        }

        // Inclusive ranges of accepted codepoints. The gaps between them
        // exclude U+007F - U+009F (control characters, disallowed in HTML5),
        // U+D800 - U+DFFF, and U+FFFE - U+FFFF.
        $acceptedRanges = [
                [ 0x20, 0x7e ],
                [ 0xa0, 0xd7ff ],
                [ 0xe000, 0xfffd ],
                [ 0x10000, 0x10ffff ],
        ];

        foreach ( $acceptedRanges as [ $first, $last ] ) {
                if ( $codepoint >= $first && $codepoint <= $last ) {
                        return true;
                }
        }

        return false;
}
1257
/**
 * Replace every character reference in the text (numeric or named
 * entity) with the character it denotes and return a UTF-8 string.
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( string $text ): string {
        // The replacement count is not needed here; the variable only fills
        // the positional slot in front of the flags argument.
        $replacements = 0;

        return preg_replace_callback(
                self::CHAR_REFS_REGEX,
                static fn ( array $match ): string => self::decodeCharReferencesCallback( $match ),
                $text,
                -1,
                $replacements,
                PREG_UNMATCHED_AS_NULL
        );
}
1269
/**
 * Decode any character references, numeric or named entities,
 * in the text and normalize the resulting string. (T16952)
 *
 * This is useful for page titles, not for text to be displayed,
 * MediaWiki allows HTML entities to escape normalization as a feature.
 *
 * @param string $text Already normalized, containing entities
 * @return string Still normalized, without entities
 */
public static function decodeCharReferencesAndNormalize( string $text ): string {
        $decoded = preg_replace_callback(
                self::CHAR_REFS_REGEX,
                static fn ( array $match ): string => self::decodeCharReferencesCallback( $match ),
                $text,
                -1,
                $replaced,
                PREG_UNMATCHED_AS_NULL
        );

        // Nothing matched the pattern, so the normalization pass is skipped.
        if ( !$replaced ) {
                return $decoded;
        }

        return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1293
/**
 * preg_replace_callback() handler used with CHAR_REFS_REGEX by the
 * decodeCharReferences*() methods.
 *
 * Capture group 1 is handed to decodeEntity(), group 2 is read as a
 * decimal codepoint and group 3 as a hexadecimal codepoint.
 *
 * @param array $matches
 * @return string
 */
private static function decodeCharReferencesCallback( array $matches ): string {
        $named = $matches[1] ?? null;
        if ( $named !== null ) {
                return self::decodeEntity( $named );
        }

        $decimal = $matches[2] ?? null;
        if ( $decimal !== null ) {
                return self::decodeChar( (int)$decimal );
        }

        $hex = $matches[3] ?? null;
        if ( $hex === null ) {
                // Last case should be an ampersand by itself
                return $matches[0];
        }

        // hexdec() might return a float if the string is too long; that
        // makes it an invalid character reference.
        $point = hexdec( $hex );

        return is_int( $point )
                ? self::decodeChar( $point )
                : \UtfNormal\Constants::UTF8_REPLACEMENT;
}
1311
/**
 * Convert a codepoint to its UTF-8 string when it is a valid
 * character reference; otherwise give back U+FFFD REPLACEMENT CHARACTER.
 * @internal
 *
 * @param int $codepoint
 * @return string
 */
private static function decodeChar( int $codepoint ): string {
        return self::validateCodepoint( $codepoint )
                ? \UtfNormal\Utils::codepointToUtf8( $codepoint )
                : \UtfNormal\Constants::UTF8_REPLACEMENT;
}
1324
/**
 * Decode a named entity: when it is defined in HTML5 the result is the
 * UTF-8 encoding of that character, otherwise the pseudo-entity source
 * (eg "&foo;").
 *
 * @param string $name Semicolon-terminated entity name
 * @return string
 */
private static function decodeEntity( string $name ): string {
        // MediaWiki-specific entities, which are not in the HTML standard,
        // are first swapped for the name they alias.
        $name = self::MW_ENTITY_ALIASES[$name] ?? $name;

        return HTMLData::$namedEntityTranslations[$name] ?? "&$name";
}
1341
/**
 * Look up the acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array An associative array where keys are acceptable attribute
 *   names; empty when the element has no entry in the table
 */
private static function attributesAllowedInternal( string $element ): array {
        return self::setupAttributesAllowedInternal()[$element] ?? [];
}
1353
/**
 * Build the table that maps each allowed HTML element to its allowed
 * attributes.
 *
 * The table is kept in a static variable, so it is only constructed on
 * the first call.
 *
 * @return array An associative array: keys are HTML element names;
 *   values are associative arrays where the keys are allowed attribute
 *   names.
 */
private static function setupAttributesAllowedInternal(): array {
        static $allowed = null;

        if ( is_array( $allowed ) ) {
                return $allowed;
        }

        // For lookup efficiency every attribute set is keyed by attribute
        // name (with true as the value). This helper adds one or more lists
        // of names to an existing set.
        $extend = static function ( array $base, array ...$nameLists ): array {
                foreach ( $nameLists as $names ) {
                        foreach ( $names as $attribute ) {
                                $base[$attribute] = true;
                        }
                }
                return $base;
        };

        $common = $extend( [], [
                // HTML
                'id',
                'class',
                'style',
                'lang',
                'dir',
                'title',
                'tabindex',

                // WAI-ARIA
                'aria-describedby',
                'aria-flowto',
                'aria-hidden',
                'aria-label',
                'aria-labelledby',
                'aria-level',
                'aria-owns',
                'role',

                // RDFa
                // These attributes are specified in section 9 of
                // https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
                'about',
                'property',
                'resource',
                'datatype',
                'typeof',

                // Microdata. These are specified by
                // https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
                'itemid',
                'itemprop',
                'itemref',
                'itemscope',
                'itemtype',
        ] );

        // Attribute sets shared by several elements
        $block = $extend( $common, [ 'align' ] );
        $cited = $extend( $common, [ 'cite' ] );
        $edited = $extend( $common, [ 'cite', 'datetime' ] );
        $spanned = $extend( $common, [ 'span' ] );

        $tablealign = [ 'align', 'valign' ];
        $tablecell = [
                'abbr',
                'axis',
                'headers',
                'scope',
                'rowspan',
                'colspan',
                'nowrap', // deprecated
                'width', // deprecated
                'height', // deprecated
                'bgcolor', // deprecated
        ];
        $cell = $extend( $common, $tablecell, $tablealign );

        // Numbers refer to sections in HTML 4.01 standard describing the element.
        // See: https://www.w3.org/TR/html4/
        $allowed = [
                // 7.5.4
                'div' => $block,
                'center' => $common, // deprecated
                'span' => $common,

                // 7.5.5
                'h1' => $block,
                'h2' => $block,
                'h3' => $block,
                'h4' => $block,
                'h5' => $block,
                'h6' => $block,

                // 7.5.6 (address) is not included

                // 8.2.4
                'bdo' => $common,

                // 9.2.1 (acronym is not included)
                'em' => $common,
                'strong' => $common,
                'cite' => $common,
                'dfn' => $common,
                'code' => $common,
                'samp' => $common,
                'kbd' => $common,
                'var' => $common,
                'abbr' => $common,

                // 9.2.2
                'blockquote' => $cited,
                'q' => $cited,

                // 9.2.3
                'sub' => $common,
                'sup' => $common,

                // 9.3.1
                'p' => $block,

                // 9.3.2
                'br' => $extend( $common, [ 'clear' ] ),

                // https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
                'wbr' => $common,

                // 9.3.4
                'pre' => $extend( $common, [ 'width' ] ),

                // 9.4
                'ins' => $edited,
                'del' => $edited,

                // 10.2
                'ul' => $extend( $common, [ 'type' ] ),
                'ol' => $extend( $common, [ 'type', 'start', 'reversed' ] ),
                'li' => $extend( $common, [ 'type', 'value' ] ),

                // 10.3
                'dl' => $common,
                'dd' => $common,
                'dt' => $common,

                // 11.2.1
                'table' => $extend( $common, [
                        'summary',
                        'width',
                        'border',
                        'frame',
                        'rules',
                        'cellspacing',
                        'cellpadding',
                        'align',
                        'bgcolor',
                ] ),

                // 11.2.2
                'caption' => $block,

                // 11.2.3
                'thead' => $common,
                'tfoot' => $common,
                'tbody' => $common,

                // 11.2.4
                'colgroup' => $spanned,
                'col' => $spanned,

                // 11.2.5
                'tr' => $extend( $common, [ 'bgcolor' ], $tablealign ),

                // 11.2.6
                'td' => $cell,
                'th' => $cell,

                // 12.2
                // NOTE: <a> is not allowed directly, but this list of allowed
                // attributes is used from the Parser object.
                // rel/rev are present especially for RDFa.
                'a' => $extend( $common, [ 'href', 'rel', 'rev' ] ),

                // 13.2
                // Not usually allowed, but may be used for extension-style hooks
                // such as <math> when it is rasterized
                'img' => $extend( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
                // Attributes for A/V tags added in T163583 / T133673
                'audio' => $extend( $common, [ 'controls', 'preload', 'width', 'height' ] ),
                'video' => $extend( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
                'source' => $extend( $common, [ 'type', 'src' ] ),
                'track' => $extend( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

                // 15.2.1
                'tt' => $common,
                'b' => $common,
                'i' => $common,
                'big' => $common,
                'small' => $common,
                'strike' => $common,
                's' => $common,
                'u' => $common,

                // 15.2.2 (basefont is not included)
                'font' => $extend( $common, [ 'size', 'color', 'face' ] ),

                // 15.3
                'hr' => $extend( $common, [ 'width' ] ),

                // HTML Ruby annotation text module, simple ruby only.
                // https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
                // (rbc is not included; rt does not get the 'rbspan' attribute)
                'ruby' => $common,
                'rb' => $common,
                'rp' => $common,
                'rt' => $common,
                'rtc' => $common,

                // MathML root element, where used for extensions
                // 'title' may not be 100% valid here; it's XHTML
                // https://www.w3.org/TR/REC-MathML/
                'math' => $extend( [], [ 'class', 'style', 'id', 'title' ] ),

                // HTML 5 section 4.5
                'figure' => $common,
                'figcaption' => $common,

                // HTML 5 section 4.6
                'bdi' => $common,

                // HTML5 elements, defined by:
                // https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
                'data' => $extend( $common, [ 'value' ] ),
                'time' => $extend( $common, [ 'datetime' ] ),
                'mark' => $common,

                // meta and link are only permitted by internalRemoveHtmlTags when Microdata
                // is enabled so we don't bother adding a conditional to hide these
                // Also meta and link are only valid in WikiText as Microdata elements
                // (ie: validateTag rejects tags missing the attributes needed for Microdata)
                // So we don't bother including $common attributes that have no purpose.
                'meta' => $extend( [], [ 'itemprop', 'content' ] ),
                'link' => $extend( [], [ 'itemprop', 'href', 'title' ] ),

                // HTML 5 section 4.3.5
                'aside' => $common,
        ];

        return $allowed;
}
1597
/**
 * Reduce a fragment of (potentially invalid) HTML to plain text: all
 * tags are removed and only the text is kept.
 *
 * Warning: this return value must be further escaped for literal
 * inclusion in HTML output as of 1.10!
 *
 * @param string $html HTML fragment
 * @return string
 * @return-taint tainted
 */
public static function stripAllTags( string $html ): string {
        // RemexHtml tokenizes $html and the handler extracts the text
        $textHandler = new RemexStripTagHandler;
        $tokenizerOptions = [
                'ignoreErrors' => true,
                // don't ignore char refs, we want them to be decoded
                'ignoreNulls' => true,
                'skipPreprocess' => true,
        ];

        ( new RemexTokenizer( $textHandler, $html, $tokenizerOptions ) )->execute();

        return self::normalizeWhitespace( $textHandler->getResult() );
}
1624
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * Every semicolon-terminated HTML named entity gets an <!ENTITY>
 * declaration whose replacement text is its normalizeEntity() form.
 * Entities that normalizeEntity() keeps in word form (&lt; &gt; &amp;
 * &quot;) are left out, because declaring them in terms of themselves
 * would produce self-referential declarations.
 *
 * @deprecated since 1.36; will be made private or removed in a future
 *    release.
 * @return string The "<!DOCTYPE html [ ... ]>" declaration, newline-terminated
 */
public static function hackDocType(): string {
        $out = "<!DOCTYPE html [\n";
        foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
                if ( substr( $entity, -1 ) !== ';' ) {
                        // Some HTML entities omit the trailing semicolon;
                        // wikitext does not permit these.
                        continue;
                }
                $name = substr( $entity, 0, -1 );
                $expansion = self::normalizeEntity( $entity );
                // normalizeEntity() always returns a string that starts with
                // '&', while $entity (eg "lt;") has no leading ampersand. The
                // comparison therefore has to include the '&', otherwise it
                // can never succeed and the core entities are not skipped.
                if ( $expansion === "&$entity" ) {
                        // Skip &lt; &gt; etc
                        continue;
                }
                $out .= "<!ENTITY $name \"$expansion\">";
        }
        $out .= "]>\n";
        return $out;
}
1654
/**
 * Clean up a URL: decode character references, percent-encode unsafe
 * characters and tidy the host portion.
 *
 * Steps, in order:
 * - character references are decoded;
 * - runs of square brackets, angle brackets, double quotes, pipes and
 *   the bytes 0x00-0x20 and 0x7F are passed through urlencode();
 * - when the result has a "protocol:" prefix, characters that are
 *   ignored in IDNs are removed from the "//host" part and a
 *   percent-encoded bracketed IPv6 literal (with optional port) gets
 *   its brackets back.
 *
 * @param string $url
 * @return string
 */
public static function cleanUrl( string $url ): string {
        // Normalize any HTML entities in input. They will be
        // re-escaped by makeExternalLink().
        $url = self::decodeCharReferences( $url );

        // Escape any control characters introduced by the above step
        $url = preg_replace_callback(
                '/[\][<>"\\x00-\\x20\\x7F\|]+/',
                static function ( array $run ): string {
                        return urlencode( $run[0] );
                },
                $url
        );

        // Validate hostname portion; a URL without a "protocol:" prefix is
        // returned as it stands at this point.
        $parts = [];
        if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
                return $url;
        }
        [ , $protocol, $host, $rest ] = $parts;

        // Characters that will be ignored in IDNs.
        // https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
        // https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
        // Strip them before further processing so deny lists and such work.
        $ignorable = "/
                                \\s|      # general whitespace
                                \u{00AD}|               # SOFT HYPHEN
                                \u{034F}|               # COMBINING GRAPHEME JOINER
                                \u{061C}|               # ARABIC LETTER MARK
                                [\u{115F}-\u{1160}]|    # HANGUL CHOSEONG FILLER..
                                                        # HANGUL JUNGSEONG FILLER
                                [\u{17B4}-\u{17B5}]|    # KHMER VOWEL INHERENT AQ..
                                                        # KHMER VOWEL INHERENT AA
                                [\u{180B}-\u{180D}]|    # MONGOLIAN FREE VARIATION SELECTOR ONE..
                                                        # MONGOLIAN FREE VARIATION SELECTOR THREE
                                \u{180E}|               # MONGOLIAN VOWEL SEPARATOR
                                [\u{200B}-\u{200F}]|    # ZERO WIDTH SPACE..
                                                        # RIGHT-TO-LEFT MARK
                                [\u{202A}-\u{202E}]|    # LEFT-TO-RIGHT EMBEDDING..
                                                        # RIGHT-TO-LEFT OVERRIDE
                                [\u{2060}-\u{2064}]|    # WORD JOINER..
                                                        # INVISIBLE PLUS
                                \u{2065}|               # <reserved-2065>
                                [\u{2066}-\u{206F}]|    # LEFT-TO-RIGHT ISOLATE..
                                                        # NOMINAL DIGIT SHAPES
                                \u{3164}|               # HANGUL FILLER
                                [\u{FE00}-\u{FE0F}]|    # VARIATION SELECTOR-1..
                                                        # VARIATION SELECTOR-16
                                \u{FEFF}|               # ZERO WIDTH NO-BREAK SPACE
                                \u{FFA0}|               # HALFWIDTH HANGUL FILLER
                                [\u{FFF0}-\u{FFF8}]|    # <reserved-FFF0>..
                                                        # <reserved-FFF8>
                                [\u{1BCA0}-\u{1BCA3}]|  # SHORTHAND FORMAT LETTER OVERLAP..
                                                        # SHORTHAND FORMAT UP STEP
                                [\u{1D173}-\u{1D17A}]|  # MUSICAL SYMBOL BEGIN BEAM..
                                                        # MUSICAL SYMBOL END PHRASE
                                \u{E0000}|              # <reserved-E0000>
                                \u{E0001}|              # LANGUAGE TAG
                                [\u{E0002}-\u{E001F}]|  # <reserved-E0002>..
                                                        # <reserved-E001F>
                                [\u{E0020}-\u{E007F}]|  # TAG SPACE..
                                                        # CANCEL TAG
                                [\u{E0080}-\u{E00FF}]|  # <reserved-E0080>..
                                                        # <reserved-E00FF>
                                [\u{E0100}-\u{E01EF}]|  # VARIATION SELECTOR-17..
                                                        # VARIATION SELECTOR-256
                                [\u{E01F0}-\u{E0FFF}]|  # <reserved-E01F0>..
                                                        # <reserved-E0FFF>
                                /xuD";

        $host = preg_replace( $ignorable, '', $host );

        // IPv6 host names are bracketed with []. The escaping step above
        // turned the brackets into %5B / %5D, so url-decode them here.
        $ipv6 = [];
        if ( str_starts_with( $host, "//%5B" ) &&
                preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
        ) {
                $host = '//[' . $ipv6[1] . ']' . $ipv6[2];
        }

        // @todo FIXME: Validate hostnames here

        return $protocol . $host . $rest;
}
1735
/**
 * Does a string look like an e-mail address?
 *
 * This validates an email address using an HTML5 specification found at:
 * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
 * Which as of 2011-01-24 says:
 *
 *   A valid e-mail address is a string that matches the ABNF production
 *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
 *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
 *   3.5.
 *
 * This function is an implementation of the specification as requested in
 * T24449.
 *
 * Client-side forms will use the same standard validation rules via JS or
 * HTML 5 validation; additional restrictions can be enforced server-side
 * by extensions via the 'isValidEmailAddr' hook. When the hook aborts
 * (returns false), the $result it set is returned and the built-in check
 * is skipped.
 *
 * Note that this validation doesn't 100% match RFC 2822, but is believed
 * to be liberal enough for wide use. Some invalid addresses will still
 * pass validation here.
 *
 * The whole string has to match: an address followed by a trailing
 * newline is rejected.
 *
 * @since 1.18
 *
 * @param string $addr E-mail address
 * @return bool
 */
public static function validateEmail( string $addr ): bool {
        $result = null;
        // TODO This method should be non-static, and have a HookRunner injected
        $hookRunner = new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
        if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
                return $result;
        }

        // Please note strings below are enclosed in brackets [], this make the
        // hyphen "-" a range indicator. Hence it is double backslashed below.
        // See T28948
        $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
        $rfc1034_ldh_str = "a-z0-9\\-";

        // The D modifier is required: without it "$" also matches just
        // before a final newline, so "user@example.org\n" would be accepted.
        $html5_email_regexp = "/
                ^                      # start of string
                [$rfc5322_atext\\.]+    # user part which is liberal :p
                @                      # 'apostrophe'
                [$rfc1034_ldh_str]+       # First domain part
                (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
                $                      # End of string
                /ixD"; // case Insensitive, eXtended, Dollar matches at the very end only

        return (bool)preg_match( $html5_email_regexp, $addr );
}
1789 }
1790
1791 /** @deprecated class alias since 1.41; registers the un-namespaced name 'Sanitizer' for MediaWiki\Parser\Sanitizer */
1792 class_alias( Sanitizer::class, 'Sanitizer' );