includes/parser/Sanitizer.php

   1 <?php
   2 /**
   3  * HTML sanitizer for %MediaWiki.
   4  *
   5  * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * https://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  27 namespace MediaWiki\Parser;
  28
  29 use InvalidArgumentException;
  30 use LogicException;
  31 use MediaWiki\HookContainer\HookRunner;
  32 use MediaWiki\MediaWikiServices;
  33 use MediaWiki\Tidy\RemexCompatFormatter;
  34 use StringUtils;
  35 use UnexpectedValueException;
  36 use Wikimedia\RemexHtml\HTMLData;
  37 use Wikimedia\RemexHtml\Serializer\Serializer as RemexSerializer;
  38 use Wikimedia\RemexHtml\Tokenizer\Tokenizer as RemexTokenizer;
  39 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher as RemexDispatcher;
  40 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder as RemexTreeBuilder;
  41
  42 /**
  43  * HTML sanitizer for MediaWiki
  44  * @ingroup Parser
  45  */
  46 class Sanitizer {
   47         /**
   48          * Regular expression to match various types of character references in
   49          * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
   50          * Note that HTML5 allows some named entities to omit the trailing
   51          * semicolon; wikitext entities *must* have a trailing semicolon.
               *
               * Capture groups: (1) named entity including its trailing semicolon,
               * (2) decimal code point, (3) hexadecimal code point, (4) a bare ampersand.
   52          */
   53         private const CHAR_REFS_REGEX =
   54                 '/&([A-Za-z0-9\x80-\xff]+;)
   55                 |&\#([0-9]+);
   56                 |&\#[xX]([0-9A-Fa-f]+);
   57                 |(&)/x';
   58

   59         /**
   60          * Acceptable tag name charset from HTML5 parsing spec
   61          * https://www.w3.org/TR/html5/syntax.html#tag-open-state
               *
               * Applied to the text following a "<". Capture groups: (1) optional
               * leading slash, (2) tag name, (3) attribute text, (4) the closing ">"
               * or "/>", (5) any text following the tag.
   62          */
   63         private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
   64

   65         /**
   66          * Pattern matching evil uris like javascript:
   67          * WARNING: DO NOT use this in any place that actually requires denying
   68          * certain URIs for security reasons. There are NUMEROUS[1] ways to bypass
   69          * pattern-based deny lists; the only way to be secure from javascript:
   70          * uri based xss vectors is to allow only things that you know are safe
   71          * and deny everything else.
   72          * [1]: http://ha.ckers.org/xss.html
   73          */
   74         private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
              /**
               * Matches attribute names that are XML namespace declarations of the
               * form "xmlns:prefix".
               */
   75         private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
   76

   77         /**
   78          * Mode flag for the ID-escaping methods (e.g. escapeIdForAttribute()):
               * encode the ID using the wiki's primary encoding.
   79          *
   80          * @since 1.30
   81          */
   82         public const ID_PRIMARY = 0;
   83

   84         /**
   85          * Mode flag for the ID-escaping methods: encode the ID using the fallback
               * encoding, or return false
   86          * if no fallback is configured.
   87          *
   88          * @since 1.30
   89          */
   90         public const ID_FALLBACK = 1;
   91

   92         /**
   93          * Character entity aliases accepted by MediaWiki in wikitext.
   94          * These are not part of the HTML standard.
               * Keys and values both include the trailing semicolon.
   95          */
   96         private const MW_ENTITY_ALIASES = [
   97                 'רלמ;' => 'rlm;',
   98                 'رلم;' => 'rlm;',
   99         ];
 100
 101         /**
 102          * Lazy-initialised attributes regex, see getAttribsRegex()
 103          */
 104         private static $attribsRegex;
 105
 106         /**
 107          * Regular expression to match HTML/XML attribute pairs within a tag.
 108          * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 109          * Used in Sanitizer::decodeTagAttributes
 110          * @return string
 111          */
 112         private static function getAttribsRegex() {
 113                 if ( self::$attribsRegex === null ) {
 114                         $spaceChars = '\x09\x0a\x0c\x0d\x20';
 115                         $space = "[{$spaceChars}]";
 116                         $attrib = "[^{$spaceChars}\/>=]";
 117                         $attribFirst = "(?:{$attrib}|=)";
 118                         self::$attribsRegex =
 119                                 "/({$attribFirst}{$attrib}*)
 120                                         ($space*=$space*
 121                                         (?:
 122                                                 # The attribute value: quoted or alone
 123                                                 \"([^\"]*)(?:\"|\$)
 124                                                 | '([^']*)(?:'|\$)
 125                                                 | (((?!$space|>).)*)
 126                                         )
 127                                 )?/sxu";
 128                 }
 129                 return self::$attribsRegex;
 130         }
 131
 132         /**
 133          * Lazy-initialised attribute name regex, see getAttribNameRegex()
 134          */
 135         private static $attribNameRegex;
 136
 137         /**
 138          * Used in Sanitizer::decodeTagAttributes to filter attributes.
 139          * @return string
 140          */
 141         private static function getAttribNameRegex() {
 142                 if ( self::$attribNameRegex === null ) {
 143                         $attribFirst = "[:_\p{L}\p{N}]";
 144                         $attrib = "[:_\.\-\p{L}\p{N}]";
 145                         self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
 146                 }
 147                 return self::$attribNameRegex;
 148         }
 149
 150         /**
 151          * Return the various lists of recognized tags
 152          * @param string[] $extratags For any extra tags to include
 153          * @param string[] $removetags For any tags (default or extra) to exclude
 154          * @return array
 155          * @internal
 156          */
 157         public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
 158                 global $wgAllowImageTag;
 159                 static $commonCase, $staticInitialised;
 160                 $isCommonCase = ( $extratags === [] && $removetags === [] );
 161                 if ( $staticInitialised === $wgAllowImageTag && $isCommonCase && $commonCase ) {
 162                         return $commonCase;
 163                 }
 164
 165                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 166                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
 167
 168                 // Base our staticInitialised variable off of the global config state so that if the globals
 169                 // are changed (like in the screwed up test system) we will re-initialise the settings.
 170                 $globalContext = $wgAllowImageTag;
 171                 if ( !$staticInitialised || $staticInitialised !== $globalContext ) {
 172                         $htmlpairsStatic = [ # Tags that must be closed
 173                                 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 174                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 175                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 176                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 177                                 'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
 178                                 'kbd', 'samp', 'data', 'time', 'mark'
 179                         ];
 180                         # These tags can be self-closed. For tags not also on
 181                         # $htmlsingleonly, a self-closed tag will be emitted as
 182                         # an empty element (open-tag/close-tag pair).
 183                         $htmlsingle = [
 184                                 'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
 185                         ];
 186
 187                         # Elements that cannot have close tags. This is (not coincidentally)
 188                         # also the list of tags for which the HTML 5 parsing algorithm
 189                         # requires you to "acknowledge the token's self-closing flag", i.e.
 190                         # a self-closing tag like <br/> is not an HTML 5 parse error only
 191                         # for this list.
 192                         $htmlsingleonly = [
 193                                 'br', 'wbr', 'hr', 'meta', 'link'
 194                         ];
 195
 196                         $htmlnest = [ # Tags that can be nested--??
 197                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 198                                 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
 199                                 'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
 200                         ];
 201                         $tabletags = [ # Can only appear inside table, we will close them
 202                                 'td', 'th', 'tr',
 203                         ];
 204                         $htmllist = [ # Tags used by list
 205                                 'ul', 'ol',
 206                         ];
 207                         $listtags = [ # Tags that can appear in a list
 208                                 'li',
 209                         ];
 210
 211                         if ( $wgAllowImageTag ) {
 212                                 wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
 213                                         'is deprecated since MediaWiki 1.35', '1.35', false, false );
 214                                 $htmlsingle[] = 'img';
 215                                 $htmlsingleonly[] = 'img';
 216                         }
 217
 218                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 219                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 220
 221                         # Convert them all to hashtables for faster lookup
 222                         $vars = [ 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 223                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ];
 224                         foreach ( $vars as $var ) {
 225                                 $$var = array_fill_keys( $$var, true );
 226                         }
 227                         $staticInitialised = $globalContext;
 228                 }
 229
 230                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
 231                 $extratags = array_fill_keys( $extratags, true );
 232                 $removetags = array_fill_keys( $removetags, true );
 233                 // @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
 234                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
 235                 // @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal The static var is always set
 236                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
 237
 238                 $result = [
 239                         'htmlpairs' => $htmlpairs,
 240                         'htmlsingle' => $htmlsingle,
 241                         'htmlsingleonly' => $htmlsingleonly,
 242                         'htmlnest' => $htmlnest,
 243                         'tabletags' => $tabletags,
 244                         'htmllist' => $htmllist,
 245                         'listtags' => $listtags,
 246                         'htmlsingleallowed' => $htmlsingleallowed,
 247                         'htmlelements' => $htmlelements,
 248                 ];
 249                 if ( $isCommonCase ) {
 250                         $commonCase = $result;
 251                 }
 252                 return $result;
 253         }
 254
 255         /**
 256          * Cleans up HTML, removes dangerous tags and attributes, and
 257          * removes HTML comments; BEWARE there may be unmatched HTML
 258          * tags in the result.
 259          *
 260          * @note Callers are recommended to use `::removeSomeTags()`
 261          * instead of this method.  `Sanitizer::removeSomeTags()` is safer
 262          * and will always return well-formed HTML; however, it is
 263          * significantly slower (especially for short strings where setup
 264          * costs predominate).  This method, although faster, should only
 265          * be used where we know the result be cleaned up in a subsequent
 266          * tidy pass.
 267          *
 268          * @param string $text Original string; see T268353 for why untainted.
 269          * @param-taint $text none
 270          * @param callable|null $processCallback Callback to do any variable or
 271          *   parameter replacements in HTML attribute values.
 272          *   This argument should be considered @internal.
 273          * @param-taint $processCallback exec_shell
 274          * @param array|bool $args Arguments for the processing callback
 275          * @param-taint $args none
 276          * @param array $extratags For any extra tags to include
 277          * @param-taint $extratags tainted
 278          * @param array $removetags For any tags (default or extra) to exclude
 279          * @param-taint $removetags none
 280          * @return string
 281          * @return-taint escaped
 282          * @deprecated since 1.38. Use ::removeSomeTags(), which always gives
 283          * balanced/tidy HTML.
 284          */
 285         public static function removeHTMLtags( $text, $processCallback = null,
 286                 $args = [], $extratags = [], $removetags = []
 287         ) {
 288                 wfDeprecated( __METHOD__, '1.38' );
 289                 return self::internalRemoveHtmlTags(
 290                         $text, $processCallback, $args, $extratags, $removetags
 291                 );
 292         }
 293
 294         /**
 295          * Cleans up HTML, removes dangerous tags and attributes, and
 296          * removes HTML comments; BEWARE there may be unmatched HTML
 297          * tags in the result.
 298          *
 299          * @note Callers are recommended to use `::removeSomeTags()` instead
 300          * of this method.  `Sanitizer::removeSomeTags()` is safer and will
 301          * always return well-formed HTML; however, it is significantly
 302          * slower (especially for short strings where setup costs
 303          * predominate).  This method is for internal use by the legacy parser
 304          * where we know the result will be cleaned up in a subsequent tidy pass.
 305          *
 306          * @param string $text Original string; see T268353 for why untainted.
 307          * @param-taint $text none
 308          * @param callable|null $processCallback Callback to do any variable or
 309          *   parameter replacements in HTML attribute values.
 310          *   This argument should be considered @internal.
 311          * @param-taint $processCallback exec_shell
 312          * @param array|bool $args Arguments for the processing callback
 313          * @param-taint $args none
 314          * @param array $extratags For any extra tags to include
 315          * @param-taint $extratags tainted
 316          * @param array $removetags For any tags (default or extra) to exclude
 317          * @param-taint $removetags none
 318          * @return string
 319          * @return-taint escaped
 320          * @internal
 321          */
 322         public static function internalRemoveHtmlTags( $text, $processCallback = null,
 323                 $args = [], $extratags = [], $removetags = []
 324         ) {
 325                 $tagData = self::getRecognizedTagData( $extratags, $removetags );
 326                 $htmlsingle = $tagData['htmlsingle'];
 327                 $htmlsingleonly = $tagData['htmlsingleonly'];
 328                 $htmlelements = $tagData['htmlelements'];
 329
 330                 # Remove HTML comments
 331                 $text = self::removeHTMLcomments( $text );
 332                 $bits = explode( '<', $text );
 333                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 334
 335                 # this might be possible using remex tidy itself
 336                 foreach ( $bits as $x ) {
 337                         if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
 338                                 [ /* $qbar */, $slash, $t, $params, $brace, $rest ] = $regs;
 339
 340                                 $badtag = false;
 341                                 $t = strtolower( $t );
 342                                 if ( isset( $htmlelements[$t] ) ) {
 343                                         if ( is_callable( $processCallback ) ) {
 344                                                 call_user_func_array( $processCallback, [ &$params, $args ] );
 345                                         }
 346
 347                                         if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
 348                                                 // Remove the self-closing slash, to be consistent
 349                                                 // with HTML5 semantics. T134423
 350                                                 $brace = '>';
 351                                         }
 352                                         if ( !self::validateTag( $params, $t ) ) {
 353                                                 $badtag = true;
 354                                         }
 355
 356                                         $newparams = self::fixTagAttributes( $params, $t );
 357                                         if ( !$badtag ) {
 358                                                 if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
 359                                                         # Interpret self-closing tags as empty tags even when
 360                                                         # HTML 5 would interpret them as start tags. Such input
 361                                                         # is commonly seen on Wikimedia wikis with this intention.
 362                                                         $brace = "></$t>";
 363                                                 }
 364
 365                                                 $rest = str_replace( '>', '&gt;', $rest );
 366                                                 $text .= "<$slash$t$newparams$brace$rest";
 367                                                 continue;
 368                                         }
 369                                 }
 370                         }
 371                         $text .= '&lt;' . str_replace( '>', '&gt;', $x );
 372                 }
 373                 return $text;
 374         }
 375
 376         /**
 377          * Cleans up HTML, removes dangerous tags and attributes, and
 378          * removes HTML comments; the result will always be balanced and
 379          * tidy HTML.
 380          * @param string $text Source string; see T268353 for why untainted
 381          * @param-taint  $text none
 382          * @param array $options Options controlling the cleanup:
 383          *    string[] $options['extraTags'] Any extra tags to allow
 384          *      (This property taints the whole array.)
 385          *    string[] $options['removeTags'] Any tags (default or extra) to exclude
 386          *    callable(Attributes,...):Attributes $options['attrCallback'] Callback
 387          *      to do any variable or parameter replacements in HTML attribute
 388          *      values before further cleanup; should be considered @internal
 389          *      and not for external use.
 390          *    array $options['attrCallbackArgs'] Additional arguments for the
 391          *      attribute callback
 392          * @param-taint $options tainted
 393          * @return string The cleaned up HTML
 394          * @return-taint escaped
 395          * @since 1.38
 396          */
 397         public static function removeSomeTags(
 398                 string $text, array $options = []
 399         ): string {
 400                 $extraTags = $options['extraTags'] ?? [];
 401                 $removeTags = $options['removeTags'] ?? [];
 402                 // These options are @internal:
 403                 $attrCallback = $options['attrCallback'] ?? null;
 404                 $attrCallbackArgs = $options['attrCallbackArgs'] ?? [];
 405
 406                 // This disallows HTML5-style "missing trailing semicolon" attributes
 407                 // In wikitext "clean&copy" does *not* contain an entity.
 408                 $text = self::normalizeCharReferences( $text );
 409
 410                 $tagData = self::getRecognizedTagData( $extraTags, $removeTags );
 411                 // Use RemexHtml to tokenize $text and remove the barred tags
 412                 $formatter = new RemexCompatFormatter;
 413                 $serializer = new RemexSerializer( $formatter );
 414                 $treeBuilder = new RemexTreeBuilder( $serializer, [
 415                         'ignoreErrors' => true,
 416                         'ignoreNulls' => true,
 417                 ] );
 418                 $dispatcher = new RemexDispatcher( $treeBuilder );
 419                 $tokenHandler = $dispatcher;
 420                 $remover = new RemexRemoveTagHandler(
 421                         $tokenHandler, $text, $tagData,
 422                         $attrCallback, $attrCallbackArgs
 423                 );
 424                 $tokenizer = new RemexTokenizer( $remover, $text, [
 425                         'ignoreErrors' => true,
 426                         // don't ignore char refs, we want them to be decoded
 427                         'ignoreNulls' => true,
 428                         'skipPreprocess' => true,
 429                 ] );
 430                 $tokenizer->execute( [
 431                         'fragmentNamespace' => HTMLData::NS_HTML,
 432                         'fragmentName' => 'body',
 433                 ] );
 434                 return $serializer->getResult();
 435         }
 436
 437         /**
 438          * Remove '<!--', '-->', and everything between.
 439          * To avoid leaving blank lines, when a comment is both preceded
 440          * and followed by a newline (ignoring spaces), trim leading and
 441          * trailing spaces and one of the newlines.
 442          *
 443          * @param string $text
 444          * @return string
 445          */
 446         public static function removeHTMLcomments( $text ) {
 447                 while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
 448                         $end = strpos( $text, '-->', $start + 4 );
 449                         if ( $end === false ) {
 450                                 # Unterminated comment; bail out
 451                                 break;
 452                         }
 453
 454                         $end += 3;
 455
 456                         # Trim space and newline if the comment is both
 457                         # preceded and followed by a newline
 458                         $spaceStart = max( $start - 1, 0 );
 459                         $spaceLen = $end - $spaceStart;
 460                         while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
 461                                 $spaceStart--;
 462                                 $spaceLen++;
 463                         }
 464                         while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
 465                                 $spaceLen++;
 466                         }
 467                         if ( substr( $text, $spaceStart, 1 ) === "\n"
 468                                 && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
 469                                 # Remove the comment, leading and trailing
 470                                 # spaces, and leave only one newline.
 471                                 $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
 472                         } else {
 473                                 # Remove just the comment.
 474                                 $text = substr_replace( $text, '', $start, $end - $start );
 475                         }
 476                 }
 477                 return $text;
 478         }
 479
 480         /**
 481          * Takes attribute names and values for a tag and the tag name and
 482          * validates that the tag is allowed to be present.
 483          * This DOES NOT validate the attributes, nor does it validate the
 484          * tags themselves. This method only handles the special circumstances
 485          * where we may want to allow a tag within content but ONLY when it has
 486          * specific attributes set.
 487          *
 488          * @param string $params
 489          * @param string $element
 490          * @return bool
 491          *
 492          * @see RemexRemoveTagHandler::validateTag()
 493          */
 494         private static function validateTag( $params, $element ) {
 495                 $params = self::decodeTagAttributes( $params );
 496
 497                 if ( $element == 'meta' || $element == 'link' ) {
 498                         if ( !isset( $params['itemprop'] ) ) {
 499                                 // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
 500                                 return false;
 501                         }
 502                         if ( $element == 'meta' && !isset( $params['content'] ) ) {
 503                                 // <meta> must have a content="" for the itemprop
 504                                 return false;
 505                         }
 506                         if ( $element == 'link' && !isset( $params['href'] ) ) {
 507                                 // <link> must have an associated href=""
 508                                 return false;
 509                         }
 510                 }
 511
 512                 return true;
 513         }
 514
 515         /**
 516          * Take an array of attribute names and values and normalize or discard
 517          * illegal values for the given element type.
 518          *
 519          * - Discards attributes not allowed for the given element
 520          * - Unsafe style attributes are discarded
 521          * - Invalid id attributes are re-encoded
 522          *
 523          * @param array $attribs
 524          * @param string $element
 525          * @return array
 526          *
 527          * @todo Check for legal values where the DTD limits things.
 528          * @todo Check for unique id attribute :P
 529          */
 530         public static function validateTagAttributes( $attribs, $element ) {
 531                 return self::validateAttributes( $attribs,
 532                         self::attributesAllowedInternal( $element ) );
 533         }
 534
 535         /**
 536          * Take an array of attribute names and values and normalize or discard
 537          * illegal values.
 538          *
 539          * - Discards attributes not on the given list
 540          * - Unsafe style attributes are discarded
 541          * - Invalid id attributes are re-encoded
 542          *
 543          * @param array $attribs
 544          * @param array $allowed List of allowed attribute names,
 545          *   as an associative array where keys give valid attribute names
 546          *   (since 1.34).  Before 1.35, passing a sequential array of
 547          *   valid attribute names was permitted but that is now deprecated.
 548          * @return array
 549          *
 550          * @todo Check for legal values where the DTD limits things.
 551          * @todo Check for unique id attribute :P
 552          */
 553         public static function validateAttributes( $attribs, $allowed ) {
 554                 if ( isset( $allowed[0] ) ) {
 555                         // Calling this function with a sequential array is
 556                         // deprecated.  For now just convert it.
 557                         wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
 558                         $allowed = array_fill_keys( $allowed, true );
 559                 }
 560                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
 561
 562                 $out = [];
 563                 foreach ( $attribs as $attribute => $value ) {
 564                         # Allow XML namespace declaration to allow RDFa
 565                         if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
 566                                 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 567                                         $out[$attribute] = $value;
 568                                 }
 569
 570                                 continue;
 571                         }
 572
 573                         # Allow any attribute beginning with "data-"
 574                         # However:
 575                         # * Disallow data attributes used by MediaWiki code
 576                         # * Ensure that the attribute is not namespaced by banning
 577                         #   colons.
 578                         if ( (
 579                                 !preg_match( '/^data-[^:]*$/i', $attribute ) &&
 580                                 !array_key_exists( $attribute, $allowed )
 581                         ) || self::isReservedDataAttribute( $attribute ) ) {
 582                                 continue;
 583                         }
 584
 585                         # Strip javascript "expression" from stylesheets.
 586                         # https://msdn.microsoft.com/en-us/library/ms537634.aspx
 587                         if ( $attribute == 'style' ) {
 588                                 $value = self::checkCss( $value );
 589                         }
 590
 591                         # Escape HTML id attributes
 592                         if ( $attribute === 'id' ) {
 593                                 $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
 594                         }
 595
 596                         # Escape HTML id reference lists
 597                         if ( $attribute === 'aria-describedby'
 598                                 || $attribute === 'aria-flowto'
 599                                 || $attribute === 'aria-labelledby'
 600                                 || $attribute === 'aria-owns'
 601                         ) {
 602                                 $value = self::escapeIdReferenceListInternal( $value );
 603                         }
 604
 605                         // RDFa and microdata properties allow URLs, URIs and/or CURIs.
 606                         if ( $attribute === 'rel' || $attribute === 'rev'
 607                                 # RDFa
 608                                 || $attribute === 'about' || $attribute === 'property'
 609                                 || $attribute === 'resource' || $attribute === 'datatype'
 610                                 || $attribute === 'typeof'
 611                                 # HTML5 microdata
 612                                 || $attribute === 'itemid' || $attribute === 'itemprop'
 613                                 || $attribute === 'itemref' || $attribute === 'itemscope'
 614                                 || $attribute === 'itemtype'
 615                         ) {
 616                                 // Paranoia. Allow "simple" values but suppress javascript
 617                                 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
 618                                         continue;
 619                                 }
 620                         }
 621
 622                         # NOTE: even though elements using href/src are not allowed directly, supply
 623                         #       validation code that can be used by tag hook handlers, etc
 624                         if ( $attribute === 'href' || $attribute === 'src' || $attribute === 'poster' ) {
 625                                 if ( !preg_match( $hrefExp, $value ) ) {
 626                                         continue; // drop any href or src attributes not using an allowed protocol.
 627                                         // NOTE: this also drops all relative URLs
 628                                 }
 629                         }
 630
 631                         if ( $attribute === 'tabindex' && $value !== '0' ) {
 632                                 // Only allow tabindex of 0, which is useful for accessibility.
 633                                 continue;
 634                         }
 635
 636                         // If this attribute was previously set, override it.
 637                         // Output should only have one attribute of each name.
 638                         $out[$attribute] = $value;
 639                 }
 640
 641                 # itemtype, itemid, itemref don't make sense without itemscope
 642                 if ( !array_key_exists( 'itemscope', $out ) ) {
 643                         unset( $out['itemtype'] );
 644                         unset( $out['itemid'] );
 645                         unset( $out['itemref'] );
 646                 }
 647                 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
 648
 649                 return $out;
 650         }
 651
 652         /**
 653          * Given an attribute name, checks whether it is a reserved data attribute
 654          * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
 655          * core and extension code can safely use it to communicate with frontend code.
 656          * @param string $attr Attribute name.
 657          * @return bool
 658          */
 659         public static function isReservedDataAttribute( $attr ) {
 660                 // data-ooui is reserved for ooui.
 661                 // data-mw and data-parsoid are reserved for parsoid.
 662                 // data-mw-<name here> is reserved for extensions (or core) if
 663                 // they need to communicate some data to the client and want to be
 664                 // sure that it isn't coming from an untrusted user.
 665                 // We ignore the possibility of namespaces since user-generated HTML
 666                 // can't use them anymore.
 667                 return (bool)preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
 668         }
 669
 670         /**
 671          * Merge two sets of HTML attributes.  Conflicting items in the second set
 672          * will override those in the first, except for 'class' attributes which
 673          * will be combined (if they're both strings).
 674          *
 675          * @todo implement merging for other attributes such as style
 676          * @param array $a
 677          * @param array $b
 678          * @return array
 679          */
 680         public static function mergeAttributes( $a, $b ) {
 681                 $out = array_merge( $a, $b );
 682                 if ( isset( $a['class'] ) && isset( $b['class'] )
 683                         && is_string( $a['class'] ) && is_string( $b['class'] )
 684                         && $a['class'] !== $b['class']
 685                 ) {
 686                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 687                                 -1, PREG_SPLIT_NO_EMPTY );
 688                         $out['class'] = implode( ' ', array_unique( $classes ) );
 689                 }
 690                 return $out;
 691         }
 692
        /**
         * Normalize CSS into a format we can easily search for hostile input
         *  - decode character references
         *  - decode escape sequences
         *  - remove comments, unless the entire value is one single comment
         *  - cut the value off at any comment-start token that is left over
         *    after comment removal
         *
         * The steps run in exactly this order; each later step relies on the
         * earlier ones having already been applied.
         *
         * @param string $value the css string
         * @return string normalized css
         */
        public static function normalizeCss( $value ) {
                // Decode character references like &#123;
                $value = self::decodeCharReferences( $value );

                // Decode escape sequences and line continuation
                // See the grammar in the CSS 2 spec, appendix D.
                // This has to be done AFTER decoding character references.
                // This means it isn't possible for this function to return
                // unsanitized escape sequences. It is possible to manufacture
                // input that contains character references that decode to
                // escape sequences that decode to character references, but
                // it's OK for the return value to contain character references
                // because the caller is supposed to escape those anyway.
                //
                // The pattern is assembled on first use and kept in a function-level
                // static for later calls.
                static $decodeRegex;
                if ( !$decodeRegex ) {
                        $space = '[\\x20\\t\\r\\n\\f]';
                        $nl = '(?:\\n|\\r\\n|\\r|\\f)';
                        $backslash = '\\\\';
                        $decodeRegex = "/ $backslash
                                (?:
                                        ($nl) |  # 1. Line continuation
                                        ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
                                        (.) | # 3. backslash cancelling special meaning
                                        () | # 4. backslash at end of string
                                )/xu";
                }
                // Each escape sequence found is rewritten by cssDecodeCallback()
                $value = preg_replace_callback( $decodeRegex,
                        [ __CLASS__, 'cssDecodeCallback' ], $value );

                // Let the value through if it's nothing but a single comment, to
                // allow other functions which may reject it to pass some error
                // message through.
                if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
                        // Remove any comments; IE gets token splitting wrong
                        // This must be done AFTER decoding character references and
                        // escape sequences, because those steps can introduce comments
                        // This step cannot introduce character references or escape
                        // sequences, because it replaces comments with spaces rather
                        // than removing them completely.
                        $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

                        // Remove anything after a comment-start token, to guard against
                        // incorrect client implementations.
                        $commentPos = strpos( $value, '/*' );
                        if ( $commentPos !== false ) {
                                // Keep only the part before the comment-start token
                                $value = substr( $value, 0, $commentPos );
                        }
                }

                return $value;
        }
 752
 753         /**
 754          * Pick apart some CSS and check it for forbidden or unsafe structures.
 755          * Returns a sanitized string. This sanitized string will have
 756          * character references and escape sequences decoded and comments
 757          * stripped (unless it is itself one valid comment, in which case the value
 758          * will be passed through). If the input is just too evil, only a comment
 759          * complaining about evilness will be returned.
 760          *
 761          * Currently URL references, 'expression', 'tps' are forbidden.
 762          *
 763          * NOTE: Despite the fact that character references are decoded, the
 764          * returned string may contain character references given certain
 765          * clever input strings. These character references must
 766          * be escaped before the return value is embedded in HTML.
 767          *
 768          * @param string $value
 769          * @return string
 770          */
 771         public static function checkCss( $value ) {
 772                 $value = self::normalizeCss( $value );
 773
 774                 // Reject problematic keywords and control characters
 775                 if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
 776                         strpos( $value, \UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
 777                         return '/* invalid control char */';
 778                 } elseif ( preg_match(
 779                         '! expression
 780                                 | filter\s*:
 781                                 | accelerator\s*:
 782                                 | -o-link\s*:
 783                                 | -o-link-source\s*:
 784                                 | -o-replace\s*:
 785                                 | url\s*\(
 786                                 | image\s*\(
 787                                 | image-set\s*\(
 788                                 | attr\s*\([^)]+[\s,]+url
 789                         !ix', $value ) ) {
 790                         return '/* insecure input */';
 791                 }
 792                 return $value;
 793         }
 794
 795         /**
 796          * @param array $matches
 797          * @return string
 798          */
 799         private static function cssDecodeCallback( $matches ) {
 800                 if ( $matches[1] !== '' ) {
 801                         // Line continuation
 802                         return '';
 803                 } elseif ( $matches[2] !== '' ) {
 804                         # hexdec could return a float if the match is too long, but the
 805                         # regexp in question limits the string length to 6.
 806                         $char = \UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
 807                 } elseif ( $matches[3] !== '' ) {
 808                         $char = $matches[3];
 809                 } else {
 810                         $char = '\\';
 811                 }
 812                 if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
 813                         // These characters need to be escaped in strings
 814                         // Clean up the escape sequence to avoid parsing errors by clients
 815                         return '\\' . dechex( ord( $char ) ) . ' ';
 816                 } else {
 817                         // Decode unnecessary escape
 818                         return $char;
 819                 }
 820         }
 821
 822         /**
 823          * Take a tag soup fragment listing an HTML element's attributes
 824          * and normalize it to well-formed XML, discarding unwanted attributes.
 825          * Output is safe for further wikitext processing, with escaping of
 826          * values that could trigger problems.
 827          *
 828          * - Normalizes attribute names to lowercase
 829          * - Discards attributes not allowed for the given element
 830          * - Turns broken or invalid entities into plaintext
 831          * - Double-quotes all attribute values
 832          * - Attributes without values are given the name as attribute
 833          * - Double attributes are discarded
 834          * - Unsafe style attributes are discarded
 835          * - Prepends space if there are attributes.
 836          * - (Optionally) Sorts attributes by name.
 837          *
 838          * @param string $text
 839          * @param string $element
 840          * @param bool $sorted Whether to sort the attributes (default: false)
 841          * @return string
 842          */
 843         public static function fixTagAttributes( $text, $element, $sorted = false ) {
 844                 if ( trim( $text ) == '' ) {
 845                         return '';
 846                 }
 847
 848                 $decoded = self::decodeTagAttributes( $text );
 849                 $stripped = self::validateTagAttributes( $decoded, $element );
 850
 851                 if ( $sorted ) {
 852                         ksort( $stripped );
 853                 }
 854
 855                 return self::safeEncodeTagAttributes( $stripped );
 856         }
 857
 858         /**
 859          * Encode an attribute value for HTML output.
 860          * @param string $text
 861          * @param-taint $text escapes_html
 862          * @return string HTML-encoded text fragment
 863          * @return-taint escaped
 864          */
 865         public static function encodeAttribute( $text ) {
 866                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
 867
 868                 // Whitespace is normalized during attribute decoding,
 869                 // so if we've been passed non-spaces we must encode them
 870                 // ahead of time or they won't be preserved.
 871                 $encValue = strtr( $encValue, [
 872                         "\n" => '&#10;',
 873                         "\r" => '&#13;',
 874                         "\t" => '&#9;',
 875                 ] );
 876
 877                 return $encValue;
 878         }
 879
 880         /**
 881          * Armor French spaces with a replacement character
 882          *
 883          * @since 1.32
 884          * @param string $text Text to armor
 885          * @param string $space Space character for the French spaces, defaults to '&#160;'
 886          * @return string Armored text
 887          */
 888         public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
 889                 // Replace $ with \$ and \ with \\
 890                 $space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
 891                 $fixtags = [
 892                         # French spaces, last one Guillemet-left
 893                         # only if it isn't followed by a word character.
 894                         '/ (?=[?:;!%»›](?!\w))/u' => "$space",
 895                         # French spaces, Guillemet-right
 896                         '/([«‹]) /u' => "\\1$space",
 897                 ];
 898                 return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
 899         }
 900
 901         /**
 902          * Encode an attribute value for HTML tags, with extra armoring
 903          * against further wiki processing.
 904          * @param string $text
 905          * @param-taint $text escapes_html
 906          * @return string HTML-encoded text fragment
 907          * @return-taint escaped
 908          */
 909         public static function safeEncodeAttribute( $text ) {
 910                 $encValue = self::encodeAttribute( $text );
 911
 912                 # Templates and links may be expanded in later parsing,
 913                 # creating invalid or dangerous output. Suppress this.
 914                 $encValue = strtr( $encValue, [
 915                         '<'    => '&lt;',   // This should never happen,
 916                         '>'    => '&gt;',   // we've received invalid input
 917                         '"'    => '&quot;', // which should have been escaped.
 918                         '{'    => '&#123;',
 919                         '}'    => '&#125;', // prevent unpaired language conversion syntax
 920                         '['    => '&#91;',
 921                         ']'    => '&#93;',
 922                         "''"   => '&#39;&#39;',
 923                         'ISBN' => '&#73;SBN',
 924                         'RFC'  => '&#82;FC',
 925                         'PMID' => '&#80;MID',
 926                         '|'    => '&#124;',
 927                         '__'   => '&#95;_',
 928                 ] );
 929
 930                 # Stupid hack
 931                 $encValue = preg_replace_callback(
 932                         '/((?i)' . wfUrlProtocols() . ')/',
 933                         static function ( $matches ) {
 934                                 return str_replace( ':', '&#58;', $matches[1] );
 935                         },
 936                         $encValue );
 937                 return $encValue;
 938         }
 939
 940         /**
 941          * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 942          * a valid HTML id attribute.
 943          *
 944          * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 945          * proper escaping.
 946          *
 947          * @param string $id String to escape
 948          * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 949          *     should be used.
 950          * @return string|false Escaped ID or false if fallback encoding is requested but it's not
 951          *     configured.
 952          *
 953          * @since 1.30
 954          */
 955         public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
 956                 global $wgFragmentMode;
 957
 958                 if ( !isset( $wgFragmentMode[$mode] ) ) {
 959                         if ( $mode === self::ID_PRIMARY ) {
 960                                 throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
 961                         }
 962                         return false;
 963                 }
 964
 965                 $internalMode = $wgFragmentMode[$mode];
 966
 967                 return self::escapeIdInternal( $id, $internalMode );
 968         }
 969
 970         /**
 971          * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 972          * a valid URL fragment.
 973          *
 974          * WARNING: The output of this function is not guaranteed to be HTML safe, so be sure to use
 975          * proper escaping.
 976          *
 977          * @param string $id String to escape
 978          * @return string Escaped ID
 979          *
 980          * @since 1.30
 981          */
 982         public static function escapeIdForLink( $id ) {
 983                 global $wgFragmentMode;
 984
 985                 if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
 986                         throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
 987                 }
 988
 989                 $mode = $wgFragmentMode[self::ID_PRIMARY];
 990
 991                 $id = self::escapeIdInternalUrl( $id, $mode );
 992
 993                 return $id;
 994         }
 995
 996         /**
 997          * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 998          * a valid URL fragment for external interwikis.
 999          *
1000          * @param string $id String to escape
1001          * @return string Escaped ID
1002          *
1003          * @since 1.30
1004          */
1005         public static function escapeIdForExternalInterwiki( $id ) {
1006                 global $wgExternalInterwikiFragmentMode;
1007
1008                 $id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode );
1009
1010                 return $id;
1011         }
1012
1013         /**
1014          * Do percent encoding of percent signs for href (but not id) attributes
1015          *
1016          * @since 1.35
1017          * @see https://phabricator.wikimedia.org/T238385
1018          * @param string $id String to escape
1019          * @param string $mode One of modes from $wgFragmentMode
1020          * @return string
1021          */
1022         private static function escapeIdInternalUrl( $id, $mode ) {
1023                 $id = self::escapeIdInternal( $id, $mode );
1024                 if ( $mode === 'html5' ) {
1025                         $id = preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $id );
1026                 }
1027                 return $id;
1028         }
1029
1030         /**
1031          * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
1032          *
1033          * @param string $id String to escape
1034          * @param string $mode One of modes from $wgFragmentMode
1035          * @return string
1036          */
1037         private static function escapeIdInternal( $id, $mode ) {
1038                 // Truncate overly-long IDs.  This isn't an HTML limit, it's just
1039                 // griefer protection. [T251506]
1040                 $id = mb_substr( $id, 0, 1024 );
1041
1042                 switch ( $mode ) {
1043                         case 'html5':
1044                                 // html5 spec says ids must not have any of the following:
1045                                 // U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE
1046                                 // In practice, in wikitext, only tab, LF, CR (and SPACE) are
1047                                 // possible using either Lua or html entities.
1048                                 $id = str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id );
1049                                 break;
1050                         case 'legacy':
1051                                 // This corresponds to 'noninitial' mode of the former escapeId()
1052                                 static $replace = [
1053                                         '%3A' => ':',
1054                                         '%' => '.'
1055                                 ];
1056
1057                                 $id = urlencode( str_replace( ' ', '_', $id ) );
1058                                 $id = strtr( $id, $replace );
1059                                 break;
1060                         default:
1061                                 throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ );
1062                 }
1063
1064                 return $id;
1065         }
1066
1067         /**
1068          * Given a string containing a space delimited list of ids, escape each id
1069          * to match ids escaped by the escapeIdForAttribute() function.
1070          *
1071          * @param string $referenceString Space delimited list of ids
1072          * @return string
1073          */
1074         private static function escapeIdReferenceListInternal( $referenceString ) {
1075                 # Explode the space delimited list string into an array of tokens
1076                 $references = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
1077
1078                 # Escape each token as an id
1079                 foreach ( $references as &$ref ) {
1080                         $ref = self::escapeIdForAttribute( $ref );
1081                 }
1082
1083                 # Merge the array back to a space delimited list string
1084                 # If the array is empty, the result will be an empty string ('')
1085                 $referenceString = implode( ' ', $references );
1086
1087                 return $referenceString;
1088         }
1089
1090         /**
1091          * Given a value, escape it so that it can be used as a CSS class and
1092          * return it.
1093          *
1094          * @todo For extra validity, input should be validated UTF-8.
1095          *
1096          * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
1097          *
1098          * @param string $class
1099          * @return string
1100          */
1101         public static function escapeClass( $class ) {
1102                 // Convert ugly stuff to underscores and kill underscores in ugly places
1103                 return rtrim( preg_replace(
1104                         [ '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ],
1105                         '_',
1106                         $class ), '_' );
1107         }
1108
1109         /**
1110          * Given HTML input, escape with htmlspecialchars but un-escape entities.
1111          * This allows (generally harmless) entities like &#160; to survive.
1112          *
1113          * @param string $html HTML to escape
1114          * @param-taint $html escapes_htmlnoent
1115          * @return string Escaped input
1116          * @return-taint escaped
1117          */
1118         public static function escapeHtmlAllowEntities( $html ) {
1119                 $html = self::decodeCharReferences( $html );
1120                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
1121                 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
1122                 # don't cause the entire string to disappear.
1123                 $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
1124                 return $html;
1125         }
1126
1127         /**
1128          * Return an associative array of attribute names and values from
1129          * a partial tag string. Attribute names are forced to lowercase,
1130          * character references are decoded to UTF-8 text.
1131          *
1132          * @param string $text
1133          * @return array
1134          */
1135         public static function decodeTagAttributes( $text ) {
1136                 if ( trim( $text ) == '' ) {
1137                         return [];
1138                 }
1139
1140                 $pairs = [];
1141                 if ( !preg_match_all(
1142                         self::getAttribsRegex(),
1143                         $text,
1144                         $pairs,
1145                         PREG_SET_ORDER ) ) {
1146                         return [];
1147                 }
1148
1149                 $attribs = [];
1150                 foreach ( $pairs as $set ) {
1151                         $attribute = strtolower( $set[1] );
1152
1153                         // Filter attribute names with unacceptable characters
1154                         if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
1155                                 continue;
1156                         }
1157
1158                         $value = self::getTagAttributeCallback( $set );
1159
1160                         // Normalize whitespace
1161                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
1162                         $value = trim( $value );
1163
1164                         // Decode character references
1165                         $attribs[$attribute] = self::decodeCharReferences( $value );
1166                 }
1167                 return $attribs;
1168         }
1169
1170         /**
1171          * Build a partial tag string from an associative array of attribute
1172          * names and values as returned by decodeTagAttributes.
1173          *
1174          * @param array $assoc_array
1175          * @return string
1176          */
1177         public static function safeEncodeTagAttributes( $assoc_array ) {
1178                 $attribs = [];
1179                 foreach ( $assoc_array as $attribute => $value ) {
1180                         $encAttribute = htmlspecialchars( $attribute, ENT_COMPAT );
1181                         $encValue = self::safeEncodeAttribute( $value );
1182
1183                         $attribs[] = "$encAttribute=\"$encValue\"";
1184                 }
1185                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
1186         }
1187
1188         /**
1189          * Pick the appropriate attribute value from a match set from the
1190          * attribs regex matches.
1191          *
1192          * @param array $set
1193          * @return string
1194          */
1195         private static function getTagAttributeCallback( $set ) {
1196                 if ( isset( $set[5] ) ) {
1197                         # No quotes.
1198                         return $set[5];
1199                 } elseif ( isset( $set[4] ) ) {
1200                         # Single-quoted
1201                         return $set[4];
1202                 } elseif ( isset( $set[3] ) ) {
1203                         # Double-quoted
1204                         return $set[3];
1205                 } elseif ( !isset( $set[2] ) ) {
1206                         # In XHTML, attributes must have a value so return an empty string.
1207                         # See "Empty attribute syntax",
1208                         # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
1209                         return "";
1210                 } else {
1211                         throw new LogicException( "Tag conditions not met. This should never happen and is a bug." );
1212                 }
1213         }
1214
1215         /**
1216          * @param string $text
1217          * @return string
1218          */
1219         private static function normalizeWhitespace( $text ) {
1220                 return trim( preg_replace(
1221                         '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
1222                         ' ',
1223                         $text ) );
1224         }
1225
1226         /**
1227          * Normalizes whitespace in a section name, such as might be returned
1228          * by Parser::stripSectionName(), for use in the id's that are used for
1229          * section links.
1230          *
1231          * @param string $section
1232          * @return string
1233          */
1234         public static function normalizeSectionNameWhitespace( $section ) {
1235                 return trim( preg_replace( '/[ _]+/', ' ', $section ) );
1236         }
1237
1238         /**
1239          * Ensure that any entities and character references are legal
1240          * for XML and XHTML specifically. Any stray bits will be
1241          * &amp;-escaped to result in a valid text fragment.
1242          *
1243          * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
1244          *   numericized (this way we're well-formed even without a DTD)
1245          * b. any numeric char refs must be legal chars, not invalid or forbidden
1246          * c. use lower cased "&#x", not "&#X"
1247          * d. fix or reject non-valid attributes
1248          *
1249          * @param string $text
1250          * @return string
1251          * @internal
1252          */
1253         public static function normalizeCharReferences( $text ) {
1254                 return preg_replace_callback(
1255                         self::CHAR_REFS_REGEX,
1256                         [ self::class, 'normalizeCharReferencesCallback' ],
1257                         $text );
1258         }
1259
1260         /**
1261          * @param string $matches
1262          * @return string
1263          */
1264         private static function normalizeCharReferencesCallback( $matches ) {
1265                 $ret = null;
1266                 if ( $matches[1] != '' ) {
1267                         $ret = self::normalizeEntity( $matches[1] );
1268                 } elseif ( $matches[2] != '' ) {
1269                         $ret = self::decCharReference( $matches[2] );
1270                 } elseif ( $matches[3] != '' ) {
1271                         $ret = self::hexCharReference( $matches[3] );
1272                 }
1273                 if ( $ret === null ) {
1274                         return htmlspecialchars( $matches[0], ENT_COMPAT );
1275                 } else {
1276                         return $ret;
1277                 }
1278         }
1279
1280         /**
1281          * If the named entity is defined in HTML5
1282          * return the equivalent numeric entity reference (except for the core &lt;
1283          * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
1284          * the HTML equivalent. Otherwise, returns HTML-escaped text of
1285          * pseudo-entity source (eg &amp;foo;)
1286          *
1287          * @param string $name Semicolon-terminated name
1288          * @return string
1289          */
1290         private static function normalizeEntity( $name ) {
1291                 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1292                         // Non-standard MediaWiki-specific entities
1293                         return '&' . self::MW_ENTITY_ALIASES[$name];
1294                 } elseif ( in_array( $name, [ 'lt;', 'gt;', 'amp;', 'quot;' ], true ) ) {
1295                         // Keep these in word form
1296                         return "&$name";
1297                 } elseif ( isset( HTMLData::$namedEntityTranslations[$name] ) ) {
1298                         // Beware: some entities expand to more than 1 codepoint
1299                         return preg_replace_callback( '/./Ssu', static function ( $m ) {
1300                                 return '&#' . \UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
1301                         }, HTMLData::$namedEntityTranslations[$name] );
1302                 } else {
1303                         return "&amp;$name";
1304                 }
1305         }
1306
1307         /**
1308          * @param int|string $codepoint
1309          * @return null|string
1310          */
1311         private static function decCharReference( $codepoint ) {
1312                 # intval() will (safely) saturate at the maximum signed integer
1313                 # value if $codepoint is too many digits
1314                 $point = intval( $codepoint );
1315                 if ( self::validateCodepoint( $point ) ) {
1316                         return sprintf( '&#%d;', $point );
1317                 } else {
1318                         return null;
1319                 }
1320         }
1321
1322         /**
1323          * @param string $codepoint
1324          * @return null|string
1325          */
1326         private static function hexCharReference( $codepoint ) {
1327                 # hexdec() will return a float (not an int) if $codepoint is too
1328                 # long, so protect against that.  The largest valid codepoint is
1329                 # 0x10FFFF.
1330                 if ( strlen( ltrim( $codepoint, '0' ) ) > 6 ) {
1331                         return null;
1332                 }
1333                 $point = hexdec( $codepoint );
1334                 if ( self::validateCodepoint( $point ) ) {
1335                         return sprintf( '&#x%x;', $point );
1336                 } else {
1337                         return null;
1338                 }
1339         }
1340
1341         /**
1342          * Returns true if a given Unicode codepoint is a valid character in
1343          * both HTML5 and XML.
1344          * @param int $codepoint
1345          * @return bool
1346          */
1347         private static function validateCodepoint( $codepoint ) {
1348                 # U+000C is valid in HTML5 but not allowed in XML.
1349                 # U+000D is valid in XML but not allowed in HTML5.
1350                 # U+007F - U+009F are disallowed in HTML5 (control characters).
1351                 return $codepoint == 0x09
1352                         || $codepoint == 0x0a
1353                         || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
1354                         || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
1355                         || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1356                         || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1357         }
1358
1359         /**
1360          * Decode any character references, numeric or named entities,
1361          * in the text and return a UTF-8 string.
1362          *
1363          * @param string $text
1364          * @return string
1365          */
1366         public static function decodeCharReferences( $text ) {
1367                 return preg_replace_callback(
1368                         self::CHAR_REFS_REGEX,
1369                         [ self::class, 'decodeCharReferencesCallback' ],
1370                         $text );
1371         }
1372
1373         /**
1374          * Decode any character references, numeric or named entities,
1375          * in the next and normalize the resulting string. (T16952)
1376          *
1377          * This is useful for page titles, not for text to be displayed,
1378          * MediaWiki allows HTML entities to escape normalization as a feature.
1379          *
1380          * @param string $text Already normalized, containing entities
1381          * @return string Still normalized, without entities
1382          */
1383         public static function decodeCharReferencesAndNormalize( $text ) {
1384                 $text = preg_replace_callback(
1385                         self::CHAR_REFS_REGEX,
1386                         [ self::class, 'decodeCharReferencesCallback' ],
1387                         $text,
1388                         -1, // limit
1389                         $count
1390                 );
1391
1392                 if ( $count ) {
1393                         return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text );
1394                 } else {
1395                         return $text;
1396                 }
1397         }
1398
1399         /**
1400          * @param string $matches
1401          * @return string
1402          */
1403         private static function decodeCharReferencesCallback( $matches ) {
1404                 if ( $matches[1] != '' ) {
1405                         return self::decodeEntity( $matches[1] );
1406                 } elseif ( $matches[2] != '' ) {
1407                         return self::decodeChar( intval( $matches[2] ) );
1408                 } elseif ( $matches[3] != '' ) {
1409                         # hexdec will return a float if the string is too long (!) so
1410                         # check the length of the string first.
1411                         if ( strlen( ltrim( $matches[3], '0' ) ) > 6 ) {
1412                                 // Invalid character reference.
1413                                 return \UtfNormal\Constants::UTF8_REPLACEMENT;
1414                         }
1415                         return self::decodeChar( hexdec( $matches[3] ) );
1416                 }
1417                 # Last case should be an ampersand by itself
1418                 return $matches[0];
1419         }
1420
1421         /**
1422          * Return UTF-8 string for a codepoint if that is a valid
1423          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
1424          * @param int $codepoint
1425          * @return string
1426          * @internal
1427          */
1428         private static function decodeChar( $codepoint ) {
1429                 if ( self::validateCodepoint( $codepoint ) ) {
1430                         return \UtfNormal\Utils::codepointToUtf8( $codepoint );
1431                 } else {
1432                         return \UtfNormal\Constants::UTF8_REPLACEMENT;
1433                 }
1434         }
1435
1436         /**
1437          * If the named entity is defined in HTML5
1438          * return the UTF-8 encoding of that character. Otherwise, returns
1439          * pseudo-entity source (eg "&foo;")
1440          *
1441          * @param string $name Semicolon-terminated entity name
1442          * @return string
1443          */
1444         private static function decodeEntity( $name ) {
1445                 // These are MediaWiki-specific entities, not in the HTML standard
1446                 if ( isset( self::MW_ENTITY_ALIASES[$name] ) ) {
1447                         $name = self::MW_ENTITY_ALIASES[$name];
1448                 }
1449                 $trans = HTMLData::$namedEntityTranslations[$name] ?? null;
1450                 return $trans ?? "&$name";
1451         }
1452
1453         /**
1454          * Fetch the list of acceptable attributes for a given element name.
1455          *
1456          * @param string $element
1457          * @return array An associative array where keys are acceptable attribute
1458          *   names
1459          */
1460         private static function attributesAllowedInternal( $element ) {
1461                 $list = self::setupAttributesAllowedInternal();
1462                 return $list[$element] ?? [];
1463         }
1464
1465         /**
1466          * Foreach array key (an allowed HTML element), return an array
1467          * of allowed attributes.
1468          * @return array An associative array: keys are HTML element names;
1469          *   values are associative arrays where the keys are allowed attribute
1470          *   names.
1471          */
1472         private static function setupAttributesAllowedInternal() {
1473                 static $allowed;
1474
1475                 if ( $allowed !== null ) {
1476                         return $allowed;
1477                 }
1478
1479                 // For lookup efficiency flip each attributes array so the keys are
1480                 // the valid attributes.
1481                 $merge = static function ( $a, $b, $c = [] ) {
1482                         return array_merge(
1483                                 $a,
1484                                 array_fill_keys( $b, true ),
1485                                 array_fill_keys( $c, true ) );
1486                 };
1487                 $common = $merge( [], [
1488                         # HTML
1489                         'id',
1490                         'class',
1491                         'style',
1492                         'lang',
1493                         'dir',
1494                         'title',
1495                         'tabindex',
1496
1497                         # WAI-ARIA
1498                         'aria-describedby',
1499                         'aria-flowto',
1500                         'aria-hidden',
1501                         'aria-label',
1502                         'aria-labelledby',
1503                         'aria-level',
1504                         'aria-owns',
1505                         'role',
1506
1507                         # RDFa
1508                         # These attributes are specified in section 9 of
1509                         # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1510                         'about',
1511                         'property',
1512                         'resource',
1513                         'datatype',
1514                         'typeof',
1515
1516                         # Microdata. These are specified by
1517                         # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
1518                         'itemid',
1519                         'itemprop',
1520                         'itemref',
1521                         'itemscope',
1522                         'itemtype',
1523                 ] );
1524
1525                 $block = $merge( $common, [ 'align' ] );
1526
1527                 $tablealign = [ 'align', 'valign' ];
1528                 $tablecell = [
1529                         'abbr',
1530                         'axis',
1531                         'headers',
1532                         'scope',
1533                         'rowspan',
1534                         'colspan',
1535                         'nowrap', # deprecated
1536                         'width', # deprecated
1537                         'height', # deprecated
1538                         'bgcolor', # deprecated
1539                 ];
1540
1541                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1542                 # See: https://www.w3.org/TR/html4/
1543                 $allowed = [
1544                         # 7.5.4
1545                         'div'        => $block,
1546                         'center'     => $common, # deprecated
1547                         'span'       => $common,
1548
1549                         # 7.5.5
1550                         'h1'         => $block,
1551                         'h2'         => $block,
1552                         'h3'         => $block,
1553                         'h4'         => $block,
1554                         'h5'         => $block,
1555                         'h6'         => $block,
1556
1557                         # 7.5.6
1558                         # address
1559
1560                         # 8.2.4
1561                         'bdo'        => $common,
1562
1563                         # 9.2.1
1564                         'em'         => $common,
1565                         'strong'     => $common,
1566                         'cite'       => $common,
1567                         'dfn'        => $common,
1568                         'code'       => $common,
1569                         'samp'       => $common,
1570                         'kbd'        => $common,
1571                         'var'        => $common,
1572                         'abbr'       => $common,
1573                         # acronym
1574
1575                         # 9.2.2
1576                         'blockquote' => $merge( $common, [ 'cite' ] ),
1577                         'q'          => $merge( $common, [ 'cite' ] ),
1578
1579                         # 9.2.3
1580                         'sub'        => $common,
1581                         'sup'        => $common,
1582
1583                         # 9.3.1
1584                         'p'          => $block,
1585
1586                         # 9.3.2
1587                         'br'         => $merge( $common, [ 'clear' ] ),
1588
1589                         # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
1590                         'wbr'        => $common,
1591
1592                         # 9.3.4
1593                         'pre'        => $merge( $common, [ 'width' ] ),
1594
1595                         # 9.4
1596                         'ins'        => $merge( $common, [ 'cite', 'datetime' ] ),
1597                         'del'        => $merge( $common, [ 'cite', 'datetime' ] ),
1598
1599                         # 10.2
1600                         'ul'         => $merge( $common, [ 'type' ] ),
1601                         'ol'         => $merge( $common, [ 'type', 'start', 'reversed' ] ),
1602                         'li'         => $merge( $common, [ 'type', 'value' ] ),
1603
1604                         # 10.3
1605                         'dl'         => $common,
1606                         'dd'         => $common,
1607                         'dt'         => $common,
1608
1609                         # 11.2.1
1610                         'table'      => $merge( $common,
1611                                                                 [ 'summary', 'width', 'border', 'frame',
1612                                                                                 'rules', 'cellspacing', 'cellpadding',
1613                                                                                 'align', 'bgcolor',
1614                                                                 ] ),
1615
1616                         # 11.2.2
1617                         'caption'    => $block,
1618
1619                         # 11.2.3
1620                         'thead'      => $common,
1621                         'tfoot'      => $common,
1622                         'tbody'      => $common,
1623
1624                         # 11.2.4
1625                         'colgroup'   => $merge( $common, [ 'span' ] ),
1626                         'col'        => $merge( $common, [ 'span' ] ),
1627
1628                         # 11.2.5
1629                         'tr'         => $merge( $common, [ 'bgcolor' ], $tablealign ),
1630
1631                         # 11.2.6
1632                         'td'         => $merge( $common, $tablecell, $tablealign ),
1633                         'th'         => $merge( $common, $tablecell, $tablealign ),
1634
1635                         # 12.2
1636                         # NOTE: <a> is not allowed directly, but this list of allowed
1637                         # attributes is used from the Parser object
1638                         'a'          => $merge( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa
1639
1640                         # 13.2
1641                         # Not usually allowed, but may be used for extension-style hooks
1642                         # such as <math> when it is rasterized, or if $wgAllowImageTag is
1643                         # true
1644                         'img'        => $merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
1645                         # Attributes for A/V tags added in T163583 / T133673
1646                         'audio'      => $merge( $common, [ 'controls', 'preload', 'width', 'height' ] ),
1647                         'video'      => $merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
1648                         'source'     => $merge( $common, [ 'type', 'src' ] ),
1649                         'track'      => $merge( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),
1650
1651                         # 15.2.1
1652                         'tt'         => $common,
1653                         'b'          => $common,
1654                         'i'          => $common,
1655                         'big'        => $common,
1656                         'small'      => $common,
1657                         'strike'     => $common,
1658                         's'          => $common,
1659                         'u'          => $common,
1660
1661                         # 15.2.2
1662                         'font'       => $merge( $common, [ 'size', 'color', 'face' ] ),
1663                         # basefont
1664
1665                         # 15.3
1666                         'hr'         => $merge( $common, [ 'width' ] ),
1667
1668                         # HTML Ruby annotation text module, simple ruby only.
1669                         # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
1670                         'ruby'       => $common,
1671                         # rbc
1672                         'rb'         => $common,
1673                         'rp'         => $common,
1674                         'rt'         => $common, # $merge( $common, [ 'rbspan' ] ),
1675                         'rtc'        => $common,
1676
1677                         # MathML root element, where used for extensions
1678                         # 'title' may not be 100% valid here; it's XHTML
1679                         # https://www.w3.org/TR/REC-MathML/
1680                         'math'       => $merge( [], [ 'class', 'style', 'id', 'title' ] ),
1681
1682                         // HTML 5 section 4.5
1683                         'figure'     => $common,
1684                         'figcaption' => $common,
1685
1686                         # HTML 5 section 4.6
1687                         'bdi' => $common,
1688
1689                         # HTML5 elements, defined by:
1690                         # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
1691                         'data' => $merge( $common, [ 'value' ] ),
1692                         'time' => $merge( $common, [ 'datetime' ] ),
1693                         'mark' => $common,
1694
1695                         // meta and link are only permitted by internalRemoveHtmlTags when Microdata
1696                         // is enabled so we don't bother adding a conditional to hide these
1697                         // Also meta and link are only valid in WikiText as Microdata elements
1698                         // (ie: validateTag rejects tags missing the attributes needed for Microdata)
1699                         // So we don't bother including $common attributes that have no purpose.
1700                         'meta' => $merge( [], [ 'itemprop', 'content' ] ),
1701                         'link' => $merge( [], [ 'itemprop', 'href', 'title' ] ),
1702
1703                         # HTML 5 section 4.3.5
1704                         'aside' => $common,
1705                 ];
1706
1707                 return $allowed;
1708         }
1709
1710         /**
1711          * Take a fragment of (potentially invalid) HTML and return
1712          * a version with any tags removed, encoded as plain text.
1713          *
1714          * Warning: this return value must be further escaped for literal
1715          * inclusion in HTML output as of 1.10!
1716          *
1717          * @param string $html HTML fragment
1718          * @return string
1719          * @return-taint tainted
1720          */
1721         public static function stripAllTags( $html ) {
1722                 // Use RemexHtml to tokenize $html and extract the text
1723                 $handler = new RemexStripTagHandler;
1724                 $tokenizer = new RemexTokenizer( $handler, $html, [
1725                         'ignoreErrors' => true,
1726                         // don't ignore char refs, we want them to be decoded
1727                         'ignoreNulls' => true,
1728                         'skipPreprocess' => true,
1729                 ] );
1730                 $tokenizer->execute();
1731                 $text = $handler->getResult();
1732
1733                 $text = self::normalizeWhitespace( $text );
1734                 return $text;
1735         }
1736
1737         /**
1738          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1739          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1740          * PHP 5.1 doesn't.
1741          *
1742          * Use for passing XHTML fragments to PHP's XML parsing functions
1743          *
1744          * @return string
1745          * @deprecated since 1.36; will be made private or removed in a future
1746          *    release.
1747          */
1748         public static function hackDocType() {
1749                 $out = "<!DOCTYPE html [\n";
1750                 foreach ( HTMLData::$namedEntityTranslations as $entity => $translation ) {
1751                         if ( substr( $entity, -1 ) !== ';' ) {
1752                                 // Some HTML entities omit the trailing semicolon;
1753                                 // wikitext does not permit these.
1754                                 continue;
1755                         }
1756                         $name = substr( $entity, 0, -1 );
1757                         $expansion = self::normalizeEntity( $entity );
1758                         if ( $entity === $expansion ) {
1759                                 // Skip &lt; &gt; etc
1760                                 continue;
1761                         }
1762                         $out .= "<!ENTITY $name \"$expansion\">";
1763                 }
1764                 $out .= "]>\n";
1765                 return $out;
1766         }
1767
1768         /**
1769          * @param string $url
1770          * @return mixed|string
1771          */
1772         public static function cleanUrl( $url ) {
1773                 # Normalize any HTML entities in input. They will be
1774                 # re-escaped by makeExternalLink().
1775                 $url = self::decodeCharReferences( $url );
1776
1777                 # Escape any control characters introduced by the above step
1778                 $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
1779                         [ __CLASS__, 'cleanUrlCallback' ], $url );
1780
1781                 # Validate hostname portion
1782                 $matches = [];
1783                 if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1784                         [ /* $whole */, $protocol, $host, $rest ] = $matches;
1785
1786                         // Characters that will be ignored in IDNs.
1787                         // https://datatracker.ietf.org/doc/html/rfc8264#section-9.13
1788                         // https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
1789                         // Strip them before further processing so deny lists and such work.
1790                         $strip = "/
1791                                 \\s|      # general whitespace
1792                                 \u{00AD}|               # SOFT HYPHEN
1793                                 \u{034F}|               # COMBINING GRAPHEME JOINER
1794                                 \u{061C}|               # ARABIC LETTER MARK
1795                                 [\u{115F}-\u{1160}]|    # HANGUL CHOSEONG FILLER..
1796                                                         # HANGUL JUNGSEONG FILLER
1797                                 [\u{17B4}-\u{17B5}]|    # KHMER VOWEL INHERENT AQ..
1798                                                         # KHMER VOWEL INHERENT AA
1799                                 [\u{180B}-\u{180D}]|    # MONGOLIAN FREE VARIATION SELECTOR ONE..
1800                                                         # MONGOLIAN FREE VARIATION SELECTOR THREE
1801                                 \u{180E}|               # MONGOLIAN VOWEL SEPARATOR
1802                                 [\u{200B}-\u{200F}]|    # ZERO WIDTH SPACE..
1803                                                         # RIGHT-TO-LEFT MARK
1804                                 [\u{202A}-\u{202E}]|    # LEFT-TO-RIGHT EMBEDDING..
1805                                                         # RIGHT-TO-LEFT OVERRIDE
1806                                 [\u{2060}-\u{2064}]|    # WORD JOINER..
1807                                                         # INVISIBLE PLUS
1808                                 \u{2065}|               # <reserved-2065>
1809                                 [\u{2066}-\u{206F}]|    # LEFT-TO-RIGHT ISOLATE..
1810                                                         # NOMINAL DIGIT SHAPES
1811                                 \u{3164}|               # HANGUL FILLER
1812                                 [\u{FE00}-\u{FE0F}]|    # VARIATION SELECTOR-1..
1813                                                         # VARIATION SELECTOR-16
1814                                 \u{FEFF}|               # ZERO WIDTH NO-BREAK SPACE
1815                                 \u{FFA0}|               # HALFWIDTH HANGUL FILLER
1816                                 [\u{FFF0}-\u{FFF8}]|    # <reserved-FFF0>..
1817                                                         # <reserved-FFF8>
1818                                 [\u{1BCA0}-\u{1BCA3}]|  # SHORTHAND FORMAT LETTER OVERLAP..
1819                                                         # SHORTHAND FORMAT UP STEP
1820                                 [\u{1D173}-\u{1D17A}]|  # MUSICAL SYMBOL BEGIN BEAM..
1821                                                         # MUSICAL SYMBOL END PHRASE
1822                                 \u{E0000}|              # <reserved-E0000>
1823                                 \u{E0001}|              # LANGUAGE TAG
1824                                 [\u{E0002}-\u{E001F}]|  # <reserved-E0002>..
1825                                                         # <reserved-E001F>
1826                                 [\u{E0020}-\u{E007F}]|  # TAG SPACE..
1827                                                         # CANCEL TAG
1828                                 [\u{E0080}-\u{E00FF}]|  # <reserved-E0080>..
1829                                                         # <reserved-E00FF>
1830                                 [\u{E0100}-\u{E01EF}]|  # VARIATION SELECTOR-17..
1831                                                         # VARIATION SELECTOR-256
1832                                 [\u{E01F0}-\u{E0FFF}]|  # <reserved-E01F0>..
1833                                                         # <reserved-E0FFF>
1834                                 /xuD";
1835
1836                         $host = preg_replace( $strip, '', $host );
1837
1838                         // IPv6 host names are bracketed with [].  Url-decode these.
1839                         if ( str_starts_with( $host, "//%5B" ) &&
1840                                 preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches )
1841                         ) {
1842                                 $host = '//[' . $matches[1] . ']' . $matches[2];
1843                         }
1844
1845                         // @todo FIXME: Validate hostnames here
1846
1847                         return $protocol . $host . $rest;
1848                 } else {
1849                         return $url;
1850                 }
1851         }
1852
1853         /**
1854          * @param array $matches
1855          * @return string
1856          */
1857         private static function cleanUrlCallback( $matches ) {
1858                 return urlencode( $matches[0] );
1859         }
1860
1861         /**
1862          * Does a string look like an e-mail address?
1863          *
1864          * This validates an email address using an HTML5 specification found at:
1865          * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
1866          * Which as of 2011-01-24 says:
1867          *
1868          *   A valid e-mail address is a string that matches the ABNF production
1869          *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
1870          *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
1871          *   3.5.
1872          *
1873          * This function is an implementation of the specification as requested in
1874          * T24449.
1875          *
1876          * Client-side forms will use the same standard validation rules via JS or
1877          * HTML 5 validation; additional restrictions can be enforced server-side
1878          * by extensions via the 'isValidEmailAddr' hook.
1879          *
1880          * Note that this validation doesn't 100% match RFC 2822, but is believed
1881          * to be liberal enough for wide use. Some invalid addresses will still
1882          * pass validation here.
1883          *
1884          * @since 1.18
1885          *
1886          * @param string $addr E-mail address
1887          * @return bool
1888          */
1889         public static function validateEmail( $addr ) {
1890                 $result = null;
1891                 // TODO This method should be non-static, and have a HookRunner injected
1892                 $hookRunner = new HookRunner( MediaWikiServices::getInstance()->getHookContainer() );
1893                 if ( !$hookRunner->onIsValidEmailAddr( $addr, $result ) ) {
1894                         return $result;
1895                 }
1896
1897                 // Please note strings below are enclosed in brackets [], this make the
1898                 // hyphen "-" a range indicator. Hence it is double backslashed below.
1899                 // See T28948
1900                 $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
1901                 $rfc1034_ldh_str = "a-z0-9\\-";
1902
1903                 $html5_email_regexp = "/
1904                 ^                      # start of string
1905                 [$rfc5322_atext\\.]+    # user part which is liberal :p
1906                 @                      # 'apostrophe'
1907                 [$rfc1034_ldh_str]+       # First domain part
1908                 (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
1909                 $                      # End of string
1910                 /ix"; // case Insensitive, eXtended
1911
1912                 return (bool)preg_match( $html5_email_regexp, $addr );
1913         }
1914 }
1915
/**
 * Retain the old class name for backwards compatibility.
 *
 * Registers the global name 'Sanitizer' as an alias of the namespaced
 * Sanitizer class, so that code referring to the un-namespaced name still
 * resolves to it.
 *
 * @deprecated since 1.41
 */
class_alias( Sanitizer::class, 'Sanitizer' );