includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @addtogroup Parser
  24  */
  25
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
 *
 * The pattern is compiled in extended (/x) mode, so the line breaks and
 * indentation inside the string are not part of what is matched. Exactly
 * one capture group is filled per match:
 *   1 - name of a named reference, e.g. "amp" for &amp;
 *   2 - digits of a decimal reference, e.g. "160" for &#160;
 *   3 - digits of a hex reference written with a lowercase x (&#xA0;)
 *   4 - digits of a hex reference written with an uppercase X (&#XA0;)
 *   5 - a bare "&" that does not start any of the forms above
 *
 * NOTE(review): groups 3 and 4 accept any ASCII letter, not only a-f;
 * presumably malformed references are dealt with by the callers - confirm.
 */
define( 'MW_CHAR_REFS_REGEX',
        '/&([A-Za-z0-9]+);
         |&\#([0-9]+);
         |&\#x([0-9A-Za-z]+);
         |&\#X([0-9A-Za-z]+);
         |(&)/x' );
  36
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Compiled with the /s and /x modifiers; the '#' remarks inside the
 * pattern are regex comments, not PHP comments. Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part; not matched for a name-only attribute
 *   3 - value given in double quotes (quotes excluded, '<' not allowed)
 *   4 - value given in single quotes (quotes excluded, '<' not allowed)
 *   5 - unquoted value
 *   6 - unquoted value starting with '#', e.g. a color
 * An attribute has to be preceded by whitespace or the start of the
 * string and followed by whitespace or the end of the string.
 */
# Character class for a single character of an attribute name
$attrib = '[A-Za-z0-9]';
# Character class for whitespace: tab, line feed, carriage return, space
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  57
/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Keys are the entity names without the leading '&' and the trailing ';'.
 * They are case-sensitive: 'Dagger' and 'dagger', for example, are two
 * different entities. Values are the code points as decimal integers.
 * @private
 */
# NOTE(review): presumably declared global so that the table stays global
# when this file is included from inside a function - confirm.
global $wgHtmlEntities;
$wgHtmlEntities = array(
        'Aacute'   => 193,
        'aacute'   => 225,
        'Acirc'    => 194,
        'acirc'    => 226,
        'acute'    => 180,
        'AElig'    => 198,
        'aelig'    => 230,
        'Agrave'   => 192,
        'agrave'   => 224,
        'alefsym'  => 8501,
        'Alpha'    => 913,
        'alpha'    => 945,
        'amp'      => 38,
        'and'      => 8743,
        'ang'      => 8736,
        'Aring'    => 197,
        'aring'    => 229,
        'asymp'    => 8776,
        'Atilde'   => 195,
        'atilde'   => 227,
        'Auml'     => 196,
        'auml'     => 228,
        'bdquo'    => 8222,
        'Beta'     => 914,
        'beta'     => 946,
        'brvbar'   => 166,
        'bull'     => 8226,
        'cap'      => 8745,
        'Ccedil'   => 199,
        'ccedil'   => 231,
        'cedil'    => 184,
        'cent'     => 162,
        'Chi'      => 935,
        'chi'      => 967,
        'circ'     => 710,
        'clubs'    => 9827,
        'cong'     => 8773,
        'copy'     => 169,
        'crarr'    => 8629,
        'cup'      => 8746,
        'curren'   => 164,
        'dagger'   => 8224,
        'Dagger'   => 8225,
        'darr'     => 8595,
        'dArr'     => 8659,
        'deg'      => 176,
        'Delta'    => 916,
        'delta'    => 948,
        'diams'    => 9830,
        'divide'   => 247,
        'Eacute'   => 201,
        'eacute'   => 233,
        'Ecirc'    => 202,
        'ecirc'    => 234,
        'Egrave'   => 200,
        'egrave'   => 232,
        'empty'    => 8709,
        'emsp'     => 8195,
        'ensp'     => 8194,
        'Epsilon'  => 917,
        'epsilon'  => 949,
        'equiv'    => 8801,
        'Eta'      => 919,
        'eta'      => 951,
        'ETH'      => 208,
        'eth'      => 240,
        'Euml'     => 203,
        'euml'     => 235,
        'euro'     => 8364,
        'exist'    => 8707,
        'fnof'     => 402,
        'forall'   => 8704,
        'frac12'   => 189,
        'frac14'   => 188,
        'frac34'   => 190,
        'frasl'    => 8260,
        'Gamma'    => 915,
        'gamma'    => 947,
        'ge'       => 8805,
        'gt'       => 62,
        'harr'     => 8596,
        'hArr'     => 8660,
        'hearts'   => 9829,
        'hellip'   => 8230,
        'Iacute'   => 205,
        'iacute'   => 237,
        'Icirc'    => 206,
        'icirc'    => 238,
        'iexcl'    => 161,
        'Igrave'   => 204,
        'igrave'   => 236,
        'image'    => 8465,
        'infin'    => 8734,
        'int'      => 8747,
        'Iota'     => 921,
        'iota'     => 953,
        'iquest'   => 191,
        'isin'     => 8712,
        'Iuml'     => 207,
        'iuml'     => 239,
        'Kappa'    => 922,
        'kappa'    => 954,
        'Lambda'   => 923,
        'lambda'   => 955,
        'lang'     => 9001,
        'laquo'    => 171,
        'larr'     => 8592,
        'lArr'     => 8656,
        'lceil'    => 8968,
        'ldquo'    => 8220,
        'le'       => 8804,
        'lfloor'   => 8970,
        'lowast'   => 8727,
        'loz'      => 9674,
        'lrm'      => 8206,
        'lsaquo'   => 8249,
        'lsquo'    => 8216,
        'lt'       => 60,
        'macr'     => 175,
        'mdash'    => 8212,
        'micro'    => 181,
        'middot'   => 183,
        'minus'    => 8722,
        'Mu'       => 924,
        'mu'       => 956,
        'nabla'    => 8711,
        'nbsp'     => 160,
        'ndash'    => 8211,
        'ne'       => 8800,
        'ni'       => 8715,
        'not'      => 172,
        'notin'    => 8713,
        'nsub'     => 8836,
        'Ntilde'   => 209,
        'ntilde'   => 241,
        'Nu'       => 925,
        'nu'       => 957,
        'Oacute'   => 211,
        'oacute'   => 243,
        'Ocirc'    => 212,
        'ocirc'    => 244,
        'OElig'    => 338,
        'oelig'    => 339,
        'Ograve'   => 210,
        'ograve'   => 242,
        'oline'    => 8254,
        'Omega'    => 937,
        'omega'    => 969,
        'Omicron'  => 927,
        'omicron'  => 959,
        'oplus'    => 8853,
        'or'       => 8744,
        'ordf'     => 170,
        'ordm'     => 186,
        'Oslash'   => 216,
        'oslash'   => 248,
        'Otilde'   => 213,
        'otilde'   => 245,
        'otimes'   => 8855,
        'Ouml'     => 214,
        'ouml'     => 246,
        'para'     => 182,
        'part'     => 8706,
        'permil'   => 8240,
        'perp'     => 8869,
        'Phi'      => 934,
        'phi'      => 966,
        'Pi'       => 928,
        'pi'       => 960,
        'piv'      => 982,
        'plusmn'   => 177,
        'pound'    => 163,
        'prime'    => 8242,
        'Prime'    => 8243,
        'prod'     => 8719,
        'prop'     => 8733,
        'Psi'      => 936,
        'psi'      => 968,
        'quot'     => 34,
        'radic'    => 8730,
        'rang'     => 9002,
        'raquo'    => 187,
        'rarr'     => 8594,
        'rArr'     => 8658,
        'rceil'    => 8969,
        'rdquo'    => 8221,
        'real'     => 8476,
        'reg'      => 174,
        'rfloor'   => 8971,
        'Rho'      => 929,
        'rho'      => 961,
        'rlm'      => 8207,
        'rsaquo'   => 8250,
        'rsquo'    => 8217,
        'sbquo'    => 8218,
        'Scaron'   => 352,
        'scaron'   => 353,
        'sdot'     => 8901,
        'sect'     => 167,
        'shy'      => 173,
        'Sigma'    => 931,
        'sigma'    => 963,
        'sigmaf'   => 962,
        'sim'      => 8764,
        'spades'   => 9824,
        'sub'      => 8834,
        'sube'     => 8838,
        'sum'      => 8721,
        'sup'      => 8835,
        'sup1'     => 185,
        'sup2'     => 178,
        'sup3'     => 179,
        'supe'     => 8839,
        'szlig'    => 223,
        'Tau'      => 932,
        'tau'      => 964,
        'there4'   => 8756,
        'Theta'    => 920,
        'theta'    => 952,
        'thetasym' => 977,
        'thinsp'   => 8201,
        'THORN'    => 222,
        'thorn'    => 254,
        'tilde'    => 732,
        'times'    => 215,
        'trade'    => 8482,
        'Uacute'   => 218,
        'uacute'   => 250,
        'uarr'     => 8593,
        'uArr'     => 8657,
        'Ucirc'    => 219,
        'ucirc'    => 251,
        'Ugrave'   => 217,
        'ugrave'   => 249,
        'uml'      => 168,
        'upsih'    => 978,
        'Upsilon'  => 933,
        'upsilon'  => 965,
        'Uuml'     => 220,
        'uuml'     => 252,
        'weierp'   => 8472,
        'Xi'       => 926,
        'xi'       => 958,
        'Yacute'   => 221,
        'yacute'   => 253,
        'yen'      => 165,
        'Yuml'     => 376,
        'yuml'     => 255,
        'Zeta'     => 918,
        'zeta'     => 950,
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 317
 318 class Sanitizer {
 319         /**
 320          * Cleans up HTML, removes dangerous tags and attributes, and
 321          * removes HTML comments
 322          * @private
 323          * @param string $text
 324          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 325          * @param array $args for the processing callback
 326          * @return string
 327          */
 328         static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 329                 global $wgUseTidy, $wgUserHtml;
 330
 331                 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 332                         $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
 333
 334                 wfProfileIn( __METHOD__ );
 335
 336                 if ( !$staticInitialised ) {
 337                         if( $wgUserHtml ) {
 338                                 $htmlpairs = array( # Tags that must be closed
 339                                         'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 340                                         'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 341                                         'strike', 'strong', 'tt', 'var', 'div', 'center',
 342                                         'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 343                                         'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 344                                 );
 345                                 $htmlsingle = array(
 346                                         'br', 'hr', 'li', 'dt', 'dd'
 347                                 );
 348                                 $htmlsingleonly = array( # Elements that cannot have close tags
 349                                         'br', 'hr'
 350                                 );
 351                                 $htmlnest = array( # Tags that can be nested--??
 352                                         'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 353                                         'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 354                                 );
 355                                 $tabletags = array( # Can only appear inside table, we will close them
 356                                         'td', 'th', 'tr',
 357                                 );
 358                                 $htmllist = array( # Tags used by list
 359                                         'ul','ol',
 360                                 );
 361                                 $listtags = array( # Tags that can appear in a list
 362                                         'li',
 363                                 );
 364
 365                         } else {
 366                                 $htmlpairs = array();
 367                                 $htmlsingle = array();
 368                                 $htmlnest = array();
 369                                 $tabletags = array();
 370                         }
 371
 372                         $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 373                         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 374
 375                         # Convert them all to hashtables for faster lookup
 376                         $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 377                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
 378                         foreach ( $vars as $var ) {
 379                                 $$var = array_flip( $$var );
 380                         }
 381                         $staticInitialised = true;
 382                 }
 383
 384                 # Remove HTML comments
 385                 $text = Sanitizer::removeHTMLcomments( $text );
 386                 $bits = explode( '<', $text );
 387                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 388                 if(!$wgUseTidy) {
 389                         $tagstack = $tablestack = array();
 390                         foreach ( $bits as $x ) {
 391                                 $regs = array();
 392                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 393                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 394                                 } else {
 395                                         $slash = $t = $params = $brace = $rest = null;
 396                                 }
 397
 398                                 $badtag = 0 ;
 399                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 400                                         # Check our stack
 401                                         if ( $slash ) {
 402                                                 # Closing a tag...
 403                                                 if( isset( $htmlsingleonly[$t] ) ) {
 404                                                         $badtag = 1;
 405                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 406                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 407                                                                 # Pop all elements with an optional close tag
 408                                                                 # and see if we find a match below them
 409                                                                 $optstack = array();
 410                                                                 array_push ($optstack, $ot);
 411                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 412                                                                                 isset( $htmlsingleallowed[$ot] ) )
 413                                                                 {
 414                                                                         array_push ($optstack, $ot);
 415                                                                 }
 416                                                                 if ( $t != $ot ) {
 417                                                                         # No match. Push the optinal elements back again
 418                                                                         $badtag = 1;
 419                                                                         while ( $ot = @array_pop( $optstack ) ) {
 420                                                                                 array_push( $tagstack, $ot );
 421                                                                         }
 422                                                                 }
 423                                                         } else {
 424                                                                 @array_push( $tagstack, $ot );
 425                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 426                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 427                                                                         $badtag = 1;
 428                                                                 }
 429                                                         }
 430                                                 } else {
 431                                                         if ( $t == 'table' ) {
 432                                                                 $tagstack = array_pop( $tablestack );
 433                                                         }
 434                                                 }
 435                                                 $newparams = '';
 436                                         } else {
 437                                                 # Keep track for later
 438                                                 if ( isset( $tabletags[$t] ) &&
 439                                                 ! in_array( 'table', $tagstack ) ) {
 440                                                         $badtag = 1;
 441                                                 } else if ( in_array( $t, $tagstack ) &&
 442                                                 ! isset( $htmlnest [$t ] ) ) {
 443                                                         $badtag = 1 ;
 444                                                 # Is it a self closed htmlpair ? (bug 5487)
 445                                                 } else if( $brace == '/>' &&
 446                                                 isset( $htmlpairs[$t] ) ) {
 447                                                         $badtag = 1;
 448                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 449                                                         # Hack to force empty tag for uncloseable elements
 450                                                         $brace = '/>';
 451                                                 } else if( isset( $htmlsingle[$t] ) ) {
 452                                                         # Hack to not close $htmlsingle tags
 453                                                         $brace = NULL;
 454                                                 } else if( isset( $tabletags[$t] )
 455                                                 &&  in_array($t ,$tagstack) ) {
 456                                                         // New table tag but forgot to close the previous one
 457                                                         $text .= "</$t>";
 458                                                 } else {
 459                                                         if ( $t == 'table' ) {
 460                                                                 array_push( $tablestack, $tagstack );
 461                                                                 $tagstack = array();
 462                                                         }
 463                                                         array_push( $tagstack, $t );
 464                                                 }
 465
 466                                                 # Replace any variables or template parameters with
 467                                                 # plaintext results.
 468                                                 if( is_callable( $processCallback ) ) {
 469                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 470                                                 }
 471
 472                                                 # Strip non-approved attributes from the tag
 473                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 474                                         }
 475                                         if ( ! $badtag ) {
 476                                                 $rest = str_replace( '>', '&gt;', $rest );
 477                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 478                                                 $text .= "<$slash$t$newparams$close>$rest";
 479                                                 continue;
 480                                         }
 481                                 }
 482                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 483                         }
 484                         # Close off any remaining tags
 485                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 486                                 $text .= "</$t>\n";
 487                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 488                         }
 489                 } else {
 490                         # this might be possible using tidy itself
 491                         foreach ( $bits as $x ) {
 492                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 493                                 $x, $regs );
 494                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 495                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 496                                         if( is_callable( $processCallback ) ) {
 497                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 498                                         }
 499                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 500                                         $rest = str_replace( '>', '&gt;', $rest );
 501                                         $text .= "<$slash$t$newparams$brace$rest";
 502                                 } else {
 503                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 504                                 }
 505                         }
 506                 }
 507                 wfProfileOut( __METHOD__ );
 508                 return $text;
 509         }
 510
 511         /**
 512          * Remove '<!--', '-->', and everything between.
 513          * To avoid leaving blank lines, when a comment is both preceded
 514          * and followed by a newline (ignoring spaces), trim leading and
 515          * trailing spaces and one of the newlines.
 516          *
 517          * @private
 518          * @param string $text
 519          * @return string
 520          */
 521         static function removeHTMLcomments( $text ) {
 522                 wfProfileIn( __METHOD__ );
 523                 while (($start = strpos($text, '<!--')) !== false) {
 524                         $end = strpos($text, '-->', $start + 4);
 525                         if ($end === false) {
 526                                 # Unterminated comment; bail out
 527                                 break;
 528                         }
 529
 530                         $end += 3;
 531
 532                         # Trim space and newline if the comment is both
 533                         # preceded and followed by a newline
 534                         $spaceStart = max($start - 1, 0);
 535                         $spaceLen = $end - $spaceStart;
 536                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 537                                 $spaceStart--;
 538                                 $spaceLen++;
 539                         }
 540                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 541                                 $spaceLen++;
 542                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 543                                 # Remove the comment, leading and trailing
 544                                 # spaces, and leave only one newline.
 545                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 546                         }
 547                         else {
 548                                 # Remove just the comment.
 549                                 $text = substr_replace($text, '', $start, $end - $start);
 550                         }
 551                 }
 552                 wfProfileOut( __METHOD__ );
 553                 return $text;
 554         }
 555
 556         /**
 557          * Take an array of attribute names and values and normalize or discard
 558          * illegal values for the given element type.
 559          *
 560          * - Discards attributes not on a whitelist for the given element
 561          * - Unsafe style attributes are discarded
 562          *
 563          * @param array $attribs
 564          * @param string $element
 565          * @return array
 566          *
 567          * @todo Check for legal values where the DTD limits things.
 568          * @todo Check for unique id attribute :P
 569          */
 570         static function validateTagAttributes( $attribs, $element ) {
 571                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 572                 $out = array();
 573                 foreach( $attribs as $attribute => $value ) {
 574                         if( !isset( $whitelist[$attribute] ) ) {
 575                                 continue;
 576                         }
 577                         # Strip javascript "expression" from stylesheets.
 578                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 579                         if( $attribute == 'style' ) {
 580                                 $value = Sanitizer::checkCss( $value );
 581                                 if( $value === false ) {
 582                                         # haxx0r
 583                                         continue;
 584                                 }
 585                         }
 586
 587                         if ( $attribute === 'id' )
 588                                 $value = Sanitizer::escapeId( $value );
 589
 590                         // If this attribute was previously set, override it.
 591                         // Output should only have one attribute of each name.
 592                         $out[$attribute] = $value;
 593                 }
 594                 return $out;
 595         }
 596
 597         /**
 598          * Pick apart some CSS and check it for forbidden or unsafe structures.
 599          * Returns a sanitized string, or false if it was just too evil.
 600          *
 601          * Currently URL references, 'expression', 'tps' are forbidden.
 602          *
 603          * @param string $value
 604          * @return mixed
 605          */
 606         static function checkCss( $value ) {
 607                 $stripped = Sanitizer::decodeCharReferences( $value );
 608
 609                 // Remove any comments; IE gets token splitting wrong
 610                 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
 611
 612                 $value = $stripped;
 613
 614                 // ... and continue checks
 615                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 616                         'codepointToUtf8(hexdec("$1"))', $stripped );
 617                 $stripped = str_replace( '\\', '', $stripped );
 618                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 619                                 $stripped ) ) {
 620                         # haxx0r
 621                         return false;
 622                 }
 623
 624                 return $value;
 625         }
 626
 627         /**
 628          * Take a tag soup fragment listing an HTML element's attributes
 629          * and normalize it to well-formed XML, discarding unwanted attributes.
 630          * Output is safe for further wikitext processing, with escaping of
 631          * values that could trigger problems.
 632          *
 633          * - Normalizes attribute names to lowercase
 634          * - Discards attributes not on a whitelist for the given element
 635          * - Turns broken or invalid entities into plaintext
 636          * - Double-quotes all attribute values
 637          * - Attributes without values are given the name as attribute
 638          * - Double attributes are discarded
 639          * - Unsafe style attributes are discarded
 640          * - Prepends space if there are attributes.
 641          *
 642          * @param string $text
 643          * @param string $element
 644          * @return string
 645          */
 646         static function fixTagAttributes( $text, $element ) {
 647                 if( trim( $text ) == '' ) {
 648                         return '';
 649                 }
 650
 651                 $stripped = Sanitizer::validateTagAttributes(
 652                         Sanitizer::decodeTagAttributes( $text ), $element );
 653
 654                 $attribs = array();
 655                 foreach( $stripped as $attribute => $value ) {
 656                         $encAttribute = htmlspecialchars( $attribute );
 657                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 658
 659                         $attribs[] = "$encAttribute=\"$encValue\"";
 660                 }
 661                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 662         }
 663
 664         /**
 665          * Encode an attribute value for HTML output.
 666          * @param $text
 667          * @return HTML-encoded text fragment
 668          */
 669         static function encodeAttribute( $text ) {
 670                 $encValue = htmlspecialchars( $text );
 671
 672                 // Whitespace is normalized during attribute decoding,
 673                 // so if we've been passed non-spaces we must encode them
 674                 // ahead of time or they won't be preserved.
 675                 $encValue = strtr( $encValue, array(
 676                         "\n" => '&#10;',
 677                         "\r" => '&#13;',
 678                         "\t" => '&#9;',
 679                 ) );
 680
 681                 return $encValue;
 682         }
 683
 684         /**
 685          * Encode an attribute value for HTML tags, with extra armoring
 686          * against further wiki processing.
 687          * @param $text
 688          * @return HTML-encoded text fragment
 689          */
 690         static function safeEncodeAttribute( $text ) {
 691                 $encValue = Sanitizer::encodeAttribute( $text );
 692
 693                 # Templates and links may be expanded in later parsing,
 694                 # creating invalid or dangerous output. Suppress this.
 695                 $encValue = strtr( $encValue, array(
 696                         '<'    => '&lt;',   // This should never happen,
 697                         '>'    => '&gt;',   // we've received invalid input
 698                         '"'    => '&quot;', // which should have been escaped.
 699                         '{'    => '&#123;',
 700                         '['    => '&#91;',
 701                         "''"   => '&#39;&#39;',
 702                         'ISBN' => '&#73;SBN',
 703                         'RFC'  => '&#82;FC',
 704                         'PMID' => '&#80;MID',
 705                         '|'    => '&#124;',
 706                         '__'   => '&#95;_',
 707                 ) );
 708
 709                 # Stupid hack
 710                 $encValue = preg_replace_callback(
 711                         '/(' . wfUrlProtocols() . ')/',
 712                         array( 'Sanitizer', 'armorLinksCallback' ),
 713                         $encValue );
 714                 return $encValue;
 715         }
 716
 717         /**
 718          * Given a value escape it so that it can be used in an id attribute and
 719          * return it, this does not validate the value however (see first link)
 720          *
 721          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 722          *                                                          in the id and
 723          *                                                          name attributes
 724          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 725          *
 726          * @bug 4461
 727          *
 728          * @static
 729          *
 730          * @param string $id
 731          * @return string
 732          */
 733         static function escapeId( $id ) {
 734                 static $replace = array(
 735                         '%3A' => ':',
 736                         '%' => '.'
 737                 );
 738
 739                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 740
 741                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 742         }
 743
 744         /**
 745          * Given a value, escape it so that it can be used as a CSS class and
 746          * return it.
 747          *
 748          * @todo For extra validity, input should be validated UTF-8.
 749          *
 750          * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 751          *
 752          * @param string $class
 753          * @return string
 754          */
 755         static function escapeClass( $class ) {
 756                 // Convert ugly stuff to underscores and kill underscores in ugly places
 757                 return rtrim(preg_replace(
 758                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 759                         '_',
 760                         $class ), '_');
 761         }
 762
 763         /**
 764          * Regex replace callback for armoring links against further processing.
 765          * @param array $matches
 766          * @return string
 767          * @private
 768          */
 769         private static function armorLinksCallback( $matches ) {
 770                 return str_replace( ':', '&#58;', $matches[1] );
 771         }
 772
 773         /**
 774          * Return an associative array of attribute names and values from
 775          * a partial tag string. Attribute names are forces to lowercase,
 776          * character references are decoded to UTF-8 text.
 777          *
 778          * @param string
 779          * @return array
 780          */
 781         static function decodeTagAttributes( $text ) {
 782                 $attribs = array();
 783
 784                 if( trim( $text ) == '' ) {
 785                         return $attribs;
 786                 }
 787
 788                 $pairs = array();
 789                 if( !preg_match_all(
 790                         MW_ATTRIBS_REGEX,
 791                         $text,
 792                         $pairs,
 793                         PREG_SET_ORDER ) ) {
 794                         return $attribs;
 795                 }
 796
 797                 foreach( $pairs as $set ) {
 798                         $attribute = strtolower( $set[1] );
 799                         $value = Sanitizer::getTagAttributeCallback( $set );
 800
 801                         // Normalize whitespace
 802                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 803                         $value = trim( $value );
 804
 805                         // Decode character references
 806                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 807                 }
 808                 return $attribs;
 809         }
 810
 811         /**
 812          * Pick the appropriate attribute value from a match set from the
 813          * MW_ATTRIBS_REGEX matches.
 814          *
 815          * @param array $set
 816          * @return string
 817          * @private
 818          */
 819         private static function getTagAttributeCallback( $set ) {
 820                 if( isset( $set[6] ) ) {
 821                         # Illegal #XXXXXX color with no quotes.
 822                         return $set[6];
 823                 } elseif( isset( $set[5] ) ) {
 824                         # No quotes.
 825                         return $set[5];
 826                 } elseif( isset( $set[4] ) ) {
 827                         # Single-quoted
 828                         return $set[4];
 829                 } elseif( isset( $set[3] ) ) {
 830                         # Double-quoted
 831                         return $set[3];
 832                 } elseif( !isset( $set[2] ) ) {
 833                         # In XHTML, attributes must have a value.
 834                         # For 'reduced' form, return explicitly the attribute name here.
 835                         return $set[1];
 836                 } else {
 837                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 838                 }
 839         }
 840
 841         /**
 842          * Normalize whitespace and character references in an XML source-
 843          * encoded text for an attribute value.
 844          *
 845          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 846          * but note that we're not returning the value, but are returning
 847          * XML source fragments that will be slapped into output.
 848          *
 849          * @param string $text
 850          * @return string
 851          * @private
 852          */
 853         private static function normalizeAttributeValue( $text ) {
 854                 return str_replace( '"', '&quot;',
 855                         preg_replace(
 856                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 857                                 ' ',
 858                                 Sanitizer::normalizeCharReferences( $text ) ) );
 859         }
 860
 861         /**
 862          * Ensure that any entities and character references are legal
 863          * for XML and XHTML specifically. Any stray bits will be
 864          * &amp;-escaped to result in a valid text fragment.
 865          *
 866          * a. any named char refs must be known in XHTML
 867          * b. any numeric char refs must be legal chars, not invalid or forbidden
 868          * c. use &#x, not &#X
 869          * d. fix or reject non-valid attributes
 870          *
 871          * @param string $text
 872          * @return string
 873          * @private
 874          */
 875         static function normalizeCharReferences( $text ) {
 876                 return preg_replace_callback(
 877                         MW_CHAR_REFS_REGEX,
 878                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 879                         $text );
 880         }
 881         /**
 882          * @param string $matches
 883          * @return string
 884          */
 885         static function normalizeCharReferencesCallback( $matches ) {
 886                 $ret = null;
 887                 if( $matches[1] != '' ) {
 888                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 889                 } elseif( $matches[2] != '' ) {
 890                         $ret = Sanitizer::decCharReference( $matches[2] );
 891                 } elseif( $matches[3] != ''  ) {
 892                         $ret = Sanitizer::hexCharReference( $matches[3] );
 893                 } elseif( $matches[4] != '' ) {
 894                         $ret = Sanitizer::hexCharReference( $matches[4] );
 895                 }
 896                 if( is_null( $ret ) ) {
 897                         return htmlspecialchars( $matches[0] );
 898                 } else {
 899                         return $ret;
 900                 }
 901         }
 902
 903         /**
 904          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 905          * return the named entity reference as is. Otherwise, returns
 906          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 907          *
 908          * @param string $name
 909          * @return string
 910          * @static
 911          */
 912         static function normalizeEntity( $name ) {
 913                 global $wgHtmlEntities;
 914                 if( isset( $wgHtmlEntities[$name] ) ) {
 915                         return "&$name;";
 916                 } else {
 917                         return "&amp;$name;";
 918                 }
 919         }
 920
 921         static function decCharReference( $codepoint ) {
 922                 $point = intval( $codepoint );
 923                 if( Sanitizer::validateCodepoint( $point ) ) {
 924                         return sprintf( '&#%d;', $point );
 925                 } else {
 926                         return null;
 927                 }
 928         }
 929
 930         static function hexCharReference( $codepoint ) {
 931                 $point = hexdec( $codepoint );
 932                 if( Sanitizer::validateCodepoint( $point ) ) {
 933                         return sprintf( '&#x%x;', $point );
 934                 } else {
 935                         return null;
 936                 }
 937         }
 938
 939         /**
 940          * Returns true if a given Unicode codepoint is a valid character in XML.
 941          * @param int $codepoint
 942          * @return bool
 943          */
 944         private static function validateCodepoint( $codepoint ) {
 945                 return ($codepoint ==    0x09)
 946                         || ($codepoint ==    0x0a)
 947                         || ($codepoint ==    0x0d)
 948                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 949                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 950                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 951         }
 952
 953         /**
 954          * Decode any character references, numeric or named entities,
 955          * in the text and return a UTF-8 string.
 956          *
 957          * @param string $text
 958          * @return string
 959          * @public
 960          * @static
 961          */
 962         public static function decodeCharReferences( $text ) {
 963                 return preg_replace_callback(
 964                         MW_CHAR_REFS_REGEX,
 965                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 966                         $text );
 967         }
 968
 969         /**
 970          * @param string $matches
 971          * @return string
 972          */
 973         static function decodeCharReferencesCallback( $matches ) {
 974                 if( $matches[1] != '' ) {
 975                         return Sanitizer::decodeEntity( $matches[1] );
 976                 } elseif( $matches[2] != '' ) {
 977                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 978                 } elseif( $matches[3] != ''  ) {
 979                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 980                 } elseif( $matches[4] != '' ) {
 981                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 982                 }
 983                 # Last case should be an ampersand by itself
 984                 return $matches[0];
 985         }
 986
 987         /**
 988          * Return UTF-8 string for a codepoint if that is a valid
 989          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 990          * @param int $codepoint
 991          * @return string
 992          * @private
 993          */
 994         static function decodeChar( $codepoint ) {
 995                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 996                         return codepointToUtf8( $codepoint );
 997                 } else {
 998                         return UTF8_REPLACEMENT;
 999                 }
1000         }
1001
1002         /**
1003          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1004          * return the UTF-8 encoding of that character. Otherwise, returns
1005          * pseudo-entity source (eg &foo;)
1006          *
1007          * @param string $name
1008          * @return string
1009          */
1010         static function decodeEntity( $name ) {
1011                 global $wgHtmlEntities;
1012                 if( isset( $wgHtmlEntities[$name] ) ) {
1013                         return codepointToUtf8( $wgHtmlEntities[$name] );
1014                 } else {
1015                         return "&$name;";
1016                 }
1017         }
1018
1019         /**
1020          * Fetch the whitelist of acceptable attributes for a given
1021          * element name.
1022          *
1023          * @param string $element
1024          * @return array
1025          */
1026         static function attributeWhitelist( $element ) {
1027                 static $list;
1028                 if( !isset( $list ) ) {
1029                         $list = Sanitizer::setupAttributeWhitelist();
1030                 }
1031                 return isset( $list[$element] )
1032                         ? $list[$element]
1033                         : array();
1034         }
1035
1036         /**
1037          * @todo Document it a bit
1038          * @return array
1039          */
1040         static function setupAttributeWhitelist() {
1041                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
1042                 $block = array_merge( $common, array( 'align' ) );
1043                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1044                 $tablecell = array( 'abbr',
1045                                     'axis',
1046                                     'headers',
1047                                     'scope',
1048                                     'rowspan',
1049                                     'colspan',
1050                                     'nowrap', # deprecated
1051                                     'width',  # deprecated
1052                                     'height', # deprecated
1053                                     'bgcolor' # deprecated
1054                                     );
1055
1056                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1057                 # See: http://www.w3.org/TR/html4/
1058                 $whitelist = array (
1059                         # 7.5.4
1060                         'div'        => $block,
1061                         'center'     => $common, # deprecated
1062                         'span'       => $block, # ??
1063
1064                         # 7.5.5
1065                         'h1'         => $block,
1066                         'h2'         => $block,
1067                         'h3'         => $block,
1068                         'h4'         => $block,
1069                         'h5'         => $block,
1070                         'h6'         => $block,
1071
1072                         # 7.5.6
1073                         # address
1074
1075                         # 8.2.4
1076                         # bdo
1077
1078                         # 9.2.1
1079                         'em'         => $common,
1080                         'strong'     => $common,
1081                         'cite'       => $common,
1082                         # dfn
1083                         'code'       => $common,
1084                         # samp
1085                         # kbd
1086                         'var'        => $common,
1087                         # abbr
1088                         # acronym
1089
1090                         # 9.2.2
1091                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1092                         # q
1093
1094                         # 9.2.3
1095                         'sub'        => $common,
1096                         'sup'        => $common,
1097
1098                         # 9.3.1
1099                         'p'          => $block,
1100
1101                         # 9.3.2
1102                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1103
1104                         # 9.3.4
1105                         'pre'        => array_merge( $common, array( 'width' ) ),
1106
1107                         # 9.4
1108                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1109                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1110
1111                         # 10.2
1112                         'ul'         => array_merge( $common, array( 'type' ) ),
1113                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1114                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1115
1116                         # 10.3
1117                         'dl'         => $common,
1118                         'dd'         => $common,
1119                         'dt'         => $common,
1120
1121                         # 11.2.1
1122                         'table'      => array_merge( $common,
1123                                                                 array( 'summary', 'width', 'border', 'frame',
1124                                                                                 'rules', 'cellspacing', 'cellpadding',
1125                                                                                 'align', 'bgcolor',
1126                                                                 ) ),
1127
1128                         # 11.2.2
1129                         'caption'    => array_merge( $common, array( 'align' ) ),
1130
1131                         # 11.2.3
1132                         'thead'      => array_merge( $common, $tablealign ),
1133                         'tfoot'      => array_merge( $common, $tablealign ),
1134                         'tbody'      => array_merge( $common, $tablealign ),
1135
1136                         # 11.2.4
1137                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1138                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1139
1140                         # 11.2.5
1141                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1142
1143                         # 11.2.6
1144                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1145                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1146
1147                         # 15.2.1
1148                         'tt'         => $common,
1149                         'b'          => $common,
1150                         'i'          => $common,
1151                         'big'        => $common,
1152                         'small'      => $common,
1153                         'strike'     => $common,
1154                         's'          => $common,
1155                         'u'          => $common,
1156
1157                         # 15.2.2
1158                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1159                         # basefont
1160
1161                         # 15.3
1162                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1163
1164                         # XHTML Ruby annotation text module, simple ruby only.
1165                         # http://www.w3c.org/TR/ruby/
1166                         'ruby'       => $common,
1167                         # rbc
1168                         # rtc
1169                         'rb'         => $common,
1170                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1171                         'rp'         => $common,
1172                         );
1173                 return $whitelist;
1174         }
1175
1176         /**
1177          * Take a fragment of (potentially invalid) HTML and return
1178          * a version with any tags removed, encoded suitably for literal
1179          * inclusion in an attribute value.
1180          *
1181          * @param string $text HTML fragment
1182          * @return string
1183          */
1184         static function stripAllTags( $text ) {
1185                 # Actual <tags>
1186                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1187
1188                 # Normalize &entities and whitespace
1189                 $text = Sanitizer::normalizeAttributeValue( $text );
1190
1191                 # Will be placed into "double-quoted" attributes,
1192                 # make sure remaining bits are safe.
1193                 $text = str_replace(
1194                         array('<', '>', '"'),
1195                         array('&lt;', '&gt;', '&quot;'),
1196                         $text );
1197
1198                 return $text;
1199         }
1200
1201         /**
1202          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1203          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1204          * PHP 5.1 doesn't.
1205          *
1206          * Use for passing XHTML fragments to PHP's XML parsing functions
1207          *
1208          * @return string
1209          * @static
1210          */
1211         static function hackDocType() {
1212                 global $wgHtmlEntities;
1213                 $out = "<!DOCTYPE html [\n";
1214                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1215                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1216                 }
1217                 $out .= "]>\n";
1218                 return $out;
1219         }
1220
1221         static function cleanUrl( $url, $hostname=true ) {
1222                 # Normalize any HTML entities in input. They will be
1223                 # re-escaped by makeExternalLink().
1224                 $url = Sanitizer::decodeCharReferences( $url );
1225
1226                 # Escape any control characters introduced by the above step
1227                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1228
1229                 # Validate hostname portion
1230                 $matches = array();
1231                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1232                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
1233
1234                         // Characters that will be ignored in IDNs.
1235                         // http://tools.ietf.org/html/3454#section-3.1
1236                         // Strip them before further processing so blacklists and such work.
1237                         $strip = "/
1238                                 \\s|          # general whitespace
1239                                 \xc2\xad|     # 00ad SOFT HYPHEN
1240                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1241                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1242                                 \xe2\x81\xa0| # 2060 WORD JOINER
1243                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1244                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1245                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1246                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1247                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1248                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1249                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1250                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1251                                 /xuD";
1252
1253                         $host = preg_replace( $strip, '', $host );
1254
1255                         // @fixme: validate hostnames here
1256
1257                         return $protocol . $host . $rest;
1258                 } else {
1259                         return $url;
1260                 }
1261         }
1262
1263 }
1264
1265 ?>