includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
// Character class for a single attribute-name character: ASCII letters and digits.
$attrib = '[A-Za-z0-9]';
// Character class for the whitespace that separates attributes:
// tab, line feed, carriage return and space.
$space = '[\x09\x0a\x0d\x20]';
// The pattern is a double-quoted string, so $attrib and $space are
// interpolated into it. It is compiled with the "s" and "x" modifiers.
// Capture groups, in order:
//   1 - attribute name
//   2 - the whole "= value" part, when present
//   3 - value given in double quotes
//   4 - value given in single quotes
//   5 - unquoted value
//   6 - "#"-prefixed hexadecimal value
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @private
  63  */
  64 global $wgHtmlEntities;
  65 $wgHtmlEntities = array(
  66         'Aacute'   => 193,
  67         'aacute'   => 225,
  68         'Acirc'    => 194,
  69         'acirc'    => 226,
  70         'acute'    => 180,
  71         'AElig'    => 198,
  72         'aelig'    => 230,
  73         'Agrave'   => 192,
  74         'agrave'   => 224,
  75         'alefsym'  => 8501,
  76         'Alpha'    => 913,
  77         'alpha'    => 945,
  78         'amp'      => 38,
  79         'and'      => 8743,
  80         'ang'      => 8736,
  81         'Aring'    => 197,
  82         'aring'    => 229,
  83         'asymp'    => 8776,
  84         'Atilde'   => 195,
  85         'atilde'   => 227,
  86         'Auml'     => 196,
  87         'auml'     => 228,
  88         'bdquo'    => 8222,
  89         'Beta'     => 914,
  90         'beta'     => 946,
  91         'brvbar'   => 166,
  92         'bull'     => 8226,
  93         'cap'      => 8745,
  94         'Ccedil'   => 199,
  95         'ccedil'   => 231,
  96         'cedil'    => 184,
  97         'cent'     => 162,
  98         'Chi'      => 935,
  99         'chi'      => 967,
 100         'circ'     => 710,
 101         'clubs'    => 9827,
 102         'cong'     => 8773,
 103         'copy'     => 169,
 104         'crarr'    => 8629,
 105         'cup'      => 8746,
 106         'curren'   => 164,
 107         'dagger'   => 8224,
 108         'Dagger'   => 8225,
 109         'darr'     => 8595,
 110         'dArr'     => 8659,
 111         'deg'      => 176,
 112         'Delta'    => 916,
 113         'delta'    => 948,
 114         'diams'    => 9830,
 115         'divide'   => 247,
 116         'Eacute'   => 201,
 117         'eacute'   => 233,
 118         'Ecirc'    => 202,
 119         'ecirc'    => 234,
 120         'Egrave'   => 200,
 121         'egrave'   => 232,
 122         'empty'    => 8709,
 123         'emsp'     => 8195,
 124         'ensp'     => 8194,
 125         'Epsilon'  => 917,
 126         'epsilon'  => 949,
 127         'equiv'    => 8801,
 128         'Eta'      => 919,
 129         'eta'      => 951,
 130         'ETH'      => 208,
 131         'eth'      => 240,
 132         'Euml'     => 203,
 133         'euml'     => 235,
 134         'euro'     => 8364,
 135         'exist'    => 8707,
 136         'fnof'     => 402,
 137         'forall'   => 8704,
 138         'frac12'   => 189,
 139         'frac14'   => 188,
 140         'frac34'   => 190,
 141         'frasl'    => 8260,
 142         'Gamma'    => 915,
 143         'gamma'    => 947,
 144         'ge'       => 8805,
 145         'gt'       => 62,
 146         'harr'     => 8596,
 147         'hArr'     => 8660,
 148         'hearts'   => 9829,
 149         'hellip'   => 8230,
 150         'Iacute'   => 205,
 151         'iacute'   => 237,
 152         'Icirc'    => 206,
 153         'icirc'    => 238,
 154         'iexcl'    => 161,
 155         'Igrave'   => 204,
 156         'igrave'   => 236,
 157         'image'    => 8465,
 158         'infin'    => 8734,
 159         'int'      => 8747,
 160         'Iota'     => 921,
 161         'iota'     => 953,
 162         'iquest'   => 191,
 163         'isin'     => 8712,
 164         'Iuml'     => 207,
 165         'iuml'     => 239,
 166         'Kappa'    => 922,
 167         'kappa'    => 954,
 168         'Lambda'   => 923,
 169         'lambda'   => 955,
 170         'lang'     => 9001,
 171         'laquo'    => 171,
 172         'larr'     => 8592,
 173         'lArr'     => 8656,
 174         'lceil'    => 8968,
 175         'ldquo'    => 8220,
 176         'le'       => 8804,
 177         'lfloor'   => 8970,
 178         'lowast'   => 8727,
 179         'loz'      => 9674,
 180         'lrm'      => 8206,
 181         'lsaquo'   => 8249,
 182         'lsquo'    => 8216,
 183         'lt'       => 60,
 184         'macr'     => 175,
 185         'mdash'    => 8212,
 186         'micro'    => 181,
 187         'middot'   => 183,
 188         'minus'    => 8722,
 189         'Mu'       => 924,
 190         'mu'       => 956,
 191         'nabla'    => 8711,
 192         'nbsp'     => 160,
 193         'ndash'    => 8211,
 194         'ne'       => 8800,
 195         'ni'       => 8715,
 196         'not'      => 172,
 197         'notin'    => 8713,
 198         'nsub'     => 8836,
 199         'Ntilde'   => 209,
 200         'ntilde'   => 241,
 201         'Nu'       => 925,
 202         'nu'       => 957,
 203         'Oacute'   => 211,
 204         'oacute'   => 243,
 205         'Ocirc'    => 212,
 206         'ocirc'    => 244,
 207         'OElig'    => 338,
 208         'oelig'    => 339,
 209         'Ograve'   => 210,
 210         'ograve'   => 242,
 211         'oline'    => 8254,
 212         'Omega'    => 937,
 213         'omega'    => 969,
 214         'Omicron'  => 927,
 215         'omicron'  => 959,
 216         'oplus'    => 8853,
 217         'or'       => 8744,
 218         'ordf'     => 170,
 219         'ordm'     => 186,
 220         'Oslash'   => 216,
 221         'oslash'   => 248,
 222         'Otilde'   => 213,
 223         'otilde'   => 245,
 224         'otimes'   => 8855,
 225         'Ouml'     => 214,
 226         'ouml'     => 246,
 227         'para'     => 182,
 228         'part'     => 8706,
 229         'permil'   => 8240,
 230         'perp'     => 8869,
 231         'Phi'      => 934,
 232         'phi'      => 966,
 233         'Pi'       => 928,
 234         'pi'       => 960,
 235         'piv'      => 982,
 236         'plusmn'   => 177,
 237         'pound'    => 163,
 238         'prime'    => 8242,
 239         'Prime'    => 8243,
 240         'prod'     => 8719,
 241         'prop'     => 8733,
 242         'Psi'      => 936,
 243         'psi'      => 968,
 244         'quot'     => 34,
 245         'radic'    => 8730,
 246         'rang'     => 9002,
 247         'raquo'    => 187,
 248         'rarr'     => 8594,
 249         'rArr'     => 8658,
 250         'rceil'    => 8969,
 251         'rdquo'    => 8221,
 252         'real'     => 8476,
 253         'reg'      => 174,
 254         'rfloor'   => 8971,
 255         'Rho'      => 929,
 256         'rho'      => 961,
 257         'rlm'      => 8207,
 258         'rsaquo'   => 8250,
 259         'rsquo'    => 8217,
 260         'sbquo'    => 8218,
 261         'Scaron'   => 352,
 262         'scaron'   => 353,
 263         'sdot'     => 8901,
 264         'sect'     => 167,
 265         'shy'      => 173,
 266         'Sigma'    => 931,
 267         'sigma'    => 963,
 268         'sigmaf'   => 962,
 269         'sim'      => 8764,
 270         'spades'   => 9824,
 271         'sub'      => 8834,
 272         'sube'     => 8838,
 273         'sum'      => 8721,
 274         'sup'      => 8835,
 275         'sup1'     => 185,
 276         'sup2'     => 178,
 277         'sup3'     => 179,
 278         'supe'     => 8839,
 279         'szlig'    => 223,
 280         'Tau'      => 932,
 281         'tau'      => 964,
 282         'there4'   => 8756,
 283         'Theta'    => 920,
 284         'theta'    => 952,
 285         'thetasym' => 977,
 286         'thinsp'   => 8201,
 287         'THORN'    => 222,
 288         'thorn'    => 254,
 289         'tilde'    => 732,
 290         'times'    => 215,
 291         'trade'    => 8482,
 292         'Uacute'   => 218,
 293         'uacute'   => 250,
 294         'uarr'     => 8593,
 295         'uArr'     => 8657,
 296         'Ucirc'    => 219,
 297         'ucirc'    => 251,
 298         'Ugrave'   => 217,
 299         'ugrave'   => 249,
 300         'uml'      => 168,
 301         'upsih'    => 978,
 302         'Upsilon'  => 933,
 303         'upsilon'  => 965,
 304         'Uuml'     => 220,
 305         'uuml'     => 252,
 306         'weierp'   => 8472,
 307         'Xi'       => 926,
 308         'xi'       => 958,
 309         'Yacute'   => 221,
 310         'yacute'   => 253,
 311         'yen'      => 165,
 312         'Yuml'     => 376,
 313         'yuml'     => 255,
 314         'Zeta'     => 918,
 315         'zeta'     => 950,
 316         'zwj'      => 8205,
 317         'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332
 333                 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 334                         $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
 335
 336                 wfProfileIn( __METHOD__ );
 337
 338                 if ( !$staticInitialised ) {
 339                         if( $wgUserHtml ) {
 340                                 $htmlpairs = array( # Tags that must be closed
 341                                         'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 342                                         'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 343                                         'strike', 'strong', 'tt', 'var', 'div', 'center',
 344                                         'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 345                                         'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 346                                 );
 347                                 $htmlsingle = array(
 348                                         'br', 'hr', 'li', 'dt', 'dd'
 349                                 );
 350                                 $htmlsingleonly = array( # Elements that cannot have close tags
 351                                         'br', 'hr'
 352                                 );
 353                                 $htmlnest = array( # Tags that can be nested--??
 354                                         'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 355                                         'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 356                                 );
 357                                 $tabletags = array( # Can only appear inside table
 358                                         'td', 'th', 'tr',
 359                                 );
 360                                 $htmllist = array( # Tags used by list
 361                                         'ul','ol',
 362                                 );
 363                                 $listtags = array( # Tags that can appear in a list
 364                                         'li',
 365                                 );
 366
 367                         } else {
 368                                 $htmlpairs = array();
 369                                 $htmlsingle = array();
 370                                 $htmlnest = array();
 371                                 $tabletags = array();
 372                         }
 373
 374                         $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 375                         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 376
 377                         # Convert them all to hashtables for faster lookup
 378                         $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 379                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
 380                         foreach ( $vars as $var ) {
 381                                 $$var = array_flip( $$var );
 382                         }
 383                         $staticInitialised = true;
 384                 }
 385
 386                 # Remove HTML comments
 387                 $text = Sanitizer::removeHTMLcomments( $text );
 388                 $bits = explode( '<', $text );
 389                 $text = array_shift( $bits );
 390                 if(!$wgUseTidy) {
 391                         $tagstack = $tablestack = array();
 392                         foreach ( $bits as $x ) {
 393                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 394                                 preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs );
 395                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 396                                 error_reporting( $prev );
 397
 398                                 $badtag = 0 ;
 399                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 400                                         # Check our stack
 401                                         if ( $slash ) {
 402                                                 # Closing a tag...
 403                                                 if( isset( $htmlsingleonly[$t] ) ) {
 404                                                         $badtag = 1;
 405                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 406                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 407                                                                 # Pop all elements with an optional close tag
 408                                                                 # and see if we find a match below them
 409                                                                 $optstack = array();
 410                                                                 array_push ($optstack, $ot);
 411                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 412                                                                                 isset( $htmlsingleallowed[$ot] ) )
 413                                                                 {
 414                                                                         array_push ($optstack, $ot);
 415                                                                 }
 416                                                                 if ( $t != $ot ) {
 417                                                                         # No match. Push the optinal elements back again
 418                                                                         $badtag = 1;
 419                                                                         while ( $ot = @array_pop( $optstack ) ) {
 420                                                                                 array_push( $tagstack, $ot );
 421                                                                         }
 422                                                                 }
 423                                                         } else {
 424                                                                 @array_push( $tagstack, $ot );
 425                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 426                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 427                                                                         $badtag = 1;
 428                                                                 }
 429                                                         }
 430                                                 } else {
 431                                                         if ( $t == 'table' ) {
 432                                                                 $tagstack = array_pop( $tablestack );
 433                                                         }
 434                                                 }
 435                                                 $newparams = '';
 436                                         } else {
 437                                                 # Keep track for later
 438                                                 if ( isset( $tabletags[$t] ) &&
 439                                                 ! in_array( 'table', $tagstack ) ) {
 440                                                         $badtag = 1;
 441                                                 } else if ( in_array( $t, $tagstack ) &&
 442                                                 ! isset( $htmlnest [$t ] ) ) {
 443                                                         $badtag = 1 ;
 444                                                 # Is it a self closed htmlpair ? (bug 5487)
 445                                                 } else if( $brace == '/>' &&
 446                                                 isset( $htmlpairs[$t] ) ) {
 447                                                         $badtag = 1;
 448                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 449                                                         # Hack to force empty tag for uncloseable elements
 450                                                         $brace = '/>';
 451                                                 } else if( isset( $htmlsingle[$t] ) ) {
 452                                                         # Hack to not close $htmlsingle tags
 453                                                         $brace = NULL;
 454                                                 } else {
 455                                                         if ( $t == 'table' ) {
 456                                                                 array_push( $tablestack, $tagstack );
 457                                                                 $tagstack = array();
 458                                                         }
 459                                                         array_push( $tagstack, $t );
 460                                                 }
 461
 462                                                 # Replace any variables or template parameters with
 463                                                 # plaintext results.
 464                                                 if( is_callable( $processCallback ) ) {
 465                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 466                                                 }
 467
 468                                                 # Strip non-approved attributes from the tag
 469                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 470                                         }
 471                                         if ( ! $badtag ) {
 472                                                 $rest = str_replace( '>', '&gt;', $rest );
 473                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 474                                                 $text .= "<$slash$t$newparams$close>$rest";
 475                                                 continue;
 476                                         }
 477                                 }
 478                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 479                         }
 480                         # Close off any remaining tags
 481                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 482                                 $text .= "</$t>\n";
 483                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 484                         }
 485                 } else {
 486                         # this might be possible using tidy itself
 487                         foreach ( $bits as $x ) {
 488                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 489                                 $x, $regs );
 490                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 491                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 492                                         if( is_callable( $processCallback ) ) {
 493                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 494                                         }
 495                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 496                                         $rest = str_replace( '>', '&gt;', $rest );
 497                                         $text .= "<$slash$t$newparams$brace$rest";
 498                                 } else {
 499                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 500                                 }
 501                         }
 502                 }
 503                 wfProfileOut( __METHOD__ );
 504                 return $text;
 505         }
 506
 507         /**
 508          * Remove '<!--', '-->', and everything between.
 509          * To avoid leaving blank lines, when a comment is both preceded
 510          * and followed by a newline (ignoring spaces), trim leading and
 511          * trailing spaces and one of the newlines.
 512          *
 513          * @private
 514          * @param string $text
 515          * @return string
 516          */
 517         static function removeHTMLcomments( $text ) {
 518                 wfProfileIn( __METHOD__ );
 519                 while (($start = strpos($text, '<!--')) !== false) {
 520                         $end = strpos($text, '-->', $start + 4);
 521                         if ($end === false) {
 522                                 # Unterminated comment; bail out
 523                                 break;
 524                         }
 525
 526                         $end += 3;
 527
 528                         # Trim space and newline if the comment is both
 529                         # preceded and followed by a newline
 530                         $spaceStart = max($start - 1, 0);
 531                         $spaceLen = $end - $spaceStart;
 532                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 533                                 $spaceStart--;
 534                                 $spaceLen++;
 535                         }
 536                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 537                                 $spaceLen++;
 538                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 539                                 # Remove the comment, leading and trailing
 540                                 # spaces, and leave only one newline.
 541                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 542                         }
 543                         else {
 544                                 # Remove just the comment.
 545                                 $text = substr_replace($text, '', $start, $end - $start);
 546                         }
 547                 }
 548                 wfProfileOut( __METHOD__ );
 549                 return $text;
 550         }
 551
 552         /**
 553          * Take an array of attribute names and values and normalize or discard
 554          * illegal values for the given element type.
 555          *
 556          * - Discards attributes not on a whitelist for the given element
 557          * - Unsafe style attributes are discarded
 558          *
 559          * @param array $attribs
 560          * @param string $element
 561          * @return array
 562          *
 563          * @todo Check for legal values where the DTD limits things.
 564          * @todo Check for unique id attribute :P
 565          */
 566         static function validateTagAttributes( $attribs, $element ) {
 567                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 568                 $out = array();
 569                 foreach( $attribs as $attribute => $value ) {
 570                         if( !isset( $whitelist[$attribute] ) ) {
 571                                 continue;
 572                         }
 573                         # Strip javascript "expression" from stylesheets.
 574                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 575                         if( $attribute == 'style' ) {
 576                                 $value = Sanitizer::checkCss( $value );
 577                                 if( $value === false ) {
 578                                         # haxx0r
 579                                         continue;
 580                                 }
 581                         }
 582
 583                         if ( $attribute === 'id' )
 584                                 $value = Sanitizer::escapeId( $value );
 585
 586                         // If this attribute was previously set, override it.
 587                         // Output should only have one attribute of each name.
 588                         $out[$attribute] = $value;
 589                 }
 590                 return $out;
 591         }
 592
 593         /**
 594          * Pick apart some CSS and check it for forbidden or unsafe structures.
 595          * Returns a sanitized string, or false if it was just too evil.
 596          *
         * Currently url() references, 'expression', and anything containing
         * 'tp://' or 'tps://' (for example http:// or https://) are forbidden.
 598          *
 599          * @param string $value
 600          * @return mixed
 601          */
 602         static function checkCss( $value ) {
 603                 $stripped = Sanitizer::decodeCharReferences( $value );
 604
 605                 // Remove any comments; IE gets token splitting wrong
 606                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 607                 $value = $stripped;
 608
 609                 // ... and continue checks
 610                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 611                         'codepointToUtf8(hexdec("$1"))', $stripped );
 612                 $stripped = str_replace( '\\', '', $stripped );
 613                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 614                                 $stripped ) ) {
 615                         # haxx0r
 616                         return false;
 617                 }
 618
 619                 return $value;
 620         }
 621
 622         /**
 623          * Take a tag soup fragment listing an HTML element's attributes
 624          * and normalize it to well-formed XML, discarding unwanted attributes.
 625          * Output is safe for further wikitext processing, with escaping of
 626          * values that could trigger problems.
 627          *
 628          * - Normalizes attribute names to lowercase
 629          * - Discards attributes not on a whitelist for the given element
 630          * - Turns broken or invalid entities into plaintext
 631          * - Double-quotes all attribute values
 632          * - Attributes without values are given the name as attribute
 633          * - Double attributes are discarded
 634          * - Unsafe style attributes are discarded
 635          * - Prepends space if there are attributes.
 636          *
 637          * @param string $text
 638          * @param string $element
 639          * @return string
 640          */
 641         static function fixTagAttributes( $text, $element ) {
 642                 if( trim( $text ) == '' ) {
 643                         return '';
 644                 }
 645
 646                 $stripped = Sanitizer::validateTagAttributes(
 647                         Sanitizer::decodeTagAttributes( $text ), $element );
 648
 649                 $attribs = array();
 650                 foreach( $stripped as $attribute => $value ) {
 651                         $encAttribute = htmlspecialchars( $attribute );
 652                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 653
 654                         $attribs[] = "$encAttribute=\"$encValue\"";
 655                 }
 656                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 657         }
 658
 659         /**
 660          * Encode an attribute value for HTML output.
 661          * @param $text
 662          * @return HTML-encoded text fragment
 663          */
 664         static function encodeAttribute( $text ) {
 665                 $encValue = htmlspecialchars( $text );
 666
 667                 // Whitespace is normalized during attribute decoding,
 668                 // so if we've been passed non-spaces we must encode them
 669                 // ahead of time or they won't be preserved.
 670                 $encValue = strtr( $encValue, array(
 671                         "\n" => '&#10;',
 672                         "\r" => '&#13;',
 673                         "\t" => '&#9;',
 674                 ) );
 675
 676                 return $encValue;
 677         }
 678
 679         /**
 680          * Encode an attribute value for HTML tags, with extra armoring
 681          * against further wiki processing.
 682          * @param $text
 683          * @return HTML-encoded text fragment
 684          */
 685         static function safeEncodeAttribute( $text ) {
 686                 $encValue = Sanitizer::encodeAttribute( $text );
 687
 688                 # Templates and links may be expanded in later parsing,
 689                 # creating invalid or dangerous output. Suppress this.
 690                 $encValue = strtr( $encValue, array(
 691                         '<'    => '&lt;',   // This should never happen,
 692                         '>'    => '&gt;',   // we've received invalid input
 693                         '"'    => '&quot;', // which should have been escaped.
 694                         '{'    => '&#123;',
 695                         '['    => '&#91;',
 696                         "''"   => '&#39;&#39;',
 697                         'ISBN' => '&#73;SBN',
 698                         'RFC'  => '&#82;FC',
 699                         'PMID' => '&#80;MID',
 700                         '|'    => '&#124;',
 701                         '__'   => '&#95;_',
 702                 ) );
 703
 704                 # Stupid hack
 705                 $encValue = preg_replace_callback(
 706                         '/(' . wfUrlProtocols() . ')/',
 707                         array( 'Sanitizer', 'armorLinksCallback' ),
 708                         $encValue );
 709                 return $encValue;
 710         }
 711
 712         /**
 713          * Given a value escape it so that it can be used in an id attribute and
 714          * return it, this does not validate the value however (see first link)
 715          *
 716          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 717          *                                                          in the id and
 718          *                                                          name attributes
 719          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 720          *
 721          * @bug 4461
 722          *
 723          * @static
 724          *
 725          * @param string $id
 726          * @return string
 727          */
 728         static function escapeId( $id ) {
 729                 static $replace = array(
 730                         '%3A' => ':',
 731                         '%' => '.'
 732                 );
 733
 734                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 735
 736                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 737         }
 738
 739         /**
 740          * Given a value, escape it so that it can be used as a CSS class and
 741          * return it.
 742          *
 743          * TODO: For extra validity, input should be validated UTF-8.
 744          *
 745          * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 746          *
 747          * @param string $class
 748          * @return string
 749          */
 750         static function escapeClass( $class ) {
 751                 // Convert ugly stuff to underscores and kill underscores in ugly places
 752                 return rtrim(preg_replace(
 753                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 754                         '_',
 755                         $class ), '_');
 756         }
 757
 758         /**
 759          * Regex replace callback for armoring links against further processing.
 760          * @param array $matches
 761          * @return string
 762          * @private
 763          */
 764         private static function armorLinksCallback( $matches ) {
 765                 return str_replace( ':', '&#58;', $matches[1] );
 766         }
 767
 768         /**
 769          * Return an associative array of attribute names and values from
         * a partial tag string. Attribute names are forced to lowercase,
 771          * character references are decoded to UTF-8 text.
 772          *
 773          * @param string
 774          * @return array
 775          */
 776         static function decodeTagAttributes( $text ) {
 777                 $attribs = array();
 778
 779                 if( trim( $text ) == '' ) {
 780                         return $attribs;
 781                 }
 782
 783                 $pairs = array();
 784                 if( !preg_match_all(
 785                         MW_ATTRIBS_REGEX,
 786                         $text,
 787                         $pairs,
 788                         PREG_SET_ORDER ) ) {
 789                         return $attribs;
 790                 }
 791
 792                 foreach( $pairs as $set ) {
 793                         $attribute = strtolower( $set[1] );
 794                         $value = Sanitizer::getTagAttributeCallback( $set );
 795
 796                         // Normalize whitespace
 797                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 798                         $value = trim( $value );
 799
 800                         // Decode character references
 801                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 802                 }
 803                 return $attribs;
 804         }
 805
 806         /**
 807          * Pick the appropriate attribute value from a match set from the
 808          * MW_ATTRIBS_REGEX matches.
 809          *
 810          * @param array $set
 811          * @return string
 812          * @private
 813          */
 814         private static function getTagAttributeCallback( $set ) {
 815                 if( isset( $set[6] ) ) {
 816                         # Illegal #XXXXXX color with no quotes.
 817                         return $set[6];
 818                 } elseif( isset( $set[5] ) ) {
 819                         # No quotes.
 820                         return $set[5];
 821                 } elseif( isset( $set[4] ) ) {
 822                         # Single-quoted
 823                         return $set[4];
 824                 } elseif( isset( $set[3] ) ) {
 825                         # Double-quoted
 826                         return $set[3];
 827                 } elseif( !isset( $set[2] ) ) {
 828                         # In XHTML, attributes must have a value.
 829                         # For 'reduced' form, return explicitly the attribute name here.
 830                         return $set[1];
 831                 } else {
 832                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 833                 }
 834         }
 835
 836         /**
 837          * Normalize whitespace and character references in an XML source-
 838          * encoded text for an attribute value.
 839          *
 840          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 841          * but note that we're not returning the value, but are returning
 842          * XML source fragments that will be slapped into output.
 843          *
 844          * @param string $text
 845          * @return string
 846          * @private
 847          */
 848         private static function normalizeAttributeValue( $text ) {
 849                 return str_replace( '"', '&quot;',
 850                         preg_replace(
 851                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 852                                 ' ',
 853                                 Sanitizer::normalizeCharReferences( $text ) ) );
 854         }
 855
 856         /**
 857          * Ensure that any entities and character references are legal
 858          * for XML and XHTML specifically. Any stray bits will be
 859          * &amp;-escaped to result in a valid text fragment.
 860          *
 861          * a. any named char refs must be known in XHTML
 862          * b. any numeric char refs must be legal chars, not invalid or forbidden
 863          * c. use &#x, not &#X
 864          * d. fix or reject non-valid attributes
 865          *
 866          * @param string $text
 867          * @return string
 868          * @private
 869          */
 870         static function normalizeCharReferences( $text ) {
 871                 return preg_replace_callback(
 872                         MW_CHAR_REFS_REGEX,
 873                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 874                         $text );
 875         }
 876         /**
         * @param array $matches
 878          * @return string
 879          */
 880         static function normalizeCharReferencesCallback( $matches ) {
 881                 $ret = null;
 882                 if( $matches[1] != '' ) {
 883                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 884                 } elseif( $matches[2] != '' ) {
 885                         $ret = Sanitizer::decCharReference( $matches[2] );
 886                 } elseif( $matches[3] != ''  ) {
 887                         $ret = Sanitizer::hexCharReference( $matches[3] );
 888                 } elseif( $matches[4] != '' ) {
 889                         $ret = Sanitizer::hexCharReference( $matches[4] );
 890                 }
 891                 if( is_null( $ret ) ) {
 892                         return htmlspecialchars( $matches[0] );
 893                 } else {
 894                         return $ret;
 895                 }
 896         }
 897
 898         /**
 899          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 900          * return the named entity reference as is. Otherwise, returns
 901          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 902          *
 903          * @param string $name
 904          * @return string
 905          * @static
 906          */
 907         static function normalizeEntity( $name ) {
 908                 global $wgHtmlEntities;
 909                 if( isset( $wgHtmlEntities[$name] ) ) {
 910                         return "&$name;";
 911                 } else {
 912                         return "&amp;$name;";
 913                 }
 914         }
 915
 916         static function decCharReference( $codepoint ) {
 917                 $point = intval( $codepoint );
 918                 if( Sanitizer::validateCodepoint( $point ) ) {
 919                         return sprintf( '&#%d;', $point );
 920                 } else {
 921                         return null;
 922                 }
 923         }
 924
 925         static function hexCharReference( $codepoint ) {
 926                 $point = hexdec( $codepoint );
 927                 if( Sanitizer::validateCodepoint( $point ) ) {
 928                         return sprintf( '&#x%x;', $point );
 929                 } else {
 930                         return null;
 931                 }
 932         }
 933
 934         /**
 935          * Returns true if a given Unicode codepoint is a valid character in XML.
 936          * @param int $codepoint
 937          * @return bool
 938          */
 939         private static function validateCodepoint( $codepoint ) {
 940                 return ($codepoint ==    0x09)
 941                         || ($codepoint ==    0x0a)
 942                         || ($codepoint ==    0x0d)
 943                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 944                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 945                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 946         }
 947
 948         /**
 949          * Decode any character references, numeric or named entities,
 950          * in the text and return a UTF-8 string.
 951          *
 952          * @param string $text
 953          * @return string
 954          * @public
 955          * @static
 956          */
 957         public static function decodeCharReferences( $text ) {
 958                 return preg_replace_callback(
 959                         MW_CHAR_REFS_REGEX,
 960                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 961                         $text );
 962         }
 963
 964         /**
         * @param array $matches
 966          * @return string
 967          */
 968         static function decodeCharReferencesCallback( $matches ) {
 969                 if( $matches[1] != '' ) {
 970                         return Sanitizer::decodeEntity( $matches[1] );
 971                 } elseif( $matches[2] != '' ) {
 972                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 973                 } elseif( $matches[3] != ''  ) {
 974                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 975                 } elseif( $matches[4] != '' ) {
 976                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 977                 }
 978                 # Last case should be an ampersand by itself
 979                 return $matches[0];
 980         }
 981
 982         /**
 983          * Return UTF-8 string for a codepoint if that is a valid
 984          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 985          * @param int $codepoint
 986          * @return string
 987          * @private
 988          */
 989         static function decodeChar( $codepoint ) {
 990                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 991                         return codepointToUtf8( $codepoint );
 992                 } else {
 993                         return UTF8_REPLACEMENT;
 994                 }
 995         }
 996
 997         /**
 998          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 999          * return the UTF-8 encoding of that character. Otherwise, returns
1000          * pseudo-entity source (eg &foo;)
1001          *
1002          * @param string $name
1003          * @return string
1004          */
1005         static function decodeEntity( $name ) {
1006                 global $wgHtmlEntities;
1007                 if( isset( $wgHtmlEntities[$name] ) ) {
1008                         return codepointToUtf8( $wgHtmlEntities[$name] );
1009                 } else {
1010                         return "&$name;";
1011                 }
1012         }
1013
1014         /**
1015          * Fetch the whitelist of acceptable attributes for a given
1016          * element name.
1017          *
1018          * @param string $element
1019          * @return array
1020          */
1021         static function attributeWhitelist( $element ) {
1022                 static $list;
1023                 if( !isset( $list ) ) {
1024                         $list = Sanitizer::setupAttributeWhitelist();
1025                 }
1026                 return isset( $list[$element] )
1027                         ? $list[$element]
1028                         : array();
1029         }
1030
1031         /**
1032          * @todo Document it a bit
1033          * @return array
1034          */
1035         static function setupAttributeWhitelist() {
1036                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
1037                 $block = array_merge( $common, array( 'align' ) );
1038                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1039                 $tablecell = array( 'abbr',
1040                                     'axis',
1041                                     'headers',
1042                                     'scope',
1043                                     'rowspan',
1044                                     'colspan',
1045                                     'nowrap', # deprecated
1046                                     'width',  # deprecated
1047                                     'height', # deprecated
1048                                     'bgcolor' # deprecated
1049                                     );
1050
1051                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1052                 # See: http://www.w3.org/TR/html4/
1053                 $whitelist = array (
1054                         # 7.5.4
1055                         'div'        => $block,
1056                         'center'     => $common, # deprecated
1057                         'span'       => $block, # ??
1058
1059                         # 7.5.5
1060                         'h1'         => $block,
1061                         'h2'         => $block,
1062                         'h3'         => $block,
1063                         'h4'         => $block,
1064                         'h5'         => $block,
1065                         'h6'         => $block,
1066
1067                         # 7.5.6
1068                         # address
1069
1070                         # 8.2.4
1071                         # bdo
1072
1073                         # 9.2.1
1074                         'em'         => $common,
1075                         'strong'     => $common,
1076                         'cite'       => $common,
1077                         # dfn
1078                         'code'       => $common,
1079                         # samp
1080                         # kbd
1081                         'var'        => $common,
1082                         # abbr
1083                         # acronym
1084
1085                         # 9.2.2
1086                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1087                         # q
1088
1089                         # 9.2.3
1090                         'sub'        => $common,
1091                         'sup'        => $common,
1092
1093                         # 9.3.1
1094                         'p'          => $block,
1095
1096                         # 9.3.2
1097                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1098
1099                         # 9.3.4
1100                         'pre'        => array_merge( $common, array( 'width' ) ),
1101
1102                         # 9.4
1103                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1104                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1105
1106                         # 10.2
1107                         'ul'         => array_merge( $common, array( 'type' ) ),
1108                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1109                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1110
1111                         # 10.3
1112                         'dl'         => $common,
1113                         'dd'         => $common,
1114                         'dt'         => $common,
1115
1116                         # 11.2.1
1117                         'table'      => array_merge( $common,
1118                                                                 array( 'summary', 'width', 'border', 'frame',
1119                                                                                 'rules', 'cellspacing', 'cellpadding',
1120                                                                                 'align', 'bgcolor',
1121                                                                 ) ),
1122
1123                         # 11.2.2
1124                         'caption'    => array_merge( $common, array( 'align' ) ),
1125
1126                         # 11.2.3
1127                         'thead'      => array_merge( $common, $tablealign ),
1128                         'tfoot'      => array_merge( $common, $tablealign ),
1129                         'tbody'      => array_merge( $common, $tablealign ),
1130
1131                         # 11.2.4
1132                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1133                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1134
1135                         # 11.2.5
1136                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1137
1138                         # 11.2.6
1139                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1140                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1141
1142                         # 15.2.1
1143                         'tt'         => $common,
1144                         'b'          => $common,
1145                         'i'          => $common,
1146                         'big'        => $common,
1147                         'small'      => $common,
1148                         'strike'     => $common,
1149                         's'          => $common,
1150                         'u'          => $common,
1151
1152                         # 15.2.2
1153                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1154                         # basefont
1155
1156                         # 15.3
1157                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1158
1159                         # XHTML Ruby annotation text module, simple ruby only.
1160                         # http://www.w3c.org/TR/ruby/
1161                         'ruby'       => $common,
1162                         # rbc
1163                         # rtc
1164                         'rb'         => $common,
1165                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1166                         'rp'         => $common,
1167                         );
1168                 return $whitelist;
1169         }
1170
1171         /**
1172          * Take a fragment of (potentially invalid) HTML and return
1173          * a version with any tags removed, encoded suitably for literal
1174          * inclusion in an attribute value.
1175          *
1176          * @param string $text HTML fragment
1177          * @return string
1178          */
1179         static function stripAllTags( $text ) {
1180                 # Actual <tags>
1181                 $text = preg_replace( '/ < .*? > /x', '', $text );
1182
1183                 # Normalize &entities and whitespace
1184                 $text = Sanitizer::normalizeAttributeValue( $text );
1185
1186                 # Will be placed into "double-quoted" attributes,
1187                 # make sure remaining bits are safe.
1188                 $text = str_replace(
1189                         array('<', '>', '"'),
1190                         array('&lt;', '&gt;', '&quot;'),
1191                         $text );
1192
1193                 return $text;
1194         }
1195
1196         /**
1197          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1198          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1199          * PHP 5.1 doesn't.
1200          *
1201          * Use for passing XHTML fragments to PHP's XML parsing functions
1202          *
1203          * @return string
1204          * @static
1205          */
1206         static function hackDocType() {
1207                 global $wgHtmlEntities;
1208                 $out = "<!DOCTYPE html [\n";
1209                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1210                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1211                 }
1212                 $out .= "]>\n";
1213                 return $out;
1214         }
1215
1216         static function cleanUrl( $url, $hostname=true ) {
1217                 # Normalize any HTML entities in input. They will be
1218                 # re-escaped by makeExternalLink().
1219                 $url = Sanitizer::decodeCharReferences( $url );
1220
1221                 # Escape any control characters introduced by the above step
1222                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1223
1224                 # Validate hostname portion
1225                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1226                         list( $whole, $protocol, $host, $rest ) = $matches;
1227
1228                         // Characters that will be ignored in IDNs.
1229                         // http://tools.ietf.org/html/3454#section-3.1
1230                         // Strip them before further processing so blacklists and such work.
1231                         $strip = "/
1232                                 \\s|          # general whitespace
1233                                 \xc2\xad|     # 00ad SOFT HYPHEN
1234                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1235                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1236                                 \xe2\x81\xa0| # 2060 WORD JOINER
1237                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1238                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1239                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1240                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1241                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1242                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1243                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1244                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1245                                 /xuD";
1246
1247                         $host = preg_replace( $strip, '', $host );
1248
1249                         // @fixme: validate hostnames here
1250
1251                         return $protocol . $host . $rest;
1252                 } else {
1253                         return $url;
1254                 }
1255         }
1256
1257 }
1258
1259 ?>