includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
  43 $attrib = '[A-Za-z0-9]';
  44 $space = '[\x09\x0a\x0d\x20]';
  45 define( 'MW_ATTRIBS_REGEX',
  46         "/(?:^|$space)($attrib+)
  47           ($space*=$space*
  48                 (?:
  49                  # The attribute value: quoted or alone
  50                   \"([^<\"]*)\"
  51                  | '([^<']*)'
  52                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  53                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  54                                                          # colors are specified like this.
  55                                                          # We'll be normalizing it.
  56                 )
  57            )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @private
  63  */
  64 global $wgHtmlEntities;
  65 $wgHtmlEntities = array(
  66         'Aacute'   => 193,
  67         'aacute'   => 225,
  68         'Acirc'    => 194,
  69         'acirc'    => 226,
  70         'acute'    => 180,
  71         'AElig'    => 198,
  72         'aelig'    => 230,
  73         'Agrave'   => 192,
  74         'agrave'   => 224,
  75         'alefsym'  => 8501,
  76         'Alpha'    => 913,
  77         'alpha'    => 945,
  78         'amp'      => 38,
  79         'and'      => 8743,
  80         'ang'      => 8736,
  81         'Aring'    => 197,
  82         'aring'    => 229,
  83         'asymp'    => 8776,
  84         'Atilde'   => 195,
  85         'atilde'   => 227,
  86         'Auml'     => 196,
  87         'auml'     => 228,
  88         'bdquo'    => 8222,
  89         'Beta'     => 914,
  90         'beta'     => 946,
  91         'brvbar'   => 166,
  92         'bull'     => 8226,
  93         'cap'      => 8745,
  94         'Ccedil'   => 199,
  95         'ccedil'   => 231,
  96         'cedil'    => 184,
  97         'cent'     => 162,
  98         'Chi'      => 935,
  99         'chi'      => 967,
 100         'circ'     => 710,
 101         'clubs'    => 9827,
 102         'cong'     => 8773,
 103         'copy'     => 169,
 104         'crarr'    => 8629,
 105         'cup'      => 8746,
 106         'curren'   => 164,
 107         'dagger'   => 8224,
 108         'Dagger'   => 8225,
 109         'darr'     => 8595,
 110         'dArr'     => 8659,
 111         'deg'      => 176,
 112         'Delta'    => 916,
 113         'delta'    => 948,
 114         'diams'    => 9830,
 115         'divide'   => 247,
 116         'Eacute'   => 201,
 117         'eacute'   => 233,
 118         'Ecirc'    => 202,
 119         'ecirc'    => 234,
 120         'Egrave'   => 200,
 121         'egrave'   => 232,
 122         'empty'    => 8709,
 123         'emsp'     => 8195,
 124         'ensp'     => 8194,
 125         'Epsilon'  => 917,
 126         'epsilon'  => 949,
 127         'equiv'    => 8801,
 128         'Eta'      => 919,
 129         'eta'      => 951,
 130         'ETH'      => 208,
 131         'eth'      => 240,
 132         'Euml'     => 203,
 133         'euml'     => 235,
 134         'euro'     => 8364,
 135         'exist'    => 8707,
 136         'fnof'     => 402,
 137         'forall'   => 8704,
 138         'frac12'   => 189,
 139         'frac14'   => 188,
 140         'frac34'   => 190,
 141         'frasl'    => 8260,
 142         'Gamma'    => 915,
 143         'gamma'    => 947,
 144         'ge'       => 8805,
 145         'gt'       => 62,
 146         'harr'     => 8596,
 147         'hArr'     => 8660,
 148         'hearts'   => 9829,
 149         'hellip'   => 8230,
 150         'Iacute'   => 205,
 151         'iacute'   => 237,
 152         'Icirc'    => 206,
 153         'icirc'    => 238,
 154         'iexcl'    => 161,
 155         'Igrave'   => 204,
 156         'igrave'   => 236,
 157         'image'    => 8465,
 158         'infin'    => 8734,
 159         'int'      => 8747,
 160         'Iota'     => 921,
 161         'iota'     => 953,
 162         'iquest'   => 191,
 163         'isin'     => 8712,
 164         'Iuml'     => 207,
 165         'iuml'     => 239,
 166         'Kappa'    => 922,
 167         'kappa'    => 954,
 168         'Lambda'   => 923,
 169         'lambda'   => 955,
 170         'lang'     => 9001,
 171         'laquo'    => 171,
 172         'larr'     => 8592,
 173         'lArr'     => 8656,
 174         'lceil'    => 8968,
 175         'ldquo'    => 8220,
 176         'le'       => 8804,
 177         'lfloor'   => 8970,
 178         'lowast'   => 8727,
 179         'loz'      => 9674,
 180         'lrm'      => 8206,
 181         'lsaquo'   => 8249,
 182         'lsquo'    => 8216,
 183         'lt'       => 60,
 184         'macr'     => 175,
 185         'mdash'    => 8212,
 186         'micro'    => 181,
 187         'middot'   => 183,
 188         'minus'    => 8722,
 189         'Mu'       => 924,
 190         'mu'       => 956,
 191         'nabla'    => 8711,
 192         'nbsp'     => 160,
 193         'ndash'    => 8211,
 194         'ne'       => 8800,
 195         'ni'       => 8715,
 196         'not'      => 172,
 197         'notin'    => 8713,
 198         'nsub'     => 8836,
 199         'Ntilde'   => 209,
 200         'ntilde'   => 241,
 201         'Nu'       => 925,
 202         'nu'       => 957,
 203         'Oacute'   => 211,
 204         'oacute'   => 243,
 205         'Ocirc'    => 212,
 206         'ocirc'    => 244,
 207         'OElig'    => 338,
 208         'oelig'    => 339,
 209         'Ograve'   => 210,
 210         'ograve'   => 242,
 211         'oline'    => 8254,
 212         'Omega'    => 937,
 213         'omega'    => 969,
 214         'Omicron'  => 927,
 215         'omicron'  => 959,
 216         'oplus'    => 8853,
 217         'or'       => 8744,
 218         'ordf'     => 170,
 219         'ordm'     => 186,
 220         'Oslash'   => 216,
 221         'oslash'   => 248,
 222         'Otilde'   => 213,
 223         'otilde'   => 245,
 224         'otimes'   => 8855,
 225         'Ouml'     => 214,
 226         'ouml'     => 246,
 227         'para'     => 182,
 228         'part'     => 8706,
 229         'permil'   => 8240,
 230         'perp'     => 8869,
 231         'Phi'      => 934,
 232         'phi'      => 966,
 233         'Pi'       => 928,
 234         'pi'       => 960,
 235         'piv'      => 982,
 236         'plusmn'   => 177,
 237         'pound'    => 163,
 238         'prime'    => 8242,
 239         'Prime'    => 8243,
 240         'prod'     => 8719,
 241         'prop'     => 8733,
 242         'Psi'      => 936,
 243         'psi'      => 968,
 244         'quot'     => 34,
 245         'radic'    => 8730,
 246         'rang'     => 9002,
 247         'raquo'    => 187,
 248         'rarr'     => 8594,
 249         'rArr'     => 8658,
 250         'rceil'    => 8969,
 251         'rdquo'    => 8221,
 252         'real'     => 8476,
 253         'reg'      => 174,
 254         'rfloor'   => 8971,
 255         'Rho'      => 929,
 256         'rho'      => 961,
 257         'rlm'      => 8207,
 258         'rsaquo'   => 8250,
 259         'rsquo'    => 8217,
 260         'sbquo'    => 8218,
 261         'Scaron'   => 352,
 262         'scaron'   => 353,
 263         'sdot'     => 8901,
 264         'sect'     => 167,
 265         'shy'      => 173,
 266         'Sigma'    => 931,
 267         'sigma'    => 963,
 268         'sigmaf'   => 962,
 269         'sim'      => 8764,
 270         'spades'   => 9824,
 271         'sub'      => 8834,
 272         'sube'     => 8838,
 273         'sum'      => 8721,
 274         'sup'      => 8835,
 275         'sup1'     => 185,
 276         'sup2'     => 178,
 277         'sup3'     => 179,
 278         'supe'     => 8839,
 279         'szlig'    => 223,
 280         'Tau'      => 932,
 281         'tau'      => 964,
 282         'there4'   => 8756,
 283         'Theta'    => 920,
 284         'theta'    => 952,
 285         'thetasym' => 977,
 286         'thinsp'   => 8201,
 287         'THORN'    => 222,
 288         'thorn'    => 254,
 289         'tilde'    => 732,
 290         'times'    => 215,
 291         'trade'    => 8482,
 292         'Uacute'   => 218,
 293         'uacute'   => 250,
 294         'uarr'     => 8593,
 295         'uArr'     => 8657,
 296         'Ucirc'    => 219,
 297         'ucirc'    => 251,
 298         'Ugrave'   => 217,
 299         'ugrave'   => 249,
 300         'uml'      => 168,
 301         'upsih'    => 978,
 302         'Upsilon'  => 933,
 303         'upsilon'  => 965,
 304         'Uuml'     => 220,
 305         'uuml'     => 252,
 306         'weierp'   => 8472,
 307         'Xi'       => 926,
 308         'xi'       => 958,
 309         'Yacute'   => 221,
 310         'yacute'   => 253,
 311         'yen'      => 165,
 312         'Yuml'     => 376,
 313         'yuml'     => 255,
 314         'Zeta'     => 918,
 315         'zeta'     => 950,
 316         'zwj'      => 8205,
 317         'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr',
 355                         );
 356                         $htmllist = array( # Tags used by list
 357                                 'ul','ol',
 358                         );
 359                         $listtags = array( # Tags that can appear in a list
 360                                 'li',
 361                         );
 362
 363                 } else {
 364                         $htmlpairs = array();
 365                         $htmlsingle = array();
 366                         $htmlnest = array();
 367                         $tabletags = array();
 368                 }
 369
 370                 $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 371                 $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 372
 373                 # Remove HTML comments
 374                 $text = Sanitizer::removeHTMLcomments( $text );
 375                 $bits = explode( '<', $text );
 376                 $text = array_shift( $bits );
 377                 if(!$wgUseTidy) {
 378                         $tagstack = array(); $tablestack = array();
 379                         foreach ( $bits as $x ) {
 380                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 381                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 382                                 $x, $regs );
 383                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 384                                 error_reporting( $prev );
 385
 386                                 $badtag = 0 ;
 387                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 388                                         # Check our stack
 389                                         if ( $slash ) {
 390                                                 # Closing a tag...
 391                                                 if( in_array( $t, $htmlsingleonly ) ) {
 392                                                         $badtag = 1;
 393                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 394                                                         if ( in_array($ot, $htmlsingleallowed) ) {
 395                                                                 # Pop all elements with an optional close tag
 396                                                                 # and see if we find a match below them
 397                                                                 $optstack = array();
 398                                                                 array_push ($optstack, $ot);
 399                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 400                                                                                                 in_array($ot, $htmlsingleallowed) ) {
 401                                                                         array_push ($optstack, $ot);
 402                                                                 }
 403                                                                 if ( $t != $ot ) {
 404                                                                         # No match. Push the optinal elements back again
 405                                                                         $badtag = 1;
 406                                                                         while ( $ot = @array_pop( $optstack ) ) {
 407                                                                                 array_push( $tagstack, $ot );
 408                                                                         }
 409                                                                 }
 410                                                         } else {
 411                                                                 @array_push( $tagstack, $ot );
 412                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 413                                                                 if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
 414                                                                         $badtag = 1;
 415                                                                 }
 416                                                         }
 417                                                 } else {
 418                                                         if ( $t == 'table' ) {
 419                                                                 $tagstack = array_pop( $tablestack );
 420                                                         }
 421                                                 }
 422                                                 $newparams = '';
 423                                         } else {
 424                                                 # Keep track for later
 425                                                 if ( in_array( $t, $tabletags ) &&
 426                                                 ! in_array( 'table', $tagstack ) ) {
 427                                                         $badtag = 1;
 428                                                 } else if ( in_array( $t, $tagstack ) &&
 429                                                 ! in_array ( $t , $htmlnest ) ) {
 430                                                         $badtag = 1 ;
 431                                                 # Is it a self closed htmlpair ? (bug 5487)
 432                                                 } else if( $brace == '/>' &&
 433                                                 in_array($t, $htmlpairs) ) {
 434                                                         $badtag = 1;
 435                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 436                                                         # Hack to force empty tag for uncloseable elements
 437                                                         $brace = '/>';
 438                                                 } else if( in_array( $t, $htmlsingle ) ) {
 439                                                         # Hack to not close $htmlsingle tags
 440                                                         $brace = NULL;
 441                                                 } else {
 442                                                         if ( $t == 'table' ) {
 443                                                                 array_push( $tablestack, $tagstack );
 444                                                                 $tagstack = array();
 445                                                         }
 446                                                         array_push( $tagstack, $t );
 447                                                 }
 448
 449                                                 # Replace any variables or template parameters with
 450                                                 # plaintext results.
 451                                                 if( is_callable( $processCallback ) ) {
 452                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 453                                                 }
 454
 455                                                 # Strip non-approved attributes from the tag
 456                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 457                                         }
 458                                         if ( ! $badtag ) {
 459                                                 $rest = str_replace( '>', '&gt;', $rest );
 460                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 461                                                 $text .= "<$slash$t$newparams$close>$rest";
 462                                                 continue;
 463                                         }
 464                                 }
 465                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 466                         }
 467                         # Close off any remaining tags
 468                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 469                                 $text .= "</$t>\n";
 470                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 471                         }
 472                 } else {
 473                         # this might be possible using tidy itself
 474                         foreach ( $bits as $x ) {
 475                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 476                                 $x, $regs );
 477                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 478                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 479                                         if( is_callable( $processCallback ) ) {
 480                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 481                                         }
 482                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 483                                         $rest = str_replace( '>', '&gt;', $rest );
 484                                         $text .= "<$slash$t$newparams$brace$rest";
 485                                 } else {
 486                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 487                                 }
 488                         }
 489                 }
 490                 wfProfileOut( $fname );
 491                 return $text;
 492         }
 493
 494         /**
 495          * Remove '<!--', '-->', and everything between.
 496          * To avoid leaving blank lines, when a comment is both preceded
 497          * and followed by a newline (ignoring spaces), trim leading and
 498          * trailing spaces and one of the newlines.
 499          *
 500          * @private
 501          * @param string $text
 502          * @return string
 503          */
 504         static function removeHTMLcomments( $text ) {
 505                 $fname='Parser::removeHTMLcomments';
 506                 wfProfileIn( $fname );
 507                 while (($start = strpos($text, '<!--')) !== false) {
 508                         $end = strpos($text, '-->', $start + 4);
 509                         if ($end === false) {
 510                                 # Unterminated comment; bail out
 511                                 break;
 512                         }
 513
 514                         $end += 3;
 515
 516                         # Trim space and newline if the comment is both
 517                         # preceded and followed by a newline
 518                         $spaceStart = max($start - 1, 0);
 519                         $spaceLen = $end - $spaceStart;
 520                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 521                                 $spaceStart--;
 522                                 $spaceLen++;
 523                         }
 524                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 525                                 $spaceLen++;
 526                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 527                                 # Remove the comment, leading and trailing
 528                                 # spaces, and leave only one newline.
 529                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 530                         }
 531                         else {
 532                                 # Remove just the comment.
 533                                 $text = substr_replace($text, '', $start, $end - $start);
 534                         }
 535                 }
 536                 wfProfileOut( $fname );
 537                 return $text;
 538         }
 539
 540         /**
 541          * Take an array of attribute names and values and normalize or discard
 542          * illegal values for the given element type.
 543          *
 544          * - Discards attributes not on a whitelist for the given element
 545          * - Unsafe style attributes are discarded
 546          *
 547          * @param array $attribs
 548          * @param string $element
 549          * @return array
 550          *
 551          * @todo Check for legal values where the DTD limits things.
 552          * @todo Check for unique id attribute :P
 553          */
 554         static function validateTagAttributes( $attribs, $element ) {
 555                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 556                 $out = array();
 557                 foreach( $attribs as $attribute => $value ) {
 558                         if( !isset( $whitelist[$attribute] ) ) {
 559                                 continue;
 560                         }
 561                         # Strip javascript "expression" from stylesheets.
 562                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 563                         if( $attribute == 'style' ) {
 564                                 $value = Sanitizer::checkCss( $value );
 565                                 if( $value === false ) {
 566                                         # haxx0r
 567                                         continue;
 568                                 }
 569                         }
 570
 571                         if ( $attribute === 'id' )
 572                                 $value = Sanitizer::escapeId( $value );
 573
 574                         // If this attribute was previously set, override it.
 575                         // Output should only have one attribute of each name.
 576                         $out[$attribute] = $value;
 577                 }
 578                 return $out;
 579         }
 580
 581         /**
 582          * Pick apart some CSS and check it for forbidden or unsafe structures.
 583          * Returns a sanitized string, or false if it was just too evil.
 584          *
 585          * Currently URL references, 'expression', 'tps' are forbidden.
 586          *
 587          * @param string $value
 588          * @return mixed
 589          */
 590         static function checkCss( $value ) {
 591                 $stripped = Sanitizer::decodeCharReferences( $value );
 592
 593                 // Remove any comments; IE gets token splitting wrong
 594                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 595                 $value = $stripped;
 596
 597                 // ... and continue checks
 598                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 599                         'codepointToUtf8(hexdec("$1"))', $stripped );
 600                 $stripped = str_replace( '\\', '', $stripped );
 601                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 602                                 $stripped ) ) {
 603                         # haxx0r
 604                         return false;
 605                 }
 606
 607                 return $value;
 608         }
 609
 610         /**
 611          * Take a tag soup fragment listing an HTML element's attributes
 612          * and normalize it to well-formed XML, discarding unwanted attributes.
 613          * Output is safe for further wikitext processing, with escaping of
 614          * values that could trigger problems.
 615          *
 616          * - Normalizes attribute names to lowercase
 617          * - Discards attributes not on a whitelist for the given element
 618          * - Turns broken or invalid entities into plaintext
 619          * - Double-quotes all attribute values
 620          * - Attributes without values are given the name as attribute
 621          * - Double attributes are discarded
 622          * - Unsafe style attributes are discarded
 623          * - Prepends space if there are attributes.
 624          *
 625          * @param string $text
 626          * @param string $element
 627          * @return string
 628          */
 629         static function fixTagAttributes( $text, $element ) {
 630                 if( trim( $text ) == '' ) {
 631                         return '';
 632                 }
 633
 634                 $stripped = Sanitizer::validateTagAttributes(
 635                         Sanitizer::decodeTagAttributes( $text ), $element );
 636
 637                 $attribs = array();
 638                 foreach( $stripped as $attribute => $value ) {
 639                         $encAttribute = htmlspecialchars( $attribute );
 640                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 641
 642                         $attribs[] = "$encAttribute=\"$encValue\"";
 643                 }
 644                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 645         }
 646
 647         /**
 648          * Encode an attribute value for HTML output.
 649          * @param $text
 650          * @return HTML-encoded text fragment
 651          */
 652         static function encodeAttribute( $text ) {
 653                 $encValue = htmlspecialchars( $text );
 654
 655                 // Whitespace is normalized during attribute decoding,
 656                 // so if we've been passed non-spaces we must encode them
 657                 // ahead of time or they won't be preserved.
 658                 $encValue = strtr( $encValue, array(
 659                         "\n" => '&#10;',
 660                         "\r" => '&#13;',
 661                         "\t" => '&#9;',
 662                 ) );
 663
 664                 return $encValue;
 665         }
 666
 667         /**
 668          * Encode an attribute value for HTML tags, with extra armoring
 669          * against further wiki processing.
 670          * @param $text
 671          * @return HTML-encoded text fragment
 672          */
 673         static function safeEncodeAttribute( $text ) {
 674                 $encValue = Sanitizer::encodeAttribute( $text );
 675
 676                 # Templates and links may be expanded in later parsing,
 677                 # creating invalid or dangerous output. Suppress this.
 678                 $encValue = strtr( $encValue, array(
 679                         '<'    => '&lt;',   // This should never happen,
 680                         '>'    => '&gt;',   // we've received invalid input
 681                         '"'    => '&quot;', // which should have been escaped.
 682                         '{'    => '&#123;',
 683                         '['    => '&#91;',
 684                         "''"   => '&#39;&#39;',
 685                         'ISBN' => '&#73;SBN',
 686                         'RFC'  => '&#82;FC',
 687                         'PMID' => '&#80;MID',
 688                         '|'    => '&#124;',
 689                         '__'   => '&#95;_',
 690                 ) );
 691
 692                 # Stupid hack
 693                 $encValue = preg_replace_callback(
 694                         '/(' . wfUrlProtocols() . ')/',
 695                         array( 'Sanitizer', 'armorLinksCallback' ),
 696                         $encValue );
 697                 return $encValue;
 698         }
 699
 700         /**
 701          * Given a value escape it so that it can be used in an id attribute and
 702          * return it, this does not validate the value however (see first link)
 703          *
 704          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 705          *                                                          in the id and
 706          *                                                          name attributes
 707          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 708          *
 709          * @bug 4461
 710          *
 711          * @static
 712          *
 713          * @param string $id
 714          * @return string
 715          */
 716         static function escapeId( $id ) {
 717                 static $replace = array(
 718                         '%3A' => ':',
 719                         '%' => '.'
 720                 );
 721
 722                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 723
 724                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 725         }
 726
 727         /**
 728          * Regex replace callback for armoring links against further processing.
 729          * @param array $matches
 730          * @return string
 731          * @private
 732          */
 733         private static function armorLinksCallback( $matches ) {
 734                 return str_replace( ':', '&#58;', $matches[1] );
 735         }
 736
 737         /**
 738          * Return an associative array of attribute names and values from
 739          * a partial tag string. Attribute names are forces to lowercase,
 740          * character references are decoded to UTF-8 text.
 741          *
 742          * @param string
 743          * @return array
 744          */
 745         static function decodeTagAttributes( $text ) {
 746                 $attribs = array();
 747
 748                 if( trim( $text ) == '' ) {
 749                         return $attribs;
 750                 }
 751
 752                 $pairs = array();
 753                 if( !preg_match_all(
 754                         MW_ATTRIBS_REGEX,
 755                         $text,
 756                         $pairs,
 757                         PREG_SET_ORDER ) ) {
 758                         return $attribs;
 759                 }
 760
 761                 foreach( $pairs as $set ) {
 762                         $attribute = strtolower( $set[1] );
 763                         $value = Sanitizer::getTagAttributeCallback( $set );
 764
 765                         // Normalize whitespace
 766                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 767                         $value = trim( $value );
 768
 769                         // Decode character references
 770                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 771                 }
 772                 return $attribs;
 773         }
 774
 775         /**
 776          * Pick the appropriate attribute value from a match set from the
 777          * MW_ATTRIBS_REGEX matches.
 778          *
 779          * @param array $set
 780          * @return string
 781          * @private
 782          */
 783         private static function getTagAttributeCallback( $set ) {
 784                 if( isset( $set[6] ) ) {
 785                         # Illegal #XXXXXX color with no quotes.
 786                         return $set[6];
 787                 } elseif( isset( $set[5] ) ) {
 788                         # No quotes.
 789                         return $set[5];
 790                 } elseif( isset( $set[4] ) ) {
 791                         # Single-quoted
 792                         return $set[4];
 793                 } elseif( isset( $set[3] ) ) {
 794                         # Double-quoted
 795                         return $set[3];
 796                 } elseif( !isset( $set[2] ) ) {
 797                         # In XHTML, attributes must have a value.
 798                         # For 'reduced' form, return explicitly the attribute name here.
 799                         return $set[1];
 800                 } else {
 801                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 802                 }
 803         }
 804
 805         /**
 806          * Normalize whitespace and character references in an XML source-
 807          * encoded text for an attribute value.
 808          *
 809          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 810          * but note that we're not returning the value, but are returning
 811          * XML source fragments that will be slapped into output.
 812          *
 813          * @param string $text
 814          * @return string
 815          * @private
 816          */
 817         private static function normalizeAttributeValue( $text ) {
 818                 return str_replace( '"', '&quot;',
 819                         preg_replace(
 820                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 821                                 ' ',
 822                                 Sanitizer::normalizeCharReferences( $text ) ) );
 823         }
 824
 825         /**
 826          * Ensure that any entities and character references are legal
 827          * for XML and XHTML specifically. Any stray bits will be
 828          * &amp;-escaped to result in a valid text fragment.
 829          *
 830          * a. any named char refs must be known in XHTML
 831          * b. any numeric char refs must be legal chars, not invalid or forbidden
 832          * c. use &#x, not &#X
 833          * d. fix or reject non-valid attributes
 834          *
 835          * @param string $text
 836          * @return string
 837          * @private
 838          */
 839         static function normalizeCharReferences( $text ) {
 840                 return preg_replace_callback(
 841                         MW_CHAR_REFS_REGEX,
 842                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 843                         $text );
 844         }
 845         /**
 846          * @param string $matches
 847          * @return string
 848          */
 849         static function normalizeCharReferencesCallback( $matches ) {
 850                 $ret = null;
 851                 if( $matches[1] != '' ) {
 852                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 853                 } elseif( $matches[2] != '' ) {
 854                         $ret = Sanitizer::decCharReference( $matches[2] );
 855                 } elseif( $matches[3] != ''  ) {
 856                         $ret = Sanitizer::hexCharReference( $matches[3] );
 857                 } elseif( $matches[4] != '' ) {
 858                         $ret = Sanitizer::hexCharReference( $matches[4] );
 859                 }
 860                 if( is_null( $ret ) ) {
 861                         return htmlspecialchars( $matches[0] );
 862                 } else {
 863                         return $ret;
 864                 }
 865         }
 866
 867         /**
 868          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 869          * return the named entity reference as is. Otherwise, returns
 870          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 871          *
 872          * @param string $name
 873          * @return string
 874          * @static
 875          */
 876         static function normalizeEntity( $name ) {
 877                 global $wgHtmlEntities;
 878                 if( isset( $wgHtmlEntities[$name] ) ) {
 879                         return "&$name;";
 880                 } else {
 881                         return "&amp;$name;";
 882                 }
 883         }
 884
 885         static function decCharReference( $codepoint ) {
 886                 $point = intval( $codepoint );
 887                 if( Sanitizer::validateCodepoint( $point ) ) {
 888                         return sprintf( '&#%d;', $point );
 889                 } else {
 890                         return null;
 891                 }
 892         }
 893
 894         static function hexCharReference( $codepoint ) {
 895                 $point = hexdec( $codepoint );
 896                 if( Sanitizer::validateCodepoint( $point ) ) {
 897                         return sprintf( '&#x%x;', $point );
 898                 } else {
 899                         return null;
 900                 }
 901         }
 902
 903         /**
 904          * Returns true if a given Unicode codepoint is a valid character in XML.
 905          * @param int $codepoint
 906          * @return bool
 907          */
 908         private static function validateCodepoint( $codepoint ) {
 909                 return ($codepoint ==    0x09)
 910                         || ($codepoint ==    0x0a)
 911                         || ($codepoint ==    0x0d)
 912                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 913                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 914                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 915         }
 916
 917         /**
 918          * Decode any character references, numeric or named entities,
 919          * in the text and return a UTF-8 string.
 920          *
 921          * @param string $text
 922          * @return string
 923          * @public
 924          * @static
 925          */
 926         public static function decodeCharReferences( $text ) {
 927                 return preg_replace_callback(
 928                         MW_CHAR_REFS_REGEX,
 929                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 930                         $text );
 931         }
 932
 933         /**
 934          * @param string $matches
 935          * @return string
 936          */
 937         static function decodeCharReferencesCallback( $matches ) {
 938                 if( $matches[1] != '' ) {
 939                         return Sanitizer::decodeEntity( $matches[1] );
 940                 } elseif( $matches[2] != '' ) {
 941                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 942                 } elseif( $matches[3] != ''  ) {
 943                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 944                 } elseif( $matches[4] != '' ) {
 945                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 946                 }
 947                 # Last case should be an ampersand by itself
 948                 return $matches[0];
 949         }
 950
 951         /**
 952          * Return UTF-8 string for a codepoint if that is a valid
 953          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 954          * @param int $codepoint
 955          * @return string
 956          * @private
 957          */
 958         static function decodeChar( $codepoint ) {
 959                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 960                         return codepointToUtf8( $codepoint );
 961                 } else {
 962                         return UTF8_REPLACEMENT;
 963                 }
 964         }
 965
 966         /**
 967          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 968          * return the UTF-8 encoding of that character. Otherwise, returns
 969          * pseudo-entity source (eg &foo;)
 970          *
 971          * @param string $name
 972          * @return string
 973          */
 974         static function decodeEntity( $name ) {
 975                 global $wgHtmlEntities;
 976                 if( isset( $wgHtmlEntities[$name] ) ) {
 977                         return codepointToUtf8( $wgHtmlEntities[$name] );
 978                 } else {
 979                         return "&$name;";
 980                 }
 981         }
 982
 983         /**
 984          * Fetch the whitelist of acceptable attributes for a given
 985          * element name.
 986          *
 987          * @param string $element
 988          * @return array
 989          */
 990         static function attributeWhitelist( $element ) {
 991                 static $list;
 992                 if( !isset( $list ) ) {
 993                         $list = Sanitizer::setupAttributeWhitelist();
 994                 }
 995                 return isset( $list[$element] )
 996                         ? $list[$element]
 997                         : array();
 998         }
 999
1000         /**
1001          * @todo Document it a bit
1002          * @return array
1003          */
1004         static function setupAttributeWhitelist() {
1005                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
1006                 $block = array_merge( $common, array( 'align' ) );
1007                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1008                 $tablecell = array( 'abbr',
1009                                     'axis',
1010                                     'headers',
1011                                     'scope',
1012                                     'rowspan',
1013                                     'colspan',
1014                                     'nowrap', # deprecated
1015                                     'width',  # deprecated
1016                                     'height', # deprecated
1017                                     'bgcolor' # deprecated
1018                                     );
1019
1020                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1021                 # See: http://www.w3.org/TR/html4/
1022                 $whitelist = array (
1023                         # 7.5.4
1024                         'div'        => $block,
1025                         'center'     => $common, # deprecated
1026                         'span'       => $block, # ??
1027
1028                         # 7.5.5
1029                         'h1'         => $block,
1030                         'h2'         => $block,
1031                         'h3'         => $block,
1032                         'h4'         => $block,
1033                         'h5'         => $block,
1034                         'h6'         => $block,
1035
1036                         # 7.5.6
1037                         # address
1038
1039                         # 8.2.4
1040                         # bdo
1041
1042                         # 9.2.1
1043                         'em'         => $common,
1044                         'strong'     => $common,
1045                         'cite'       => $common,
1046                         # dfn
1047                         'code'       => $common,
1048                         # samp
1049                         # kbd
1050                         'var'        => $common,
1051                         # abbr
1052                         # acronym
1053
1054                         # 9.2.2
1055                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1056                         # q
1057
1058                         # 9.2.3
1059                         'sub'        => $common,
1060                         'sup'        => $common,
1061
1062                         # 9.3.1
1063                         'p'          => $block,
1064
1065                         # 9.3.2
1066                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1067
1068                         # 9.3.4
1069                         'pre'        => array_merge( $common, array( 'width' ) ),
1070
1071                         # 9.4
1072                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1073                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1074
1075                         # 10.2
1076                         'ul'         => array_merge( $common, array( 'type' ) ),
1077                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1078                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1079
1080                         # 10.3
1081                         'dl'         => $common,
1082                         'dd'         => $common,
1083                         'dt'         => $common,
1084
1085                         # 11.2.1
1086                         'table'      => array_merge( $common,
1087                                                                 array( 'summary', 'width', 'border', 'frame',
1088                                                                                 'rules', 'cellspacing', 'cellpadding',
1089                                                                                 'align', 'bgcolor',
1090                                                                 ) ),
1091
1092                         # 11.2.2
1093                         'caption'    => array_merge( $common, array( 'align' ) ),
1094
1095                         # 11.2.3
1096                         'thead'      => array_merge( $common, $tablealign ),
1097                         'tfoot'      => array_merge( $common, $tablealign ),
1098                         'tbody'      => array_merge( $common, $tablealign ),
1099
1100                         # 11.2.4
1101                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1102                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1103
1104                         # 11.2.5
1105                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1106
1107                         # 11.2.6
1108                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1109                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1110
1111                         # 15.2.1
1112                         'tt'         => $common,
1113                         'b'          => $common,
1114                         'i'          => $common,
1115                         'big'        => $common,
1116                         'small'      => $common,
1117                         'strike'     => $common,
1118                         's'          => $common,
1119                         'u'          => $common,
1120
1121                         # 15.2.2
1122                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1123                         # basefont
1124
1125                         # 15.3
1126                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1127
1128                         # XHTML Ruby annotation text module, simple ruby only.
1129                         # http://www.w3c.org/TR/ruby/
1130                         'ruby'       => $common,
1131                         # rbc
1132                         # rtc
1133                         'rb'         => $common,
1134                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1135                         'rp'         => $common,
1136                         );
1137                 return $whitelist;
1138         }
1139
1140         /**
1141          * Take a fragment of (potentially invalid) HTML and return
1142          * a version with any tags removed, encoded suitably for literal
1143          * inclusion in an attribute value.
1144          *
1145          * @param string $text HTML fragment
1146          * @return string
1147          */
1148         static function stripAllTags( $text ) {
1149                 # Actual <tags>
1150                 $text = preg_replace( '/ < .*? > /x', '', $text );
1151
1152                 # Normalize &entities and whitespace
1153                 $text = Sanitizer::normalizeAttributeValue( $text );
1154
1155                 # Will be placed into "double-quoted" attributes,
1156                 # make sure remaining bits are safe.
1157                 $text = str_replace(
1158                         array('<', '>', '"'),
1159                         array('&lt;', '&gt;', '&quot;'),
1160                         $text );
1161
1162                 return $text;
1163         }
1164
1165         /**
1166          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1167          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1168          * PHP 5.1 doesn't.
1169          *
1170          * Use for passing XHTML fragments to PHP's XML parsing functions
1171          *
1172          * @return string
1173          * @static
1174          */
1175         static function hackDocType() {
1176                 global $wgHtmlEntities;
1177                 $out = "<!DOCTYPE html [\n";
1178                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1179                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1180                 }
1181                 $out .= "]>\n";
1182                 return $out;
1183         }
1184
1185         static function cleanUrl( $url, $hostname=true ) {
1186                 # Normalize any HTML entities in input. They will be
1187                 # re-escaped by makeExternalLink().
1188                 $url = Sanitizer::decodeCharReferences( $url );
1189
1190                 # Escape any control characters introduced by the above step
1191                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1192
1193                 # Validate hostname portion
1194                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1195                         list( $whole, $protocol, $host, $rest ) = $matches;
1196
1197                         // Characters that will be ignored in IDNs.
1198                         // http://tools.ietf.org/html/3454#section-3.1
1199                         // Strip them before further processing so blacklists and such work.
1200                         $strip = "/
1201                                 \\s|          # general whitespace
1202                                 \xc2\xad|     # 00ad SOFT HYPHEN
1203                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1204                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1205                                 \xe2\x81\xa0| # 2060 WORD JOINER
1206                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1207                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1208                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1209                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1210                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1211                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1212                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1213                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1214                                 /xuD";
1215
1216                         $host = preg_replace( $strip, '', $host );
1217
1218                         // @fixme: validate hostnames here
1219
1220                         return $protocol . $host . $rest;
1221                 } else {
1222                         return $url;
1223                 }
1224         }
1225
1226 }
1227
1228 ?>