includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
 *
 * The pattern is written in extended (/x) mode, so the line breaks and
 * indentation inside the literal are not part of what is matched.
 * Exactly one of the five capture groups is set for any match:
 *   1. name of a named reference           (&name;)
 *   2. digits of a decimal reference       (&#123;)
 *   3. digits of a lowercase-x hex ref     (&#x7b;)
 *   4. digits of an uppercase-X hex ref    (&#X7B;)
 *   5. a bare ampersand that starts none of the above
 *
 * Note that groups 3 and 4 accept any ASCII letter or digit, not only
 * hexadecimal digits, so the consumer has to validate the captured text.
 */
define( 'MW_CHAR_REFS_REGEX',
        '/&([A-Za-z0-9]+);
         |&\#([0-9]+);
         |&\#x([0-9A-Za-z]+);
         |&\#X([0-9A-Za-z]+);
         |(&)/x' );
  37
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * The pattern is written in extended (/x) mode with /s. Capture groups:
 *   1. attribute name (letters and digits only)
 *   2. the whole optional "= value" part, including surrounding whitespace
 *   3. value given in double quotes (may not contain '<' or '"')
 *   4. value given in single quotes (may not contain '<' or "'")
 *   5. unquoted value
 *   6. unquoted value of the form #hex-digits
 * An attribute only matches when it is preceded by whitespace or the start
 * of the string and followed by whitespace or the end of the string.
 */
// Character class for one character of an attribute name.
$attrib = '[A-Za-z0-9]';
// Character class for one whitespace character: tab, LF, CR or space.
$space = '[\x09\x0a\x0d\x20]';
// $attrib and $space are interpolated into the double-quoted pattern below.
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Maps each entity name (case-sensitive, without the leading '&' and the
 * trailing ';') to the decimal code point of the character it stands for.
 * Entries are kept in alphabetical order, ignoring case.
 *
 * @private
 */
// The global statement keeps the table in the global scope even when this
// file is included from inside a function.
global $wgHtmlEntities;
$wgHtmlEntities = array(
        'Aacute'   => 193,
        'aacute'   => 225,
        'Acirc'    => 194,
        'acirc'    => 226,
        'acute'    => 180,
        'AElig'    => 198,
        'aelig'    => 230,
        'Agrave'   => 192,
        'agrave'   => 224,
        'alefsym'  => 8501,
        'Alpha'    => 913,
        'alpha'    => 945,
        'amp'      => 38,
        'and'      => 8743,
        'ang'      => 8736,
        'Aring'    => 197,
        'aring'    => 229,
        'asymp'    => 8776,
        'Atilde'   => 195,
        'atilde'   => 227,
        'Auml'     => 196,
        'auml'     => 228,
        'bdquo'    => 8222,
        'Beta'     => 914,
        'beta'     => 946,
        'brvbar'   => 166,
        'bull'     => 8226,
        'cap'      => 8745,
        'Ccedil'   => 199,
        'ccedil'   => 231,
        'cedil'    => 184,
        'cent'     => 162,
        'Chi'      => 935,
        'chi'      => 967,
        'circ'     => 710,
        'clubs'    => 9827,
        'cong'     => 8773,
        'copy'     => 169,
        'crarr'    => 8629,
        'cup'      => 8746,
        'curren'   => 164,
        'dagger'   => 8224,
        'Dagger'   => 8225,
        'darr'     => 8595,
        'dArr'     => 8659,
        'deg'      => 176,
        'Delta'    => 916,
        'delta'    => 948,
        'diams'    => 9830,
        'divide'   => 247,
        'Eacute'   => 201,
        'eacute'   => 233,
        'Ecirc'    => 202,
        'ecirc'    => 234,
        'Egrave'   => 200,
        'egrave'   => 232,
        'empty'    => 8709,
        'emsp'     => 8195,
        'ensp'     => 8194,
        'Epsilon'  => 917,
        'epsilon'  => 949,
        'equiv'    => 8801,
        'Eta'      => 919,
        'eta'      => 951,
        'ETH'      => 208,
        'eth'      => 240,
        'Euml'     => 203,
        'euml'     => 235,
        'euro'     => 8364,
        'exist'    => 8707,
        'fnof'     => 402,
        'forall'   => 8704,
        'frac12'   => 189,
        'frac14'   => 188,
        'frac34'   => 190,
        'frasl'    => 8260,
        'Gamma'    => 915,
        'gamma'    => 947,
        'ge'       => 8805,
        'gt'       => 62,
        'harr'     => 8596,
        'hArr'     => 8660,
        'hearts'   => 9829,
        'hellip'   => 8230,
        'Iacute'   => 205,
        'iacute'   => 237,
        'Icirc'    => 206,
        'icirc'    => 238,
        'iexcl'    => 161,
        'Igrave'   => 204,
        'igrave'   => 236,
        'image'    => 8465,
        'infin'    => 8734,
        'int'      => 8747,
        'Iota'     => 921,
        'iota'     => 953,
        'iquest'   => 191,
        'isin'     => 8712,
        'Iuml'     => 207,
        'iuml'     => 239,
        'Kappa'    => 922,
        'kappa'    => 954,
        'Lambda'   => 923,
        'lambda'   => 955,
        'lang'     => 9001,
        'laquo'    => 171,
        'larr'     => 8592,
        'lArr'     => 8656,
        'lceil'    => 8968,
        'ldquo'    => 8220,
        'le'       => 8804,
        'lfloor'   => 8970,
        'lowast'   => 8727,
        'loz'      => 9674,
        'lrm'      => 8206,
        'lsaquo'   => 8249,
        'lsquo'    => 8216,
        'lt'       => 60,
        'macr'     => 175,
        'mdash'    => 8212,
        'micro'    => 181,
        'middot'   => 183,
        'minus'    => 8722,
        'Mu'       => 924,
        'mu'       => 956,
        'nabla'    => 8711,
        'nbsp'     => 160,
        'ndash'    => 8211,
        'ne'       => 8800,
        'ni'       => 8715,
        'not'      => 172,
        'notin'    => 8713,
        'nsub'     => 8836,
        'Ntilde'   => 209,
        'ntilde'   => 241,
        'Nu'       => 925,
        'nu'       => 957,
        'Oacute'   => 211,
        'oacute'   => 243,
        'Ocirc'    => 212,
        'ocirc'    => 244,
        'OElig'    => 338,
        'oelig'    => 339,
        'Ograve'   => 210,
        'ograve'   => 242,
        'oline'    => 8254,
        'Omega'    => 937,
        'omega'    => 969,
        'Omicron'  => 927,
        'omicron'  => 959,
        'oplus'    => 8853,
        'or'       => 8744,
        'ordf'     => 170,
        'ordm'     => 186,
        'Oslash'   => 216,
        'oslash'   => 248,
        'Otilde'   => 213,
        'otilde'   => 245,
        'otimes'   => 8855,
        'Ouml'     => 214,
        'ouml'     => 246,
        'para'     => 182,
        'part'     => 8706,
        'permil'   => 8240,
        'perp'     => 8869,
        'Phi'      => 934,
        'phi'      => 966,
        'Pi'       => 928,
        'pi'       => 960,
        'piv'      => 982,
        'plusmn'   => 177,
        'pound'    => 163,
        'prime'    => 8242,
        'Prime'    => 8243,
        'prod'     => 8719,
        'prop'     => 8733,
        'Psi'      => 936,
        'psi'      => 968,
        'quot'     => 34,
        'radic'    => 8730,
        'rang'     => 9002,
        'raquo'    => 187,
        'rarr'     => 8594,
        'rArr'     => 8658,
        'rceil'    => 8969,
        'rdquo'    => 8221,
        'real'     => 8476,
        'reg'      => 174,
        'rfloor'   => 8971,
        'Rho'      => 929,
        'rho'      => 961,
        'rlm'      => 8207,
        'rsaquo'   => 8250,
        'rsquo'    => 8217,
        'sbquo'    => 8218,
        'Scaron'   => 352,
        'scaron'   => 353,
        'sdot'     => 8901,
        'sect'     => 167,
        'shy'      => 173,
        'Sigma'    => 931,
        'sigma'    => 963,
        'sigmaf'   => 962,
        'sim'      => 8764,
        'spades'   => 9824,
        'sub'      => 8834,
        'sube'     => 8838,
        'sum'      => 8721,
        'sup'      => 8835,
        'sup1'     => 185,
        'sup2'     => 178,
        'sup3'     => 179,
        'supe'     => 8839,
        'szlig'    => 223,
        'Tau'      => 932,
        'tau'      => 964,
        'there4'   => 8756,
        'Theta'    => 920,
        'theta'    => 952,
        'thetasym' => 977,
        'thinsp'   => 8201,
        'THORN'    => 222,
        'thorn'    => 254,
        'tilde'    => 732,
        'times'    => 215,
        'trade'    => 8482,
        'Uacute'   => 218,
        'uacute'   => 250,
        'uarr'     => 8593,
        'uArr'     => 8657,
        'Ucirc'    => 219,
        'ucirc'    => 251,
        'Ugrave'   => 217,
        'ugrave'   => 249,
        'uml'      => 168,
        'upsih'    => 978,
        'Upsilon'  => 933,
        'upsilon'  => 965,
        'Uuml'     => 220,
        'uuml'     => 252,
        'weierp'   => 8472,
        'Xi'       => 926,
        'xi'       => 958,
        'Yacute'   => 221,
        'yacute'   => 253,
        'yen'      => 165,
        'Yuml'     => 376,
        'yuml'     => 255,
        'Zeta'     => 918,
        'zeta'     => 950,
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr',
 355                         );
 356                         $htmllist = array( # Tags used by list
 357                                 'ul','ol',
 358                         );
 359                         $listtags = array( # Tags that can appear in a list
 360                                 'li',
 361                         );
 362
 363                 } else {
 364                         $htmlpairs = array();
 365                         $htmlsingle = array();
 366                         $htmlnest = array();
 367                         $tabletags = array();
 368                 }
 369
 370                 $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 371                 $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 372
 373                 # Remove HTML comments
 374                 $text = Sanitizer::removeHTMLcomments( $text );
 375                 $bits = explode( '<', $text );
 376                 $text = array_shift( $bits );
 377                 if(!$wgUseTidy) {
 378                         $tagstack = array(); $tablestack = array();
 379                         foreach ( $bits as $x ) {
 380                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 381                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 382                                 $x, $regs );
 383                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 384                                 error_reporting( $prev );
 385
 386                                 $badtag = 0 ;
 387                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 388                                         # Check our stack
 389                                         if ( $slash ) {
 390                                                 # Closing a tag...
 391                                                 if( in_array( $t, $htmlsingleonly ) ) {
 392                                                         $badtag = 1;
 393                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 394                                                         if ( in_array($ot, $htmlsingleallowed) ) {
 395                                                                 # Pop all elements with an optional close tag
 396                                                                 # and see if we find a match below them
 397                                                                 $optstack = array();
 398                                                                 array_push ($optstack, $ot);
 399                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 400                                                                                                 in_array($ot, $htmlsingleallowed) ) {
 401                                                                         array_push ($optstack, $ot);
 402                                                                 }
 403                                                                 if ( $t != $ot ) {
 404                                                                         # No match. Push the optinal elements back again
 405                                                                         $badtag = 1;
 406                                                                         while ( $ot = @array_pop( $optstack ) ) {
 407                                                                                 array_push( $tagstack, $ot );
 408                                                                         }
 409                                                                 }
 410                                                         } else {
 411                                                                 @array_push( $tagstack, $ot );
 412                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 413                                                                 if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
 414                                                                         $badtag = 1;
 415                                                                 }
 416                                                         }
 417                                                 } else {
 418                                                         if ( $t == 'table' ) {
 419                                                                 $tagstack = array_pop( $tablestack );
 420                                                         }
 421                                                 }
 422                                                 $newparams = '';
 423                                         } else {
 424                                                 # Keep track for later
 425                                                 if ( in_array( $t, $tabletags ) &&
 426                                                 ! in_array( 'table', $tagstack ) ) {
 427                                                         $badtag = 1;
 428                                                 } else if ( in_array( $t, $tagstack ) &&
 429                                                 ! in_array ( $t , $htmlnest ) ) {
 430                                                         $badtag = 1 ;
 431                                                 # Is it a self closed htmlpair ? (bug 5487)
 432                                                 } else if( $brace == '/>' &&
 433                                                 in_array($t, $htmlpairs) ) {
 434                                                         $badtag = 1;
 435                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 436                                                         # Hack to force empty tag for uncloseable elements
 437                                                         $brace = '/>';
 438                                                 } else if( in_array( $t, $htmlsingle ) ) {
 439                                                         # Hack to not close $htmlsingle tags
 440                                                         $brace = NULL;
 441                                                 } else {
 442                                                         if ( $t == 'table' ) {
 443                                                                 array_push( $tablestack, $tagstack );
 444                                                                 $tagstack = array();
 445                                                         }
 446                                                         array_push( $tagstack, $t );
 447                                                 }
 448
 449                                                 # Replace any variables or template parameters with
 450                                                 # plaintext results.
 451                                                 if( is_callable( $processCallback ) ) {
 452                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 453                                                 }
 454
 455                                                 # Strip non-approved attributes from the tag
 456                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 457                                         }
 458                                         if ( ! $badtag ) {
 459                                                 $rest = str_replace( '>', '&gt;', $rest );
 460                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 461                                                 $text .= "<$slash$t$newparams$close>$rest";
 462                                                 continue;
 463                                         }
 464                                 }
 465                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 466                         }
 467                         # Close off any remaining tags
 468                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 469                                 $text .= "</$t>\n";
 470                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 471                         }
 472                 } else {
 473                         # this might be possible using tidy itself
 474                         foreach ( $bits as $x ) {
 475                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 476                                 $x, $regs );
 477                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 478                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 479                                         if( is_callable( $processCallback ) ) {
 480                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 481                                         }
 482                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 483                                         $rest = str_replace( '>', '&gt;', $rest );
 484                                         $text .= "<$slash$t$newparams$brace$rest";
 485                                 } else {
 486                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 487                                 }
 488                         }
 489                 }
 490                 wfProfileOut( $fname );
 491                 return $text;
 492         }
 493
 494         /**
 495          * Remove '<!--', '-->', and everything between.
 496          * To avoid leaving blank lines, when a comment is both preceded
 497          * and followed by a newline (ignoring spaces), trim leading and
 498          * trailing spaces and one of the newlines.
 499          *
 500          * @private
 501          * @param string $text
 502          * @return string
 503          */
 504         function removeHTMLcomments( $text ) {
 505                 $fname='Parser::removeHTMLcomments';
 506                 wfProfileIn( $fname );
 507                 while (($start = strpos($text, '<!--')) !== false) {
 508                         $end = strpos($text, '-->', $start + 4);
 509                         if ($end === false) {
 510                                 # Unterminated comment; bail out
 511                                 break;
 512                         }
 513
 514                         $end += 3;
 515
 516                         # Trim space and newline if the comment is both
 517                         # preceded and followed by a newline
 518                         $spaceStart = max($start - 1, 0);
 519                         $spaceLen = $end - $spaceStart;
 520                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 521                                 $spaceStart--;
 522                                 $spaceLen++;
 523                         }
 524                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 525                                 $spaceLen++;
 526                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 527                                 # Remove the comment, leading and trailing
 528                                 # spaces, and leave only one newline.
 529                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 530                         }
 531                         else {
 532                                 # Remove just the comment.
 533                                 $text = substr_replace($text, '', $start, $end - $start);
 534                         }
 535                 }
 536                 wfProfileOut( $fname );
 537                 return $text;
 538         }
 539
 540         /**
 541          * Take an array of attribute names and values and normalize or discard
 542          * illegal values for the given element type.
 543          *
 544          * - Discards attributes not on a whitelist for the given element
 545          * - Unsafe style attributes are discarded
 546          *
 547          * @param array $attribs
 548          * @param string $element
 549          * @return array
 550          *
 551          * @todo Check for legal values where the DTD limits things.
 552          * @todo Check for unique id attribute :P
 553          */
 554         function validateTagAttributes( $attribs, $element ) {
 555                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 556                 $out = array();
 557                 foreach( $attribs as $attribute => $value ) {
 558                         if( !isset( $whitelist[$attribute] ) ) {
 559                                 continue;
 560                         }
 561                         # Strip javascript "expression" from stylesheets.
 562                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 563                         if( $attribute == 'style' ) {
 564                                 $stripped = Sanitizer::decodeCharReferences( $value );
 565
 566                                 // Remove any comments; IE gets token splitting wrong
 567                                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 568                                 $value = $stripped;
 569
 570                                 // ... and continue checks
 571                                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 572                                         'codepointToUtf8(hexdec("$1"))', $stripped );
 573                                 $stripped = str_replace( '\\', '', $stripped );
 574                                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 575                                                 $stripped ) ) {
 576                                         # haxx0r
 577                                         continue;
 578                                 }
 579                         }
 580
 581                         if ( $attribute === 'id' )
 582                                 $value = Sanitizer::escapeId( $value );
 583
 584                         // If this attribute was previously set, override it.
 585                         // Output should only have one attribute of each name.
 586                         $out[$attribute] = $value;
 587                 }
 588                 return $out;
 589         }
 590
 591         /**
 592          * Take a tag soup fragment listing an HTML element's attributes
 593          * and normalize it to well-formed XML, discarding unwanted attributes.
 594          * Output is safe for further wikitext processing, with escaping of
 595          * values that could trigger problems.
 596          *
 597          * - Normalizes attribute names to lowercase
 598          * - Discards attributes not on a whitelist for the given element
 599          * - Turns broken or invalid entities into plaintext
 600          * - Double-quotes all attribute values
 601          * - Attributes without values are given the name as attribute
 602          * - Double attributes are discarded
 603          * - Unsafe style attributes are discarded
 604          * - Prepends space if there are attributes.
 605          *
 606          * @param string $text
 607          * @param string $element
 608          * @return string
 609          */
 610         function fixTagAttributes( $text, $element ) {
 611                 if( trim( $text ) == '' ) {
 612                         return '';
 613                 }
 614
 615                 $stripped = Sanitizer::validateTagAttributes(
 616                         Sanitizer::decodeTagAttributes( $text ), $element );
 617
 618                 $attribs = array();
 619                 foreach( $stripped as $attribute => $value ) {
 620                         $encAttribute = htmlspecialchars( $attribute );
 621                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 622
 623                         $attribs[] = "$encAttribute=\"$encValue\"";
 624                 }
 625                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 626         }
 627
 628         /**
 629          * Encode an attribute value for HTML output.
 630          * @param $text
 631          * @return HTML-encoded text fragment
 632          */
 633         function encodeAttribute( $text ) {
 634                 $encValue = htmlspecialchars( $text );
 635
 636                 // Whitespace is normalized during attribute decoding,
 637                 // so if we've been passed non-spaces we must encode them
 638                 // ahead of time or they won't be preserved.
 639                 $encValue = strtr( $encValue, array(
 640                         "\n" => '&#10;',
 641                         "\r" => '&#13;',
 642                         "\t" => '&#9;',
 643                 ) );
 644
 645                 return $encValue;
 646         }
 647
 648         /**
 649          * Encode an attribute value for HTML tags, with extra armoring
 650          * against further wiki processing.
 651          * @param $text
 652          * @return HTML-encoded text fragment
 653          */
 654         function safeEncodeAttribute( $text ) {
 655                 $encValue = Sanitizer::encodeAttribute( $text );
 656
 657                 # Templates and links may be expanded in later parsing,
 658                 # creating invalid or dangerous output. Suppress this.
 659                 $encValue = strtr( $encValue, array(
 660                         '<'    => '&lt;',   // This should never happen,
 661                         '>'    => '&gt;',   // we've received invalid input
 662                         '"'    => '&quot;', // which should have been escaped.
 663                         '{'    => '&#123;',
 664                         '['    => '&#91;',
 665                         "''"   => '&#39;&#39;',
 666                         'ISBN' => '&#73;SBN',
 667                         'RFC'  => '&#82;FC',
 668                         'PMID' => '&#80;MID',
 669                         '|'    => '&#124;',
 670                         '__'   => '&#95;_',
 671                 ) );
 672
 673                 # Stupid hack
 674                 $encValue = preg_replace_callback(
 675                         '/(' . wfUrlProtocols() . ')/',
 676                         array( 'Sanitizer', 'armorLinksCallback' ),
 677                         $encValue );
 678                 return $encValue;
 679         }
 680
 681         /**
 682          * Given a value escape it so that it can be used in an id attribute and
 683          * return it, this does not validate the value however (see first link)
 684          *
 685          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 686          *                                                          in the id and
 687          *                                                          name attributes
 688          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 689          *
 690          * @bug 4461
 691          *
 692          * @static
 693          *
 694          * @param string $id
 695          * @return string
 696          */
 697         function escapeId( $id ) {
 698                 static $replace = array(
 699                         '%3A' => ':',
 700                         '%' => '.'
 701                 );
 702
 703                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 704
 705                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 706         }
 707
 708         /**
 709          * Regex replace callback for armoring links against further processing.
 710          * @param array $matches
 711          * @return string
 712          * @private
 713          */
 714         function armorLinksCallback( $matches ) {
 715                 return str_replace( ':', '&#58;', $matches[1] );
 716         }
 717
 718         /**
 719          * Return an associative array of attribute names and values from
 720          * a partial tag string. Attribute names are forces to lowercase,
 721          * character references are decoded to UTF-8 text.
 722          *
 723          * @param string
 724          * @return array
 725          */
 726         function decodeTagAttributes( $text ) {
 727                 $attribs = array();
 728
 729                 if( trim( $text ) == '' ) {
 730                         return $attribs;
 731                 }
 732
 733                 $pairs = array();
 734                 if( !preg_match_all(
 735                         MW_ATTRIBS_REGEX,
 736                         $text,
 737                         $pairs,
 738                         PREG_SET_ORDER ) ) {
 739                         return $attribs;
 740                 }
 741
 742                 foreach( $pairs as $set ) {
 743                         $attribute = strtolower( $set[1] );
 744                         $value = Sanitizer::getTagAttributeCallback( $set );
 745
 746                         // Normalize whitespace
 747                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 748                         $value = trim( $value );
 749
 750                         // Decode character references
 751                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 752                 }
 753                 return $attribs;
 754         }
 755
 756         /**
 757          * Pick the appropriate attribute value from a match set from the
 758          * MW_ATTRIBS_REGEX matches.
 759          *
 760          * @param array $set
 761          * @return string
 762          * @private
 763          */
 764         function getTagAttributeCallback( $set ) {
 765                 if( isset( $set[6] ) ) {
 766                         # Illegal #XXXXXX color with no quotes.
 767                         return $set[6];
 768                 } elseif( isset( $set[5] ) ) {
 769                         # No quotes.
 770                         return $set[5];
 771                 } elseif( isset( $set[4] ) ) {
 772                         # Single-quoted
 773                         return $set[4];
 774                 } elseif( isset( $set[3] ) ) {
 775                         # Double-quoted
 776                         return $set[3];
 777                 } elseif( !isset( $set[2] ) ) {
 778                         # In XHTML, attributes must have a value.
 779                         # For 'reduced' form, return explicitly the attribute name here.
 780                         return $set[1];
 781                 } else {
 782                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 783                 }
 784         }
 785
 786         /**
 787          * Normalize whitespace and character references in an XML source-
 788          * encoded text for an attribute value.
 789          *
 790          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 791          * but note that we're not returning the value, but are returning
 792          * XML source fragments that will be slapped into output.
 793          *
 794          * @param string $text
 795          * @return string
 796          * @private
 797          */
 798         function normalizeAttributeValue( $text ) {
 799                 return str_replace( '"', '&quot;',
 800                         preg_replace(
 801                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 802                                 ' ',
 803                                 Sanitizer::normalizeCharReferences( $text ) ) );
 804         }
 805
 806         /**
 807          * Ensure that any entities and character references are legal
 808          * for XML and XHTML specifically. Any stray bits will be
 809          * &amp;-escaped to result in a valid text fragment.
 810          *
 811          * a. any named char refs must be known in XHTML
 812          * b. any numeric char refs must be legal chars, not invalid or forbidden
 813          * c. use &#x, not &#X
 814          * d. fix or reject non-valid attributes
 815          *
 816          * @param string $text
 817          * @return string
 818          * @private
 819          */
 820         function normalizeCharReferences( $text ) {
 821                 return preg_replace_callback(
 822                         MW_CHAR_REFS_REGEX,
 823                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 824                         $text );
 825         }
 826         /**
 827          * @param string $matches
 828          * @return string
 829          */
 830         function normalizeCharReferencesCallback( $matches ) {
 831                 $ret = null;
 832                 if( $matches[1] != '' ) {
 833                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 834                 } elseif( $matches[2] != '' ) {
 835                         $ret = Sanitizer::decCharReference( $matches[2] );
 836                 } elseif( $matches[3] != ''  ) {
 837                         $ret = Sanitizer::hexCharReference( $matches[3] );
 838                 } elseif( $matches[4] != '' ) {
 839                         $ret = Sanitizer::hexCharReference( $matches[4] );
 840                 }
 841                 if( is_null( $ret ) ) {
 842                         return htmlspecialchars( $matches[0] );
 843                 } else {
 844                         return $ret;
 845                 }
 846         }
 847
 848         /**
 849          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 850          * return the named entity reference as is. Otherwise, returns
 851          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 852          *
 853          * @param string $name
 854          * @return string
 855          */
 856         function normalizeEntity( $name ) {
 857                 global $wgHtmlEntities;
 858                 if( isset( $wgHtmlEntities[$name] ) ) {
 859                         return "&$name;";
 860                 } else {
 861                         return "&amp;$name;";
 862                 }
 863         }
 864
 865         function decCharReference( $codepoint ) {
 866                 $point = intval( $codepoint );
 867                 if( Sanitizer::validateCodepoint( $point ) ) {
 868                         return sprintf( '&#%d;', $point );
 869                 } else {
 870                         return null;
 871                 }
 872         }
 873
 874         function hexCharReference( $codepoint ) {
 875                 $point = hexdec( $codepoint );
 876                 if( Sanitizer::validateCodepoint( $point ) ) {
 877                         return sprintf( '&#x%x;', $point );
 878                 } else {
 879                         return null;
 880                 }
 881         }
 882
 883         /**
 884          * Returns true if a given Unicode codepoint is a valid character in XML.
 885          * @param int $codepoint
 886          * @return bool
 887          */
 888         function validateCodepoint( $codepoint ) {
 889                 return ($codepoint ==    0x09)
 890                         || ($codepoint ==    0x0a)
 891                         || ($codepoint ==    0x0d)
 892                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 893                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 894                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 895         }
 896
 897         /**
 898          * Decode any character references, numeric or named entities,
 899          * in the text and return a UTF-8 string.
 900          *
 901          * @param string $text
 902          * @return string
 903          * @public
 904          */
 905         function decodeCharReferences( $text ) {
 906                 return preg_replace_callback(
 907                         MW_CHAR_REFS_REGEX,
 908                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 909                         $text );
 910         }
 911
 912         /**
 913          * @param string $matches
 914          * @return string
 915          */
 916         function decodeCharReferencesCallback( $matches ) {
 917                 if( $matches[1] != '' ) {
 918                         return Sanitizer::decodeEntity( $matches[1] );
 919                 } elseif( $matches[2] != '' ) {
 920                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 921                 } elseif( $matches[3] != ''  ) {
 922                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 923                 } elseif( $matches[4] != '' ) {
 924                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 925                 }
 926                 # Last case should be an ampersand by itself
 927                 return $matches[0];
 928         }
 929
 930         /**
 931          * Return UTF-8 string for a codepoint if that is a valid
 932          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 933          * @param int $codepoint
 934          * @return string
 935          * @private
 936          */
 937         function decodeChar( $codepoint ) {
 938                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 939                         return codepointToUtf8( $codepoint );
 940                 } else {
 941                         return UTF8_REPLACEMENT;
 942                 }
 943         }
 944
 945         /**
 946          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 947          * return the UTF-8 encoding of that character. Otherwise, returns
 948          * pseudo-entity source (eg &foo;)
 949          *
 950          * @param string $name
 951          * @return string
 952          */
 953         function decodeEntity( $name ) {
 954                 global $wgHtmlEntities;
 955                 if( isset( $wgHtmlEntities[$name] ) ) {
 956                         return codepointToUtf8( $wgHtmlEntities[$name] );
 957                 } else {
 958                         return "&$name;";
 959                 }
 960         }
 961
 962         /**
 963          * Fetch the whitelist of acceptable attributes for a given
 964          * element name.
 965          *
 966          * @param string $element
 967          * @return array
 968          */
 969         function attributeWhitelist( $element ) {
 970                 static $list;
 971                 if( !isset( $list ) ) {
 972                         $list = Sanitizer::setupAttributeWhitelist();
 973                 }
 974                 return isset( $list[$element] )
 975                         ? $list[$element]
 976                         : array();
 977         }
 978
 979         /**
 980          * @return array
 981          */
 982         function setupAttributeWhitelist() {
 983                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 984                 $block = array_merge( $common, array( 'align' ) );
 985                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 986                 $tablecell = array( 'abbr',
 987                                     'axis',
 988                                     'headers',
 989                                     'scope',
 990                                     'rowspan',
 991                                     'colspan',
 992                                     'nowrap', # deprecated
 993                                     'width',  # deprecated
 994                                     'height', # deprecated
 995                                     'bgcolor' # deprecated
 996                                     );
 997
 998                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 999                 # See: http://www.w3.org/TR/html4/
1000                 $whitelist = array (
1001                         # 7.5.4
1002                         'div'        => $block,
1003                         'center'     => $common, # deprecated
1004                         'span'       => $block, # ??
1005
1006                         # 7.5.5
1007                         'h1'         => $block,
1008                         'h2'         => $block,
1009                         'h3'         => $block,
1010                         'h4'         => $block,
1011                         'h5'         => $block,
1012                         'h6'         => $block,
1013
1014                         # 7.5.6
1015                         # address
1016
1017                         # 8.2.4
1018                         # bdo
1019
1020                         # 9.2.1
1021                         'em'         => $common,
1022                         'strong'     => $common,
1023                         'cite'       => $common,
1024                         # dfn
1025                         'code'       => $common,
1026                         # samp
1027                         # kbd
1028                         'var'        => $common,
1029                         # abbr
1030                         # acronym
1031
1032                         # 9.2.2
1033                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1034                         # q
1035
1036                         # 9.2.3
1037                         'sub'        => $common,
1038                         'sup'        => $common,
1039
1040                         # 9.3.1
1041                         'p'          => $block,
1042
1043                         # 9.3.2
1044                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1045
1046                         # 9.3.4
1047                         'pre'        => array_merge( $common, array( 'width' ) ),
1048
1049                         # 9.4
1050                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1051                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1052
1053                         # 10.2
1054                         'ul'         => array_merge( $common, array( 'type' ) ),
1055                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1056                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1057
1058                         # 10.3
1059                         'dl'         => $common,
1060                         'dd'         => $common,
1061                         'dt'         => $common,
1062
1063                         # 11.2.1
1064                         'table'      => array_merge( $common,
1065                                                                 array( 'summary', 'width', 'border', 'frame',
1066                                                                                          'rules', 'cellspacing', 'cellpadding',
1067                                                                                          'align', 'bgcolor', 'frame', 'rules',
1068                                                                                          'border' ) ),
1069
1070                         # 11.2.2
1071                         'caption'    => array_merge( $common, array( 'align' ) ),
1072
1073                         # 11.2.3
1074                         'thead'      => array_merge( $common, $tablealign ),
1075                         'tfoot'      => array_merge( $common, $tablealign ),
1076                         'tbody'      => array_merge( $common, $tablealign ),
1077
1078                         # 11.2.4
1079                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1080                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1081
1082                         # 11.2.5
1083                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1084
1085                         # 11.2.6
1086                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1087                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1088
1089                         # 15.2.1
1090                         'tt'         => $common,
1091                         'b'          => $common,
1092                         'i'          => $common,
1093                         'big'        => $common,
1094                         'small'      => $common,
1095                         'strike'     => $common,
1096                         's'          => $common,
1097                         'u'          => $common,
1098
1099                         # 15.2.2
1100                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1101                         # basefont
1102
1103                         # 15.3
1104                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1105
1106                         # XHTML Ruby annotation text module, simple ruby only.
1107                         # http://www.w3c.org/TR/ruby/
1108                         'ruby'       => $common,
1109                         # rbc
1110                         # rtc
1111                         'rb'         => $common,
1112                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1113                         'rp'         => $common,
1114                         );
1115                 return $whitelist;
1116         }
1117
1118         /**
1119          * Take a fragment of (potentially invalid) HTML and return
1120          * a version with any tags removed, encoded suitably for literal
1121          * inclusion in an attribute value.
1122          *
1123          * @param string $text HTML fragment
1124          * @return string
1125          */
1126         function stripAllTags( $text ) {
1127                 # Actual <tags>
1128                 $text = preg_replace( '/ < .*? > /x', '', $text );
1129
1130                 # Normalize &entities and whitespace
1131                 $text = Sanitizer::normalizeAttributeValue( $text );
1132
1133                 # Will be placed into "double-quoted" attributes,
1134                 # make sure remaining bits are safe.
1135                 $text = str_replace(
1136                         array('<', '>', '"'),
1137                         array('&lt;', '&gt;', '&quot;'),
1138                         $text );
1139
1140                 return $text;
1141         }
1142
1143         /**
1144          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1145          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1146          * PHP 5.1 doesn't.
1147          *
1148          * Use for passing XHTML fragments to PHP's XML parsing functions
1149          *
1150          * @return string
1151          * @static
1152          */
1153         function hackDocType() {
1154                 global $wgHtmlEntities;
1155                 $out = "<!DOCTYPE html [\n";
1156                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1157                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1158                 }
1159                 $out .= "]>\n";
1160                 return $out;
1161         }
1162
1163 }
1164
1165 ?>