includes/Sanitizer.php

   1 <?php
   2
   3 /**
   4  * (X)HTML sanitizer for MediaWiki
   5  *
   6  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @package MediaWiki
  25  * @subpackage Parser
  26  */
  27
  28 /**
  29  * Regular expression to match various types of character references in
  30  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  31  */
  32 define( 'MW_CHAR_REFS_REGEX',
  33         '/&([A-Za-z0-9]+);
  34          |&\#([0-9]+);
  35          |&\#x([0-9A-Za-z]+);
  36          |&\#X([0-9A-Za-z]+);
  37          |(&)/x' );
  38
  39 /**
  40  * Regular expression to match HTML/XML attribute pairs within a tag.
  41  * Allows some... latitude.
  42  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  43  */
# Character class for one character of an attribute name (ASCII letters
# and digits only).
$attrib = '[A-Za-z0-9]';
# Character class for a single whitespace character: tab, LF, CR or space.
$space = '[\x09\x0a\x0d\x20]';
# Capture groups of the pattern defined below:
#   1 - attribute name
#   2 - optional "=value" part, including any whitespace around the "="
#   3 - value found between double quotes
#   4 - value found between single quotes
#   5 - unquoted value
#   6 - unquoted "#" followed by hexadecimal digits
# An attribute must be preceded by the start of the string or whitespace,
# and followed by whitespace or the end of the string. The "x" modifier
# makes PCRE ignore the whitespace and the "#" comments embedded in the
# pattern text, so the layout inside the string is purely cosmetic.
# NOTE(review): the character class of group 5 already contains "#", so
# the group 6 alternative looks unreachable - confirm before relying on it.
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  59
  60 /**
  61  * List of all named character entities defined in HTML 4.01
  62  * http://www.w3.org/TR/html4/sgml/entities.html
  63  * @access private
  64  */
  65 global $wgHtmlEntities;
  66 $wgHtmlEntities = array(
  67         'Aacute'   => 193,
  68         'aacute'   => 225,
  69         'Acirc'    => 194,
  70         'acirc'    => 226,
  71         'acute'    => 180,
  72         'AElig'    => 198,
  73         'aelig'    => 230,
  74         'Agrave'   => 192,
  75         'agrave'   => 224,
  76         'alefsym'  => 8501,
  77         'Alpha'    => 913,
  78         'alpha'    => 945,
  79         'amp'      => 38,
  80         'and'      => 8743,
  81         'ang'      => 8736,
  82         'Aring'    => 197,
  83         'aring'    => 229,
  84         'asymp'    => 8776,
  85         'Atilde'   => 195,
  86         'atilde'   => 227,
  87         'Auml'     => 196,
  88         'auml'     => 228,
  89         'bdquo'    => 8222,
  90         'Beta'     => 914,
  91         'beta'     => 946,
  92         'brvbar'   => 166,
  93         'bull'     => 8226,
  94         'cap'      => 8745,
  95         'Ccedil'   => 199,
  96         'ccedil'   => 231,
  97         'cedil'    => 184,
  98         'cent'     => 162,
  99         'Chi'      => 935,
 100         'chi'      => 967,
 101         'circ'     => 710,
 102         'clubs'    => 9827,
 103         'cong'     => 8773,
 104         'copy'     => 169,
 105         'crarr'    => 8629,
 106         'cup'      => 8746,
 107         'curren'   => 164,
 108         'dagger'   => 8224,
 109         'Dagger'   => 8225,
 110         'darr'     => 8595,
 111         'dArr'     => 8659,
 112         'deg'      => 176,
 113         'Delta'    => 916,
 114         'delta'    => 948,
 115         'diams'    => 9830,
 116         'divide'   => 247,
 117         'Eacute'   => 201,
 118         'eacute'   => 233,
 119         'Ecirc'    => 202,
 120         'ecirc'    => 234,
 121         'Egrave'   => 200,
 122         'egrave'   => 232,
 123         'empty'    => 8709,
 124         'emsp'     => 8195,
 125         'ensp'     => 8194,
 126         'Epsilon'  => 917,
 127         'epsilon'  => 949,
 128         'equiv'    => 8801,
 129         'Eta'      => 919,
 130         'eta'      => 951,
 131         'ETH'      => 208,
 132         'eth'      => 240,
 133         'Euml'     => 203,
 134         'euml'     => 235,
 135         'euro'     => 8364,
 136         'exist'    => 8707,
 137         'fnof'     => 402,
 138         'forall'   => 8704,
 139         'frac12'   => 189,
 140         'frac14'   => 188,
 141         'frac34'   => 190,
 142         'frasl'    => 8260,
 143         'Gamma'    => 915,
 144         'gamma'    => 947,
 145         'ge'       => 8805,
 146         'gt'       => 62,
 147         'harr'     => 8596,
 148         'hArr'     => 8660,
 149         'hearts'   => 9829,
 150         'hellip'   => 8230,
 151         'Iacute'   => 205,
 152         'iacute'   => 237,
 153         'Icirc'    => 206,
 154         'icirc'    => 238,
 155         'iexcl'    => 161,
 156         'Igrave'   => 204,
 157         'igrave'   => 236,
 158         'image'    => 8465,
 159         'infin'    => 8734,
 160         'int'      => 8747,
 161         'Iota'     => 921,
 162         'iota'     => 953,
 163         'iquest'   => 191,
 164         'isin'     => 8712,
 165         'Iuml'     => 207,
 166         'iuml'     => 239,
 167         'Kappa'    => 922,
 168         'kappa'    => 954,
 169         'Lambda'   => 923,
 170         'lambda'   => 955,
 171         'lang'     => 9001,
 172         'laquo'    => 171,
 173         'larr'     => 8592,
 174         'lArr'     => 8656,
 175         'lceil'    => 8968,
 176         'ldquo'    => 8220,
 177         'le'       => 8804,
 178         'lfloor'   => 8970,
 179         'lowast'   => 8727,
 180         'loz'      => 9674,
 181         'lrm'      => 8206,
 182         'lsaquo'   => 8249,
 183         'lsquo'    => 8216,
 184         'lt'       => 60,
 185         'macr'     => 175,
 186         'mdash'    => 8212,
 187         'micro'    => 181,
 188         'middot'   => 183,
 189         'minus'    => 8722,
 190         'Mu'       => 924,
 191         'mu'       => 956,
 192         'nabla'    => 8711,
 193         'nbsp'     => 160,
 194         'ndash'    => 8211,
 195         'ne'       => 8800,
 196         'ni'       => 8715,
 197         'not'      => 172,
 198         'notin'    => 8713,
 199         'nsub'     => 8836,
 200         'Ntilde'   => 209,
 201         'ntilde'   => 241,
 202         'Nu'       => 925,
 203         'nu'       => 957,
 204         'Oacute'   => 211,
 205         'oacute'   => 243,
 206         'Ocirc'    => 212,
 207         'ocirc'    => 244,
 208         'OElig'    => 338,
 209         'oelig'    => 339,
 210         'Ograve'   => 210,
 211         'ograve'   => 242,
 212         'oline'    => 8254,
 213         'Omega'    => 937,
 214         'omega'    => 969,
 215         'Omicron'  => 927,
 216         'omicron'  => 959,
 217         'oplus'    => 8853,
 218         'or'       => 8744,
 219         'ordf'     => 170,
 220         'ordm'     => 186,
 221         'Oslash'   => 216,
 222         'oslash'   => 248,
 223         'Otilde'   => 213,
 224         'otilde'   => 245,
 225         'otimes'   => 8855,
 226         'Ouml'     => 214,
 227         'ouml'     => 246,
 228         'para'     => 182,
 229         'part'     => 8706,
 230         'permil'   => 8240,
 231         'perp'     => 8869,
 232         'Phi'      => 934,
 233         'phi'      => 966,
 234         'Pi'       => 928,
 235         'pi'       => 960,
 236         'piv'      => 982,
 237         'plusmn'   => 177,
 238         'pound'    => 163,
 239         'prime'    => 8242,
 240         'Prime'    => 8243,
 241         'prod'     => 8719,
 242         'prop'     => 8733,
 243         'Psi'      => 936,
 244         'psi'      => 968,
 245         'quot'     => 34,
 246         'radic'    => 8730,
 247         'rang'     => 9002,
 248         'raquo'    => 187,
 249         'rarr'     => 8594,
 250         'rArr'     => 8658,
 251         'rceil'    => 8969,
 252         'rdquo'    => 8221,
 253         'real'     => 8476,
 254         'reg'      => 174,
 255         'rfloor'   => 8971,
 256         'Rho'      => 929,
 257         'rho'      => 961,
 258         'rlm'      => 8207,
 259         'rsaquo'   => 8250,
 260         'rsquo'    => 8217,
 261         'sbquo'    => 8218,
 262         'Scaron'   => 352,
 263         'scaron'   => 353,
 264         'sdot'     => 8901,
 265         'sect'     => 167,
 266         'shy'      => 173,
 267         'Sigma'    => 931,
 268         'sigma'    => 963,
 269         'sigmaf'   => 962,
 270         'sim'      => 8764,
 271         'spades'   => 9824,
 272         'sub'      => 8834,
 273         'sube'     => 8838,
 274         'sum'      => 8721,
 275         'sup'      => 8835,
 276         'sup1'     => 185,
 277         'sup2'     => 178,
 278         'sup3'     => 179,
 279         'supe'     => 8839,
 280         'szlig'    => 223,
 281         'Tau'      => 932,
 282         'tau'      => 964,
 283         'there4'   => 8756,
 284         'Theta'    => 920,
 285         'theta'    => 952,
 286         'thetasym' => 977,
 287         'thinsp'   => 8201,
 288         'THORN'    => 222,
 289         'thorn'    => 254,
 290         'tilde'    => 732,
 291         'times'    => 215,
 292         'trade'    => 8482,
 293         'Uacute'   => 218,
 294         'uacute'   => 250,
 295         'uarr'     => 8593,
 296         'uArr'     => 8657,
 297         'Ucirc'    => 219,
 298         'ucirc'    => 251,
 299         'Ugrave'   => 217,
 300         'ugrave'   => 249,
 301         'uml'      => 168,
 302         'upsih'    => 978,
 303         'Upsilon'  => 933,
 304         'upsilon'  => 965,
 305         'Uuml'     => 220,
 306         'uuml'     => 252,
 307         'weierp'   => 8472,
 308         'Xi'       => 926,
 309         'xi'       => 958,
 310         'Yacute'   => 221,
 311         'yacute'   => 253,
 312         'yen'      => 165,
 313         'Yuml'     => 376,
 314         'yuml'     => 255,
 315         'Zeta'     => 918,
 316         'zeta'     => 950,
 317         'zwj'      => 8205,
 318         'zwnj'     => 8204 );
 319
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @access private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr'
 355                         );
 356                 } else {
 357                         $htmlpairs = array();
 358                         $htmlsingle = array();
 359                         $htmlnest = array();
 360                         $tabletags = array();
 361                 }
 362
 363                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
 364                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
 365
 366                 # Remove HTML comments
 367                 $text = Sanitizer::removeHTMLcomments( $text );
 368
 369                 $bits = explode( '<', $text );
 370                 $text = array_shift( $bits );
 371                 if(!$wgUseTidy) {
 372                         $tagstack = array(); $tablestack = array();
 373                         foreach ( $bits as $x ) {
 374                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 375                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 376                                 $x, $regs );
 377                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 378                                 error_reporting( $prev );
 379
 380                                 $badtag = 0 ;
 381                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 382                                         # Check our stack
 383                                         if ( $slash ) {
 384                                                 # Closing a tag...
 385                                                 if( in_array( $t, $htmlsingleonly ) ) {
 386                                                         $badtag = 1;
 387                                                 } elseif( !in_array( $t, $htmlsingle ) &&
 388                                                 ( $ot = @array_pop( $tagstack ) ) != $t ) {
 389                                                         @array_push( $tagstack, $ot );
 390                                                         $badtag = 1;
 391                                                 } else {
 392                                                         if ( $t == 'table' ) {
 393                                                                 $tagstack = array_pop( $tablestack );
 394                                                         }
 395                                                         $newparams = '';
 396                                                 }
 397                                         } else {
 398                                                 # Keep track for later
 399                                                 if ( in_array( $t, $tabletags ) &&
 400                                                 ! in_array( 'table', $tagstack ) ) {
 401                                                         $badtag = 1;
 402                                                 } else if ( in_array( $t, $tagstack ) &&
 403                                                 ! in_array ( $t , $htmlnest ) ) {
 404                                                         $badtag = 1 ;
 405                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 406                                                         # Hack to force empty tag for uncloseable elements
 407                                                         $brace = '/>';
 408                                                 } else if ( ! in_array( $t, $htmlsingle ) ) {
 409                                                         if ( $t == 'table' ) {
 410                                                                 array_push( $tablestack, $tagstack );
 411                                                                 $tagstack = array();
 412                                                         }
 413                                                         array_push( $tagstack, $t );
 414                                                 }
 415
 416                                                 # Replace any variables or template parameters with
 417                                                 # plaintext results.
 418                                                 if( is_callable( $processCallback ) ) {
 419                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 420                                                 }
 421
 422                                                 # Strip non-approved attributes from the tag
 423                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 424                                         }
 425                                         if ( ! $badtag ) {
 426                                                 $rest = str_replace( '>', '&gt;', $rest );
 427                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 428                                                 $text .= "<$slash$t$newparams$close>$rest";
 429                                                 continue;
 430                                         }
 431                                 }
 432                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 433                         }
 434                         # Close off any remaining tags
 435                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 436                                 $text .= "</$t>\n";
 437                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 438                         }
 439                 } else {
 440                         # this might be possible using tidy itself
 441                         foreach ( $bits as $x ) {
 442                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 443                                 $x, $regs );
 444                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 445                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 446                                         if( is_callable( $processCallback ) ) {
 447                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 448                                         }
 449                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 450                                         $rest = str_replace( '>', '&gt;', $rest );
 451                                         $text .= "<$slash$t$newparams$brace$rest";
 452                                 } else {
 453                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 454                                 }
 455                         }
 456                 }
 457                 wfProfileOut( $fname );
 458                 return $text;
 459         }
 460
 461         /**
 462          * Remove '<!--', '-->', and everything between.
 463          * To avoid leaving blank lines, when a comment is both preceded
 464          * and followed by a newline (ignoring spaces), trim leading and
 465          * trailing spaces and one of the newlines.
 466          *
 467          * @access private
 468          * @param string $text
 469          * @return string
 470          */
 471         function removeHTMLcomments( $text ) {
 472                 $fname='Parser::removeHTMLcomments';
 473                 wfProfileIn( $fname );
 474                 while (($start = strpos($text, '<!--')) !== false) {
 475                         $end = strpos($text, '-->', $start + 4);
 476                         if ($end === false) {
 477                                 # Unterminated comment; bail out
 478                                 break;
 479                         }
 480
 481                         $end += 3;
 482
 483                         # Trim space and newline if the comment is both
 484                         # preceded and followed by a newline
 485                         $spaceStart = max($start - 1, 0);
 486                         $spaceLen = $end - $spaceStart;
 487                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 488                                 $spaceStart--;
 489                                 $spaceLen++;
 490                         }
 491                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 492                                 $spaceLen++;
 493                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 494                                 # Remove the comment, leading and trailing
 495                                 # spaces, and leave only one newline.
 496                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 497                         }
 498                         else {
 499                                 # Remove just the comment.
 500                                 $text = substr_replace($text, '', $start, $end - $start);
 501                         }
 502                 }
 503                 wfProfileOut( $fname );
 504                 return $text;
 505         }
 506
 507         /**
 508          * Take a tag soup fragment listing an HTML element's attributes
 509          * and normalize it to well-formed XML, discarding unwanted attributes.
 510          *
 511          * - Normalizes attribute names to lowercase
 512          * - Discards attributes not on a whitelist for the given element
 513          * - Turns broken or invalid entities into plaintext
 514          * - Double-quotes all attribute values
 515          * - Attributes without values are given the name as attribute
 516          * - Double attributes are discarded
 517          * - Unsafe style attributes are discarded
 518          * - Prepends space if there are attributes.
 519          *
 520          * @param string $text
 521          * @param string $element
 522          * @return string
 523          *
 524          * @todo Check for legal values where the DTD limits things.
 525          * @todo Check for unique id attribute :P
 526          */
 527         function fixTagAttributes( $text, $element ) {
 528                 if( trim( $text ) == '' ) {
 529                         return '';
 530                 }
 531
 532                 # Unquoted attribute
 533                 # Since we quote this later, this can be anything distinguishable
 534                 # from the end of the attribute
 535                 if( !preg_match_all(
 536                         MW_ATTRIBS_REGEX,
 537                         $text,
 538                         $pairs,
 539                         PREG_SET_ORDER ) ) {
 540                         return '';
 541                 }
 542
 543                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 544                 $attribs = array();
 545                 foreach( $pairs as $set ) {
 546                         $attribute = strtolower( $set[1] );
 547                         if( !isset( $whitelist[$attribute] ) ) {
 548                                 continue;
 549                         }
 550
 551                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 552                         $value = Sanitizer::normalizeAttributeValue( $raw );
 553
 554                         # Strip javascript "expression" from stylesheets.
 555                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 556                         if( $attribute == 'style' && preg_match(
 557                                 '/(expression|tps*:\/\/|url\\s*\().*/is',
 558                                         Sanitizer::decodeCharReferences( $value ) ) ) {
 559                                 # haxx0r
 560                                 continue;
 561                         }
 562
 563                         # Templates and links may be expanded in later parsing,
 564                         # creating invalid or dangerous output. Suppress this.
 565                         $value = strtr( $value, array(
 566                                 '{'    => '&#123;',
 567                                 '['    => '&#91;',
 568                                 "''"   => '&#39;&#39;',
 569                                 'ISBN' => '&#73;SBN',
 570                                 'RFC'  => '&#82;FC',
 571                                 'PMID' => '&#80;MID',
 572                         ) );
 573                         $value = preg_replace(
 574                                 '/(' . URL_PROTOCOLS . '):/',
 575                                 '\\1&#58;', $value );
 576
 577                         if( !isset( $attribs[$attribute] ) ) {
 578                                 $attribs[$attribute] = "$attribute=\"$value\"";
 579                         }
 580                 }
 581                 if( empty( $attribs ) ) {
 582                         return '';
 583                 } else {
 584                         return ' ' . implode( ' ', $attribs );
 585                 }
 586         }
 587
 588         /**
 589          * Return an associative array of attribute names and values from
  590          * a partial tag string. Attribute names are forced to lowercase,
 591          * character references are decoded to UTF-8 text.
 592          *
 593          * @param string
 594          * @return array
 595          */
 596         function decodeTagAttributes( $text ) {
 597                 $attribs = array();
 598
 599                 if( trim( $text ) == '' ) {
 600                         return $attribs;
 601                 }
 602
 603                 if( !preg_match_all(
 604                         MW_ATTRIBS_REGEX,
 605                         $text,
 606                         $pairs,
 607                         PREG_SET_ORDER ) ) {
 608                         return $attribs;
 609                 }
 610
 611                 foreach( $pairs as $set ) {
 612                         $attribute = strtolower( $set[1] );
 613                         $value = Sanitizer::getTagAttributeCallback( $set );
 614                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 615                 }
 616                 return $attribs;
 617         }
 618
 619         /**
 620          * Pick the appropriate attribute value from a match set from the
 621          * MW_ATTRIBS_REGEX matches.
 622          *
 623          * @param array $set
 624          * @return string
 625          * @access private
 626          */
 627         function getTagAttributeCallback( $set ) {
 628                 if( isset( $set[6] ) ) {
 629                         # Illegal #XXXXXX color with no quotes.
 630                         return $set[6];
 631                 } elseif( isset( $set[5] ) ) {
 632                         # No quotes.
 633                         return $set[5];
 634                 } elseif( isset( $set[4] ) ) {
 635                         # Single-quoted
 636                         return $set[4];
 637                 } elseif( isset( $set[3] ) ) {
 638                         # Double-quoted
 639                         return $set[3];
 640                 } elseif( !isset( $set[2] ) ) {
 641                         # In XHTML, attributes must have a value.
 642                         # For 'reduced' form, return explicitly the attribute name here.
 643                         return $set[1];
 644                 } else {
 645                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 646                 }
 647         }
 648
 649         /**
 650          * Normalize whitespace and character references in an XML source-
 651          * encoded text for an attribute value.
 652          *
 653          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 654          * but note that we're not returning the value, but are returning
 655          * XML source fragments that will be slapped into output.
 656          *
 657          * @param string $text
 658          * @return string
 659          * @access private
 660          */
 661         function normalizeAttributeValue( $text ) {
 662                 return str_replace( '"', '&quot;',
 663                         preg_replace(
 664                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 665                                 ' ',
 666                                 Sanitizer::normalizeCharReferences( $text ) ) );
 667         }
 668
 669         /**
 670          * Ensure that any entities and character references are legal
 671          * for XML and XHTML specifically. Any stray bits will be
 672          * &amp;-escaped to result in a valid text fragment.
 673          *
 674          * a. any named char refs must be known in XHTML
 675          * b. any numeric char refs must be legal chars, not invalid or forbidden
 676          * c. use &#x, not &#X
 677          * d. fix or reject non-valid attributes
 678          *
 679          * @param string $text
 680          * @return string
 681          * @access private
 682          */
 683         function normalizeCharReferences( $text ) {
 684                 return preg_replace_callback(
 685                         MW_CHAR_REFS_REGEX,
 686                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 687                         $text );
 688         }
 689         /**
  690          * @param array $matches
 691          * @return string
 692          */
 693         function normalizeCharReferencesCallback( $matches ) {
 694                 $ret = null;
 695                 if( $matches[1] != '' ) {
 696                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 697                 } elseif( $matches[2] != '' ) {
 698                         $ret = Sanitizer::decCharReference( $matches[2] );
 699                 } elseif( $matches[3] != ''  ) {
 700                         $ret = Sanitizer::hexCharReference( $matches[3] );
 701                 } elseif( $matches[4] != '' ) {
 702                         $ret = Sanitizer::hexCharReference( $matches[4] );
 703                 }
 704                 if( is_null( $ret ) ) {
 705                         return htmlspecialchars( $matches[0] );
 706                 } else {
 707                         return $ret;
 708                 }
 709         }
 710
 711         /**
 712          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 713          * return the named entity reference as is. Otherwise, returns
 714          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 715          *
 716          * @param string $name
 717          * @return string
 718          */
 719         function normalizeEntity( $name ) {
 720                 global $wgHtmlEntities;
 721                 if( isset( $wgHtmlEntities[$name] ) ) {
 722                         return "&$name;";
 723                 } else {
 724                         return "&amp;$name;";
 725                 }
 726         }
 727
 728         function decCharReference( $codepoint ) {
 729                 $point = IntVal( $codepoint );
 730                 if( Sanitizer::validateCodepoint( $point ) ) {
 731                         return sprintf( '&#%d;', $point );
 732                 } else {
 733                         return null;
 734                 }
 735         }
 736
 737         function hexCharReference( $codepoint ) {
 738                 $point = hexdec( $codepoint );
 739                 if( Sanitizer::validateCodepoint( $point ) ) {
 740                         return sprintf( '&#x%x;', $point );
 741                 } else {
 742                         return null;
 743                 }
 744         }
 745
 746         /**
 747          * Returns true if a given Unicode codepoint is a valid character in XML.
 748          * @param int $codepoint
 749          * @return bool
 750          */
 751         function validateCodepoint( $codepoint ) {
 752                 return ($codepoint ==    0x09)
 753                         || ($codepoint ==    0x0a)
 754                         || ($codepoint ==    0x0d)
 755                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 756                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 757                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 758         }
 759
 760         /**
 761          * Decode any character references, numeric or named entities,
 762          * in the text and return a UTF-8 string.
 763          *
 764          * @param string $text
 765          * @return string
 766          * @access public
 767          */
 768         function decodeCharReferences( $text ) {
 769                 return preg_replace_callback(
 770                         MW_CHAR_REFS_REGEX,
 771                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 772                         $text );
 773         }
 774
 775         /**
 776          * @param string $matches
 777          * @return string
 778          */
 779         function decodeCharReferencesCallback( $matches ) {
 780                 if( $matches[1] != '' ) {
 781                         return Sanitizer::decodeEntity( $matches[1] );
 782                 } elseif( $matches[2] != '' ) {
 783                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 784                 } elseif( $matches[3] != ''  ) {
 785                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 786                 } elseif( $matches[4] != '' ) {
 787                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 788                 }
 789                 # Last case should be an ampersand by itself
 790                 return $matches[0];
 791         }
 792
 793         /**
 794          * Return UTF-8 string for a codepoint if that is a valid
 795          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 796          * @param int $codepoint
 797          * @return string
 798          * @access private
 799          */
 800         function decodeChar( $codepoint ) {
 801                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 802                         return codepointToUtf8( $codepoint );
 803                 } else {
 804                         return UTF8_REPLACEMENT;
 805                 }
 806         }
 807
 808         /**
 809          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 810          * return the UTF-8 encoding of that character. Otherwise, returns
 811          * pseudo-entity source (eg &foo;)
 812          *
 813          * @param string $name
 814          * @return string
 815          */
 816         function decodeEntity( $name ) {
 817                 global $wgHtmlEntities;
 818                 if( isset( $wgHtmlEntities[$name] ) ) {
 819                         return codepointToUtf8( $wgHtmlEntities[$name] );
 820                 } else {
 821                         return "&$name;";
 822                 }
 823         }
 824
 825         /**
 826          * Fetch the whitelist of acceptable attributes for a given
 827          * element name.
 828          *
 829          * @param string $element
 830          * @return array
 831          */
 832         function attributeWhitelist( $element ) {
 833                 static $list;
 834                 if( !isset( $list ) ) {
 835                         $list = Sanitizer::setupAttributeWhitelist();
 836                 }
 837                 return isset( $list[$element] )
 838                         ? $list[$element]
 839                         : array();
 840         }
 841
 842         /**
 843          * @return array
 844          */
 845         function setupAttributeWhitelist() {
 846                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 847                 $block = array_merge( $common, array( 'align' ) );
 848                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 849                 $tablecell = array( 'abbr',
 850                                     'axis',
 851                                     'headers',
 852                                     'scope',
 853                                     'rowspan',
 854                                     'colspan',
 855                                     'nowrap', # deprecated
 856                                     'width',  # deprecated
 857                                     'height', # deprecated
 858                                     'bgcolor' # deprecated
 859                                     );
 860
 861                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 862                 # See: http://www.w3.org/TR/html4/
 863                 $whitelist = array (
 864                         # 7.5.4
 865                         'div'        => $block,
 866                         'center'     => $common, # deprecated
 867                         'span'       => $block, # ??
 868
 869                         # 7.5.5
 870                         'h1'         => $block,
 871                         'h2'         => $block,
 872                         'h3'         => $block,
 873                         'h4'         => $block,
 874                         'h5'         => $block,
 875                         'h6'         => $block,
 876
 877                         # 7.5.6
 878                         # address
 879
 880                         # 8.2.4
 881                         # bdo
 882
 883                         # 9.2.1
 884                         'em'         => $common,
 885                         'strong'     => $common,
 886                         'cite'       => $common,
 887                         # dfn
 888                         'code'       => $common,
 889                         # samp
 890                         # kbd
 891                         'var'        => $common,
 892                         # abbr
 893                         # acronym
 894
 895                         # 9.2.2
 896                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 897                         # q
 898
 899                         # 9.2.3
 900                         'sub'        => $common,
 901                         'sup'        => $common,
 902
 903                         # 9.3.1
 904                         'p'          => $block,
 905
 906                         # 9.3.2
 907                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 908
 909                         # 9.3.4
 910                         'pre'        => array_merge( $common, array( 'width' ) ),
 911
 912                         # 9.4
 913                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 914                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 915
 916                         # 10.2
 917                         'ul'         => array_merge( $common, array( 'type' ) ),
 918                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 919                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 920
 921                         # 10.3
 922                         'dl'         => $common,
 923                         'dd'         => $common,
 924                         'dt'         => $common,
 925
 926                         # 11.2.1
 927                         'table'      => array_merge( $common,
 928                                                                 array( 'summary', 'width', 'border', 'frame',
 929                                                                                          'rules', 'cellspacing', 'cellpadding',
 930                                                                                          'align', 'bgcolor', 'frame', 'rules',
 931                                                                                          'border' ) ),
 932
 933                         # 11.2.2
 934                         'caption'    => array_merge( $common, array( 'align' ) ),
 935
 936                         # 11.2.3
 937                         'thead'      => array_merge( $common, $tablealign ),
 938                         'tfoot'      => array_merge( $common, $tablealign ),
 939                         'tbody'      => array_merge( $common, $tablealign ),
 940
 941                         # 11.2.4
 942                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 943                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 944
 945                         # 11.2.5
 946                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 947
 948                         # 11.2.6
 949                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 950                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 951
 952                         # 15.2.1
 953                         'tt'         => $common,
 954                         'b'          => $common,
 955                         'i'          => $common,
 956                         'big'        => $common,
 957                         'small'      => $common,
 958                         'strike'     => $common,
 959                         's'          => $common,
 960                         'u'          => $common,
 961
 962                         # 15.2.2
 963                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 964                         # basefont
 965
 966                         # 15.3
 967                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 968
 969                         # XHTML Ruby annotation text module, simple ruby only.
 970                         # http://www.w3c.org/TR/ruby/
 971                         'ruby'       => $common,
 972                         # rbc
 973                         # rtc
 974                         'rb'         => $common,
 975                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 976                         'rp'         => $common,
 977                         );
 978                 return $whitelist;
 979         }
 980
 981         /**
 982          * Take a fragment of (potentially invalid) HTML and return
 983          * a version with any tags removed, encoded suitably for literal
 984          * inclusion in an attribute value.
 985          *
 986          * @param string $text HTML fragment
 987          * @return string
 988          */
 989         function stripAllTags( $text ) {
 990                 # Actual <tags>
 991                 $text = preg_replace( '/<[^>]*>/', '', $text );
 992
 993                 # Normalize &entities and whitespace
 994                 $text = Sanitizer::normalizeAttributeValue( $text );
 995
 996                 # Will be placed into "double-quoted" attributes,
 997                 # make sure remaining bits are safe.
 998                 $text = str_replace(
 999                         array('<', '>', '"'),
1000                         array('&lt;', '&gt;', '&quot;'),
1001                         $text );
1002
1003                 return $text;
1004         }
1005
1006 }
1007
1008 ?>