includes/Sanitizer.php

   1 <?php
   2 /**
   3  * (X)HTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups, as read by Sanitizer::getTagAttributeCallback:
 *   1. attribute name
 *   2. the whole "= value" part; not set for a bare attribute
 *   3. value written in double quotes (quotes not included)
 *   4. value written in single quotes (quotes not included)
 *   5. value written without quotes
 *   6. unquoted "#" followed by hex digits (colour-style value)
 *
 * The pattern is compiled with /x, so its internal layout and the "#"
 * comments inside the literal are not part of what is matched.
 */
# One character of an attribute name.
$attrib = '[A-Za-z0-9]';
# One whitespace character: tab, line feed, carriage return or space.
$space = '[\x09\x0a\x0d\x20]';
# Each pair must start at the beginning of the text or after whitespace, and
# must be followed by whitespace or the end of the text (the final lookahead).
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Maps the entity name (case-sensitive: 'Aacute' and 'aacute' are separate
 * keys) to the decimal number of the character it stands for.
 * Sanitizer::normalizeEntity uses the keys to decide whether a named
 * reference is known.
 *
 * @access private
 */
# NOTE(review): a file-scope "global" only has an effect when this file is
# included from inside a function; presumably that is why it is here - confirm.
global $wgHtmlEntities;
$wgHtmlEntities = array(
        'Aacute'   => 193,
        'aacute'   => 225,
        'Acirc'    => 194,
        'acirc'    => 226,
        'acute'    => 180,
        'AElig'    => 198,
        'aelig'    => 230,
        'Agrave'   => 192,
        'agrave'   => 224,
        'alefsym'  => 8501,
        'Alpha'    => 913,
        'alpha'    => 945,
        'amp'      => 38,
        'and'      => 8743,
        'ang'      => 8736,
        'Aring'    => 197,
        'aring'    => 229,
        'asymp'    => 8776,
        'Atilde'   => 195,
        'atilde'   => 227,
        'Auml'     => 196,
        'auml'     => 228,
        'bdquo'    => 8222,
        'Beta'     => 914,
        'beta'     => 946,
        'brvbar'   => 166,
        'bull'     => 8226,
        'cap'      => 8745,
        'Ccedil'   => 199,
        'ccedil'   => 231,
        'cedil'    => 184,
        'cent'     => 162,
        'Chi'      => 935,
        'chi'      => 967,
        'circ'     => 710,
        'clubs'    => 9827,
        'cong'     => 8773,
        'copy'     => 169,
        'crarr'    => 8629,
        'cup'      => 8746,
        'curren'   => 164,
        'dagger'   => 8224,
        'Dagger'   => 8225,
        'darr'     => 8595,
        'dArr'     => 8659,
        'deg'      => 176,
        'Delta'    => 916,
        'delta'    => 948,
        'diams'    => 9830,
        'divide'   => 247,
        'Eacute'   => 201,
        'eacute'   => 233,
        'Ecirc'    => 202,
        'ecirc'    => 234,
        'Egrave'   => 200,
        'egrave'   => 232,
        'empty'    => 8709,
        'emsp'     => 8195,
        'ensp'     => 8194,
        'Epsilon'  => 917,
        'epsilon'  => 949,
        'equiv'    => 8801,
        'Eta'      => 919,
        'eta'      => 951,
        'ETH'      => 208,
        'eth'      => 240,
        'Euml'     => 203,
        'euml'     => 235,
        'euro'     => 8364,
        'exist'    => 8707,
        'fnof'     => 402,
        'forall'   => 8704,
        'frac12'   => 189,
        'frac14'   => 188,
        'frac34'   => 190,
        'frasl'    => 8260,
        'Gamma'    => 915,
        'gamma'    => 947,
        'ge'       => 8805,
        'gt'       => 62,
        'harr'     => 8596,
        'hArr'     => 8660,
        'hearts'   => 9829,
        'hellip'   => 8230,
        'Iacute'   => 205,
        'iacute'   => 237,
        'Icirc'    => 206,
        'icirc'    => 238,
        'iexcl'    => 161,
        'Igrave'   => 204,
        'igrave'   => 236,
        'image'    => 8465,
        'infin'    => 8734,
        'int'      => 8747,
        'Iota'     => 921,
        'iota'     => 953,
        'iquest'   => 191,
        'isin'     => 8712,
        'Iuml'     => 207,
        'iuml'     => 239,
        'Kappa'    => 922,
        'kappa'    => 954,
        'Lambda'   => 923,
        'lambda'   => 955,
        'lang'     => 9001,
        'laquo'    => 171,
        'larr'     => 8592,
        'lArr'     => 8656,
        'lceil'    => 8968,
        'ldquo'    => 8220,
        'le'       => 8804,
        'lfloor'   => 8970,
        'lowast'   => 8727,
        'loz'      => 9674,
        'lrm'      => 8206,
        'lsaquo'   => 8249,
        'lsquo'    => 8216,
        'lt'       => 60,
        'macr'     => 175,
        'mdash'    => 8212,
        'micro'    => 181,
        'middot'   => 183,
        'minus'    => 8722,
        'Mu'       => 924,
        'mu'       => 956,
        'nabla'    => 8711,
        'nbsp'     => 160,
        'ndash'    => 8211,
        'ne'       => 8800,
        'ni'       => 8715,
        'not'      => 172,
        'notin'    => 8713,
        'nsub'     => 8836,
        'Ntilde'   => 209,
        'ntilde'   => 241,
        'Nu'       => 925,
        'nu'       => 957,
        'Oacute'   => 211,
        'oacute'   => 243,
        'Ocirc'    => 212,
        'ocirc'    => 244,
        'OElig'    => 338,
        'oelig'    => 339,
        'Ograve'   => 210,
        'ograve'   => 242,
        'oline'    => 8254,
        'Omega'    => 937,
        'omega'    => 969,
        'Omicron'  => 927,
        'omicron'  => 959,
        'oplus'    => 8853,
        'or'       => 8744,
        'ordf'     => 170,
        'ordm'     => 186,
        'Oslash'   => 216,
        'oslash'   => 248,
        'Otilde'   => 213,
        'otilde'   => 245,
        'otimes'   => 8855,
        'Ouml'     => 214,
        'ouml'     => 246,
        'para'     => 182,
        'part'     => 8706,
        'permil'   => 8240,
        'perp'     => 8869,
        'Phi'      => 934,
        'phi'      => 966,
        'Pi'       => 928,
        'pi'       => 960,
        'piv'      => 982,
        'plusmn'   => 177,
        'pound'    => 163,
        'prime'    => 8242,
        'Prime'    => 8243,
        'prod'     => 8719,
        'prop'     => 8733,
        'Psi'      => 936,
        'psi'      => 968,
        'quot'     => 34,
        'radic'    => 8730,
        'rang'     => 9002,
        'raquo'    => 187,
        'rarr'     => 8594,
        'rArr'     => 8658,
        'rceil'    => 8969,
        'rdquo'    => 8221,
        'real'     => 8476,
        'reg'      => 174,
        'rfloor'   => 8971,
        'Rho'      => 929,
        'rho'      => 961,
        'rlm'      => 8207,
        'rsaquo'   => 8250,
        'rsquo'    => 8217,
        'sbquo'    => 8218,
        'Scaron'   => 352,
        'scaron'   => 353,
        'sdot'     => 8901,
        'sect'     => 167,
        'shy'      => 173,
        'Sigma'    => 931,
        'sigma'    => 963,
        'sigmaf'   => 962,
        'sim'      => 8764,
        'spades'   => 9824,
        'sub'      => 8834,
        'sube'     => 8838,
        'sum'      => 8721,
        'sup'      => 8835,
        'sup1'     => 185,
        'sup2'     => 178,
        'sup3'     => 179,
        'supe'     => 8839,
        'szlig'    => 223,
        'Tau'      => 932,
        'tau'      => 964,
        'there4'   => 8756,
        'Theta'    => 920,
        'theta'    => 952,
        'thetasym' => 977,
        'thinsp'   => 8201,
        'THORN'    => 222,
        'thorn'    => 254,
        'tilde'    => 732,
        'times'    => 215,
        'trade'    => 8482,
        'Uacute'   => 218,
        'uacute'   => 250,
        'uarr'     => 8593,
        'uArr'     => 8657,
        'Ucirc'    => 219,
        'ucirc'    => 251,
        'Ugrave'   => 217,
        'ugrave'   => 249,
        'uml'      => 168,
        'upsih'    => 978,
        'Upsilon'  => 933,
        'upsilon'  => 965,
        'Uuml'     => 220,
        'uuml'     => 252,
        'weierp'   => 8472,
        'Xi'       => 926,
        'xi'       => 958,
        'Yacute'   => 221,
        'yacute'   => 253,
        'yen'      => 165,
        'Yuml'     => 376,
        'yuml'     => 255,
        'Zeta'     => 918,
        'zeta'     => 950,
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @access private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr'
 355                         );
 356                 } else {
 357                         $htmlpairs = array();
 358                         $htmlsingle = array();
 359                         $htmlnest = array();
 360                         $tabletags = array();
 361                 }
 362
 363                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
 364                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
 365
 366                 # Remove HTML comments
 367                 $text = Sanitizer::removeHTMLcomments( $text );
 368
 369                 $bits = explode( '<', $text );
 370                 $text = array_shift( $bits );
 371                 if(!$wgUseTidy) {
 372                         $tagstack = array(); $tablestack = array();
 373                         foreach ( $bits as $x ) {
 374                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 375                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 376                                 $x, $regs );
 377                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 378                                 error_reporting( $prev );
 379
 380                                 $badtag = 0 ;
 381                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 382                                         # Check our stack
 383                                         if ( $slash ) {
 384                                                 # Closing a tag...
 385                                                 if( in_array( $t, $htmlsingleonly ) ) {
 386                                                         $badtag = 1;
 387                                                 } elseif( !in_array( $t, $htmlsingle ) &&
 388                                                 ( $ot = @array_pop( $tagstack ) ) != $t ) {
 389                                                         @array_push( $tagstack, $ot );
 390                                                         $badtag = 1;
 391                                                 } else {
 392                                                         if ( $t == 'table' ) {
 393                                                                 $tagstack = array_pop( $tablestack );
 394                                                         }
 395                                                         $newparams = '';
 396                                                 }
 397                                         } else {
 398                                                 # Keep track for later
 399                                                 if ( in_array( $t, $tabletags ) &&
 400                                                 ! in_array( 'table', $tagstack ) ) {
 401                                                         $badtag = 1;
 402                                                 } else if ( in_array( $t, $tagstack ) &&
 403                                                 ! in_array ( $t , $htmlnest ) ) {
 404                                                         $badtag = 1 ;
 405                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 406                                                         # Hack to force empty tag for uncloseable elements
 407                                                         $brace = '/>';
 408                                                 } else if ( ! in_array( $t, $htmlsingle ) ) {
 409                                                         if ( $t == 'table' ) {
 410                                                                 array_push( $tablestack, $tagstack );
 411                                                                 $tagstack = array();
 412                                                         }
 413                                                         array_push( $tagstack, $t );
 414                                                 }
 415
 416                                                 # Replace any variables or template parameters with
 417                                                 # plaintext results.
 418                                                 if( is_callable( $processCallback ) ) {
 419                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 420                                                 }
 421
 422                                                 # Strip non-approved attributes from the tag
 423                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 424                                         }
 425                                         if ( ! $badtag ) {
 426                                                 $rest = str_replace( '>', '&gt;', $rest );
 427                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 428                                                 $text .= "<$slash$t$newparams$close>$rest";
 429                                                 continue;
 430                                         }
 431                                 }
 432                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 433                         }
 434                         # Close off any remaining tags
 435                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 436                                 $text .= "</$t>\n";
 437                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 438                         }
 439                 } else {
 440                         # this might be possible using tidy itself
 441                         foreach ( $bits as $x ) {
 442                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 443                                 $x, $regs );
 444                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 445                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 446                                         if( is_callable( $processCallback ) ) {
 447                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 448                                         }
 449                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 450                                         $rest = str_replace( '>', '&gt;', $rest );
 451                                         $text .= "<$slash$t$newparams$brace$rest";
 452                                 } else {
 453                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 454                                 }
 455                         }
 456                 }
 457                 wfProfileOut( $fname );
 458                 return $text;
 459         }
 460
 461         /**
 462          * Remove '<!--', '-->', and everything between.
 463          * To avoid leaving blank lines, when a comment is both preceded
 464          * and followed by a newline (ignoring spaces), trim leading and
 465          * trailing spaces and one of the newlines.
 466          *
 467          * @access private
 468          * @param string $text
 469          * @return string
 470          */
 471         function removeHTMLcomments( $text ) {
 472                 $fname='Parser::removeHTMLcomments';
 473                 wfProfileIn( $fname );
 474                 while (($start = strpos($text, '<!--')) !== false) {
 475                         $end = strpos($text, '-->', $start + 4);
 476                         if ($end === false) {
 477                                 # Unterminated comment; bail out
 478                                 break;
 479                         }
 480
 481                         $end += 3;
 482
 483                         # Trim space and newline if the comment is both
 484                         # preceded and followed by a newline
 485                         $spaceStart = max($start - 1, 0);
 486                         $spaceLen = $end - $spaceStart;
 487                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 488                                 $spaceStart--;
 489                                 $spaceLen++;
 490                         }
 491                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 492                                 $spaceLen++;
 493                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 494                                 # Remove the comment, leading and trailing
 495                                 # spaces, and leave only one newline.
 496                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 497                         }
 498                         else {
 499                                 # Remove just the comment.
 500                                 $text = substr_replace($text, '', $start, $end - $start);
 501                         }
 502                 }
 503                 wfProfileOut( $fname );
 504                 return $text;
 505         }
 506
 507         /**
 508          * Take a tag soup fragment listing an HTML element's attributes
 509          * and normalize it to well-formed XML, discarding unwanted attributes.
 510          *
 511          * - Normalizes attribute names to lowercase
 512          * - Discards attributes not on a whitelist for the given element
 513          * - Turns broken or invalid entities into plaintext
 514          * - Double-quotes all attribute values
 515          * - Attributes without values are given the name as attribute
         * - When an attribute is given more than once, only the last value is kept
 517          * - Unsafe style attributes are discarded
 518          * - Prepends space if there are attributes.
 519          *
 520          * @param string $text
 521          * @param string $element
 522          * @return string
 523          *
 524          * @todo Check for legal values where the DTD limits things.
 525          * @todo Check for unique id attribute :P
 526          */
 527         function fixTagAttributes( $text, $element ) {
 528                 global $wgUrlProtocols;
 529                 if( trim( $text ) == '' ) {
 530                         return '';
 531                 }
 532
 533                 # Unquoted attribute
 534                 # Since we quote this later, this can be anything distinguishable
 535                 # from the end of the attribute
 536                 if( !preg_match_all(
 537                         MW_ATTRIBS_REGEX,
 538                         $text,
 539                         $pairs,
 540                         PREG_SET_ORDER ) ) {
 541                         return '';
 542                 }
 543
 544                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 545                 $attribs = array();
 546                 foreach( $pairs as $set ) {
 547                         $attribute = strtolower( $set[1] );
 548                         if( !isset( $whitelist[$attribute] ) ) {
 549                                 continue;
 550                         }
 551
 552                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 553                         $value = Sanitizer::normalizeAttributeValue( $raw );
 554
 555                         # Strip javascript "expression" from stylesheets.
 556                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 557                         if( $attribute == 'style' && preg_match(
 558                                 '/(expression|tps*:\/\/|url\\s*\().*/is',
 559                                         Sanitizer::decodeCharReferences( $value ) ) ) {
 560                                 # haxx0r
 561                                 continue;
 562                         }
 563
 564                         # Templates and links may be expanded in later parsing,
 565                         # creating invalid or dangerous output. Suppress this.
 566                         $value = strtr( $value, array(
 567                                 '{'    => '&#123;',
 568                                 '['    => '&#91;',
 569                                 "''"   => '&#39;&#39;',
 570                                 'ISBN' => '&#73;SBN',
 571                                 'RFC'  => '&#82;FC',
 572                                 'PMID' => '&#80;MID',
 573                         ) );
 574
 575                         # Stupid hack
 576                         $value = preg_replace_callback(
 577                                 '/(' . $wgUrlProtocols . ')/',
 578                                 array( 'Sanitizer', 'armorLinksCallback' ),
 579                                 $value );
 580
 581                         // If this attribute was previously set, override it.
 582                         // Output should only have one attribute of each name.
 583                         $attribs[$attribute] = "$attribute=\"$value\"";
 584                 }
 585                 if( empty( $attribs ) ) {
 586                         return '';
 587                 } else {
 588                         return ' ' . implode( ' ', $attribs );
 589                 }
 590         }
 591
 592         /**
 593          * Regex replace callback for armoring links against further processing.
 594          * @param array $matches
 595          * @return string
 596          * @access private
 597          */
 598         function armorLinksCallback( $matches ) {
 599                 return str_replace( ':', '&#58;', $matches[1] );
 600         }
 601
 602         /**
         * Return an associative array of attribute names and values from
         * a partial tag string. Attribute names are forced to lowercase,
         * character references are decoded to UTF-8 text.
 606          *
         * @param string $text
 608          * @return array
 609          */
 610         function decodeTagAttributes( $text ) {
 611                 $attribs = array();
 612
 613                 if( trim( $text ) == '' ) {
 614                         return $attribs;
 615                 }
 616
 617                 if( !preg_match_all(
 618                         MW_ATTRIBS_REGEX,
 619                         $text,
 620                         $pairs,
 621                         PREG_SET_ORDER ) ) {
 622                         return $attribs;
 623                 }
 624
 625                 foreach( $pairs as $set ) {
 626                         $attribute = strtolower( $set[1] );
 627                         $value = Sanitizer::getTagAttributeCallback( $set );
 628                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 629                 }
 630                 return $attribs;
 631         }
 632
 633         /**
 634          * Pick the appropriate attribute value from a match set from the
 635          * MW_ATTRIBS_REGEX matches.
 636          *
 637          * @param array $set
 638          * @return string
 639          * @access private
 640          */
 641         function getTagAttributeCallback( $set ) {
 642                 if( isset( $set[6] ) ) {
 643                         # Illegal #XXXXXX color with no quotes.
 644                         return $set[6];
 645                 } elseif( isset( $set[5] ) ) {
 646                         # No quotes.
 647                         return $set[5];
 648                 } elseif( isset( $set[4] ) ) {
 649                         # Single-quoted
 650                         return $set[4];
 651                 } elseif( isset( $set[3] ) ) {
 652                         # Double-quoted
 653                         return $set[3];
 654                 } elseif( !isset( $set[2] ) ) {
 655                         # In XHTML, attributes must have a value.
 656                         # For 'reduced' form, return explicitly the attribute name here.
 657                         return $set[1];
 658                 } else {
 659                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 660                 }
 661         }
 662
 663         /**
 664          * Normalize whitespace and character references in an XML source-
 665          * encoded text for an attribute value.
 666          *
 667          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 668          * but note that we're not returning the value, but are returning
 669          * XML source fragments that will be slapped into output.
 670          *
 671          * @param string $text
 672          * @return string
 673          * @access private
 674          */
 675         function normalizeAttributeValue( $text ) {
 676                 return str_replace( '"', '&quot;',
 677                         preg_replace(
 678                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 679                                 ' ',
 680                                 Sanitizer::normalizeCharReferences( $text ) ) );
 681         }
 682
 683         /**
 684          * Ensure that any entities and character references are legal
 685          * for XML and XHTML specifically. Any stray bits will be
 686          * &amp;-escaped to result in a valid text fragment.
 687          *
 688          * a. any named char refs must be known in XHTML
 689          * b. any numeric char refs must be legal chars, not invalid or forbidden
 690          * c. use &#x, not &#X
 691          * d. fix or reject non-valid attributes
 692          *
 693          * @param string $text
 694          * @return string
 695          * @access private
 696          */
 697         function normalizeCharReferences( $text ) {
 698                 return preg_replace_callback(
 699                         MW_CHAR_REFS_REGEX,
 700                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 701                         $text );
 702         }
 703         /**
         * @param array $matches
 705          * @return string
 706          */
 707         function normalizeCharReferencesCallback( $matches ) {
 708                 $ret = null;
 709                 if( $matches[1] != '' ) {
 710                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 711                 } elseif( $matches[2] != '' ) {
 712                         $ret = Sanitizer::decCharReference( $matches[2] );
 713                 } elseif( $matches[3] != ''  ) {
 714                         $ret = Sanitizer::hexCharReference( $matches[3] );
 715                 } elseif( $matches[4] != '' ) {
 716                         $ret = Sanitizer::hexCharReference( $matches[4] );
 717                 }
 718                 if( is_null( $ret ) ) {
 719                         return htmlspecialchars( $matches[0] );
 720                 } else {
 721                         return $ret;
 722                 }
 723         }
 724
 725         /**
 726          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 727          * return the named entity reference as is. Otherwise, returns
 728          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 729          *
 730          * @param string $name
 731          * @return string
 732          */
 733         function normalizeEntity( $name ) {
 734                 global $wgHtmlEntities;
 735                 if( isset( $wgHtmlEntities[$name] ) ) {
 736                         return "&$name;";
 737                 } else {
 738                         return "&amp;$name;";
 739                 }
 740         }
 741
 742         function decCharReference( $codepoint ) {
 743                 $point = intval( $codepoint );
 744                 if( Sanitizer::validateCodepoint( $point ) ) {
 745                         return sprintf( '&#%d;', $point );
 746                 } else {
 747                         return null;
 748                 }
 749         }
 750
 751         function hexCharReference( $codepoint ) {
 752                 $point = hexdec( $codepoint );
 753                 if( Sanitizer::validateCodepoint( $point ) ) {
 754                         return sprintf( '&#x%x;', $point );
 755                 } else {
 756                         return null;
 757                 }
 758         }
 759
 760         /**
 761          * Returns true if a given Unicode codepoint is a valid character in XML.
 762          * @param int $codepoint
 763          * @return bool
 764          */
 765         function validateCodepoint( $codepoint ) {
 766                 return ($codepoint ==    0x09)
 767                         || ($codepoint ==    0x0a)
 768                         || ($codepoint ==    0x0d)
 769                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 770                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 771                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 772         }
 773
 774         /**
 775          * Decode any character references, numeric or named entities,
 776          * in the text and return a UTF-8 string.
 777          *
 778          * @param string $text
 779          * @return string
 780          * @access public
 781          */
 782         function decodeCharReferences( $text ) {
 783                 return preg_replace_callback(
 784                         MW_CHAR_REFS_REGEX,
 785                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 786                         $text );
 787         }
 788
 789         /**
 790          * @param array $matches
 791          * @return string
 792          */
 793         function decodeCharReferencesCallback( $matches ) {
 794                 if( $matches[1] != '' ) {
 795                         return Sanitizer::decodeEntity( $matches[1] );
 796                 } elseif( $matches[2] != '' ) {
 797                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 798                 } elseif( $matches[3] != ''  ) {
 799                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 800                 } elseif( $matches[4] != '' ) {
 801                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 802                 }
 803                 # Last case should be an ampersand by itself
 804                 return $matches[0];
 805         }
 806
 807         /**
 808          * Return UTF-8 string for a codepoint if that is a valid
 809          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 810          * @param int $codepoint
 811          * @return string
 812          * @access private
 813          */
 814         function decodeChar( $codepoint ) {
 815                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 816                         return codepointToUtf8( $codepoint );
 817                 } else {
 818                         return UTF8_REPLACEMENT;
 819                 }
 820         }
 821
 822         /**
 823          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 824          * return the UTF-8 encoding of that character. Otherwise, returns
 825          * pseudo-entity source (eg &foo;)
 826          *
 827          * @param string $name
 828          * @return string
 829          */
 830         function decodeEntity( $name ) {
 831                 global $wgHtmlEntities;
 832                 if( isset( $wgHtmlEntities[$name] ) ) {
 833                         return codepointToUtf8( $wgHtmlEntities[$name] );
 834                 } else {
 835                         return "&$name;";
 836                 }
 837         }
 838
 839         /**
 840          * Fetch the whitelist of acceptable attributes for a given
 841          * element name.
 842          *
 843          * @param string $element
 844          * @return array
 845          */
 846         function attributeWhitelist( $element ) {
 847                 static $list;
 848                 if( !isset( $list ) ) {
 849                         $list = Sanitizer::setupAttributeWhitelist();
 850                 }
 851                 return isset( $list[$element] )
 852                         ? $list[$element]
 853                         : array();
 854         }
 855
 856         /**
 857          * @return array
 858          */
 859         function setupAttributeWhitelist() {
 860                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 861                 $block = array_merge( $common, array( 'align' ) );
 862                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 863                 $tablecell = array( 'abbr',
 864                                     'axis',
 865                                     'headers',
 866                                     'scope',
 867                                     'rowspan',
 868                                     'colspan',
 869                                     'nowrap', # deprecated
 870                                     'width',  # deprecated
 871                                     'height', # deprecated
 872                                     'bgcolor' # deprecated
 873                                     );
 874
 875                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 876                 # See: http://www.w3.org/TR/html4/
 877                 $whitelist = array (
 878                         # 7.5.4
 879                         'div'        => $block,
 880                         'center'     => $common, # deprecated
 881                         'span'       => $block, # ??
 882
 883                         # 7.5.5
 884                         'h1'         => $block,
 885                         'h2'         => $block,
 886                         'h3'         => $block,
 887                         'h4'         => $block,
 888                         'h5'         => $block,
 889                         'h6'         => $block,
 890
 891                         # 7.5.6
 892                         # address
 893
 894                         # 8.2.4
 895                         # bdo
 896
 897                         # 9.2.1
 898                         'em'         => $common,
 899                         'strong'     => $common,
 900                         'cite'       => $common,
 901                         # dfn
 902                         'code'       => $common,
 903                         # samp
 904                         # kbd
 905                         'var'        => $common,
 906                         # abbr
 907                         # acronym
 908
 909                         # 9.2.2
 910                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 911                         # q
 912
 913                         # 9.2.3
 914                         'sub'        => $common,
 915                         'sup'        => $common,
 916
 917                         # 9.3.1
 918                         'p'          => $block,
 919
 920                         # 9.3.2
 921                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 922
 923                         # 9.3.4
 924                         'pre'        => array_merge( $common, array( 'width' ) ),
 925
 926                         # 9.4
 927                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 928                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 929
 930                         # 10.2
 931                         'ul'         => array_merge( $common, array( 'type' ) ),
 932                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 933                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 934
 935                         # 10.3
 936                         'dl'         => $common,
 937                         'dd'         => $common,
 938                         'dt'         => $common,
 939
 940                         # 11.2.1
 941                         'table'      => array_merge( $common,
 942                                                                 array( 'summary', 'width', 'border', 'frame',
 943                                                                                          'rules', 'cellspacing', 'cellpadding',
 944                                                                                          'align', 'bgcolor', 'frame', 'rules',
 945                                                                                          'border' ) ),
 946
 947                         # 11.2.2
 948                         'caption'    => array_merge( $common, array( 'align' ) ),
 949
 950                         # 11.2.3
 951                         'thead'      => array_merge( $common, $tablealign ),
 952                         'tfoot'      => array_merge( $common, $tablealign ),
 953                         'tbody'      => array_merge( $common, $tablealign ),
 954
 955                         # 11.2.4
 956                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 957                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 958
 959                         # 11.2.5
 960                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 961
 962                         # 11.2.6
 963                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 964                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 965
 966                         # 15.2.1
 967                         'tt'         => $common,
 968                         'b'          => $common,
 969                         'i'          => $common,
 970                         'big'        => $common,
 971                         'small'      => $common,
 972                         'strike'     => $common,
 973                         's'          => $common,
 974                         'u'          => $common,
 975
 976                         # 15.2.2
 977                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 978                         # basefont
 979
 980                         # 15.3
 981                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 982
 983                         # XHTML Ruby annotation text module, simple ruby only.
 984                         # http://www.w3c.org/TR/ruby/
 985                         'ruby'       => $common,
 986                         # rbc
 987                         # rtc
 988                         'rb'         => $common,
 989                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 990                         'rp'         => $common,
 991                         );
 992                 return $whitelist;
 993         }
 994
 995         /**
 996          * Take a fragment of (potentially invalid) HTML and return
 997          * a version with any tags removed, encoded suitably for literal
 998          * inclusion in an attribute value.
 999          *
1000          * @param string $text HTML fragment
1001          * @return string
1002          */
1003         function stripAllTags( $text ) {
1004                 # Actual <tags>
1005                 $text = preg_replace( '/<[^>]*>/', '', $text );
1006
1007                 # Normalize &entities and whitespace
1008                 $text = Sanitizer::normalizeAttributeValue( $text );
1009
1010                 # Will be placed into "double-quoted" attributes,
1011                 # make sure remaining bits are safe.
1012                 $text = str_replace(
1013                         array('<', '>', '"'),
1014                         array('&lt;', '&gt;', '&quot;'),
1015                         $text );
1016
1017                 return $text;
1018         }
1019
1020 }
1021
1022 ?>