includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
  43 $attrib = '[A-Za-z0-9]';
  44 $space = '[\x09\x0a\x0d\x20]';
  45 define( 'MW_ATTRIBS_REGEX',
  46         "/(?:^|$space)($attrib+)
  47           ($space*=$space*
  48                 (?:
  49                  # The attribute value: quoted or alone
  50                   \"([^<\"]*)\"
  51                  | '([^<']*)'
  52                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  53                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  54                                                          # colors are specified like this.
  55                                                          # We'll be normalizing it.
  56                 )
  57            )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @private
  63  */
  64 global $wgHtmlEntities;
  65 $wgHtmlEntities = array(
  66         'Aacute'   => 193,
  67         'aacute'   => 225,
  68         'Acirc'    => 194,
  69         'acirc'    => 226,
  70         'acute'    => 180,
  71         'AElig'    => 198,
  72         'aelig'    => 230,
  73         'Agrave'   => 192,
  74         'agrave'   => 224,
  75         'alefsym'  => 8501,
  76         'Alpha'    => 913,
  77         'alpha'    => 945,
  78         'amp'      => 38,
  79         'and'      => 8743,
  80         'ang'      => 8736,
  81         'Aring'    => 197,
  82         'aring'    => 229,
  83         'asymp'    => 8776,
  84         'Atilde'   => 195,
  85         'atilde'   => 227,
  86         'Auml'     => 196,
  87         'auml'     => 228,
  88         'bdquo'    => 8222,
  89         'Beta'     => 914,
  90         'beta'     => 946,
  91         'brvbar'   => 166,
  92         'bull'     => 8226,
  93         'cap'      => 8745,
  94         'Ccedil'   => 199,
  95         'ccedil'   => 231,
  96         'cedil'    => 184,
  97         'cent'     => 162,
  98         'Chi'      => 935,
  99         'chi'      => 967,
 100         'circ'     => 710,
 101         'clubs'    => 9827,
 102         'cong'     => 8773,
 103         'copy'     => 169,
 104         'crarr'    => 8629,
 105         'cup'      => 8746,
 106         'curren'   => 164,
 107         'dagger'   => 8224,
 108         'Dagger'   => 8225,
 109         'darr'     => 8595,
 110         'dArr'     => 8659,
 111         'deg'      => 176,
 112         'Delta'    => 916,
 113         'delta'    => 948,
 114         'diams'    => 9830,
 115         'divide'   => 247,
 116         'Eacute'   => 201,
 117         'eacute'   => 233,
 118         'Ecirc'    => 202,
 119         'ecirc'    => 234,
 120         'Egrave'   => 200,
 121         'egrave'   => 232,
 122         'empty'    => 8709,
 123         'emsp'     => 8195,
 124         'ensp'     => 8194,
 125         'Epsilon'  => 917,
 126         'epsilon'  => 949,
 127         'equiv'    => 8801,
 128         'Eta'      => 919,
 129         'eta'      => 951,
 130         'ETH'      => 208,
 131         'eth'      => 240,
 132         'Euml'     => 203,
 133         'euml'     => 235,
 134         'euro'     => 8364,
 135         'exist'    => 8707,
 136         'fnof'     => 402,
 137         'forall'   => 8704,
 138         'frac12'   => 189,
 139         'frac14'   => 188,
 140         'frac34'   => 190,
 141         'frasl'    => 8260,
 142         'Gamma'    => 915,
 143         'gamma'    => 947,
 144         'ge'       => 8805,
 145         'gt'       => 62,
 146         'harr'     => 8596,
 147         'hArr'     => 8660,
 148         'hearts'   => 9829,
 149         'hellip'   => 8230,
 150         'Iacute'   => 205,
 151         'iacute'   => 237,
 152         'Icirc'    => 206,
 153         'icirc'    => 238,
 154         'iexcl'    => 161,
 155         'Igrave'   => 204,
 156         'igrave'   => 236,
 157         'image'    => 8465,
 158         'infin'    => 8734,
 159         'int'      => 8747,
 160         'Iota'     => 921,
 161         'iota'     => 953,
 162         'iquest'   => 191,
 163         'isin'     => 8712,
 164         'Iuml'     => 207,
 165         'iuml'     => 239,
 166         'Kappa'    => 922,
 167         'kappa'    => 954,
 168         'Lambda'   => 923,
 169         'lambda'   => 955,
 170         'lang'     => 9001,
 171         'laquo'    => 171,
 172         'larr'     => 8592,
 173         'lArr'     => 8656,
 174         'lceil'    => 8968,
 175         'ldquo'    => 8220,
 176         'le'       => 8804,
 177         'lfloor'   => 8970,
 178         'lowast'   => 8727,
 179         'loz'      => 9674,
 180         'lrm'      => 8206,
 181         'lsaquo'   => 8249,
 182         'lsquo'    => 8216,
 183         'lt'       => 60,
 184         'macr'     => 175,
 185         'mdash'    => 8212,
 186         'micro'    => 181,
 187         'middot'   => 183,
 188         'minus'    => 8722,
 189         'Mu'       => 924,
 190         'mu'       => 956,
 191         'nabla'    => 8711,
 192         'nbsp'     => 160,
 193         'ndash'    => 8211,
 194         'ne'       => 8800,
 195         'ni'       => 8715,
 196         'not'      => 172,
 197         'notin'    => 8713,
 198         'nsub'     => 8836,
 199         'Ntilde'   => 209,
 200         'ntilde'   => 241,
 201         'Nu'       => 925,
 202         'nu'       => 957,
 203         'Oacute'   => 211,
 204         'oacute'   => 243,
 205         'Ocirc'    => 212,
 206         'ocirc'    => 244,
 207         'OElig'    => 338,
 208         'oelig'    => 339,
 209         'Ograve'   => 210,
 210         'ograve'   => 242,
 211         'oline'    => 8254,
 212         'Omega'    => 937,
 213         'omega'    => 969,
 214         'Omicron'  => 927,
 215         'omicron'  => 959,
 216         'oplus'    => 8853,
 217         'or'       => 8744,
 218         'ordf'     => 170,
 219         'ordm'     => 186,
 220         'Oslash'   => 216,
 221         'oslash'   => 248,
 222         'Otilde'   => 213,
 223         'otilde'   => 245,
 224         'otimes'   => 8855,
 225         'Ouml'     => 214,
 226         'ouml'     => 246,
 227         'para'     => 182,
 228         'part'     => 8706,
 229         'permil'   => 8240,
 230         'perp'     => 8869,
 231         'Phi'      => 934,
 232         'phi'      => 966,
 233         'Pi'       => 928,
 234         'pi'       => 960,
 235         'piv'      => 982,
 236         'plusmn'   => 177,
 237         'pound'    => 163,
 238         'prime'    => 8242,
 239         'Prime'    => 8243,
 240         'prod'     => 8719,
 241         'prop'     => 8733,
 242         'Psi'      => 936,
 243         'psi'      => 968,
 244         'quot'     => 34,
 245         'radic'    => 8730,
 246         'rang'     => 9002,
 247         'raquo'    => 187,
 248         'rarr'     => 8594,
 249         'rArr'     => 8658,
 250         'rceil'    => 8969,
 251         'rdquo'    => 8221,
 252         'real'     => 8476,
 253         'reg'      => 174,
 254         'rfloor'   => 8971,
 255         'Rho'      => 929,
 256         'rho'      => 961,
 257         'rlm'      => 8207,
 258         'rsaquo'   => 8250,
 259         'rsquo'    => 8217,
 260         'sbquo'    => 8218,
 261         'Scaron'   => 352,
 262         'scaron'   => 353,
 263         'sdot'     => 8901,
 264         'sect'     => 167,
 265         'shy'      => 173,
 266         'Sigma'    => 931,
 267         'sigma'    => 963,
 268         'sigmaf'   => 962,
 269         'sim'      => 8764,
 270         'spades'   => 9824,
 271         'sub'      => 8834,
 272         'sube'     => 8838,
 273         'sum'      => 8721,
 274         'sup'      => 8835,
 275         'sup1'     => 185,
 276         'sup2'     => 178,
 277         'sup3'     => 179,
 278         'supe'     => 8839,
 279         'szlig'    => 223,
 280         'Tau'      => 932,
 281         'tau'      => 964,
 282         'there4'   => 8756,
 283         'Theta'    => 920,
 284         'theta'    => 952,
 285         'thetasym' => 977,
 286         'thinsp'   => 8201,
 287         'THORN'    => 222,
 288         'thorn'    => 254,
 289         'tilde'    => 732,
 290         'times'    => 215,
 291         'trade'    => 8482,
 292         'Uacute'   => 218,
 293         'uacute'   => 250,
 294         'uarr'     => 8593,
 295         'uArr'     => 8657,
 296         'Ucirc'    => 219,
 297         'ucirc'    => 251,
 298         'Ugrave'   => 217,
 299         'ugrave'   => 249,
 300         'uml'      => 168,
 301         'upsih'    => 978,
 302         'Upsilon'  => 933,
 303         'upsilon'  => 965,
 304         'Uuml'     => 220,
 305         'uuml'     => 252,
 306         'weierp'   => 8472,
 307         'Xi'       => 926,
 308         'xi'       => 958,
 309         'Yacute'   => 221,
 310         'yacute'   => 253,
 311         'yen'      => 165,
 312         'Yuml'     => 376,
 313         'yuml'     => 255,
 314         'Zeta'     => 918,
 315         'zeta'     => 950,
 316         'zwj'      => 8205,
 317         'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr',
 355                         );
 356                         $htmllist = array( # Tags used by list
 357                                 'ul','ol',
 358                         );
 359                         $listtags = array( # Tags that can appear in a list
 360                                 'li',
 361                         );
 362
 363                 } else {
 364                         $htmlpairs = array();
 365                         $htmlsingle = array();
 366                         $htmlnest = array();
 367                         $tabletags = array();
 368                 }
 369
 370                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
 371                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
 372
 373                 # Remove HTML comments
 374                 $text = Sanitizer::removeHTMLcomments( $text );
 375                 $bits = explode( '<', $text );
 376                 $text = array_shift( $bits );
 377                 if(!$wgUseTidy) {
 378                         $tagstack = array(); $tablestack = array();
 379                         foreach ( $bits as $x ) {
 380                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 381                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 382                                 $x, $regs );
 383                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 384                                 error_reporting( $prev );
 385
 386                                 $badtag = 0 ;
 387                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 388                                         # Check our stack
 389                                         if ( $slash ) {
 390                                                 # Closing a tag...
 391                                                 if( in_array( $t, $htmlsingleonly ) ) {
 392                                                         $badtag = 1;
 393                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 394                                                         @array_push( $tagstack, $ot );
 395                                                         # <li> can be nested in <ul> or <ol>, skip those cases:
 396                                                         if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
 397                                                                 $badtag = 1;
 398                                                         }
 399                                                 } else {
 400                                                         if ( $t == 'table' ) {
 401                                                                 $tagstack = array_pop( $tablestack );
 402                                                         }
 403                                                         $newparams = '';
 404                                                 }
 405                                         } else {
 406                                                 # Keep track for later
 407                                                 if ( in_array( $t, $tabletags ) &&
 408                                                 ! in_array( 'table', $tagstack ) ) {
 409                                                         $badtag = 1;
 410                                                 } else if ( in_array( $t, $tagstack ) &&
 411                                                 ! in_array ( $t , $htmlnest ) ) {
 412                                                         $badtag = 1 ;
 413                                                 # Is it a self closed htmlpair ? (bug 5487)
 414                                                 } else if( $brace == '/>' &&
 415                                                 in_array($t, $htmlpairs) ) {
 416                                                         $badtag = 1;
 417                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 418                                                         # Hack to force empty tag for uncloseable elements
 419                                                         $brace = '/>';
 420                                                 } else if( in_array( $t, $htmlsingle ) ) {
 421                                                         # Hack to not close $htmlsingle tags
 422                                                         $brace = NULL;
 423                                                 } else {
 424                                                         if ( $t == 'table' ) {
 425                                                                 array_push( $tablestack, $tagstack );
 426                                                                 $tagstack = array();
 427                                                         }
 428                                                         array_push( $tagstack, $t );
 429                                                 }
 430
 431                                                 # Replace any variables or template parameters with
 432                                                 # plaintext results.
 433                                                 if( is_callable( $processCallback ) ) {
 434                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 435                                                 }
 436
 437                                                 # Strip non-approved attributes from the tag
 438                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 439                                         }
 440                                         if ( ! $badtag ) {
 441                                                 $rest = str_replace( '>', '&gt;', $rest );
 442                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 443                                                 $text .= "<$slash$t$newparams$close>$rest";
 444                                                 continue;
 445                                         }
 446                                 }
 447                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 448                         }
 449                         # Close off any remaining tags
 450                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 451                                 $text .= "</$t>\n";
 452                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 453                         }
 454                 } else {
 455                         # this might be possible using tidy itself
 456                         foreach ( $bits as $x ) {
 457                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 458                                 $x, $regs );
 459                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 460                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 461                                         if( is_callable( $processCallback ) ) {
 462                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 463                                         }
 464                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 465                                         $rest = str_replace( '>', '&gt;', $rest );
 466                                         $text .= "<$slash$t$newparams$brace$rest";
 467                                 } else {
 468                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 469                                 }
 470                         }
 471                 }
 472                 wfProfileOut( $fname );
 473                 return $text;
 474         }
 475
 476         /**
 477          * Remove '<!--', '-->', and everything between.
 478          * To avoid leaving blank lines, when a comment is both preceded
 479          * and followed by a newline (ignoring spaces), trim leading and
 480          * trailing spaces and one of the newlines.
 481          *
 482          * @private
 483          * @param string $text
 484          * @return string
 485          */
 486         function removeHTMLcomments( $text ) {
 487                 $fname='Parser::removeHTMLcomments';
 488                 wfProfileIn( $fname );
 489                 while (($start = strpos($text, '<!--')) !== false) {
 490                         $end = strpos($text, '-->', $start + 4);
 491                         if ($end === false) {
 492                                 # Unterminated comment; bail out
 493                                 break;
 494                         }
 495
 496                         $end += 3;
 497
 498                         # Trim space and newline if the comment is both
 499                         # preceded and followed by a newline
 500                         $spaceStart = max($start - 1, 0);
 501                         $spaceLen = $end - $spaceStart;
 502                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 503                                 $spaceStart--;
 504                                 $spaceLen++;
 505                         }
 506                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 507                                 $spaceLen++;
 508                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 509                                 # Remove the comment, leading and trailing
 510                                 # spaces, and leave only one newline.
 511                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 512                         }
 513                         else {
 514                                 # Remove just the comment.
 515                                 $text = substr_replace($text, '', $start, $end - $start);
 516                         }
 517                 }
 518                 wfProfileOut( $fname );
 519                 return $text;
 520         }
 521
 522         /**
 523          * Take a tag soup fragment listing an HTML element's attributes
 524          * and normalize it to well-formed XML, discarding unwanted attributes.
 525          *
 526          * - Normalizes attribute names to lowercase
 527          * - Discards attributes not on a whitelist for the given element
 528          * - Turns broken or invalid entities into plaintext
 529          * - Double-quotes all attribute values
 530          * - Attributes without values are given the name as attribute
 531          * - Double attributes are discarded
 532          * - Unsafe style attributes are discarded
 533          * - Prepends space if there are attributes.
 534          *
 535          * @param string $text
 536          * @param string $element
 537          * @return string
 538          *
 539          * @todo Check for legal values where the DTD limits things.
 540          * @todo Check for unique id attribute :P
 541          */
 542         function fixTagAttributes( $text, $element ) {
 543                 if( trim( $text ) == '' ) {
 544                         return '';
 545                 }
 546
 547                 # Unquoted attribute
 548                 # Since we quote this later, this can be anything distinguishable
 549                 # from the end of the attribute
 550                 $pairs = array();
 551                 if( !preg_match_all(
 552                         MW_ATTRIBS_REGEX,
 553                         $text,
 554                         $pairs,
 555                         PREG_SET_ORDER ) ) {
 556                         return '';
 557                 }
 558
 559                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 560                 $attribs = array();
 561                 foreach( $pairs as $set ) {
 562                         $attribute = strtolower( $set[1] );
 563                         if( !isset( $whitelist[$attribute] ) ) {
 564                                 continue;
 565                         }
 566
 567                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 568                         $value = Sanitizer::normalizeAttributeValue( $raw );
 569
 570                         # Strip javascript "expression" from stylesheets.
 571                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 572                         if( $attribute == 'style' ) {
 573                                 $stripped = Sanitizer::decodeCharReferences( $value );
 574
 575                                 // Remove any comments; IE gets token splitting wrong
 576                                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 577                                 $value = htmlspecialchars( $stripped );
 578
 579                                 // ... and continue checks
 580                                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 581                                         'codepointToUtf8(hexdec("$1"))', $stripped );
 582                                 $stripped = str_replace( '\\', '', $stripped );
 583                                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 584                                                 $stripped ) ) {
 585                                         # haxx0r
 586                                         continue;
 587                                 }
 588                         }
 589
 590                         if ( $attribute === 'id' )
 591                                 $value = Sanitizer::escapeId( $value );
 592
 593                         # Templates and links may be expanded in later parsing,
 594                         # creating invalid or dangerous output. Suppress this.
 595                         $value = strtr( $value, array(
 596                                 '<'    => '&lt;',   // This should never happen,
 597                                 '>'    => '&gt;',   // we've received invalid input
 598                                 '"'    => '&quot;', // which should have been escaped.
 599                                 '{'    => '&#123;',
 600                                 '['    => '&#91;',
 601                                 "''"   => '&#39;&#39;',
 602                                 'ISBN' => '&#73;SBN',
 603                                 'RFC'  => '&#82;FC',
 604                                 'PMID' => '&#80;MID',
 605                         ) );
 606
 607                         # Stupid hack
 608                         $value = preg_replace_callback(
 609                                 '/(' . wfUrlProtocols() . ')/',
 610                                 array( 'Sanitizer', 'armorLinksCallback' ),
 611                                 $value );
 612
 613                         // If this attribute was previously set, override it.
 614                         // Output should only have one attribute of each name.
 615                         $attribs[$attribute] = "$attribute=\"$value\"";
 616                 }
 617
 618                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 619         }
 620
 621         /**
 622          * Given a value escape it so that it can be used in an id attribute and
 623          * return it, this does not validate the value however (see first link)
 624          *
 625          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 626          *                                                          in the id and
 627          *                                                          name attributes
 628          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 629          *
 630          * @bug 4461
 631          *
 632          * @static
 633          *
 634          * @param string $id
 635          * @return string
 636          */
 637         function escapeId( $id ) {
 638                 static $replace = array(
 639                         '%3A' => ':',
 640                         '%' => '.'
 641                 );
 642
 643                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 644
 645                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 646         }
 647
 648         /**
 649          * Regex replace callback for armoring links against further processing.
 650          * @param array $matches
 651          * @return string
 652          * @private
 653          */
 654         function armorLinksCallback( $matches ) {
 655                 return str_replace( ':', '&#58;', $matches[1] );
 656         }
 657
 658         /**
 659          * Return an associative array of attribute names and values from
  660          * a partial tag string. Attribute names are forced to lowercase,
 661          * character references are decoded to UTF-8 text.
 662          *
 663          * @param string
 664          * @return array
 665          */
 666         function decodeTagAttributes( $text ) {
 667                 $attribs = array();
 668
 669                 if( trim( $text ) == '' ) {
 670                         return $attribs;
 671                 }
 672
 673                 $pairs = array();
 674                 if( !preg_match_all(
 675                         MW_ATTRIBS_REGEX,
 676                         $text,
 677                         $pairs,
 678                         PREG_SET_ORDER ) ) {
 679                         return $attribs;
 680                 }
 681
 682                 foreach( $pairs as $set ) {
 683                         $attribute = strtolower( $set[1] );
 684                         $value = Sanitizer::getTagAttributeCallback( $set );
 685                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 686                 }
 687                 return $attribs;
 688         }
 689
 690         /**
 691          * Pick the appropriate attribute value from a match set from the
 692          * MW_ATTRIBS_REGEX matches.
 693          *
 694          * @param array $set
 695          * @return string
 696          * @private
 697          */
 698         function getTagAttributeCallback( $set ) {
 699                 if( isset( $set[6] ) ) {
 700                         # Illegal #XXXXXX color with no quotes.
 701                         return $set[6];
 702                 } elseif( isset( $set[5] ) ) {
 703                         # No quotes.
 704                         return $set[5];
 705                 } elseif( isset( $set[4] ) ) {
 706                         # Single-quoted
 707                         return $set[4];
 708                 } elseif( isset( $set[3] ) ) {
 709                         # Double-quoted
 710                         return $set[3];
 711                 } elseif( !isset( $set[2] ) ) {
 712                         # In XHTML, attributes must have a value.
 713                         # For 'reduced' form, return explicitly the attribute name here.
 714                         return $set[1];
 715                 } else {
 716                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 717                 }
 718         }
 719
 720         /**
 721          * Normalize whitespace and character references in an XML source-
 722          * encoded text for an attribute value.
 723          *
 724          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 725          * but note that we're not returning the value, but are returning
 726          * XML source fragments that will be slapped into output.
 727          *
 728          * @param string $text
 729          * @return string
 730          * @private
 731          */
 732         function normalizeAttributeValue( $text ) {
 733                 return str_replace( '"', '&quot;',
 734                         preg_replace(
 735                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 736                                 ' ',
 737                                 Sanitizer::normalizeCharReferences( $text ) ) );
 738         }
 739
 740         /**
 741          * Ensure that any entities and character references are legal
 742          * for XML and XHTML specifically. Any stray bits will be
 743          * &amp;-escaped to result in a valid text fragment.
 744          *
 745          * a. any named char refs must be known in XHTML
 746          * b. any numeric char refs must be legal chars, not invalid or forbidden
 747          * c. use &#x, not &#X
 748          * d. fix or reject non-valid attributes
 749          *
 750          * @param string $text
 751          * @return string
 752          * @private
 753          */
 754         function normalizeCharReferences( $text ) {
 755                 return preg_replace_callback(
 756                         MW_CHAR_REFS_REGEX,
 757                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 758                         $text );
 759         }
 760         /**
 761          * @param string $matches
 762          * @return string
 763          */
 764         function normalizeCharReferencesCallback( $matches ) {
 765                 $ret = null;
 766                 if( $matches[1] != '' ) {
 767                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 768                 } elseif( $matches[2] != '' ) {
 769                         $ret = Sanitizer::decCharReference( $matches[2] );
 770                 } elseif( $matches[3] != ''  ) {
 771                         $ret = Sanitizer::hexCharReference( $matches[3] );
 772                 } elseif( $matches[4] != '' ) {
 773                         $ret = Sanitizer::hexCharReference( $matches[4] );
 774                 }
 775                 if( is_null( $ret ) ) {
 776                         return htmlspecialchars( $matches[0] );
 777                 } else {
 778                         return $ret;
 779                 }
 780         }
 781
 782         /**
 783          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 784          * return the named entity reference as is. Otherwise, returns
 785          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 786          *
 787          * @param string $name
 788          * @return string
 789          */
 790         function normalizeEntity( $name ) {
 791                 global $wgHtmlEntities;
 792                 if( isset( $wgHtmlEntities[$name] ) ) {
 793                         return "&$name;";
 794                 } else {
 795                         return "&amp;$name;";
 796                 }
 797         }
 798
 799         function decCharReference( $codepoint ) {
 800                 $point = intval( $codepoint );
 801                 if( Sanitizer::validateCodepoint( $point ) ) {
 802                         return sprintf( '&#%d;', $point );
 803                 } else {
 804                         return null;
 805                 }
 806         }
 807
 808         function hexCharReference( $codepoint ) {
 809                 $point = hexdec( $codepoint );
 810                 if( Sanitizer::validateCodepoint( $point ) ) {
 811                         return sprintf( '&#x%x;', $point );
 812                 } else {
 813                         return null;
 814                 }
 815         }
 816
 817         /**
 818          * Returns true if a given Unicode codepoint is a valid character in XML.
 819          * @param int $codepoint
 820          * @return bool
 821          */
 822         function validateCodepoint( $codepoint ) {
 823                 return ($codepoint ==    0x09)
 824                         || ($codepoint ==    0x0a)
 825                         || ($codepoint ==    0x0d)
 826                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 827                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 828                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 829         }
 830
 831         /**
 832          * Decode any character references, numeric or named entities,
 833          * in the text and return a UTF-8 string.
 834          *
 835          * @param string $text
 836          * @return string
 837          * @public
 838          */
 839         function decodeCharReferences( $text ) {
 840                 return preg_replace_callback(
 841                         MW_CHAR_REFS_REGEX,
 842                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 843                         $text );
 844         }
 845
 846         /**
 847          * @param string $matches
 848          * @return string
 849          */
 850         function decodeCharReferencesCallback( $matches ) {
 851                 if( $matches[1] != '' ) {
 852                         return Sanitizer::decodeEntity( $matches[1] );
 853                 } elseif( $matches[2] != '' ) {
 854                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 855                 } elseif( $matches[3] != ''  ) {
 856                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 857                 } elseif( $matches[4] != '' ) {
 858                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 859                 }
 860                 # Last case should be an ampersand by itself
 861                 return $matches[0];
 862         }
 863
 864         /**
 865          * Return UTF-8 string for a codepoint if that is a valid
 866          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 867          * @param int $codepoint
 868          * @return string
 869          * @private
 870          */
 871         function decodeChar( $codepoint ) {
 872                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 873                         return codepointToUtf8( $codepoint );
 874                 } else {
 875                         return UTF8_REPLACEMENT;
 876                 }
 877         }
 878
 879         /**
 880          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 881          * return the UTF-8 encoding of that character. Otherwise, returns
 882          * pseudo-entity source (eg &foo;)
 883          *
 884          * @param string $name
 885          * @return string
 886          */
 887         function decodeEntity( $name ) {
 888                 global $wgHtmlEntities;
 889                 if( isset( $wgHtmlEntities[$name] ) ) {
 890                         return codepointToUtf8( $wgHtmlEntities[$name] );
 891                 } else {
 892                         return "&$name;";
 893                 }
 894         }
 895
 896         /**
 897          * Fetch the whitelist of acceptable attributes for a given
 898          * element name.
 899          *
 900          * @param string $element
 901          * @return array
 902          */
 903         function attributeWhitelist( $element ) {
 904                 static $list;
 905                 if( !isset( $list ) ) {
 906                         $list = Sanitizer::setupAttributeWhitelist();
 907                 }
 908                 return isset( $list[$element] )
 909                         ? $list[$element]
 910                         : array();
 911         }
 912
 913         /**
 914          * @return array
 915          */
 916         function setupAttributeWhitelist() {
 917                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 918                 $block = array_merge( $common, array( 'align' ) );
 919                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 920                 $tablecell = array( 'abbr',
 921                                     'axis',
 922                                     'headers',
 923                                     'scope',
 924                                     'rowspan',
 925                                     'colspan',
 926                                     'nowrap', # deprecated
 927                                     'width',  # deprecated
 928                                     'height', # deprecated
 929                                     'bgcolor' # deprecated
 930                                     );
 931
 932                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 933                 # See: http://www.w3.org/TR/html4/
 934                 $whitelist = array (
 935                         # 7.5.4
 936                         'div'        => $block,
 937                         'center'     => $common, # deprecated
 938                         'span'       => $block, # ??
 939
 940                         # 7.5.5
 941                         'h1'         => $block,
 942                         'h2'         => $block,
 943                         'h3'         => $block,
 944                         'h4'         => $block,
 945                         'h5'         => $block,
 946                         'h6'         => $block,
 947
 948                         # 7.5.6
 949                         # address
 950
 951                         # 8.2.4
 952                         # bdo
 953
 954                         # 9.2.1
 955                         'em'         => $common,
 956                         'strong'     => $common,
 957                         'cite'       => $common,
 958                         # dfn
 959                         'code'       => $common,
 960                         # samp
 961                         # kbd
 962                         'var'        => $common,
 963                         # abbr
 964                         # acronym
 965
 966                         # 9.2.2
 967                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 968                         # q
 969
 970                         # 9.2.3
 971                         'sub'        => $common,
 972                         'sup'        => $common,
 973
 974                         # 9.3.1
 975                         'p'          => $block,
 976
 977                         # 9.3.2
 978                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 979
 980                         # 9.3.4
 981                         'pre'        => array_merge( $common, array( 'width' ) ),
 982
 983                         # 9.4
 984                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 985                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 986
 987                         # 10.2
 988                         'ul'         => array_merge( $common, array( 'type' ) ),
 989                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 990                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 991
 992                         # 10.3
 993                         'dl'         => $common,
 994                         'dd'         => $common,
 995                         'dt'         => $common,
 996
 997                         # 11.2.1
 998                         'table'      => array_merge( $common,
 999                                                                 array( 'summary', 'width', 'border', 'frame',
1000                                                                                          'rules', 'cellspacing', 'cellpadding',
1001                                                                                          'align', 'bgcolor', 'frame', 'rules',
1002                                                                                          'border' ) ),
1003
1004                         # 11.2.2
1005                         'caption'    => array_merge( $common, array( 'align' ) ),
1006
1007                         # 11.2.3
1008                         'thead'      => array_merge( $common, $tablealign ),
1009                         'tfoot'      => array_merge( $common, $tablealign ),
1010                         'tbody'      => array_merge( $common, $tablealign ),
1011
1012                         # 11.2.4
1013                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1014                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1015
1016                         # 11.2.5
1017                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1018
1019                         # 11.2.6
1020                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1021                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1022
1023                         # 15.2.1
1024                         'tt'         => $common,
1025                         'b'          => $common,
1026                         'i'          => $common,
1027                         'big'        => $common,
1028                         'small'      => $common,
1029                         'strike'     => $common,
1030                         's'          => $common,
1031                         'u'          => $common,
1032
1033                         # 15.2.2
1034                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1035                         # basefont
1036
1037                         # 15.3
1038                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1039
1040                         # XHTML Ruby annotation text module, simple ruby only.
1041                         # http://www.w3c.org/TR/ruby/
1042                         'ruby'       => $common,
1043                         # rbc
1044                         # rtc
1045                         'rb'         => $common,
1046                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1047                         'rp'         => $common,
1048                         );
1049                 return $whitelist;
1050         }
1051
1052         /**
1053          * Take a fragment of (potentially invalid) HTML and return
1054          * a version with any tags removed, encoded suitably for literal
1055          * inclusion in an attribute value.
1056          *
1057          * @param string $text HTML fragment
1058          * @return string
1059          */
1060         function stripAllTags( $text ) {
1061                 # Actual <tags>
1062                 $text = preg_replace( '/ < .*? > /x', '', $text );
1063
1064                 # Normalize &entities and whitespace
1065                 $text = Sanitizer::normalizeAttributeValue( $text );
1066
1067                 # Will be placed into "double-quoted" attributes,
1068                 # make sure remaining bits are safe.
1069                 $text = str_replace(
1070                         array('<', '>', '"'),
1071                         array('&lt;', '&gt;', '&quot;'),
1072                         $text );
1073
1074                 return $text;
1075         }
1076
1077         /**
1078          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1079          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1080          * PHP 5.1 doesn't.
1081          *
1082          * Use for passing XHTML fragments to PHP's XML parsing functions
1083          *
1084          * @return string
1085          * @static
1086          */
1087         function hackDocType() {
1088                 global $wgHtmlEntities;
1089                 $out = "<!DOCTYPE html [\n";
1090                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1091                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1092                 }
1093                 $out .= "]>\n";
1094                 return $out;
1095         }
1096
1097 }
1098
1099 ?>