includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
   31 define( 'MW_CHAR_REFS_REGEX',
      // One capture group per alternative, tried in this order:
      //   1 = named entity (letters/digits between '&' and ';')
      //   2 = decimal numeric reference
      //   3 = hex numeric reference introduced by a lowercase 'x'
      //   4 = hex numeric reference introduced by an uppercase 'X'
      //   5 = any other ampersand
      // The /x flag makes PCRE ignore the literal whitespace and newlines in
      // the pattern, which is why '#' has to be written as '\#'.
      // NOTE(review): the hex alternatives accept any ASCII letter, not only
      // A-F; presumably the consumers validate the digits - confirm.
   32         '/&([A-Za-z0-9]+);
   33          |&\#([0-9]+);
   34          |&\#x([0-9A-Za-z]+);
   35          |&\#X([0-9A-Za-z]+);
   36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
   43 $attrib = '[A-Za-z0-9]';
      // ^ character class for one attribute-name character: ASCII letters and digits.
   44 $space = '[\x09\x0a\x0d\x20]';
      // ^ character class for one whitespace character: tab, LF, CR or space.
      // Both fragments are interpolated into the double-quoted pattern below.
   45 define( 'MW_ATTRIBS_REGEX',
      // Capture groups (Sanitizer::getTagAttributeCallback indexes into these):
      //   1 = attribute name
      //   2 = the whole optional "= value" part, including surrounding whitespace
      //   3 = double-quoted value (without the quotes)
      //   4 = single-quoted value (without the quotes)
      //   5 = unquoted value
      //   6 = unquoted value of the form #hex
      // An attribute must be preceded by start-of-string or whitespace and be
      // followed by whitespace or end-of-string (the trailing lookahead).
      // NOTE(review): the character class of group 5 already contains '#', so
      // group 6 looks unreachable for input such as "#fff" - confirm.
   46         "/(?:^|$space)($attrib+)
   47           ($space*=$space*
   48                 (?:
   49                  # The attribute value: quoted or alone
   50                   \"([^<\"]*)\"
   51                  | '([^<']*)'
   52                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
   53                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
   54                                                          # colors are specified like this.
   55                                                          # We'll be normalizing it.
   56                 )
   57            )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @private
  63  */
   64 global $wgHtmlEntities;
      // Lookup table: entity name (without the leading '&' and trailing ';')
      // => integer value, e.g. 'amp' => 38 and 'lt' => 60. Names that differ
      // only in letter case are separate entries ('Aacute' vs 'aacute').
      // NOTE(review): the file-scope "global" above presumably ensures the
      // table lands in the global scope even when this file is included from
      // inside a function - confirm against the include sites.
   65 $wgHtmlEntities = array(
   66         'Aacute'   => 193,
   67         'aacute'   => 225,
   68         'Acirc'    => 194,
   69         'acirc'    => 226,
   70         'acute'    => 180,
   71         'AElig'    => 198,
   72         'aelig'    => 230,
   73         'Agrave'   => 192,
   74         'agrave'   => 224,
   75         'alefsym'  => 8501,
   76         'Alpha'    => 913,
   77         'alpha'    => 945,
   78         'amp'      => 38,
   79         'and'      => 8743,
   80         'ang'      => 8736,
   81         'Aring'    => 197,
   82         'aring'    => 229,
   83         'asymp'    => 8776,
   84         'Atilde'   => 195,
   85         'atilde'   => 227,
   86         'Auml'     => 196,
   87         'auml'     => 228,
   88         'bdquo'    => 8222,
   89         'Beta'     => 914,
   90         'beta'     => 946,
   91         'brvbar'   => 166,
   92         'bull'     => 8226,
   93         'cap'      => 8745,
   94         'Ccedil'   => 199,
   95         'ccedil'   => 231,
   96         'cedil'    => 184,
   97         'cent'     => 162,
   98         'Chi'      => 935,
   99         'chi'      => 967,
  100         'circ'     => 710,
  101         'clubs'    => 9827,
  102         'cong'     => 8773,
  103         'copy'     => 169,
  104         'crarr'    => 8629,
  105         'cup'      => 8746,
  106         'curren'   => 164,
  107         'dagger'   => 8224,
  108         'Dagger'   => 8225,
  109         'darr'     => 8595,
  110         'dArr'     => 8659,
  111         'deg'      => 176,
  112         'Delta'    => 916,
  113         'delta'    => 948,
  114         'diams'    => 9830,
  115         'divide'   => 247,
  116         'Eacute'   => 201,
  117         'eacute'   => 233,
  118         'Ecirc'    => 202,
  119         'ecirc'    => 234,
  120         'Egrave'   => 200,
  121         'egrave'   => 232,
  122         'empty'    => 8709,
  123         'emsp'     => 8195,
  124         'ensp'     => 8194,
  125         'Epsilon'  => 917,
  126         'epsilon'  => 949,
  127         'equiv'    => 8801,
  128         'Eta'      => 919,
  129         'eta'      => 951,
  130         'ETH'      => 208,
  131         'eth'      => 240,
  132         'Euml'     => 203,
  133         'euml'     => 235,
  134         'euro'     => 8364,
  135         'exist'    => 8707,
  136         'fnof'     => 402,
  137         'forall'   => 8704,
  138         'frac12'   => 189,
  139         'frac14'   => 188,
  140         'frac34'   => 190,
  141         'frasl'    => 8260,
  142         'Gamma'    => 915,
  143         'gamma'    => 947,
  144         'ge'       => 8805,
  145         'gt'       => 62,
  146         'harr'     => 8596,
  147         'hArr'     => 8660,
  148         'hearts'   => 9829,
  149         'hellip'   => 8230,
  150         'Iacute'   => 205,
  151         'iacute'   => 237,
  152         'Icirc'    => 206,
  153         'icirc'    => 238,
  154         'iexcl'    => 161,
  155         'Igrave'   => 204,
  156         'igrave'   => 236,
  157         'image'    => 8465,
  158         'infin'    => 8734,
  159         'int'      => 8747,
  160         'Iota'     => 921,
  161         'iota'     => 953,
  162         'iquest'   => 191,
  163         'isin'     => 8712,
  164         'Iuml'     => 207,
  165         'iuml'     => 239,
  166         'Kappa'    => 922,
  167         'kappa'    => 954,
  168         'Lambda'   => 923,
  169         'lambda'   => 955,
  170         'lang'     => 9001,
  171         'laquo'    => 171,
  172         'larr'     => 8592,
  173         'lArr'     => 8656,
  174         'lceil'    => 8968,
  175         'ldquo'    => 8220,
  176         'le'       => 8804,
  177         'lfloor'   => 8970,
  178         'lowast'   => 8727,
  179         'loz'      => 9674,
  180         'lrm'      => 8206,
  181         'lsaquo'   => 8249,
  182         'lsquo'    => 8216,
  183         'lt'       => 60,
  184         'macr'     => 175,
  185         'mdash'    => 8212,
  186         'micro'    => 181,
  187         'middot'   => 183,
  188         'minus'    => 8722,
  189         'Mu'       => 924,
  190         'mu'       => 956,
  191         'nabla'    => 8711,
  192         'nbsp'     => 160,
  193         'ndash'    => 8211,
  194         'ne'       => 8800,
  195         'ni'       => 8715,
  196         'not'      => 172,
  197         'notin'    => 8713,
  198         'nsub'     => 8836,
  199         'Ntilde'   => 209,
  200         'ntilde'   => 241,
  201         'Nu'       => 925,
  202         'nu'       => 957,
  203         'Oacute'   => 211,
  204         'oacute'   => 243,
  205         'Ocirc'    => 212,
  206         'ocirc'    => 244,
  207         'OElig'    => 338,
  208         'oelig'    => 339,
  209         'Ograve'   => 210,
  210         'ograve'   => 242,
  211         'oline'    => 8254,
  212         'Omega'    => 937,
  213         'omega'    => 969,
  214         'Omicron'  => 927,
  215         'omicron'  => 959,
  216         'oplus'    => 8853,
  217         'or'       => 8744,
  218         'ordf'     => 170,
  219         'ordm'     => 186,
  220         'Oslash'   => 216,
  221         'oslash'   => 248,
  222         'Otilde'   => 213,
  223         'otilde'   => 245,
  224         'otimes'   => 8855,
  225         'Ouml'     => 214,
  226         'ouml'     => 246,
  227         'para'     => 182,
  228         'part'     => 8706,
  229         'permil'   => 8240,
  230         'perp'     => 8869,
  231         'Phi'      => 934,
  232         'phi'      => 966,
  233         'Pi'       => 928,
  234         'pi'       => 960,
  235         'piv'      => 982,
  236         'plusmn'   => 177,
  237         'pound'    => 163,
  238         'prime'    => 8242,
  239         'Prime'    => 8243,
  240         'prod'     => 8719,
  241         'prop'     => 8733,
  242         'Psi'      => 936,
  243         'psi'      => 968,
  244         'quot'     => 34,
  245         'radic'    => 8730,
  246         'rang'     => 9002,
  247         'raquo'    => 187,
  248         'rarr'     => 8594,
  249         'rArr'     => 8658,
  250         'rceil'    => 8969,
  251         'rdquo'    => 8221,
  252         'real'     => 8476,
  253         'reg'      => 174,
  254         'rfloor'   => 8971,
  255         'Rho'      => 929,
  256         'rho'      => 961,
  257         'rlm'      => 8207,
  258         'rsaquo'   => 8250,
  259         'rsquo'    => 8217,
  260         'sbquo'    => 8218,
  261         'Scaron'   => 352,
  262         'scaron'   => 353,
  263         'sdot'     => 8901,
  264         'sect'     => 167,
  265         'shy'      => 173,
  266         'Sigma'    => 931,
  267         'sigma'    => 963,
  268         'sigmaf'   => 962,
  269         'sim'      => 8764,
  270         'spades'   => 9824,
  271         'sub'      => 8834,
  272         'sube'     => 8838,
  273         'sum'      => 8721,
  274         'sup'      => 8835,
  275         'sup1'     => 185,
  276         'sup2'     => 178,
  277         'sup3'     => 179,
  278         'supe'     => 8839,
  279         'szlig'    => 223,
  280         'Tau'      => 932,
  281         'tau'      => 964,
  282         'there4'   => 8756,
  283         'Theta'    => 920,
  284         'theta'    => 952,
  285         'thetasym' => 977,
  286         'thinsp'   => 8201,
  287         'THORN'    => 222,
  288         'thorn'    => 254,
  289         'tilde'    => 732,
  290         'times'    => 215,
  291         'trade'    => 8482,
  292         'Uacute'   => 218,
  293         'uacute'   => 250,
  294         'uarr'     => 8593,
  295         'uArr'     => 8657,
  296         'Ucirc'    => 219,
  297         'ucirc'    => 251,
  298         'Ugrave'   => 217,
  299         'ugrave'   => 249,
  300         'uml'      => 168,
  301         'upsih'    => 978,
  302         'Upsilon'  => 933,
  303         'upsilon'  => 965,
  304         'Uuml'     => 220,
  305         'uuml'     => 252,
  306         'weierp'   => 8472,
  307         'Xi'       => 926,
  308         'xi'       => 958,
  309         'Yacute'   => 221,
  310         'yacute'   => 253,
  311         'yen'      => 165,
  312         'Yuml'     => 376,
  313         'yuml'     => 255,
  314         'Zeta'     => 918,
  315         'zeta'     => 950,
  316         'zwj'      => 8205,
  317         'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr',
 355                         );
 356                         $htmllist = array( # Tags used by list
 357                                 'ul','ol',
 358                         );
 359                         $listtags = array( # Tags that can appear in a list
 360                                 'li',
 361                         );
 362
 363                 } else {
 364                         $htmlpairs = array();
 365                         $htmlsingle = array();
 366                         $htmlnest = array();
 367                         $tabletags = array();
 368                 }
 369
 370                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
 371                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
 372
 373                 # Remove HTML comments
 374                 $text = Sanitizer::removeHTMLcomments( $text );
 375                 $bits = explode( '<', $text );
 376                 $text = array_shift( $bits );
 377                 if(!$wgUseTidy) {
 378                         $tagstack = array(); $tablestack = array();
 379                         foreach ( $bits as $x ) {
 380                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 381                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 382                                 $x, $regs );
 383                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 384                                 error_reporting( $prev );
 385
 386                                 $badtag = 0 ;
 387                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 388                                         # Check our stack
 389                                         if ( $slash ) {
 390                                                 # Closing a tag...
 391                                                 if( in_array( $t, $htmlsingleonly ) ) {
 392                                                         $badtag = 1;
 393                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 394                                                         @array_push( $tagstack, $ot );
 395                                                         # <li> can be nested in <ul> or <ol>, skip those cases:
 396                                                         if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
 397                                                                 $badtag = 1;
 398                                                         }
 399                                                 } else {
 400                                                         if ( $t == 'table' ) {
 401                                                                 $tagstack = array_pop( $tablestack );
 402                                                         }
 403                                                         $newparams = '';
 404                                                 }
 405                                         } else {
 406                                                 # Keep track for later
 407                                                 if ( in_array( $t, $tabletags ) &&
 408                                                 ! in_array( 'table', $tagstack ) ) {
 409                                                         $badtag = 1;
 410                                                 } else if ( in_array( $t, $tagstack ) &&
 411                                                 ! in_array ( $t , $htmlnest ) ) {
 412                                                         $badtag = 1 ;
 413                                                 # Is it a self closed htmlpair ? (bug 5487)
 414                                                 } else if( $brace == '/>' &&
 415                                                 in_array($t, $htmlpairs) ) {
 416                                                         $badtag = 1;
 417                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 418                                                         # Hack to force empty tag for uncloseable elements
 419                                                         $brace = '/>';
 420                                                 } else if( in_array( $t, $htmlsingle ) ) {
 421                                                         # Hack to not close $htmlsingle tags
 422                                                         $brace = NULL;
 423                                                 } else {
 424                                                         if ( $t == 'table' ) {
 425                                                                 array_push( $tablestack, $tagstack );
 426                                                                 $tagstack = array();
 427                                                         }
 428                                                         array_push( $tagstack, $t );
 429                                                 }
 430
 431                                                 # Replace any variables or template parameters with
 432                                                 # plaintext results.
 433                                                 if( is_callable( $processCallback ) ) {
 434                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 435                                                 }
 436
 437                                                 # Strip non-approved attributes from the tag
 438                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 439                                         }
 440                                         if ( ! $badtag ) {
 441                                                 $rest = str_replace( '>', '&gt;', $rest );
 442                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 443                                                 $text .= "<$slash$t$newparams$close>$rest";
 444                                                 continue;
 445                                         }
 446                                 }
 447                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 448                         }
 449                         # Close off any remaining tags
 450                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 451                                 $text .= "</$t>\n";
 452                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 453                         }
 454                 } else {
 455                         # this might be possible using tidy itself
 456                         foreach ( $bits as $x ) {
 457                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 458                                 $x, $regs );
 459                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 460                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 461                                         if( is_callable( $processCallback ) ) {
 462                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 463                                         }
 464                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 465                                         $rest = str_replace( '>', '&gt;', $rest );
 466                                         $text .= "<$slash$t$newparams$brace$rest";
 467                                 } else {
 468                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 469                                 }
 470                         }
 471                 }
 472                 wfProfileOut( $fname );
 473                 return $text;
 474         }
 475
 476         /**
 477          * Remove '<!--', '-->', and everything between.
 478          * To avoid leaving blank lines, when a comment is both preceded
 479          * and followed by a newline (ignoring spaces), trim leading and
 480          * trailing spaces and one of the newlines.
 481          *
 482          * @private
 483          * @param string $text
 484          * @return string
 485          */
 486         function removeHTMLcomments( $text ) {
 487                 $fname='Parser::removeHTMLcomments';
 488                 wfProfileIn( $fname );
 489                 while (($start = strpos($text, '<!--')) !== false) {
 490                         $end = strpos($text, '-->', $start + 4);
 491                         if ($end === false) {
 492                                 # Unterminated comment; bail out
 493                                 break;
 494                         }
 495
 496                         $end += 3;
 497
 498                         # Trim space and newline if the comment is both
 499                         # preceded and followed by a newline
 500                         $spaceStart = max($start - 1, 0);
 501                         $spaceLen = $end - $spaceStart;
 502                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 503                                 $spaceStart--;
 504                                 $spaceLen++;
 505                         }
 506                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 507                                 $spaceLen++;
 508                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 509                                 # Remove the comment, leading and trailing
 510                                 # spaces, and leave only one newline.
 511                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 512                         }
 513                         else {
 514                                 # Remove just the comment.
 515                                 $text = substr_replace($text, '', $start, $end - $start);
 516                         }
 517                 }
 518                 wfProfileOut( $fname );
 519                 return $text;
 520         }
 521
 522         /**
 523          * Take a tag soup fragment listing an HTML element's attributes
 524          * and normalize it to well-formed XML, discarding unwanted attributes.
 525          *
 526          * - Normalizes attribute names to lowercase
 527          * - Discards attributes not on a whitelist for the given element
 528          * - Turns broken or invalid entities into plaintext
 529          * - Double-quotes all attribute values
 530          * - Attributes without values are given the name as attribute
 531          * - Double attributes are discarded
 532          * - Unsafe style attributes are discarded
 533          * - Prepends space if there are attributes.
 534          *
 535          * @param string $text
 536          * @param string $element
 537          * @return string
 538          *
 539          * @todo Check for legal values where the DTD limits things.
 540          * @todo Check for unique id attribute :P
 541          */
 542         function fixTagAttributes( $text, $element ) {
 543                 if( trim( $text ) == '' ) {
 544                         return '';
 545                 }
 546
 547                 # Unquoted attribute
 548                 # Since we quote this later, this can be anything distinguishable
 549                 # from the end of the attribute
 550                 $pairs = array();
 551                 if( !preg_match_all(
 552                         MW_ATTRIBS_REGEX,
 553                         $text,
 554                         $pairs,
 555                         PREG_SET_ORDER ) ) {
 556                         return '';
 557                 }
 558
 559                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 560                 $attribs = array();
 561                 foreach( $pairs as $set ) {
 562                         $attribute = strtolower( $set[1] );
 563                         if( !isset( $whitelist[$attribute] ) ) {
 564                                 continue;
 565                         }
 566
 567                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 568                         $value = Sanitizer::normalizeAttributeValue( $raw );
 569
 570                         # Strip javascript "expression" from stylesheets.
 571                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 572                         if( $attribute == 'style' ) {
 573                                 $stripped = Sanitizer::decodeCharReferences( $value );
 574
 575                                 // Remove any comments; IE gets token splitting wrong
 576                                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 577                                 $value = htmlspecialchars( $stripped );
 578
 579                                 // ... and continue checks
 580                                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 581                                         'codepointToUtf8(hexdec("$1"))', $stripped );
 582                                 $stripped = str_replace( '\\', '', $stripped );
 583                                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 584                                                 $stripped ) ) {
 585                                         # haxx0r
 586                                         continue;
 587                                 }
 588                         }
 589
 590                         if ( $attribute === 'id' )
 591                                 $value = Sanitizer::escapeId( $value );
 592
 593                         # Templates and links may be expanded in later parsing,
 594                         # creating invalid or dangerous output. Suppress this.
 595                         $value = strtr( $value, array(
 596                                 '<'    => '&lt;',   // This should never happen,
 597                                 '>'    => '&gt;',   // we've received invalid input
 598                                 '"'    => '&quot;', // which should have been escaped.
 599                                 '{'    => '&#123;',
 600                                 '['    => '&#91;',
 601                                 "''"   => '&#39;&#39;',
 602                                 'ISBN' => '&#73;SBN',
 603                                 'RFC'  => '&#82;FC',
 604                                 'PMID' => '&#80;MID',
 605                                 '|'    => '&#124;',
 606                                 '__'   => '&#95;_',
 607                         ) );
 608
 609                         # Stupid hack
 610                         $value = preg_replace_callback(
 611                                 '/(' . wfUrlProtocols() . ')/',
 612                                 array( 'Sanitizer', 'armorLinksCallback' ),
 613                                 $value );
 614
 615                         // If this attribute was previously set, override it.
 616                         // Output should only have one attribute of each name.
 617                         $attribs[$attribute] = "$attribute=\"$value\"";
 618                 }
 619
 620                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 621         }
 622
 623         /**
 624          * Given a value escape it so that it can be used in an id attribute and
 625          * return it, this does not validate the value however (see first link)
 626          *
 627          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 628          *                                                          in the id and
 629          *                                                          name attributes
 630          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 631          *
 632          * @bug 4461
 633          *
 634          * @static
 635          *
 636          * @param string $id
 637          * @return string
 638          */
 639         function escapeId( $id ) {
 640                 static $replace = array(
 641                         '%3A' => ':',
 642                         '%' => '.'
 643                 );
 644
 645                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 646
 647                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 648         }
 649
 650         /**
 651          * Regex replace callback for armoring links against further processing.
 652          * @param array $matches
 653          * @return string
 654          * @private
 655          */
 656         function armorLinksCallback( $matches ) {
 657                 return str_replace( ':', '&#58;', $matches[1] );
 658         }
 659
 660         /**
 661          * Return an associative array of attribute names and values from
  662          * a partial tag string. Attribute names are forced to lowercase,
 663          * character references are decoded to UTF-8 text.
 664          *
 665          * @param string
 666          * @return array
 667          */
 668         function decodeTagAttributes( $text ) {
 669                 $attribs = array();
 670
 671                 if( trim( $text ) == '' ) {
 672                         return $attribs;
 673                 }
 674
 675                 $pairs = array();
 676                 if( !preg_match_all(
 677                         MW_ATTRIBS_REGEX,
 678                         $text,
 679                         $pairs,
 680                         PREG_SET_ORDER ) ) {
 681                         return $attribs;
 682                 }
 683
 684                 foreach( $pairs as $set ) {
 685                         $attribute = strtolower( $set[1] );
 686                         $value = Sanitizer::getTagAttributeCallback( $set );
 687                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 688                 }
 689                 return $attribs;
 690         }
 691
 692         /**
 693          * Pick the appropriate attribute value from a match set from the
 694          * MW_ATTRIBS_REGEX matches.
 695          *
 696          * @param array $set
 697          * @return string
 698          * @private
 699          */
 700         function getTagAttributeCallback( $set ) {
 701                 if( isset( $set[6] ) ) {
 702                         # Illegal #XXXXXX color with no quotes.
 703                         return $set[6];
 704                 } elseif( isset( $set[5] ) ) {
 705                         # No quotes.
 706                         return $set[5];
 707                 } elseif( isset( $set[4] ) ) {
 708                         # Single-quoted
 709                         return $set[4];
 710                 } elseif( isset( $set[3] ) ) {
 711                         # Double-quoted
 712                         return $set[3];
 713                 } elseif( !isset( $set[2] ) ) {
 714                         # In XHTML, attributes must have a value.
 715                         # For 'reduced' form, return explicitly the attribute name here.
 716                         return $set[1];
 717                 } else {
 718                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 719                 }
 720         }
 721
 722         /**
 723          * Normalize whitespace and character references in an XML source-
 724          * encoded text for an attribute value.
 725          *
 726          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 727          * but note that we're not returning the value, but are returning
 728          * XML source fragments that will be slapped into output.
 729          *
 730          * @param string $text
 731          * @return string
 732          * @private
 733          */
 734         function normalizeAttributeValue( $text ) {
 735                 return str_replace( '"', '&quot;',
 736                         preg_replace(
 737                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 738                                 ' ',
 739                                 Sanitizer::normalizeCharReferences( $text ) ) );
 740         }
 741
 742         /**
 743          * Ensure that any entities and character references are legal
 744          * for XML and XHTML specifically. Any stray bits will be
 745          * &amp;-escaped to result in a valid text fragment.
 746          *
 747          * a. any named char refs must be known in XHTML
 748          * b. any numeric char refs must be legal chars, not invalid or forbidden
 749          * c. use &#x, not &#X
 750          * d. fix or reject non-valid attributes
 751          *
 752          * @param string $text
 753          * @return string
 754          * @private
 755          */
 756         function normalizeCharReferences( $text ) {
 757                 return preg_replace_callback(
 758                         MW_CHAR_REFS_REGEX,
 759                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 760                         $text );
 761         }
 762         /**
 763          * @param string $matches
 764          * @return string
 765          */
 766         function normalizeCharReferencesCallback( $matches ) {
 767                 $ret = null;
 768                 if( $matches[1] != '' ) {
 769                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 770                 } elseif( $matches[2] != '' ) {
 771                         $ret = Sanitizer::decCharReference( $matches[2] );
 772                 } elseif( $matches[3] != ''  ) {
 773                         $ret = Sanitizer::hexCharReference( $matches[3] );
 774                 } elseif( $matches[4] != '' ) {
 775                         $ret = Sanitizer::hexCharReference( $matches[4] );
 776                 }
 777                 if( is_null( $ret ) ) {
 778                         return htmlspecialchars( $matches[0] );
 779                 } else {
 780                         return $ret;
 781                 }
 782         }
 783
 784         /**
 785          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 786          * return the named entity reference as is. Otherwise, returns
 787          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 788          *
 789          * @param string $name
 790          * @return string
 791          */
 792         function normalizeEntity( $name ) {
 793                 global $wgHtmlEntities;
 794                 if( isset( $wgHtmlEntities[$name] ) ) {
 795                         return "&$name;";
 796                 } else {
 797                         return "&amp;$name;";
 798                 }
 799         }
 800
 801         function decCharReference( $codepoint ) {
 802                 $point = intval( $codepoint );
 803                 if( Sanitizer::validateCodepoint( $point ) ) {
 804                         return sprintf( '&#%d;', $point );
 805                 } else {
 806                         return null;
 807                 }
 808         }
 809
 810         function hexCharReference( $codepoint ) {
 811                 $point = hexdec( $codepoint );
 812                 if( Sanitizer::validateCodepoint( $point ) ) {
 813                         return sprintf( '&#x%x;', $point );
 814                 } else {
 815                         return null;
 816                 }
 817         }
 818
 819         /**
 820          * Returns true if a given Unicode codepoint is a valid character in XML.
 821          * @param int $codepoint
 822          * @return bool
 823          */
 824         function validateCodepoint( $codepoint ) {
 825                 return ($codepoint ==    0x09)
 826                         || ($codepoint ==    0x0a)
 827                         || ($codepoint ==    0x0d)
 828                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 829                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 830                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 831         }
 832
 833         /**
 834          * Decode any character references, numeric or named entities,
 835          * in the text and return a UTF-8 string.
 836          *
 837          * @param string $text
 838          * @return string
 839          * @public
 840          */
 841         function decodeCharReferences( $text ) {
 842                 return preg_replace_callback(
 843                         MW_CHAR_REFS_REGEX,
 844                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 845                         $text );
 846         }
 847
 848         /**
 849          * @param string $matches
 850          * @return string
 851          */
 852         function decodeCharReferencesCallback( $matches ) {
 853                 if( $matches[1] != '' ) {
 854                         return Sanitizer::decodeEntity( $matches[1] );
 855                 } elseif( $matches[2] != '' ) {
 856                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 857                 } elseif( $matches[3] != ''  ) {
 858                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 859                 } elseif( $matches[4] != '' ) {
 860                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 861                 }
 862                 # Last case should be an ampersand by itself
 863                 return $matches[0];
 864         }
 865
 866         /**
 867          * Return UTF-8 string for a codepoint if that is a valid
 868          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 869          * @param int $codepoint
 870          * @return string
 871          * @private
 872          */
 873         function decodeChar( $codepoint ) {
 874                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 875                         return codepointToUtf8( $codepoint );
 876                 } else {
 877                         return UTF8_REPLACEMENT;
 878                 }
 879         }
 880
 881         /**
 882          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 883          * return the UTF-8 encoding of that character. Otherwise, returns
 884          * pseudo-entity source (eg &foo;)
 885          *
 886          * @param string $name
 887          * @return string
 888          */
 889         function decodeEntity( $name ) {
 890                 global $wgHtmlEntities;
 891                 if( isset( $wgHtmlEntities[$name] ) ) {
 892                         return codepointToUtf8( $wgHtmlEntities[$name] );
 893                 } else {
 894                         return "&$name;";
 895                 }
 896         }
 897
 898         /**
 899          * Fetch the whitelist of acceptable attributes for a given
 900          * element name.
 901          *
 902          * @param string $element
 903          * @return array
 904          */
 905         function attributeWhitelist( $element ) {
 906                 static $list;
 907                 if( !isset( $list ) ) {
 908                         $list = Sanitizer::setupAttributeWhitelist();
 909                 }
 910                 return isset( $list[$element] )
 911                         ? $list[$element]
 912                         : array();
 913         }
 914
 915         /**
 916          * @return array
 917          */
 918         function setupAttributeWhitelist() {
 919                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 920                 $block = array_merge( $common, array( 'align' ) );
 921                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 922                 $tablecell = array( 'abbr',
 923                                     'axis',
 924                                     'headers',
 925                                     'scope',
 926                                     'rowspan',
 927                                     'colspan',
 928                                     'nowrap', # deprecated
 929                                     'width',  # deprecated
 930                                     'height', # deprecated
 931                                     'bgcolor' # deprecated
 932                                     );
 933
 934                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 935                 # See: http://www.w3.org/TR/html4/
 936                 $whitelist = array (
 937                         # 7.5.4
 938                         'div'        => $block,
 939                         'center'     => $common, # deprecated
 940                         'span'       => $block, # ??
 941
 942                         # 7.5.5
 943                         'h1'         => $block,
 944                         'h2'         => $block,
 945                         'h3'         => $block,
 946                         'h4'         => $block,
 947                         'h5'         => $block,
 948                         'h6'         => $block,
 949
 950                         # 7.5.6
 951                         # address
 952
 953                         # 8.2.4
 954                         # bdo
 955
 956                         # 9.2.1
 957                         'em'         => $common,
 958                         'strong'     => $common,
 959                         'cite'       => $common,
 960                         # dfn
 961                         'code'       => $common,
 962                         # samp
 963                         # kbd
 964                         'var'        => $common,
 965                         # abbr
 966                         # acronym
 967
 968                         # 9.2.2
 969                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 970                         # q
 971
 972                         # 9.2.3
 973                         'sub'        => $common,
 974                         'sup'        => $common,
 975
 976                         # 9.3.1
 977                         'p'          => $block,
 978
 979                         # 9.3.2
 980                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 981
 982                         # 9.3.4
 983                         'pre'        => array_merge( $common, array( 'width' ) ),
 984
 985                         # 9.4
 986                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 987                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 988
 989                         # 10.2
 990                         'ul'         => array_merge( $common, array( 'type' ) ),
 991                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 992                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 993
 994                         # 10.3
 995                         'dl'         => $common,
 996                         'dd'         => $common,
 997                         'dt'         => $common,
 998
 999                         # 11.2.1
1000                         'table'      => array_merge( $common,
1001                                                                 array( 'summary', 'width', 'border', 'frame',
1002                                                                                          'rules', 'cellspacing', 'cellpadding',
1003                                                                                          'align', 'bgcolor', 'frame', 'rules',
1004                                                                                          'border' ) ),
1005
1006                         # 11.2.2
1007                         'caption'    => array_merge( $common, array( 'align' ) ),
1008
1009                         # 11.2.3
1010                         'thead'      => array_merge( $common, $tablealign ),
1011                         'tfoot'      => array_merge( $common, $tablealign ),
1012                         'tbody'      => array_merge( $common, $tablealign ),
1013
1014                         # 11.2.4
1015                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1016                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1017
1018                         # 11.2.5
1019                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1020
1021                         # 11.2.6
1022                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1023                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1024
1025                         # 15.2.1
1026                         'tt'         => $common,
1027                         'b'          => $common,
1028                         'i'          => $common,
1029                         'big'        => $common,
1030                         'small'      => $common,
1031                         'strike'     => $common,
1032                         's'          => $common,
1033                         'u'          => $common,
1034
1035                         # 15.2.2
1036                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1037                         # basefont
1038
1039                         # 15.3
1040                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1041
1042                         # XHTML Ruby annotation text module, simple ruby only.
1043                         # http://www.w3c.org/TR/ruby/
1044                         'ruby'       => $common,
1045                         # rbc
1046                         # rtc
1047                         'rb'         => $common,
1048                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1049                         'rp'         => $common,
1050                         );
1051                 return $whitelist;
1052         }
1053
1054         /**
1055          * Take a fragment of (potentially invalid) HTML and return
1056          * a version with any tags removed, encoded suitably for literal
1057          * inclusion in an attribute value.
1058          *
1059          * @param string $text HTML fragment
1060          * @return string
1061          */
1062         function stripAllTags( $text ) {
1063                 # Actual <tags>
1064                 $text = preg_replace( '/ < .*? > /x', '', $text );
1065
1066                 # Normalize &entities and whitespace
1067                 $text = Sanitizer::normalizeAttributeValue( $text );
1068
1069                 # Will be placed into "double-quoted" attributes,
1070                 # make sure remaining bits are safe.
1071                 $text = str_replace(
1072                         array('<', '>', '"'),
1073                         array('&lt;', '&gt;', '&quot;'),
1074                         $text );
1075
1076                 return $text;
1077         }
1078
1079         /**
1080          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1081          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1082          * PHP 5.1 doesn't.
1083          *
1084          * Use for passing XHTML fragments to PHP's XML parsing functions
1085          *
1086          * @return string
1087          * @static
1088          */
1089         function hackDocType() {
1090                 global $wgHtmlEntities;
1091                 $out = "<!DOCTYPE html [\n";
1092                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1093                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1094                 }
1095                 $out .= "]>\n";
1096                 return $out;
1097         }
1098
1099 }
1100
1101 ?>