includes/Sanitizer.php

   1 <?php
   2
   3 /**
   4  * (X)HTML sanitizer for MediaWiki
   5  *
   6  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @package MediaWiki
  25  */
  26
  27 class Sanitizer {
  28         /**
  29          * Cleans up HTML, removes dangerous tags and attributes, and
  30          * removes HTML comments
  31          * @access private
  32          */
  33         function removeHTMLtags( $text ) {
  34                 global $wgUseTidy, $wgUserHtml;
  35                 $fname = 'Parser::removeHTMLtags';
  36                 wfProfileIn( $fname );
  37
  38                 if( $wgUserHtml ) {
  39                         $htmlpairs = array( # Tags that must be closed
  40                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
  41                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
  42                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
  43                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
  44                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
  45                         );
  46                         $htmlsingle = array(
  47                                 'br', 'hr', 'li', 'dt', 'dd'
  48                         );
  49                         $htmlnest = array( # Tags that can be nested--??
  50                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
  51                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
  52                         );
  53                         $tabletags = array( # Can only appear inside table
  54                                 'td', 'th', 'tr'
  55                         );
  56                 } else {
  57                         $htmlpairs = array();
  58                         $htmlsingle = array();
  59                         $htmlnest = array();
  60                         $tabletags = array();
  61                 }
  62
  63                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
  64                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
  65
  66                 # Remove HTML comments
  67                 $text = Sanitizer::removeHTMLcomments( $text );
  68
  69                 $bits = explode( '<', $text );
  70                 $text = array_shift( $bits );
  71                 if(!$wgUseTidy) {
  72                         $tagstack = array(); $tablestack = array();
  73                         foreach ( $bits as $x ) {
  74                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
  75                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
  76                                 $x, $regs );
  77                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
  78                                 error_reporting( $prev );
  79
  80                                 $badtag = 0 ;
  81                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
  82                                         # Check our stack
  83                                         if ( $slash ) {
  84                                                 # Closing a tag...
  85                                                 if ( ! in_array( $t, $htmlsingle ) &&
  86                                                 ( $ot = @array_pop( $tagstack ) ) != $t ) {
  87                                                         @array_push( $tagstack, $ot );
  88                                                         $badtag = 1;
  89                                                 } else {
  90                                                         if ( $t == 'table' ) {
  91                                                                 $tagstack = array_pop( $tablestack );
  92                                                         }
  93                                                         $newparams = '';
  94                                                 }
  95                                         } else {
  96                                                 # Keep track for later
  97                                                 if ( in_array( $t, $tabletags ) &&
  98                                                 ! in_array( 'table', $tagstack ) ) {
  99                                                         $badtag = 1;
 100                                                 } else if ( in_array( $t, $tagstack ) &&
 101                                                 ! in_array ( $t , $htmlnest ) ) {
 102                                                         $badtag = 1 ;
 103                                                 } else if ( ! in_array( $t, $htmlsingle ) ) {
 104                                                         if ( $t == 'table' ) {
 105                                                                 array_push( $tablestack, $tagstack );
 106                                                                 $tagstack = array();
 107                                                         }
 108                                                         array_push( $tagstack, $t );
 109                                                 }
 110                                                 # Strip non-approved attributes from the tag
 111                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 112                                         }
 113                                         if ( ! $badtag ) {
 114                                                 $rest = str_replace( '>', '&gt;', $rest );
 115                                                 $text .= "<$slash$t$newparams$brace$rest";
 116                                                 continue;
 117                                         }
 118                                 }
 119                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 120                         }
 121                         # Close off any remaining tags
 122                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 123                                 $text .= "</$t>\n";
 124                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 125                         }
 126                 } else {
 127                         # this might be possible using tidy itself
 128                         foreach ( $bits as $x ) {
 129                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
 130                                 $x, $regs );
 131                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 132                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 133                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 134                                         $rest = str_replace( '>', '&gt;', $rest );
 135                                         $text .= "<$slash$t$newparams$brace$rest";
 136                                 } else {
 137                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 138                                 }
 139                         }
 140                 }
 141                 wfProfileOut( $fname );
 142                 return $text;
 143         }
 144
 145         /**
 146          * Remove '<!--', '-->', and everything between.
 147          * To avoid leaving blank lines, when a comment is both preceded
 148          * and followed by a newline (ignoring spaces), trim leading and
 149          * trailing spaces and one of the newlines.
 150          *
 151          * @access private
 152          */
 153         function removeHTMLcomments( $text ) {
 154                 $fname='Parser::removeHTMLcomments';
 155                 wfProfileIn( $fname );
 156                 while (($start = strpos($text, '<!--')) !== false) {
 157                         $end = strpos($text, '-->', $start + 4);
 158                         if ($end === false) {
 159                                 # Unterminated comment; bail out
 160                                 break;
 161                         }
 162
 163                         $end += 3;
 164
 165                         # Trim space and newline if the comment is both
 166                         # preceded and followed by a newline
 167                         $spaceStart = max($start - 1, 0);
 168                         $spaceLen = $end - $spaceStart;
 169                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 170                                 $spaceStart--;
 171                                 $spaceLen++;
 172                         }
 173                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 174                                 $spaceLen++;
 175                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 176                                 # Remove the comment, leading and trailing
 177                                 # spaces, and leave only one newline.
 178                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 179                         }
 180                         else {
 181                                 # Remove just the comment.
 182                                 $text = substr_replace($text, '', $start, $end - $start);
 183                         }
 184                 }
 185                 wfProfileOut( $fname );
 186                 return $text;
 187         }
 188
 189         /**
 190          * Take a tag soup fragment listing an HTML element's attributes
 191          * and normalize it to well-formed XML, discarding unwanted attributes.
 192          *
 193          * - Normalizes attribute names to lowercase
 194          * - Discards attributes not on a whitelist for the given element
 195          * - Turns broken or invalid entities into plaintext
 196          * - Double-quotes all attribute values
 197          * - Attributes without values are given the name as attribute
 198          * - Double attributes are discarded
 199          * - Unsafe style attributes are discarded
 200          * - Prepends space if there are attributes.
 201          *
 202          * @param string $text
 203          * @param string $element
 204          * @return string
 205          *
 206          * @todo Check for legal values where the DTD limits things.
 207          * @todo Check for unique id attribute :P
 208          */
 209         function fixTagAttributes( $text, $element ) {
 210                 if( trim( $text ) == '' ) {
 211                         return '';
 212                 }
 213
 214                 $attrib = '[A-Za-z0-9]'; #FIXME
 215                 $space = '[\x09\x0a\x0d\x20]';
 216                 if( !preg_match_all(
 217                         "/(?:^|$space)($attrib+)
 218                           ($space*=$space*
 219                             (?:
 220                              # The attribute value: quoted or alone
 221                               \"([^<\"]*)\"
 222                              | '([^<']*)'
 223                              |  ([a-zA-Z0-9._:-]+)
 224                              |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
 225                                                  # colors are specified like this.
 226                                                  # We'll be normalizing it.
 227                             )
 228                            )?(?=$space|\$)/sx",
 229                         $text,
 230                         $pairs,
 231                         PREG_SET_ORDER ) ) {
 232                         return '';
 233                 }
 234
 235                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 236                 $attribs = array();
 237                 foreach( $pairs as $set ) {
 238                         $attribute = strtolower( $set[1] );
 239                         if( !isset( $whitelist[$attribute] ) ) {
 240                                 continue;
 241                         }
 242                         if( $set[2] == '' ) {
 243                                 # In XHTML, attributes must have a value.
 244                                 $value = $set[1];
 245                         } elseif( $set[3] != '' ) {
 246                                 # Double-quoted
 247                                 $value = Sanitizer::normalizeAttributeValue( $set[3] );
 248                         } elseif( $set[4] != '' ) {
 249                                 # Single-quoted
 250                                 $value = str_replace( '"', '&quot;',
 251                                         Sanitizer::normalizeAttributeValue( $set[4] ) );
 252                         } elseif( $set[5] != '' ) {
 253                                 # No quotes.
 254                                 $value = Sanitizer::normalizeAttributeValue( $set[5] );
 255                         } elseif( $set[6] != '' ) {
 256                                 # Illegal #XXXXXX color with no quotes.
 257                                 $value = Sanitizer::normalizeAttributeValue( $set[6] );
 258                         } else {
 259                                 wfDebugDieBacktrace( "Tag conditions not met. Something's very odd." );
 260                         }
 261
 262                         # Strip javascript "expression" from stylesheets.
 263                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 264                         if( $attribute == 'style' && preg_match(
 265                                 '/(expression|tps*:\/\/|url\\s*\().*/is',
 266                                         wfMungeToUtf8( $value ) ) ) {
 267                                 # haxx0r
 268                                 continue;
 269                         }
 270
 271                         if( !isset( $attribs[$attribute] ) ) {
 272                                 $attribs[$attribute] = "$attribute=\"$value\"";
 273                         }
 274                 }
 275                 if( empty( $attribs ) ) {
 276                         return '';
 277                 } else {
 278                         return ' ' . implode( ' ', $attribs );
 279                 }
 280         }
 281
 282         /**
 283          * Normalize whitespace and character references in an XML source-
 284          * encoded text for an attribute value.
 285          *
 286          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 287          * but note that we're not returning the value, but are returning
 288          * XML source fragments that will be slapped into output.
 289          *
 290          * @param string $text
 291          * @return string
 292          * @access private
 293          */
 294         function normalizeAttributeValue( $text ) {
 295                 return preg_replace(
 296                         '/\r\n|[\x20\x0d\x0a\x09]/',
 297                         ' ',
 298                         Sanitizer::normalizeCharReferences( $text ) );
 299         }
 300
 301         /**
 302          * Ensure that any entities and character references are legal
 303          * for XML and XHTML specifically. Any stray bits will be
 304          * &amp;-escaped to result in a valid text fragment.
 305          *
 306          * a. any named char refs must be known in XHTML
 307          * b. any numeric char refs must be legal chars, not invalid or forbidden
 308          * c. use &#x, not &#X
 309          * d. fix or reject non-valid attributes
 310          *
 311          * @param string $text
 312          * @return string
 313          * @access private
 314          */
 315         function normalizeCharReferences( $text ) {
 316                 return preg_replace_callback(
 317                         '/&([A-Za-z0-9]+);
 318                          |&\#([0-9]+);
 319                          |&\#x([0-9A-Za-z]+);
 320                          |&\#X([0-9A-Za-z]+);
 321                          |(&)/x',
 322                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 323                         $text );
 324         }
 325
 326         function normalizeCharReferencesCallback( $matches ) {
 327                 $ret = null;
 328                 if( $matches[1] != '' ) {
 329                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 330                 } elseif( $matches[2] != '' ) {
 331                         $ret = Sanitizer::decCharReference( $matches[2] );
 332                 } elseif( $matches[3] != ''  ) {
 333                         $ret = Sanitizer::hexCharReference( $matches[3] );
 334                 } elseif( $matches[4] != '' ) {
 335                         $ret = Sanitizer::hexCharReference( $matches[4] );
 336                 }
 337                 if( is_null( $ret ) ) {
 338                         return htmlspecialchars( $matches[0] );
 339                 } else {
 340                         return $ret;
 341                 }
 342         }
 343
 344         /**
 345          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 346          * return the named entity reference as is. Otherwise, returns
 347          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 348          *
 349          * @return string
 350          */
 351         function normalizeEntity( $name ) {
 352                 # List of all named character entities defined in HTML 4.01
 353                 # http://www.w3.org/TR/html4/sgml/entities.html
 354                 static $htmlEntities = array(
 355                         'aacute' => true,
 356                         'Aacute' => true,
 357                         'acirc' => true,
 358                         'Acirc' => true,
 359                         'acute' => true,
 360                         'aelig' => true,
 361                         'AElig' => true,
 362                         'agrave' => true,
 363                         'Agrave' => true,
 364                         'alefsym' => true,
 365                         'alpha' => true,
 366                         'Alpha' => true,
 367                         'amp' => true,
 368                         'and' => true,
 369                         'ang' => true,
 370                         'apos' => true,
 371                         'aring' => true,
 372                         'Aring' => true,
 373                         'asymp' => true,
 374                         'atilde' => true,
 375                         'Atilde' => true,
 376                         'auml' => true,
 377                         'Auml' => true,
 378                         'bdquo' => true,
 379                         'beta' => true,
 380                         'Beta' => true,
 381                         'brvbar' => true,
 382                         'bull' => true,
 383                         'cap' => true,
 384                         'ccedil' => true,
 385                         'Ccedil' => true,
 386                         'cedil' => true,
 387                         'cent' => true,
 388                         'chi' => true,
 389                         'Chi' => true,
 390                         'circ' => true,
 391                         'clubs' => true,
 392                         'cong' => true,
 393                         'copy' => true,
 394                         'crarr' => true,
 395                         'cup' => true,
 396                         'curren' => true,
 397                         'dagger' => true,
 398                         'Dagger' => true,
 399                         'darr' => true,
 400                         'dArr' => true,
 401                         'deg' => true,
 402                         'delta' => true,
 403                         'Delta' => true,
 404                         'diams' => true,
 405                         'divide' => true,
 406                         'eacute' => true,
 407                         'Eacute' => true,
 408                         'ecirc' => true,
 409                         'Ecirc' => true,
 410                         'egrave' => true,
 411                         'Egrave' => true,
 412                         'empty' => true,
 413                         'emsp' => true,
 414                         'ensp' => true,
 415                         'epsilon' => true,
 416                         'Epsilon' => true,
 417                         'equiv' => true,
 418                         'eta' => true,
 419                         'Eta' => true,
 420                         'eth' => true,
 421                         'ETH' => true,
 422                         'euml' => true,
 423                         'Euml' => true,
 424                         'euro' => true,
 425                         'exist' => true,
 426                         'fnof' => true,
 427                         'forall' => true,
 428                         'frac12' => true,
 429                         'frac14' => true,
 430                         'frac34' => true,
 431                         'frasl' => true,
 432                         'gamma' => true,
 433                         'Gamma' => true,
 434                         'ge' => true,
 435                         'gt' => true,
 436                         'harr' => true,
 437                         'hArr' => true,
 438                         'hearts' => true,
 439                         'hellip' => true,
 440                         'iacute' => true,
 441                         'Iacute' => true,
 442                         'icirc' => true,
 443                         'Icirc' => true,
 444                         'iexcl' => true,
 445                         'igrave' => true,
 446                         'Igrave' => true,
 447                         'image' => true,
 448                         'infin' => true,
 449                         'int' => true,
 450                         'iota' => true,
 451                         'Iota' => true,
 452                         'iquest' => true,
 453                         'isin' => true,
 454                         'iuml' => true,
 455                         'Iuml' => true,
 456                         'kappa' => true,
 457                         'Kappa' => true,
 458                         'lambda' => true,
 459                         'Lambda' => true,
 460                         'lang' => true,
 461                         'laquo' => true,
 462                         'larr' => true,
 463                         'lArr' => true,
 464                         'lceil' => true,
 465                         'ldquo' => true,
 466                         'le' => true,
 467                         'lfloor' => true,
 468                         'lowast' => true,
 469                         'loz' => true,
 470                         'lrm' => true,
 471                         'lsaquo' => true,
 472                         'lsquo' => true,
 473                         'lt' => true,
 474                         'macr' => true,
 475                         'mdash' => true,
 476                         'micro' => true,
 477                         'middot' => true,
 478                         'minus' => true,
 479                         'mu' => true,
 480                         'Mu' => true,
 481                         'nabla' => true,
 482                         'nbsp' => true,
 483                         'ndash' => true,
 484                         'ne' => true,
 485                         'ni' => true,
 486                         'not' => true,
 487                         'notin' => true,
 488                         'nsub' => true,
 489                         'ntilde' => true,
 490                         'Ntilde' => true,
 491                         'nu' => true,
 492                         'Nu' => true,
 493                         'oacute' => true,
 494                         'Oacute' => true,
 495                         'ocirc' => true,
 496                         'Ocirc' => true,
 497                         'oelig' => true,
 498                         'OElig' => true,
 499                         'ograve' => true,
 500                         'Ograve' => true,
 501                         'oline' => true,
 502                         'omega' => true,
 503                         'Omega' => true,
 504                         'omicron' => true,
 505                         'Omicron' => true,
 506                         'oplus' => true,
 507                         'or' => true,
 508                         'ordf' => true,
 509                         'ordm' => true,
 510                         'oslash' => true,
 511                         'Oslash' => true,
 512                         'otilde' => true,
 513                         'Otilde' => true,
 514                         'otimes' => true,
 515                         'ouml' => true,
 516                         'Ouml' => true,
 517                         'para' => true,
 518                         'part' => true,
 519                         'permil' => true,
 520                         'perp' => true,
 521                         'phi' => true,
 522                         'Phi' => true,
 523                         'pi' => true,
 524                         'Pi' => true,
 525                         'piv' => true,
 526                         'plusmn' => true,
 527                         'pound' => true,
 528                         'prime' => true,
 529                         'Prime' => true,
 530                         'prod' => true,
 531                         'prop' => true,
 532                         'psi' => true,
 533                         'Psi' => true,
 534                         'quot' => true,
 535                         'radic' => true,
 536                         'rang' => true,
 537                         'raquo' => true,
 538                         'rarr' => true,
 539                         'rArr' => true,
 540                         'rceil' => true,
 541                         'rdquo' => true,
 542                         'real' => true,
 543                         'reg' => true,
 544                         'rfloor' => true,
 545                         'rho' => true,
 546                         'Rho' => true,
 547                         'rlm' => true,
 548                         'rsaquo' => true,
 549                         'rsquo' => true,
 550                         'sbquo' => true,
 551                         'scaron' => true,
 552                         'Scaron' => true,
 553                         'sdot' => true,
 554                         'sect' => true,
 555                         'shy' => true,
 556                         'sigma' => true,
 557                         'Sigma' => true,
 558                         'sigmaf' => true,
 559                         'sim' => true,
 560                         'spades' => true,
 561                         'sub' => true,
 562                         'sube' => true,
 563                         'sum' => true,
 564                         'sup' => true,
 565                         'sup1' => true,
 566                         'sup2' => true,
 567                         'sup3' => true,
 568                         'supe' => true,
 569                         'szlig' => true,
 570                         'tau' => true,
 571                         'Tau' => true,
 572                         'there4' => true,
 573                         'theta' => true,
 574                         'Theta' => true,
 575                         'thetasym' => true,
 576                         'thinsp' => true,
 577                         'thorn' => true,
 578                         'THORN' => true,
 579                         'tilde' => true,
 580                         'times' => true,
 581                         'trade' => true,
 582                         'uacute' => true,
 583                         'Uacute' => true,
 584                         'uarr' => true,
 585                         'uArr' => true,
 586                         'ucirc' => true,
 587                         'Ucirc' => true,
 588                         'ugrave' => true,
 589                         'Ugrave' => true,
 590                         'uml' => true,
 591                         'upsih' => true,
 592                         'upsilon' => true,
 593                         'Upsilon' => true,
 594                         'uuml' => true,
 595                         'Uuml' => true,
 596                         'weierp' => true,
 597                         'xi' => true,
 598                         'Xi' => true,
 599                         'yacute' => true,
 600                         'Yacute' => true,
 601                         'yen' => true,
 602                         'yuml' => true,
 603                         'Yuml' => true,
 604                         'zeta' => true,
 605                         'Zeta' => true,
 606                         'zwj' => true,
 607                         'zwnj' => true );
 608                 if( isset( $htmlEntities[$name] ) ) {
 609                         return "&$name;";
 610                 } else {
 611                         return "&amp;$name;";
 612                 }
 613         }
 614
 615         function decCharReference( $codepoint ) {
 616                 $point = IntVal( $codepoint );
 617                 if( Sanitizer::validateCodepoint( $point ) ) {
 618                         return sprintf( '&#%d;', $point );
 619                 } else {
 620                         return null;
 621                 }
 622         }
 623
 624         function hexCharReference( $codepoint ) {
 625                 $point = hexdec( $codepoint );
 626                 if( Sanitizer::validateCodepoint( $point ) ) {
 627                         return sprintf( '&#x%x;', $point );
 628                 } else {
 629                         return null;
 630                 }
 631         }
 632
 633         /**
 634          * Returns true if a given Unicode codepoint is a valid character in XML.
 635          * @param int $codepoint
 636          * @return bool
 637          */
 638         function validateCodepoint( $codepoint ) {
 639                 return ($codepoint ==    0x09)
 640                         || ($codepoint ==    0x0a)
 641                         || ($codepoint ==    0x0d)
 642                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 643                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 644                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 645         }
 646
 647         /**
 648          * Fetch the whitelist of acceptable attributes for a given
 649          * element name.
 650          *
 651          * @param string $element
 652          * @return array
 653          */
 654         function attributeWhitelist( $element ) {
 655                 $list = Sanitizer::setupAttributeWhitelist();
 656                 return isset( $list[$element] )
 657                         ? $list[$element]
 658                         : array();
 659         }
 660
 661         /**
 662          * @return array
 663          */
 664         function setupAttributeWhitelist() {
 665                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 666                 $block = array_merge( $common, array( 'align' ) );
 667                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 668                 $tablecell = array( 'abbr',
 669                                     'axis',
 670                                     'headers',
 671                                     'scope',
 672                                     'rowspan',
 673                                     'colspan',
 674                                     'nowrap', # deprecated
 675                                     'width', # deprecated
 676                                     'height' # deprecated
 677                                     );
 678
 679                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 680                 # See: http://www.w3.org/TR/html4/
 681                 $whitelist = array (
 682                         # 7.5.4
 683                         'div'        => $block,
 684                         'center'     => $common, # deprecated
 685                         'span'       => $block, # ??
 686
 687                         # 7.5.5
 688                         'h1'         => $block,
 689                         'h2'         => $block,
 690                         'h3'         => $block,
 691                         'h4'         => $block,
 692                         'h5'         => $block,
 693                         'h6'         => $block,
 694
 695                         # 7.5.6
 696                         # address
 697
 698                         # 8.2.4
 699                         # bdo
 700
 701                         # 9.2.1
 702                         'em'         => $common,
 703                         'strong'     => $common,
 704                         'cite'       => $common,
 705                         # dfn
 706                         'code'       => $common,
 707                         # samp
 708                         # kbd
 709                         'var'        => $common,
 710                         # abbr
 711                         # acronym
 712
 713                         # 9.2.2
 714                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 715                         # q
 716
 717                         # 9.2.3
 718                         'sub'        => $common,
 719                         'sup'        => $common,
 720
 721                         # 9.3.1
 722                         'p'          => $block,
 723
 724                         # 9.3.2
 725                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 726
 727                         # 9.3.4
 728                         'pre'        => array_merge( $common, array( 'width' ) ),
 729
 730                         # 9.4
 731                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 732                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 733
 734                         # 10.2
 735                         'ul'         => array_merge( $common, array( 'type' ) ),
 736                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 737                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 738
 739                         # 10.3
 740                         'dl'         => $common,
 741                         'dd'         => $common,
 742                         'dt'         => $common,
 743
 744                         # 11.2.1
 745                         'table'      => array_merge( $common,
 746                                                                 array( 'summary', 'width', 'border', 'frame',
 747                                                                                          'rules', 'cellspacing', 'cellpadding',
 748                                                                                          'align', 'bgcolor', 'frame', 'rules',
 749                                                                                          'border' ) ),
 750
 751                         # 11.2.2
 752                         'caption'    => array_merge( $common, array( 'align' ) ),
 753
 754                         # 11.2.3
 755                         'thead'      => array_merge( $common, $tablealign ),
 756                         'tfoot'      => array_merge( $common, $tablealign ),
 757                         'tbody'      => array_merge( $common, $tablealign ),
 758
 759                         # 11.2.4
 760                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 761                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 762
 763                         # 11.2.5
 764                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 765
 766                         # 11.2.6
 767                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 768                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 769
 770                         # 15.2.1
 771                         'tt'         => $common,
 772                         'b'          => $common,
 773                         'i'          => $common,
 774                         'big'        => $common,
 775                         'small'      => $common,
 776                         'strike'     => $common,
 777                         's'          => $common,
 778                         'u'          => $common,
 779
 780                         # 15.2.2
 781                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 782                         # basefont
 783
 784                         # 15.3
 785                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 786
 787                         # XHTML Ruby annotation text module, simple ruby only.
 788                         # http://www.w3c.org/TR/ruby/
 789                         'ruby'       => $common,
 790                         # rbc
 791                         # rtc
 792                         'rb'         => $common,
 793                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 794                         'rp'         => $common,
 795                         );
 796                 return $whitelist;
 797         }
 798
 799         /**
 800          * Take a fragment of (potentially invalid) HTML and return
 801          * a version with any tags removed, encoded suitably for literal
 802          * inclusion in an attribute value.
 803          *
 804          * @param string $text HTML fragment
 805          * @return string
 806          */
 807         function stripAllTags( $text ) {
 808                 # Actual <tags>
 809                 $text = preg_replace( '/<[^>]*>/', '', $text );
 810
 811                 # Normalize &entities and whitespace
 812                 $text = Sanitizer::normalizeAttributeValue( $text );
 813
 814                 # Will be placed into "double-quoted" attributes,
 815                 # make sure remaining bits are safe.
 816                 $text = str_replace(
 817                         array('<', '>', '"'),
 818                         array('&lt;', '&gt;', '&quot;'),
 819                         $text );
 820
 821                 return $text;
 822         }
 823
 824 }
 825
 826 ?>