includes/Sanitizer.php

   1 <?php
   2
   3 /**
   4  * (X)HTML sanitizer for MediaWiki
   5  *
   6  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @package MediaWiki
  25  * @subpackage Parser
  26  */
  27
  28 class Sanitizer {
  29         /**
  30          * Cleans up HTML, removes dangerous tags and attributes, and
  31          * removes HTML comments
  32          * @access private
  33          * @param string $text
  34          * @return string
  35          */
  function removeHTMLtags( $text ) {
      # Overview: strip comments, split the text on "<", then decide for each
      # piece whether it starts with an allowed tag. Allowed tags are re-emitted
      # with sanitized attributes; everything else has its "<" and ">" escaped.
      global $wgUseTidy, $wgUserHtml;
      # NOTE(review): the profiling label names "Parser" although this method
      # lives in Sanitizer. Left untouched here; confirm before renaming.
      $fname = 'Parser::removeHTMLtags';
      wfProfileIn( $fname );

      if( $wgUserHtml ) {
          $htmlpairs = array( # Tags that must be closed
              'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
              'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
              'strike', 'strong', 'tt', 'var', 'div', 'center',
              'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
              'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
          );
          $htmlsingle = array(
              'br', 'hr', 'li', 'dt', 'dd'
          );
          $htmlnest = array( # Tags that can be nested--??
              'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
              'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
          );
          $tabletags = array( # Can only appear inside table
              'td', 'th', 'tr'
          );
      } else {
          # User HTML disabled: no tag is allowed, so every "<" gets escaped.
          $htmlpairs = array();
          $htmlsingle = array();
          $htmlnest = array();
          $tabletags = array();
      }

      # Table-structure tags are handled like unpaired tags: they are never
      # pushed onto the tag stack.
      $htmlsingle = array_merge( $tabletags, $htmlsingle );
      $htmlelements = array_merge( $htmlsingle, $htmlpairs );

      # Remove HTML comments
      $text = Sanitizer::removeHTMLcomments( $text );

      # Shared by both code paths. Groups: 1 = optional "/", 2 = tag name,
      # 3 = attribute soup, 4 = closing ">", 5 = text up to the next "<".
      $tagPattern = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

      $bits = explode( '<', $text );
      $text = array_shift( $bits );
      if(!$wgUseTidy) {
          $tagstack = array(); $tablestack = array();
          foreach ( $bits as $x ) {
              # Test the match explicitly instead of destructuring an empty
              # result and hiding the resulting notices via error_reporting().
              if ( !preg_match( $tagPattern, $x, $regs ) ) {
                  # Not shaped like a tag (e.g. "a < b"): keep it as text.
                  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
                  continue;
              }
              $slash = $regs[1];
              $t = strtolower( $regs[2] );
              $params = $regs[3];
              $brace = $regs[4];
              $rest = $regs[5];

              $badtag = 0 ;
              if ( in_array( $t, $htmlelements ) ) {
                  # Check our stack
                  if ( $slash ) {
                      # Closing a tag...
                      if ( ! in_array( $t, $htmlsingle ) &&
                          ( $ot = array_pop( $tagstack ) ) != $t ) {
                          # Does not close the innermost open tag: put back
                          # what was popped (nothing, if the stack was empty)
                          # and escape this tag.
                          if ( !is_null( $ot ) ) {
                              array_push( $tagstack, $ot );
                          }
                          $badtag = 1;
                      } else {
                          if ( $t == 'table' ) {
                              # Leaving a table: resume the stack saved when
                              # the table was opened.
                              $tagstack = array_pop( $tablestack );
                          }
                          $newparams = '';
                      }
                  } else {
                      # Keep track for later
                      if ( in_array( $t, $tabletags ) &&
                          ! in_array( 'table', $tagstack ) ) {
                          $badtag = 1;
                      } else if ( in_array( $t, $tagstack ) &&
                          ! in_array ( $t , $htmlnest ) ) {
                          $badtag = 1 ;
                      } else if ( ! in_array( $t, $htmlsingle ) ) {
                          if ( $t == 'table' ) {
                              # Each table gets a fresh stack; the outer one
                              # is parked on $tablestack.
                              array_push( $tablestack, $tagstack );
                              $tagstack = array();
                          }
                          array_push( $tagstack, $t );
                      }
                      # Strip non-approved attributes from the tag
                      $newparams = Sanitizer::fixTagAttributes( $params, $t );
                  }
                  if ( ! $badtag ) {
                      $rest = str_replace( '>', '&gt;', $rest );
                      $text .= "<$slash$t$newparams$brace$rest";
                      continue;
                  }
              }
              $text .= '&lt;' . str_replace( '>', '&gt;', $x);
          }
          # Close off any remaining tags
          while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
              $text .= "</$t>\n";
              if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
          }
      } else {
          # this might be possible using tidy itself
          # No nesting checks in this branch: only the tag and attribute
          # whitelists are applied.
          foreach ( $bits as $x ) {
              if ( preg_match( $tagPattern, $x, $regs ) &&
                  in_array( $t = strtolower( $regs[2] ), $htmlelements ) ) {
                  $slash = $regs[1];
                  $brace = $regs[4];
                  $newparams = Sanitizer::fixTagAttributes( $regs[3], $t );
                  $rest = str_replace( '>', '&gt;', $regs[5] );
                  $text .= "<$slash$t$newparams$brace$rest";
              } else {
                  $text .= '&lt;' . str_replace( '>', '&gt;', $x);
              }
          }
      }
      wfProfileOut( $fname );
      return $text;
  }
 147
 148         /**
 149          * Remove '<!--', '-->', and everything between.
 150          * To avoid leaving blank lines, when a comment is both preceded
 151          * and followed by a newline (ignoring spaces), trim leading and
 152          * trailing spaces and one of the newlines.
 153          *
 154          * @access private
 155          * @param string $text
 156          * @return string
 157          */
 function removeHTMLcomments( $text ) {
     $fname='Parser::removeHTMLcomments';
     wfProfileIn( $fname );
     for (;;) {
         # Always look for the first comment opener in the current text.
         $open = strpos( $text, '<!--' );
         if ( $open === false ) {
             break;
         }
         $close = strpos( $text, '-->', $open + 4 );
         if ( $close === false ) {
             # Opener without a terminator: leave the rest untouched.
             break;
         }
         # Index of the first character after "-->".
         $close += 3;

         # Walk left over spaces, starting at the character just before the
         # comment (or at offset 0 when the comment opens the text).
         $left = max( $open - 1, 0 );
         while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
             $left--;
         }
         # Walk right over spaces that follow the comment.
         $right = $close;
         while ( substr( $text, $right, 1 ) === ' ' ) {
             $right++;
         }

         $lineBefore = ( substr( $text, $left, 1 ) === "\n" );
         if ( $lineBefore && substr( $text, $right, 1 ) === "\n" ) {
             # The comment sits alone on its line: drop it together with the
             # surrounding spaces and both newlines, leaving one newline.
             $text = substr_replace( $text, "\n", $left, $right - $left + 1 );
         } else {
             # Otherwise cut out only the comment itself.
             $text = substr_replace( $text, '', $open, $close - $open );
         }
     }
     wfProfileOut( $fname );
     return $text;
 }
 193
 194         /**
 195          * Take a tag soup fragment listing an HTML element's attributes
 196          * and normalize it to well-formed XML, discarding unwanted attributes.
 197          *
 198          * - Normalizes attribute names to lowercase
 199          * - Discards attributes not on a whitelist for the given element
 200          * - Turns broken or invalid entities into plaintext
 201          * - Double-quotes all attribute values
 202          * - Attributes without values are given the name as attribute
 203          * - Double attributes are discarded
 204          * - Unsafe style attributes are discarded
 205          * - Prepends space if there are attributes.
 206          *
 207          * @param string $text
 208          * @param string $element
 209          * @return string
 210          *
 211          * @todo Check for legal values where the DTD limits things.
 212          * @todo Check for unique id attribute :P
 213          */
 function fixTagAttributes( $text, $element ) {
     # Nothing but whitespace: the tag has no attributes.
     if( trim( $text ) == '' ) {
         return '';
     }

     $attrib = '[A-Za-z0-9]'; #FIXME
     $space = '[\x09\x0a\x0d\x20]';
     # Capture groups per match:
     #   1 = attribute name, 2 = the whole "= value" part (absent if valueless),
     #   3 = double-quoted value, 4 = single-quoted value,
     #   5 = bare value, 6 = bare #hex colour.
     if( !preg_match_all(
         "/(?:^|$space)($attrib+)
           ($space*=$space*
             (?:
              # The attribute value: quoted or alone
               \"([^<\"]*)\"
              | '([^<']*)'
              |  ([a-zA-Z0-9._:-]+)
              |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                  # colors are specified like this.
                                  # We'll be normalizing it.
             )
            )?(?=$space|\$)/sx",
         $text,
         $pairs,
         PREG_SET_ORDER ) ) {
         return '';
     }

     $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
     $attribs = array();
     foreach( $pairs as $set ) {
         $attribute = strtolower( $set[1] );
         if( !isset( $whitelist[$attribute] ) ) {
             continue;
         }

         # Groups that did not take part in the match may be missing from
         # $set or present as '', so test with isset() before comparing.
         if( !isset( $set[2] ) || $set[2] === '' ) {
             # In XHTML, attributes must have a value.
             $value = $set[1];
         } elseif( isset( $set[6] ) && $set[6] !== '' ) {
             # Illegal #XXXXXX color with no quotes.
             $value = Sanitizer::normalizeAttributeValue( $set[6] );
         } elseif( isset( $set[5] ) && $set[5] !== '' ) {
             # No quotes.
             $value = Sanitizer::normalizeAttributeValue( $set[5] );
         } elseif( isset( $set[4] ) && $set[4] !== '' ) {
             # Single-quoted; the output is double-quoted, so escape any
             # double quotes the value contains.
             $value = str_replace( '"', '&quot;',
                 Sanitizer::normalizeAttributeValue( $set[4] ) );
         } elseif( isset( $set[3] ) && $set[3] !== '' ) {
             # Double-quoted
             $value = Sanitizer::normalizeAttributeValue( $set[3] );
         } else {
             # An "=" was present but every capture is empty: the value was
             # an empty quoted string such as title="" or alt=''.
             $value = '';
         }

         # Strip javascript "expression" from stylesheets.
         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
         # NOTE(review): this is a pattern blacklist, not a CSS parser;
         # treat it as best-effort filtering.
         if( $attribute == 'style' && preg_match(
             '/(expression|tps*:\/\/|url\\s*\().*/is',
                 wfMungeToUtf8( $value ) ) ) {
             # haxx0r
             continue;
         }

         # First occurrence wins; repeated attributes are dropped.
         if( !isset( $attribs[$attribute] ) ) {
             $attribs[$attribute] = "$attribute=\"$value\"";
         }
     }
     if( empty( $attribs ) ) {
         return '';
     } else {
         # Leading space separates the attributes from the tag name.
         return ' ' . implode( ' ', $attribs );
     }
 }
 286
 287         /**
 288          * Normalize whitespace and character references in an XML source-
 289          * encoded text for an attribute value.
 290          *
 291          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 292          * but note that we're not returning the value, but are returning
 293          * XML source fragments that will be slapped into output.
 294          *
 295          * @param string $text
 296          * @return string
 297          * @access private
 298          */
 function normalizeAttributeValue( $text ) {
     # Entities and character references are cleaned up first...
     $cleaned = Sanitizer::normalizeCharReferences( $text );
     # ...then each CRLF pair, or single space/CR/LF/tab, becomes one space.
     $whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
     return preg_replace( $whitespace, ' ', $cleaned );
 }
 305
 306         /**
 307          * Ensure that any entities and character references are legal
 308          * for XML and XHTML specifically. Any stray bits will be
 309          * &amp;-escaped to result in a valid text fragment.
 310          *
 311          * a. any named char refs must be known in XHTML
 312          * b. any numeric char refs must be legal chars, not invalid or forbidden
 313          * c. use &#x, not &#X
 314          * d. fix or reject non-valid attributes
 315          *
 316          * @param string $text
 317          * @return string
 318          * @access private
 319          */
 function normalizeCharReferences( $text ) {
     # Alternatives, in order: named entity, decimal reference, lowercase-x
     # hex reference, uppercase-X hex reference, and finally any remaining
     # bare "&". The callback reads the groups by these positions.
     #
     # The hex alternatives accept hex digits only. Allowing any letter would
     # hand input such as "&#x4g1;" to hexdec(), which skips the invalid
     # character and turns it into a reference for a different code point.
     return preg_replace_callback(
         '/&([A-Za-z0-9]+);
          |&\#([0-9]+);
          |&\#x([0-9A-Fa-f]+);
          |&\#X([0-9A-Fa-f]+);
          |(&)/x',
         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
         $text );
 }
 330         /**
 331          * @param array $matches Match groups passed in by preg_replace_callback()
 332          * @return string
 333          */
 function normalizeCharReferencesCallback( $matches ) {
     # $matches holds the groups of one match: 1 = entity name, 2 = decimal
     # digits, 3 / 4 = hex digits (after "x" / "X"). A bare "&" leaves all
     # four empty.
     if( $matches[1] != '' ) {
         $fixed = Sanitizer::normalizeEntity( $matches[1] );
     } elseif( $matches[2] != '' ) {
         $fixed = Sanitizer::decCharReference( $matches[2] );
     } else {
         $hexDigits = ( $matches[3] != '' ) ? $matches[3] : $matches[4];
         $fixed = ( $hexDigits != '' )
             ? Sanitizer::hexCharReference( $hexDigits )
             : null;
     }

     # No usable replacement: escape the matched source text instead.
     return is_null( $fixed ) ? htmlspecialchars( $matches[0] ) : $fixed;
 }
 351
 352         /**
 353          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 354          * return the named entity reference as is. Otherwise, returns
 355          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 356          *
 357          * @param string $name
 358          * @return string
 359          */
 function normalizeEntity( $name ) {
     # Set of accepted entity names, built on the first call and kept in a
     # static. Names follow the HTML 4.01 entity list:
     # http://www.w3.org/TR/html4/sgml/entities.html
     # Lookups are case-sensitive ("Aacute" and "aacute" are both listed).
     static $known = null;
     if( is_null( $known ) ) {
         $known = array_flip( array(
             'aacute', 'Aacute', 'acirc', 'Acirc', 'acute', 'aelig', 'AElig',
             'agrave', 'Agrave', 'alefsym', 'alpha', 'Alpha', 'amp', 'and',
             'ang', 'apos', 'aring', 'Aring', 'asymp', 'atilde', 'Atilde',
             'auml', 'Auml',
             'bdquo', 'beta', 'Beta', 'brvbar', 'bull',
             'cap', 'ccedil', 'Ccedil', 'cedil', 'cent', 'chi', 'Chi', 'circ',
             'clubs', 'cong', 'copy', 'crarr', 'cup', 'curren',
             'dagger', 'Dagger', 'darr', 'dArr', 'deg', 'delta', 'Delta',
             'diams', 'divide',
             'eacute', 'Eacute', 'ecirc', 'Ecirc', 'egrave', 'Egrave', 'empty',
             'emsp', 'ensp', 'epsilon', 'Epsilon', 'equiv', 'eta', 'Eta',
             'eth', 'ETH', 'euml', 'Euml', 'euro', 'exist',
             'fnof', 'forall', 'frac12', 'frac14', 'frac34', 'frasl',
             'gamma', 'Gamma', 'ge', 'gt',
             'harr', 'hArr', 'hearts', 'hellip',
             'iacute', 'Iacute', 'icirc', 'Icirc', 'iexcl', 'igrave', 'Igrave',
             'image', 'infin', 'int', 'iota', 'Iota', 'iquest', 'isin',
             'iuml', 'Iuml',
             'kappa', 'Kappa',
             'lambda', 'Lambda', 'lang', 'laquo', 'larr', 'lArr', 'lceil',
             'ldquo', 'le', 'lfloor', 'lowast', 'loz', 'lrm', 'lsaquo',
             'lsquo', 'lt',
             'macr', 'mdash', 'micro', 'middot', 'minus', 'mu', 'Mu',
             'nabla', 'nbsp', 'ndash', 'ne', 'ni', 'not', 'notin', 'nsub',
             'ntilde', 'Ntilde', 'nu', 'Nu',
             'oacute', 'Oacute', 'ocirc', 'Ocirc', 'oelig', 'OElig', 'ograve',
             'Ograve', 'oline', 'omega', 'Omega', 'omicron', 'Omicron',
             'oplus', 'or', 'ordf', 'ordm', 'oslash', 'Oslash', 'otilde',
             'Otilde', 'otimes', 'ouml', 'Ouml',
             'para', 'part', 'permil', 'perp', 'phi', 'Phi', 'pi', 'Pi', 'piv',
             'plusmn', 'pound', 'prime', 'Prime', 'prod', 'prop', 'psi', 'Psi',
             'quot',
             'radic', 'rang', 'raquo', 'rarr', 'rArr', 'rceil', 'rdquo',
             'real', 'reg', 'rfloor', 'rho', 'Rho', 'rlm', 'rsaquo', 'rsquo',
             'sbquo', 'scaron', 'Scaron', 'sdot', 'sect', 'shy', 'sigma',
             'Sigma', 'sigmaf', 'sim', 'spades', 'sub', 'sube', 'sum', 'sup',
             'sup1', 'sup2', 'sup3', 'supe', 'szlig',
             'tau', 'Tau', 'there4', 'theta', 'Theta', 'thetasym', 'thinsp',
             'thorn', 'THORN', 'tilde', 'times', 'trade',
             'uacute', 'Uacute', 'uarr', 'uArr', 'ucirc', 'Ucirc', 'ugrave',
             'Ugrave', 'uml', 'upsih', 'upsilon', 'Upsilon', 'uuml', 'Uuml',
             'weierp',
             'xi', 'Xi',
             'yacute', 'Yacute', 'yen', 'yuml', 'Yuml',
             'zeta', 'Zeta', 'zwj', 'zwnj'
         ) );
     }

     # Unknown names have their ampersand escaped so they render as text.
     if( !isset( $known[$name] ) ) {
         return "&amp;$name;";
     }
     return "&$name;";
 }
 623
 /**
  * Normalize a decimal numeric character reference.
  *
  * @param string $codepoint Decimal digits of the reference
  * @return string "&#N;" for a code point accepted by validateCodepoint(),
  *                or null otherwise
  */
 function decCharReference( $codepoint ) {
     $point = intval( $codepoint );
     if( !Sanitizer::validateCodepoint( $point ) ) {
         return null;
     }
     # Re-emitting through %d drops leading zeros from the digits.
     return sprintf( '&#%d;', $point );
 }
 632
 /**
  * Normalize a hexadecimal numeric character reference.
  *
  * @param string $codepoint Hex digits of the reference
  * @return string "&#x...;" with lowercase "x" and digits for a code point
  *                accepted by validateCodepoint(); null if the digits are
  *                malformed or the code point is rejected
  */
 function hexCharReference( $codepoint ) {
     # hexdec() skips characters that are not hex digits, so "4g1" would be
     # read as 0x41 and come back as a reference to a different character.
     # Refuse such input instead.
     if( !preg_match( '/^[0-9A-Fa-f]+$/D', $codepoint ) ) {
         return null;
     }
     $point = hexdec( $codepoint );
     if( Sanitizer::validateCodepoint( $point ) ) {
         return sprintf( '&#x%x;', $point );
     } else {
         return null;
     }
 }
 641
 642         /**
 643          * Returns true if a given Unicode codepoint is a valid character in XML.
 644          * @param int $codepoint
 645          * @return bool
 646          */
 function validateCodepoint( $codepoint ) {
     # Tab, line feed and carriage return are the only accepted values
     # below 0x20.
     if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
         return true;
     }
     # 0x20 up to 0xd7ff.
     if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
         return true;
     }
     # 0xe000 up to 0xfffd; 0xd800-0xdfff and 0xfffe-0xffff are excluded.
     if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
         return true;
     }
     # 0x10000 up to 0x10ffff.
     return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
 }
 655
 656         /**
 657          * Fetch the whitelist of acceptable attributes for a given
 658          * element name.
 659          *
 660          * @param string $element
 661          * @return array
 662          */
 663         function attributeWhitelist( $element ) {
 664                 $list = Sanitizer::setupAttributeWhitelist();
 665                 return isset( $list[$element] )
 666                         ? $list[$element]
 667                         : array();
 668         }
 669
 670         /**
 671          * @return array
 672          */
 673         function setupAttributeWhitelist() {
 674                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 675                 $block = array_merge( $common, array( 'align' ) );
 676                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 677                 $tablecell = array( 'abbr',
 678                                     'axis',
 679                                     'headers',
 680                                     'scope',
 681                                     'rowspan',
 682                                     'colspan',
 683                                     'nowrap', # deprecated
 684                                     'width', # deprecated
 685                                     'height' # deprecated
 686                                     );
 687
 688                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 689                 # See: http://www.w3.org/TR/html4/
 690                 $whitelist = array (
 691                         # 7.5.4
 692                         'div'        => $block,
 693                         'center'     => $common, # deprecated
 694                         'span'       => $block, # ??
 695
 696                         # 7.5.5
 697                         'h1'         => $block,
 698                         'h2'         => $block,
 699                         'h3'         => $block,
 700                         'h4'         => $block,
 701                         'h5'         => $block,
 702                         'h6'         => $block,
 703
 704                         # 7.5.6
 705                         # address
 706
 707                         # 8.2.4
 708                         # bdo
 709
 710                         # 9.2.1
 711                         'em'         => $common,
 712                         'strong'     => $common,
 713                         'cite'       => $common,
 714                         # dfn
 715                         'code'       => $common,
 716                         # samp
 717                         # kbd
 718                         'var'        => $common,
 719                         # abbr
 720                         # acronym
 721
 722                         # 9.2.2
 723                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 724                         # q
 725
 726                         # 9.2.3
 727                         'sub'        => $common,
 728                         'sup'        => $common,
 729
 730                         # 9.3.1
 731                         'p'          => $block,
 732
 733                         # 9.3.2
 734                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 735
 736                         # 9.3.4
 737                         'pre'        => array_merge( $common, array( 'width' ) ),
 738
 739                         # 9.4
 740                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 741                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 742
 743                         # 10.2
 744                         'ul'         => array_merge( $common, array( 'type' ) ),
 745                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 746                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 747
 748                         # 10.3
 749                         'dl'         => $common,
 750                         'dd'         => $common,
 751                         'dt'         => $common,
 752
 753                         # 11.2.1
 754                         'table'      => array_merge( $common,
 755                                                                 array( 'summary', 'width', 'border', 'frame',
 756                                                                                          'rules', 'cellspacing', 'cellpadding',
 757                                                                                          'align', 'bgcolor', 'frame', 'rules',
 758                                                                                          'border' ) ),
 759
 760                         # 11.2.2
 761                         'caption'    => array_merge( $common, array( 'align' ) ),
 762
 763                         # 11.2.3
 764                         'thead'      => array_merge( $common, $tablealign ),
 765                         'tfoot'      => array_merge( $common, $tablealign ),
 766                         'tbody'      => array_merge( $common, $tablealign ),
 767
 768                         # 11.2.4
 769                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 770                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 771
 772                         # 11.2.5
 773                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 774
 775                         # 11.2.6
 776                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 777                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 778
 779                         # 15.2.1
 780                         'tt'         => $common,
 781                         'b'          => $common,
 782                         'i'          => $common,
 783                         'big'        => $common,
 784                         'small'      => $common,
 785                         'strike'     => $common,
 786                         's'          => $common,
 787                         'u'          => $common,
 788
 789                         # 15.2.2
 790                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 791                         # basefont
 792
 793                         # 15.3
 794                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 795
 796                         # XHTML Ruby annotation text module, simple ruby only.
 797                         # http://www.w3c.org/TR/ruby/
 798                         'ruby'       => $common,
 799                         # rbc
 800                         # rtc
 801                         'rb'         => $common,
 802                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 803                         'rp'         => $common,
 804                         );
 805                 return $whitelist;
 806         }
 807
 808         /**
 809          * Take a fragment of (potentially invalid) HTML and return
 810          * a version with any tags removed, encoded suitably for literal
 811          * inclusion in an attribute value.
 812          *
 813          * @param string $text HTML fragment
 814          * @return string
 815          */
 816         function stripAllTags( $text ) {
 817                 # Actual <tags>
 818                 $text = preg_replace( '/<[^>]*>/', '', $text );
 819
 820                 # Normalize &entities and whitespace
 821                 $text = Sanitizer::normalizeAttributeValue( $text );
 822
 823                 # Will be placed into "double-quoted" attributes,
 824                 # make sure remaining bits are safe.
 825                 $text = str_replace(
 826                         array('<', '>', '"'),
 827                         array('&lt;', '&gt;', '&quot;'),
 828                         $text );
 829
 830                 return $text;
 831         }
 832
 833 }
 834
 835 ?>