includes/Sanitizer.php

   1 <?php
   2
   3 /**
   4  * (X)HTML sanitizer for MediaWiki
   5  *
   6  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @package MediaWiki
  25  */
  26
  27 class Sanitizer {
  28         /**
  29          * Cleans up HTML, removes dangerous tags and attributes, and
  30          * removes HTML comments
  31          * @access private
  32          * @param string $text
  33          * @return string
  34          */
  35         function removeHTMLtags( $text ) {
  36                 global $wgUseTidy, $wgUserHtml;
  37                 $fname = 'Parser::removeHTMLtags';
  38                 wfProfileIn( $fname );
  39
  40                 if( $wgUserHtml ) {
  41                         $htmlpairs = array( # Tags that must be closed
  42                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
  43                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
  44                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
  45                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
  46                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
  47                         );
  48                         $htmlsingle = array(
  49                                 'br', 'hr', 'li', 'dt', 'dd'
  50                         );
  51                         $htmlnest = array( # Tags that can be nested--??
  52                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
  53                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
  54                         );
  55                         $tabletags = array( # Can only appear inside table
  56                                 'td', 'th', 'tr'
  57                         );
  58                 } else {
  59                         $htmlpairs = array();
  60                         $htmlsingle = array();
  61                         $htmlnest = array();
  62                         $tabletags = array();
  63                 }
  64
  65                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
  66                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
  67
  68                 # Remove HTML comments
  69                 $text = Sanitizer::removeHTMLcomments( $text );
  70
  71                 $bits = explode( '<', $text );
  72                 $text = array_shift( $bits );
  73                 if(!$wgUseTidy) {
  74                         $tagstack = array(); $tablestack = array();
  75                         foreach ( $bits as $x ) {
  76                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
  77                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
  78                                 $x, $regs );
  79                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
  80                                 error_reporting( $prev );
  81
  82                                 $badtag = 0 ;
  83                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
  84                                         # Check our stack
  85                                         if ( $slash ) {
  86                                                 # Closing a tag...
  87                                                 if ( ! in_array( $t, $htmlsingle ) &&
  88                                                 ( $ot = @array_pop( $tagstack ) ) != $t ) {
  89                                                         @array_push( $tagstack, $ot );
  90                                                         $badtag = 1;
  91                                                 } else {
  92                                                         if ( $t == 'table' ) {
  93                                                                 $tagstack = array_pop( $tablestack );
  94                                                         }
  95                                                         $newparams = '';
  96                                                 }
  97                                         } else {
  98                                                 # Keep track for later
  99                                                 if ( in_array( $t, $tabletags ) &&
 100                                                 ! in_array( 'table', $tagstack ) ) {
 101                                                         $badtag = 1;
 102                                                 } else if ( in_array( $t, $tagstack ) &&
 103                                                 ! in_array ( $t , $htmlnest ) ) {
 104                                                         $badtag = 1 ;
 105                                                 } else if ( ! in_array( $t, $htmlsingle ) ) {
 106                                                         if ( $t == 'table' ) {
 107                                                                 array_push( $tablestack, $tagstack );
 108                                                                 $tagstack = array();
 109                                                         }
 110                                                         array_push( $tagstack, $t );
 111                                                 }
 112                                                 # Strip non-approved attributes from the tag
 113                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 114                                         }
 115                                         if ( ! $badtag ) {
 116                                                 $rest = str_replace( '>', '&gt;', $rest );
 117                                                 $text .= "<$slash$t$newparams$brace$rest";
 118                                                 continue;
 119                                         }
 120                                 }
 121                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 122                         }
 123                         # Close off any remaining tags
 124                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 125                                 $text .= "</$t>\n";
 126                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 127                         }
 128                 } else {
 129                         # this might be possible using tidy itself
 130                         foreach ( $bits as $x ) {
 131                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
 132                                 $x, $regs );
 133                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 134                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 135                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 136                                         $rest = str_replace( '>', '&gt;', $rest );
 137                                         $text .= "<$slash$t$newparams$brace$rest";
 138                                 } else {
 139                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 140                                 }
 141                         }
 142                 }
 143                 wfProfileOut( $fname );
 144                 return $text;
 145         }
 146
 147         /**
 148          * Remove '<!--', '-->', and everything between.
 149          * To avoid leaving blank lines, when a comment is both preceded
 150          * and followed by a newline (ignoring spaces), trim leading and
 151          * trailing spaces and one of the newlines.
 152          *
 153          * @access private
 154          * @param string $text
 155          * @return string
 156          */
 157         function removeHTMLcomments( $text ) {
 158                 $fname='Parser::removeHTMLcomments';
 159                 wfProfileIn( $fname );
 160                 while (($start = strpos($text, '<!--')) !== false) {
 161                         $end = strpos($text, '-->', $start + 4);
 162                         if ($end === false) {
 163                                 # Unterminated comment; bail out
 164                                 break;
 165                         }
 166
 167                         $end += 3;
 168
 169                         # Trim space and newline if the comment is both
 170                         # preceded and followed by a newline
 171                         $spaceStart = max($start - 1, 0);
 172                         $spaceLen = $end - $spaceStart;
 173                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 174                                 $spaceStart--;
 175                                 $spaceLen++;
 176                         }
 177                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 178                                 $spaceLen++;
 179                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 180                                 # Remove the comment, leading and trailing
 181                                 # spaces, and leave only one newline.
 182                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 183                         }
 184                         else {
 185                                 # Remove just the comment.
 186                                 $text = substr_replace($text, '', $start, $end - $start);
 187                         }
 188                 }
 189                 wfProfileOut( $fname );
 190                 return $text;
 191         }
 192
 193         /**
 194          * Take a tag soup fragment listing an HTML element's attributes
 195          * and normalize it to well-formed XML, discarding unwanted attributes.
 196          *
 197          * - Normalizes attribute names to lowercase
 198          * - Discards attributes not on a whitelist for the given element
 199          * - Turns broken or invalid entities into plaintext
 200          * - Double-quotes all attribute values
 201          * - Attributes without values are given the name as attribute
 202          * - Double attributes are discarded
 203          * - Unsafe style attributes are discarded
 204          * - Prepends space if there are attributes.
 205          *
 206          * @param string $text
 207          * @param string $element
 208          * @return string
 209          *
 210          * @todo Check for legal values where the DTD limits things.
 211          * @todo Check for unique id attribute :P
 212          */
 213         function fixTagAttributes( $text, $element ) {
 214                 if( trim( $text ) == '' ) {
 215                         return '';
 216                 }
 217
 218                 $attrib = '[A-Za-z0-9]'; #FIXME
 219                 $space = '[\x09\x0a\x0d\x20]';
 220                 if( !preg_match_all(
 221                         "/(?:^|$space)($attrib+)
 222                           ($space*=$space*
 223                             (?:
 224                              # The attribute value: quoted or alone
 225                               \"([^<\"]*)\"
 226                              | '([^<']*)'
 227                              |  ([a-zA-Z0-9._:-]+)
 228                              |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
 229                                                  # colors are specified like this.
 230                                                  # We'll be normalizing it.
 231                             )
 232                            )?(?=$space|\$)/sx",
 233                         $text,
 234                         $pairs,
 235                         PREG_SET_ORDER ) ) {
 236                         return '';
 237                 }
 238
 239                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 240                 $attribs = array();
 241                 foreach( $pairs as $set ) {
 242                         $attribute = strtolower( $set[1] );
 243                         if( !isset( $whitelist[$attribute] ) ) {
 244                                 continue;
 245                         }
 246                         if( $set[2] == '' ) {
 247                                 # In XHTML, attributes must have a value.
 248                                 $value = $set[1];
 249                         } elseif( $set[3] != '' ) {
 250                                 # Double-quoted
 251                                 $value = Sanitizer::normalizeAttributeValue( $set[3] );
 252                         } elseif( $set[4] != '' ) {
 253                                 # Single-quoted
 254                                 $value = str_replace( '"', '&quot;',
 255                                         Sanitizer::normalizeAttributeValue( $set[4] ) );
 256                         } elseif( $set[5] != '' ) {
 257                                 # No quotes.
 258                                 $value = Sanitizer::normalizeAttributeValue( $set[5] );
 259                         } elseif( $set[6] != '' ) {
 260                                 # Illegal #XXXXXX color with no quotes.
 261                                 $value = Sanitizer::normalizeAttributeValue( $set[6] );
 262                         } else {
 263                                 wfDebugDieBacktrace( "Tag conditions not met. Something's very odd." );
 264                         }
 265
 266                         # Strip javascript "expression" from stylesheets.
 267                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 268                         if( $attribute == 'style' && preg_match(
 269                                 '/(expression|tps*:\/\/|url\\s*\().*/is',
 270                                         wfMungeToUtf8( $value ) ) ) {
 271                                 # haxx0r
 272                                 continue;
 273                         }
 274
 275                         if( !isset( $attribs[$attribute] ) ) {
 276                                 $attribs[$attribute] = "$attribute=\"$value\"";
 277                         }
 278                 }
 279                 if( empty( $attribs ) ) {
 280                         return '';
 281                 } else {
 282                         return ' ' . implode( ' ', $attribs );
 283                 }
 284         }
 285
 286         /**
 287          * Normalize whitespace and character references in an XML source-
 288          * encoded text for an attribute value.
 289          *
 290          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 291          * but note that we're not returning the value, but are returning
 292          * XML source fragments that will be slapped into output.
 293          *
 294          * @param string $text
 295          * @return string
 296          * @access private
 297          */
 298         function normalizeAttributeValue( $text ) {
 299                 return preg_replace(
 300                         '/\r\n|[\x20\x0d\x0a\x09]/',
 301                         ' ',
 302                         Sanitizer::normalizeCharReferences( $text ) );
 303         }
 304
 305         /**
 306          * Ensure that any entities and character references are legal
 307          * for XML and XHTML specifically. Any stray bits will be
 308          * &amp;-escaped to result in a valid text fragment.
 309          *
 310          * a. any named char refs must be known in XHTML
 311          * b. any numeric char refs must be legal chars, not invalid or forbidden
 312          * c. use &#x, not &#X
 313          * d. fix or reject non-valid attributes
 314          *
 315          * @param string $text
 316          * @return string
 317          * @access private
 318          */
 319         function normalizeCharReferences( $text ) {
 320                 return preg_replace_callback(
 321                         '/&([A-Za-z0-9]+);
 322                          |&\#([0-9]+);
 323                          |&\#x([0-9A-Za-z]+);
 324                          |&\#X([0-9A-Za-z]+);
 325                          |(&)/x',
 326                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 327                         $text );
 328         }
 329         /**
 330          * @param string $matches
 331          * @return string
 332          */
 333         function normalizeCharReferencesCallback( $matches ) {
 334                 $ret = null;
 335                 if( $matches[1] != '' ) {
 336                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 337                 } elseif( $matches[2] != '' ) {
 338                         $ret = Sanitizer::decCharReference( $matches[2] );
 339                 } elseif( $matches[3] != ''  ) {
 340                         $ret = Sanitizer::hexCharReference( $matches[3] );
 341                 } elseif( $matches[4] != '' ) {
 342                         $ret = Sanitizer::hexCharReference( $matches[4] );
 343                 }
 344                 if( is_null( $ret ) ) {
 345                         return htmlspecialchars( $matches[0] );
 346                 } else {
 347                         return $ret;
 348                 }
 349         }
 350
 351         /**
 352          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 353          * return the named entity reference as is. Otherwise, returns
 354          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 355          *
 356          * @param string $name
 357          * @return string
 358          */
 359         function normalizeEntity( $name ) {
 360                 # List of all named character entities defined in HTML 4.01
 361                 # http://www.w3.org/TR/html4/sgml/entities.html
 362                 static $htmlEntities = array(
 363                         'aacute' => true,
 364                         'Aacute' => true,
 365                         'acirc' => true,
 366                         'Acirc' => true,
 367                         'acute' => true,
 368                         'aelig' => true,
 369                         'AElig' => true,
 370                         'agrave' => true,
 371                         'Agrave' => true,
 372                         'alefsym' => true,
 373                         'alpha' => true,
 374                         'Alpha' => true,
 375                         'amp' => true,
 376                         'and' => true,
 377                         'ang' => true,
 378                         'apos' => true,
 379                         'aring' => true,
 380                         'Aring' => true,
 381                         'asymp' => true,
 382                         'atilde' => true,
 383                         'Atilde' => true,
 384                         'auml' => true,
 385                         'Auml' => true,
 386                         'bdquo' => true,
 387                         'beta' => true,
 388                         'Beta' => true,
 389                         'brvbar' => true,
 390                         'bull' => true,
 391                         'cap' => true,
 392                         'ccedil' => true,
 393                         'Ccedil' => true,
 394                         'cedil' => true,
 395                         'cent' => true,
 396                         'chi' => true,
 397                         'Chi' => true,
 398                         'circ' => true,
 399                         'clubs' => true,
 400                         'cong' => true,
 401                         'copy' => true,
 402                         'crarr' => true,
 403                         'cup' => true,
 404                         'curren' => true,
 405                         'dagger' => true,
 406                         'Dagger' => true,
 407                         'darr' => true,
 408                         'dArr' => true,
 409                         'deg' => true,
 410                         'delta' => true,
 411                         'Delta' => true,
 412                         'diams' => true,
 413                         'divide' => true,
 414                         'eacute' => true,
 415                         'Eacute' => true,
 416                         'ecirc' => true,
 417                         'Ecirc' => true,
 418                         'egrave' => true,
 419                         'Egrave' => true,
 420                         'empty' => true,
 421                         'emsp' => true,
 422                         'ensp' => true,
 423                         'epsilon' => true,
 424                         'Epsilon' => true,
 425                         'equiv' => true,
 426                         'eta' => true,
 427                         'Eta' => true,
 428                         'eth' => true,
 429                         'ETH' => true,
 430                         'euml' => true,
 431                         'Euml' => true,
 432                         'euro' => true,
 433                         'exist' => true,
 434                         'fnof' => true,
 435                         'forall' => true,
 436                         'frac12' => true,
 437                         'frac14' => true,
 438                         'frac34' => true,
 439                         'frasl' => true,
 440                         'gamma' => true,
 441                         'Gamma' => true,
 442                         'ge' => true,
 443                         'gt' => true,
 444                         'harr' => true,
 445                         'hArr' => true,
 446                         'hearts' => true,
 447                         'hellip' => true,
 448                         'iacute' => true,
 449                         'Iacute' => true,
 450                         'icirc' => true,
 451                         'Icirc' => true,
 452                         'iexcl' => true,
 453                         'igrave' => true,
 454                         'Igrave' => true,
 455                         'image' => true,
 456                         'infin' => true,
 457                         'int' => true,
 458                         'iota' => true,
 459                         'Iota' => true,
 460                         'iquest' => true,
 461                         'isin' => true,
 462                         'iuml' => true,
 463                         'Iuml' => true,
 464                         'kappa' => true,
 465                         'Kappa' => true,
 466                         'lambda' => true,
 467                         'Lambda' => true,
 468                         'lang' => true,
 469                         'laquo' => true,
 470                         'larr' => true,
 471                         'lArr' => true,
 472                         'lceil' => true,
 473                         'ldquo' => true,
 474                         'le' => true,
 475                         'lfloor' => true,
 476                         'lowast' => true,
 477                         'loz' => true,
 478                         'lrm' => true,
 479                         'lsaquo' => true,
 480                         'lsquo' => true,
 481                         'lt' => true,
 482                         'macr' => true,
 483                         'mdash' => true,
 484                         'micro' => true,
 485                         'middot' => true,
 486                         'minus' => true,
 487                         'mu' => true,
 488                         'Mu' => true,
 489                         'nabla' => true,
 490                         'nbsp' => true,
 491                         'ndash' => true,
 492                         'ne' => true,
 493                         'ni' => true,
 494                         'not' => true,
 495                         'notin' => true,
 496                         'nsub' => true,
 497                         'ntilde' => true,
 498                         'Ntilde' => true,
 499                         'nu' => true,
 500                         'Nu' => true,
 501                         'oacute' => true,
 502                         'Oacute' => true,
 503                         'ocirc' => true,
 504                         'Ocirc' => true,
 505                         'oelig' => true,
 506                         'OElig' => true,
 507                         'ograve' => true,
 508                         'Ograve' => true,
 509                         'oline' => true,
 510                         'omega' => true,
 511                         'Omega' => true,
 512                         'omicron' => true,
 513                         'Omicron' => true,
 514                         'oplus' => true,
 515                         'or' => true,
 516                         'ordf' => true,
 517                         'ordm' => true,
 518                         'oslash' => true,
 519                         'Oslash' => true,
 520                         'otilde' => true,
 521                         'Otilde' => true,
 522                         'otimes' => true,
 523                         'ouml' => true,
 524                         'Ouml' => true,
 525                         'para' => true,
 526                         'part' => true,
 527                         'permil' => true,
 528                         'perp' => true,
 529                         'phi' => true,
 530                         'Phi' => true,
 531                         'pi' => true,
 532                         'Pi' => true,
 533                         'piv' => true,
 534                         'plusmn' => true,
 535                         'pound' => true,
 536                         'prime' => true,
 537                         'Prime' => true,
 538                         'prod' => true,
 539                         'prop' => true,
 540                         'psi' => true,
 541                         'Psi' => true,
 542                         'quot' => true,
 543                         'radic' => true,
 544                         'rang' => true,
 545                         'raquo' => true,
 546                         'rarr' => true,
 547                         'rArr' => true,
 548                         'rceil' => true,
 549                         'rdquo' => true,
 550                         'real' => true,
 551                         'reg' => true,
 552                         'rfloor' => true,
 553                         'rho' => true,
 554                         'Rho' => true,
 555                         'rlm' => true,
 556                         'rsaquo' => true,
 557                         'rsquo' => true,
 558                         'sbquo' => true,
 559                         'scaron' => true,
 560                         'Scaron' => true,
 561                         'sdot' => true,
 562                         'sect' => true,
 563                         'shy' => true,
 564                         'sigma' => true,
 565                         'Sigma' => true,
 566                         'sigmaf' => true,
 567                         'sim' => true,
 568                         'spades' => true,
 569                         'sub' => true,
 570                         'sube' => true,
 571                         'sum' => true,
 572                         'sup' => true,
 573                         'sup1' => true,
 574                         'sup2' => true,
 575                         'sup3' => true,
 576                         'supe' => true,
 577                         'szlig' => true,
 578                         'tau' => true,
 579                         'Tau' => true,
 580                         'there4' => true,
 581                         'theta' => true,
 582                         'Theta' => true,
 583                         'thetasym' => true,
 584                         'thinsp' => true,
 585                         'thorn' => true,
 586                         'THORN' => true,
 587                         'tilde' => true,
 588                         'times' => true,
 589                         'trade' => true,
 590                         'uacute' => true,
 591                         'Uacute' => true,
 592                         'uarr' => true,
 593                         'uArr' => true,
 594                         'ucirc' => true,
 595                         'Ucirc' => true,
 596                         'ugrave' => true,
 597                         'Ugrave' => true,
 598                         'uml' => true,
 599                         'upsih' => true,
 600                         'upsilon' => true,
 601                         'Upsilon' => true,
 602                         'uuml' => true,
 603                         'Uuml' => true,
 604                         'weierp' => true,
 605                         'xi' => true,
 606                         'Xi' => true,
 607                         'yacute' => true,
 608                         'Yacute' => true,
 609                         'yen' => true,
 610                         'yuml' => true,
 611                         'Yuml' => true,
 612                         'zeta' => true,
 613                         'Zeta' => true,
 614                         'zwj' => true,
 615                         'zwnj' => true );
 616                 if( isset( $htmlEntities[$name] ) ) {
 617                         return "&$name;";
 618                 } else {
 619                         return "&amp;$name;";
 620                 }
 621         }
 622
 623         function decCharReference( $codepoint ) {
 624                 $point = IntVal( $codepoint );
 625                 if( Sanitizer::validateCodepoint( $point ) ) {
 626                         return sprintf( '&#%d;', $point );
 627                 } else {
 628                         return null;
 629                 }
 630         }
 631
 632         function hexCharReference( $codepoint ) {
 633                 $point = hexdec( $codepoint );
 634                 if( Sanitizer::validateCodepoint( $point ) ) {
 635                         return sprintf( '&#x%x;', $point );
 636                 } else {
 637                         return null;
 638                 }
 639         }
 640
 641         /**
 642          * Returns true if a given Unicode codepoint is a valid character in XML.
 643          * @param int $codepoint
 644          * @return bool
 645          */
 646         function validateCodepoint( $codepoint ) {
 647                 return ($codepoint ==    0x09)
 648                         || ($codepoint ==    0x0a)
 649                         || ($codepoint ==    0x0d)
 650                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 651                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 652                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 653         }
 654
 655         /**
 656          * Fetch the whitelist of acceptable attributes for a given
 657          * element name.
 658          *
 659          * @param string $element
 660          * @return array
 661          */
 662         function attributeWhitelist( $element ) {
 663                 $list = Sanitizer::setupAttributeWhitelist();
 664                 return isset( $list[$element] )
 665                         ? $list[$element]
 666                         : array();
 667         }
 668
 669         /**
 670          * @return array
 671          */
 672         function setupAttributeWhitelist() {
 673                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 674                 $block = array_merge( $common, array( 'align' ) );
 675                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 676                 $tablecell = array( 'abbr',
 677                                     'axis',
 678                                     'headers',
 679                                     'scope',
 680                                     'rowspan',
 681                                     'colspan',
 682                                     'nowrap', # deprecated
 683                                     'width', # deprecated
 684                                     'height' # deprecated
 685                                     );
 686
 687                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 688                 # See: http://www.w3.org/TR/html4/
 689                 $whitelist = array (
 690                         # 7.5.4
 691                         'div'        => $block,
 692                         'center'     => $common, # deprecated
 693                         'span'       => $block, # ??
 694
 695                         # 7.5.5
 696                         'h1'         => $block,
 697                         'h2'         => $block,
 698                         'h3'         => $block,
 699                         'h4'         => $block,
 700                         'h5'         => $block,
 701                         'h6'         => $block,
 702
 703                         # 7.5.6
 704                         # address
 705
 706                         # 8.2.4
 707                         # bdo
 708
 709                         # 9.2.1
 710                         'em'         => $common,
 711                         'strong'     => $common,
 712                         'cite'       => $common,
 713                         # dfn
 714                         'code'       => $common,
 715                         # samp
 716                         # kbd
 717                         'var'        => $common,
 718                         # abbr
 719                         # acronym
 720
 721                         # 9.2.2
 722                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 723                         # q
 724
 725                         # 9.2.3
 726                         'sub'        => $common,
 727                         'sup'        => $common,
 728
 729                         # 9.3.1
 730                         'p'          => $block,
 731
 732                         # 9.3.2
 733                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 734
 735                         # 9.3.4
 736                         'pre'        => array_merge( $common, array( 'width' ) ),
 737
 738                         # 9.4
 739                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 740                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 741
 742                         # 10.2
 743                         'ul'         => array_merge( $common, array( 'type' ) ),
 744                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 745                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 746
 747                         # 10.3
 748                         'dl'         => $common,
 749                         'dd'         => $common,
 750                         'dt'         => $common,
 751
 752                         # 11.2.1
 753                         'table'      => array_merge( $common,
 754                                                                 array( 'summary', 'width', 'border', 'frame',
 755                                                                                          'rules', 'cellspacing', 'cellpadding',
 756                                                                                          'align', 'bgcolor', 'frame', 'rules',
 757                                                                                          'border' ) ),
 758
 759                         # 11.2.2
 760                         'caption'    => array_merge( $common, array( 'align' ) ),
 761
 762                         # 11.2.3
 763                         'thead'      => array_merge( $common, $tablealign ),
 764                         'tfoot'      => array_merge( $common, $tablealign ),
 765                         'tbody'      => array_merge( $common, $tablealign ),
 766
 767                         # 11.2.4
 768                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 769                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 770
 771                         # 11.2.5
 772                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 773
 774                         # 11.2.6
 775                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 776                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 777
 778                         # 15.2.1
 779                         'tt'         => $common,
 780                         'b'          => $common,
 781                         'i'          => $common,
 782                         'big'        => $common,
 783                         'small'      => $common,
 784                         'strike'     => $common,
 785                         's'          => $common,
 786                         'u'          => $common,
 787
 788                         # 15.2.2
 789                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 790                         # basefont
 791
 792                         # 15.3
 793                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 794
 795                         # XHTML Ruby annotation text module, simple ruby only.
 796                         # http://www.w3c.org/TR/ruby/
 797                         'ruby'       => $common,
 798                         # rbc
 799                         # rtc
 800                         'rb'         => $common,
 801                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 802                         'rp'         => $common,
 803                         );
 804                 return $whitelist;
 805         }
 806
 807         /**
 808          * Take a fragment of (potentially invalid) HTML and return
 809          * a version with any tags removed, encoded suitably for literal
 810          * inclusion in an attribute value.
 811          *
 812          * @param string $text HTML fragment
 813          * @return string
 814          */
 815         function stripAllTags( $text ) {
 816                 # Actual <tags>
 817                 $text = preg_replace( '/<[^>]*>/', '', $text );
 818
 819                 # Normalize &entities and whitespace
 820                 $text = Sanitizer::normalizeAttributeValue( $text );
 821
 822                 # Will be placed into "double-quoted" attributes,
 823                 # make sure remaining bits are safe.
 824                 $text = str_replace(
 825                         array('<', '>', '"'),
 826                         array('&lt;', '&gt;', '&quot;'),
 827                         $text );
 828
 829                 return $text;
 830         }
 831
 832 }
 833
 834 ?>