* Adding "CC ?= gcc" portability
[mediawiki.git] / includes / Sanitizer.php
blob576bd4423a1e316647e7aa22290ed913690f9065
1 <?php
3 /**
4 * (X)HTML sanitizer for MediaWiki
6 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
7 * http://www.mediawiki.org/
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 * http://www.gnu.org/copyleft/gpl.html
24 * @package MediaWiki
27 class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Every fragment that parses as a tag whose
 * element is in the allowed set is re-emitted with its attributes
 * filtered through Sanitizer::fixTagAttributes(); every other fragment
 * has its leading '<' and any '>' escaped so it shows up as plain text.
 * The allowed set is empty unless $wgUserHtml is true.
 *
 * When $wgUseTidy is false this function also balances tags itself:
 * mismatched closing tags, table cells outside a table and illegally
 * nested elements are escaped, and elements still open at the end of
 * the text are closed. When $wgUseTidy is true no balancing is done
 * here.
 *
 * @access private
 * @param string $text
 * @return string
 */
function removeHTMLtags( $text ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Sanitizer::removeHTMLtags';
	wfProfileIn( $fname );

	if( $wgUserHtml ) {
		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table
			'td', 'th', 'tr'
		);
	} else {
		$htmlpairs = array();
		$htmlsingle = array();
		$htmlnest = array();
		$tabletags = array();
	}

	# Table rows and cells are treated like the other elements that
	# do not need an explicit closing tag.
	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );

	# 1: optional slash of a closing tag, 2: element name, 3: raw
	# attributes, 4: closing bracket (optionally "/>"), 5: text up to
	# the next tag.
	$tagPattern = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

	$bits = explode( '<', $text );
	$text = array_shift( $bits );
	if( !$wgUseTidy ) {
		$tagstack = array(); $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			# Test the match result explicitly instead of silencing the
			# notices an empty $regs would otherwise cause.
			if ( preg_match( $tagPattern, $x, $regs ) ) {
				$slash = $regs[1];
				$t = strtolower( $regs[2] );
				$params = $regs[3];
				$brace = $regs[4];
				$rest = $regs[5];

				if ( in_array( $t, $htmlelements ) ) {
					$badtag = 0;
					$newparams = '';
					# Check our stack
					if ( $slash ) {
						# Closing a tag: it has to match the innermost open
						# element unless the element needs no closing at all.
						$ot = null;
						$mismatch = false;
						if ( !in_array( $t, $htmlsingle ) ) {
							$ot = array_pop( $tagstack );
							$mismatch = ( $ot != $t );
						}
						if ( $mismatch ) {
							# Restore the element we popped. array_pop() yields
							# null for an empty stack; never push that back.
							if ( !is_null( $ot ) ) {
								array_push( $tagstack, $ot );
							}
							$badtag = 1;
						} elseif ( $t == 'table' ) {
							# Leaving a table: resume the stack saved on entry.
							$tagstack = array_pop( $tablestack );
						}
					} else {
						# Keep track for later
						if ( in_array( $t, $tabletags ) &&
						  !in_array( 'table', $tagstack ) ) {
							$badtag = 1;
						} elseif ( in_array( $t, $tagstack ) &&
						  !in_array( $t, $htmlnest ) ) {
							$badtag = 1;
						} elseif ( !in_array( $t, $htmlsingle ) ) {
							if ( $t == 'table' ) {
								# A table starts a fresh nesting context.
								array_push( $tablestack, $tagstack );
								$tagstack = array();
							}
							array_push( $tagstack, $t );
						}
						# Strip non-approved attributes from the tag
						$newparams = Sanitizer::fixTagAttributes( $params, $t );
					}
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
			}
			# Not a tag we accept: show it literally.
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			$regs = array();
			$t = '';
			if ( preg_match( $tagPattern, $x, $regs ) ) {
				$t = strtolower( $regs[2] );
			}
			if ( $t != '' && in_array( $t, $htmlelements ) ) {
				$slash = $regs[1];
				$brace = $regs[4];
				$newparams = Sanitizer::fixTagAttributes( $regs[3], $t );
				$rest = str_replace( '>', '&gt;', $regs[5] );
				$text .= "<$slash$t$newparams$brace$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
		}
	}
	wfProfileOut( $fname );
	return $text;
}
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * An unterminated comment ('<!--' with no later '-->') stops the
 * processing; it and everything after it is returned unchanged.
 *
 * @access private
 * @param string $text
 * @return string
 */
function removeHTMLcomments( $text ) {
	$fname = 'Sanitizer::removeHTMLcomments';
	wfProfileIn( $fname );
	while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
		$end = strpos( $text, '-->', $start + 4 );
		if ( $end === false ) {
			# Unterminated comment; bail out
			break;
		}

		# Point just past the closing '-->'
		$end += 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline.
		# Start at the character before the comment and widen the
		# span over the spaces on both sides.
		$spaceStart = max( $start - 1, 0 );
		$spaceLen = $end - $spaceStart;
		while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
			$spaceStart--;
			$spaceLen++;
		}
		while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
			$spaceLen++;
		}
		if ( substr( $text, $spaceStart, 1 ) === "\n"
		  && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $start, $end - $start );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Attributes with an empty quoted value are kept with an empty value
 * - Double attributes are discarded (the first occurrence wins)
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text Raw attribute text taken from inside a tag
 * @param string $element Lower-case element name the attributes belong to
 * @return string Empty string, or the attributes with a leading space
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	$attrib = '[A-Za-z0-9]'; #FIXME
	$space = '[\x09\x0a\x0d\x20]';
	# Groups: 1 name, 2 the whole "= value" part, 3 double-quoted value,
	# 4 single-quoted value, 5 bare value, 6 bare #rrggbb colour.
	if( !preg_match_all(
		"/(?:^|$space)($attrib+)
		  ($space*=$space*
			(?:
			 # The attribute value: quoted or alone
			  \"([^<\"]*)\"
			 | '([^<']*)'
			 |  ([a-zA-Z0-9._:-]+)
			 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
			                     # colors are specified like this.
			                     # We'll be normalizing it.
			)
		   )?(?=$space|\$)/sx",
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return '';
	}

	$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
	$attribs = array();
	foreach( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		if( !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		# PCRE leaves trailing groups that did not take part in the match
		# out of $set, so every index past 1 is checked with isset().
		if( !isset( $set[2] ) || $set[2] === '' ) {
			# In XHTML, attributes must have a value.
			$value = $set[1];
		} elseif( isset( $set[6] ) && $set[6] !== '' ) {
			# Illegal #XXXXXX color with no quotes.
			$value = Sanitizer::normalizeAttributeValue( $set[6] );
		} elseif( isset( $set[5] ) && $set[5] !== '' ) {
			# No quotes.
			$value = Sanitizer::normalizeAttributeValue( $set[5] );
		} elseif( isset( $set[4] ) && $set[4] !== '' ) {
			# Single-quoted; the value may hold double quotes, which
			# must be escaped before it is re-quoted with them.
			$value = str_replace( '"', '&quot;',
				Sanitizer::normalizeAttributeValue( $set[4] ) );
		} elseif( isset( $set[3] ) && $set[3] !== '' ) {
			# Double-quoted
			$value = Sanitizer::normalizeAttributeValue( $set[3] );
		} else {
			# An "=" followed by an empty quoted string (attr="" or
			# attr=''). The unquoted forms cannot match empty, so this
			# is the only remaining case; it is valid markup.
			$value = '';
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		# NOTE(review): wfMungeToUtf8() is defined elsewhere; presumably it
		# decodes character references so encoded payloads are caught too
		# -- confirm against its definition.
		if( $attribute == 'style' && preg_match(
			'/(expression|tps*:\/\/|url\\s*\().*/is',
			wfMungeToUtf8( $value ) ) ) {
			# haxx0r
			continue;
		}

		# Only the first occurrence of an attribute is kept.
		if( !isset( $attribs[$attribute] ) ) {
			$attribs[$attribute] = "$attribute=\"$value\"";
		}
	}
	if( empty( $attribs ) ) {
		return '';
	} else {
		return ' ' . implode( ' ', $attribs );
	}
}
/**
 * Prepare XML source text for use as an attribute value: character
 * references are normalized first, then every whitespace character
 * (and each CR LF pair as a unit) is turned into one plain space.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that the result is not the attribute's value; it is an
 * XML source fragment meant to be placed into output.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeAttributeValue( $text ) {
	$withCleanReferences = Sanitizer::normalizeCharReferences( $text );
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $whitespace, ' ', $withCleanReferences );
}
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * Hexadecimal references are only recognized when made up of real hex
 * digits. Anything else (for example "&#x4g1;") is not a reference,
 * so its ampersand is escaped by the final alternative rather than
 * the text being rewritten into a different, valid reference.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeCharReferences( $text ) {
	return preg_replace_callback(
		'/&([A-Za-z0-9]+);
		 |&\#([0-9]+);
		 |&\#x([0-9A-Fa-f]+);
		 |&\#X([0-9A-Fa-f]+);
		 |(&)/x',
		array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
		$text );
}
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Sends the captured reference to the matching normalizer: group 1 is
 * a named entity, group 2 a decimal reference, groups 3 and 4 a
 * hexadecimal reference. If no group captured anything, or the
 * normalizer rejects the reference by returning null, the matched
 * text is returned HTML-escaped.
 *
 * @param array $matches Match groups from preg_replace_callback()
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
	$normalized = null;
	switch( true ) {
		case $matches[1] != '':
			$normalized = Sanitizer::normalizeEntity( $matches[1] );
			break;
		case $matches[2] != '':
			$normalized = Sanitizer::decCharReference( $matches[2] );
			break;
		case $matches[3] != '':
			$normalized = Sanitizer::hexCharReference( $matches[3] );
			break;
		case $matches[4] != '':
			$normalized = Sanitizer::hexCharReference( $matches[4] );
			break;
	}
	return is_null( $normalized )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
/**
 * Pass a known named entity through unchanged and defuse an unknown
 * one. Names on the list below come back as "&name;"; any other name
 * comes back with its ampersand escaped (eg &amp;foo;). Names are
 * case-sensitive.
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
function normalizeEntity( $name ) {
	# Named character entities accepted as-is; see
	# http://www.w3.org/TR/html4/sgml/entities.html
	# The lookup table is built once, on first use.
	static $known = null;
	if( is_null( $known ) ) {
		$names =
			'aacute Aacute acirc Acirc acute aelig AElig agrave Agrave ' .
			'alefsym alpha Alpha amp and ang apos aring Aring asymp ' .
			'atilde Atilde auml Auml ' .
			'bdquo beta Beta brvbar bull ' .
			'cap ccedil Ccedil cedil cent chi Chi circ clubs cong copy ' .
			'crarr cup curren ' .
			'dagger Dagger darr dArr deg delta Delta diams divide ' .
			'eacute Eacute ecirc Ecirc egrave Egrave empty emsp ensp ' .
			'epsilon Epsilon equiv eta Eta eth ETH euml Euml euro exist ' .
			'fnof forall frac12 frac14 frac34 frasl ' .
			'gamma Gamma ge gt ' .
			'harr hArr hearts hellip ' .
			'iacute Iacute icirc Icirc iexcl igrave Igrave image infin ' .
			'int iota Iota iquest isin iuml Iuml ' .
			'kappa Kappa ' .
			'lambda Lambda lang laquo larr lArr lceil ldquo le lfloor ' .
			'lowast loz lrm lsaquo lsquo lt ' .
			'macr mdash micro middot minus mu Mu ' .
			'nabla nbsp ndash ne ni not notin nsub ntilde Ntilde nu Nu ' .
			'oacute Oacute ocirc Ocirc oelig OElig ograve Ograve oline ' .
			'omega Omega omicron Omicron oplus or ordf ordm oslash ' .
			'Oslash otilde Otilde otimes ouml Ouml ' .
			'para part permil perp phi Phi pi Pi piv plusmn pound prime ' .
			'Prime prod prop psi Psi ' .
			'quot ' .
			'radic rang raquo rarr rArr rceil rdquo real reg rfloor rho ' .
			'Rho rlm rsaquo rsquo ' .
			'sbquo scaron Scaron sdot sect shy sigma Sigma sigmaf sim ' .
			'spades sub sube sum sup sup1 sup2 sup3 supe szlig ' .
			'tau Tau there4 theta Theta thetasym thinsp thorn THORN ' .
			'tilde times trade ' .
			'uacute Uacute uarr uArr ucirc Ucirc ugrave Ugrave uml upsih ' .
			'upsilon Upsilon uuml Uuml ' .
			'weierp ' .
			'xi Xi ' .
			'yacute Yacute yen yuml Yuml ' .
			'zeta Zeta zwj zwnj';
		# Flip so that the names become the keys used for lookup.
		$known = array_flip( explode( ' ', $names ) );
	}

	if( !isset( $known[$name] ) ) {
		return "&amp;$name;";
	}
	return "&$name;";
}
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint The digits found between "&#" and ";"
 * @return string|null "&#N;" in plain decimal form, or null when
 *   Sanitizer::validateCodepoint() rejects the code point
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint The hex digits found between "&#x" and ";"
 * @return string|null "&#xN;" with lower-case "x" and digits, or null
 *   when Sanitizer::validateCodepoint() rejects the code point
 */
function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
/**
 * Tell whether a Unicode code point may appear as a character in XML.
 *
 * Accepted are tab, line feed, carriage return, U+0020..U+D7FF,
 * U+E000..U+FFFD and U+10000..U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	# The three permitted control characters.
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	# Everything from space up to just below U+D800.
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	# From U+E000 up to U+FFFD.
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	# From U+10000 up to U+10FFFF.
	return ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name.
 *
 * The table from Sanitizer::setupAttributeWhitelist() is built on the
 * first call and reused afterwards, so it is not rebuilt for every
 * tag that gets sanitized.
 *
 * @param string $element
 * @return array Attribute names; empty for an element that is not listed
 */
function attributeWhitelist( $element ) {
	static $list;
	if( !isset( $list ) ) {
		$list = Sanitizer::setupAttributeWhitelist();
	}
	return isset( $list[$element] )
		? $list[$element]
		: array();
}
/**
 * Build the table of attributes accepted on each element.
 *
 * @return array Map of lower-case element name to the list of
 *   attribute names allowed on that element
 */
function setupAttributeWhitelist() {
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array( 'abbr',
	                    'axis',
	                    'headers',
	                    'scope',
	                    'rowspan',
	                    'colspan',
	                    'nowrap', # deprecated
	                    'width',  # deprecated
	                    'height'  # deprecated
	                    );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array (
		# 7.5.4
		'div'        => $block,
		'center'     => $common, # deprecated
		'span'       => $block, # ??

		# 7.5.5
		'h1'         => $block,
		'h2'         => $block,
		'h3'         => $block,
		'h4'         => $block,
		'h5'         => $block,
		'h6'         => $block,

		# 7.5.6
		# address

		# 8.2.4
		# bdo

		# 9.2.1
		'em'         => $common,
		'strong'     => $common,
		'cite'       => $common,
		# dfn
		'code'       => $common,
		# samp
		# kbd
		'var'        => $common,
		# abbr
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		# q

		# 9.2.3
		'sub'        => $common,
		'sup'        => $common,

		# 9.3.1
		'p'          => $block,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul'         => array_merge( $common, array( 'type' ) ),
		'ol'         => array_merge( $common, array( 'type', 'start' ) ),
		'li'         => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $common,
		'dd'         => $common,
		'dt'         => $common,

		# 11.2.1
		# Each attribute is listed once.
		'table'      => array_merge( $common,
							array( 'summary', 'width', 'border', 'frame',
									 'rules', 'cellspacing', 'cellpadding',
									 'align', 'bgcolor' ) ),

		# 11.2.2
		'caption'    => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead'      => array_merge( $common, $tablealign ),
		'tfoot'      => array_merge( $common, $tablealign ),
		'tbody'      => array_merge( $common, $tablealign ),

		# 11.2.4
		'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
		'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

		# 11.2.5
		'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td'         => array_merge( $common, $tablecell, $tablealign ),
		'th'         => array_merge( $common, $tablecell, $tablealign ),

		# 15.2.1
		'tt'         => $common,
		'b'          => $common,
		'i'          => $common,
		'big'        => $common,
		'small'      => $common,
		'strike'     => $common,
		's'          => $common,
		'u'          => $common,

		# 15.2.2
		'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		'ruby'       => $common,
		# rbc
		# rtc
		'rb'         => $common,
		'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rp'         => $common,
		);
	return $whitelist;
}
/**
 * Reduce a fragment of (potentially invalid) HTML to text that can be
 * placed literally inside a double-quoted attribute value: anything
 * that looks like a tag is dropped, entities and whitespace are
 * normalized, and the characters that remain unsafe are escaped.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
	# Drop everything from a '<' up to the next '>'
	$withoutTags = preg_replace( '/<[^>]*>/', '', $text );

	# Normalize &entities and whitespace
	$normalized = Sanitizer::normalizeAttributeValue( $withoutTags );

	# The result goes into a "double-quoted" attribute, so stray
	# angle brackets and double quotes must not survive.
	$unsafe = array( '<', '>', '"' );
	$escaped = array( '&lt;', '&gt;', '&quot;' );
	return str_replace( $unsafe, $escaped, $normalized );
}