@ considered evil
[mediawiki.git] / includes / Sanitizer.php
blobfec5d720bec8a81e229635673b0443ca96f99d62
1 <?php
/**
 * (X)HTML sanitizer for MediaWiki
 *
 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
 * http://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @package MediaWiki
 */
27 class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. A fragment is kept as a tag only when it
 * parses as one and its element name is on the allowed list; that list is
 * empty unless $wgUserHtml is set. Every other fragment is emitted with
 * its angle brackets escaped. Attributes of kept opening tags go through
 * Sanitizer::fixTagAttributes().
 *
 * With $wgUseTidy off, nesting is tracked on a stack: mismatched closing
 * tags, table cells outside a table and illegal self-nesting are escaped,
 * and tags still open at the end are closed. With $wgUseTidy on, no
 * balancing is done here.
 *
 * @param string $text
 * @return string
 * @access private
 */
function removeHTMLtags( $text ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Sanitizer::removeHTMLtags';
	wfProfileIn( $fname );

	if( $wgUserHtml ) {
		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table
			'td', 'th', 'tr'
		);
	} else {
		$htmlpairs = array();
		$htmlsingle = array();
		$htmlnest = array();
		$tabletags = array();
	}

	# Table row/cell tags are also treated as not needing a closing tag
	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );

	# 1: optional slash, 2: element name, 3: attributes,
	# 4: closing brace, 5: text up to the next tag
	$tagPattern = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

	$bits = explode( '<', $text );
	$text = array_shift( $bits );
	if( !$wgUseTidy ) {
		$tagstack = array();
		$tablestack = array();
		foreach ( $bits as $x ) {
			if ( !preg_match( $tagPattern, $x, $regs ) ) {
				# Does not look like a tag at all; escape it.
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
				continue;
			}
			$slash = $regs[1];
			$t = strtolower( $regs[2] );
			$params = $regs[3];
			$brace = $regs[4];
			$rest = $regs[5];

			$badtag = 0;
			if ( in_array( $t, $htmlelements ) ) {
				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					if ( !in_array( $t, $htmlsingle ) &&
						( $ot = array_pop( $tagstack ) ) != $t ) {
						# Mismatch: restore what was popped. Nothing is
						# restored when the stack was already empty.
						if ( !is_null( $ot ) ) {
							array_push( $tagstack, $ot );
						}
						$badtag = 1;
					} else {
						if ( $t == 'table' ) {
							# Back to the stack that was active before
							# this table was opened.
							$tagstack = array_pop( $tablestack );
						}
						$newparams = '';
					}
				} else {
					# Keep track for later
					if ( in_array( $t, $tabletags ) &&
						!in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} else if ( in_array( $t, $tagstack ) &&
						!in_array( $t, $htmlnest ) ) {
						$badtag = 1;
					} else if ( !in_array( $t, $htmlsingle ) ) {
						if ( $t == 'table' ) {
							# Each table gets a fresh stack of its own
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}
					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			$isTag = preg_match( $tagPattern, $x, $regs );
			if ( $isTag && in_array( $t = strtolower( $regs[2] ), $htmlelements ) ) {
				$slash = $regs[1];
				$brace = $regs[4];
				$newparams = Sanitizer::fixTagAttributes( $regs[3], $t );
				$rest = str_replace( '>', '&gt;', $regs[5] );
				$text .= "<$slash$t$newparams$brace$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
		}
	}
	wfProfileOut( $fname );
	return $text;
}
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * An unterminated comment (no closing '-->') is left in place and stops
 * the scan.
 *
 * @param string $text
 * @return string
 * @access private
 */
function removeHTMLcomments( $text ) {
	$fname = 'Sanitizer::removeHTMLcomments';
	wfProfileIn( $fname );
	while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
		$end = strpos( $text, '-->', $start + 4 );
		if ( $end === false ) {
			# Unterminated comment; bail out
			break;
		}

		# Point just past the closing '-->'
		$end += 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline
		$spaceStart = max( $start - 1, 0 );
		$spaceLen = $end - $spaceStart;
		while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
			$spaceStart--;
			$spaceLen++;
		}
		while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
			$spaceLen++;
		}
		if ( substr( $text, $spaceStart, 1 ) === "\n"
			&& substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $start, $end - $start );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Attributes with an empty quoted value are kept with an empty value
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text
 * @param string $element
 * @return string
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	$attrib = '[A-Za-z0-9]'; #FIXME
	$space = '[\x09\x0a\x0d\x20]';
	# Groups: 1 name, 2 whole "=value" part, 3 double-quoted value,
	# 4 single-quoted value, 5 bare value, 6 bare #hex colour.
	if( !preg_match_all(
		"/(?:^|$space)($attrib+)
		  ($space*=$space*
			(?:
			 # The attribute value: quoted or alone
			  \"([^<\"]*)\"
			 | '([^<']*)'
			 |  ([a-zA-Z0-9._:-]+)
			 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
			                     # colors are specified like this.
			                     # We'll be normalizing it.
			)
		   )?(?=$space|\$)/sx",
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return '';
	}

	$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
	$attribs = array();
	foreach( $pairs as $set ) {
		# Trailing groups that took no part in the match are left out of
		# the result; pad so every group index can be read safely.
		$set = array_pad( $set, 7, '' );

		$attribute = strtolower( $set[1] );
		if( !isset( $whitelist[$attribute] ) ) {
			continue;
		}
		if( $set[2] == '' ) {
			# In XHTML, attributes must have a value.
			$value = $set[1];
		} elseif( $set[3] != '' ) {
			# Double-quoted
			$value = Sanitizer::normalizeAttributeValue( $set[3] );
		} elseif( $set[4] != '' ) {
			# Single-quoted
			$value = str_replace( '"', '&quot;',
				Sanitizer::normalizeAttributeValue( $set[4] ) );
		} elseif( $set[5] != '' ) {
			# No quotes.
			$value = Sanitizer::normalizeAttributeValue( $set[5] );
		} elseif( $set[6] != '' ) {
			# Illegal #XXXXXX color with no quotes.
			$value = Sanitizer::normalizeAttributeValue( $set[6] );
		} else {
			# A value part matched but every value group is empty. Only
			# the two quoted forms can match an empty string, so this
			# is attr="" or attr=''. Keep it as an empty value instead
			# of aborting the whole request.
			$value = '';
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if( $attribute == 'style' && preg_match(
			'/(expression|tps*:\/\/|url\\s*\().*/is',
			wfMungeToUtf8( $value ) ) ) {
			# haxx0r
			continue;
		}

		# First occurrence of an attribute wins; later ones are dropped.
		if( !isset( $attribs[$attribute] ) ) {
			$attribs[$attribute] = "$attribute=\"$value\"";
		}
	}
	if( empty( $attribs ) ) {
		return '';
	} else {
		return ' ' . implode( ' ', $attribs );
	}
}
/**
 * Normalize whitespace and character references in an XML source-
 * encoded text for an attribute value.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that we're not returning the value, but are returning
 * XML source fragments that will be slapped into output.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeAttributeValue( $text ) {
	# Character references are cleaned first; after that each CRLF pair
	# or single space/CR/LF/tab character becomes one plain space.
	$withCleanRefs = Sanitizer::normalizeCharReferences( $text );
	return preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $withCleanRefs );
}
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * Hexadecimal references are only recognised when every digit is a real
 * hex digit. Anything else is treated as a stray '&' and escaped, so a
 * malformed reference is never turned into a different, valid one.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeCharReferences( $text ) {
	# Group numbers are relied upon by normalizeCharReferencesCallback():
	# 1 named, 2 decimal, 3 hex with 'x', 4 hex with 'X', 5 bare '&'.
	return preg_replace_callback(
		'/&([A-Za-z0-9]+);
		 |&\#([0-9]+);
		 |&\#x([0-9A-Fa-f]+);
		 |&\#X([0-9A-Fa-f]+);
		 |(&)/x',
		array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
		$text );
}
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Dispatches on whichever capture group is non-empty: 1 is a named
 * entity, 2 a decimal reference, 3 and 4 hexadecimal references. When no
 * group applies, or the chosen helper rejects the reference by returning
 * null, the whole match is returned HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[4] );
	} else {
		$normalized = null;
	}

	return is_null( $normalized )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 * return the named entity reference as is. Otherwise, returns
 * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 *
 * The lookup is case-sensitive.
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
function normalizeEntity( $name ) {
	# List of all named character entities defined in HTML 4.01
	# http://www.w3.org/TR/html4/sgml/entities.html
	# NOTE(review): 'apos' is listed as well; it looks like the XML/XHTML
	# entity rather than an HTML 4.01 one -- confirm that is intended.
	static $htmlEntities = null;
	if( is_null( $htmlEntities ) ) {
		# Built once, then flipped so membership is a single isset().
		$htmlEntities = array_flip( array(
			'aacute', 'Aacute', 'acirc', 'Acirc', 'acute', 'aelig', 'AElig',
			'agrave', 'Agrave', 'alefsym', 'alpha', 'Alpha', 'amp', 'and',
			'ang', 'apos', 'aring', 'Aring', 'asymp', 'atilde', 'Atilde',
			'auml', 'Auml',
			'bdquo', 'beta', 'Beta', 'brvbar', 'bull',
			'cap', 'ccedil', 'Ccedil', 'cedil', 'cent', 'chi', 'Chi', 'circ',
			'clubs', 'cong', 'copy', 'crarr', 'cup', 'curren',
			'dagger', 'Dagger', 'darr', 'dArr', 'deg', 'delta', 'Delta',
			'diams', 'divide',
			'eacute', 'Eacute', 'ecirc', 'Ecirc', 'egrave', 'Egrave', 'empty',
			'emsp', 'ensp', 'epsilon', 'Epsilon', 'equiv', 'eta', 'Eta',
			'eth', 'ETH', 'euml', 'Euml', 'euro', 'exist',
			'fnof', 'forall', 'frac12', 'frac14', 'frac34', 'frasl',
			'gamma', 'Gamma', 'ge', 'gt',
			'harr', 'hArr', 'hearts', 'hellip',
			'iacute', 'Iacute', 'icirc', 'Icirc', 'iexcl', 'igrave', 'Igrave',
			'image', 'infin', 'int', 'iota', 'Iota', 'iquest', 'isin',
			'iuml', 'Iuml',
			'kappa', 'Kappa',
			'lambda', 'Lambda', 'lang', 'laquo', 'larr', 'lArr', 'lceil',
			'ldquo', 'le', 'lfloor', 'lowast', 'loz', 'lrm', 'lsaquo',
			'lsquo', 'lt',
			'macr', 'mdash', 'micro', 'middot', 'minus', 'mu', 'Mu',
			'nabla', 'nbsp', 'ndash', 'ne', 'ni', 'not', 'notin', 'nsub',
			'ntilde', 'Ntilde', 'nu', 'Nu',
			'oacute', 'Oacute', 'ocirc', 'Ocirc', 'oelig', 'OElig', 'ograve',
			'Ograve', 'oline', 'omega', 'Omega', 'omicron', 'Omicron',
			'oplus', 'or', 'ordf', 'ordm', 'oslash', 'Oslash', 'otilde',
			'Otilde', 'otimes', 'ouml', 'Ouml',
			'para', 'part', 'permil', 'perp', 'phi', 'Phi', 'pi', 'Pi', 'piv',
			'plusmn', 'pound', 'prime', 'Prime', 'prod', 'prop', 'psi', 'Psi',
			'quot',
			'radic', 'rang', 'raquo', 'rarr', 'rArr', 'rceil', 'rdquo', 'real',
			'reg', 'rfloor', 'rho', 'Rho', 'rlm', 'rsaquo', 'rsquo',
			'sbquo', 'scaron', 'Scaron', 'sdot', 'sect', 'shy', 'sigma',
			'Sigma', 'sigmaf', 'sim', 'spades', 'sub', 'sube', 'sum', 'sup',
			'sup1', 'sup2', 'sup3', 'supe', 'szlig',
			'tau', 'Tau', 'there4', 'theta', 'Theta', 'thetasym', 'thinsp',
			'thorn', 'THORN', 'tilde', 'times', 'trade',
			'uacute', 'Uacute', 'uarr', 'uArr', 'ucirc', 'Ucirc', 'ugrave',
			'Ugrave', 'uml', 'upsih', 'upsilon', 'Upsilon', 'uuml', 'Uuml',
			'weierp',
			'xi', 'Xi',
			'yacute', 'Yacute', 'yen', 'yuml', 'Yuml',
			'zeta', 'Zeta', 'zwj', 'zwnj'
		) );
	}

	if( !isset( $htmlEntities[$name] ) ) {
		# Unknown name: escape the ampersand so it shows up literally.
		return "&amp;$name;";
	}
	return "&$name;";
}
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint Decimal digits taken from between '&#' and ';'
 * @return string|null The reference rewritten as '&#N;', or null when
 *  Sanitizer::validateCodepoint() rejects the code point
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#%d;', $point );
}
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint Hex digits taken from between '&#x' and ';'
 * @return string|null The reference rewritten with a lowercase '&#x'
 *  prefix and lowercase digits, or null when
 *  Sanitizer::validateCodepoint() rejects the code point
 */
function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $point );
}
/**
 * Returns true if a given Unicode codepoint is a valid character in XML.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	# Tab, line feed and carriage return are the only values below 0x20
	# that are accepted.
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	# 0xd800-0xdfff is excluded by the gap between the next two ranges.
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	# 0xfffe and 0xffff are excluded.
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	return ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name.
 *
 * The full table is built on the first call and reused afterwards, so
 * looking up attributes for every tag of a page does not rebuild it.
 *
 * @param string $element
 * @return array Attribute names; empty when the element has no entry
 */
function attributeWhitelist( $element ) {
	static $list;
	if( !isset( $list ) ) {
		$list = Sanitizer::setupAttributeWhitelist();
	}
	return isset( $list[$element] )
		? $list[$element]
		: array();
}
/**
 * Build the table of acceptable attributes per element.
 *
 * @return array Map of element name => list of allowed attribute names
 */
function setupAttributeWhitelist() {
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array( 'abbr',
	                    'axis',
	                    'headers',
	                    'scope',
	                    'rowspan',
	                    'colspan',
	                    'nowrap', # deprecated
	                    'width',  # deprecated
	                    'height'  # deprecated
	                    );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array (
		# 7.5.4
		'div'        => $block,
		'center'     => $common, # deprecated
		'span'       => $block, # ??

		# 7.5.5
		'h1'         => $block,
		'h2'         => $block,
		'h3'         => $block,
		'h4'         => $block,
		'h5'         => $block,
		'h6'         => $block,

		# 7.5.6
		# address

		# 8.2.4
		# bdo

		# 9.2.1
		'em'         => $common,
		'strong'     => $common,
		'cite'       => $common,
		# dfn
		'code'       => $common,
		# samp
		# kbd
		'var'        => $common,
		# abbr
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),

		# 9.2.3
		'sub'        => $common,
		'sup'        => $common,

		# 9.3.1
		'p'          => $block,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul'         => array_merge( $common, array( 'type' ) ),
		'ol'         => array_merge( $common, array( 'type', 'start' ) ),
		'li'         => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $common,
		'dd'         => $common,
		'dt'         => $common,

		# 11.2.1
		# Each attribute is listed once; 'frame', 'rules' and 'border'
		# used to appear twice.
		'table'      => array_merge( $common,
							array( 'summary', 'width', 'border', 'frame',
									 'rules', 'cellspacing', 'cellpadding',
									 'align', 'bgcolor' ) ),

		# 11.2.2
		'caption'    => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead'      => array_merge( $common, $tablealign ),
		'tfoot'      => array_merge( $common, $tablealign ),
		'tbody'      => array_merge( $common, $tablealign ),

		# 11.2.4
		'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
		'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

		# 11.2.5
		'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td'         => array_merge( $common, $tablecell, $tablealign ),
		'th'         => array_merge( $common, $tablecell, $tablealign ),

		# 15.2.1
		'tt'         => $common,
		'b'          => $common,
		'i'          => $common,
		'big'        => $common,
		'small'      => $common,
		'strike'     => $common,
		's'          => $common,
		'u'          => $common,

		# 15.2.2
		'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		'ruby'       => $common,
		# rbc
		# rtc
		'rb'         => $common,
		'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rp'         => $common,
		);
	return $whitelist;
}
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded suitably for literal
 * inclusion in an attribute value.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
	# Actual <tags>
	$withoutTags = preg_replace( '/<[^>]*>/', '', $text );

	# Normalize &entities and whitespace
	$normalized = Sanitizer::normalizeAttributeValue( $withoutTags );

	# Will be placed into "double-quoted" attributes,
	# make sure remaining bits are safe.
	$unsafe = array( '<', '>', '"' );
	$escaped = array( '&lt;', '&gt;', '&quot;' );
	return str_replace( $unsafe, $escaped, $normalized );
}