4 * (X)HTML sanitizer for MediaWiki
6 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
7 * http://www.mediawiki.org/
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 * http://www.gnu.org/copyleft/gpl.html
29 * Cleans up HTML, removes dangerous tags and attributes, and
30 * removes HTML comments
33 function removeHTMLtags( $text ) {
# Rebuilds $text piece by piece: HTML comments are stripped first, then the
# text is split on '<' and each piece is inspected as a possible tag. Tags
# whose lowercased name is in $htmlelements are re-emitted with their
# attributes filtered through Sanitizer::fixTagAttributes().
34 global $wgUseTidy, $wgUserHtml;
35 $fname = 'Parser::removeHTMLtags';
36 wfProfileIn( $fname );
39 $htmlpairs = array( # Tags that must be closed
40 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
41 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
42 'strike', 'strong', 'tt', 'var', 'div', 'center',
43 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
44 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
47 'br', 'hr', 'li', 'dt', 'dd'
49 $htmlnest = array( # Tags that can be nested--??
50 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
51 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
53 $tabletags = array( # Can only appear inside table
58 $htmlsingle = array();
# From here on $htmlsingle also contains the table-only tags, and
# $htmlelements is the complete list of tag names that may be emitted.
63 $htmlsingle = array_merge( $tabletags, $htmlsingle );
64 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
66 # Remove HTML comments
67 $text = Sanitizer
::removeHTMLcomments( $text );
# The first piece is the text before any '<'; every remaining piece starts
# immediately after a '<' and is examined as a candidate tag.
69 $bits = explode( '<', $text );
70 $text = array_shift( $bits );
72 $tagstack = array(); $tablestack = array();
73 foreach ( $bits as $x ) {
# Notices and warnings are masked while the piece is parsed; the previous
# reporting level is kept in $prev and restored right after the list().
74 $prev = error_reporting( E_ALL
& ~
( E_NOTICE | E_WARNING
) );
# Captures, in order: optional leading '/', tag name, attribute text,
# the closing '>' (optionally preceded by '/'), and trailing text.
75 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
77 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
78 error_reporting( $prev );
81 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
85 if ( ! in_array( $t, $htmlsingle ) &&
86 ( $ot = @array_pop
( $tagstack ) ) != $t ) {
87 @array_push
( $tagstack, $ot );
90 if ( $t == 'table' ) {
91 $tagstack = array_pop( $tablestack );
96 # Keep track for later
97 if ( in_array( $t, $tabletags ) &&
98 ! in_array( 'table', $tagstack ) ) {
100 } else if ( in_array( $t, $tagstack ) &&
101 ! in_array ( $t , $htmlnest ) ) {
103 } else if ( ! in_array( $t, $htmlsingle ) ) {
104 if ( $t == 'table' ) {
105 array_push( $tablestack, $tagstack );
108 array_push( $tagstack, $t );
110 # Strip non-approved attributes from the tag
111 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
# NOTE(review): as written, the search and replacement strings below are
# both '>', so this str_replace() changes nothing. Presumably the
# replacement was meant to be the escaped entity ('&gt;'); confirm against
# the upstream file. The same pattern recurs a few lines further down.
114 $rest = str_replace( '>', '>', $rest );
115 $text .= "<$slash$t$newparams$brace$rest";
119 $text .= '<' . str_replace( '>', '>', $x);
121 # Close off any remaining tags
# Each pop removes one still-open tag name; popping 'table' swaps in the
# tag stack that was saved on $tablestack when that table was opened.
122 while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
124 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
127 # this might be possible using tidy itself
# This second pass over $bits only checks the tag name against
# $htmlelements and filters attributes; it keeps no tag stack.
128 foreach ( $bits as $x ) {
129 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
131 @list
( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
132 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
133 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
# NOTE(review): search and replacement are identical here as well, so
# this call and the one two statements below are no-ops as written.
134 $rest = str_replace( '>', '>', $rest );
135 $text .= "<$slash$t$newparams$brace$rest";
137 $text .= '<' . str_replace( '>', '>', $x);
141 wfProfileOut( $fname );
146 * Remove '<!--', '-->', and everything between.
147 * To avoid leaving blank lines, when a comment is both preceded
148 * and followed by a newline (ignoring spaces), trim leading and
149 * trailing spaces and one of the newlines.
153 function removeHTMLcomments( $text ) {
# Removes '<!--' ... '-->' spans from $text. Each loop iteration searches
# again from the beginning of the (already shortened) string.
154 $fname='Parser::removeHTMLcomments';
155 wfProfileIn( $fname );
156 while (($start = strpos($text, '<!--')) !== false) {
# The terminator is searched for only after the 4-character opener.
157 $end = strpos($text, '-->', $start +
4);
158 if ($end === false) {
159 # Unterminated comment; bail out
165 # Trim space and newline if the comment is both
166 # preceded and followed by a newline
# $spaceStart begins one character before the comment (never below 0);
# $spaceLen is measured from there up to $end.
167 $spaceStart = max($start - 1, 0);
168 $spaceLen = $end - $spaceStart;
169 while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
173 while (substr($text, $spaceStart +
$spaceLen, 1) === ' ')
175 if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart +
$spaceLen, 1) === "\n") {
176 # Remove the comment, leading and trailing
177 # spaces, and leave only one newline.
178 $text = substr_replace($text, "\n", $spaceStart, $spaceLen +
1);
181 # Remove just the comment.
182 $text = substr_replace($text, '', $start, $end - $start);
185 wfProfileOut( $fname );
190 * Take a tag soup fragment listing an HTML element's attributes
191 * and normalize it to well-formed XML, discarding unwanted attributes.
193 * - Normalizes attribute names to lowercase
194 * - Discards attributes not on a whitelist for the given element
195 * - Turns broken or invalid entities into plaintext
196 * - Double-quotes all attribute values
197 * - Attributes without values are given their own name as the value
198 * - Double attributes are discarded
199 * - Unsafe style attributes are discarded
200 * - Prepends space if there are attributes.
202 * @param string $text
203 * @param string $element
206 * @todo Check for legal values where the DTD limits things.
207 * @todo Check for unique id attribute :P
209 function fixTagAttributes( $text, $element ) {
# Turns the raw attribute text of one tag into a normalized attribute
# string: names are lowercased, names not whitelisted for $element are
# skipped, values are normalized and double-quoted.
210 if( trim( $text ) == '' ) {
# Only ASCII letters and digits are accepted in attribute names here.
214 $attrib = '[A-Za-z0-9]'; #FIXME
215 $space = '[\x09\x0a\x0d\x20]';
217 "/(?:^|$space)($attrib+)
220 # The attribute value: quoted or alone
224 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
225 # colors are specified like this.
226 # We'll be normalizing it.
# Flip the whitelist so attribute names are keys and membership is a
# simple isset() check.
235 $whitelist = array_flip( Sanitizer
::attributeWhitelist( $element ) );
237 foreach( $pairs as $set ) {
238 $attribute = strtolower( $set[1] );
239 if( !isset( $whitelist[$attribute] ) ) {
# $set[3] .. $set[6] are alternative value captures; the first non-empty
# one supplies the raw value, which is always passed through
# Sanitizer::normalizeAttributeValue().
242 if( $set[2] == '' ) {
243 # In XHTML, attributes must have a value.
245 } elseif( $set[3] != '' ) {
247 $value = Sanitizer
::normalizeAttributeValue( $set[3] );
248 } elseif( $set[4] != '' ) {
# NOTE(review): search and replacement are both '"' below, so this
# str_replace() is a no-op as written. Presumably the replacement was
# meant to be the escaped entity ('&quot;'); confirm against upstream.
250 $value = str_replace( '"', '"',
251 Sanitizer
::normalizeAttributeValue( $set[4] ) );
252 } elseif( $set[5] != '' ) {
254 $value = Sanitizer
::normalizeAttributeValue( $set[5] );
255 } elseif( $set[6] != '' ) {
256 # Illegal #XXXXXX color with no quotes.
257 $value = Sanitizer
::normalizeAttributeValue( $set[6] );
259 wfDebugDieBacktrace( "Tag conditions not met. Something's very odd." );
262 # Strip javascript "expression" from stylesheets.
263 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
# The test is case-insensitive and runs on wfMungeToUtf8( $value ); it
# looks for 'expression', a 'tp://' / 'tps://' scheme tail, or 'url('.
264 if( $attribute == 'style' && preg_match(
265 '/(expression|tps*:\/\/|url\\s*\().*/is',
266 wfMungeToUtf8( $value ) ) ) {
# The first occurrence of an attribute wins; later duplicates are ignored.
271 if( !isset( $attribs[$attribute] ) ) {
272 $attribs[$attribute] = "$attribute=\"$value\"";
275 if( empty( $attribs ) ) {
# The leading space lets the result be appended directly after a tag name.
278 return ' ' . implode( ' ', $attribs );
283 * Normalize whitespace and character references in an XML source-
284 * encoded text for an attribute value.
286 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
287 * but note that we're not returning the value, but are returning
288 * XML source fragments that will be slapped into output.
290 * @param string $text
294 function normalizeAttributeValue( $text ) {
# Character references in $text are normalized first via
# Sanitizer::normalizeCharReferences(); the whitespace pattern (CRLF, or a
# single space / CR / LF / tab) is then applied to that result.
296 '/\r\n|[\x20\x0d\x0a\x09]/',
298 Sanitizer
::normalizeCharReferences( $text ) );
302 * Ensure that any entities and character references are legal
303 * for XML and XHTML specifically. Any stray bits will be
304 * &-escaped to result in a valid text fragment.
306 * a. any named char refs must be known in XHTML
307 * b. any numeric char refs must be legal chars, not invalid or forbidden
308 * c. use &#x, not &#X
309 * d. fix or reject non-valid attributes
311 * @param string $text
315 function normalizeCharReferences( $text ) {
# Every match is handed to Sanitizer::normalizeCharReferencesCallback(),
# whose return value replaces the matched text.
316 return preg_replace_callback(
322 array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
326 function normalizeCharReferencesCallback( $matches ) {
# Dispatches on whichever capture group is non-empty:
#   $matches[1]              -> named entity    (normalizeEntity)
#   $matches[2]              -> decimal number  (decCharReference)
#   $matches[3], $matches[4] -> hex number      (hexCharReference)
# A null result means the reference was rejected; the original matched
# text is then returned HTML-escaped.
328 if( $matches[1] != '' ) {
329 $ret = Sanitizer
::normalizeEntity( $matches[1] );
330 } elseif( $matches[2] != '' ) {
331 $ret = Sanitizer
::decCharReference( $matches[2] );
332 } elseif( $matches[3] != '' ) {
333 $ret = Sanitizer
::hexCharReference( $matches[3] );
334 } elseif( $matches[4] != '' ) {
335 $ret = Sanitizer
::hexCharReference( $matches[4] );
337 if( is_null( $ret ) ) {
338 return htmlspecialchars( $matches[0] );
345 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
346 * return the named entity reference as is. Otherwise, returns
347 * HTML-escaped text of pseudo-entity source (eg &foo;)
351 function normalizeEntity( $name ) {
# $name is the entity name without '&' and ';'. It is looked up as a key
# of the static $htmlEntities table, so the check is case-sensitive.
352 # List of all named character entities defined in HTML 4.01
353 # http://www.w3.org/TR/html4/sgml/entities.html
354 static $htmlEntities = array(
608 if( isset( $htmlEntities[$name] ) ) {
611 return "&$name;";
615 function decCharReference( $codepoint ) {
# $codepoint is the digit string of a decimal character reference. It is
# converted to an integer and, if it passes validateCodepoint(), re-emitted
# in canonical '&#NNN;' form.
616 $point = IntVal( $codepoint );
617 if( Sanitizer
::validateCodepoint( $point ) ) {
618 return sprintf( '&#%d;', $point );
624 function hexCharReference( $codepoint ) {
# $codepoint is the hex digit string of a character reference. It is
# converted with hexdec() and, if it passes validateCodepoint(), re-emitted
# as '&#x...;' with a lowercase 'x' and lowercase hex digits.
625 $point = hexdec( $codepoint );
626 if( Sanitizer
::validateCodepoint( $point ) ) {
627 return sprintf( '&#x%x;', $point );
634 * Returns true if a given Unicode codepoint is a valid character in XML.
635 * @param int $codepoint
638 function validateCodepoint( $codepoint ) {
# Accepted values: tab (0x09), line feed (0x0a), carriage return (0x0d),
# 0x20-0xd7ff, 0xe000-0xfffd, and 0x10000-0x10ffff. Everything else is
# rejected, which covers the other control characters below 0x20, the
# range 0xd800-0xdfff, and 0xfffe / 0xffff.
639 return ($codepoint == 0x09)
640 ||
($codepoint == 0x0a)
641 ||
($codepoint == 0x0d)
642 ||
($codepoint >= 0x20 && $codepoint <= 0xd7ff)
643 ||
($codepoint >= 0xe000 && $codepoint <= 0xfffd)
644 ||
($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
648 * Fetch the whitelist of acceptable attributes for a given
651 * @param string $element
654 function attributeWhitelist( $element ) {
# The complete table is obtained from setupAttributeWhitelist() on every
# call and then indexed by $element.
655 $list = Sanitizer
::setupAttributeWhitelist();
656 return isset( $list[$element] )
664 function setupAttributeWhitelist() {
# Shared building blocks for the per-element lists below:
#   $common     - attributes given to most elements
#   $block      - $common plus 'align'
#   $tablealign - alignment attributes used by table sections, rows, cells
#   $tablecell  - attributes specific to table cells (td / th)
665 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
666 $block = array_merge( $common, array( 'align' ) );
667 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
668 $tablecell = array( 'abbr',
674 'nowrap', # deprecated
675 'width', # deprecated
676 'height' # deprecated
679 # Numbers refer to sections in HTML 4.01 standard describing the element.
680 # See: http://www.w3.org/TR/html4/
684 'center' => $common, # deprecated
685 'span' => $block, # ??
714 'blockquote' => array_merge( $common, array( 'cite' ) ),
725 'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
728 'pre' => array_merge( $common, array( 'width' ) ),
731 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
732 'del' => array_merge( $common, array( 'cite', 'datetime' ) ),
735 'ul' => array_merge( $common, array( 'type' ) ),
736 'ol' => array_merge( $common, array( 'type', 'start' ) ),
737 'li' => array_merge( $common, array( 'type', 'value' ) ),
# NOTE(review): 'frame' and 'rules' each appear twice in the list below.
745 'table' => array_merge( $common,
746 array( 'summary', 'width', 'border', 'frame',
747 'rules', 'cellspacing', 'cellpadding',
748 'align', 'bgcolor', 'frame', 'rules',
752 'caption' => array_merge( $common, array( 'align' ) ),
755 'thead' => array_merge( $common, $tablealign ),
756 'tfoot' => array_merge( $common, $tablealign ),
757 'tbody' => array_merge( $common, $tablealign ),
760 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
761 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
764 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),
767 'td' => array_merge( $common, $tablecell, $tablealign ),
768 'th' => array_merge( $common, $tablecell, $tablealign ),
781 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
785 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
787 # XHTML Ruby annotation text module, simple ruby only.
788 # http://www.w3c.org/TR/ruby/
793 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
800 * Take a fragment of (potentially invalid) HTML and return
801 * a version with any tags removed, encoded suitably for literal
802 * inclusion in an attribute value.
804 * @param string $text HTML fragment
807 function stripAllTags( $text ) {
# Deletes every '<' ... '>' run (anything from a '<' up to the next '>').
809 $text = preg_replace( '/<[^>]*>/', '', $text );
811 # Normalize &entities and whitespace
812 $text = Sanitizer
::normalizeAttributeValue( $text );
# NOTE(review): the two arrays at the end of this function are identical as
# written, so replacing one with the other would change nothing. Presumably
# the second array was meant to hold the escaped forms ('&lt;', '&gt;',
# '&quot;'); confirm against the upstream file.
814 # Will be placed into "double-quoted" attributes,
815 # make sure remaining bits are safe.
817 array('<', '>', '"'),
818 array('<', '>', '"'),