4 * (X)HTML sanitizer for MediaWiki
6 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
7 * http://www.mediawiki.org/
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 * http://www.gnu.org/copyleft/gpl.html
29 * Cleans up HTML, removes dangerous tags and attributes, and
30 * removes HTML comments
35 function removeHTMLtags( $text ) {
# Visible flow: strip HTML comments, split the text on '<', then rebuild the
# output fragment by fragment. Fragments whose tag name is in $htmlelements
# have their attribute text filtered through Sanitizer::fixTagAttributes().
# NOTE(review): several lines of this function are not visible in this view
# (array terminators, some branch bodies, the preg_match() subject/match
# arguments, the return statement); comments cover the visible lines only.
36 global $wgUseTidy, $wgUserHtml;
37 $fname = 'Parser::removeHTMLtags';
38 wfProfileIn( $fname );
41 $htmlpairs = array( # Tags that must be closed
42 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
43 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
44 'strike', 'strong', 'tt', 'var', 'div', 'center',
45 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
46 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
49 'br', 'hr', 'li', 'dt', 'dd'
51 $htmlnest = array( # Tags that can be nested--??
52 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
53 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
55 $tabletags = array( # Can only appear inside table
60 $htmlsingle = array();
# Table-only tags are folded into $htmlsingle; $htmlelements is then the
# complete set of accepted tag names (single tags plus paired tags).
65 $htmlsingle = array_merge( $tabletags, $htmlsingle );
66 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
68 # Remove HTML comments
69 $text = Sanitizer
::removeHTMLcomments( $text );
# Each element of $bits is the text that followed one '<'; the text in
# front of the first '<' becomes the start of the output.
71 $bits = explode( '<', $text );
72 $text = array_shift( $bits );
74 $tagstack = array(); $tablestack = array();
75 foreach ( $bits as $x ) {
# Notices and warnings are masked for the match/list() below and the
# previous reporting level is restored right after.
76 $prev = error_reporting( E_ALL
& ~
( E_NOTICE | E_WARNING
) );
# Captures, in order: optional leading '/', tag name, attribute text, the
# closing '>' (optionally preceded by '/'), and the text up to the next '<'.
77 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
79 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
80 error_reporting( $prev );
# Tag names are lowercased before being checked against the accepted set.
83 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
# Pop the innermost open tag and compare it with $t; on a mismatch the
# popped entry is pushed back, leaving the stack as it was.
87 if ( ! in_array( $t, $htmlsingle ) &&
88 ( $ot = @array_pop
( $tagstack ) ) != $t ) {
89 @array_push
( $tagstack, $ot );
# For 'table', the tag stack is replaced by the one most recently saved
# on $tablestack.
92 if ( $t == 'table' ) {
93 $tagstack = array_pop( $tablestack );
98 # Keep track for later
99 if ( in_array( $t, $tabletags ) &&
100 ! in_array( 'table', $tagstack ) ) {
102 } else if ( in_array( $t, $tagstack ) &&
103 ! in_array ( $t , $htmlnest ) ) {
105 } else if ( ! in_array( $t, $htmlsingle ) ) {
# Opening a table saves the current tag stack on $tablestack before the
# tag itself is recorded as open.
106 if ( $t == 'table' ) {
107 array_push( $tablestack, $tagstack );
110 array_push( $tagstack, $t );
112 # Strip non-approved attributes from the tag
113 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
# NOTE(review): search and replacement are the same string here, so this
# str_replace() is a no-op as written; presumably the replacement was meant
# to be the escaped entity for '>' -- confirm against upstream.
116 $rest = str_replace( '>', '>', $rest );
117 $text .= "<$slash$t$newparams$brace$rest";
# NOTE(review): same identical search/replacement pair as above.
121 $text .= '<' . str_replace( '>', '>', $x);
123 # Close off any remaining tags
# is_array() guards against $tagstack having become NULL: array_pop() on an
# empty $tablestack (next line) returns NULL.
124 while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
126 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
129 # this might be possible using tidy itself
# Second pass over $bits with the same pattern but without any tag-stack
# bookkeeping. NOTE(review): presumably the $wgUseTidy path -- the enclosing
# condition is not visible here; confirm.
130 foreach ( $bits as $x ) {
131 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
133 @list
( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
134 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
135 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
# NOTE(review): identical search/replacement pair again (no-op as written).
136 $rest = str_replace( '>', '>', $rest );
137 $text .= "<$slash$t$newparams$brace$rest";
139 $text .= '<' . str_replace( '>', '>', $x);
143 wfProfileOut( $fname );
148 * Remove '<!--', '-->', and everything between.
149 * To avoid leaving blank lines, when a comment is both preceded
150 * and followed by a newline (ignoring spaces), trim leading and
151 * trailing spaces and one of the newlines.
154 * @param string $text
157 function removeHTMLcomments( $text ) {
# NOTE(review): the loop bodies, the bail-out statement and the return are
# not visible in this view; comments cover the visible lines only.
158 $fname='Parser::removeHTMLcomments';
159 wfProfileIn( $fname );
# Handle one comment per iteration, always searching from the start of the
# (already shortened) text. The closing marker is looked for 4 characters
# after the opening one, i.e. just past '<!--'.
160 while (($start = strpos($text, '<!--')) !== false) {
161 $end = strpos($text, '-->', $start +
4);
162 if ($end === false) {
163 # Unterminated comment; bail out
169 # Trim space and newline if the comment is both
170 # preceded and followed by a newline
# $spaceStart begins at the character just before the comment (clamped to
# offset 0); $spaceLen is the distance from there to $end.
171 $spaceStart = max($start - 1, 0);
172 $spaceLen = $end - $spaceStart;
# Runs while the character at $spaceStart is a space and offset 0 has not
# been reached.
173 while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
# Runs while the character right after the measured span is a space.
177 while (substr($text, $spaceStart +
$spaceLen, 1) === ' ')
# Both the character at $spaceStart and the one right after the span must
# be newlines for the collapsing replacement below to apply.
179 if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart +
$spaceLen, 1) === "\n") {
180 # Remove the comment, leading and trailing
181 # spaces, and leave only one newline.
# $spaceLen + 1 characters (span plus the trailing newline) are replaced by
# a single "\n".
182 $text = substr_replace($text, "\n", $spaceStart, $spaceLen +
1);
185 # Remove just the comment.
# NOTE(review): this removes $end - $start characters; for the closing
# '-->' to be removed too, $end must have been advanced past it on a line
# not visible here -- confirm.
186 $text = substr_replace($text, '', $start, $end - $start);
189 wfProfileOut( $fname );
194 * Take a tag soup fragment listing an HTML element's attributes
195 * and normalize it to well-formed XML, discarding unwanted attributes.
197 * - Normalizes attribute names to lowercase
198 * - Discards attributes not on a whitelist for the given element
199 * - Turns broken or invalid entities into plaintext
200 * - Double-quotes all attribute values
201 * - Attributes without values are given the attribute name as their value
202 * - Double attributes are discarded
203 * - Unsafe style attributes are discarded
204 * - Prepends space if there are attributes.
206 * @param string $text
207 * @param string $element
210 * @todo Check for legal values where the DTD limits things.
211 * @todo Check for unique id attribute :P
213 function fixTagAttributes( $text, $element ) {
# NOTE(review): parts of this function are not visible in this view (the
# early-return values, the call wrapped around the pattern below, and the
# bodies of a few branches); comments cover the visible lines only.
# Attribute text that is empty or whitespace-only is special-cased first.
214 if( trim( $text ) == '' ) {
# Attribute names are restricted to ASCII letters and digits; the accepted
# separators are tab, LF, CR and space.
218 $attrib = '[A-Za-z0-9]'; #FIXME
219 $space = '[\x09\x0a\x0d\x20]';
221 "/(?:^|$space)($attrib+)
224 # The attribute value: quoted or alone
228 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
229 # colors are specified like this.
230 # We'll be normalizing it.
# Flip the whitelist so attribute names become keys and membership can be
# tested with isset().
239 $whitelist = array_flip( Sanitizer
::attributeWhitelist( $element ) );
# $set[1] is the attribute name; $set[2] is tested for "no value given" and
# $set[3]..$set[6] are the alternative value captures.
241 foreach( $pairs as $set ) {
242 $attribute = strtolower( $set[1] );
243 if( !isset( $whitelist[$attribute] ) ) {
246 if( $set[2] == '' ) {
247 # In XHTML, attributes must have a value.
# Whichever value capture is non-empty is normalized through
# Sanitizer::normalizeAttributeValue().
249 } elseif( $set[3] != '' ) {
251 $value = Sanitizer
::normalizeAttributeValue( $set[3] );
252 } elseif( $set[4] != '' ) {
# NOTE(review): search and replacement are the same string here, so this
# str_replace() is a no-op as written; presumably the replacement was meant
# to be the escaped entity for a double quote -- confirm against upstream.
254 $value = str_replace( '"', '"',
255 Sanitizer
::normalizeAttributeValue( $set[4] ) );
256 } elseif( $set[5] != '' ) {
258 $value = Sanitizer
::normalizeAttributeValue( $set[5] );
259 } elseif( $set[6] != '' ) {
260 # Illegal #XXXXXX color with no quotes.
261 $value = Sanitizer
::normalizeAttributeValue( $set[6] );
# Reached only when none of the captures above matched.
263 wfDebugDieBacktrace( "Tag conditions not met. Something's very odd." );
266 # Strip javascript "expression" from stylesheets.
267 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
# The pattern is case-insensitive and matches 'expression', 'url' followed
# by optional whitespace and '(', or 'tp' + any number of 's' + '://'
# (the tail of http://, https:// and ftp://). It is applied to the value
# after it has been passed through wfMungeToUtf8().
268 if( $attribute == 'style' && preg_match(
269 '/(expression|tps*:\/\/|url\\s*\().*/is',
270 wfMungeToUtf8( $value ) ) ) {
# Only the first occurrence of an attribute name is stored.
275 if( !isset( $attribs[$attribute] ) ) {
276 $attribs[$attribute] = "$attribute=\"$value\"";
279 if( empty( $attribs ) ) {
# Non-empty result: a leading space followed by the space-separated
# name="value" pairs.
282 return ' ' . implode( ' ', $attribs );
287 * Normalize whitespace and character references in an XML source-
288 * encoded text for an attribute value.
290 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
291 * but note that we're not returning the value, but are returning
292 * XML source fragments that will be slapped into output.
294 * @param string $text
298 function normalizeAttributeValue( $text ) {
# NOTE(review): the call these arguments belong to and its replacement
# argument are not visible in this view. The visible pattern matches either
# a CR LF pair or a single space, CR, LF or tab, and the subject is $text
# after Sanitizer::normalizeCharReferences().
300 '/\r\n|[\x20\x0d\x0a\x09]/',
302 Sanitizer
::normalizeCharReferences( $text ) );
306 * Ensure that any entities and character references are legal
307 * for XML and XHTML specifically. Any stray bits will be
308 * &-escaped to result in a valid text fragment.
310 * a. any named char refs must be known in XHTML
311 * b. any numeric char refs must be legal chars, not invalid or forbidden
312 * c. use &#x, not &#X
313 * d. fix or reject non-valid attributes
315 * @param string $text
319 function normalizeCharReferences( $text ) {
# Returns the result of a preg_replace_callback() whose callback is the
# static method Sanitizer::normalizeCharReferencesCallback().
# NOTE(review): the pattern and subject arguments are not visible here.
320 return preg_replace_callback(
326 array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
330 * @param array $matches Match groups passed in by preg_replace_callback()
333 function normalizeCharReferencesCallback( $matches ) {
# Dispatches on whichever capture group is non-empty:
#   group 1      -> Sanitizer::normalizeEntity()
#   group 2      -> Sanitizer::decCharReference()
#   groups 3, 4  -> Sanitizer::hexCharReference()
# NOTE(review): the initial value of $ret and the return for a non-null
# $ret are not visible in this view.
335 if( $matches[1] != '' ) {
336 $ret = Sanitizer
::normalizeEntity( $matches[1] );
337 } elseif( $matches[2] != '' ) {
338 $ret = Sanitizer
::decCharReference( $matches[2] );
339 } elseif( $matches[3] != '' ) {
340 $ret = Sanitizer
::hexCharReference( $matches[3] );
341 } elseif( $matches[4] != '' ) {
342 $ret = Sanitizer
::hexCharReference( $matches[4] );
# A null result means the reference was rejected: the whole matched text is
# returned HTML-escaped instead.
344 if( is_null( $ret ) ) {
345 return htmlspecialchars( $matches[0] );
352 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
353 * return the named entity reference as is. Otherwise, returns
354 * HTML-escaped text of pseudo-entity source (e.g. &amp;foo;)
356 * @param string $name
359 function normalizeEntity( $name ) {
360 # List of all named character entities defined in HTML 4.01
361 # http://www.w3.org/TR/html4/sgml/entities.html
# The table is a function-level static, so it is built once and reused
# across calls; entity names are its keys (see the isset() lookup below).
362 static $htmlEntities = array(
616 if( isset( $htmlEntities[$name] ) ) {
# NOTE(review): only one return statement is visible in this view, and it
# emits the name as an unescaped entity reference. The doc comment promises
# an escaped form for unknown names -- confirm both branches exist upstream.
619 return "&$name;";
623 function decCharReference( $codepoint ) {
# Converts the decimal digit string to an integer and, when
# Sanitizer::validateCodepoint() accepts it, returns it re-serialized as a
# decimal character reference.
# NOTE(review): the result for a rejected code point is not visible here;
# presumably null, since the callback tests the result with is_null().
624 $point = IntVal( $codepoint );
625 if( Sanitizer
::validateCodepoint( $point ) ) {
626 return sprintf( '&#%d;', $point );
632 function hexCharReference( $codepoint ) {
# Parses the hexadecimal digit string with hexdec() and, when
# Sanitizer::validateCodepoint() accepts it, returns it re-serialized with a
# lowercase '&#x' prefix and lowercase hex digits (sprintf '%x').
# NOTE(review): the result for a rejected code point is not visible here;
# presumably null, since the callback tests the result with is_null().
633 $point = hexdec( $codepoint );
634 if( Sanitizer
::validateCodepoint( $point ) ) {
635 return sprintf( '&#x%x;', $point );
642 * Returns true if a given Unicode codepoint is a valid character in XML.
643 * @param int $codepoint
function validateCodepoint( $codepoint ) {
	# Tab, line feed and carriage return are the only code points below
	# 0x20 that are accepted.
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}

	# Remaining accepted code points, as inclusive [low, high] ranges. The
	# gaps between them exclude the surrogate block 0xd800-0xdfff and the
	# two code points 0xfffe and 0xffff.
	$allowedRanges = array(
		array( 0x20, 0xd7ff ),
		array( 0xe000, 0xfffd ),
		array( 0x10000, 0x10ffff )
	);

	foreach ( $allowedRanges as $bounds ) {
		if ( $codepoint >= $bounds[0] && $codepoint <= $bounds[1] ) {
			return true;
		}
	}

	return false;
}
656 * Fetch the whitelist of acceptable attributes for a given element name.
659 * @param string $element
662 function attributeWhitelist( $element ) {
# Obtains the full whitelist map from Sanitizer::setupAttributeWhitelist()
# on every call, then checks whether it has an entry keyed by $element.
# NOTE(review): both outcomes of the isset() test are not visible here.
663 $list = Sanitizer
::setupAttributeWhitelist();
664 return isset( $list[$element] )
672 function setupAttributeWhitelist() {
# Building blocks for the element => allowed-attributes map below:
#   $common     attributes shared by nearly every element
#   $block      $common plus 'align'
#   $tablealign alignment attributes used by the table-related elements
#   $tablecell  attributes specific to table cells (td/th)
# NOTE(review): many entries of this map, and its return, are not visible
# in this view.
673 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
674 $block = array_merge( $common, array( 'align' ) );
675 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
676 $tablecell = array( 'abbr',
682 'nowrap', # deprecated
683 'width', # deprecated
684 'height' # deprecated
687 # Numbers refer to sections in HTML 4.01 standard describing the element.
688 # See: http://www.w3.org/TR/html4/
692 'center' => $common, # deprecated
693 'span' => $block, # ??
722 'blockquote' => array_merge( $common, array( 'cite' ) ),
# 'br' lists its attributes explicitly: $common without 'lang' and 'dir',
# plus 'clear'.
733 'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
736 'pre' => array_merge( $common, array( 'width' ) ),
739 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
740 'del' => array_merge( $common, array( 'cite', 'datetime' ) ),
743 'ul' => array_merge( $common, array( 'type' ) ),
744 'ol' => array_merge( $common, array( 'type', 'start' ) ),
745 'li' => array_merge( $common, array( 'type', 'value' ) ),
# NOTE(review): 'frame' and 'rules' appear twice in the list below; the
# duplicates look redundant.
753 'table' => array_merge( $common,
754 array( 'summary', 'width', 'border', 'frame',
755 'rules', 'cellspacing', 'cellpadding',
756 'align', 'bgcolor', 'frame', 'rules',
760 'caption' => array_merge( $common, array( 'align' ) ),
763 'thead' => array_merge( $common, $tablealign ),
764 'tfoot' => array_merge( $common, $tablealign ),
765 'tbody' => array_merge( $common, $tablealign ),
768 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
769 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
772 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),
775 'td' => array_merge( $common, $tablecell, $tablealign ),
776 'th' => array_merge( $common, $tablecell, $tablealign ),
789 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
793 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
795 # XHTML Ruby annotation text module, simple ruby only.
796 # http://www.w3c.org/TR/ruby/
801 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
808 * Take a fragment of (potentially invalid) HTML and return
809 * a version with any tags removed, encoded suitably for literal
810 * inclusion in an attribute value.
812 * @param string $text HTML fragment
815 function stripAllTags( $text ) {
817 $text = preg_replace( '/<[^>]*>/', '', $text );
819 # Normalize &entities and whitespace
820 $text = Sanitizer
::normalizeAttributeValue( $text );
822 # Will be placed into "double-quoted" attributes,
823 # make sure remaining bits are safe.
825 array('<', '>', '"'),
826 array('<', '>', '"'),