// Registers the Core.Encoding directive (type 'istring', default 'utf-8').
// The converters in HTMLPurifier_Encoder read it to decide whether input
// and output have to be transcoded to and from UTF-8.
HTMLPurifier_ConfigSchema::define(
    'Core', 'Encoding', 'utf-8', 'istring',
    'If for some reason you are unable to convert all webpages to UTF-8, '.
    'you can use this directive as a stop-gap compatibility change to '.
    'let HTML Purifier deal with non UTF-8 input. This technique has '.
    'notable deficiencies: absolutely no characters outside of the selected '.
    'character encoding will be preserved, not even the ones that have '.
    'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
    'that automatically resolves all entities), making it pretty useless '.
    'for anything except the most I18N-blind applications, although '.
    '%Core.EscapeNonASCIICharacters fixes this trouble with '.
    'another tradeoff. This directive '.
    'only accepts ISO-8859-1 if iconv is not enabled.'
);
// Registers the Core.EscapeNonASCIICharacters directive (bool, default
// false). convertFromUTF8() checks it before converting out of UTF-8.
HTMLPurifier_ConfigSchema::define(
    'Core', 'EscapeNonASCIICharacters', false, 'bool',
    'This directive overcomes a deficiency in %Core.Encoding by blindly '.
    'converting all non-ASCII characters into decimal numeric entities before '.
    'converting it to its native encoding. This means that even '.
    'characters that can be expressed in the non-UTF-8 encoding will '.
    'be entity-ized, which can be a real downer for encodings like Big5. '.
    'It also assumes that the ASCII repertoire is available, although '.
    'this is the case for almost all encodings. Anyway, use UTF-8! This '.
    'directive has been available since 1.4.0.'
);
// Without the iconv extension only encodings with native PHP support can be
// converted, so Core.Encoding is restricted to those and the common
// hyphen-less spelling of ISO-8859-1 is accepted as an alias.
// NOTE(review): the allowed-value list is not visible in this excerpt;
// 'utf-8' (the directive default) and 'iso-8859-1' (the only non-UTF-8
// encoding the converters handle without iconv) are assumed - confirm.
if (!function_exists('iconv')) {
    HTMLPurifier_ConfigSchema::defineAllowedValues(
        'Core', 'Encoding', array(
            'utf-8',
            'iso-8859-1'
        )
    );
    HTMLPurifier_ConfigSchema::defineValueAliases(
        'Core', 'Encoding', array(
            'iso8859-1' => 'iso-8859-1'
        )
    );
}
// Registers the Test.ForceNoIconv directive (bool, default false). The
// converters in HTMLPurifier_Encoder consult it before calling iconv.
HTMLPurifier_ConfigSchema::define(
    'Test',
    'ForceNoIconv',
    false,
    'bool',
    'When set to true, HTMLPurifier_Encoder will act as if iconv ' .
    'does not exist and use only pure PHP implementations.'
);
52 * A UTF-8 specific character encoder that handles cleaning and transforming.
53 * @note All functions in this class should be static.
55 class HTMLPurifier_Encoder
/**
 * Constructor throws fatal error if you attempt to instantiate class.
 *
 * @note PHP 8 no longer treats a method named after its class as the
 *       constructor, so the guard has to live in __construct() as well.
 *       It is declared before the legacy constructor so that PHP 5 does
 *       not complain about redefining an already defined constructor.
 */
function __construct() {
    trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
}

/**
 * PHP 4 style constructor, kept so that PHP 4 still hits the guard and so
 * that existing explicit calls to this method keep working.
 */
function HTMLPurifier_Encoder() {
    trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
}
/**
 * Cleans a UTF-8 string for well-formedness and SGML validity
 *
 * It will parse according to UTF-8 and return a valid UTF8 string, with
 * non-SGML codepoints excluded.
 *
 * @param $str       String to clean, expected to be (mostly) UTF-8
 * @param $force_php When true, skip iconv and always run the PHP validator
 * @return Cleaned string
 *
 * @note Just for reference, the non-SGML code points are 0 to 31 and
 *       127 to 159, inclusive.  However, we allow code points 9, 10
 *       and 13, which are the tab, line feed and carriage return
 *       respectively. 128 and above the code points map to multibyte
 *       UTF-8 representations.
 *
 * @note iconv() reports failure by returning false. When that happens the
 *       string is handed to the PHP implementation below instead of being
 *       passed on to strtr(), which would turn the whole input into an
 *       empty string.
 *
 * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen
 *       (hsivonen@iki.fi) at <http://iki.fi/hsivonen/php-utf8/> under the
 *       LGPL license.  Notes on what changed are inside, but in general,
 *       the original code transformed UTF-8 text into an array of integer
 *       Unicode codepoints. Understandably, transforming that back to
 *       a string would be somewhat expensive, so the function was modded to
 *       directly operate on the string.  However, this discourages code
 *       reuse, and the logic enumerated here would be useful for any
 *       function that needs to be able to understand UTF-8 characters.
 */
function cleanUTF8($str, $force_php = false) {

    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');

    if ($iconv && !$force_php) {
        // Table of characters to delete, built once on first use. Only
        // this branch needs it, so the PHP path never pays for it.
        static $non_sgml_chars = array();
        if (empty($non_sgml_chars)) {
            for ($i = 0; $i <= 31; $i++) {
                // non-SGML ASCII chars
                // save \r, \t and \n
                if ($i == 9 || $i == 13 || $i == 10) continue;
                $non_sgml_chars[chr($i)] = '';
            }
            for ($i = 127; $i <= 159; $i++) {
                $non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
            }
        }

        // do the shortcut way
        $cleaned = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
        if ($cleaned !== false) {
            return strtr($cleaned, $non_sgml_chars);
        }
        // iconv gave up on this input: fall through to the PHP validator
    }

    $mState = 0; // cached expected number of octets after the current octet
                 // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0; // cached Unicode character
    $mBytes = 1; // cached expected number of octets in the current sequence

    // original code involved an $out that was an array of Unicode
    // codepoints.  Instead of having to convert back into UTF-8, we've
    // decided to directly append valid UTF-8 characters onto a string
    // $out once they're done.  $char accumulates raw bytes, while $mUcs4
    // turns into the Unicode code point, so there's some redundancy.

    $out  = '';
    $char = '';

    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $in = ord($str[$i]);
        $char .= $str[$i]; // append byte to char
        if (0 == $mState) {
            // When mState is zero we expect either a US-ASCII character
            // or a multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                if (($in <= 31 || $in == 127) &&
                    !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
                ) {
                    // control characters, remove
                } else {
                    $out .= $char;
                }
                // reset
                $char = '';
                $mBytes = 1;
            } elseif (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } elseif (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } elseif (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } elseif (0xF8 == (0xFC & ($in))) {
                // First octet of 5 octet sequence.
                //
                // This is illegal because the encoded codepoint must be
                // either:
                // (a) not the shortest form or
                // (b) outside the Unicode range of 0-0x10FFFF.
                // Rather than trying to resynchronize, we will carry on
                // until the end of the sequence and let the later error
                // handling code catch it.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;
            } elseif (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5
                // octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;
            } else {
                // Current octet is neither in the US-ASCII range nor a
                // legal first octet of a multi-octet sequence.
                $mState = 0;
                $mUcs4  = 0;
                $mBytes = 1;
                $char = '';
            }
        } else {
            // When mState is non-zero, we expect a continuation of the
            // multi-octet sequence
            if (0x80 == (0xC0 & ($in))) {
                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                if (0 == --$mState) {
                    // End of the multi-octet sequence. mUcs4 now contains
                    // the final Unicode codepoint to be output

                    // Check for illegal sequences and codepoints.

                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters = illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)
                    ) {
                        // illegal sequence: drop the accumulated bytes
                    } elseif (0xFEFF != $mUcs4 && // omit BOM
                        !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
                    ) {
                        $out .= $char;
                    }
                    // initialize UTF8 cache (reset)
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                    $char = '';
                }
            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence.
                // used to result in complete fail, but we'll reset
                $mState = 0;
                $mUcs4  = 0;
                $mBytes = 1;
                $char = '';
            }
        }
    }
    return $out;
}
/**
 * Translates a Unicode codepoint into its corresponding UTF-8 character.
 *
 * @param $code Integer codepoint
 * @return UTF-8 encoded character, or an empty string when the codepoint
 *         is negative, above U+10FFFF or inside the surrogate range
 *
 * @note Based on Feyd's function at
 *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
 *       which is in public domain.
 * @note While we're going to do code point parsing anyway, a good
 *       optimization would be to refuse to translate code points that
 *       are non-SGML characters.  However, this could lead to duplication.
 * @note This is very similar to the unichr function in
 *       maintenance/generate-entity-file.php (although this is superior,
 *       due to its sanity checks).
 */

// +----------+----------+----------+----------+
// | 33222222 | 22221111 | 111111   |          |
// | 10987654 | 32109876 | 54321098 | 76543210 | bit
// +----------+----------+----------+----------+
// |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
// |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
// |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
// | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
// +----------+----------+----------+----------+
// | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
// | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
// +----------+----------+----------+----------+

function unichr($code) {
    // anything outside 0..0x10FFFF, or inside the surrogate block
    // 0xD800..0xDFFF, has no UTF-8 representation
    $is_surrogate = ($code >= 0xD800 && $code <= 0xDFFF);
    if ($code < 0 || $code > 0x10FFFF || $is_surrogate) {
        return '';
    }

    // regular ASCII character: a single byte, value unchanged
    if ($code < 0x80) {
        return chr($code);
    }

    // every multi-byte form ends with the low six bits in a 10xxxxxx byte
    $last = chr(0x80 | ($code & 0x3F));

    if ($code < 0x800) {
        // 110yyyyy 10xxxxxx
        return chr(0xC0 | (($code & 0x7FF) >> 6)) . $last;
    }

    // three and four byte forms share the 10yyyyyy continuation byte
    $middle = chr(0x80 | (($code & 0xFC0) >> 6));

    if ($code < 0x10000) {
        // 1110zzzz 10yyyyyy 10xxxxxx
        return chr(0xE0 | (($code >> 12) & 0x0F)) . $middle . $last;
    }

    // 11110www 10wwzzzz 10yyyyyy 10xxxxxx
    return chr(0xF0 | (($code >> 18) & 0x07))
         . chr(0x80 | (($code >> 12) & 0x3F))
         . $middle
         . $last;
}
/**
 * Converts a string to UTF-8 based on configuration.
 *
 * @param $str     String in the encoding named by %Core.Encoding
 * @param $config  Configuration object; read through get($namespace, $key)
 * @param $context Context object, not used by this method
 * @return The string converted to UTF-8. When no converter can handle the
 *         configured encoding an E_USER_ERROR is raised instead.
 *
 * @note iconv() returns false when it cannot perform the conversion, for
 *       instance when it does not know the encoding name. That result is
 *       never handed back to the caller: the method falls through to the
 *       native ISO-8859-1 conversion or to the 'Encoding not supported'
 *       error.
 */
function convertToUTF8($str, $config, &$context) {
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');
    $encoding = $config->get('Core', 'Encoding');
    if ($encoding === 'utf-8') return $str;
    if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
        $converted = @iconv($encoding, 'utf-8//IGNORE', $str);
        if ($converted !== false) return $converted;
        // iconv failed: try the native conversion below
    }
    if ($encoding === 'iso-8859-1') {
        return @utf8_encode($str);
    }
    trigger_error('Encoding not supported', E_USER_ERROR);
}
/**
 * Converts a string from UTF-8 based on configuration.
 *
 * @param $str     UTF-8 string
 * @param $config  Configuration object; read through get($namespace, $key)
 * @param $context Context object, not used by this method
 * @return The string in the encoding named by %Core.Encoding. When no
 *         converter can handle that encoding an E_USER_ERROR is raised
 *         instead.
 *
 * @note Currently, this is a lossy conversion, with unexpressable
 *       characters being omitted.
 * @note iconv() returns false when it cannot perform the conversion, for
 *       instance when it does not know the encoding name. That result is
 *       never handed back to the caller: the method falls through to the
 *       native ISO-8859-1 conversion or to the 'Encoding not supported'
 *       error.
 */
function convertFromUTF8($str, $config, &$context) {
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');
    $encoding = $config->get('Core', 'Encoding');
    if ($encoding === 'utf-8') return $str;
    if ($config->get('Core', 'EscapeNonASCIICharacters')) {
        // entity-ize everything outside ASCII first so that the
        // conversion below has nothing left to lose
        $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
    }
    if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
        $converted = @iconv('utf-8', $encoding . '//IGNORE', $str);
        if ($converted !== false) return $converted;
        // iconv failed: try the native conversion below
    }
    if ($encoding === 'iso-8859-1') {
        return @utf8_decode($str);
    }
    trigger_error('Encoding not supported', E_USER_ERROR);
}
351 * Lossless (character-wise) conversion of HTML to ASCII
353 * @param $str UTF-8 string to be converted to ASCII
354 * @returns ASCII encoded string with non-ASCII character entity-ized
355 * @warning Adapted from MediaWiki, claiming fair use: this is a common
356 * algorithm. If you disagree with this license fudgery,
357 * implement it yourself.
358 * @note Uses decimal numeric entities since they are best supported.
359 * @note This is a DUMB function: it has no concept of keeping
360 * character entities that the projected character encoding
361 * can allow. We could possibly implement a smart version
362 * but that would require it to also know which Unicode
363 * codepoints the charset supported (not an easy task).
 * @note Overlaps somewhat with cleanUTF8(), but it assumes that $str is
367 function convertToASCIIDumbLossless($str) {
372 for( $i = 0; $i < $len; $i++
) {
373 $bytevalue = ord( $str[$i] );
374 if( $bytevalue <= 0x7F ) { //0xxx xxxx
375 $result .= chr( $bytevalue );
377 } elseif( $bytevalue <= 0xBF ) { //10xx xxxx
378 $working = $working << 6;
379 $working +
= ($bytevalue & 0x3F);
381 if( $bytesleft <= 0 ) {
382 $result .= "&#" . $working . ";";
384 } elseif( $bytevalue <= 0xDF ) { //110x xxxx
385 $working = $bytevalue & 0x1F;
387 } elseif( $bytevalue <= 0xEF ) { //1110 xxxx
388 $working = $bytevalue & 0x0F;
391 $working = $bytevalue & 0x07;