lib/htmlpurifier/HTMLPurifier/Encoder.php

   1 <?php
   2
   3 HTMLPurifier_ConfigSchema::define(
   4     'Core', 'Encoding', 'utf-8', 'istring',
   5     'If for some reason you are unable to convert all webpages to UTF-8, '.
   6     'you can use this directive as a stop-gap compatibility change to '.
   7     'let HTML Purifier deal with non UTF-8 input.  This technique has '.
   8     'notable deficiencies: absolutely no characters outside of the selected '.
   9     'character encoding will be preserved, not even the ones that have '.
  10     'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
  11     'that automatically resolves all entities), making it pretty useless '.
  12     'for anything except the most I18N-blind applications, although '.
  13     '%Core.EscapeNonASCIICharacters offers fixes this trouble with '.
  14     'another tradeoff. This directive '.
  15     'only accepts ISO-8859-1 if iconv is not enabled.'
  16 );
  17
  18 HTMLPurifier_ConfigSchema::define(
  19     'Core', 'EscapeNonASCIICharacters', false, 'bool',
  20     'This directive overcomes a deficiency in %Core.Encoding by blindly '.
  21     'converting all non-ASCII characters into decimal numeric entities before '.
  22     'converting it to its native encoding. This means that even '.
  23     'characters that can be expressed in the non-UTF-8 encoding will '.
  24     'be entity-ized, which can be a real downer for encodings like Big5. '.
  25     'It also assumes that the ASCII repetoire is available, although '.
  26     'this is the case for almost all encodings. Anyway, use UTF-8! This '.
  27     'directive has been available since 1.4.0.'
  28 );
  29
  30 if ( !function_exists('iconv') ) {
  31     // only encodings with native PHP support
  32     HTMLPurifier_ConfigSchema::defineAllowedValues(
  33         'Core', 'Encoding', array(
  34             'utf-8',
  35             'iso-8859-1'
  36         )
  37     );
  38     HTMLPurifier_ConfigSchema::defineValueAliases(
  39         'Core', 'Encoding', array(
  40             'iso8859-1' => 'iso-8859-1'
  41         )
  42     );
  43 }
  44
  45 HTMLPurifier_ConfigSchema::define(
  46     'Test', 'ForceNoIconv', false, 'bool',
  47     'When set to true, HTMLPurifier_Encoder will act as if iconv does not '.
  48     'exist and use only pure PHP implementations.'
  49 );
  50
  51 /**
  52  * A UTF-8 specific character encoder that handles cleaning and transforming.
  53  * @note All functions in this class should be static.
  54  */
  55 class HTMLPurifier_Encoder
  56 {
  57
  58     /**
  59      * Constructor throws fatal error if you attempt to instantiate class
  60      */
  61     function HTMLPurifier_Encoder() {
  62         trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
  63     }
  64
  65     /**
  66      * Error-handler that mutes errors, alternative to shut-up operator.
  67      */
  68     function muteErrorHandler() {}
  69
  70     /**
  71      * Cleans a UTF-8 string for well-formedness and SGML validity
  72      *
  73      * It will parse according to UTF-8 and return a valid UTF8 string, with
  74      * non-SGML codepoints excluded.
  75      *
  76      * @static
  77      * @note Just for reference, the non-SGML code points are 0 to 31 and
  78      *       127 to 159, inclusive.  However, we allow code points 9, 10
  79      *       and 13, which are the tab, line feed and carriage return
  80      *       respectively. 128 and above the code points map to multibyte
  81      *       UTF-8 representations.
  82      *
  83      * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
  84      *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
  85      *       LGPL license.  Notes on what changed are inside, but in general,
  86      *       the original code transformed UTF-8 text into an array of integer
  87      *       Unicode codepoints. Understandably, transforming that back to
  88      *       a string would be somewhat expensive, so the function was modded to
  89      *       directly operate on the string.  However, this discourages code
  90      *       reuse, and the logic enumerated here would be useful for any
  91      *       function that needs to be able to understand UTF-8 characters.
  92      *       As of right now, only smart lossless character encoding converters
  93      *       would need that, and I'm probably not going to implement them.
  94      *       Once again, PHP 6 should solve all our problems.
  95      */
  96     function cleanUTF8($str, $force_php = false) {
  97
  98         static $non_sgml_chars = array();
  99         if (empty($non_sgml_chars)) {
 100             for ($i = 0; $i <= 31; $i++) {
 101                 // non-SGML ASCII chars
 102                 // save \r, \t and \n
 103                 if ($i == 9 || $i == 13 || $i == 10) continue;
 104                 $non_sgml_chars[chr($i)] = '';
 105             }
 106             for ($i = 127; $i <= 159; $i++) {
 107                 $non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
 108             }
 109         }
 110
 111         static $iconv = null;
 112         if ($iconv === null) $iconv = function_exists('iconv');
 113
 114         // UTF-8 validity is checked since PHP 4.3.5
 115         // This is an optimization: if the string is already valid UTF-8, no
 116         // need to do iconv/php stuff. 99% of the time, this will be the case.
 117         if (preg_match('/^.{1}/us', $str)) {
 118             return strtr($str, $non_sgml_chars);
 119         }
 120
 121         if ($iconv && !$force_php) {
 122             // do the shortcut way
 123             set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
 124             $str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
 125             restore_error_handler();
 126             return strtr($str, $non_sgml_chars);
 127         }
 128
 129         $mState = 0; // cached expected number of octets after the current octet
 130                      // until the beginning of the next UTF8 character sequence
 131         $mUcs4  = 0; // cached Unicode character
 132         $mBytes = 1; // cached expected number of octets in the current sequence
 133
 134         // original code involved an $out that was an array of Unicode
 135         // codepoints.  Instead of having to convert back into UTF-8, we've
 136         // decided to directly append valid UTF-8 characters onto a string
 137         // $out once they're done.  $char accumulates raw bytes, while $mUcs4
 138         // turns into the Unicode code point, so there's some redundancy.
 139
 140         $out = '';
 141         $char = '';
 142
 143         $len = strlen($str);
 144         for($i = 0; $i < $len; $i++) {
 145             $in = ord($str{$i});
 146             $char .= $str[$i]; // append byte to char
 147             if (0 == $mState) {
 148                 // When mState is zero we expect either a US-ASCII character
 149                 // or a multi-octet sequence.
 150                 if (0 == (0x80 & ($in))) {
 151                     // US-ASCII, pass straight through.
 152                     if (($in <= 31 || $in == 127) &&
 153                         !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
 154                     ) {
 155                         // control characters, remove
 156                     } else {
 157                         $out .= $char;
 158                     }
 159                     // reset
 160                     $char = '';
 161                     $mBytes = 1;
 162                 } elseif (0xC0 == (0xE0 & ($in))) {
 163                     // First octet of 2 octet sequence
 164                     $mUcs4 = ($in);
 165                     $mUcs4 = ($mUcs4 & 0x1F) << 6;
 166                     $mState = 1;
 167                     $mBytes = 2;
 168                 } elseif (0xE0 == (0xF0 & ($in))) {
 169                     // First octet of 3 octet sequence
 170                     $mUcs4 = ($in);
 171                     $mUcs4 = ($mUcs4 & 0x0F) << 12;
 172                     $mState = 2;
 173                     $mBytes = 3;
 174                 } elseif (0xF0 == (0xF8 & ($in))) {
 175                     // First octet of 4 octet sequence
 176                     $mUcs4 = ($in);
 177                     $mUcs4 = ($mUcs4 & 0x07) << 18;
 178                     $mState = 3;
 179                     $mBytes = 4;
 180                 } elseif (0xF8 == (0xFC & ($in))) {
 181                     // First octet of 5 octet sequence.
 182                     //
 183                     // This is illegal because the encoded codepoint must be
 184                     // either:
 185                     // (a) not the shortest form or
 186                     // (b) outside the Unicode range of 0-0x10FFFF.
 187                     // Rather than trying to resynchronize, we will carry on
 188                     // until the end of the sequence and let the later error
 189                     // handling code catch it.
 190                     $mUcs4 = ($in);
 191                     $mUcs4 = ($mUcs4 & 0x03) << 24;
 192                     $mState = 4;
 193                     $mBytes = 5;
 194                 } elseif (0xFC == (0xFE & ($in))) {
 195                     // First octet of 6 octet sequence, see comments for 5
 196                     // octet sequence.
 197                     $mUcs4 = ($in);
 198                     $mUcs4 = ($mUcs4 & 1) << 30;
 199                     $mState = 5;
 200                     $mBytes = 6;
 201                 } else {
 202                     // Current octet is neither in the US-ASCII range nor a
 203                     // legal first octet of a multi-octet sequence.
 204                     $mState = 0;
 205                     $mUcs4  = 0;
 206                     $mBytes = 1;
 207                     $char = '';
 208                 }
 209             } else {
 210                 // When mState is non-zero, we expect a continuation of the
 211                 // multi-octet sequence
 212                 if (0x80 == (0xC0 & ($in))) {
 213                     // Legal continuation.
 214                     $shift = ($mState - 1) * 6;
 215                     $tmp = $in;
 216                     $tmp = ($tmp & 0x0000003F) << $shift;
 217                     $mUcs4 |= $tmp;
 218
 219                     if (0 == --$mState) {
 220                         // End of the multi-octet sequence. mUcs4 now contains
 221                         // the final Unicode codepoint to be output
 222
 223                         // Check for illegal sequences and codepoints.
 224
 225                         // From Unicode 3.1, non-shortest form is illegal
 226                         if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 227                             ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 228                             ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
 229                             (4 < $mBytes) ||
 230                             // From Unicode 3.2, surrogate characters = illegal
 231                             (($mUcs4 & 0xFFFFF800) == 0xD800) ||
 232                             // Codepoints outside the Unicode range are illegal
 233                             ($mUcs4 > 0x10FFFF)
 234                         ) {
 235
 236                         } elseif (0xFEFF != $mUcs4 && // omit BOM
 237                             !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
 238                         ) {
 239                             $out .= $char;
 240                         }
 241                         // initialize UTF8 cache (reset)
 242                         $mState = 0;
 243                         $mUcs4  = 0;
 244                         $mBytes = 1;
 245                         $char = '';
 246                     }
 247                 } else {
 248                     // ((0xC0 & (*in) != 0x80) && (mState != 0))
 249                     // Incomplete multi-octet sequence.
 250                     // used to result in complete fail, but we'll reset
 251                     $mState = 0;
 252                     $mUcs4  = 0;
 253                     $mBytes = 1;
 254                     $char ='';
 255                 }
 256             }
 257         }
 258         return $out;
 259     }
 260
 261     /**
 262      * Translates a Unicode codepoint into its corresponding UTF-8 character.
 263      * @static
 264      * @note Based on Feyd's function at
 265      *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
 266      *       which is in public domain.
 267      * @note While we're going to do code point parsing anyway, a good
 268      *       optimization would be to refuse to translate code points that
 269      *       are non-SGML characters.  However, this could lead to duplication.
 270      * @note This is very similar to the unichr function in
 271      *       maintenance/generate-entity-file.php (although this is superior,
 272      *       due to its sanity checks).
 273      */
 274
 275     // +----------+----------+----------+----------+
 276     // | 33222222 | 22221111 | 111111   |          |
 277     // | 10987654 | 32109876 | 54321098 | 76543210 | bit
 278     // +----------+----------+----------+----------+
 279     // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
 280     // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
 281     // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
 282     // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
 283     // +----------+----------+----------+----------+
 284     // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
 285     // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
 286     // +----------+----------+----------+----------+
 287
 288     function unichr($code) {
 289         if($code > 1114111 or $code < 0 or
 290           ($code >= 55296 and $code <= 57343) ) {
 291             // bits are set outside the "valid" range as defined
 292             // by UNICODE 4.1.0
 293             return '';
 294         }
 295
 296         $x = $y = $z = $w = 0;
 297         if ($code < 128) {
 298             // regular ASCII character
 299             $x = $code;
 300         } else {
 301             // set up bits for UTF-8
 302             $x = ($code & 63) | 128;
 303             if ($code < 2048) {
 304                 $y = (($code & 2047) >> 6) | 192;
 305             } else {
 306                 $y = (($code & 4032) >> 6) | 128;
 307                 if($code < 65536) {
 308                     $z = (($code >> 12) & 15) | 224;
 309                 } else {
 310                     $z = (($code >> 12) & 63) | 128;
 311                     $w = (($code >> 18) & 7)  | 240;
 312                 }
 313             }
 314         }
 315         // set up the actual character
 316         $ret = '';
 317         if($w) $ret .= chr($w);
 318         if($z) $ret .= chr($z);
 319         if($y) $ret .= chr($y);
 320         $ret .= chr($x);
 321
 322         return $ret;
 323     }
 324
 325     /**
 326      * Converts a string to UTF-8 based on configuration.
 327      * @static
 328      */
 329     function convertToUTF8($str, $config, &$context) {
 330         static $iconv = null;
 331         if ($iconv === null) $iconv = function_exists('iconv');
 332         $encoding = $config->get('Core', 'Encoding');
 333         if ($encoding === 'utf-8') return $str;
 334         if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
 335             return @iconv($encoding, 'utf-8//IGNORE', $str);
 336         } elseif ($encoding === 'iso-8859-1') {
 337             return @utf8_encode($str);
 338         }
 339         trigger_error('Encoding not supported', E_USER_ERROR);
 340     }
 341
 342     /**
 343      * Converts a string from UTF-8 based on configuration.
 344      * @static
 345      * @note Currently, this is a lossy conversion, with unexpressable
 346      *       characters being omitted.
 347      */
 348     function convertFromUTF8($str, $config, &$context) {
 349         static $iconv = null;
 350         if ($iconv === null) $iconv = function_exists('iconv');
 351         $encoding = $config->get('Core', 'Encoding');
 352         if ($encoding === 'utf-8') return $str;
 353         if ($config->get('Core', 'EscapeNonASCIICharacters')) {
 354             $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
 355         }
 356         if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
 357             return @iconv('utf-8', $encoding . '//IGNORE', $str);
 358         } elseif ($encoding === 'iso-8859-1') {
 359             return @utf8_decode($str);
 360         }
 361         trigger_error('Encoding not supported', E_USER_ERROR);
 362     }
 363
 364     /**
 365      * Lossless (character-wise) conversion of HTML to ASCII
 366      * @static
 367      * @param $str UTF-8 string to be converted to ASCII
 368      * @returns ASCII encoded string with non-ASCII character entity-ized
 369      * @warning Adapted from MediaWiki, claiming fair use: this is a common
 370      *       algorithm. If you disagree with this license fudgery,
 371      *       implement it yourself.
 372      * @note Uses decimal numeric entities since they are best supported.
 373      * @note This is a DUMB function: it has no concept of keeping
 374      *       character entities that the projected character encoding
 375      *       can allow. We could possibly implement a smart version
 376      *       but that would require it to also know which Unicode
 377      *       codepoints the charset supported (not an easy task).
 378      * @note Sort of with cleanUTF8() but it assumes that $str is
 379      *       well-formed UTF-8
 380      */
 381     function convertToASCIIDumbLossless($str) {
 382         $bytesleft = 0;
 383         $result = '';
 384         $working = 0;
 385         $len = strlen($str);
 386         for( $i = 0; $i < $len; $i++ ) {
 387             $bytevalue = ord( $str[$i] );
 388             if( $bytevalue <= 0x7F ) { //0xxx xxxx
 389                 $result .= chr( $bytevalue );
 390                 $bytesleft = 0;
 391             } elseif( $bytevalue <= 0xBF ) { //10xx xxxx
 392                 $working = $working << 6;
 393                 $working += ($bytevalue & 0x3F);
 394                 $bytesleft--;
 395                 if( $bytesleft <= 0 ) {
 396                     $result .= "&#" . $working . ";";
 397                 }
 398             } elseif( $bytevalue <= 0xDF ) { //110x xxxx
 399                 $working = $bytevalue & 0x1F;
 400                 $bytesleft = 1;
 401             } elseif( $bytevalue <= 0xEF ) { //1110 xxxx
 402                 $working = $bytevalue & 0x0F;
 403                 $bytesleft = 2;
 404             } else { //1111 0xxx
 405                 $working = $bytevalue & 0x07;
 406                 $bytesleft = 3;
 407             }
 408         }
 409         return $result;
 410     }
 411
 412
 413 }
 414