lib/htmlpurifier/HTMLPurifier/Encoder.php

   1 <?php
   2
   3 require_once 'HTMLPurifier/EntityLookup.php';
   4
   // Registers the Core.Encoding directive with 'utf-8' as its value. The
   // arguments look like (namespace, name, default, type, description) --
   // NOTE(review): confirm against HTMLPurifier_ConfigSchema::define().
   // HTMLPurifier_Encoder::convertToUTF8()/convertFromUTF8() below read this
   // directive and return their input untouched when it is 'utf-8'.
   5 HTMLPurifier_ConfigSchema::define(
   6     'Core', 'Encoding', 'utf-8', 'istring',
   7     'If for some reason you are unable to convert all webpages to UTF-8, '.
   8     'you can use this directive as a stop-gap compatibility change to '.
   9     'let HTML Purifier deal with non UTF-8 input.  This technique has '.
  10     'notable deficiencies: absolutely no characters outside of the selected '.
  11     'character encoding will be preserved, not even the ones that have '.
  12     'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
  13     'that automatically resolves all entities), making it pretty useless '.
  14     'for anything except the most I18N-blind applications, although '.
  15     '%Core.EscapeNonASCIICharacters offers fixes this trouble with '.
  16     'another tradeoff. This directive '.
  17     'only accepts ISO-8859-1 if iconv is not enabled.'
  18 );
  19
  // Registers Core.EscapeNonASCIICharacters (false here). When it is truthy,
  // HTMLPurifier_Encoder::convertFromUTF8() runs the string through
  // convertToASCIIDumbLossless() before converting to the target encoding.
  20 HTMLPurifier_ConfigSchema::define(
  21     'Core', 'EscapeNonASCIICharacters', false, 'bool',
  22     'This directive overcomes a deficiency in %Core.Encoding by blindly '.
  23     'converting all non-ASCII characters into decimal numeric entities before '.
  24     'converting it to its native encoding. This means that even '.
  25     'characters that can be expressed in the non-UTF-8 encoding will '.
  26     'be entity-ized, which can be a real downer for encodings like Big5. '.
  27     'It also assumes that the ASCII repetoire is available, although '.
  28     'this is the case for almost all encodings. Anyway, use UTF-8! This '.
  29     'directive has been available since 1.4.0.'
  30 );
  31
  // Only when iconv() is missing at load time: limit Core.Encoding to the two
  // values the encoder can convert without iconv ('utf-8' and 'iso-8859-1'),
  // and register 'iso8859-1' as an alternative spelling of 'iso-8859-1'.
  32 if ( !function_exists('iconv') ) {
  33     // only encodings with native PHP support
  34     HTMLPurifier_ConfigSchema::defineAllowedValues(
  35         'Core', 'Encoding', array(
  36             'utf-8',
  37             'iso-8859-1'
  38         )
  39     );
  40     HTMLPurifier_ConfigSchema::defineValueAliases(
  41         'Core', 'Encoding', array(
  42             'iso8859-1' => 'iso-8859-1'
  43         )
  44     );
  45 }
  46
  // Registers Test.ForceNoIconv (false here). The encoder's convert methods
  // skip iconv() whenever this directive is truthy. Note that the restriction
  // of Core.Encoding above depends on function_exists('iconv') only, not on
  // this directive.
  47 HTMLPurifier_ConfigSchema::define(
  48     'Test', 'ForceNoIconv', false, 'bool',
  49     'When set to true, HTMLPurifier_Encoder will act as if iconv does not '.
  50     'exist and use only pure PHP implementations.'
  51 );
  52
  53 /**
  54  * A UTF-8 specific character encoder that handles cleaning and transforming.
  55  * @note All functions in this class should be static.
  56  */
  class HTMLPurifier_Encoder
  {

      /**
       * Constructor throws fatal error if you attempt to instantiate class
       *
       * @note Spelled __construct() because PHP 8 no longer treats a method
       *       named after its class as a constructor; with the old spelling
       *       this guard would never run.
       */
      public function __construct() {
          trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
      }

      /**
       * Reports whether the iconv() function exists, looking it up only once.
       * @static
       * @returns True if iconv() is defined, false otherwise
       */
      private static function hasIconv() {
          static $iconv = null;
          if ($iconv === null) $iconv = function_exists('iconv');
          return $iconv;
      }

      /**
       * Cleans a UTF-8 string for well-formedness and SGML validity
       *
       * It will parse according to UTF-8 and return a valid UTF8 string, with
       * non-SGML codepoints excluded.
       *
       * @static
       * @param $str String to clean, expected to be UTF-8
       * @param $force_php If true, skip iconv and use the pure PHP parser
       * @returns Cleaned string
       *
       * @note Just for reference, the non-SGML code points are 0 to 31 and
       *       127 to 159, inclusive.  However, we allow code points 9, 10
       *       and 13, which are the tab, line feed and carriage return
       *       respectively. 128 and above the code points map to multibyte
       *       UTF-8 representations.
       *
       * @note The pure PHP parser runs when iconv is missing, when $force_php
       *       is set, or when iconv() fails and returns false (previously a
       *       failed iconv() call emptied the whole string). The PHP parser
       *       also drops U+FEFF (byte order mark), which is not part of the
       *       deletion table used on the iconv path.
       *
       * @note An ill-formed sequence is discarded, but the octet that revealed
       *       it is then parsed as the possible start of a new character, so
       *       "\xC3A" yields "A" instead of an empty string.
       *
       * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
       *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
       *       LGPL license.  Notes on what changed are inside, but in general,
       *       the original code transformed UTF-8 text into an array of integer
       *       Unicode codepoints. Understandably, transforming that back to
       *       a string would be somewhat expensive, so the function was modded to
       *       directly operate on the string.  However, this discourages code
       *       reuse, and the logic enumerated here would be useful for any
       *       function that needs to be able to understand UTF-8 characters.
       *       As of right now, only smart lossless character encoding converters
       *       would need that, and I'm probably not going to implement them.
       */
      public static function cleanUTF8($str, $force_php = false) {

          // strtr() table mapping every non-SGML character to '', built once
          static $non_sgml_chars = array();

          if (!$force_php && self::hasIconv()) {
              if (empty($non_sgml_chars)) {
                  for ($i = 0; $i <= 31; $i++) {
                      // non-SGML ASCII chars
                      // save \r, \t and \n
                      if ($i == 9 || $i == 13 || $i == 10) continue;
                      $non_sgml_chars[chr($i)] = '';
                  }
                  for ($i = 127; $i <= 159; $i++) {
                      $non_sgml_chars[self::unichr($i)] = '';
                  }
              }
              // do the shortcut way
              $converted = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
              if ($converted !== false) {
                  return strtr($converted, $non_sgml_chars);
              }
              // iconv() reported failure: fall through to the PHP parser
              // rather than handing false to strtr() and losing everything
          }

          $mState = 0; // cached expected number of octets after the current octet
                       // until the beginning of the next UTF8 character sequence
          $mUcs4  = 0; // cached Unicode character
          $mBytes = 1; // cached expected number of octets in the current sequence

          // original code involved an $out that was an array of Unicode
          // codepoints.  Instead of having to convert back into UTF-8, we've
          // decided to directly append valid UTF-8 characters onto a string
          // $out once they're done.  $char accumulates raw bytes, while $mUcs4
          // turns into the Unicode code point, so there's some redundancy.

          $out = '';
          $char = '';

          $len = strlen($str);
          for ($i = 0; $i < $len; $i++) {
              $in = ord($str[$i]);

              if (0 != $mState && 0x80 != (0xC0 & $in)) {
                  // ((0xC0 & (*in) != 0x80) && (mState != 0))
                  // Incomplete multi-octet sequence: forget the fragment, then
                  // let this octet be examined below as a fresh start instead
                  // of throwing it away together with the fragment.
                  $mState = 0;
                  $mUcs4  = 0;
                  $mBytes = 1;
                  $char   = '';
              }

              $char .= $str[$i]; // append byte to char

              if (0 == $mState) {
                  // When mState is zero we expect either a US-ASCII character
                  // or a multi-octet sequence.
                  if (0 == (0x80 & $in)) {
                      // US-ASCII, pass straight through unless it is a
                      // control character other than \t, \n or \r
                      $is_control = ($in <= 31 || $in == 127) &&
                          !($in == 9 || $in == 13 || $in == 10);
                      if (!$is_control) {
                          $out .= $char;
                      }
                      // reset
                      $char = '';
                      $mBytes = 1;
                  } elseif (0xC0 == (0xE0 & $in)) {
                      // First octet of 2 octet sequence
                      $mUcs4 = ($in & 0x1F) << 6;
                      $mState = 1;
                      $mBytes = 2;
                  } elseif (0xE0 == (0xF0 & $in)) {
                      // First octet of 3 octet sequence
                      $mUcs4 = ($in & 0x0F) << 12;
                      $mState = 2;
                      $mBytes = 3;
                  } elseif (0xF0 == (0xF8 & $in)) {
                      // First octet of 4 octet sequence
                      $mUcs4 = ($in & 0x07) << 18;
                      $mState = 3;
                      $mBytes = 4;
                  } elseif (0xF8 == (0xFC & $in)) {
                      // First octet of 5 octet sequence.
                      //
                      // This is illegal because the encoded codepoint must be
                      // either:
                      // (a) not the shortest form or
                      // (b) outside the Unicode range of 0-0x10FFFF.
                      // Rather than trying to resynchronize, we will carry on
                      // until the end of the sequence and let the later error
                      // handling code catch it.
                      $mUcs4 = ($in & 0x03) << 24;
                      $mState = 4;
                      $mBytes = 5;
                  } elseif (0xFC == (0xFE & $in)) {
                      // First octet of 6 octet sequence, see comments for 5
                      // octet sequence.
                      $mUcs4 = ($in & 1) << 30;
                      $mState = 5;
                      $mBytes = 6;
                  } else {
                      // Current octet is neither in the US-ASCII range nor a
                      // legal first octet of a multi-octet sequence.
                      $mState = 0;
                      $mUcs4  = 0;
                      $mBytes = 1;
                      $char = '';
                  }
              } else {
                  // Legal continuation (anything else was reset above):
                  // merge its six payload bits into the code point.
                  $mUcs4 |= ($in & 0x0000003F) << (($mState - 1) * 6);

                  if (0 == --$mState) {
                      // End of the multi-octet sequence. mUcs4 now contains
                      // the final Unicode codepoint to be output

                      // Check for illegal sequences and codepoints.
                      $illegal =
                          // From Unicode 3.1, non-shortest form is illegal
                          ((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                          ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                          ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                          (4 < $mBytes) ||
                          // From Unicode 3.2, surrogate characters = illegal
                          (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                          // Codepoints outside the Unicode range are illegal
                          ($mUcs4 > 0x10FFFF);

                      if (!$illegal &&
                          0xFEFF != $mUcs4 && // omit BOM
                          !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
                      ) {
                          $out .= $char;
                      }
                      // initialize UTF8 cache (reset)
                      $mState = 0;
                      $mUcs4  = 0;
                      $mBytes = 1;
                      $char = '';
                  }
              }
          }
          // a sequence still incomplete at the end of input is never appended
          return $out;
      }

      /**
       * Translates a Unicode codepoint into its corresponding UTF-8 character.
       * @static
       * @param $code Integer Unicode code point
       * @returns UTF-8 encoded character, or an empty string if $code is
       *          negative, above 0x10FFFF or a surrogate (0xD800..0xDFFF)
       * @note Based on Feyd's function at
       *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
       *       which is in public domain.
       * @note While we're going to do code point parsing anyway, a good
       *       optimization would be to refuse to translate code points that
       *       are non-SGML characters.  However, this could lead to duplication.
       * @note This is very similar to the unichr function in
       *       maintenance/generate-entity-file.php (although this is superior,
       *       due to its sanity checks).
       */
      public static function unichr($code) {

          // +----------+----------+----------+----------+
          // | 33222222 | 22221111 | 111111   |          |
          // | 10987654 | 32109876 | 54321098 | 76543210 | bit
          // +----------+----------+----------+----------+
          // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
          // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
          // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
          // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
          // +----------+----------+----------+----------+
          // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
          // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
          // +----------+----------+----------+----------+

          if ($code > 0x10FFFF || $code < 0 ||
              ($code >= 0xD800 && $code <= 0xDFFF)) {
              // bits are set outside the "valid" range as defined
              // by UNICODE 4.1.0
              return '';
          }

          if ($code < 0x80) {
              // regular ASCII character
              return chr($code);
          }

          // every multi-byte form ends with the low six bits in a 10xxxxxx octet
          $last = chr(0x80 | ($code & 0x3F));

          if ($code < 0x800) {
              return chr(0xC0 | ($code >> 6)) . $last;
          }

          $middle = chr(0x80 | (($code >> 6) & 0x3F));

          if ($code < 0x10000) {
              return chr(0xE0 | (($code >> 12) & 0x0F)) . $middle . $last;
          }

          return chr(0xF0 | (($code >> 18) & 0x07)) .
                 chr(0x80 | (($code >> 12) & 0x3F)) .
                 $middle . $last;
      }

      /**
       * Converts a string to UTF-8 based on configuration.
       * @static
       * @param $str String in the encoding named by %Core.Encoding
       * @param $config Object whose get('Core', 'Encoding') and
       *        get('Test', 'ForceNoIconv') are consulted
       * @param $context Not used by this method
       * @returns Converted string; $str itself when the encoding is 'utf-8';
       *          false if iconv() fails
       * @note Raises E_USER_ERROR when iconv cannot be used and the encoding
       *       is not 'iso-8859-1'.
       * @note NOTE(review): utf8_encode() is deprecated as of PHP 8.2; the @
       *       hides the notice, but a replacement will be needed eventually.
       */
      public static function convertToUTF8($str, $config, &$context) {
          $encoding = $config->get('Core', 'Encoding');
          if ($encoding === 'utf-8') return $str;
          if (self::hasIconv() && !$config->get('Test', 'ForceNoIconv')) {
              return @iconv($encoding, 'utf-8//IGNORE', $str);
          }
          if ($encoding === 'iso-8859-1') {
              return @utf8_encode($str);
          }
          trigger_error('Encoding not supported', E_USER_ERROR);
      }

      /**
       * Converts a string from UTF-8 based on configuration.
       * @static
       * @param $str UTF-8 string
       * @param $config Object whose get('Core', 'Encoding'),
       *        get('Core', 'EscapeNonASCIICharacters') and
       *        get('Test', 'ForceNoIconv') are consulted
       * @param $context Not used by this method
       * @returns Converted string; $str itself when the encoding is 'utf-8';
       *          false if iconv() fails
       * @note Currently, this is a lossy conversion, with unexpressable
       *       characters being omitted.
       * @note Raises E_USER_ERROR when iconv cannot be used and the encoding
       *       is not 'iso-8859-1'.
       * @note NOTE(review): utf8_decode() is deprecated as of PHP 8.2; the @
       *       hides the notice, but a replacement will be needed eventually.
       */
      public static function convertFromUTF8($str, $config, &$context) {
          $encoding = $config->get('Core', 'Encoding');
          if ($encoding === 'utf-8') return $str;
          if ($config->get('Core', 'EscapeNonASCIICharacters')) {
              // entity-ize first so nothing is lost in the conversion below
              $str = self::convertToASCIIDumbLossless($str);
          }
          if (self::hasIconv() && !$config->get('Test', 'ForceNoIconv')) {
              return @iconv('utf-8', $encoding . '//IGNORE', $str);
          }
          if ($encoding === 'iso-8859-1') {
              return @utf8_decode($str);
          }
          trigger_error('Encoding not supported', E_USER_ERROR);
      }

      /**
       * Lossless (character-wise) conversion of HTML to ASCII
       * @static
       * @param $str UTF-8 string to be converted to ASCII
       * @returns ASCII encoded string with non-ASCII character entity-ized
       * @warning Adapted from MediaWiki, claiming fair use: this is a common
       *       algorithm. If you disagree with this license fudgery,
       *       implement it yourself.
       * @note Uses decimal numeric entities since they are best supported.
       * @note This is a DUMB function: it has no concept of keeping
       *       character entities that the projected character encoding
       *       can allow. We could possibly implement a smart version
       *       but that would require it to also know which Unicode
       *       codepoints the charset supported (not an easy task).
       * @note Sort of duplicates the parsing done by cleanUTF8(), but it
       *       assumes that $str is well-formed UTF-8 and performs no
       *       validation of its own.
       */
      public static function convertToASCIIDumbLossless($str) {
          $bytesleft = 0; // continuation octets still expected
          $result = '';
          $working = 0;   // code point assembled so far
          $len = strlen($str);
          for ($i = 0; $i < $len; $i++) {
              $bytevalue = ord($str[$i]);
              if ($bytevalue <= 0x7F) { //0xxx xxxx
                  $result .= $str[$i];
                  $bytesleft = 0;
              } elseif ($bytevalue <= 0xBF) { //10xx xxxx
                  $working = ($working << 6) + ($bytevalue & 0x3F);
                  $bytesleft--;
                  if ($bytesleft <= 0) {
                      // last octet of the character: emit the entity
                      $result .= "&#" . $working . ";";
                  }
              } elseif ($bytevalue <= 0xDF) { //110x xxxx
                  $working = $bytevalue & 0x1F;
                  $bytesleft = 1;
              } elseif ($bytevalue <= 0xEF) { //1110 xxxx
                  $working = $bytevalue & 0x0F;
                  $bytesleft = 2;
              } else { //1111 0xxx
                  $working = $bytevalue & 0x07;
                  $bytesleft = 3;
              }
          }
          return $result;
      }


  }
 402
 403 ?>