system/core/utf8/to_unicode.php

   1 <?php defined('SYSPATH') OR die('No direct access allowed.');
   2 /**
   3  * utf8::to_unicode
   4  *
   5  * @package    Core
   6  * @author     Kohana Team
   7  * @copyright  (c) 2007 Kohana Team
   8  * @copyright  (c) 2005 Harry Fuecks
   9  * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
  10  */
  11 function _to_unicode($str)
  12 {
  13         $mState = 0; // cached expected number of octets after the current octet until the beginning of the next UTF8 character sequence
  14         $mUcs4  = 0; // cached Unicode character
  15         $mBytes = 1; // cached expected number of octets in the current sequence
  16
  17         $out = array();
  18
  19         $len = strlen($str);
  20
  21         for ($i = 0; $i < $len; $i++)
  22         {
  23                 $in = ord($str[$i]);
  24
  25                 if ($mState == 0)
  26                 {
  27                         // When mState is zero we expect either a US-ASCII character or a
  28                         // multi-octet sequence.
  29                         if (0 == (0x80 & $in))
  30                         {
  31                                 // US-ASCII, pass straight through.
  32                                 $out[] = $in;
  33                                 $mBytes = 1;
  34                         }
  35                         elseif (0xC0 == (0xE0 & $in))
  36                         {
  37                                 // First octet of 2 octet sequence
  38                                 $mUcs4 = $in;
  39                                 $mUcs4 = ($mUcs4 & 0x1F) << 6;
  40                                 $mState = 1;
  41                                 $mBytes = 2;
  42                         }
  43                         elseif (0xE0 == (0xF0 & $in))
  44                         {
  45                                 // First octet of 3 octet sequence
  46                                 $mUcs4 = $in;
  47                                 $mUcs4 = ($mUcs4 & 0x0F) << 12;
  48                                 $mState = 2;
  49                                 $mBytes = 3;
  50                         }
  51                         elseif (0xF0 == (0xF8 & $in))
  52                         {
  53                                 // First octet of 4 octet sequence
  54                                 $mUcs4 = $in;
  55                                 $mUcs4 = ($mUcs4 & 0x07) << 18;
  56                                 $mState = 3;
  57                                 $mBytes = 4;
  58                         }
  59                         elseif (0xF8 == (0xFC & $in))
  60                         {
  61                                 // First octet of 5 octet sequence.
  62                                 //
  63                                 // This is illegal because the encoded codepoint must be either
  64                                 // (a) not the shortest form or
  65                                 // (b) outside the Unicode range of 0-0x10FFFF.
  66                                 // Rather than trying to resynchronize, we will carry on until the end
  67                                 // of the sequence and let the later error handling code catch it.
  68                                 $mUcs4 = $in;
  69                                 $mUcs4 = ($mUcs4 & 0x03) << 24;
  70                                 $mState = 4;
  71                                 $mBytes = 5;
  72                         }
  73                         elseif (0xFC == (0xFE & $in))
  74                         {
  75                                 // First octet of 6 octet sequence, see comments for 5 octet sequence.
  76                                 $mUcs4 = $in;
  77                                 $mUcs4 = ($mUcs4 & 1) << 30;
  78                                 $mState = 5;
  79                                 $mBytes = 6;
  80                         }
  81                         else
  82                         {
  83                                 // Current octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence.
  84                                 trigger_error('utf8::to_unicode: Illegal sequence identifier in UTF-8 at byte '.$i, E_USER_WARNING);
  85                                 return FALSE;
  86                         }
  87                 }
  88                 else
  89                 {
  90                         // When mState is non-zero, we expect a continuation of the multi-octet sequence
  91                         if (0x80 == (0xC0 & $in))
  92                         {
  93                                 // Legal continuation
  94                                 $shift = ($mState - 1) * 6;
  95                                 $tmp = $in;
  96                                 $tmp = ($tmp & 0x0000003F) << $shift;
  97                                 $mUcs4 |= $tmp;
  98
  99                                 // End of the multi-octet sequence. mUcs4 now contains the final Unicode codepoint to be output
 100                                 if (0 == --$mState)
 101                                 {
 102                                         // Check for illegal sequences and codepoints
 103
 104                                         // From Unicode 3.1, non-shortest form is illegal
 105                                         if (((2 == $mBytes) AND ($mUcs4 < 0x0080)) OR
 106                                                 ((3 == $mBytes) AND ($mUcs4 < 0x0800)) OR
 107                                                 ((4 == $mBytes) AND ($mUcs4 < 0x10000)) OR
 108                                                 (4 < $mBytes) OR
 109                                                 // From Unicode 3.2, surrogate characters are illegal
 110                                                 (($mUcs4 & 0xFFFFF800) == 0xD800) OR
 111                                                 // Codepoints outside the Unicode range are illegal
 112                                                 ($mUcs4 > 0x10FFFF))
 113                                         {
 114                                                 trigger_error('utf8::to_unicode: Illegal sequence or codepoint in UTF-8 at byte '.$i, E_USER_WARNING);
 115                                                 return FALSE;
 116                                         }
 117
 118                                         if (0xFEFF != $mUcs4)
 119                                         {
 120                                                 // BOM is legal but we don't want to output it
 121                                                 $out[] = $mUcs4;
 122                                         }
 123
 124                                         // Initialize UTF-8 cache
 125                                         $mState = 0;
 126                                         $mUcs4  = 0;
 127                                         $mBytes = 1;
 128                                 }
 129                         }
 130                         else
 131                         {
 132                                 // ((0xC0 & (*in) != 0x80) AND (mState != 0))
 133                                 // Incomplete multi-octet sequence
 134                                 trigger_error('utf8::to_unicode: Incomplete multi-octet sequence in UTF-8 at byte '.$i, E_USER_WARNING);
 135                                 return FALSE;
 136                         }
 137                 }
 138         }
 139
 140         return $out;
 141 }