utf8/to_unicode.php

   1 <?php defined('SYSPATH') OR die('No direct script access.');
   2 /**
   3  * UTF8::to_unicode
   4  *
   5  * @package    Kohana
   6  * @author     Kohana Team
   7  * @copyright  (c) 2007-2012 Kohana Team
   8  * @copyright  (c) 2005 Harry Fuecks
   9  * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
  10  */
  11 function _to_unicode($str)
  12 {
  13         // Cached expected number of octets after the current octet until the beginning of the next UTF8 character sequence
  14         $m_state = 0;
  15         // Cached Unicode character
  16         $m_ucs4  = 0;
  17         // Cached expected number of octets in the current sequence
  18         $m_bytes = 1;
  19
  20         $out = array();
  21
  22         $len = strlen($str);
  23
  24         for ($i = 0; $i < $len; $i++)
  25         {
  26                 $in = ord($str[$i]);
  27
  28                 if ($m_state == 0)
  29                 {
  30                         // When m_state is zero we expect either a US-ASCII character or a multi-octet sequence.
  31                         if (0 == (0x80 & $in))
  32                         {
  33                                 // US-ASCII, pass straight through.
  34                                 $out[] = $in;
  35                                 $m_bytes = 1;
  36                         }
  37                         elseif (0xC0 == (0xE0 & $in))
  38                         {
  39                                 // First octet of 2 octet sequence
  40                                 $m_ucs4 = $in;
  41                                 $m_ucs4 = ($m_ucs4 & 0x1F) << 6;
  42                                 $m_state = 1;
  43                                 $m_bytes = 2;
  44                         }
  45                         elseif (0xE0 == (0xF0 & $in))
  46                         {
  47                                 // First octet of 3 octet sequence
  48                                 $m_ucs4 = $in;
  49                                 $m_ucs4 = ($m_ucs4 & 0x0F) << 12;
  50                                 $m_state = 2;
  51                                 $m_bytes = 3;
  52                         }
  53                         elseif (0xF0 == (0xF8 & $in))
  54                         {
  55                                 // First octet of 4 octet sequence
  56                                 $m_ucs4 = $in;
  57                                 $m_ucs4 = ($m_ucs4 & 0x07) << 18;
  58                                 $m_state = 3;
  59                                 $m_bytes = 4;
  60                         }
  61                         elseif (0xF8 == (0xFC & $in))
  62                         {
  63                                 /** First octet of 5 octet sequence.
  64                                  *
  65                                  * This is illegal because the encoded codepoint must be either
  66                                  * (a) not the shortest form or
  67                                  * (b) outside the Unicode range of 0-0x10FFFF.
  68                                  * Rather than trying to resynchronize, we will carry on until the end
  69                                  * of the sequence and let the later error handling code catch it.
  70                                  **/
  71                                 $m_ucs4 = $in;
  72                                 $m_ucs4 = ($m_ucs4 & 0x03) << 24;
  73                                 $m_state = 4;
  74                                 $m_bytes = 5;
  75                         }
  76                         elseif (0xFC == (0xFE & $in))
  77                         {
  78                                 // First octet of 6 octet sequence, see comments for 5 octet sequence.
  79                                 $m_ucs4 = $in;
  80                                 $m_ucs4 = ($m_ucs4 & 1) << 30;
  81                                 $m_state = 5;
  82                                 $m_bytes = 6;
  83                         }
  84                         else
  85                         {
  86                                 // Current octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence.
  87                                 trigger_error('UTF8::to_unicode: Illegal sequence identifier in UTF-8 at byte '.$i, E_USER_WARNING);
  88                                 return FALSE;
  89                         }
  90                 }
  91                 else
  92                 {
  93                         // When m_state is non-zero, we expect a continuation of the multi-octet sequence
  94                         if (0x80 == (0xC0 & $in))
  95                         {
  96                                 // Legal continuation
  97                                 $shift = ($m_state - 1) * 6;
  98                                 $tmp = $in;
  99                                 $tmp = ($tmp & 0x0000003F) << $shift;
 100                                 $m_ucs4 |= $tmp;
 101
 102                                 // End of the multi-octet sequence. mUcs4 now contains the final Unicode codepoint to be output
 103                                 if (0 == --$m_state)
 104                                 {
 105                                         // Check for illegal sequences and codepoints
 106
 107                                         // From Unicode 3.1, non-shortest form is illegal
 108                                         if (((2 == $m_bytes) AND ($m_ucs4 < 0x0080)) OR
 109                                                 ((3 == $m_bytes) AND ($m_ucs4 < 0x0800)) OR
 110                                                 ((4 == $m_bytes) AND ($m_ucs4 < 0x10000)) OR
 111                                                 (4 < $m_bytes) OR
 112                                                 // From Unicode 3.2, surrogate characters are illegal
 113                                                 (($m_ucs4 & 0xFFFFF800) == 0xD800) OR
 114                                                 // Codepoints outside the Unicode range are illegal
 115                                                 ($m_ucs4 > 0x10FFFF))
 116                                         {
 117                                                 trigger_error('UTF8::to_unicode: Illegal sequence or codepoint in UTF-8 at byte '.$i, E_USER_WARNING);
 118                                                 return FALSE;
 119                                         }
 120
 121                                         if (0xFEFF != $m_ucs4)
 122                                         {
 123                                                 // BOM is legal but we don't want to output it
 124                                                 $out[] = $m_ucs4;
 125                                         }
 126
 127                                         // Initialize UTF-8 cache
 128                                         $m_state = 0;
 129                                         $m_ucs4  = 0;
 130                                         $m_bytes = 1;
 131                                 }
 132                         }
 133                         else
 134                         {
 135                                 // ((0xC0 & (*in) != 0x80) AND (m_state != 0))
 136                                 // Incomplete multi-octet sequence
 137                                 throw new UTF8_Exception("UTF8::to_unicode: Incomplete multi-octet sequence in UTF-8 at byte ':byte'", array(
 138                                         ':byte' => $i,
 139                                 ));
 140                         }
 141                 }
 142         }
 143
 144         return $out;
 145 }