src/plugins/litehtml_viewer/litehtml/utf8_strings.cpp

   1 #include "html.h"
   2 #include "utf8_strings.h"
   3
   4
   5 litehtml::utf8_to_wchar::utf8_to_wchar(const char* val)
   6 {
   7         m_utf8 = (const byte*) val;
   8         if (!m_utf8) return;
   9
  10         while (true)
  11         {
  12                 ucode_t wch = get_char();
  13                 if (!wch) break;
  14                 m_str += wch;
  15         }
  16 }
  17
  18 litehtml::ucode_t litehtml::utf8_to_wchar::get_char()
  19 {
  20         ucode_t b1 = getb();
  21
  22         if (!b1)
  23         {
  24                 return 0;
  25         }
  26
  27         // Determine whether we are dealing
  28         // with a one-, two-, three-, or four-
  29         // byte sequence.
  30         if ((b1 & 0x80) == 0)
  31         {
  32                 // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
  33                 return b1;
  34         }
  35         else if ((b1 & 0xe0) == 0xc0)
  36         {
  37                 // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
  38                 ucode_t r = (b1 & 0x1f) << 6;
  39                 r |= get_next_utf8(getb());
  40                 return r;
  41         }
  42         else if ((b1 & 0xf0) == 0xe0)
  43         {
  44                 // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
  45                 ucode_t r = (b1 & 0x0f) << 12;
  46                 r |= get_next_utf8(getb()) << 6;
  47                 r |= get_next_utf8(getb());
  48                 return r;
  49         }
  50         else if ((b1 & 0xf8) == 0xf0)
  51         {
  52                 // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
  53                 //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
  54                 // (uuuuu = wwww + 1)
  55                 int b2 = get_next_utf8(getb());
  56                 int b3 = get_next_utf8(getb());
  57                 int b4 = get_next_utf8(getb());
  58                 return ((b1 & 7) << 18) | ((b2 & 0x3f) << 12) |
  59                         ((b3 & 0x3f) << 6) | (b4 & 0x3f);
  60         }
  61
  62         //bad start for UTF-8 multi-byte sequence
  63         return '?';
  64 }
  65
  66 litehtml::wchar_to_utf8::wchar_to_utf8(const std::wstring& val)
  67 {
  68         unsigned int code;
  69         for (int i = 0; val[i]; i++)
  70         {
  71                 code = val[i];
  72                 if (code <= 0x7F)
  73                 {
  74                         m_str += (char)code;
  75                 }
  76                 else if (code <= 0x7FF)
  77                 {
  78                         m_str += (code >> 6) + 192;
  79                         m_str += (code & 63) + 128;
  80                 }
  81                 else if (0xd800 <= code && code <= 0xdfff)
  82                 {
  83                         //invalid block of utf8
  84                 }
  85                 else if (code <= 0xFFFF)
  86                 {
  87                         m_str += (code >> 12) + 224;
  88                         m_str += ((code >> 6) & 63) + 128;
  89                         m_str += (code & 63) + 128;
  90                 }
  91                 else if (code <= 0x10FFFF)
  92                 {
  93                         m_str += (code >> 18) + 240;
  94                         m_str += ((code >> 12) & 63) + 128;
  95                         m_str += ((code >> 6) & 63) + 128;
  96                         m_str += (code & 63) + 128;
  97                 }
  98         }
  99 }