2 #include "utf8_strings.h"
// Constructor: takes a C string `val`, keeps it in m_utf8 as a byte pointer,
// and pulls decoded code points out of it through get_char().
// NOTE(review): interior lines of this definition are missing from this view
// (the embedded line numbers skip), so the surrounding loop, its termination
// condition and what is done with `wch` are not shown here -- confirm against
// the full file.
5 litehtml::utf8_to_wchar::utf8_to_wchar(const char* val
)
// View the incoming characters as `byte` values; presumably getb() reads
// from this pointer -- TODO confirm, getb() is not defined in this view.
7 m_utf8
= (const byte
*) val
;
// Decode one code point from the input.
12 ucode_t wch
= get_char();
// Decodes one UTF-8 sequence into a single ucode_t value.
// The bit pattern of the lead byte `b1` selects the sequence length; each
// following byte is fetched with getb() and passed through get_next_utf8()
// before being merged into the result.
// NOTE(review): the definition of `b1`, the 1-byte condition, the returns of
// the 2- and 3-byte branches and the final fallback return are on lines not
// visible in this view; getb() and get_next_utf8() are defined elsewhere.
18 litehtml::ucode_t
litehtml::utf8_to_wchar::get_char()
27 // Determine whether we are dealing
28 // with a one-, two-, three-, or four-byte sequence.
32 // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
// Lead byte of the form 110yyyyy.
35 else if ((b1
& 0xe0) == 0xc0)
37 // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
// The low five bits of the lead byte become bits 6..10 of the result.
38 ucode_t r
= (b1
& 0x1f) << 6;
// The value derived from the second byte is OR-ed into the low bits.
39 r
|= get_next_utf8(getb());
// Lead byte of the form 1110zzzz.
42 else if ((b1
& 0xf0) == 0xe0)
44 // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
// The low four bits of the lead byte become bits 12..15 of the result.
45 ucode_t r
= (b1
& 0x0f) << 12;
// Second byte contributes bits 6..11.
46 r
|= get_next_utf8(getb()) << 6;
// Third byte contributes bits 0..5.
47 r
|= get_next_utf8(getb());
// Lead byte of the form 11110uuu.
50 else if ((b1
& 0xf8) == 0xf0)
52 // 4-byte sequence: 110110wwwwzzzzyy + 110111yyyyxxxxxx (UTF-16 surrogate pair)
53 // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx (uuuuu = wwww + 1)
// The code below does not build the surrogate pair; it assembles the scalar
// value directly from the four bytes.
55 int b2
= get_next_utf8(getb());
56 int b3
= get_next_utf8(getb());
57 int b4
= get_next_utf8(getb());
// 3 bits from the lead byte (bits 18..20) followed by 6 bits from each of
// the three following bytes.
58 return ((b1
& 7) << 18) | ((b2
& 0x3f) << 12) |
59 ((b3
& 0x3f) << 6) | (b4
& 0x3f);
// Reached when the lead byte matches none of the patterns tested above.
62 //bad start for UTF-8 multi-byte sequence
// Constructor: walks the wide string `val` and appends the UTF-8 encoding of
// each value to m_str, choosing the sequence length from the value's range.
// NOTE(review): the lines that declare and assign `code`, the branch that
// precedes `code <= 0x7FF`, and the closing lines of this definition are not
// visible in this view; presumably `code` holds val[i] -- TODO confirm.
66 litehtml::wchar_to_utf8::wchar_to_utf8(const std::wstring
& val
)
// Iterate by index and stop at the first element that compares equal to zero.
69 for (int i
= 0; val
[i
]; i
++)
// Values up to 0x7FF are written as two bytes.
76 else if (code
<= 0x7FF)
// Lead byte: 192 (0xC0) plus the bits above the low six.
78 m_str
+= (code
>> 6) + 192;
// Trailing byte: 128 (0x80) plus the low six bits.
79 m_str
+= (code
& 63) + 128;
// 0xD800..0xDFFF is the surrogate range; it is tested before the generic
// three-byte case so those values do not reach it. The body of this branch
// is not visible here beyond the comment below.
81 else if (0xd800 <= code
&& code
<= 0xdfff)
83 //invalid block of utf8
// Remaining values up to 0xFFFF are written as three bytes.
85 else if (code
<= 0xFFFF)
// Lead byte: 224 (0xE0) plus the bits above the low twelve.
87 m_str
+= (code
>> 12) + 224;
// Middle byte: 128 plus bits 6..11.
88 m_str
+= ((code
>> 6) & 63) + 128;
// Last byte: 128 plus bits 0..5.
89 m_str
+= (code
& 63) + 128;
// Values up to 0x10FFFF are written as four bytes.
91 else if (code
<= 0x10FFFF)
// Lead byte: 240 (0xF0) plus the bits above the low eighteen.
93 m_str
+= (code
>> 18) + 240;
// Second byte: 128 plus bits 12..17.
94 m_str
+= ((code
>> 12) & 63) + 128;
// Third byte: 128 plus bits 6..11.
95 m_str
+= ((code
>> 6) & 63) + 128;
// Fourth byte: 128 plus bits 0..5.
96 m_str
+= (code
& 63) + 128;