1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/basictypes.h"
6 #include "base/logging.h"
7 #include "base/string_piece.h"
8 #include "base/string_util.h"
9 #include "base/utf_string_conversions.h"
10 #include "testing/gtest/include/gtest/gtest.h"
// Wide-string inputs used by the ConvertUTF8AndWide round-trip test below.
// Every string is spelled with \x escapes, one escape per wchar_t code unit.
16 const wchar_t* const kConvertRoundtripCases
[] = {
// CJK ideographs separated by spaces (U+0020), ending with U+00BB.
19 L
"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb",
// Greek letters (U+03xx) with one embedded space.
21 L
"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
22 L
"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2",
23 // "Поиск страниц на русском" (Russian: "Search for pages in Russian")
24 L
"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442"
25 L
"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430"
26 L
"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c",
// Hangul syllables (all within U+AC00 - U+D7A3).
28 L
"\xc804\xccb4\xc11c\xbe44\xc2a4",
30 // Test characters that take more than 16 bits. This will depend on whether
31 // wchar_t is 16 or 32 bits.
32 #if defined(WCHAR_T_IS_UTF16)
// NOTE(review): this comment used to call these "Mathematical Alphanumeric
// Symbols A-E", but that block starts at U+1D400; the surrogate pairs below
// decode to U+11D40 - U+11D44.
34 // Five non-BMP code points (U+11D40 - U+11D44) as UTF-16 surrogate pairs.
35 L
"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44",
36 #elif defined(WCHAR_T_IS_UTF32)
38 // The same five non-BMP code points (U+11D40 - U+11D44), one wchar_t each.
39 L
"\x11d40\x11d41\x11d42\x11d43\x11d44",
// Converts each kConvertRoundtripCases entry wide -> UTF-8 -> wide and expects
// the final wide string to equal the original entry.
45 TEST(UTFStringConversionsTest
, ConvertUTF8AndWide
) {
46 // We round-trip all the wide strings through UTF-8 to make sure everything
47 // agrees on the conversion. This uses the stream operators to test them
49 for (size_t i
= 0; i
< arraysize(kConvertRoundtripCases
); ++i
) {
// Narrow stream receives the UTF-8 form of the wide test string.
50 std::ostringstream utf8
;
51 utf8
<< WideToUTF8(kConvertRoundtripCases
[i
]);
// Wide stream receives the UTF-8 text converted back to wide.
52 std::wostringstream wide
;
53 wide
<< UTF8ToWide(utf8
.str());
55 EXPECT_EQ(kConvertRoundtripCases
[i
], wide
.str());
// Checks both conversion directions on empty input: WideToUTF8(wempty) must
// equal `empty`, and UTF8ToWide(empty) must equal `wempty`.
// NOTE(review): the declarations of `empty` and `wempty` are not visible in
// this excerpt; presumably a default-constructed std::string / std::wstring.
59 TEST(UTFStringConversionsTest
, ConvertUTF8AndWideEmptyString
) {
60 // An empty std::wstring should be converted to an empty std::string,
64 EXPECT_EQ(empty
, WideToUTF8(wempty
));
65 EXPECT_EQ(wempty
, UTF8ToWide(empty
));
// Table-driven test of the length-taking UTF8ToWide overload. Each case holds
// the UTF-8 input (`utf8`), the expected wide output (`wide`) and the expected
// return value (`success`). In the table, cases marked `false` expect U+FFFD
// (\xfffd) in the output where the input was malformed.
68 TEST(UTFStringConversionsTest
, ConvertUTF8ToWide
) {
69 struct UTF8ToWideCase
{
74 // Regular UTF-8 input.
75 {"\xe4\xbd\xa0\xe5\xa5\xbd", L
"\x4f60\x597d", true},
76 // Non-character is passed through.
77 {"\xef\xbf\xbfHello", L
"\xffffHello", true},
78 // Truncated UTF-8 sequence.
79 {"\xe4\xa0\xe5\xa5\xbd", L
"\xfffd\x597d", false},
80 // Truncated off the end.
81 {"\xe5\xa5\xbd\xe4\xa0", L
"\x597d\xfffd", false},
82 // Non-shortest-form UTF-8.
83 {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L
"\xfffd\x597d", false},
84 // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
85 {"\xed\xb0\x80", L
"\xfffd", false},
86 // Non-BMP characters. The second is a non-character regarded as valid.
87 // The result will either be in UTF-16 or UTF-32.
88 #if defined(WCHAR_T_IS_UTF16)
// U+10300 and U+10FFFE expressed as surrogate pairs.
89 {"A\xF0\x90\x8C\x80z", L
"A\xd800\xdf00z", true},
90 {"A\xF4\x8F\xBF\xBEz", L
"A\xdbff\xdffez", true},
91 #elif defined(WCHAR_T_IS_UTF32)
// U+10300 and U+10FFFE as single 32-bit code units.
92 {"A\xF0\x90\x8C\x80z", L
"A\x10300z", true},
93 {"A\xF4\x8F\xBF\xBEz", L
"A\x10fffez", true},
// For every case, compare the conversion's return value with `success` and
// the converted text with `wide`.
97 for (size_t i
= 0; i
< ARRAYSIZE_UNSAFE(convert_cases
); i
++) {
98 std::wstring converted
;
99 EXPECT_EQ(convert_cases
[i
].success
,
100 UTF8ToWide(convert_cases
[i
].utf8
,
101 strlen(convert_cases
[i
].utf8
),
103 std::wstring
expected(convert_cases
[i
].wide
);
104 EXPECT_EQ(expected
, converted
);
107 // Manually test an embedded NUL: the input begins with a zero byte, so the
// length (3) is passed explicitly.
108 std::wstring converted
;
109 EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted
));
110 ASSERT_EQ(3U, converted
.length());
111 EXPECT_EQ(static_cast<wchar_t>(0), converted
[0]);
112 EXPECT_EQ('Z', converted
[1]);
113 EXPECT_EQ('\t', converted
[2]);
115 // Make sure that conversion replaces, not appends: `converted` still holds
// the three characters from the previous call.
116 EXPECT_TRUE(UTF8ToWide("B", 1, &converted
));
117 ASSERT_EQ(1U, converted
.length());
118 EXPECT_EQ('B', converted
[0]);
121 #if defined(WCHAR_T_IS_UTF16)
122 // This test is only valid when wchar_t == UTF-16.
// Table-driven test of the length-taking WideToUTF8 overload: each case holds
// the UTF-16 input (`utf16`), the expected UTF-8 output (`utf8`) and the
// expected return value (`success`).
123 TEST(UTFStringConversionsTest
, ConvertUTF16ToUTF8
) {
124 struct WideToUTF8Case
{
125 const wchar_t* utf16
;
128 } convert_cases
[] = {
129 // Regular UTF-16 input.
130 {L
"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
131 // Test a non-BMP character (surrogate pair for U+10300).
132 {L
"\xd800\xdf00", "\xF0\x90\x8C\x80", true},
133 // Non-characters are passed through.
134 {L
"\xffffHello", "\xEF\xBF\xBFHello", true},
135 {L
"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true},
136 // The first character is a truncated UTF-16 character (a high surrogate
// with no low surrogate after it); U+FFFD (EF BF BD) is expected instead.
137 {L
"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false},
138 // Truncated at the end.
139 {L
"\x597d\xd800", "\xe5\xa5\xbd\xef\xbf\xbd", false},
// NOTE(review): this loop uses `int i` with arraysize(), whereas the sibling
// loops in this file use `size_t i` with ARRAYSIZE_UNSAFE(); the comparison
// presumably mixes signed and unsigned operands - consider aligning.
142 for (int i
= 0; i
< arraysize(convert_cases
); i
++) {
143 std::string converted
;
144 EXPECT_EQ(convert_cases
[i
].success
,
145 WideToUTF8(convert_cases
[i
].utf16
,
146 wcslen(convert_cases
[i
].utf16
),
148 std::string
expected(convert_cases
[i
].utf8
);
149 EXPECT_EQ(expected
, converted
);
153 #elif defined(WCHAR_T_IS_UTF32)
154 // This test is only valid when wchar_t == UTF-32.
// Table-driven test of the length-taking WideToUTF8 overload: each case holds
// the UTF-32 input (`utf32`), the expected UTF-8 output (`utf8`) and the
// expected return value (`success`).
155 TEST(UTFStringConversionsTest
, ConvertUTF32ToUTF8
) {
156 struct WideToUTF8Case
{
157 const wchar_t* utf32
;
160 } convert_cases
[] = {
161 // Regular 16-bit input.
162 {L
"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
163 // Test a non-BMP character.
164 {L
"A\x10300z", "A\xF0\x90\x8C\x80z", true},
165 // Non-characters are passed through.
166 {L
"\xffffHello", "\xEF\xBF\xBFHello", true},
167 {L
"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true},
168 // Invalid Unicode code points (value above U+10FFFF).
169 {L
"\xfffffffHello", "\xEF\xBF\xBDHello", false},
170 // Lone surrogate code units (halves of a UTF-16 pair) are rejected in
// UTF-32 input; U+FFFD (EF BF BD) is expected in their place.
171 {L
"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false},
172 {L
"\xdc01Hello", "\xef\xbf\xbdHello", false},
// For every case, compare the conversion's return value with `success` and
// the converted text with `utf8`.
175 for (size_t i
= 0; i
< ARRAYSIZE_UNSAFE(convert_cases
); i
++) {
176 std::string converted
;
177 EXPECT_EQ(convert_cases
[i
].success
,
178 WideToUTF8(convert_cases
[i
].utf32
,
179 wcslen(convert_cases
[i
].utf32
),
181 std::string
expected(convert_cases
[i
].utf8
);
182 EXPECT_EQ(expected
, converted
);
185 #endif // defined(WCHAR_T_IS_UTF32)
// Converts a wide buffer that contains embedded NUL terminators
// ("foo\0bar\0baz\0...") with WideToUTF8 and compares the result, including
// its length, with the narrow buffer `multi`.
// NOTE(review): the contents of `multi` are not visible in this excerpt.
187 TEST(UTFStringConversionsTest
, ConvertMultiString
) {
188 static wchar_t wmulti
[] = {
189 L
'f', L
'o', L
'o', L
'\0',
190 L
'b', L
'a', L
'r', L
'\0',
191 L
'b', L
'a', L
'z', L
'\0',
194 static char multi
[] = {
// Copy the raw array, embedded NULs included, into a std::wstring through
// the buffer returned by WriteInto().
200 std::wstring wmultistring
;
201 memcpy(WriteInto(&wmultistring
, arraysize(wmulti
)), wmulti
, sizeof(wmulti
));
// The string is expected to be one element shorter than the array.
202 EXPECT_EQ(arraysize(wmulti
) - 1, wmultistring
.length());
// Build the expected narrow string the same way from `multi`.
203 std::string expected
;
204 memcpy(WriteInto(&expected
, arraysize(multi
)), multi
, sizeof(multi
));
205 EXPECT_EQ(arraysize(multi
) - 1, expected
.length());
// The conversion must keep the embedded NULs: both length and content
// have to match.
206 const std::string
& converted
= WideToUTF8(wmultistring
);
207 EXPECT_EQ(arraysize(multi
) - 1, converted
.length());
208 EXPECT_EQ(expected
, converted
);