BuildBot fix, compiler complains about array decay to pointer
[llvm-core.git] / unittests / Support / ConvertUTFTest.cpp
blobdd6e0df3688fd9dea6051f0758fd8cb3071cf248
1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
10 #include "llvm/Support/ConvertUTF.h"
11 #include "llvm/ADT/ArrayRef.h"
12 #include "gtest/gtest.h"
13 #include <string>
14 #include <vector>
16 using namespace llvm;
18 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
19 // Src is the look of disapproval.
20 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
21 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
22 std::string Result;
23 bool Success = convertUTF16ToUTF8String(Ref, Result);
24 EXPECT_TRUE(Success);
25 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
26 EXPECT_EQ(Expected, Result);
29 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
30 // Src is the look of disapproval.
31 static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
32 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
33 std::string Result;
34 bool Success = convertUTF16ToUTF8String(Ref, Result);
35 EXPECT_TRUE(Success);
36 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
37 EXPECT_EQ(Expected, Result);
40 TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
41 // Src is the look of disapproval.
42 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
43 StringRef Ref(Src, sizeof(Src) - 1);
44 SmallVector<UTF16, 5> Result;
45 bool Success = convertUTF8ToUTF16String(Ref, Result);
46 EXPECT_TRUE(Success);
47 static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
48 ASSERT_EQ(3u, Result.size());
49 for (int I = 0, E = 3; I != E; ++I)
50 EXPECT_EQ(Expected[I], Result[I]);
53 TEST(ConvertUTFTest, OddLengthInput) {
54 std::string Result;
55 bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);
56 EXPECT_FALSE(Success);
59 TEST(ConvertUTFTest, Empty) {
60 std::string Result;
61 bool Success = convertUTF16ToUTF8String(llvm::ArrayRef<char>(None), Result);
62 EXPECT_TRUE(Success);
63 EXPECT_TRUE(Result.empty());
66 TEST(ConvertUTFTest, HasUTF16BOM) {
67 bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
68 EXPECT_TRUE(HasBOM);
69 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
70 EXPECT_TRUE(HasBOM);
71 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
72 EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
73 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
74 EXPECT_TRUE(HasBOM);
76 HasBOM = hasUTF16ByteOrderMark(None);
77 EXPECT_FALSE(HasBOM);
78 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
79 EXPECT_FALSE(HasBOM);
82 TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) {
83 // Src is the look of disapproval.
84 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
85 ArrayRef<UTF16> SrcRef = makeArrayRef((const UTF16 *)Src, 4);
86 std::string Result;
87 bool Success = convertUTF16ToUTF8String(SrcRef, Result);
88 EXPECT_TRUE(Success);
89 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
90 EXPECT_EQ(Expected, Result);
93 TEST(ConvertUTFTest, ConvertUTF8toWide) {
94 // Src is the look of disapproval.
95 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
96 std::wstring Result;
97 bool Success = ConvertUTF8toWide((const char*)Src, Result);
98 EXPECT_TRUE(Success);
99 std::wstring Expected(L"\x0ca0_\x0ca0");
100 EXPECT_EQ(Expected, Result);
101 Result.clear();
102 Success = ConvertUTF8toWide(StringRef(Src, 7), Result);
103 EXPECT_TRUE(Success);
104 EXPECT_EQ(Expected, Result);
107 TEST(ConvertUTFTest, convertWideToUTF8) {
108 // Src is the look of disapproval.
109 static const wchar_t Src[] = L"\x0ca0_\x0ca0";
110 std::string Result;
111 bool Success = convertWideToUTF8(Src, Result);
112 EXPECT_TRUE(Success);
113 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
114 EXPECT_EQ(Expected, Result);
117 struct ConvertUTFResultContainer {
118 ConversionResult ErrorCode;
119 std::vector<unsigned> UnicodeScalars;
121 ConvertUTFResultContainer(ConversionResult ErrorCode)
122 : ErrorCode(ErrorCode) {}
124 ConvertUTFResultContainer
125 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
126 unsigned US2 = 0x110000, unsigned US3 = 0x110000,
127 unsigned US4 = 0x110000, unsigned US5 = 0x110000,
128 unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
129 ConvertUTFResultContainer Result(*this);
130 if (US0 != 0x110000)
131 Result.UnicodeScalars.push_back(US0);
132 if (US1 != 0x110000)
133 Result.UnicodeScalars.push_back(US1);
134 if (US2 != 0x110000)
135 Result.UnicodeScalars.push_back(US2);
136 if (US3 != 0x110000)
137 Result.UnicodeScalars.push_back(US3);
138 if (US4 != 0x110000)
139 Result.UnicodeScalars.push_back(US4);
140 if (US5 != 0x110000)
141 Result.UnicodeScalars.push_back(US5);
142 if (US6 != 0x110000)
143 Result.UnicodeScalars.push_back(US6);
144 if (US7 != 0x110000)
145 Result.UnicodeScalars.push_back(US7);
146 return Result;
150 std::pair<ConversionResult, std::vector<unsigned>>
151 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
152 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
154 const UTF8 *SourceNext = SourceStart;
155 std::vector<UTF32> Decoded(S.size(), 0);
156 UTF32 *TargetStart = Decoded.data();
158 auto ErrorCode =
159 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
160 Decoded.data() + Decoded.size(), lenientConversion);
162 Decoded.resize(TargetStart - Decoded.data());
164 return std::make_pair(ErrorCode, Decoded);
167 std::pair<ConversionResult, std::vector<unsigned>>
168 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
169 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
171 const UTF8 *SourceNext = SourceStart;
172 std::vector<UTF32> Decoded(S.size(), 0);
173 UTF32 *TargetStart = Decoded.data();
175 auto ErrorCode = ConvertUTF8toUTF32Partial(
176 &SourceNext, SourceStart + S.size(), &TargetStart,
177 Decoded.data() + Decoded.size(), lenientConversion);
179 Decoded.resize(TargetStart - Decoded.data());
181 return std::make_pair(ErrorCode, Decoded);
184 ::testing::AssertionResult
185 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
186 StringRef S, bool Partial = false) {
187 ConversionResult ErrorCode;
188 std::vector<unsigned> Decoded;
189 if (!Partial)
190 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
191 else
192 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
194 if (Expected.ErrorCode != ErrorCode)
195 return ::testing::AssertionFailure() << "Expected error code "
196 << Expected.ErrorCode << ", actual "
197 << ErrorCode;
199 if (Expected.UnicodeScalars != Decoded)
200 return ::testing::AssertionFailure()
201 << "Expected lenient decoded result:\n"
202 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
203 << "Actual result:\n" << ::testing::PrintToString(Decoded);
205 return ::testing::AssertionSuccess();
208 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
211 // 1-byte sequences
214 // U+0041 LATIN CAPITAL LETTER A
215 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
216 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
219 // 2-byte sequences
222 // U+0283 LATIN SMALL LETTER ESH
223 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
224 ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
225 "\xca\x83"));
227 // U+03BA GREEK SMALL LETTER KAPPA
228 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
229 // U+03C3 GREEK SMALL LETTER SIGMA
230 // U+03BC GREEK SMALL LETTER MU
231 // U+03B5 GREEK SMALL LETTER EPSILON
232 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
233 ConvertUTFResultContainer(conversionOK)
234 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
235 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
238 // 3-byte sequences
241 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
242 // U+6587 CJK UNIFIED IDEOGRAPH-6587
243 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
244 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
245 "\xe4\xbe\x8b\xe6\x96\x87"));
247 // U+D55C HANGUL SYLLABLE HAN
248 // U+AE00 HANGUL SYLLABLE GEUL
249 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
250 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
251 "\xed\x95\x9c\xea\xb8\x80"));
253 // U+1112 HANGUL CHOSEONG HIEUH
254 // U+1161 HANGUL JUNGSEONG A
255 // U+11AB HANGUL JONGSEONG NIEUN
256 // U+1100 HANGUL CHOSEONG KIYEOK
257 // U+1173 HANGUL JUNGSEONG EU
258 // U+11AF HANGUL JONGSEONG RIEUL
259 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
260 ConvertUTFResultContainer(conversionOK)
261 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
262 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
263 "\xe1\x86\xaf"));
266 // 4-byte sequences
269 // U+E0100 VARIATION SELECTOR-17
270 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
271 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
272 "\xf3\xa0\x84\x80"));
275 // First possible sequence of a certain length
278 // U+0000 NULL
279 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
280 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
281 StringRef("\x00", 1)));
283 // U+0080 PADDING CHARACTER
284 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
285 ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
286 "\xc2\x80"));
288 // U+0800 SAMARITAN LETTER ALAF
289 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
290 ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
291 "\xe0\xa0\x80"));
293 // U+10000 LINEAR B SYLLABLE B008 A
294 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
295 ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
296 "\xf0\x90\x80\x80"));
298 // U+200000 (invalid)
299 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
300 ConvertUTFResultContainer(sourceIllegal)
301 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
302 "\xf8\x88\x80\x80\x80"));
304 // U+4000000 (invalid)
305 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
306 ConvertUTFResultContainer(sourceIllegal)
307 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
308 "\xfc\x84\x80\x80\x80\x80"));
311 // Last possible sequence of a certain length
314 // U+007F DELETE
315 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
316 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
318 // U+07FF (unassigned)
319 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
320 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
321 "\xdf\xbf"));
323 // U+FFFF (noncharacter)
324 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
325 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
326 "\xef\xbf\xbf"));
328 // U+1FFFFF (invalid)
329 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
330 ConvertUTFResultContainer(sourceIllegal)
331 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
332 "\xf7\xbf\xbf\xbf"));
334 // U+3FFFFFF (invalid)
335 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
336 ConvertUTFResultContainer(sourceIllegal)
337 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
338 "\xfb\xbf\xbf\xbf\xbf"));
340 // U+7FFFFFFF (invalid)
341 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
342 ConvertUTFResultContainer(sourceIllegal)
343 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
344 "\xfd\xbf\xbf\xbf\xbf\xbf"));
347 // Other boundary conditions
350 // U+D7FF (unassigned)
351 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
352 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
353 "\xed\x9f\xbf"));
355 // U+E000 (private use)
356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
357 ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
358 "\xee\x80\x80"));
360 // U+FFFD REPLACEMENT CHARACTER
361 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
362 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
363 "\xef\xbf\xbd"));
365 // U+10FFFF (noncharacter)
366 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
367 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
368 "\xf4\x8f\xbf\xbf"));
370 // U+110000 (invalid)
371 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
372 ConvertUTFResultContainer(sourceIllegal)
373 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
374 "\xf4\x90\x80\x80"));
377 // Unexpected continuation bytes
380 // A sequence of unexpected continuation bytes that don't follow a first
381 // byte, every byte is a maximal subpart.
383 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
384 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
385 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
386 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
387 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
388 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
389 "\x80\x80"));
390 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
391 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
392 "\x80\xbf"));
393 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
394 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
395 "\xbf\x80"));
396 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
397 ConvertUTFResultContainer(sourceIllegal)
398 .withScalars(0xfffd, 0xfffd, 0xfffd),
399 "\x80\xbf\x80"));
400 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
401 ConvertUTFResultContainer(sourceIllegal)
402 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
403 "\x80\xbf\x80\xbf"));
404 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
405 ConvertUTFResultContainer(sourceIllegal)
406 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
407 "\x80\xbf\x82\xbf\xaa"));
408 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
409 ConvertUTFResultContainer(sourceIllegal)
410 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
411 "\xaa\xb0\xbb\xbf\xaa\xa0"));
412 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
413 ConvertUTFResultContainer(sourceIllegal)
414 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
415 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
417 // All continuation bytes (0x80--0xbf).
418 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
419 ConvertUTFResultContainer(sourceIllegal)
420 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
421 0xfffd, 0xfffd, 0xfffd, 0xfffd)
422 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
423 0xfffd, 0xfffd, 0xfffd, 0xfffd)
424 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
425 0xfffd, 0xfffd, 0xfffd, 0xfffd)
426 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
427 0xfffd, 0xfffd, 0xfffd, 0xfffd)
428 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
429 0xfffd, 0xfffd, 0xfffd, 0xfffd)
430 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
431 0xfffd, 0xfffd, 0xfffd, 0xfffd)
432 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
433 0xfffd, 0xfffd, 0xfffd, 0xfffd)
434 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
435 0xfffd, 0xfffd, 0xfffd, 0xfffd),
436 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
437 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
438 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
439 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
442 // Lonely start bytes
445 // Start bytes of 2-byte sequences (0xc0--0xdf).
446 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
447 ConvertUTFResultContainer(sourceIllegal)
448 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
449 0xfffd, 0xfffd, 0xfffd, 0xfffd)
450 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
451 0xfffd, 0xfffd, 0xfffd, 0xfffd)
452 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
453 0xfffd, 0xfffd, 0xfffd, 0xfffd)
454 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
455 0xfffd, 0xfffd, 0xfffd, 0xfffd),
456 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
457 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
459 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
460 ConvertUTFResultContainer(sourceIllegal)
461 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
462 0xfffd, 0x0020, 0xfffd, 0x0020)
463 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
464 0xfffd, 0x0020, 0xfffd, 0x0020)
465 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
466 0xfffd, 0x0020, 0xfffd, 0x0020)
467 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
468 0xfffd, 0x0020, 0xfffd, 0x0020)
469 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
470 0xfffd, 0x0020, 0xfffd, 0x0020)
471 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
472 0xfffd, 0x0020, 0xfffd, 0x0020)
473 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
474 0xfffd, 0x0020, 0xfffd, 0x0020)
475 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
476 0xfffd, 0x0020, 0xfffd, 0x0020),
477 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
478 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
479 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
480 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
482 // Start bytes of 3-byte sequences (0xe0--0xef).
483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
484 ConvertUTFResultContainer(sourceIllegal)
485 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
486 0xfffd, 0xfffd, 0xfffd, 0xfffd)
487 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
488 0xfffd, 0xfffd, 0xfffd, 0xfffd),
489 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
491 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
492 ConvertUTFResultContainer(sourceIllegal)
493 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
494 0xfffd, 0x0020, 0xfffd, 0x0020)
495 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
496 0xfffd, 0x0020, 0xfffd, 0x0020)
497 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
498 0xfffd, 0x0020, 0xfffd, 0x0020)
499 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
500 0xfffd, 0x0020, 0xfffd, 0x0020),
501 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
502 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
504 // Start bytes of 4-byte sequences (0xf0--0xf7).
505 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
506 ConvertUTFResultContainer(sourceIllegal)
507 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
508 0xfffd, 0xfffd, 0xfffd, 0xfffd),
509 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
512 ConvertUTFResultContainer(sourceIllegal)
513 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
514 0xfffd, 0x0020, 0xfffd, 0x0020)
515 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
516 0xfffd, 0x0020, 0xfffd, 0x0020),
517 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
519 // Start bytes of 5-byte sequences (0xf8--0xfb).
520 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
521 ConvertUTFResultContainer(sourceIllegal)
522 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
523 "\xf8\xf9\xfa\xfb"));
525 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
526 ConvertUTFResultContainer(sourceIllegal)
527 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
528 0xfffd, 0x0020, 0xfffd, 0x0020),
529 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
531 // Start bytes of 6-byte sequences (0xfc--0xfd).
532 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
533 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
534 "\xfc\xfd"));
536 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
537 ConvertUTFResultContainer(sourceIllegal)
538 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
539 "\xfc\x20\xfd\x20"));
542 // Other bytes (0xc0--0xc1, 0xfe--0xff).
545 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
546 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
547 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
548 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
550 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
551 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
552 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
554 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
555 ConvertUTFResultContainer(sourceIllegal)
556 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
557 "\xc0\xc1\xfe\xff"));
559 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
560 ConvertUTFResultContainer(sourceIllegal)
561 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
562 "\xfe\xfe\xff\xff"));
564 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
565 ConvertUTFResultContainer(sourceIllegal)
566 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
567 "\xfe\x80\x80\x80\x80\x80"));
569 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
570 ConvertUTFResultContainer(sourceIllegal)
571 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
572 "\xff\x80\x80\x80\x80\x80"));
574 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
575 ConvertUTFResultContainer(sourceIllegal)
576 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
577 0xfffd, 0x0020, 0xfffd, 0x0020),
578 "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
581 // Sequences with one continuation byte missing
584 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
585 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
586 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
587 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
589 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
590 "\xe0\xa0"));
591 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
592 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
593 "\xe0\xbf"));
594 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
595 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
596 "\xe1\x80"));
597 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
598 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
599 "\xec\xbf"));
600 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
601 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
602 "\xed\x80"));
603 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
604 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
605 "\xed\x9f"));
606 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
607 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
608 "\xee\x80"));
609 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
610 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
611 "\xef\xbf"));
612 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
613 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
614 "\xf0\x90\x80"));
615 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
616 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
617 "\xf0\xbf\xbf"));
618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
620 "\xf1\x80\x80"));
621 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
622 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
623 "\xf3\xbf\xbf"));
624 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
625 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
626 "\xf4\x80\x80"));
627 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
628 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
629 "\xf4\x8f\xbf"));
631 // Overlong sequences with one trailing byte missing.
632 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
633 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
634 "\xc0"));
635 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
636 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
637 "\xc1"));
638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
639 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
640 "\xe0\x80"));
641 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
642 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
643 "\xe0\x9f"));
644 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
645 ConvertUTFResultContainer(sourceIllegal)
646 .withScalars(0xfffd, 0xfffd, 0xfffd),
647 "\xf0\x80\x80"));
648 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
649 ConvertUTFResultContainer(sourceIllegal)
650 .withScalars(0xfffd, 0xfffd, 0xfffd),
651 "\xf0\x8f\x80"));
652 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
653 ConvertUTFResultContainer(sourceIllegal)
654 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
655 "\xf8\x80\x80\x80"));
656 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
657 ConvertUTFResultContainer(sourceIllegal)
658 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
659 "\xfc\x80\x80\x80\x80"));
661 // Sequences that represent surrogates with one trailing byte missing.
662 // High surrogates
663 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
664 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
665 "\xed\xa0"));
666 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
667 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
668 "\xed\xac"));
669 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
670 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
671 "\xed\xaf"));
672 // Low surrogates
673 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
674 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
675 "\xed\xb0"));
676 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
677 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
678 "\xed\xb4"));
679 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
680 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
681 "\xed\xbf"));
683 // Ill-formed 4-byte sequences.
684 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
685 // U+1100xx (invalid)
686 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
687 ConvertUTFResultContainer(sourceIllegal)
688 .withScalars(0xfffd, 0xfffd, 0xfffd),
689 "\xf4\x90\x80"));
690 // U+13FBxx (invalid)
691 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
692 ConvertUTFResultContainer(sourceIllegal)
693 .withScalars(0xfffd, 0xfffd, 0xfffd),
694 "\xf4\xbf\xbf"));
695 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
696 ConvertUTFResultContainer(sourceIllegal)
697 .withScalars(0xfffd, 0xfffd, 0xfffd),
698 "\xf5\x80\x80"));
699 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
700 ConvertUTFResultContainer(sourceIllegal)
701 .withScalars(0xfffd, 0xfffd, 0xfffd),
702 "\xf6\x80\x80"));
703 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
704 ConvertUTFResultContainer(sourceIllegal)
705 .withScalars(0xfffd, 0xfffd, 0xfffd),
706 "\xf7\x80\x80"));
707 // U+1FFBxx (invalid)
708 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
709 ConvertUTFResultContainer(sourceIllegal)
710 .withScalars(0xfffd, 0xfffd, 0xfffd),
711 "\xf7\xbf\xbf"));
713 // Ill-formed 5-byte sequences.
714 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
715 // U+2000xx (invalid)
716 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
717 ConvertUTFResultContainer(sourceIllegal)
718 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
719 "\xf8\x88\x80\x80"));
720 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
721 ConvertUTFResultContainer(sourceIllegal)
722 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
723 "\xf8\xbf\xbf\xbf"));
724 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
725 ConvertUTFResultContainer(sourceIllegal)
726 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
727 "\xf9\x80\x80\x80"));
728 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
729 ConvertUTFResultContainer(sourceIllegal)
730 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
731 "\xfa\x80\x80\x80"));
732 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
733 ConvertUTFResultContainer(sourceIllegal)
734 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
735 "\xfb\x80\x80\x80"));
736 // U+3FFFFxx (invalid)
737 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
738 ConvertUTFResultContainer(sourceIllegal)
739 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
740 "\xfb\xbf\xbf\xbf"));
742 // Ill-formed 6-byte sequences.
743 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
744 // U+40000xx (invalid)
745 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
746 ConvertUTFResultContainer(sourceIllegal)
747 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
748 "\xfc\x84\x80\x80\x80"));
749 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
750 ConvertUTFResultContainer(sourceIllegal)
751 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
752 "\xfc\xbf\xbf\xbf\xbf"));
753 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
754 ConvertUTFResultContainer(sourceIllegal)
755 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
756 "\xfd\x80\x80\x80\x80"));
757 // U+7FFFFFxx (invalid)
758 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
759 ConvertUTFResultContainer(sourceIllegal)
760 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
761 "\xfd\xbf\xbf\xbf\xbf"));
764 // Sequences with two continuation bytes missing
767 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
768 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
769 "\xf0\x90"));
770 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
771 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
772 "\xf0\xbf"));
773 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
774 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
775 "\xf1\x80"));
776 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
777 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
778 "\xf3\xbf"));
779 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
780 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
781 "\xf4\x80"));
782 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
783 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
784 "\xf4\x8f"));
786 // Overlong sequences with two trailing bytes missing.
787 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
788 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
789 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
790 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
791 "\xf0\x80"));
792 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
793 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
794 "\xf0\x8f"));
795 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
796 ConvertUTFResultContainer(sourceIllegal)
797 .withScalars(0xfffd, 0xfffd, 0xfffd),
798 "\xf8\x80\x80"));
799 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
800 ConvertUTFResultContainer(sourceIllegal)
801 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
802 "\xfc\x80\x80\x80"));
804 // Sequences that represent surrogates with two trailing bytes missing.
805 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
806 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
808 // Ill-formed 4-byte sequences.
809 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
810 // U+110yxx (invalid)
811 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
812 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
813 "\xf4\x90"));
814 // U+13Fyxx (invalid)
815 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
816 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
817 "\xf4\xbf"));
818 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
819 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
820 "\xf5\x80"));
821 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
822 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
823 "\xf6\x80"));
824 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
825 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
826 "\xf7\x80"));
827 // U+1FFyxx (invalid)
828 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
829 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
830 "\xf7\xbf"));
832 // Ill-formed 5-byte sequences.
833 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
834 // U+200yxx (invalid)
835 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
836 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
837 "\xf8\x88\x80"));
838 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
839 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
840 "\xf8\xbf\xbf"));
841 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
842 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
843 "\xf9\x80\x80"));
844 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
845 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
846 "\xfa\x80\x80"));
847 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
848 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
849 "\xfb\x80\x80"));
850 // U+3FFFyxx (invalid)
851 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
852 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
853 "\xfb\xbf\xbf"));
855 // Ill-formed 6-byte sequences.
856 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
857 // U+4000yxx (invalid)
858 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
859 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
860 "\xfc\x84\x80\x80"));
861 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
862 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
863 "\xfc\xbf\xbf\xbf"));
864 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
865 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
866 "\xfd\x80\x80\x80"));
867 // U+7FFFFyxx (invalid)
868 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
869 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
870 "\xfd\xbf\xbf\xbf"));
873 // Sequences with three continuation bytes missing
876 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
877 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
878 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
879 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
880 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
881 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
882 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
883 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
884 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
885 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
887 // Broken overlong sequences.
888 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
889 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
890 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
891 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
892 "\xf8\x80"));
893 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
894 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
895 "\xfc\x80\x80"));
897 // Ill-formed 4-byte sequences.
898 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
899 // U+14yyxx (invalid)
900 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
901 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
902 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
903 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
904 // U+1Cyyxx (invalid)
905 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
906 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
908 // Ill-formed 5-byte sequences.
909 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
910 // U+20yyxx (invalid)
911 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
912 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
913 "\xf8\x88"));
914 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
915 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
916 "\xf8\xbf"));
917 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
918 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
919 "\xf9\x80"));
920 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
921 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
922 "\xfa\x80"));
923 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
924 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
925 "\xfb\x80"));
926 // U+3FCyyxx (invalid)
927 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
928 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
929 "\xfb\xbf"));
931 // Ill-formed 6-byte sequences.
932 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
933 // U+400yyxx (invalid)
934 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
935 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
936 "\xfc\x84\x80"));
937 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
938 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
939 "\xfc\xbf\xbf"));
940 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
941 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
942 "\xfd\x80\x80"));
943 // U+7FFCyyxx (invalid)
944 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
945 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
946 "\xfd\xbf\xbf"));
949 // Sequences with four continuation bytes missing
952 // Ill-formed 5-byte sequences.
953 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
954 // U+uzyyxx (invalid)
955 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
956 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
957 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
958 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
959 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
960 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
961 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
962 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
963 // U+3zyyxx (invalid)
964 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
965 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
967 // Broken overlong sequences.
968 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
969 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
970 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
971 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
972 "\xfc\x80"));
974 // Ill-formed 6-byte sequences.
975 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
976 // U+uzzyyxx (invalid)
977 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
978 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
979 "\xfc\x84"));
980 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
981 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
982 "\xfc\xbf"));
983 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
984 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
985 "\xfd\x80"));
986 // U+7Fzzyyxx (invalid)
987 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
988 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
989 "\xfd\xbf"));
992 // Sequences with five continuation bytes missing
995 // Ill-formed 6-byte sequences.
996 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
997 // U+uzzyyxx (invalid)
998 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
999 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
1000 // U+uuzzyyxx (invalid)
1001 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1002 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
1005 // Consecutive sequences with trailing bytes missing
1008 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1009 ConvertUTFResultContainer(sourceIllegal)
1010 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1011 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1012 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
1013 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1014 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1015 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1016 "\xc0" "\xe0\x80" "\xf0\x80\x80"
1017 "\xf8\x80\x80\x80"
1018 "\xfc\x80\x80\x80\x80"
1019 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
1020 "\xfb\xbf\xbf\xbf"
1021 "\xfd\xbf\xbf\xbf\xbf"));
1024 // Overlong UTF-8 sequences
1027 // U+002F SOLIDUS
1028 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1029 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
1031 // Overlong sequences of the above.
1032 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1033 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1034 "\xc0\xaf"));
1035 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1036 ConvertUTFResultContainer(sourceIllegal)
1037 .withScalars(0xfffd, 0xfffd, 0xfffd),
1038 "\xe0\x80\xaf"));
1039 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1040 ConvertUTFResultContainer(sourceIllegal)
1041 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1042 "\xf0\x80\x80\xaf"));
1043 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1044 ConvertUTFResultContainer(sourceIllegal)
1045 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1046 "\xf8\x80\x80\x80\xaf"));
1047 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1048 ConvertUTFResultContainer(sourceIllegal)
1049 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1050 "\xfc\x80\x80\x80\x80\xaf"));
1052 // U+0000 NULL
1053 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1054 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1055 StringRef("\x00", 1)));
1057 // Overlong sequences of the above.
1058 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1059 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1060 "\xc0\x80"));
1061 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1062 ConvertUTFResultContainer(sourceIllegal)
1063 .withScalars(0xfffd, 0xfffd, 0xfffd),
1064 "\xe0\x80\x80"));
1065 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1066 ConvertUTFResultContainer(sourceIllegal)
1067 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1068 "\xf0\x80\x80\x80"));
1069 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1070 ConvertUTFResultContainer(sourceIllegal)
1071 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1072 "\xf8\x80\x80\x80\x80"));
1073 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1074 ConvertUTFResultContainer(sourceIllegal)
1075 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1076 "\xfc\x80\x80\x80\x80\x80"));
1078 // Other overlong sequences.
1079 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1080 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1081 "\xc0\xbf"));
1082 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1083 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1084 "\xc1\x80"));
1085 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1086 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1087 "\xc1\xbf"));
1088 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1089 ConvertUTFResultContainer(sourceIllegal)
1090 .withScalars(0xfffd, 0xfffd, 0xfffd),
1091 "\xe0\x9f\xbf"));
1092 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1093 ConvertUTFResultContainer(sourceIllegal)
1094 .withScalars(0xfffd, 0xfffd, 0xfffd),
1095 "\xed\xa0\x80"));
1096 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1097 ConvertUTFResultContainer(sourceIllegal)
1098 .withScalars(0xfffd, 0xfffd, 0xfffd),
1099 "\xed\xbf\xbf"));
1100 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1101 ConvertUTFResultContainer(sourceIllegal)
1102 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1103 "\xf0\x8f\x80\x80"));
1104 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1105 ConvertUTFResultContainer(sourceIllegal)
1106 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1107 "\xf0\x8f\xbf\xbf"));
1108 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1109 ConvertUTFResultContainer(sourceIllegal)
1110 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1111 "\xf8\x87\xbf\xbf\xbf"));
1112 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113 ConvertUTFResultContainer(sourceIllegal)
1114 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1115 "\xfc\x83\xbf\xbf\xbf\xbf"));
1118 // Isolated surrogates
1121 // Unicode 6.3.0:
1123 // D71. High-surrogate code point: A Unicode code point in the range
1124 // U+D800 to U+DBFF.
1126 // D73. Low-surrogate code point: A Unicode code point in the range
1127 // U+DC00 to U+DFFF.
1129 // Note: U+E0100 is <DB40 DD00> in UTF16.
1131 // High surrogates
1133 // U+D800
1134 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1135 ConvertUTFResultContainer(sourceIllegal)
1136 .withScalars(0xfffd, 0xfffd, 0xfffd),
1137 "\xed\xa0\x80"));
1139 // U+DB40
1140 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1141 ConvertUTFResultContainer(sourceIllegal)
1142 .withScalars(0xfffd, 0xfffd, 0xfffd),
1143 "\xed\xac\xa0"));
1145 // U+DBFF
1146 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1147 ConvertUTFResultContainer(sourceIllegal)
1148 .withScalars(0xfffd, 0xfffd, 0xfffd),
1149 "\xed\xaf\xbf"));
1151 // Low surrogates
1153 // U+DC00
1154 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1155 ConvertUTFResultContainer(sourceIllegal)
1156 .withScalars(0xfffd, 0xfffd, 0xfffd),
1157 "\xed\xb0\x80"));
1159 // U+DD00
1160 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1161 ConvertUTFResultContainer(sourceIllegal)
1162 .withScalars(0xfffd, 0xfffd, 0xfffd),
1163 "\xed\xb4\x80"));
1165 // U+DFFF
1166 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1167 ConvertUTFResultContainer(sourceIllegal)
1168 .withScalars(0xfffd, 0xfffd, 0xfffd),
1169 "\xed\xbf\xbf"));
1171 // Surrogate pairs
1173 // U+D800 U+DC00
1174 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1175 ConvertUTFResultContainer(sourceIllegal)
1176 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1177 "\xed\xa0\x80\xed\xb0\x80"));
1179 // U+D800 U+DD00
1180 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1181 ConvertUTFResultContainer(sourceIllegal)
1182 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1183 "\xed\xa0\x80\xed\xb4\x80"));
1185 // U+D800 U+DFFF
1186 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1187 ConvertUTFResultContainer(sourceIllegal)
1188 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1189 "\xed\xa0\x80\xed\xbf\xbf"));
1191 // U+DB40 U+DC00
1192 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1193 ConvertUTFResultContainer(sourceIllegal)
1194 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1195 "\xed\xac\xa0\xed\xb0\x80"));
1197 // U+DB40 U+DD00
1198 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1199 ConvertUTFResultContainer(sourceIllegal)
1200 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1201 "\xed\xac\xa0\xed\xb4\x80"));
1203 // U+DB40 U+DFFF
1204 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1205 ConvertUTFResultContainer(sourceIllegal)
1206 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1207 "\xed\xac\xa0\xed\xbf\xbf"));
1209 // U+DBFF U+DC00
1210 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1211 ConvertUTFResultContainer(sourceIllegal)
1212 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1213 "\xed\xaf\xbf\xed\xb0\x80"));
1215 // U+DBFF U+DD00
1216 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1217 ConvertUTFResultContainer(sourceIllegal)
1218 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1219 "\xed\xaf\xbf\xed\xb4\x80"));
1221 // U+DBFF U+DFFF
1222 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1223 ConvertUTFResultContainer(sourceIllegal)
1224 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1225 "\xed\xaf\xbf\xed\xbf\xbf"));
1228 // Noncharacters
1231 // Unicode 6.3.0:
1233 // D14. Noncharacter: A code point that is permanently reserved for
1234 // internal use and that should never be interchanged. Noncharacters
1235 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1236 // and the values U+FDD0..U+FDEF.
1238 // U+FFFE
1239 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1240 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1241 "\xef\xbf\xbe"));
1243 // U+FFFF
1244 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1245 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1246 "\xef\xbf\xbf"));
1248 // U+1FFFE
1249 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1250 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1251 "\xf0\x9f\xbf\xbe"));
1253 // U+1FFFF
1254 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1255 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1256 "\xf0\x9f\xbf\xbf"));
1258 // U+2FFFE
1259 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1260 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1261 "\xf0\xaf\xbf\xbe"));
1263 // U+2FFFF
1264 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1265 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1266 "\xf0\xaf\xbf\xbf"));
1268 // U+3FFFE
1269 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1270 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1271 "\xf0\xbf\xbf\xbe"));
1273 // U+3FFFF
1274 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1275 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1276 "\xf0\xbf\xbf\xbf"));
1278 // U+4FFFE
1279 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1280 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1281 "\xf1\x8f\xbf\xbe"));
1283 // U+4FFFF
1284 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1285 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1286 "\xf1\x8f\xbf\xbf"));
1288 // U+5FFFE
1289 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1290 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1291 "\xf1\x9f\xbf\xbe"));
1293 // U+5FFFF
1294 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1295 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1296 "\xf1\x9f\xbf\xbf"));
1298 // U+6FFFE
1299 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1300 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1301 "\xf1\xaf\xbf\xbe"));
1303 // U+6FFFF
1304 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1305 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1306 "\xf1\xaf\xbf\xbf"));
1308 // U+7FFFE
1309 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1310 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1311 "\xf1\xbf\xbf\xbe"));
1313 // U+7FFFF
1314 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1315 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1316 "\xf1\xbf\xbf\xbf"));
1318 // U+8FFFE
1319 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1320 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1321 "\xf2\x8f\xbf\xbe"));
1323 // U+8FFFF
1324 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1325 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1326 "\xf2\x8f\xbf\xbf"));
1328 // U+9FFFE
1329 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1330 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1331 "\xf2\x9f\xbf\xbe"));
1333 // U+9FFFF
1334 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1335 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1336 "\xf2\x9f\xbf\xbf"));
1338 // U+AFFFE
1339 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1340 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1341 "\xf2\xaf\xbf\xbe"));
1343 // U+AFFFF
1344 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1345 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1346 "\xf2\xaf\xbf\xbf"));
1348 // U+BFFFE
1349 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1350 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1351 "\xf2\xbf\xbf\xbe"));
1353 // U+BFFFF
1354 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1355 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1356 "\xf2\xbf\xbf\xbf"));
1358 // U+CFFFE
1359 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1360 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1361 "\xf3\x8f\xbf\xbe"));
1363 // U+CFFFF
1364 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1365 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1366 "\xf3\x8f\xbf\xbf"));
1368 // U+DFFFE
1369 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1370 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1371 "\xf3\x9f\xbf\xbe"));
1373 // U+DFFFF
1374 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1375 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1376 "\xf3\x9f\xbf\xbf"));
1378 // U+EFFFE
1379 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1380 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1381 "\xf3\xaf\xbf\xbe"));
1383 // U+EFFFF
1384 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1385 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1386 "\xf3\xaf\xbf\xbf"));
1388 // U+FFFFE
1389 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1390 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1391 "\xf3\xbf\xbf\xbe"));
1393 // U+FFFFF
1394 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1395 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1396 "\xf3\xbf\xbf\xbf"));
1398 // U+10FFFE
1399 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1400 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1401 "\xf4\x8f\xbf\xbe"));
1403 // U+10FFFF
1404 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1405 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1406 "\xf4\x8f\xbf\xbf"));
1408 // U+FDD0
1409 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1410 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1411 "\xef\xb7\x90"));
1413 // U+FDD1
1414 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1415 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1416 "\xef\xb7\x91"));
1418 // U+FDD2
1419 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1420 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1421 "\xef\xb7\x92"));
1423 // U+FDD3
1424 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1425 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1426 "\xef\xb7\x93"));
1428 // U+FDD4
1429 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1430 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1431 "\xef\xb7\x94"));
1433 // U+FDD5
1434 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1435 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1436 "\xef\xb7\x95"));
1438 // U+FDD6
1439 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1440 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1441 "\xef\xb7\x96"));
1443 // U+FDD7
1444 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1445 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1446 "\xef\xb7\x97"));
1448 // U+FDD8
1449 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1450 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1451 "\xef\xb7\x98"));
1453 // U+FDD9
1454 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1455 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1456 "\xef\xb7\x99"));
1458 // U+FDDA
1459 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1460 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1461 "\xef\xb7\x9a"));
1463 // U+FDDB
1464 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1465 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1466 "\xef\xb7\x9b"));
1468 // U+FDDC
1469 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1470 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1471 "\xef\xb7\x9c"));
1473 // U+FDDD
1474 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1475 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1476 "\xef\xb7\x9d"));
1478 // U+FDDE
1479 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1480 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1481 "\xef\xb7\x9e"));
1483 // U+FDDF
1484 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1485 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1486 "\xef\xb7\x9f"));
1488 // U+FDE0
1489 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1490 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1491 "\xef\xb7\xa0"));
1493 // U+FDE1
1494 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1495 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1496 "\xef\xb7\xa1"));
1498 // U+FDE2
1499 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1500 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1501 "\xef\xb7\xa2"));
1503 // U+FDE3
1504 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1505 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1506 "\xef\xb7\xa3"));
1508 // U+FDE4
1509 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1510 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1511 "\xef\xb7\xa4"));
1513 // U+FDE5
1514 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1515 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1516 "\xef\xb7\xa5"));
1518 // U+FDE6
1519 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1520 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1521 "\xef\xb7\xa6"));
1523 // U+FDE7
1524 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1525 ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1526 "\xef\xb7\xa7"));
1528 // U+FDE8
1529 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1530 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1531 "\xef\xb7\xa8"));
1533 // U+FDE9
1534 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1535 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1536 "\xef\xb7\xa9"));
1538 // U+FDEA
1539 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1540 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1541 "\xef\xb7\xaa"));
1543 // U+FDEB
1544 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1545 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1546 "\xef\xb7\xab"));
1548 // U+FDEC
1549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1550 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1551 "\xef\xb7\xac"));
1553 // U+FDED
1554 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1555 ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1556 "\xef\xb7\xad"));
1558 // U+FDEE
1559 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1560 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1561 "\xef\xb7\xae"));
1563 // U+FDEF
1564 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1565 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1566 "\xef\xb7\xaf"));
1568 // U+FDF0
1569 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1570 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1571 "\xef\xb7\xb0"));
1573 // U+FDF1
1574 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1575 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1576 "\xef\xb7\xb1"));
1578 // U+FDF2
1579 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1580 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1581 "\xef\xb7\xb2"));
1583 // U+FDF3
1584 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1585 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1586 "\xef\xb7\xb3"));
1588 // U+FDF4
1589 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1590 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1591 "\xef\xb7\xb4"));
1593 // U+FDF5
1594 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1595 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1596 "\xef\xb7\xb5"));
1598 // U+FDF6
1599 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1600 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1601 "\xef\xb7\xb6"));
1603 // U+FDF7
1604 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1605 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1606 "\xef\xb7\xb7"));
1608 // U+FDF8
1609 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1610 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1611 "\xef\xb7\xb8"));
1613 // U+FDF9
1614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1615 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1616 "\xef\xb7\xb9"));
1618 // U+FDFA
1619 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1620 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1621 "\xef\xb7\xba"));
1623 // U+FDFB
1624 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1625 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1626 "\xef\xb7\xbb"));
1628 // U+FDFC
1629 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1630 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1631 "\xef\xb7\xbc"));
1633 // U+FDFD
1634 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1635 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1636 "\xef\xb7\xbd"));
1638 // U+FDFE
1639 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1640 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1641 "\xef\xb7\xbe"));
1643 // U+FDFF
1644 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1645 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1646 "\xef\xb7\xbf"));
1649 TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1650 // U+0041 LATIN CAPITAL LETTER A
1651 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1652 ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1653 "\x41", true));
1656 // Sequences with one continuation byte missing
1659 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1660 ConvertUTFResultContainer(sourceExhausted),
1661 "\xc2", true));
1662 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1663 ConvertUTFResultContainer(sourceExhausted),
1664 "\xdf", true));
1665 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1666 ConvertUTFResultContainer(sourceExhausted),
1667 "\xe0\xa0", true));
1668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1669 ConvertUTFResultContainer(sourceExhausted),
1670 "\xe0\xbf", true));
1671 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1672 ConvertUTFResultContainer(sourceExhausted),
1673 "\xe1\x80", true));
1674 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1675 ConvertUTFResultContainer(sourceExhausted),
1676 "\xec\xbf", true));
1677 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1678 ConvertUTFResultContainer(sourceExhausted),
1679 "\xed\x80", true));
1680 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1681 ConvertUTFResultContainer(sourceExhausted),
1682 "\xed\x9f", true));
1683 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1684 ConvertUTFResultContainer(sourceExhausted),
1685 "\xee\x80", true));
1686 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1687 ConvertUTFResultContainer(sourceExhausted),
1688 "\xef\xbf", true));
1689 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1690 ConvertUTFResultContainer(sourceExhausted),
1691 "\xf0\x90\x80", true));
1692 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1693 ConvertUTFResultContainer(sourceExhausted),
1694 "\xf0\xbf\xbf", true));
1695 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1696 ConvertUTFResultContainer(sourceExhausted),
1697 "\xf1\x80\x80", true));
1698 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1699 ConvertUTFResultContainer(sourceExhausted),
1700 "\xf3\xbf\xbf", true));
1701 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1702 ConvertUTFResultContainer(sourceExhausted),
1703 "\xf4\x80\x80", true));
1704 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1705 ConvertUTFResultContainer(sourceExhausted),
1706 "\xf4\x8f\xbf", true));
1708 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1709 ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1710 "\x41\xc2", true));