Bump version to 19.1.0-rc3
[llvm-project.git] / llvm / unittests / Support / ConvertUTFTest.cpp
blob6e75fbae0969ba1bf0a76c4d79a123e405a8dae7
1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "llvm/Support/ConvertUTF.h"
10 #include "llvm/ADT/ArrayRef.h"
11 #include "gtest/gtest.h"
12 #include <string>
13 #include <vector>
15 using namespace llvm;
17 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
18 // Src is the look of disapproval.
19 alignas(UTF16) static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
20 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
21 std::string Result;
22 bool Success = convertUTF16ToUTF8String(Ref, Result);
23 EXPECT_TRUE(Success);
24 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
25 EXPECT_EQ(Expected, Result);
28 TEST(ConvertUTFTest, ConvertUTF32LittleEndianToUTF8String) {
29 // Src is the look of disapproval.
30 alignas(UTF32) static const char Src[] =
31 "\xFF\xFE\x00\x00\xA0\x0C\x00\x00\x5F\x00\x00\x00\xA0\x0C\x00\x00";
32 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
33 std::string Result;
34 bool Success = convertUTF32ToUTF8String(Ref, Result);
35 EXPECT_TRUE(Success);
36 std::string Expected("\xE0\xB2\xA0_\xE0\xB2\xA0");
37 EXPECT_EQ(Expected, Result);
40 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
41 // Src is the look of disapproval.
42 alignas(UTF16) static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
43 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
44 std::string Result;
45 bool Success = convertUTF16ToUTF8String(Ref, Result);
46 EXPECT_TRUE(Success);
47 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
48 EXPECT_EQ(Expected, Result);
51 TEST(ConvertUTFTest, ConvertUTF32BigEndianToUTF8String) {
52 // Src is the look of disapproval.
53 alignas(UTF32) static const char Src[] =
54 "\x00\x00\xFE\xFF\x00\x00\x0C\xA0\x00\x00\x00\x5F\x00\x00\x0C\xA0";
55 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
56 std::string Result;
57 bool Success = convertUTF32ToUTF8String(Ref, Result);
58 EXPECT_TRUE(Success);
59 std::string Expected("\xE0\xB2\xA0_\xE0\xB2\xA0");
60 EXPECT_EQ(Expected, Result);
63 TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
64 // Src is the look of disapproval.
65 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
66 StringRef Ref(Src, sizeof(Src) - 1);
67 SmallVector<UTF16, 5> Result;
68 bool Success = convertUTF8ToUTF16String(Ref, Result);
69 EXPECT_TRUE(Success);
70 static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
71 ASSERT_EQ(3u, Result.size());
72 for (int I = 0, E = 3; I != E; ++I)
73 EXPECT_EQ(Expected[I], Result[I]);
76 TEST(ConvertUTFTest, OddLengthInput) {
77 std::string Result;
78 bool Success = convertUTF16ToUTF8String(ArrayRef("xxxxx", 5), Result);
79 EXPECT_FALSE(Success);
82 TEST(ConvertUTFTest, Empty) {
83 std::string Result;
84 bool Success =
85 convertUTF16ToUTF8String(llvm::ArrayRef<char>(std::nullopt), Result);
86 EXPECT_TRUE(Success);
87 EXPECT_TRUE(Result.empty());
90 TEST(ConvertUTFTest, HasUTF16BOM) {
91 bool HasBOM = hasUTF16ByteOrderMark(ArrayRef("\xff\xfe", 2));
92 EXPECT_TRUE(HasBOM);
93 HasBOM = hasUTF16ByteOrderMark(ArrayRef("\xfe\xff", 2));
94 EXPECT_TRUE(HasBOM);
95 HasBOM = hasUTF16ByteOrderMark(ArrayRef("\xfe\xff ", 3));
96 EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
97 HasBOM = hasUTF16ByteOrderMark(ArrayRef("\xfe\xff\x00asdf", 6));
98 EXPECT_TRUE(HasBOM);
100 HasBOM = hasUTF16ByteOrderMark(std::nullopt);
101 EXPECT_FALSE(HasBOM);
102 HasBOM = hasUTF16ByteOrderMark(ArrayRef("\xfe", 1));
103 EXPECT_FALSE(HasBOM);
106 TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) {
107 // Src is the look of disapproval.
108 alignas(UTF16) static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
109 ArrayRef<UTF16> SrcRef = ArrayRef((const UTF16 *)Src, 4);
110 std::string Result;
111 bool Success = convertUTF16ToUTF8String(SrcRef, Result);
112 EXPECT_TRUE(Success);
113 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
114 EXPECT_EQ(Expected, Result);
117 TEST(ConvertUTFTest, ConvertUTF8toWide) {
118 // Src is the look of disapproval.
119 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
120 std::wstring Result;
121 bool Success = ConvertUTF8toWide((const char*)Src, Result);
122 EXPECT_TRUE(Success);
123 std::wstring Expected(L"\x0ca0_\x0ca0");
124 EXPECT_EQ(Expected, Result);
125 Result.clear();
126 Success = ConvertUTF8toWide(StringRef(Src, 7), Result);
127 EXPECT_TRUE(Success);
128 EXPECT_EQ(Expected, Result);
131 TEST(ConvertUTFTest, convertWideToUTF8) {
132 // Src is the look of disapproval.
133 static const wchar_t Src[] = L"\x0ca0_\x0ca0";
134 std::string Result;
135 bool Success = convertWideToUTF8(Src, Result);
136 EXPECT_TRUE(Success);
137 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
138 EXPECT_EQ(Expected, Result);
141 struct ConvertUTFResultContainer {
142 ConversionResult ErrorCode;
143 std::vector<unsigned> UnicodeScalars;
145 ConvertUTFResultContainer(ConversionResult ErrorCode)
146 : ErrorCode(ErrorCode) {}
148 ConvertUTFResultContainer
149 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
150 unsigned US2 = 0x110000, unsigned US3 = 0x110000,
151 unsigned US4 = 0x110000, unsigned US5 = 0x110000,
152 unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
153 ConvertUTFResultContainer Result(*this);
154 if (US0 != 0x110000)
155 Result.UnicodeScalars.push_back(US0);
156 if (US1 != 0x110000)
157 Result.UnicodeScalars.push_back(US1);
158 if (US2 != 0x110000)
159 Result.UnicodeScalars.push_back(US2);
160 if (US3 != 0x110000)
161 Result.UnicodeScalars.push_back(US3);
162 if (US4 != 0x110000)
163 Result.UnicodeScalars.push_back(US4);
164 if (US5 != 0x110000)
165 Result.UnicodeScalars.push_back(US5);
166 if (US6 != 0x110000)
167 Result.UnicodeScalars.push_back(US6);
168 if (US7 != 0x110000)
169 Result.UnicodeScalars.push_back(US7);
170 return Result;
174 std::pair<ConversionResult, std::vector<unsigned>>
175 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
176 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
178 const UTF8 *SourceNext = SourceStart;
179 std::vector<UTF32> Decoded(S.size(), 0);
180 UTF32 *TargetStart = Decoded.data();
182 auto ErrorCode =
183 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
184 Decoded.data() + Decoded.size(), lenientConversion);
186 Decoded.resize(TargetStart - Decoded.data());
188 return std::make_pair(ErrorCode, Decoded);
191 std::pair<ConversionResult, std::vector<unsigned>>
192 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
193 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
195 const UTF8 *SourceNext = SourceStart;
196 std::vector<UTF32> Decoded(S.size(), 0);
197 UTF32 *TargetStart = Decoded.data();
199 auto ErrorCode = ConvertUTF8toUTF32Partial(
200 &SourceNext, SourceStart + S.size(), &TargetStart,
201 Decoded.data() + Decoded.size(), lenientConversion);
203 Decoded.resize(TargetStart - Decoded.data());
205 return std::make_pair(ErrorCode, Decoded);
208 ::testing::AssertionResult
209 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
210 StringRef S, bool Partial = false) {
211 ConversionResult ErrorCode;
212 std::vector<unsigned> Decoded;
213 if (!Partial)
214 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
215 else
216 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
218 if (Expected.ErrorCode != ErrorCode)
219 return ::testing::AssertionFailure() << "Expected error code "
220 << Expected.ErrorCode << ", actual "
221 << ErrorCode;
223 if (Expected.UnicodeScalars != Decoded)
224 return ::testing::AssertionFailure()
225 << "Expected lenient decoded result:\n"
226 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
227 << "Actual result:\n" << ::testing::PrintToString(Decoded);
229 return ::testing::AssertionSuccess();
232 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
235 // 1-byte sequences
238 // U+0041 LATIN CAPITAL LETTER A
239 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
240 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
243 // 2-byte sequences
246 // U+0283 LATIN SMALL LETTER ESH
247 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
248 ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
249 "\xca\x83"));
251 // U+03BA GREEK SMALL LETTER KAPPA
252 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
253 // U+03C3 GREEK SMALL LETTER SIGMA
254 // U+03BC GREEK SMALL LETTER MU
255 // U+03B5 GREEK SMALL LETTER EPSILON
256 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
257 ConvertUTFResultContainer(conversionOK)
258 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
259 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
262 // 3-byte sequences
265 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
266 // U+6587 CJK UNIFIED IDEOGRAPH-6587
267 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
268 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
269 "\xe4\xbe\x8b\xe6\x96\x87"));
271 // U+D55C HANGUL SYLLABLE HAN
272 // U+AE00 HANGUL SYLLABLE GEUL
273 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
274 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
275 "\xed\x95\x9c\xea\xb8\x80"));
277 // U+1112 HANGUL CHOSEONG HIEUH
278 // U+1161 HANGUL JUNGSEONG A
279 // U+11AB HANGUL JONGSEONG NIEUN
280 // U+1100 HANGUL CHOSEONG KIYEOK
281 // U+1173 HANGUL JUNGSEONG EU
282 // U+11AF HANGUL JONGSEONG RIEUL
283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
284 ConvertUTFResultContainer(conversionOK)
285 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
286 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
287 "\xe1\x86\xaf"));
290 // 4-byte sequences
293 // U+E0100 VARIATION SELECTOR-17
294 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
295 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
296 "\xf3\xa0\x84\x80"));
299 // First possible sequence of a certain length
302 // U+0000 NULL
303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
304 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
305 StringRef("\x00", 1)));
307 // U+0080 PADDING CHARACTER
308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
309 ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
310 "\xc2\x80"));
312 // U+0800 SAMARITAN LETTER ALAF
313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
314 ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
315 "\xe0\xa0\x80"));
317 // U+10000 LINEAR B SYLLABLE B008 A
318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
319 ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
320 "\xf0\x90\x80\x80"));
322 // U+200000 (invalid)
323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
324 ConvertUTFResultContainer(sourceIllegal)
325 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
326 "\xf8\x88\x80\x80\x80"));
328 // U+4000000 (invalid)
329 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
330 ConvertUTFResultContainer(sourceIllegal)
331 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
332 "\xfc\x84\x80\x80\x80\x80"));
335 // Last possible sequence of a certain length
338 // U+007F DELETE
339 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
340 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
342 // U+07FF (unassigned)
343 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
344 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
345 "\xdf\xbf"));
347 // U+FFFF (noncharacter)
348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
349 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
350 "\xef\xbf\xbf"));
352 // U+1FFFFF (invalid)
353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
354 ConvertUTFResultContainer(sourceIllegal)
355 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
356 "\xf7\xbf\xbf\xbf"));
358 // U+3FFFFFF (invalid)
359 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
360 ConvertUTFResultContainer(sourceIllegal)
361 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
362 "\xfb\xbf\xbf\xbf\xbf"));
364 // U+7FFFFFFF (invalid)
365 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
366 ConvertUTFResultContainer(sourceIllegal)
367 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
368 "\xfd\xbf\xbf\xbf\xbf\xbf"));
371 // Other boundary conditions
374 // U+D7FF (unassigned)
375 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
376 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
377 "\xed\x9f\xbf"));
379 // U+E000 (private use)
380 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
381 ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
382 "\xee\x80\x80"));
384 // U+FFFD REPLACEMENT CHARACTER
385 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
386 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
387 "\xef\xbf\xbd"));
389 // U+10FFFF (noncharacter)
390 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
391 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
392 "\xf4\x8f\xbf\xbf"));
394 // U+110000 (invalid)
395 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
396 ConvertUTFResultContainer(sourceIllegal)
397 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
398 "\xf4\x90\x80\x80"));
401 // Unexpected continuation bytes
404 // A sequence of unexpected continuation bytes that don't follow a first
405 // byte, every byte is a maximal subpart.
407 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
408 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
409 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
410 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
412 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
413 "\x80\x80"));
414 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
415 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
416 "\x80\xbf"));
417 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
418 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
419 "\xbf\x80"));
420 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
421 ConvertUTFResultContainer(sourceIllegal)
422 .withScalars(0xfffd, 0xfffd, 0xfffd),
423 "\x80\xbf\x80"));
424 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
425 ConvertUTFResultContainer(sourceIllegal)
426 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
427 "\x80\xbf\x80\xbf"));
428 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
429 ConvertUTFResultContainer(sourceIllegal)
430 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
431 "\x80\xbf\x82\xbf\xaa"));
432 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
433 ConvertUTFResultContainer(sourceIllegal)
434 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
435 "\xaa\xb0\xbb\xbf\xaa\xa0"));
436 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
437 ConvertUTFResultContainer(sourceIllegal)
438 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
439 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
441 // All continuation bytes (0x80--0xbf).
442 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
443 ConvertUTFResultContainer(sourceIllegal)
444 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
445 0xfffd, 0xfffd, 0xfffd, 0xfffd)
446 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
447 0xfffd, 0xfffd, 0xfffd, 0xfffd)
448 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
449 0xfffd, 0xfffd, 0xfffd, 0xfffd)
450 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
451 0xfffd, 0xfffd, 0xfffd, 0xfffd)
452 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
453 0xfffd, 0xfffd, 0xfffd, 0xfffd)
454 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
455 0xfffd, 0xfffd, 0xfffd, 0xfffd)
456 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
457 0xfffd, 0xfffd, 0xfffd, 0xfffd)
458 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
459 0xfffd, 0xfffd, 0xfffd, 0xfffd),
460 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
461 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
462 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
463 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
466 // Lonely start bytes
469 // Start bytes of 2-byte sequences (0xc0--0xdf).
470 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
471 ConvertUTFResultContainer(sourceIllegal)
472 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
473 0xfffd, 0xfffd, 0xfffd, 0xfffd)
474 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
475 0xfffd, 0xfffd, 0xfffd, 0xfffd)
476 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
477 0xfffd, 0xfffd, 0xfffd, 0xfffd)
478 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
479 0xfffd, 0xfffd, 0xfffd, 0xfffd),
480 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
481 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
484 ConvertUTFResultContainer(sourceIllegal)
485 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
486 0xfffd, 0x0020, 0xfffd, 0x0020)
487 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
488 0xfffd, 0x0020, 0xfffd, 0x0020)
489 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
490 0xfffd, 0x0020, 0xfffd, 0x0020)
491 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
492 0xfffd, 0x0020, 0xfffd, 0x0020)
493 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
494 0xfffd, 0x0020, 0xfffd, 0x0020)
495 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
496 0xfffd, 0x0020, 0xfffd, 0x0020)
497 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
498 0xfffd, 0x0020, 0xfffd, 0x0020)
499 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
500 0xfffd, 0x0020, 0xfffd, 0x0020),
501 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
502 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
503 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
504 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
506 // Start bytes of 3-byte sequences (0xe0--0xef).
507 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
508 ConvertUTFResultContainer(sourceIllegal)
509 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
510 0xfffd, 0xfffd, 0xfffd, 0xfffd)
511 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
512 0xfffd, 0xfffd, 0xfffd, 0xfffd),
513 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
515 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
516 ConvertUTFResultContainer(sourceIllegal)
517 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
518 0xfffd, 0x0020, 0xfffd, 0x0020)
519 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
520 0xfffd, 0x0020, 0xfffd, 0x0020)
521 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
522 0xfffd, 0x0020, 0xfffd, 0x0020)
523 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
524 0xfffd, 0x0020, 0xfffd, 0x0020),
525 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
526 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
528 // Start bytes of 4-byte sequences (0xf0--0xf7).
529 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
530 ConvertUTFResultContainer(sourceIllegal)
531 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
532 0xfffd, 0xfffd, 0xfffd, 0xfffd),
533 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
535 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
536 ConvertUTFResultContainer(sourceIllegal)
537 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
538 0xfffd, 0x0020, 0xfffd, 0x0020)
539 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
540 0xfffd, 0x0020, 0xfffd, 0x0020),
541 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
543 // Start bytes of 5-byte sequences (0xf8--0xfb).
544 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
545 ConvertUTFResultContainer(sourceIllegal)
546 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
547 "\xf8\xf9\xfa\xfb"));
549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
550 ConvertUTFResultContainer(sourceIllegal)
551 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
552 0xfffd, 0x0020, 0xfffd, 0x0020),
553 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
555 // Start bytes of 6-byte sequences (0xfc--0xfd).
556 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
557 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
558 "\xfc\xfd"));
560 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
561 ConvertUTFResultContainer(sourceIllegal)
562 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
563 "\xfc\x20\xfd\x20"));
566 // Other bytes (0xc0--0xc1, 0xfe--0xff).
569 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
570 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
571 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
572 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
574 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
575 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
576 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
579 ConvertUTFResultContainer(sourceIllegal)
580 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
581 "\xc0\xc1\xfe\xff"));
583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
584 ConvertUTFResultContainer(sourceIllegal)
585 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
586 "\xfe\xfe\xff\xff"));
588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
589 ConvertUTFResultContainer(sourceIllegal)
590 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
591 "\xfe\x80\x80\x80\x80\x80"));
593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594 ConvertUTFResultContainer(sourceIllegal)
595 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
596 "\xff\x80\x80\x80\x80\x80"));
598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
599 ConvertUTFResultContainer(sourceIllegal)
600 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
601 0xfffd, 0x0020, 0xfffd, 0x0020),
602 "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
605 // Sequences with one continuation byte missing
608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
609 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
610 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
611 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
612 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
613 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
614 "\xe0\xa0"));
615 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
616 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
617 "\xe0\xbf"));
618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
620 "\xe1\x80"));
621 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
622 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
623 "\xec\xbf"));
624 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
625 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
626 "\xed\x80"));
627 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
628 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
629 "\xed\x9f"));
630 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
631 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
632 "\xee\x80"));
633 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
634 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
635 "\xef\xbf"));
636 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
637 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
638 "\xf0\x90\x80"));
639 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
640 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
641 "\xf0\xbf\xbf"));
642 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
643 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
644 "\xf1\x80\x80"));
645 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
646 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
647 "\xf3\xbf\xbf"));
648 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
649 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
650 "\xf4\x80\x80"));
651 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
652 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
653 "\xf4\x8f\xbf"));
655 // Overlong sequences with one trailing byte missing.
656 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
657 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
658 "\xc0"));
659 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
660 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
661 "\xc1"));
662 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
663 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
664 "\xe0\x80"));
665 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
666 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
667 "\xe0\x9f"));
668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
669 ConvertUTFResultContainer(sourceIllegal)
670 .withScalars(0xfffd, 0xfffd, 0xfffd),
671 "\xf0\x80\x80"));
672 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
673 ConvertUTFResultContainer(sourceIllegal)
674 .withScalars(0xfffd, 0xfffd, 0xfffd),
675 "\xf0\x8f\x80"));
676 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
677 ConvertUTFResultContainer(sourceIllegal)
678 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
679 "\xf8\x80\x80\x80"));
680 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
681 ConvertUTFResultContainer(sourceIllegal)
682 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
683 "\xfc\x80\x80\x80\x80"));
685 // Sequences that represent surrogates with one trailing byte missing.
686 // High surrogates
687 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
688 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
689 "\xed\xa0"));
690 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
691 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
692 "\xed\xac"));
693 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
694 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
695 "\xed\xaf"));
696 // Low surrogates
697 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
698 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
699 "\xed\xb0"));
700 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
701 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
702 "\xed\xb4"));
703 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
704 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
705 "\xed\xbf"));
707 // Ill-formed 4-byte sequences.
708 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
709 // U+1100xx (invalid)
710 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
711 ConvertUTFResultContainer(sourceIllegal)
712 .withScalars(0xfffd, 0xfffd, 0xfffd),
713 "\xf4\x90\x80"));
714 // U+13FBxx (invalid)
715 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
716 ConvertUTFResultContainer(sourceIllegal)
717 .withScalars(0xfffd, 0xfffd, 0xfffd),
718 "\xf4\xbf\xbf"));
719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720 ConvertUTFResultContainer(sourceIllegal)
721 .withScalars(0xfffd, 0xfffd, 0xfffd),
722 "\xf5\x80\x80"));
723 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
724 ConvertUTFResultContainer(sourceIllegal)
725 .withScalars(0xfffd, 0xfffd, 0xfffd),
726 "\xf6\x80\x80"));
727 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
728 ConvertUTFResultContainer(sourceIllegal)
729 .withScalars(0xfffd, 0xfffd, 0xfffd),
730 "\xf7\x80\x80"));
731 // U+1FFBxx (invalid)
732 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
733 ConvertUTFResultContainer(sourceIllegal)
734 .withScalars(0xfffd, 0xfffd, 0xfffd),
735 "\xf7\xbf\xbf"));
737 // Ill-formed 5-byte sequences.
738 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
739 // U+2000xx (invalid)
740 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
741 ConvertUTFResultContainer(sourceIllegal)
742 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
743 "\xf8\x88\x80\x80"));
744 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
745 ConvertUTFResultContainer(sourceIllegal)
746 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
747 "\xf8\xbf\xbf\xbf"));
748 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
749 ConvertUTFResultContainer(sourceIllegal)
750 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
751 "\xf9\x80\x80\x80"));
752 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
753 ConvertUTFResultContainer(sourceIllegal)
754 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
755 "\xfa\x80\x80\x80"));
756 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
757 ConvertUTFResultContainer(sourceIllegal)
758 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
759 "\xfb\x80\x80\x80"));
760 // U+3FFFFxx (invalid)
761 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
762 ConvertUTFResultContainer(sourceIllegal)
763 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
764 "\xfb\xbf\xbf\xbf"));
766 // Ill-formed 6-byte sequences.
767 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
768 // U+40000xx (invalid)
769 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
770 ConvertUTFResultContainer(sourceIllegal)
771 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
772 "\xfc\x84\x80\x80\x80"));
773 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
774 ConvertUTFResultContainer(sourceIllegal)
775 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
776 "\xfc\xbf\xbf\xbf\xbf"));
777 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
778 ConvertUTFResultContainer(sourceIllegal)
779 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
780 "\xfd\x80\x80\x80\x80"));
781 // U+7FFFFFxx (invalid)
782 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
783 ConvertUTFResultContainer(sourceIllegal)
784 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
785 "\xfd\xbf\xbf\xbf\xbf"));
788 // Sequences with two continuation bytes missing
791 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
792 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
793 "\xf0\x90"));
794 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
795 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
796 "\xf0\xbf"));
797 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
798 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
799 "\xf1\x80"));
800 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
801 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
802 "\xf3\xbf"));
803 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
804 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
805 "\xf4\x80"));
806 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
807 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
808 "\xf4\x8f"));
810 // Overlong sequences with two trailing bytes missing.
811 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
812 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
813 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
815 "\xf0\x80"));
816 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
817 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
818 "\xf0\x8f"));
819 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
820 ConvertUTFResultContainer(sourceIllegal)
821 .withScalars(0xfffd, 0xfffd, 0xfffd),
822 "\xf8\x80\x80"));
823 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
824 ConvertUTFResultContainer(sourceIllegal)
825 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
826 "\xfc\x80\x80\x80"));
828 // Sequences that represent surrogates with two trailing bytes missing.
829 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
830 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
832 // Ill-formed 4-byte sequences.
833 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
834 // U+110yxx (invalid)
835 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
836 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
837 "\xf4\x90"));
838 // U+13Fyxx (invalid)
839 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
840 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
841 "\xf4\xbf"));
842 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
844 "\xf5\x80"));
845 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
846 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
847 "\xf6\x80"));
848 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
849 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
850 "\xf7\x80"));
851 // U+1FFyxx (invalid)
852 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
853 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
854 "\xf7\xbf"));
856 // Ill-formed 5-byte sequences.
857 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
858 // U+200yxx (invalid)
859 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
860 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
861 "\xf8\x88\x80"));
862 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
863 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
864 "\xf8\xbf\xbf"));
865 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
866 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
867 "\xf9\x80\x80"));
868 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
869 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
870 "\xfa\x80\x80"));
871 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
872 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
873 "\xfb\x80\x80"));
874 // U+3FFFyxx (invalid)
875 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
876 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
877 "\xfb\xbf\xbf"));
879 // Ill-formed 6-byte sequences.
880 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
881 // U+4000yxx (invalid)
882 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
883 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
884 "\xfc\x84\x80\x80"));
885 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
886 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
887 "\xfc\xbf\xbf\xbf"));
888 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
889 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
890 "\xfd\x80\x80\x80"));
891 // U+7FFFFyxx (invalid)
892 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
893 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
894 "\xfd\xbf\xbf\xbf"));
897 // Sequences with three continuation bytes missing
900 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
901 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
902 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
903 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
904 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
905 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
906 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
907 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
908 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
909 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
911 // Broken overlong sequences.
912 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
913 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
914 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
915 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
916 "\xf8\x80"));
917 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
918 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
919 "\xfc\x80\x80"));
921 // Ill-formed 4-byte sequences.
922 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
923 // U+14yyxx (invalid)
924 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
925 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
926 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
927 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
928 // U+1Cyyxx (invalid)
929 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
930 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
932 // Ill-formed 5-byte sequences.
933 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
934 // U+20yyxx (invalid)
935 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
936 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
937 "\xf8\x88"));
938 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
939 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
940 "\xf8\xbf"));
941 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
942 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
943 "\xf9\x80"));
944 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
945 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
946 "\xfa\x80"));
947 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
948 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
949 "\xfb\x80"));
950 // U+3FCyyxx (invalid)
951 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
952 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
953 "\xfb\xbf"));
955 // Ill-formed 6-byte sequences.
956 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
957 // U+400yyxx (invalid)
958 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
959 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
960 "\xfc\x84\x80"));
961 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
962 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
963 "\xfc\xbf\xbf"));
964 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
965 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
966 "\xfd\x80\x80"));
967 // U+7FFCyyxx (invalid)
968 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
969 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
970 "\xfd\xbf\xbf"));
973 // Sequences with four continuation bytes missing
976 // Ill-formed 5-byte sequences.
977 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
978 // U+uzyyxx (invalid)
979 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
980 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
981 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
982 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
983 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
984 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
985 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
986 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
987 // U+3zyyxx (invalid)
988 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
989 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
991 // Broken overlong sequences.
992 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
993 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
994 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
995 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
996 "\xfc\x80"));
998 // Ill-formed 6-byte sequences.
999 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
1000 // U+uzzyyxx (invalid)
1001 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1002 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1003 "\xfc\x84"));
1004 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1005 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1006 "\xfc\xbf"));
1007 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1008 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1009 "\xfd\x80"));
1010 // U+7Fzzyyxx (invalid)
1011 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1012 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1013 "\xfd\xbf"));
1016 // Sequences with five continuation bytes missing
1019 // Ill-formed 6-byte sequences.
1020 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
1021 // U+uzzyyxx (invalid)
1022 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1023 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
1024 // U+uuzzyyxx (invalid)
1025 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1026 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
1029 // Consecutive sequences with trailing bytes missing
1032 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1033 ConvertUTFResultContainer(sourceIllegal)
1034 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1035 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1036 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
1037 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1038 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1039 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1040 "\xc0" "\xe0\x80" "\xf0\x80\x80"
1041 "\xf8\x80\x80\x80"
1042 "\xfc\x80\x80\x80\x80"
1043 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
1044 "\xfb\xbf\xbf\xbf"
1045 "\xfd\xbf\xbf\xbf\xbf"));
1048 // Overlong UTF-8 sequences
1051 // U+002F SOLIDUS
1052 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1053 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
1055 // Overlong sequences of the above.
1056 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1057 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1058 "\xc0\xaf"));
1059 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1060 ConvertUTFResultContainer(sourceIllegal)
1061 .withScalars(0xfffd, 0xfffd, 0xfffd),
1062 "\xe0\x80\xaf"));
1063 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1064 ConvertUTFResultContainer(sourceIllegal)
1065 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1066 "\xf0\x80\x80\xaf"));
1067 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1068 ConvertUTFResultContainer(sourceIllegal)
1069 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1070 "\xf8\x80\x80\x80\xaf"));
1071 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1072 ConvertUTFResultContainer(sourceIllegal)
1073 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1074 "\xfc\x80\x80\x80\x80\xaf"));
1076 // U+0000 NULL
1077 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1078 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1079 StringRef("\x00", 1)));
1081 // Overlong sequences of the above.
1082 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1083 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1084 "\xc0\x80"));
1085 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1086 ConvertUTFResultContainer(sourceIllegal)
1087 .withScalars(0xfffd, 0xfffd, 0xfffd),
1088 "\xe0\x80\x80"));
1089 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1090 ConvertUTFResultContainer(sourceIllegal)
1091 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1092 "\xf0\x80\x80\x80"));
1093 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1094 ConvertUTFResultContainer(sourceIllegal)
1095 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1096 "\xf8\x80\x80\x80\x80"));
1097 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1098 ConvertUTFResultContainer(sourceIllegal)
1099 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1100 "\xfc\x80\x80\x80\x80\x80"));
1102 // Other overlong sequences.
1103 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1104 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1105 "\xc0\xbf"));
1106 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1108 "\xc1\x80"));
1109 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1110 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1111 "\xc1\xbf"));
1112 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113 ConvertUTFResultContainer(sourceIllegal)
1114 .withScalars(0xfffd, 0xfffd, 0xfffd),
1115 "\xe0\x9f\xbf"));
1116 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1117 ConvertUTFResultContainer(sourceIllegal)
1118 .withScalars(0xfffd, 0xfffd, 0xfffd),
1119 "\xed\xa0\x80"));
1120 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1121 ConvertUTFResultContainer(sourceIllegal)
1122 .withScalars(0xfffd, 0xfffd, 0xfffd),
1123 "\xed\xbf\xbf"));
1124 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1125 ConvertUTFResultContainer(sourceIllegal)
1126 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1127 "\xf0\x8f\x80\x80"));
1128 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1129 ConvertUTFResultContainer(sourceIllegal)
1130 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1131 "\xf0\x8f\xbf\xbf"));
1132 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133 ConvertUTFResultContainer(sourceIllegal)
1134 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1135 "\xf8\x87\xbf\xbf\xbf"));
1136 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1137 ConvertUTFResultContainer(sourceIllegal)
1138 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1139 "\xfc\x83\xbf\xbf\xbf\xbf"));
1142 // Isolated surrogates
1145 // Unicode 6.3.0:
1147 // D71. High-surrogate code point: A Unicode code point in the range
1148 // U+D800 to U+DBFF.
1150 // D73. Low-surrogate code point: A Unicode code point in the range
1151 // U+DC00 to U+DFFF.
1153 // Note: U+E0100 is <DB40 DD00> in UTF16.
1155 // High surrogates
1157 // U+D800
1158 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1159 ConvertUTFResultContainer(sourceIllegal)
1160 .withScalars(0xfffd, 0xfffd, 0xfffd),
1161 "\xed\xa0\x80"));
1163 // U+DB40
1164 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1165 ConvertUTFResultContainer(sourceIllegal)
1166 .withScalars(0xfffd, 0xfffd, 0xfffd),
1167 "\xed\xac\xa0"));
1169 // U+DBFF
1170 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1171 ConvertUTFResultContainer(sourceIllegal)
1172 .withScalars(0xfffd, 0xfffd, 0xfffd),
1173 "\xed\xaf\xbf"));
1175 // Low surrogates
1177 // U+DC00
1178 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1179 ConvertUTFResultContainer(sourceIllegal)
1180 .withScalars(0xfffd, 0xfffd, 0xfffd),
1181 "\xed\xb0\x80"));
1183 // U+DD00
1184 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1185 ConvertUTFResultContainer(sourceIllegal)
1186 .withScalars(0xfffd, 0xfffd, 0xfffd),
1187 "\xed\xb4\x80"));
1189 // U+DFFF
1190 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1191 ConvertUTFResultContainer(sourceIllegal)
1192 .withScalars(0xfffd, 0xfffd, 0xfffd),
1193 "\xed\xbf\xbf"));
1195 // Surrogate pairs
1197 // U+D800 U+DC00
1198 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1199 ConvertUTFResultContainer(sourceIllegal)
1200 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1201 "\xed\xa0\x80\xed\xb0\x80"));
1203 // U+D800 U+DD00
1204 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1205 ConvertUTFResultContainer(sourceIllegal)
1206 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1207 "\xed\xa0\x80\xed\xb4\x80"));
1209 // U+D800 U+DFFF
1210 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1211 ConvertUTFResultContainer(sourceIllegal)
1212 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1213 "\xed\xa0\x80\xed\xbf\xbf"));
1215 // U+DB40 U+DC00
1216 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1217 ConvertUTFResultContainer(sourceIllegal)
1218 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1219 "\xed\xac\xa0\xed\xb0\x80"));
1221 // U+DB40 U+DD00
1222 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1223 ConvertUTFResultContainer(sourceIllegal)
1224 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1225 "\xed\xac\xa0\xed\xb4\x80"));
1227 // U+DB40 U+DFFF
1228 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1229 ConvertUTFResultContainer(sourceIllegal)
1230 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1231 "\xed\xac\xa0\xed\xbf\xbf"));
1233 // U+DBFF U+DC00
1234 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1235 ConvertUTFResultContainer(sourceIllegal)
1236 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1237 "\xed\xaf\xbf\xed\xb0\x80"));
1239 // U+DBFF U+DD00
1240 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1241 ConvertUTFResultContainer(sourceIllegal)
1242 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1243 "\xed\xaf\xbf\xed\xb4\x80"));
1245 // U+DBFF U+DFFF
1246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1247 ConvertUTFResultContainer(sourceIllegal)
1248 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1249 "\xed\xaf\xbf\xed\xbf\xbf"));
1252 // Noncharacters
1255 // Unicode 6.3.0:
1257 // D14. Noncharacter: A code point that is permanently reserved for
1258 // internal use and that should never be interchanged. Noncharacters
1259 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 10 hexadecimal)
1260 // and the values U+FDD0..U+FDEF.
1262 // U+FFFE
1263 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1264 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1265 "\xef\xbf\xbe"));
1267 // U+FFFF
1268 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1269 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1270 "\xef\xbf\xbf"));
1272 // U+1FFFE
1273 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1274 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1275 "\xf0\x9f\xbf\xbe"));
1277 // U+1FFFF
1278 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1279 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1280 "\xf0\x9f\xbf\xbf"));
1282 // U+2FFFE
1283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1284 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1285 "\xf0\xaf\xbf\xbe"));
1287 // U+2FFFF
1288 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1289 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1290 "\xf0\xaf\xbf\xbf"));
1292 // U+3FFFE
1293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1294 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1295 "\xf0\xbf\xbf\xbe"));
1297 // U+3FFFF
1298 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1299 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1300 "\xf0\xbf\xbf\xbf"));
1302 // U+4FFFE
1303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1304 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1305 "\xf1\x8f\xbf\xbe"));
1307 // U+4FFFF
1308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1309 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1310 "\xf1\x8f\xbf\xbf"));
1312 // U+5FFFE
1313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1314 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1315 "\xf1\x9f\xbf\xbe"));
1317 // U+5FFFF
1318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1319 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1320 "\xf1\x9f\xbf\xbf"));
1322 // U+6FFFE
1323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1324 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1325 "\xf1\xaf\xbf\xbe"));
1327 // U+6FFFF
1328 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1329 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1330 "\xf1\xaf\xbf\xbf"));
1332 // U+7FFFE
1333 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1334 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1335 "\xf1\xbf\xbf\xbe"));
1337 // U+7FFFF
1338 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1339 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1340 "\xf1\xbf\xbf\xbf"));
1342 // U+8FFFE
1343 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1344 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1345 "\xf2\x8f\xbf\xbe"));
1347 // U+8FFFF
1348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1349 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1350 "\xf2\x8f\xbf\xbf"));
1352 // U+9FFFE
1353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1354 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1355 "\xf2\x9f\xbf\xbe"));
1357 // U+9FFFF
1358 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1359 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1360 "\xf2\x9f\xbf\xbf"));
1362 // U+AFFFE
1363 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1364 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1365 "\xf2\xaf\xbf\xbe"));
1367 // U+AFFFF
1368 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1369 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1370 "\xf2\xaf\xbf\xbf"));
1372 // U+BFFFE
1373 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1374 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1375 "\xf2\xbf\xbf\xbe"));
1377 // U+BFFFF
1378 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1379 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1380 "\xf2\xbf\xbf\xbf"));
1382 // U+CFFFE
1383 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1384 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1385 "\xf3\x8f\xbf\xbe"));
1387 // U+CFFFF
1388 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1389 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1390 "\xf3\x8f\xbf\xbf"));
1392 // U+DFFFE
1393 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1394 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1395 "\xf3\x9f\xbf\xbe"));
1397 // U+DFFFF
1398 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1399 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1400 "\xf3\x9f\xbf\xbf"));
1402 // U+EFFFE
1403 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1404 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1405 "\xf3\xaf\xbf\xbe"));
1407 // U+EFFFF
1408 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1409 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1410 "\xf3\xaf\xbf\xbf"));
1412 // U+FFFFE
1413 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1414 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1415 "\xf3\xbf\xbf\xbe"));
1417 // U+FFFFF
1418 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1419 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1420 "\xf3\xbf\xbf\xbf"));
1422 // U+10FFFE
1423 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1424 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1425 "\xf4\x8f\xbf\xbe"));
1427 // U+10FFFF
1428 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1429 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1430 "\xf4\x8f\xbf\xbf"));
1432 // U+FDD0
1433 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1434 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1435 "\xef\xb7\x90"));
1437 // U+FDD1
1438 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1439 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1440 "\xef\xb7\x91"));
1442 // U+FDD2
1443 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1444 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1445 "\xef\xb7\x92"));
1447 // U+FDD3
1448 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1449 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1450 "\xef\xb7\x93"));
1452 // U+FDD4
1453 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1454 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1455 "\xef\xb7\x94"));
1457 // U+FDD5
1458 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1459 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1460 "\xef\xb7\x95"));
1462 // U+FDD6
1463 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1464 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1465 "\xef\xb7\x96"));
1467 // U+FDD7
1468 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1469 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1470 "\xef\xb7\x97"));
1472 // U+FDD8
1473 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1474 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1475 "\xef\xb7\x98"));
1477 // U+FDD9
1478 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1479 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1480 "\xef\xb7\x99"));
1482 // U+FDDA
1483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1484 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1485 "\xef\xb7\x9a"));
1487 // U+FDDB
1488 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1489 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1490 "\xef\xb7\x9b"));
1492 // U+FDDC
1493 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1494 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1495 "\xef\xb7\x9c"));
1497 // U+FDDD
1498 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1499 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1500 "\xef\xb7\x9d"));
1502 // U+FDDE
1503 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1504 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1505 "\xef\xb7\x9e"));
1507 // U+FDDF
1508 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1509 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1510 "\xef\xb7\x9f"));
1512 // U+FDE0
1513 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1514 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1515 "\xef\xb7\xa0"));
1517 // U+FDE1
1518 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1519 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1520 "\xef\xb7\xa1"));
1522 // U+FDE2
1523 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1524 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1525 "\xef\xb7\xa2"));
1527 // U+FDE3
1528 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1529 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1530 "\xef\xb7\xa3"));
1532 // U+FDE4
1533 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1534 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1535 "\xef\xb7\xa4"));
1537 // U+FDE5
1538 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1539 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1540 "\xef\xb7\xa5"));
1542 // U+FDE6
1543 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1544 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1545 "\xef\xb7\xa6"));
1547 // U+FDE7
1548 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1549 ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1550 "\xef\xb7\xa7"));
1552 // U+FDE8
1553 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1554 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1555 "\xef\xb7\xa8"));
1557 // U+FDE9
1558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1559 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1560 "\xef\xb7\xa9"));
1562 // U+FDEA
1563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1564 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1565 "\xef\xb7\xaa"));
1567 // U+FDEB
1568 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1569 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1570 "\xef\xb7\xab"));
1572 // U+FDEC
1573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1574 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1575 "\xef\xb7\xac"));
1577 // U+FDED
1578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1579 ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1580 "\xef\xb7\xad"));
1582 // U+FDEE
1583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1584 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1585 "\xef\xb7\xae"));
1587 // U+FDEF
1588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1589 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1590 "\xef\xb7\xaf"));
1592 // U+FDF0
1593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1594 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1595 "\xef\xb7\xb0"));
1597 // U+FDF1
1598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1599 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1600 "\xef\xb7\xb1"));
1602 // U+FDF2
1603 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1604 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1605 "\xef\xb7\xb2"));
1607 // U+FDF3
1608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1609 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1610 "\xef\xb7\xb3"));
1612 // U+FDF4
1613 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1614 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1615 "\xef\xb7\xb4"));
1617 // U+FDF5
1618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1619 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1620 "\xef\xb7\xb5"));
1622 // U+FDF6
1623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1624 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1625 "\xef\xb7\xb6"));
1627 // U+FDF7
1628 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1629 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1630 "\xef\xb7\xb7"));
1632 // U+FDF8
1633 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1634 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1635 "\xef\xb7\xb8"));
1637 // U+FDF9
1638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1639 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1640 "\xef\xb7\xb9"));
1642 // U+FDFA
1643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1644 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1645 "\xef\xb7\xba"));
1647 // U+FDFB
1648 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1649 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1650 "\xef\xb7\xbb"));
1652 // U+FDFC
1653 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1654 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1655 "\xef\xb7\xbc"));
1657 // U+FDFD
1658 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1659 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1660 "\xef\xb7\xbd"));
1662 // U+FDFE
1663 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1664 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1665 "\xef\xb7\xbe"));
1667 // U+FDFF
1668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1669 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1670 "\xef\xb7\xbf"));
1673 TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1674 // U+0041 LATIN CAPITAL LETTER A
1675 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1676 ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1677 "\x41", true));
1680 // Sequences with one continuation byte missing
1683 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1684 ConvertUTFResultContainer(sourceExhausted),
1685 "\xc2", true));
1686 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1687 ConvertUTFResultContainer(sourceExhausted),
1688 "\xdf", true));
1689 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1690 ConvertUTFResultContainer(sourceExhausted),
1691 "\xe0\xa0", true));
1692 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1693 ConvertUTFResultContainer(sourceExhausted),
1694 "\xe0\xbf", true));
1695 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1696 ConvertUTFResultContainer(sourceExhausted),
1697 "\xe1\x80", true));
1698 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1699 ConvertUTFResultContainer(sourceExhausted),
1700 "\xec\xbf", true));
1701 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1702 ConvertUTFResultContainer(sourceExhausted),
1703 "\xed\x80", true));
1704 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1705 ConvertUTFResultContainer(sourceExhausted),
1706 "\xed\x9f", true));
1707 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1708 ConvertUTFResultContainer(sourceExhausted),
1709 "\xee\x80", true));
1710 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1711 ConvertUTFResultContainer(sourceExhausted),
1712 "\xef\xbf", true));
1713 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1714 ConvertUTFResultContainer(sourceExhausted),
1715 "\xf0\x90\x80", true));
1716 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1717 ConvertUTFResultContainer(sourceExhausted),
1718 "\xf0\xbf\xbf", true));
1719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1720 ConvertUTFResultContainer(sourceExhausted),
1721 "\xf1\x80\x80", true));
1722 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1723 ConvertUTFResultContainer(sourceExhausted),
1724 "\xf3\xbf\xbf", true));
1725 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1726 ConvertUTFResultContainer(sourceExhausted),
1727 "\xf4\x80\x80", true));
1728 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1729 ConvertUTFResultContainer(sourceExhausted),
1730 "\xf4\x8f\xbf", true));
1732 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1733 ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1734 "\x41\xc2", true));