1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "llvm/Support/ConvertUTF.h"
10 #include "llvm/ADT/ArrayRef.h"
11 #include "gtest/gtest.h"
// Feeds convertUTF16ToUTF8String a byte buffer of UTF-16 text whose first two
// bytes are 0xFF 0xFE and expects the UTF-8 byte string in Expected.
// NOTE(review): Result is used below, but its declaration and the closing
// brace of this test are not visible in this excerpt -- confirm against the
// complete file.
17 TEST(ConvertUTFTest
, ConvertUTF16LittleEndianToUTF8String
) {
18 // Src is the look of disapproval.
// The buffer is aligned for UTF16 and stores, low byte first, the byte order
// mark followed by the code units 0x0CA0, 0x005F ('_') and 0x0CA0.
19 alignas(UTF16
) static const char Src
[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
// sizeof(Src) - 1 keeps the literal's terminating NUL out of the view.
20 ArrayRef
<char> Ref(Src
, sizeof(Src
) - 1);
22 bool Success
= convertUTF16ToUTF8String(Ref
, Result
);
// Expected is U+0CA0, '_', U+0CA0 encoded as UTF-8 (three bytes, one byte,
// three bytes).
24 std::string
Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
25 EXPECT_EQ(Expected
, Result
);
// Feeds convertUTF32ToUTF8String a byte buffer of UTF-32 text stored low byte
// first and expects the UTF-8 byte string in Expected.
// NOTE(review): Result is used below, but its declaration and the closing
// brace of this test are not visible in this excerpt -- confirm against the
// complete file.
28 TEST(ConvertUTFTest
, ConvertUTF32LittleEndianToUTF8String
) {
29 // Src is the look of disapproval.
// Four bytes per unit, low byte first: 0x0000FEFF (byte order mark),
// 0x00000CA0, 0x0000005F ('_'), 0x00000CA0.
30 alignas(UTF32
) static const char Src
[] =
31 "\xFF\xFE\x00\x00\xA0\x0C\x00\x00\x5F\x00\x00\x00\xA0\x0C\x00\x00";
// sizeof(Src) - 1 keeps the literal's terminating NUL out of the view.
32 ArrayRef
<char> Ref(Src
, sizeof(Src
) - 1);
34 bool Success
= convertUTF32ToUTF8String(Ref
, Result
);
// Expected is U+0CA0, '_', U+0CA0 encoded as UTF-8.
36 std::string
Expected("\xE0\xB2\xA0_\xE0\xB2\xA0");
37 EXPECT_EQ(Expected
, Result
);
// Same check as the little-endian UTF-16 test, but the input bytes are stored
// high byte first and the buffer starts with 0xFE 0xFF.
// NOTE(review): Result is used below, but its declaration and the closing
// brace of this test are not visible in this excerpt -- confirm against the
// complete file.
40 TEST(ConvertUTFTest
, ConvertUTF16BigEndianToUTF8String
) {
41 // Src is the look of disapproval.
// High byte first: byte order mark, then 0x0CA0, 0x005F ('_'), 0x0CA0.
42 alignas(UTF16
) static const char Src
[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
// sizeof(Src) - 1 keeps the literal's terminating NUL out of the view.
43 ArrayRef
<char> Ref(Src
, sizeof(Src
) - 1);
45 bool Success
= convertUTF16ToUTF8String(Ref
, Result
);
// Expected is U+0CA0, '_', U+0CA0 encoded as UTF-8.
47 std::string
Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
48 EXPECT_EQ(Expected
, Result
);
// Same check as the little-endian UTF-32 test, but every four-byte unit of
// the input is stored high byte first.
// NOTE(review): Result is used below, but its declaration and the closing
// brace of this test are not visible in this excerpt -- confirm against the
// complete file.
51 TEST(ConvertUTFTest
, ConvertUTF32BigEndianToUTF8String
) {
52 // Src is the look of disapproval.
// High byte first: 0x0000FEFF (byte order mark), 0x00000CA0,
// 0x0000005F ('_'), 0x00000CA0.
53 alignas(UTF32
) static const char Src
[] =
54 "\x00\x00\xFE\xFF\x00\x00\x0C\xA0\x00\x00\x00\x5F\x00\x00\x0C\xA0";
// sizeof(Src) - 1 keeps the literal's terminating NUL out of the view.
55 ArrayRef
<char> Ref(Src
, sizeof(Src
) - 1);
57 bool Success
= convertUTF32ToUTF8String(Ref
, Result
);
// Expected is U+0CA0, '_', U+0CA0 encoded as UTF-8.
59 std::string
Expected("\xE0\xB2\xA0_\xE0\xB2\xA0");
60 EXPECT_EQ(Expected
, Result
);
// Converts a UTF-8 byte string to UTF-16 with convertUTF8ToUTF16String and
// compares the produced code units with Expected.
// NOTE(review): no check of Success and no closing brace are visible in this
// excerpt -- confirm against the complete file.
63 TEST(ConvertUTFTest
, ConvertUTF8ToUTF16String
) {
64 // Src is the look of disapproval.
65 static const char Src
[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
// sizeof(Src) - 1 keeps the literal's terminating NUL out of the view.
66 StringRef
Ref(Src
, sizeof(Src
) - 1);
67 SmallVector
<UTF16
, 5> Result
;
68 bool Success
= convertUTF8ToUTF16String(Ref
, Result
);
70 static const UTF16 Expected
[] = {0x0CA0, 0x005f, 0x0CA0, 0};
// ASSERT_EQ aborts the test on a size mismatch, so the indexing in the loop
// below stays within Result.
71 ASSERT_EQ(3u, Result
.size());
// Only the first three entries of Expected are compared; its trailing 0 is
// never read by this loop.
72 for (int I
= 0, E
= 3; I
!= E
; ++I
)
73 EXPECT_EQ(Expected
[I
], Result
[I
]);
// Passes a five-byte buffer to convertUTF16ToUTF8String and expects the call
// to report failure; five bytes are not a whole number of two-byte UTF-16
// code units.
// NOTE(review): the declaration of Result and the closing brace are not
// visible in this excerpt -- confirm against the complete file.
76 TEST(ConvertUTFTest
, OddLengthInput
) {
78 bool Success
= convertUTF16ToUTF8String(ArrayRef("xxxxx", 5), Result
);
79 EXPECT_FALSE(Success
);
// Calls convertUTF16ToUTF8String with an ArrayRef<char> built from
// std::nullopt and expects Result to be empty afterwards.
// NOTE(review): the declaration of Result, any use of the call's return value
// and the closing brace are not visible in this excerpt -- confirm against
// the complete file.
82 TEST(ConvertUTFTest
, Empty
) {
85 convertUTF16ToUTF8String(llvm::ArrayRef
<char>(std::nullopt
), Result
);
87 EXPECT_TRUE(Result
.empty());
// Exercises hasUTF16ByteOrderMark on several byte sequences: both two-byte
// marks, inputs with extra bytes after the mark, an empty input and a
// one-byte input.
// NOTE(review): several assignments to HasBOM below are not followed by a
// visible EXPECT_* check, and the closing brace is not visible in this
// excerpt -- confirm against the complete file.
90 TEST(ConvertUTFTest
, HasUTF16BOM
) {
// 0xFF 0xFE on its own.
91 bool HasBOM
= hasUTF16ByteOrderMark(ArrayRef("\xff\xfe", 2));
// 0xFE 0xFF on its own.
93 HasBOM
= hasUTF16ByteOrderMark(ArrayRef("\xfe\xff", 2));
// The mark followed by one extra byte, giving an odd total length of 3.
95 HasBOM
= hasUTF16ByteOrderMark(ArrayRef("\xfe\xff ", 3));
96 EXPECT_TRUE(HasBOM
); // Don't care about odd lengths.
// The mark followed by four more of the literal's bytes.
97 HasBOM
= hasUTF16ByteOrderMark(ArrayRef("\xfe\xff\x00asdf", 6));
// Empty input: expected to report no mark.
100 HasBOM
= hasUTF16ByteOrderMark(std::nullopt
);
101 EXPECT_FALSE(HasBOM
);
// A single byte: expected to report no mark.
102 HasBOM
= hasUTF16ByteOrderMark(ArrayRef("\xfe", 1));
103 EXPECT_FALSE(HasBOM
);
// Calls convertUTF16ToUTF8String with an ArrayRef<UTF16> argument instead of
// an ArrayRef<char>, and expects success and the same UTF-8 output as the
// byte-based tests.
// NOTE(review): the declaration of Result and the closing brace are not
// visible in this excerpt -- confirm against the complete file.
106 TEST(ConvertUTFTest
, UTF16WrappersForConvertUTF16ToUTF8String
) {
107 // Src is the look of disapproval.
// alignas(UTF16) makes the char buffer suitably aligned for the cast to
// const UTF16 * below.
108 alignas(UTF16
) static const char Src
[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
// Four UTF16 elements cover the eight payload bytes of Src (the two-byte
// mark plus three code units).
109 ArrayRef
<UTF16
> SrcRef
= ArrayRef((const UTF16
*)Src
, 4);
111 bool Success
= convertUTF16ToUTF8String(SrcRef
, Result
);
112 EXPECT_TRUE(Success
);
113 std::string
Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
114 EXPECT_EQ(Expected
, Result
);
// Calls ConvertUTF8toWide twice on the same UTF-8 text -- once through a
// const char * argument and once through a StringRef -- and expects the same
// wide string both times.
// NOTE(review): the declaration of Result and the closing brace are not
// visible in this excerpt -- confirm against the complete file.
117 TEST(ConvertUTFTest
, ConvertUTF8toWide
) {
118 // Src is the look of disapproval.
119 static const char Src
[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
// First call: pointer argument.
121 bool Success
= ConvertUTF8toWide((const char*)Src
, Result
);
122 EXPECT_TRUE(Success
);
123 std::wstring
Expected(L
"\x0ca0_\x0ca0");
124 EXPECT_EQ(Expected
, Result
);
// Second call: StringRef over the 7 payload bytes of Src (3 + 1 + 3).
126 Success
= ConvertUTF8toWide(StringRef(Src
, 7), Result
);
127 EXPECT_TRUE(Success
);
128 EXPECT_EQ(Expected
, Result
);
// Converts a wide string to UTF-8 with convertWideToUTF8 and expects success
// and the UTF-8 byte string in Expected.
// NOTE(review): the declaration of Result and the closing brace are not
// visible in this excerpt -- confirm against the complete file.
131 TEST(ConvertUTFTest
, convertWideToUTF8
) {
132 // Src is the look of disapproval.
133 static const wchar_t Src
[] = L
"\x0ca0_\x0ca0";
135 bool Success
= convertWideToUTF8(Src
, Result
);
136 EXPECT_TRUE(Success
);
// Expected is U+0CA0, '_', U+0CA0 encoded as UTF-8.
137 std::string
Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
138 EXPECT_EQ(Expected
, Result
);
// Bundles what a conversion is expected to produce: a ConversionResult code
// and the list of decoded scalar values.
// NOTE(review): the embedded numbering skips the lines between the push_back
// calls, and neither a return statement for withScalars nor the closing
// braces are visible in this excerpt -- confirm against the complete file.
141 struct ConvertUTFResultContainer
{
142 ConversionResult ErrorCode
;
143 std::vector
<unsigned> UnicodeScalars
;
// Stores the expected result code; UnicodeScalars starts out empty.
145 ConvertUTFResultContainer(ConversionResult ErrorCode
)
146 : ErrorCode(ErrorCode
) {}
// Works on a copy of *this (Result) and appends scalar values to the copy's
// UnicodeScalars, so the object it is called on is not modified by the
// visible statements. Every parameter defaults to 0x110000, which is one past
// U+10FFFF.
// NOTE(review): 0x110000 presumably acts as an "argument not supplied"
// marker; the conditions that would skip such arguments are not visible in
// this excerpt.
148 ConvertUTFResultContainer
149 withScalars(unsigned US0
= 0x110000, unsigned US1
= 0x110000,
150 unsigned US2
= 0x110000, unsigned US3
= 0x110000,
151 unsigned US4
= 0x110000, unsigned US5
= 0x110000,
152 unsigned US6
= 0x110000, unsigned US7
= 0x110000) {
153 ConvertUTFResultContainer
Result(*this);
155 Result
.UnicodeScalars
.push_back(US0
);
157 Result
.UnicodeScalars
.push_back(US1
);
159 Result
.UnicodeScalars
.push_back(US2
);
161 Result
.UnicodeScalars
.push_back(US3
);
163 Result
.UnicodeScalars
.push_back(US4
);
165 Result
.UnicodeScalars
.push_back(US5
);
167 Result
.UnicodeScalars
.push_back(US6
);
169 Result
.UnicodeScalars
.push_back(US7
);
// Decodes the UTF-8 bytes of S with ConvertUTF8toUTF32 using the
// lenientConversion flag and returns the pair (result code, decoded values).
// NOTE(review): ErrorCode is returned below, but the statement that declares
// it is not visible in this excerpt (the ConvertUTF8toUTF32 call appears
// without its left-hand side), and neither is the closing brace -- confirm
// against the complete file.
174 std::pair
<ConversionResult
, std::vector
<unsigned>>
175 ConvertUTF8ToUnicodeScalarsLenient(StringRef S
) {
176 const UTF8
*SourceStart
= reinterpret_cast<const UTF8
*>(S
.data());
// SourceNext is passed by address so the converter can advance it.
178 const UTF8
*SourceNext
= SourceStart
;
// One output slot per input byte, zero-initialised; the buffer is shrunk to
// the number of values actually written after the call.
179 std::vector
<UTF32
> Decoded(S
.size(), 0);
180 UTF32
*TargetStart
= Decoded
.data();
183 ConvertUTF8toUTF32(&SourceNext
, SourceStart
+ S
.size(), &TargetStart
,
184 Decoded
.data() + Decoded
.size(), lenientConversion
);
// TargetStart now points past the last written value; trim to that length.
186 Decoded
.resize(TargetStart
- Decoded
.data());
188 return std::make_pair(ErrorCode
, Decoded
);
// Same shape as ConvertUTF8ToUnicodeScalarsLenient, but decodes through
// ConvertUTF8toUTF32Partial. Returns the pair (result code, decoded values).
// NOTE(review): the closing brace is not visible in this excerpt -- confirm
// against the complete file.
191 std::pair
<ConversionResult
, std::vector
<unsigned>>
192 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S
) {
193 const UTF8
*SourceStart
= reinterpret_cast<const UTF8
*>(S
.data());
// SourceNext is passed by address so the converter can advance it.
195 const UTF8
*SourceNext
= SourceStart
;
// One output slot per input byte, zero-initialised; the buffer is shrunk to
// the number of values actually written after the call.
196 std::vector
<UTF32
> Decoded(S
.size(), 0);
197 UTF32
*TargetStart
= Decoded
.data();
199 auto ErrorCode
= ConvertUTF8toUTF32Partial(
200 &SourceNext
, SourceStart
+ S
.size(), &TargetStart
,
201 Decoded
.data() + Decoded
.size(), lenientConversion
);
// TargetStart now points past the last written value; trim to that length.
203 Decoded
.resize(TargetStart
- Decoded
.data());
205 return std::make_pair(ErrorCode
, Decoded
);
// GoogleTest predicate: decodes S and compares both the result code and the
// decoded scalar values with Expected. Returns AssertionSuccess() when both
// match, otherwise an AssertionFailure() carrying a descriptive message.
// Partial defaults to false.
// NOTE(review): both decode calls below appear back to back; the branch on
// Partial that presumably selects one of them is not visible in this excerpt.
// The end of the first failure message and the closing brace are not visible
// either -- confirm against the complete file.
208 ::testing::AssertionResult
209 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected
,
210 StringRef S
, bool Partial
= false) {
211 ConversionResult ErrorCode
;
212 std::vector
<unsigned> Decoded
;
// std::tie unpacks the returned pair into the two locals declared above.
214 std::tie(ErrorCode
, Decoded
) = ConvertUTF8ToUnicodeScalarsLenient(S
);
216 std::tie(ErrorCode
, Decoded
) = ConvertUTF8ToUnicodeScalarsPartialLenient(S
);
// The result code is checked first, so a mismatch there is reported before
// the decoded values are compared.
218 if (Expected
.ErrorCode
!= ErrorCode
)
219 return ::testing::AssertionFailure() << "Expected error code "
220 << Expected
.ErrorCode
<< ", actual "
// Element-wise comparison of the expected and decoded scalar sequences.
223 if (Expected
.UnicodeScalars
!= Decoded
)
224 return ::testing::AssertionFailure()
225 << "Expected lenient decoded result:\n"
226 << ::testing::PrintToString(Expected
.UnicodeScalars
) << "\n"
227 << "Actual result:\n" << ::testing::PrintToString(Decoded
);
229 return ::testing::AssertionSuccess();
232 TEST(ConvertUTFTest
, UTF8ToUTF32Lenient
) {
238 // U+0041 LATIN CAPITAL LETTER A
239 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
240 ConvertUTFResultContainer(conversionOK
).withScalars(0x0041), "\x41"));
246 // U+0283 LATIN SMALL LETTER ESH
247 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
248 ConvertUTFResultContainer(conversionOK
).withScalars(0x0283),
251 // U+03BA GREEK SMALL LETTER KAPPA
252 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
253 // U+03C3 GREEK SMALL LETTER SIGMA
254 // U+03BC GREEK SMALL LETTER MU
255 // U+03B5 GREEK SMALL LETTER EPSILON
256 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
257 ConvertUTFResultContainer(conversionOK
)
258 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
259 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
265 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
266 // U+6587 CJK UNIFIED IDEOGRAPH-6587
267 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
268 ConvertUTFResultContainer(conversionOK
).withScalars(0x4f8b, 0x6587),
269 "\xe4\xbe\x8b\xe6\x96\x87"));
271 // U+D55C HANGUL SYLLABLE HAN
272 // U+AE00 HANGUL SYLLABLE GEUL
273 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
274 ConvertUTFResultContainer(conversionOK
).withScalars(0xd55c, 0xae00),
275 "\xed\x95\x9c\xea\xb8\x80"));
277 // U+1112 HANGUL CHOSEONG HIEUH
278 // U+1161 HANGUL JUNGSEONG A
279 // U+11AB HANGUL JONGSEONG NIEUN
280 // U+1100 HANGUL CHOSEONG KIYEOK
281 // U+1173 HANGUL JUNGSEONG EU
282 // U+11AF HANGUL JONGSEONG RIEUL
283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
284 ConvertUTFResultContainer(conversionOK
)
285 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
286 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
293 // U+E0100 VARIATION SELECTOR-17
294 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
295 ConvertUTFResultContainer(conversionOK
).withScalars(0x000E0100),
296 "\xf3\xa0\x84\x80"));
299 // First possible sequence of a certain length
303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
304 ConvertUTFResultContainer(conversionOK
).withScalars(0x0000),
305 StringRef("\x00", 1)));
307 // U+0080 PADDING CHARACTER
308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
309 ConvertUTFResultContainer(conversionOK
).withScalars(0x0080),
312 // U+0800 SAMARITAN LETTER ALAF
313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
314 ConvertUTFResultContainer(conversionOK
).withScalars(0x0800),
317 // U+10000 LINEAR B SYLLABLE B008 A
318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
319 ConvertUTFResultContainer(conversionOK
).withScalars(0x10000),
320 "\xf0\x90\x80\x80"));
322 // U+200000 (invalid)
323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
324 ConvertUTFResultContainer(sourceIllegal
)
325 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
326 "\xf8\x88\x80\x80\x80"));
328 // U+4000000 (invalid)
329 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
330 ConvertUTFResultContainer(sourceIllegal
)
331 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
332 "\xfc\x84\x80\x80\x80\x80"));
335 // Last possible sequence of a certain length
339 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
340 ConvertUTFResultContainer(conversionOK
).withScalars(0x007f), "\x7f"));
342 // U+07FF (unassigned)
343 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
344 ConvertUTFResultContainer(conversionOK
).withScalars(0x07ff),
347 // U+FFFF (noncharacter)
348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
349 ConvertUTFResultContainer(conversionOK
).withScalars(0xffff),
352 // U+1FFFFF (invalid)
353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
354 ConvertUTFResultContainer(sourceIllegal
)
355 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
356 "\xf7\xbf\xbf\xbf"));
358 // U+3FFFFFF (invalid)
359 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
360 ConvertUTFResultContainer(sourceIllegal
)
361 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
362 "\xfb\xbf\xbf\xbf\xbf"));
364 // U+7FFFFFFF (invalid)
365 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
366 ConvertUTFResultContainer(sourceIllegal
)
367 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
368 "\xfd\xbf\xbf\xbf\xbf\xbf"));
371 // Other boundary conditions
374 // U+D7FF (unassigned)
375 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
376 ConvertUTFResultContainer(conversionOK
).withScalars(0xd7ff),
379 // U+E000 (private use)
380 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
381 ConvertUTFResultContainer(conversionOK
).withScalars(0xe000),
384 // U+FFFD REPLACEMENT CHARACTER
385 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
386 ConvertUTFResultContainer(conversionOK
).withScalars(0xfffd),
389 // U+10FFFF (noncharacter)
390 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
391 ConvertUTFResultContainer(conversionOK
).withScalars(0x10ffff),
392 "\xf4\x8f\xbf\xbf"));
394 // U+110000 (invalid)
395 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
396 ConvertUTFResultContainer(sourceIllegal
)
397 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
398 "\xf4\x90\x80\x80"));
401 // Unexpected continuation bytes
404 // A sequence of unexpected continuation bytes that don't follow a first
405 // byte, every byte is a maximal subpart.
407 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
408 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\x80"));
409 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
410 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xbf"));
411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
412 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
414 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
415 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
417 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
418 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
420 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
421 ConvertUTFResultContainer(sourceIllegal
)
422 .withScalars(0xfffd, 0xfffd, 0xfffd),
424 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
425 ConvertUTFResultContainer(sourceIllegal
)
426 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
427 "\x80\xbf\x80\xbf"));
428 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
429 ConvertUTFResultContainer(sourceIllegal
)
430 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
431 "\x80\xbf\x82\xbf\xaa"));
432 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
433 ConvertUTFResultContainer(sourceIllegal
)
434 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
435 "\xaa\xb0\xbb\xbf\xaa\xa0"));
436 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
437 ConvertUTFResultContainer(sourceIllegal
)
438 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
439 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
441 // All continuation bytes (0x80--0xbf).
442 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
443 ConvertUTFResultContainer(sourceIllegal
)
444 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
445 0xfffd, 0xfffd, 0xfffd, 0xfffd)
446 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
447 0xfffd, 0xfffd, 0xfffd, 0xfffd)
448 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
449 0xfffd, 0xfffd, 0xfffd, 0xfffd)
450 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
451 0xfffd, 0xfffd, 0xfffd, 0xfffd)
452 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
453 0xfffd, 0xfffd, 0xfffd, 0xfffd)
454 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
455 0xfffd, 0xfffd, 0xfffd, 0xfffd)
456 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
457 0xfffd, 0xfffd, 0xfffd, 0xfffd)
458 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
459 0xfffd, 0xfffd, 0xfffd, 0xfffd),
460 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
461 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
462 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
463 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
466 // Lonely start bytes
469 // Start bytes of 2-byte sequences (0xc0--0xdf).
470 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
471 ConvertUTFResultContainer(sourceIllegal
)
472 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
473 0xfffd, 0xfffd, 0xfffd, 0xfffd)
474 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
475 0xfffd, 0xfffd, 0xfffd, 0xfffd)
476 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
477 0xfffd, 0xfffd, 0xfffd, 0xfffd)
478 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
479 0xfffd, 0xfffd, 0xfffd, 0xfffd),
480 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
481 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
484 ConvertUTFResultContainer(sourceIllegal
)
485 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
486 0xfffd, 0x0020, 0xfffd, 0x0020)
487 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
488 0xfffd, 0x0020, 0xfffd, 0x0020)
489 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
490 0xfffd, 0x0020, 0xfffd, 0x0020)
491 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
492 0xfffd, 0x0020, 0xfffd, 0x0020)
493 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
494 0xfffd, 0x0020, 0xfffd, 0x0020)
495 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
496 0xfffd, 0x0020, 0xfffd, 0x0020)
497 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
498 0xfffd, 0x0020, 0xfffd, 0x0020)
499 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
500 0xfffd, 0x0020, 0xfffd, 0x0020),
501 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
502 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
503 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
504 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
506 // Start bytes of 3-byte sequences (0xe0--0xef).
507 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
508 ConvertUTFResultContainer(sourceIllegal
)
509 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
510 0xfffd, 0xfffd, 0xfffd, 0xfffd)
511 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
512 0xfffd, 0xfffd, 0xfffd, 0xfffd),
513 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
515 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
516 ConvertUTFResultContainer(sourceIllegal
)
517 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
518 0xfffd, 0x0020, 0xfffd, 0x0020)
519 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
520 0xfffd, 0x0020, 0xfffd, 0x0020)
521 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
522 0xfffd, 0x0020, 0xfffd, 0x0020)
523 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
524 0xfffd, 0x0020, 0xfffd, 0x0020),
525 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
526 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
528 // Start bytes of 4-byte sequences (0xf0--0xf7).
529 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
530 ConvertUTFResultContainer(sourceIllegal
)
531 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
532 0xfffd, 0xfffd, 0xfffd, 0xfffd),
533 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
535 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
536 ConvertUTFResultContainer(sourceIllegal
)
537 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
538 0xfffd, 0x0020, 0xfffd, 0x0020)
539 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
540 0xfffd, 0x0020, 0xfffd, 0x0020),
541 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
543 // Start bytes of 5-byte sequences (0xf8--0xfb).
544 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
545 ConvertUTFResultContainer(sourceIllegal
)
546 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
547 "\xf8\xf9\xfa\xfb"));
549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
550 ConvertUTFResultContainer(sourceIllegal
)
551 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
552 0xfffd, 0x0020, 0xfffd, 0x0020),
553 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
555 // Start bytes of 6-byte sequences (0xfc--0xfd).
556 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
557 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
560 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
561 ConvertUTFResultContainer(sourceIllegal
)
562 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
563 "\xfc\x20\xfd\x20"));
566 // Other bytes (0xc0--0xc1, 0xfe--0xff).
569 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
570 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xc0"));
571 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
572 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xc1"));
573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
574 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfe"));
575 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
576 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xff"));
578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
579 ConvertUTFResultContainer(sourceIllegal
)
580 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
581 "\xc0\xc1\xfe\xff"));
583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
584 ConvertUTFResultContainer(sourceIllegal
)
585 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
586 "\xfe\xfe\xff\xff"));
588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
589 ConvertUTFResultContainer(sourceIllegal
)
590 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
591 "\xfe\x80\x80\x80\x80\x80"));
593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594 ConvertUTFResultContainer(sourceIllegal
)
595 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
596 "\xff\x80\x80\x80\x80\x80"));
598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
599 ConvertUTFResultContainer(sourceIllegal
)
600 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
601 0xfffd, 0x0020, 0xfffd, 0x0020),
602 "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
605 // Sequences with one continuation byte missing
608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
609 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xc2"));
610 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
611 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xdf"));
612 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
613 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
615 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
616 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
621 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
622 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
624 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
625 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
627 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
628 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
630 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
631 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
633 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
634 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
636 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
637 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
639 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
640 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
642 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
643 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
645 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
646 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
648 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
649 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
651 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
652 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
655 // Overlong sequences with one trailing byte missing.
656 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
657 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
659 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
660 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
662 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
663 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
665 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
666 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
669 ConvertUTFResultContainer(sourceIllegal
)
670 .withScalars(0xfffd, 0xfffd, 0xfffd),
672 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
673 ConvertUTFResultContainer(sourceIllegal
)
674 .withScalars(0xfffd, 0xfffd, 0xfffd),
676 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
677 ConvertUTFResultContainer(sourceIllegal
)
678 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
679 "\xf8\x80\x80\x80"));
680 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
681 ConvertUTFResultContainer(sourceIllegal
)
682 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
683 "\xfc\x80\x80\x80\x80"));
685 // Sequences that represent surrogates with one trailing byte missing.
687 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
688 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
690 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
691 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
693 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
694 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
697 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
698 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
700 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
701 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
703 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
704 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
707 // Ill-formed 4-byte sequences.
708 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
709 // U+1100xx (invalid)
710 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
711 ConvertUTFResultContainer(sourceIllegal
)
712 .withScalars(0xfffd, 0xfffd, 0xfffd),
714 // U+13FBxx (invalid)
715 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
716 ConvertUTFResultContainer(sourceIllegal
)
717 .withScalars(0xfffd, 0xfffd, 0xfffd),
719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720 ConvertUTFResultContainer(sourceIllegal
)
721 .withScalars(0xfffd, 0xfffd, 0xfffd),
723 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
724 ConvertUTFResultContainer(sourceIllegal
)
725 .withScalars(0xfffd, 0xfffd, 0xfffd),
727 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
728 ConvertUTFResultContainer(sourceIllegal
)
729 .withScalars(0xfffd, 0xfffd, 0xfffd),
731 // U+1FFBxx (invalid)
732 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
733 ConvertUTFResultContainer(sourceIllegal
)
734 .withScalars(0xfffd, 0xfffd, 0xfffd),
737 // Ill-formed 5-byte sequences.
738 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
739 // U+2000xx (invalid)
740 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
741 ConvertUTFResultContainer(sourceIllegal
)
742 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
743 "\xf8\x88\x80\x80"));
744 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
745 ConvertUTFResultContainer(sourceIllegal
)
746 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
747 "\xf8\xbf\xbf\xbf"));
748 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
749 ConvertUTFResultContainer(sourceIllegal
)
750 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
751 "\xf9\x80\x80\x80"));
752 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
753 ConvertUTFResultContainer(sourceIllegal
)
754 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
755 "\xfa\x80\x80\x80"));
756 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
757 ConvertUTFResultContainer(sourceIllegal
)
758 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
759 "\xfb\x80\x80\x80"));
760 // U+3FFFFxx (invalid)
761 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
762 ConvertUTFResultContainer(sourceIllegal
)
763 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
764 "\xfb\xbf\xbf\xbf"));
766 // Ill-formed 6-byte sequences.
767 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
768 // U+40000xx (invalid)
769 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
770 ConvertUTFResultContainer(sourceIllegal
)
771 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
772 "\xfc\x84\x80\x80\x80"));
773 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
774 ConvertUTFResultContainer(sourceIllegal
)
775 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
776 "\xfc\xbf\xbf\xbf\xbf"));
777 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
778 ConvertUTFResultContainer(sourceIllegal
)
779 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
780 "\xfd\x80\x80\x80\x80"));
781 // U+7FFFFFxx (invalid)
782 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
783 ConvertUTFResultContainer(sourceIllegal
)
784 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
785 "\xfd\xbf\xbf\xbf\xbf"));
788 // Sequences with two continuation bytes missing
791 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
792 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
794 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
795 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
797 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
798 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
800 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
801 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
803 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
804 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
806 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
807 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
810 // Overlong sequences with two trailing bytes missing.
811 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
812 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xe0"));
813 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
816 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
817 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
819 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
820 ConvertUTFResultContainer(sourceIllegal
)
821 .withScalars(0xfffd, 0xfffd, 0xfffd),
823 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
824 ConvertUTFResultContainer(sourceIllegal
)
825 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
826 "\xfc\x80\x80\x80"));
828 // Sequences that represent surrogates with two trailing bytes missing.
829 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
830 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xed"));
832 // Ill-formed 4-byte sequences.
833 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
834 // U+110yxx (invalid)
835 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
836 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
838 // U+13Fyxx (invalid)
839 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
840 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
842 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
845 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
846 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
848 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
849 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
851 // U+1FFyxx (invalid)
852 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
853 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
856 // Ill-formed 5-byte sequences.
857 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
858 // U+200yxx (invalid)
859 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
860 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
862 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
863 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
865 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
866 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
868 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
869 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
871 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
872 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
874 // U+3FFFyxx (invalid)
875 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
876 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
879 // Ill-formed 6-byte sequences.
880 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
881 // U+4000yxx (invalid)
882 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
883 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
884 "\xfc\x84\x80\x80"));
885 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
886 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
887 "\xfc\xbf\xbf\xbf"));
888 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
889 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
890 "\xfd\x80\x80\x80"));
891 // U+7FFFFyxx (invalid)
892 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
893 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
894 "\xfd\xbf\xbf\xbf"));
897 // Sequences with three continuation bytes missing
900 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
901 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf0"));
902 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
903 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf1"));
904 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
905 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf2"));
906 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
907 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf3"));
908 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
909 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf4"));
911 // Broken overlong sequences.
912 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
913 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf0"));
914 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
915 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
917 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
918 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
921 // Ill-formed 4-byte sequences.
922 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
923 // U+14yyxx (invalid)
924 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
925 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf5"));
926 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
927 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf6"));
928 // U+1Cyyxx (invalid)
929 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
930 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf7"));
932 // Ill-formed 5-byte sequences.
933 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
934 // U+20yyxx (invalid)
935 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
936 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
938 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
939 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
941 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
942 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
944 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
945 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
947 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
948 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
950 // U+3FCyyxx (invalid)
951 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
952 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
955 // Ill-formed 6-byte sequences.
956 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
957 // U+400yyxx (invalid)
958 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
959 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
961 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
962 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
964 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
965 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
967 // U+7FFCyyxx (invalid)
968 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
969 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
973 // Sequences with four continuation bytes missing
976 // Ill-formed 5-byte sequences.
977 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
978 // U+uzyyxx (invalid)
979 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
980 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf8"));
981 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
982 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf9"));
983 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
984 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfa"));
985 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
986 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfb"));
987 // U+3zyyxx (invalid)
988 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
989 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfb"));
991 // Broken overlong sequences.
992 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
993 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf8"));
994 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
995 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
998 // Ill-formed 6-byte sequences.
999 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
1000 // U+uzzyyxx (invalid)
1001 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1002 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1004 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1005 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1007 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1008 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1010 // U+7Fzzyyxx (invalid)
1011 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1012 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1016 // Sequences with five continuation bytes missing
1019 // Ill-formed 6-byte sequences.
1020 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
1021 // U+uzzyyxx (invalid)
1022 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1023 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfc"));
1024 // U+uuzzyyxx (invalid)
1025 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1026 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfd"));
1029 // Consecutive sequences with trailing bytes missing
1032 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1033 ConvertUTFResultContainer(sourceIllegal
)
1034 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1035 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1036 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
1037 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1038 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1039 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1040 "\xc0" "\xe0\x80" "\xf0\x80\x80"
1042 "\xfc\x80\x80\x80\x80"
1043 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
1045 "\xfd\xbf\xbf\xbf\xbf"));
1048 // Overlong UTF-8 sequences
1052 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1053 ConvertUTFResultContainer(conversionOK
).withScalars(0x002f), "\x2f"));
1055 // Overlong sequences of the above.
1056 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1057 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1059 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1060 ConvertUTFResultContainer(sourceIllegal
)
1061 .withScalars(0xfffd, 0xfffd, 0xfffd),
1063 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1064 ConvertUTFResultContainer(sourceIllegal
)
1065 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1066 "\xf0\x80\x80\xaf"));
1067 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1068 ConvertUTFResultContainer(sourceIllegal
)
1069 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1070 "\xf8\x80\x80\x80\xaf"));
1071 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1072 ConvertUTFResultContainer(sourceIllegal
)
1073 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1074 "\xfc\x80\x80\x80\x80\xaf"));
1077 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1078 ConvertUTFResultContainer(conversionOK
).withScalars(0x0000),
1079 StringRef("\x00", 1)));
1081 // Overlong sequences of the above.
1082 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1083 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1085 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1086 ConvertUTFResultContainer(sourceIllegal
)
1087 .withScalars(0xfffd, 0xfffd, 0xfffd),
1089 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1090 ConvertUTFResultContainer(sourceIllegal
)
1091 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1092 "\xf0\x80\x80\x80"));
1093 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1094 ConvertUTFResultContainer(sourceIllegal
)
1095 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1096 "\xf8\x80\x80\x80\x80"));
1097 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1098 ConvertUTFResultContainer(sourceIllegal
)
1099 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1100 "\xfc\x80\x80\x80\x80\x80"));
1102 // Other overlong sequences.
1103 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1104 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1106 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1109 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1110 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1112 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113 ConvertUTFResultContainer(sourceIllegal
)
1114 .withScalars(0xfffd, 0xfffd, 0xfffd),
1116 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1117 ConvertUTFResultContainer(sourceIllegal
)
1118 .withScalars(0xfffd, 0xfffd, 0xfffd),
1120 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1121 ConvertUTFResultContainer(sourceIllegal
)
1122 .withScalars(0xfffd, 0xfffd, 0xfffd),
1124 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1125 ConvertUTFResultContainer(sourceIllegal
)
1126 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1127 "\xf0\x8f\x80\x80"));
1128 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1129 ConvertUTFResultContainer(sourceIllegal
)
1130 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1131 "\xf0\x8f\xbf\xbf"));
1132 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133 ConvertUTFResultContainer(sourceIllegal
)
1134 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1135 "\xf8\x87\xbf\xbf\xbf"));
1136 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1137 ConvertUTFResultContainer(sourceIllegal
)
1138 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1139 "\xfc\x83\xbf\xbf\xbf\xbf"));
1142 // Isolated surrogates
1147 // D71. High-surrogate code point: A Unicode code point in the range
1148 // U+D800 to U+DBFF.
1150 // D73. Low-surrogate code point: A Unicode code point in the range
1151 // U+DC00 to U+DFFF.
1153 // Note: U+E0100 is <DB40 DD00> in UTF16.
1158 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1159 ConvertUTFResultContainer(sourceIllegal
)
1160 .withScalars(0xfffd, 0xfffd, 0xfffd),
1164 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1165 ConvertUTFResultContainer(sourceIllegal
)
1166 .withScalars(0xfffd, 0xfffd, 0xfffd),
1170 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1171 ConvertUTFResultContainer(sourceIllegal
)
1172 .withScalars(0xfffd, 0xfffd, 0xfffd),
1178 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1179 ConvertUTFResultContainer(sourceIllegal
)
1180 .withScalars(0xfffd, 0xfffd, 0xfffd),
1184 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1185 ConvertUTFResultContainer(sourceIllegal
)
1186 .withScalars(0xfffd, 0xfffd, 0xfffd),
1190 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1191 ConvertUTFResultContainer(sourceIllegal
)
1192 .withScalars(0xfffd, 0xfffd, 0xfffd),
1198 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1199 ConvertUTFResultContainer(sourceIllegal
)
1200 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1201 "\xed\xa0\x80\xed\xb0\x80"));
1204 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1205 ConvertUTFResultContainer(sourceIllegal
)
1206 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1207 "\xed\xa0\x80\xed\xb4\x80"));
1210 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1211 ConvertUTFResultContainer(sourceIllegal
)
1212 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1213 "\xed\xa0\x80\xed\xbf\xbf"));
1216 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1217 ConvertUTFResultContainer(sourceIllegal
)
1218 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1219 "\xed\xac\xa0\xed\xb0\x80"));
1222 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1223 ConvertUTFResultContainer(sourceIllegal
)
1224 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1225 "\xed\xac\xa0\xed\xb4\x80"));
1228 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1229 ConvertUTFResultContainer(sourceIllegal
)
1230 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1231 "\xed\xac\xa0\xed\xbf\xbf"));
1234 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1235 ConvertUTFResultContainer(sourceIllegal
)
1236 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1237 "\xed\xaf\xbf\xed\xb0\x80"));
1240 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1241 ConvertUTFResultContainer(sourceIllegal
)
1242 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1243 "\xed\xaf\xbf\xed\xb4\x80"));
1246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1247 ConvertUTFResultContainer(sourceIllegal
)
1248 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1249 "\xed\xaf\xbf\xed\xbf\xbf"));
1257 // D14. Noncharacter: A code point that is permanently reserved for
1258 // internal use and that should never be interchanged. Noncharacters
1259 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1260 // and the values U+FDD0..U+FDEF.
1263 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1264 ConvertUTFResultContainer(conversionOK
).withScalars(0xfffe),
1268 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1269 ConvertUTFResultContainer(conversionOK
).withScalars(0xffff),
1273 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1274 ConvertUTFResultContainer(conversionOK
).withScalars(0x1fffe),
1275 "\xf0\x9f\xbf\xbe"));
1278 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1279 ConvertUTFResultContainer(conversionOK
).withScalars(0x1ffff),
1280 "\xf0\x9f\xbf\xbf"));
1283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1284 ConvertUTFResultContainer(conversionOK
).withScalars(0x2fffe),
1285 "\xf0\xaf\xbf\xbe"));
1288 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1289 ConvertUTFResultContainer(conversionOK
).withScalars(0x2ffff),
1290 "\xf0\xaf\xbf\xbf"));
1293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1294 ConvertUTFResultContainer(conversionOK
).withScalars(0x3fffe),
1295 "\xf0\xbf\xbf\xbe"));
1298 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1299 ConvertUTFResultContainer(conversionOK
).withScalars(0x3ffff),
1300 "\xf0\xbf\xbf\xbf"));
1303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1304 ConvertUTFResultContainer(conversionOK
).withScalars(0x4fffe),
1305 "\xf1\x8f\xbf\xbe"));
1308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1309 ConvertUTFResultContainer(conversionOK
).withScalars(0x4ffff),
1310 "\xf1\x8f\xbf\xbf"));
1313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1314 ConvertUTFResultContainer(conversionOK
).withScalars(0x5fffe),
1315 "\xf1\x9f\xbf\xbe"));
1318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1319 ConvertUTFResultContainer(conversionOK
).withScalars(0x5ffff),
1320 "\xf1\x9f\xbf\xbf"));
1323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1324 ConvertUTFResultContainer(conversionOK
).withScalars(0x6fffe),
1325 "\xf1\xaf\xbf\xbe"));
1328 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1329 ConvertUTFResultContainer(conversionOK
).withScalars(0x6ffff),
1330 "\xf1\xaf\xbf\xbf"));
1333 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1334 ConvertUTFResultContainer(conversionOK
).withScalars(0x7fffe),
1335 "\xf1\xbf\xbf\xbe"));
1338 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1339 ConvertUTFResultContainer(conversionOK
).withScalars(0x7ffff),
1340 "\xf1\xbf\xbf\xbf"));
1343 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1344 ConvertUTFResultContainer(conversionOK
).withScalars(0x8fffe),
1345 "\xf2\x8f\xbf\xbe"));
1348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1349 ConvertUTFResultContainer(conversionOK
).withScalars(0x8ffff),
1350 "\xf2\x8f\xbf\xbf"));
1353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1354 ConvertUTFResultContainer(conversionOK
).withScalars(0x9fffe),
1355 "\xf2\x9f\xbf\xbe"));
1358 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1359 ConvertUTFResultContainer(conversionOK
).withScalars(0x9ffff),
1360 "\xf2\x9f\xbf\xbf"));
1363 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1364 ConvertUTFResultContainer(conversionOK
).withScalars(0xafffe),
1365 "\xf2\xaf\xbf\xbe"));
1368 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1369 ConvertUTFResultContainer(conversionOK
).withScalars(0xaffff),
1370 "\xf2\xaf\xbf\xbf"));
1373 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1374 ConvertUTFResultContainer(conversionOK
).withScalars(0xbfffe),
1375 "\xf2\xbf\xbf\xbe"));
1378 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1379 ConvertUTFResultContainer(conversionOK
).withScalars(0xbffff),
1380 "\xf2\xbf\xbf\xbf"));
1383 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1384 ConvertUTFResultContainer(conversionOK
).withScalars(0xcfffe),
1385 "\xf3\x8f\xbf\xbe"));
1388 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1389 ConvertUTFResultContainer(conversionOK
).withScalars(0xcfffF),
1390 "\xf3\x8f\xbf\xbf"));
1393 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1394 ConvertUTFResultContainer(conversionOK
).withScalars(0xdfffe),
1395 "\xf3\x9f\xbf\xbe"));
1398 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1399 ConvertUTFResultContainer(conversionOK
).withScalars(0xdffff),
1400 "\xf3\x9f\xbf\xbf"));
1403 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1404 ConvertUTFResultContainer(conversionOK
).withScalars(0xefffe),
1405 "\xf3\xaf\xbf\xbe"));
1408 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1409 ConvertUTFResultContainer(conversionOK
).withScalars(0xeffff),
1410 "\xf3\xaf\xbf\xbf"));
1413 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1414 ConvertUTFResultContainer(conversionOK
).withScalars(0xffffe),
1415 "\xf3\xbf\xbf\xbe"));
1418 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1419 ConvertUTFResultContainer(conversionOK
).withScalars(0xfffff),
1420 "\xf3\xbf\xbf\xbf"));
1423 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1424 ConvertUTFResultContainer(conversionOK
).withScalars(0x10fffe),
1425 "\xf4\x8f\xbf\xbe"));
1428 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1429 ConvertUTFResultContainer(conversionOK
).withScalars(0x10ffff),
1430 "\xf4\x8f\xbf\xbf"));
1433 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1434 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd0),
1438 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1439 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd1),
1443 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1444 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd2),
1448 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1449 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd3),
1453 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1454 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd4),
1458 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1459 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd5),
1463 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1464 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd6),
1468 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1469 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd7),
1473 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1474 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd8),
1478 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1479 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd9),
1483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1484 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdda),
1488 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1489 ConvertUTFResultContainer(conversionOK
).withScalars(0xfddb),
1493 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1494 ConvertUTFResultContainer(conversionOK
).withScalars(0xfddc),
1498 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1499 ConvertUTFResultContainer(conversionOK
).withScalars(0xfddd),
1503 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1504 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdde),
1508 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1509 ConvertUTFResultContainer(conversionOK
).withScalars(0xfddf),
1513 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1514 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde0),
1518 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1519 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde1),
1523 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1524 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde2),
1528 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1529 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde3),
1533 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1534 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde4),
1538 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1539 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde5),
1543 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1544 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde6),
1548 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1549 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde7),
1553 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1554 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde8),
1558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1559 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde9),
1563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1564 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdea),
1568 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1569 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdeb),
1573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1574 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdec),
1578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1579 ConvertUTFResultContainer(conversionOK
).withScalars(0xfded),
1583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1584 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdee),
1588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1589 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdef),
1593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1594 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf0),
1598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1599 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf1),
1603 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1604 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf2),
1608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1609 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf3),
1613 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1614 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf4),
1618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1619 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf5),
1623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1624 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf6),
1628 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1629 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf7),
1633 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1634 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf8),
1638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1639 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf9),
1643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1644 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdfa),
1648 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1649 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdfb),
1653 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1654 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdfc),
1658 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1659 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdfd),
1663 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1664 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdfe),
1668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1669 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdff),
1673 TEST(ConvertUTFTest
, UTF8ToUTF32PartialLenient
) {
1674 // U+0041 LATIN CAPITAL LETTER A
1675 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1676 ConvertUTFResultContainer(conversionOK
).withScalars(0x0041),
1680 // Sequences with one continuation byte missing
1683 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1684 ConvertUTFResultContainer(sourceExhausted
),
1686 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1687 ConvertUTFResultContainer(sourceExhausted
),
1689 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1690 ConvertUTFResultContainer(sourceExhausted
),
1692 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1693 ConvertUTFResultContainer(sourceExhausted
),
1695 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1696 ConvertUTFResultContainer(sourceExhausted
),
1698 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1699 ConvertUTFResultContainer(sourceExhausted
),
1701 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1702 ConvertUTFResultContainer(sourceExhausted
),
1704 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1705 ConvertUTFResultContainer(sourceExhausted
),
1707 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1708 ConvertUTFResultContainer(sourceExhausted
),
1710 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1711 ConvertUTFResultContainer(sourceExhausted
),
1713 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1714 ConvertUTFResultContainer(sourceExhausted
),
1715 "\xf0\x90\x80", true));
1716 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1717 ConvertUTFResultContainer(sourceExhausted
),
1718 "\xf0\xbf\xbf", true));
1719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1720 ConvertUTFResultContainer(sourceExhausted
),
1721 "\xf1\x80\x80", true));
1722 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1723 ConvertUTFResultContainer(sourceExhausted
),
1724 "\xf3\xbf\xbf", true));
1725 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1726 ConvertUTFResultContainer(sourceExhausted
),
1727 "\xf4\x80\x80", true));
1728 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1729 ConvertUTFResultContainer(sourceExhausted
),
1730 "\xf4\x8f\xbf", true));
1732 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1733 ConvertUTFResultContainer(sourceExhausted
).withScalars(0x0041),