1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "llvm/Support/ConvertUTF.h"
10 #include "llvm/ADT/ArrayRef.h"
11 #include "gtest/gtest.h"
17 TEST(ConvertUTFTest
, ConvertUTF16LittleEndianToUTF8String
) {
18 // Src is the look of disapproval.
19 static const char Src
[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
20 ArrayRef
<char> Ref(Src
, sizeof(Src
) - 1);
22 bool Success
= convertUTF16ToUTF8String(Ref
, Result
);
24 std::string
Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
25 EXPECT_EQ(Expected
, Result
);
28 TEST(ConvertUTFTest
, ConvertUTF16BigEndianToUTF8String
) {
29 // Src is the look of disapproval.
30 static const char Src
[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
31 ArrayRef
<char> Ref(Src
, sizeof(Src
) - 1);
33 bool Success
= convertUTF16ToUTF8String(Ref
, Result
);
35 std::string
Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
36 EXPECT_EQ(Expected
, Result
);
39 TEST(ConvertUTFTest
, ConvertUTF8ToUTF16String
) {
40 // Src is the look of disapproval.
41 static const char Src
[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
42 StringRef
Ref(Src
, sizeof(Src
) - 1);
43 SmallVector
<UTF16
, 5> Result
;
44 bool Success
= convertUTF8ToUTF16String(Ref
, Result
);
46 static const UTF16 Expected
[] = {0x0CA0, 0x005f, 0x0CA0, 0};
47 ASSERT_EQ(3u, Result
.size());
48 for (int I
= 0, E
= 3; I
!= E
; ++I
)
49 EXPECT_EQ(Expected
[I
], Result
[I
]);
52 TEST(ConvertUTFTest
, OddLengthInput
) {
54 bool Success
= convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result
);
55 EXPECT_FALSE(Success
);
58 TEST(ConvertUTFTest
, Empty
) {
60 bool Success
= convertUTF16ToUTF8String(llvm::ArrayRef
<char>(None
), Result
);
62 EXPECT_TRUE(Result
.empty());
65 TEST(ConvertUTFTest
, HasUTF16BOM
) {
66 bool HasBOM
= hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
68 HasBOM
= hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
70 HasBOM
= hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
71 EXPECT_TRUE(HasBOM
); // Don't care about odd lengths.
72 HasBOM
= hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
75 HasBOM
= hasUTF16ByteOrderMark(None
);
77 HasBOM
= hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
81 TEST(ConvertUTFTest
, UTF16WrappersForConvertUTF16ToUTF8String
) {
82 // Src is the look of disapproval.
83 static const char Src
[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
84 ArrayRef
<UTF16
> SrcRef
= makeArrayRef((const UTF16
*)Src
, 4);
86 bool Success
= convertUTF16ToUTF8String(SrcRef
, Result
);
88 std::string
Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
89 EXPECT_EQ(Expected
, Result
);
92 TEST(ConvertUTFTest
, ConvertUTF8toWide
) {
93 // Src is the look of disapproval.
94 static const char Src
[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
96 bool Success
= ConvertUTF8toWide((const char*)Src
, Result
);
98 std::wstring
Expected(L
"\x0ca0_\x0ca0");
99 EXPECT_EQ(Expected
, Result
);
101 Success
= ConvertUTF8toWide(StringRef(Src
, 7), Result
);
102 EXPECT_TRUE(Success
);
103 EXPECT_EQ(Expected
, Result
);
106 TEST(ConvertUTFTest
, convertWideToUTF8
) {
107 // Src is the look of disapproval.
108 static const wchar_t Src
[] = L
"\x0ca0_\x0ca0";
110 bool Success
= convertWideToUTF8(Src
, Result
);
111 EXPECT_TRUE(Success
);
112 std::string
Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
113 EXPECT_EQ(Expected
, Result
);
116 struct ConvertUTFResultContainer
{
117 ConversionResult ErrorCode
;
118 std::vector
<unsigned> UnicodeScalars
;
120 ConvertUTFResultContainer(ConversionResult ErrorCode
)
121 : ErrorCode(ErrorCode
) {}
123 ConvertUTFResultContainer
124 withScalars(unsigned US0
= 0x110000, unsigned US1
= 0x110000,
125 unsigned US2
= 0x110000, unsigned US3
= 0x110000,
126 unsigned US4
= 0x110000, unsigned US5
= 0x110000,
127 unsigned US6
= 0x110000, unsigned US7
= 0x110000) {
128 ConvertUTFResultContainer
Result(*this);
130 Result
.UnicodeScalars
.push_back(US0
);
132 Result
.UnicodeScalars
.push_back(US1
);
134 Result
.UnicodeScalars
.push_back(US2
);
136 Result
.UnicodeScalars
.push_back(US3
);
138 Result
.UnicodeScalars
.push_back(US4
);
140 Result
.UnicodeScalars
.push_back(US5
);
142 Result
.UnicodeScalars
.push_back(US6
);
144 Result
.UnicodeScalars
.push_back(US7
);
149 std::pair
<ConversionResult
, std::vector
<unsigned>>
150 ConvertUTF8ToUnicodeScalarsLenient(StringRef S
) {
151 const UTF8
*SourceStart
= reinterpret_cast<const UTF8
*>(S
.data());
153 const UTF8
*SourceNext
= SourceStart
;
154 std::vector
<UTF32
> Decoded(S
.size(), 0);
155 UTF32
*TargetStart
= Decoded
.data();
158 ConvertUTF8toUTF32(&SourceNext
, SourceStart
+ S
.size(), &TargetStart
,
159 Decoded
.data() + Decoded
.size(), lenientConversion
);
161 Decoded
.resize(TargetStart
- Decoded
.data());
163 return std::make_pair(ErrorCode
, Decoded
);
166 std::pair
<ConversionResult
, std::vector
<unsigned>>
167 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S
) {
168 const UTF8
*SourceStart
= reinterpret_cast<const UTF8
*>(S
.data());
170 const UTF8
*SourceNext
= SourceStart
;
171 std::vector
<UTF32
> Decoded(S
.size(), 0);
172 UTF32
*TargetStart
= Decoded
.data();
174 auto ErrorCode
= ConvertUTF8toUTF32Partial(
175 &SourceNext
, SourceStart
+ S
.size(), &TargetStart
,
176 Decoded
.data() + Decoded
.size(), lenientConversion
);
178 Decoded
.resize(TargetStart
- Decoded
.data());
180 return std::make_pair(ErrorCode
, Decoded
);
183 ::testing::AssertionResult
184 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected
,
185 StringRef S
, bool Partial
= false) {
186 ConversionResult ErrorCode
;
187 std::vector
<unsigned> Decoded
;
189 std::tie(ErrorCode
, Decoded
) = ConvertUTF8ToUnicodeScalarsLenient(S
);
191 std::tie(ErrorCode
, Decoded
) = ConvertUTF8ToUnicodeScalarsPartialLenient(S
);
193 if (Expected
.ErrorCode
!= ErrorCode
)
194 return ::testing::AssertionFailure() << "Expected error code "
195 << Expected
.ErrorCode
<< ", actual "
198 if (Expected
.UnicodeScalars
!= Decoded
)
199 return ::testing::AssertionFailure()
200 << "Expected lenient decoded result:\n"
201 << ::testing::PrintToString(Expected
.UnicodeScalars
) << "\n"
202 << "Actual result:\n" << ::testing::PrintToString(Decoded
);
204 return ::testing::AssertionSuccess();
207 TEST(ConvertUTFTest
, UTF8ToUTF32Lenient
) {
213 // U+0041 LATIN CAPITAL LETTER A
214 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
215 ConvertUTFResultContainer(conversionOK
).withScalars(0x0041), "\x41"));
221 // U+0283 LATIN SMALL LETTER ESH
222 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
223 ConvertUTFResultContainer(conversionOK
).withScalars(0x0283),
226 // U+03BA GREEK SMALL LETTER KAPPA
227 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
228 // U+03C3 GREEK SMALL LETTER SIGMA
229 // U+03BC GREEK SMALL LETTER MU
230 // U+03B5 GREEK SMALL LETTER EPSILON
231 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
232 ConvertUTFResultContainer(conversionOK
)
233 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
234 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
240 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
241 // U+6587 CJK UNIFIED IDEOGRAPH-6587
242 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
243 ConvertUTFResultContainer(conversionOK
).withScalars(0x4f8b, 0x6587),
244 "\xe4\xbe\x8b\xe6\x96\x87"));
246 // U+D55C HANGUL SYLLABLE HAN
247 // U+AE00 HANGUL SYLLABLE GEUL
248 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
249 ConvertUTFResultContainer(conversionOK
).withScalars(0xd55c, 0xae00),
250 "\xed\x95\x9c\xea\xb8\x80"));
252 // U+1112 HANGUL CHOSEONG HIEUH
253 // U+1161 HANGUL JUNGSEONG A
254 // U+11AB HANGUL JONGSEONG NIEUN
255 // U+1100 HANGUL CHOSEONG KIYEOK
256 // U+1173 HANGUL JUNGSEONG EU
257 // U+11AF HANGUL JONGSEONG RIEUL
258 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
259 ConvertUTFResultContainer(conversionOK
)
260 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
261 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
268 // U+E0100 VARIATION SELECTOR-17
269 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
270 ConvertUTFResultContainer(conversionOK
).withScalars(0x000E0100),
271 "\xf3\xa0\x84\x80"));
274 // First possible sequence of a certain length
278 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
279 ConvertUTFResultContainer(conversionOK
).withScalars(0x0000),
280 StringRef("\x00", 1)));
282 // U+0080 PADDING CHARACTER
283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
284 ConvertUTFResultContainer(conversionOK
).withScalars(0x0080),
287 // U+0800 SAMARITAN LETTER ALAF
288 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
289 ConvertUTFResultContainer(conversionOK
).withScalars(0x0800),
292 // U+10000 LINEAR B SYLLABLE B008 A
293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
294 ConvertUTFResultContainer(conversionOK
).withScalars(0x10000),
295 "\xf0\x90\x80\x80"));
297 // U+200000 (invalid)
298 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
299 ConvertUTFResultContainer(sourceIllegal
)
300 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
301 "\xf8\x88\x80\x80\x80"));
303 // U+4000000 (invalid)
304 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
305 ConvertUTFResultContainer(sourceIllegal
)
306 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
307 "\xfc\x84\x80\x80\x80\x80"));
310 // Last possible sequence of a certain length
314 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
315 ConvertUTFResultContainer(conversionOK
).withScalars(0x007f), "\x7f"));
317 // U+07FF (unassigned)
318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
319 ConvertUTFResultContainer(conversionOK
).withScalars(0x07ff),
322 // U+FFFF (noncharacter)
323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
324 ConvertUTFResultContainer(conversionOK
).withScalars(0xffff),
327 // U+1FFFFF (invalid)
328 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
329 ConvertUTFResultContainer(sourceIllegal
)
330 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
331 "\xf7\xbf\xbf\xbf"));
333 // U+3FFFFFF (invalid)
334 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
335 ConvertUTFResultContainer(sourceIllegal
)
336 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
337 "\xfb\xbf\xbf\xbf\xbf"));
339 // U+7FFFFFFF (invalid)
340 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
341 ConvertUTFResultContainer(sourceIllegal
)
342 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
343 "\xfd\xbf\xbf\xbf\xbf\xbf"));
346 // Other boundary conditions
349 // U+D7FF (unassigned)
350 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
351 ConvertUTFResultContainer(conversionOK
).withScalars(0xd7ff),
354 // U+E000 (private use)
355 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
356 ConvertUTFResultContainer(conversionOK
).withScalars(0xe000),
359 // U+FFFD REPLACEMENT CHARACTER
360 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
361 ConvertUTFResultContainer(conversionOK
).withScalars(0xfffd),
364 // U+10FFFF (noncharacter)
365 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
366 ConvertUTFResultContainer(conversionOK
).withScalars(0x10ffff),
367 "\xf4\x8f\xbf\xbf"));
369 // U+110000 (invalid)
370 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
371 ConvertUTFResultContainer(sourceIllegal
)
372 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
373 "\xf4\x90\x80\x80"));
376 // Unexpected continuation bytes
379 // A sequence of unexpected continuation bytes that don't follow a first
380 // byte, every byte is a maximal subpart.
382 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
383 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\x80"));
384 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
385 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xbf"));
386 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
387 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
389 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
390 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
392 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
393 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
395 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
396 ConvertUTFResultContainer(sourceIllegal
)
397 .withScalars(0xfffd, 0xfffd, 0xfffd),
399 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
400 ConvertUTFResultContainer(sourceIllegal
)
401 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
402 "\x80\xbf\x80\xbf"));
403 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
404 ConvertUTFResultContainer(sourceIllegal
)
405 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
406 "\x80\xbf\x82\xbf\xaa"));
407 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
408 ConvertUTFResultContainer(sourceIllegal
)
409 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
410 "\xaa\xb0\xbb\xbf\xaa\xa0"));
411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
412 ConvertUTFResultContainer(sourceIllegal
)
413 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
414 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
416 // All continuation bytes (0x80--0xbf).
417 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
418 ConvertUTFResultContainer(sourceIllegal
)
419 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
420 0xfffd, 0xfffd, 0xfffd, 0xfffd)
421 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
422 0xfffd, 0xfffd, 0xfffd, 0xfffd)
423 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
424 0xfffd, 0xfffd, 0xfffd, 0xfffd)
425 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
426 0xfffd, 0xfffd, 0xfffd, 0xfffd)
427 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
428 0xfffd, 0xfffd, 0xfffd, 0xfffd)
429 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
430 0xfffd, 0xfffd, 0xfffd, 0xfffd)
431 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
432 0xfffd, 0xfffd, 0xfffd, 0xfffd)
433 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
434 0xfffd, 0xfffd, 0xfffd, 0xfffd),
435 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
436 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
437 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
438 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
441 // Lonely start bytes
444 // Start bytes of 2-byte sequences (0xc0--0xdf).
445 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
446 ConvertUTFResultContainer(sourceIllegal
)
447 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
448 0xfffd, 0xfffd, 0xfffd, 0xfffd)
449 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
450 0xfffd, 0xfffd, 0xfffd, 0xfffd)
451 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
452 0xfffd, 0xfffd, 0xfffd, 0xfffd)
453 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
454 0xfffd, 0xfffd, 0xfffd, 0xfffd),
455 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
456 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
458 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
459 ConvertUTFResultContainer(sourceIllegal
)
460 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
461 0xfffd, 0x0020, 0xfffd, 0x0020)
462 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
463 0xfffd, 0x0020, 0xfffd, 0x0020)
464 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
465 0xfffd, 0x0020, 0xfffd, 0x0020)
466 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
467 0xfffd, 0x0020, 0xfffd, 0x0020)
468 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
469 0xfffd, 0x0020, 0xfffd, 0x0020)
470 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
471 0xfffd, 0x0020, 0xfffd, 0x0020)
472 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
473 0xfffd, 0x0020, 0xfffd, 0x0020)
474 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
475 0xfffd, 0x0020, 0xfffd, 0x0020),
476 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
477 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
478 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
479 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
481 // Start bytes of 3-byte sequences (0xe0--0xef).
482 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
483 ConvertUTFResultContainer(sourceIllegal
)
484 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
485 0xfffd, 0xfffd, 0xfffd, 0xfffd)
486 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
487 0xfffd, 0xfffd, 0xfffd, 0xfffd),
488 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
490 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
491 ConvertUTFResultContainer(sourceIllegal
)
492 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
493 0xfffd, 0x0020, 0xfffd, 0x0020)
494 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
495 0xfffd, 0x0020, 0xfffd, 0x0020)
496 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
497 0xfffd, 0x0020, 0xfffd, 0x0020)
498 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
499 0xfffd, 0x0020, 0xfffd, 0x0020),
500 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
501 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
503 // Start bytes of 4-byte sequences (0xf0--0xf7).
504 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
505 ConvertUTFResultContainer(sourceIllegal
)
506 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
507 0xfffd, 0xfffd, 0xfffd, 0xfffd),
508 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
510 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
511 ConvertUTFResultContainer(sourceIllegal
)
512 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
513 0xfffd, 0x0020, 0xfffd, 0x0020)
514 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
515 0xfffd, 0x0020, 0xfffd, 0x0020),
516 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
518 // Start bytes of 5-byte sequences (0xf8--0xfb).
519 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
520 ConvertUTFResultContainer(sourceIllegal
)
521 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
522 "\xf8\xf9\xfa\xfb"));
524 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
525 ConvertUTFResultContainer(sourceIllegal
)
526 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
527 0xfffd, 0x0020, 0xfffd, 0x0020),
528 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
530 // Start bytes of 6-byte sequences (0xfc--0xfd).
531 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
532 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
535 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
536 ConvertUTFResultContainer(sourceIllegal
)
537 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
538 "\xfc\x20\xfd\x20"));
541 // Other bytes (0xc0--0xc1, 0xfe--0xff).
544 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
545 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xc0"));
546 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
547 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xc1"));
548 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
549 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfe"));
550 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
551 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xff"));
553 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
554 ConvertUTFResultContainer(sourceIllegal
)
555 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
556 "\xc0\xc1\xfe\xff"));
558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
559 ConvertUTFResultContainer(sourceIllegal
)
560 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
561 "\xfe\xfe\xff\xff"));
563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
564 ConvertUTFResultContainer(sourceIllegal
)
565 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
566 "\xfe\x80\x80\x80\x80\x80"));
568 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
569 ConvertUTFResultContainer(sourceIllegal
)
570 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
571 "\xff\x80\x80\x80\x80\x80"));
573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
574 ConvertUTFResultContainer(sourceIllegal
)
575 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
576 0xfffd, 0x0020, 0xfffd, 0x0020),
577 "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
580 // Sequences with one continuation byte missing
583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
584 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xc2"));
585 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
586 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xdf"));
587 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
591 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
597 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
599 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
600 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
602 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
603 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
605 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
606 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
609 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
611 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
612 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
615 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
617 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
618 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
620 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
621 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
624 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
626 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
627 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
630 // Overlong sequences with one trailing byte missing.
631 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
632 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
634 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
635 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
637 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
638 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
640 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
641 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
644 ConvertUTFResultContainer(sourceIllegal
)
645 .withScalars(0xfffd, 0xfffd, 0xfffd),
647 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
648 ConvertUTFResultContainer(sourceIllegal
)
649 .withScalars(0xfffd, 0xfffd, 0xfffd),
651 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
652 ConvertUTFResultContainer(sourceIllegal
)
653 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
654 "\xf8\x80\x80\x80"));
655 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
656 ConvertUTFResultContainer(sourceIllegal
)
657 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
658 "\xfc\x80\x80\x80\x80"));
660 // Sequences that represent surrogates with one trailing byte missing.
662 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
663 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
665 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
666 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
669 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
672 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
673 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
675 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
676 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
678 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
679 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
682 // Ill-formed 4-byte sequences.
683 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
684 // U+1100xx (invalid)
685 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
686 ConvertUTFResultContainer(sourceIllegal
)
687 .withScalars(0xfffd, 0xfffd, 0xfffd),
689 // U+13FBxx (invalid)
690 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
691 ConvertUTFResultContainer(sourceIllegal
)
692 .withScalars(0xfffd, 0xfffd, 0xfffd),
694 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
695 ConvertUTFResultContainer(sourceIllegal
)
696 .withScalars(0xfffd, 0xfffd, 0xfffd),
698 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
699 ConvertUTFResultContainer(sourceIllegal
)
700 .withScalars(0xfffd, 0xfffd, 0xfffd),
702 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
703 ConvertUTFResultContainer(sourceIllegal
)
704 .withScalars(0xfffd, 0xfffd, 0xfffd),
706 // U+1FFBxx (invalid)
707 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
708 ConvertUTFResultContainer(sourceIllegal
)
709 .withScalars(0xfffd, 0xfffd, 0xfffd),
712 // Ill-formed 5-byte sequences.
713 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
714 // U+2000xx (invalid)
715 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
716 ConvertUTFResultContainer(sourceIllegal
)
717 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
718 "\xf8\x88\x80\x80"));
719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720 ConvertUTFResultContainer(sourceIllegal
)
721 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
722 "\xf8\xbf\xbf\xbf"));
723 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
724 ConvertUTFResultContainer(sourceIllegal
)
725 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
726 "\xf9\x80\x80\x80"));
727 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
728 ConvertUTFResultContainer(sourceIllegal
)
729 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
730 "\xfa\x80\x80\x80"));
731 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
732 ConvertUTFResultContainer(sourceIllegal
)
733 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
734 "\xfb\x80\x80\x80"));
735 // U+3FFFFxx (invalid)
736 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
737 ConvertUTFResultContainer(sourceIllegal
)
738 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
739 "\xfb\xbf\xbf\xbf"));
741 // Ill-formed 6-byte sequences.
742 // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx
743 // U+40000xx (invalid)
744 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
745 ConvertUTFResultContainer(sourceIllegal
)
746 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
747 "\xfc\x84\x80\x80\x80"));
748 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
749 ConvertUTFResultContainer(sourceIllegal
)
750 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
751 "\xfc\xbf\xbf\xbf\xbf"));
752 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
753 ConvertUTFResultContainer(sourceIllegal
)
754 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
755 "\xfd\x80\x80\x80\x80"));
756 // U+7FFFFFxx (invalid)
757 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
758 ConvertUTFResultContainer(sourceIllegal
)
759 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
760 "\xfd\xbf\xbf\xbf\xbf"));
763 // Sequences with two continuation bytes missing
766 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
767 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
769 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
770 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
772 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
773 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
775 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
776 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
778 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
779 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
781 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
782 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd),
785 // Overlong sequences with two trailing byte missing.
786 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
787 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xe0"));
788 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
789 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
791 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
792 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
794 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
795 ConvertUTFResultContainer(sourceIllegal
)
796 .withScalars(0xfffd, 0xfffd, 0xfffd),
798 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
799 ConvertUTFResultContainer(sourceIllegal
)
800 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
801 "\xfc\x80\x80\x80"));
803 // Sequences that represent surrogates with two trailing bytes missing.
804 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
805 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xed"));
807 // Ill-formed 4-byte sequences.
808 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
809 // U+110yxx (invalid)
810 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
813 // U+13Fyxx (invalid)
814 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
815 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
817 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
818 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
820 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
821 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
823 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
824 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
826 // U+1FFyxx (invalid)
827 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
828 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
831 // Ill-formed 5-byte sequences.
832 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
833 // U+200yxx (invalid)
834 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
837 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
838 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
840 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
841 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
843 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
844 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
846 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
847 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
849 // U+3FFFyxx (invalid)
850 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
851 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
854 // Ill-formed 6-byte sequences.
855 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
856 // U+4000yxx (invalid)
857 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
858 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
859 "\xfc\x84\x80\x80"));
860 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
861 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
862 "\xfc\xbf\xbf\xbf"));
863 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
864 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
865 "\xfd\x80\x80\x80"));
866 // U+7FFFFyxx (invalid)
867 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
868 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
869 "\xfd\xbf\xbf\xbf"));
872 // Sequences with three continuation bytes missing
875 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
876 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf0"));
877 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
878 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf1"));
879 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
880 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf2"));
881 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
882 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf3"));
883 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
884 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf4"));
886 // Broken overlong sequences.
887 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
888 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf0"));
889 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
890 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
892 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
893 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
896 // Ill-formed 4-byte sequences.
897 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
898 // U+14yyxx (invalid)
899 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
900 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf5"));
901 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
902 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf6"));
903 // U+1Cyyxx (invalid)
904 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
905 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf7"));
907 // Ill-formed 5-byte sequences.
908 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
909 // U+20yyxx (invalid)
910 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
911 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
913 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
914 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
916 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
917 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
919 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
920 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
922 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
923 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
925 // U+3FCyyxx (invalid)
926 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
927 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
930 // Ill-formed 6-byte sequences.
931 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
932 // U+400yyxx (invalid)
933 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
934 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
936 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
937 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
939 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
940 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
942 // U+7FFCyyxx (invalid)
943 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
944 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd, 0xfffd),
948 // Sequences with four continuation bytes missing
951 // Ill-formed 5-byte sequences.
952 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
953 // U+uzyyxx (invalid)
954 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
955 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf8"));
956 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
957 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf9"));
958 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
959 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfa"));
960 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
961 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfb"));
962 // U+3zyyxx (invalid)
963 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
964 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfb"));
966 // Broken overlong sequences.
967 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
968 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xf8"));
969 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
970 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
973 // Ill-formed 6-byte sequences.
974 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
975 // U+uzzyyxx (invalid)
976 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
977 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
979 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
980 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
982 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
983 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
985 // U+7Fzzyyxx (invalid)
986 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
987 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
991 // Sequences with five continuation bytes missing
994 // Ill-formed 6-byte sequences.
995 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
996 // U+uzzyyxx (invalid)
997 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
998 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfc"));
999 // U+uuzzyyxx (invalid)
1000 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1001 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd), "\xfd"));
1004 // Consecutive sequences with trailing bytes missing
1007 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1008 ConvertUTFResultContainer(sourceIllegal
)
1009 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1010 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1011 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
1012 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1013 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1014 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1015 "\xc0" "\xe0\x80" "\xf0\x80\x80"
1017 "\xfc\x80\x80\x80\x80"
1018 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
1020 "\xfd\xbf\xbf\xbf\xbf"));
1023 // Overlong UTF-8 sequences
1027 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1028 ConvertUTFResultContainer(conversionOK
).withScalars(0x002f), "\x2f"));
1030 // Overlong sequences of the above.
1031 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1034 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1035 ConvertUTFResultContainer(sourceIllegal
)
1036 .withScalars(0xfffd, 0xfffd, 0xfffd),
1038 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1039 ConvertUTFResultContainer(sourceIllegal
)
1040 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1041 "\xf0\x80\x80\xaf"));
1042 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1043 ConvertUTFResultContainer(sourceIllegal
)
1044 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1045 "\xf8\x80\x80\x80\xaf"));
1046 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1047 ConvertUTFResultContainer(sourceIllegal
)
1048 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1049 "\xfc\x80\x80\x80\x80\xaf"));
1052 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1053 ConvertUTFResultContainer(conversionOK
).withScalars(0x0000),
1054 StringRef("\x00", 1)));
1056 // Overlong sequences of the above.
1057 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1058 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1060 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1061 ConvertUTFResultContainer(sourceIllegal
)
1062 .withScalars(0xfffd, 0xfffd, 0xfffd),
1064 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1065 ConvertUTFResultContainer(sourceIllegal
)
1066 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1067 "\xf0\x80\x80\x80"));
1068 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1069 ConvertUTFResultContainer(sourceIllegal
)
1070 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1071 "\xf8\x80\x80\x80\x80"));
1072 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1073 ConvertUTFResultContainer(sourceIllegal
)
1074 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1075 "\xfc\x80\x80\x80\x80\x80"));
1077 // Other overlong sequences.
1078 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1079 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1081 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1082 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1084 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1085 ConvertUTFResultContainer(sourceIllegal
).withScalars(0xfffd, 0xfffd),
1087 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1088 ConvertUTFResultContainer(sourceIllegal
)
1089 .withScalars(0xfffd, 0xfffd, 0xfffd),
1091 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1092 ConvertUTFResultContainer(sourceIllegal
)
1093 .withScalars(0xfffd, 0xfffd, 0xfffd),
1095 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1096 ConvertUTFResultContainer(sourceIllegal
)
1097 .withScalars(0xfffd, 0xfffd, 0xfffd),
1099 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1100 ConvertUTFResultContainer(sourceIllegal
)
1101 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1102 "\xf0\x8f\x80\x80"));
1103 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1104 ConvertUTFResultContainer(sourceIllegal
)
1105 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1106 "\xf0\x8f\xbf\xbf"));
1107 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1108 ConvertUTFResultContainer(sourceIllegal
)
1109 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1110 "\xf8\x87\xbf\xbf\xbf"));
1111 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1112 ConvertUTFResultContainer(sourceIllegal
)
1113 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1114 "\xfc\x83\xbf\xbf\xbf\xbf"));
1117 // Isolated surrogates
1122 // D71. High-surrogate code point: A Unicode code point in the range
1123 // U+D800 to U+DBFF.
1125 // D73. Low-surrogate code point: A Unicode code point in the range
1126 // U+DC00 to U+DFFF.
1128 // Note: U+E0100 is <DB40 DD00> in UTF16.
1133 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1134 ConvertUTFResultContainer(sourceIllegal
)
1135 .withScalars(0xfffd, 0xfffd, 0xfffd),
1139 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1140 ConvertUTFResultContainer(sourceIllegal
)
1141 .withScalars(0xfffd, 0xfffd, 0xfffd),
1145 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1146 ConvertUTFResultContainer(sourceIllegal
)
1147 .withScalars(0xfffd, 0xfffd, 0xfffd),
1153 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1154 ConvertUTFResultContainer(sourceIllegal
)
1155 .withScalars(0xfffd, 0xfffd, 0xfffd),
1159 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1160 ConvertUTFResultContainer(sourceIllegal
)
1161 .withScalars(0xfffd, 0xfffd, 0xfffd),
1165 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1166 ConvertUTFResultContainer(sourceIllegal
)
1167 .withScalars(0xfffd, 0xfffd, 0xfffd),
1173 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1174 ConvertUTFResultContainer(sourceIllegal
)
1175 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1176 "\xed\xa0\x80\xed\xb0\x80"));
1179 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1180 ConvertUTFResultContainer(sourceIllegal
)
1181 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1182 "\xed\xa0\x80\xed\xb4\x80"));
1185 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1186 ConvertUTFResultContainer(sourceIllegal
)
1187 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1188 "\xed\xa0\x80\xed\xbf\xbf"));
1191 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1192 ConvertUTFResultContainer(sourceIllegal
)
1193 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1194 "\xed\xac\xa0\xed\xb0\x80"));
1197 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1198 ConvertUTFResultContainer(sourceIllegal
)
1199 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1200 "\xed\xac\xa0\xed\xb4\x80"));
1203 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1204 ConvertUTFResultContainer(sourceIllegal
)
1205 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1206 "\xed\xac\xa0\xed\xbf\xbf"));
1209 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1210 ConvertUTFResultContainer(sourceIllegal
)
1211 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1212 "\xed\xaf\xbf\xed\xb0\x80"));
1215 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1216 ConvertUTFResultContainer(sourceIllegal
)
1217 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1218 "\xed\xaf\xbf\xed\xb4\x80"));
1221 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1222 ConvertUTFResultContainer(sourceIllegal
)
1223 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1224 "\xed\xaf\xbf\xed\xbf\xbf"));
1232 // D14. Noncharacter: A code point that is permanently reserved for
1233 // internal use and that should never be interchanged. Noncharacters
1234 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1235 // and the values U+FDD0..U+FDEF.
1238 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1239 ConvertUTFResultContainer(conversionOK
).withScalars(0xfffe),
1243 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1244 ConvertUTFResultContainer(conversionOK
).withScalars(0xffff),
1248 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1249 ConvertUTFResultContainer(conversionOK
).withScalars(0x1fffe),
1250 "\xf0\x9f\xbf\xbe"));
1253 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1254 ConvertUTFResultContainer(conversionOK
).withScalars(0x1ffff),
1255 "\xf0\x9f\xbf\xbf"));
1258 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1259 ConvertUTFResultContainer(conversionOK
).withScalars(0x2fffe),
1260 "\xf0\xaf\xbf\xbe"));
1263 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1264 ConvertUTFResultContainer(conversionOK
).withScalars(0x2ffff),
1265 "\xf0\xaf\xbf\xbf"));
1268 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1269 ConvertUTFResultContainer(conversionOK
).withScalars(0x3fffe),
1270 "\xf0\xbf\xbf\xbe"));
1273 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1274 ConvertUTFResultContainer(conversionOK
).withScalars(0x3ffff),
1275 "\xf0\xbf\xbf\xbf"));
1278 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1279 ConvertUTFResultContainer(conversionOK
).withScalars(0x4fffe),
1280 "\xf1\x8f\xbf\xbe"));
1283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1284 ConvertUTFResultContainer(conversionOK
).withScalars(0x4ffff),
1285 "\xf1\x8f\xbf\xbf"));
1288 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1289 ConvertUTFResultContainer(conversionOK
).withScalars(0x5fffe),
1290 "\xf1\x9f\xbf\xbe"));
1293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1294 ConvertUTFResultContainer(conversionOK
).withScalars(0x5ffff),
1295 "\xf1\x9f\xbf\xbf"));
1298 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1299 ConvertUTFResultContainer(conversionOK
).withScalars(0x6fffe),
1300 "\xf1\xaf\xbf\xbe"));
1303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1304 ConvertUTFResultContainer(conversionOK
).withScalars(0x6ffff),
1305 "\xf1\xaf\xbf\xbf"));
1308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1309 ConvertUTFResultContainer(conversionOK
).withScalars(0x7fffe),
1310 "\xf1\xbf\xbf\xbe"));
1313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1314 ConvertUTFResultContainer(conversionOK
).withScalars(0x7ffff),
1315 "\xf1\xbf\xbf\xbf"));
1318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1319 ConvertUTFResultContainer(conversionOK
).withScalars(0x8fffe),
1320 "\xf2\x8f\xbf\xbe"));
1323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1324 ConvertUTFResultContainer(conversionOK
).withScalars(0x8ffff),
1325 "\xf2\x8f\xbf\xbf"));
1328 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1329 ConvertUTFResultContainer(conversionOK
).withScalars(0x9fffe),
1330 "\xf2\x9f\xbf\xbe"));
1333 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1334 ConvertUTFResultContainer(conversionOK
).withScalars(0x9ffff),
1335 "\xf2\x9f\xbf\xbf"));
1338 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1339 ConvertUTFResultContainer(conversionOK
).withScalars(0xafffe),
1340 "\xf2\xaf\xbf\xbe"));
1343 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1344 ConvertUTFResultContainer(conversionOK
).withScalars(0xaffff),
1345 "\xf2\xaf\xbf\xbf"));
1348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1349 ConvertUTFResultContainer(conversionOK
).withScalars(0xbfffe),
1350 "\xf2\xbf\xbf\xbe"));
1353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1354 ConvertUTFResultContainer(conversionOK
).withScalars(0xbffff),
1355 "\xf2\xbf\xbf\xbf"));
1358 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1359 ConvertUTFResultContainer(conversionOK
).withScalars(0xcfffe),
1360 "\xf3\x8f\xbf\xbe"));
1363 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1364 ConvertUTFResultContainer(conversionOK
).withScalars(0xcfffF),
1365 "\xf3\x8f\xbf\xbf"));
1368 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1369 ConvertUTFResultContainer(conversionOK
).withScalars(0xdfffe),
1370 "\xf3\x9f\xbf\xbe"));
1373 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1374 ConvertUTFResultContainer(conversionOK
).withScalars(0xdffff),
1375 "\xf3\x9f\xbf\xbf"));
1378 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1379 ConvertUTFResultContainer(conversionOK
).withScalars(0xefffe),
1380 "\xf3\xaf\xbf\xbe"));
1383 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1384 ConvertUTFResultContainer(conversionOK
).withScalars(0xeffff),
1385 "\xf3\xaf\xbf\xbf"));
1388 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1389 ConvertUTFResultContainer(conversionOK
).withScalars(0xffffe),
1390 "\xf3\xbf\xbf\xbe"));
1393 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1394 ConvertUTFResultContainer(conversionOK
).withScalars(0xfffff),
1395 "\xf3\xbf\xbf\xbf"));
1398 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1399 ConvertUTFResultContainer(conversionOK
).withScalars(0x10fffe),
1400 "\xf4\x8f\xbf\xbe"));
1403 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1404 ConvertUTFResultContainer(conversionOK
).withScalars(0x10ffff),
1405 "\xf4\x8f\xbf\xbf"));
1408 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1409 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd0),
1413 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1414 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd1),
1418 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1419 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd2),
1423 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1424 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd3),
1428 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1429 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd4),
1433 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1434 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd5),
1438 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1439 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd6),
1443 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1444 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd7),
1448 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1449 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd8),
1453 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1454 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdd9),
1458 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1459 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdda),
1463 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1464 ConvertUTFResultContainer(conversionOK
).withScalars(0xfddb),
1468 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1469 ConvertUTFResultContainer(conversionOK
).withScalars(0xfddc),
1473 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1474 ConvertUTFResultContainer(conversionOK
).withScalars(0xfddd),
1478 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1479 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdde),
1483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1484 ConvertUTFResultContainer(conversionOK
).withScalars(0xfddf),
1488 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1489 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde0),
1493 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1494 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde1),
1498 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1499 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde2),
1503 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1504 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde3),
1508 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1509 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde4),
1513 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1514 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde5),
1518 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1519 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde6),
1523 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1524 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde7),
1528 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1529 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde8),
1533 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1534 ConvertUTFResultContainer(conversionOK
).withScalars(0xfde9),
1538 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1539 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdea),
1543 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1544 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdeb),
1548 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1549 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdec),
1553 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1554 ConvertUTFResultContainer(conversionOK
).withScalars(0xfded),
1558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1559 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdee),
1563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1564 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdef),
1568 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1569 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf0),
1573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1574 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf1),
1578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1579 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf2),
1583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1584 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf3),
1588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1589 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf4),
1593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1594 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf5),
1598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1599 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf6),
1603 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1604 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf7),
1608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1609 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf8),
1613 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1614 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdf9),
1618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1619 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdfa),
1623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1624 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdfb),
1628 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1629 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdfc),
1633 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1634 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdfd),
1638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1639 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdfe),
1643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1644 ConvertUTFResultContainer(conversionOK
).withScalars(0xfdff),
1648 TEST(ConvertUTFTest
, UTF8ToUTF32PartialLenient
) {
1649 // U+0041 LATIN CAPITAL LETTER A
1650 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1651 ConvertUTFResultContainer(conversionOK
).withScalars(0x0041),
1655 // Sequences with one continuation byte missing
1658 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1659 ConvertUTFResultContainer(sourceExhausted
),
1661 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1662 ConvertUTFResultContainer(sourceExhausted
),
1664 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1665 ConvertUTFResultContainer(sourceExhausted
),
1667 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1668 ConvertUTFResultContainer(sourceExhausted
),
1670 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1671 ConvertUTFResultContainer(sourceExhausted
),
1673 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1674 ConvertUTFResultContainer(sourceExhausted
),
1676 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1677 ConvertUTFResultContainer(sourceExhausted
),
1679 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1680 ConvertUTFResultContainer(sourceExhausted
),
1682 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1683 ConvertUTFResultContainer(sourceExhausted
),
1685 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1686 ConvertUTFResultContainer(sourceExhausted
),
1688 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1689 ConvertUTFResultContainer(sourceExhausted
),
1690 "\xf0\x90\x80", true));
1691 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1692 ConvertUTFResultContainer(sourceExhausted
),
1693 "\xf0\xbf\xbf", true));
1694 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1695 ConvertUTFResultContainer(sourceExhausted
),
1696 "\xf1\x80\x80", true));
1697 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1698 ConvertUTFResultContainer(sourceExhausted
),
1699 "\xf3\xbf\xbf", true));
1700 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1701 ConvertUTFResultContainer(sourceExhausted
),
1702 "\xf4\x80\x80", true));
1703 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1704 ConvertUTFResultContainer(sourceExhausted
),
1705 "\xf4\x8f\xbf", true));
1707 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1708 ConvertUTFResultContainer(sourceExhausted
).withScalars(0x0041),