Revert r354244 "[DAGCombiner] Eliminate dead stores to stack."
[llvm-complete.git] / unittests / Support / ConvertUTFTest.cpp
blob83019722332d3c50fab2429835bddcc232aba060
1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "llvm/Support/ConvertUTF.h"
10 #include "llvm/ADT/ArrayRef.h"
11 #include "gtest/gtest.h"
12 #include <string>
13 #include <vector>
15 using namespace llvm;
17 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
18 // Src is the look of disapproval.
19 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
20 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
21 std::string Result;
22 bool Success = convertUTF16ToUTF8String(Ref, Result);
23 EXPECT_TRUE(Success);
24 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
25 EXPECT_EQ(Expected, Result);
28 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
29 // Src is the look of disapproval.
30 static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
31 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
32 std::string Result;
33 bool Success = convertUTF16ToUTF8String(Ref, Result);
34 EXPECT_TRUE(Success);
35 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
36 EXPECT_EQ(Expected, Result);
39 TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
40 // Src is the look of disapproval.
41 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
42 StringRef Ref(Src, sizeof(Src) - 1);
43 SmallVector<UTF16, 5> Result;
44 bool Success = convertUTF8ToUTF16String(Ref, Result);
45 EXPECT_TRUE(Success);
46 static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
47 ASSERT_EQ(3u, Result.size());
48 for (int I = 0, E = 3; I != E; ++I)
49 EXPECT_EQ(Expected[I], Result[I]);
52 TEST(ConvertUTFTest, OddLengthInput) {
53 std::string Result;
54 bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);
55 EXPECT_FALSE(Success);
58 TEST(ConvertUTFTest, Empty) {
59 std::string Result;
60 bool Success = convertUTF16ToUTF8String(llvm::ArrayRef<char>(None), Result);
61 EXPECT_TRUE(Success);
62 EXPECT_TRUE(Result.empty());
65 TEST(ConvertUTFTest, HasUTF16BOM) {
66 bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
67 EXPECT_TRUE(HasBOM);
68 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
69 EXPECT_TRUE(HasBOM);
70 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
71 EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
72 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
73 EXPECT_TRUE(HasBOM);
75 HasBOM = hasUTF16ByteOrderMark(None);
76 EXPECT_FALSE(HasBOM);
77 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
78 EXPECT_FALSE(HasBOM);
81 TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) {
82 // Src is the look of disapproval.
83 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
84 ArrayRef<UTF16> SrcRef = makeArrayRef((const UTF16 *)Src, 4);
85 std::string Result;
86 bool Success = convertUTF16ToUTF8String(SrcRef, Result);
87 EXPECT_TRUE(Success);
88 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
89 EXPECT_EQ(Expected, Result);
92 TEST(ConvertUTFTest, ConvertUTF8toWide) {
93 // Src is the look of disapproval.
94 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
95 std::wstring Result;
96 bool Success = ConvertUTF8toWide((const char*)Src, Result);
97 EXPECT_TRUE(Success);
98 std::wstring Expected(L"\x0ca0_\x0ca0");
99 EXPECT_EQ(Expected, Result);
100 Result.clear();
101 Success = ConvertUTF8toWide(StringRef(Src, 7), Result);
102 EXPECT_TRUE(Success);
103 EXPECT_EQ(Expected, Result);
106 TEST(ConvertUTFTest, convertWideToUTF8) {
107 // Src is the look of disapproval.
108 static const wchar_t Src[] = L"\x0ca0_\x0ca0";
109 std::string Result;
110 bool Success = convertWideToUTF8(Src, Result);
111 EXPECT_TRUE(Success);
112 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
113 EXPECT_EQ(Expected, Result);
116 struct ConvertUTFResultContainer {
117 ConversionResult ErrorCode;
118 std::vector<unsigned> UnicodeScalars;
120 ConvertUTFResultContainer(ConversionResult ErrorCode)
121 : ErrorCode(ErrorCode) {}
123 ConvertUTFResultContainer
124 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
125 unsigned US2 = 0x110000, unsigned US3 = 0x110000,
126 unsigned US4 = 0x110000, unsigned US5 = 0x110000,
127 unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
128 ConvertUTFResultContainer Result(*this);
129 if (US0 != 0x110000)
130 Result.UnicodeScalars.push_back(US0);
131 if (US1 != 0x110000)
132 Result.UnicodeScalars.push_back(US1);
133 if (US2 != 0x110000)
134 Result.UnicodeScalars.push_back(US2);
135 if (US3 != 0x110000)
136 Result.UnicodeScalars.push_back(US3);
137 if (US4 != 0x110000)
138 Result.UnicodeScalars.push_back(US4);
139 if (US5 != 0x110000)
140 Result.UnicodeScalars.push_back(US5);
141 if (US6 != 0x110000)
142 Result.UnicodeScalars.push_back(US6);
143 if (US7 != 0x110000)
144 Result.UnicodeScalars.push_back(US7);
145 return Result;
149 std::pair<ConversionResult, std::vector<unsigned>>
150 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
151 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
153 const UTF8 *SourceNext = SourceStart;
154 std::vector<UTF32> Decoded(S.size(), 0);
155 UTF32 *TargetStart = Decoded.data();
157 auto ErrorCode =
158 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
159 Decoded.data() + Decoded.size(), lenientConversion);
161 Decoded.resize(TargetStart - Decoded.data());
163 return std::make_pair(ErrorCode, Decoded);
166 std::pair<ConversionResult, std::vector<unsigned>>
167 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
168 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
170 const UTF8 *SourceNext = SourceStart;
171 std::vector<UTF32> Decoded(S.size(), 0);
172 UTF32 *TargetStart = Decoded.data();
174 auto ErrorCode = ConvertUTF8toUTF32Partial(
175 &SourceNext, SourceStart + S.size(), &TargetStart,
176 Decoded.data() + Decoded.size(), lenientConversion);
178 Decoded.resize(TargetStart - Decoded.data());
180 return std::make_pair(ErrorCode, Decoded);
183 ::testing::AssertionResult
184 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
185 StringRef S, bool Partial = false) {
186 ConversionResult ErrorCode;
187 std::vector<unsigned> Decoded;
188 if (!Partial)
189 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
190 else
191 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
193 if (Expected.ErrorCode != ErrorCode)
194 return ::testing::AssertionFailure() << "Expected error code "
195 << Expected.ErrorCode << ", actual "
196 << ErrorCode;
198 if (Expected.UnicodeScalars != Decoded)
199 return ::testing::AssertionFailure()
200 << "Expected lenient decoded result:\n"
201 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
202 << "Actual result:\n" << ::testing::PrintToString(Decoded);
204 return ::testing::AssertionSuccess();
207 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
210 // 1-byte sequences
213 // U+0041 LATIN CAPITAL LETTER A
214 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
215 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
218 // 2-byte sequences
221 // U+0283 LATIN SMALL LETTER ESH
222 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
223 ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
224 "\xca\x83"));
226 // U+03BA GREEK SMALL LETTER KAPPA
227 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
228 // U+03C3 GREEK SMALL LETTER SIGMA
229 // U+03BC GREEK SMALL LETTER MU
230 // U+03B5 GREEK SMALL LETTER EPSILON
231 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
232 ConvertUTFResultContainer(conversionOK)
233 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
234 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
237 // 3-byte sequences
240 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
241 // U+6587 CJK UNIFIED IDEOGRAPH-6587
242 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
243 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
244 "\xe4\xbe\x8b\xe6\x96\x87"));
246 // U+D55C HANGUL SYLLABLE HAN
247 // U+AE00 HANGUL SYLLABLE GEUL
248 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
249 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
250 "\xed\x95\x9c\xea\xb8\x80"));
252 // U+1112 HANGUL CHOSEONG HIEUH
253 // U+1161 HANGUL JUNGSEONG A
254 // U+11AB HANGUL JONGSEONG NIEUN
255 // U+1100 HANGUL CHOSEONG KIYEOK
256 // U+1173 HANGUL JUNGSEONG EU
257 // U+11AF HANGUL JONGSEONG RIEUL
258 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
259 ConvertUTFResultContainer(conversionOK)
260 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
261 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
262 "\xe1\x86\xaf"));
265 // 4-byte sequences
268 // U+E0100 VARIATION SELECTOR-17
269 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
270 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
271 "\xf3\xa0\x84\x80"));
274 // First possible sequence of a certain length
277 // U+0000 NULL
278 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
279 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
280 StringRef("\x00", 1)));
282 // U+0080 PADDING CHARACTER
283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
284 ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
285 "\xc2\x80"));
287 // U+0800 SAMARITAN LETTER ALAF
288 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
289 ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
290 "\xe0\xa0\x80"));
292 // U+10000 LINEAR B SYLLABLE B008 A
293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
294 ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
295 "\xf0\x90\x80\x80"));
297 // U+200000 (invalid)
298 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
299 ConvertUTFResultContainer(sourceIllegal)
300 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
301 "\xf8\x88\x80\x80\x80"));
303 // U+4000000 (invalid)
304 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
305 ConvertUTFResultContainer(sourceIllegal)
306 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
307 "\xfc\x84\x80\x80\x80\x80"));
310 // Last possible sequence of a certain length
313 // U+007F DELETE
314 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
315 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
317 // U+07FF (unassigned)
318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
319 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
320 "\xdf\xbf"));
322 // U+FFFF (noncharacter)
323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
324 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
325 "\xef\xbf\xbf"));
327 // U+1FFFFF (invalid)
328 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
329 ConvertUTFResultContainer(sourceIllegal)
330 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
331 "\xf7\xbf\xbf\xbf"));
333 // U+3FFFFFF (invalid)
334 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
335 ConvertUTFResultContainer(sourceIllegal)
336 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
337 "\xfb\xbf\xbf\xbf\xbf"));
339 // U+7FFFFFFF (invalid)
340 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
341 ConvertUTFResultContainer(sourceIllegal)
342 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
343 "\xfd\xbf\xbf\xbf\xbf\xbf"));
346 // Other boundary conditions
349 // U+D7FF (unassigned)
350 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
351 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
352 "\xed\x9f\xbf"));
354 // U+E000 (private use)
355 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
356 ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
357 "\xee\x80\x80"));
359 // U+FFFD REPLACEMENT CHARACTER
360 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
361 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
362 "\xef\xbf\xbd"));
364 // U+10FFFF (noncharacter)
365 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
366 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
367 "\xf4\x8f\xbf\xbf"));
369 // U+110000 (invalid)
370 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
371 ConvertUTFResultContainer(sourceIllegal)
372 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
373 "\xf4\x90\x80\x80"));
376 // Unexpected continuation bytes
379 // A sequence of unexpected continuation bytes that don't follow a first
380 // byte, every byte is a maximal subpart.
382 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
383 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
384 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
385 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
386 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
387 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
388 "\x80\x80"));
389 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
390 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
391 "\x80\xbf"));
392 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
393 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
394 "\xbf\x80"));
395 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
396 ConvertUTFResultContainer(sourceIllegal)
397 .withScalars(0xfffd, 0xfffd, 0xfffd),
398 "\x80\xbf\x80"));
399 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
400 ConvertUTFResultContainer(sourceIllegal)
401 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
402 "\x80\xbf\x80\xbf"));
403 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
404 ConvertUTFResultContainer(sourceIllegal)
405 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
406 "\x80\xbf\x82\xbf\xaa"));
407 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
408 ConvertUTFResultContainer(sourceIllegal)
409 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
410 "\xaa\xb0\xbb\xbf\xaa\xa0"));
411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
412 ConvertUTFResultContainer(sourceIllegal)
413 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
414 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
416 // All continuation bytes (0x80--0xbf).
417 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
418 ConvertUTFResultContainer(sourceIllegal)
419 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
420 0xfffd, 0xfffd, 0xfffd, 0xfffd)
421 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
422 0xfffd, 0xfffd, 0xfffd, 0xfffd)
423 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
424 0xfffd, 0xfffd, 0xfffd, 0xfffd)
425 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
426 0xfffd, 0xfffd, 0xfffd, 0xfffd)
427 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
428 0xfffd, 0xfffd, 0xfffd, 0xfffd)
429 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
430 0xfffd, 0xfffd, 0xfffd, 0xfffd)
431 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
432 0xfffd, 0xfffd, 0xfffd, 0xfffd)
433 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
434 0xfffd, 0xfffd, 0xfffd, 0xfffd),
435 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
436 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
437 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
438 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
441 // Lonely start bytes
444 // Start bytes of 2-byte sequences (0xc0--0xdf).
445 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
446 ConvertUTFResultContainer(sourceIllegal)
447 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
448 0xfffd, 0xfffd, 0xfffd, 0xfffd)
449 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
450 0xfffd, 0xfffd, 0xfffd, 0xfffd)
451 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
452 0xfffd, 0xfffd, 0xfffd, 0xfffd)
453 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
454 0xfffd, 0xfffd, 0xfffd, 0xfffd),
455 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
456 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
458 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
459 ConvertUTFResultContainer(sourceIllegal)
460 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
461 0xfffd, 0x0020, 0xfffd, 0x0020)
462 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
463 0xfffd, 0x0020, 0xfffd, 0x0020)
464 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
465 0xfffd, 0x0020, 0xfffd, 0x0020)
466 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
467 0xfffd, 0x0020, 0xfffd, 0x0020)
468 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
469 0xfffd, 0x0020, 0xfffd, 0x0020)
470 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
471 0xfffd, 0x0020, 0xfffd, 0x0020)
472 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
473 0xfffd, 0x0020, 0xfffd, 0x0020)
474 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
475 0xfffd, 0x0020, 0xfffd, 0x0020),
476 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
477 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
478 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
479 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
481 // Start bytes of 3-byte sequences (0xe0--0xef).
482 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
483 ConvertUTFResultContainer(sourceIllegal)
484 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
485 0xfffd, 0xfffd, 0xfffd, 0xfffd)
486 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
487 0xfffd, 0xfffd, 0xfffd, 0xfffd),
488 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
490 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
491 ConvertUTFResultContainer(sourceIllegal)
492 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
493 0xfffd, 0x0020, 0xfffd, 0x0020)
494 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
495 0xfffd, 0x0020, 0xfffd, 0x0020)
496 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
497 0xfffd, 0x0020, 0xfffd, 0x0020)
498 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
499 0xfffd, 0x0020, 0xfffd, 0x0020),
500 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
501 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
503 // Start bytes of 4-byte sequences (0xf0--0xf7).
504 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
505 ConvertUTFResultContainer(sourceIllegal)
506 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
507 0xfffd, 0xfffd, 0xfffd, 0xfffd),
508 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
510 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
511 ConvertUTFResultContainer(sourceIllegal)
512 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
513 0xfffd, 0x0020, 0xfffd, 0x0020)
514 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
515 0xfffd, 0x0020, 0xfffd, 0x0020),
516 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
518 // Start bytes of 5-byte sequences (0xf8--0xfb).
519 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
520 ConvertUTFResultContainer(sourceIllegal)
521 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
522 "\xf8\xf9\xfa\xfb"));
524 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
525 ConvertUTFResultContainer(sourceIllegal)
526 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
527 0xfffd, 0x0020, 0xfffd, 0x0020),
528 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
530 // Start bytes of 6-byte sequences (0xfc--0xfd).
531 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
532 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
533 "\xfc\xfd"));
535 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
536 ConvertUTFResultContainer(sourceIllegal)
537 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
538 "\xfc\x20\xfd\x20"));
541 // Other bytes (0xc0--0xc1, 0xfe--0xff).
544 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
545 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
546 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
547 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
548 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
549 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
550 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
551 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
553 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
554 ConvertUTFResultContainer(sourceIllegal)
555 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
556 "\xc0\xc1\xfe\xff"));
558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
559 ConvertUTFResultContainer(sourceIllegal)
560 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
561 "\xfe\xfe\xff\xff"));
563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
564 ConvertUTFResultContainer(sourceIllegal)
565 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
566 "\xfe\x80\x80\x80\x80\x80"));
568 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
569 ConvertUTFResultContainer(sourceIllegal)
570 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
571 "\xff\x80\x80\x80\x80\x80"));
573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
574 ConvertUTFResultContainer(sourceIllegal)
575 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
576 0xfffd, 0x0020, 0xfffd, 0x0020),
577 "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
580 // Sequences with one continuation byte missing
583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
584 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
585 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
586 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
587 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
589 "\xe0\xa0"));
590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
591 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
592 "\xe0\xbf"));
593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
595 "\xe1\x80"));
596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
597 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
598 "\xec\xbf"));
599 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
600 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
601 "\xed\x80"));
602 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
603 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
604 "\xed\x9f"));
605 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
606 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
607 "\xee\x80"));
608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
609 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
610 "\xef\xbf"));
611 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
612 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
613 "\xf0\x90\x80"));
614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
615 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
616 "\xf0\xbf\xbf"));
617 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
618 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
619 "\xf1\x80\x80"));
620 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
621 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
622 "\xf3\xbf\xbf"));
623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
624 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
625 "\xf4\x80\x80"));
626 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
627 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
628 "\xf4\x8f\xbf"));
630 // Overlong sequences with one trailing byte missing.
631 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
632 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
633 "\xc0"));
634 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
635 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
636 "\xc1"));
637 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
638 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
639 "\xe0\x80"));
640 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
641 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
642 "\xe0\x9f"));
643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
644 ConvertUTFResultContainer(sourceIllegal)
645 .withScalars(0xfffd, 0xfffd, 0xfffd),
646 "\xf0\x80\x80"));
647 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
648 ConvertUTFResultContainer(sourceIllegal)
649 .withScalars(0xfffd, 0xfffd, 0xfffd),
650 "\xf0\x8f\x80"));
651 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
652 ConvertUTFResultContainer(sourceIllegal)
653 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
654 "\xf8\x80\x80\x80"));
655 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
656 ConvertUTFResultContainer(sourceIllegal)
657 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
658 "\xfc\x80\x80\x80\x80"));
660 // Sequences that represent surrogates with one trailing byte missing.
661 // High surrogates
662 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
663 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
664 "\xed\xa0"));
665 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
666 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
667 "\xed\xac"));
668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
669 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
670 "\xed\xaf"));
671 // Low surrogates
672 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
673 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
674 "\xed\xb0"));
675 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
676 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
677 "\xed\xb4"));
678 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
679 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
680 "\xed\xbf"));
682 // Ill-formed 4-byte sequences.
683 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
684 // U+1100xx (invalid)
685 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
686 ConvertUTFResultContainer(sourceIllegal)
687 .withScalars(0xfffd, 0xfffd, 0xfffd),
688 "\xf4\x90\x80"));
689 // U+13FBxx (invalid)
690 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
691 ConvertUTFResultContainer(sourceIllegal)
692 .withScalars(0xfffd, 0xfffd, 0xfffd),
693 "\xf4\xbf\xbf"));
694 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
695 ConvertUTFResultContainer(sourceIllegal)
696 .withScalars(0xfffd, 0xfffd, 0xfffd),
697 "\xf5\x80\x80"));
698 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
699 ConvertUTFResultContainer(sourceIllegal)
700 .withScalars(0xfffd, 0xfffd, 0xfffd),
701 "\xf6\x80\x80"));
702 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
703 ConvertUTFResultContainer(sourceIllegal)
704 .withScalars(0xfffd, 0xfffd, 0xfffd),
705 "\xf7\x80\x80"));
706 // U+1FFBxx (invalid)
707 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
708 ConvertUTFResultContainer(sourceIllegal)
709 .withScalars(0xfffd, 0xfffd, 0xfffd),
710 "\xf7\xbf\xbf"));
712 // Ill-formed 5-byte sequences.
713 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
714 // U+2000xx (invalid)
715 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
716 ConvertUTFResultContainer(sourceIllegal)
717 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
718 "\xf8\x88\x80\x80"));
719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720 ConvertUTFResultContainer(sourceIllegal)
721 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
722 "\xf8\xbf\xbf\xbf"));
723 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
724 ConvertUTFResultContainer(sourceIllegal)
725 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
726 "\xf9\x80\x80\x80"));
727 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
728 ConvertUTFResultContainer(sourceIllegal)
729 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
730 "\xfa\x80\x80\x80"));
731 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
732 ConvertUTFResultContainer(sourceIllegal)
733 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
734 "\xfb\x80\x80\x80"));
735 // U+3FFFFxx (invalid)
736 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
737 ConvertUTFResultContainer(sourceIllegal)
738 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
739 "\xfb\xbf\xbf\xbf"));
741 // Ill-formed 6-byte sequences.
742 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
743 // U+40000xx (invalid)
744 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
745 ConvertUTFResultContainer(sourceIllegal)
746 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
747 "\xfc\x84\x80\x80\x80"));
748 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
749 ConvertUTFResultContainer(sourceIllegal)
750 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
751 "\xfc\xbf\xbf\xbf\xbf"));
752 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
753 ConvertUTFResultContainer(sourceIllegal)
754 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
755 "\xfd\x80\x80\x80\x80"));
756 // U+7FFFFFxx (invalid)
757 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
758 ConvertUTFResultContainer(sourceIllegal)
759 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
760 "\xfd\xbf\xbf\xbf\xbf"));
763 // Sequences with two continuation bytes missing
766 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
767 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
768 "\xf0\x90"));
769 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
770 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
771 "\xf0\xbf"));
772 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
773 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
774 "\xf1\x80"));
775 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
776 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
777 "\xf3\xbf"));
778 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
779 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
780 "\xf4\x80"));
781 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
782 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
783 "\xf4\x8f"));
785 // Overlong sequences with two trailing bytes missing.
786 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
787 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
788 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
789 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
790 "\xf0\x80"));
791 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
792 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
793 "\xf0\x8f"));
794 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
795 ConvertUTFResultContainer(sourceIllegal)
796 .withScalars(0xfffd, 0xfffd, 0xfffd),
797 "\xf8\x80\x80"));
798 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
799 ConvertUTFResultContainer(sourceIllegal)
800 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
801 "\xfc\x80\x80\x80"));
803 // Sequences that represent surrogates with two trailing bytes missing.
804 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
805 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
807 // Ill-formed 4-byte sequences.
808 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
809 // U+110yxx (invalid)
810 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
812 "\xf4\x90"));
813 // U+13Fyxx (invalid)
814 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
815 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
816 "\xf4\xbf"));
817 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
818 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
819 "\xf5\x80"));
820 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
821 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
822 "\xf6\x80"));
823 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
824 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
825 "\xf7\x80"));
826 // U+1FFyxx (invalid)
827 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
828 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
829 "\xf7\xbf"));
831 // Ill-formed 5-byte sequences.
832 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
833 // U+200yxx (invalid)
834 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
836 "\xf8\x88\x80"));
837 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
838 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
839 "\xf8\xbf\xbf"));
840 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
841 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
842 "\xf9\x80\x80"));
843 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
844 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
845 "\xfa\x80\x80"));
846 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
847 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
848 "\xfb\x80\x80"));
849 // U+3FFFyxx (invalid)
850 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
851 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
852 "\xfb\xbf\xbf"));
854 // Ill-formed 6-byte sequences.
855 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
856 // U+4000yxx (invalid)
857 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
858 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
859 "\xfc\x84\x80\x80"));
860 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
861 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
862 "\xfc\xbf\xbf\xbf"));
863 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
864 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
865 "\xfd\x80\x80\x80"));
866 // U+7FFFFyxx (invalid)
867 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
868 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
869 "\xfd\xbf\xbf\xbf"));
872 // Sequences with three continuation bytes missing
875 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
876 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
877 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
878 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
879 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
880 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
881 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
882 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
883 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
884 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
886 // Broken overlong sequences.
887 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
888 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
889 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
890 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
891 "\xf8\x80"));
892 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
893 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
894 "\xfc\x80\x80"));
896 // Ill-formed 4-byte sequences.
897 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
898 // U+14yyxx (invalid)
899 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
900 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
901 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
902 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
903 // U+1Cyyxx (invalid)
904 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
905 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
907 // Ill-formed 5-byte sequences.
908 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
909 // U+20yyxx (invalid)
910 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
911 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
912 "\xf8\x88"));
913 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
914 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
915 "\xf8\xbf"));
916 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
917 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
918 "\xf9\x80"));
919 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
920 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
921 "\xfa\x80"));
922 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
923 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
924 "\xfb\x80"));
925 // U+3FCyyxx (invalid)
926 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
927 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
928 "\xfb\xbf"));
930 // Ill-formed 6-byte sequences.
931 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
932 // U+400yyxx (invalid)
933 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
934 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
935 "\xfc\x84\x80"));
936 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
937 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
938 "\xfc\xbf\xbf"));
939 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
940 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
941 "\xfd\x80\x80"));
942 // U+7FFCyyxx (invalid)
943 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
944 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
945 "\xfd\xbf\xbf"));
948 // Sequences with four continuation bytes missing
951 // Ill-formed 5-byte sequences.
952 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
953 // U+uzyyxx (invalid)
954 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
955 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
956 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
957 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
958 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
959 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
960 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
961 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
962 // U+3zyyxx (invalid)
963 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
964 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
966 // Broken overlong sequences.
967 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
968 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
969 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
970 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
971 "\xfc\x80"));
973 // Ill-formed 6-byte sequences.
974 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
975 // U+uzzyyxx (invalid)
976 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
977 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
978 "\xfc\x84"));
979 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
980 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
981 "\xfc\xbf"));
982 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
983 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
984 "\xfd\x80"));
985 // U+7Fzzyyxx (invalid)
986 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
987 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
988 "\xfd\xbf"));
991 // Sequences with five continuation bytes missing
994 // Ill-formed 6-byte sequences.
995 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
996 // U+uzzyyxx (invalid)
997 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
998 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
999 // U+uuzzyyxx (invalid)
1000 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1001 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
1004 // Consecutive sequences with trailing bytes missing
1007 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1008 ConvertUTFResultContainer(sourceIllegal)
1009 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1010 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1011 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
1012 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1013 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1014 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1015 "\xc0" "\xe0\x80" "\xf0\x80\x80"
1016 "\xf8\x80\x80\x80"
1017 "\xfc\x80\x80\x80\x80"
1018 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
1019 "\xfb\xbf\xbf\xbf"
1020 "\xfd\xbf\xbf\xbf\xbf"));
1023 // Overlong UTF-8 sequences
1026 // U+002F SOLIDUS
1027 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1028 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
1030 // Overlong sequences of the above.
1031 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1033 "\xc0\xaf"));
1034 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1035 ConvertUTFResultContainer(sourceIllegal)
1036 .withScalars(0xfffd, 0xfffd, 0xfffd),
1037 "\xe0\x80\xaf"));
1038 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1039 ConvertUTFResultContainer(sourceIllegal)
1040 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1041 "\xf0\x80\x80\xaf"));
1042 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1043 ConvertUTFResultContainer(sourceIllegal)
1044 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1045 "\xf8\x80\x80\x80\xaf"));
1046 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1047 ConvertUTFResultContainer(sourceIllegal)
1048 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1049 "\xfc\x80\x80\x80\x80\xaf"));
1051 // U+0000 NULL
1052 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1053 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1054 StringRef("\x00", 1)));
1056 // Overlong sequences of the above.
1057 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1058 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1059 "\xc0\x80"));
1060 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1061 ConvertUTFResultContainer(sourceIllegal)
1062 .withScalars(0xfffd, 0xfffd, 0xfffd),
1063 "\xe0\x80\x80"));
1064 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1065 ConvertUTFResultContainer(sourceIllegal)
1066 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1067 "\xf0\x80\x80\x80"));
1068 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1069 ConvertUTFResultContainer(sourceIllegal)
1070 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1071 "\xf8\x80\x80\x80\x80"));
1072 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1073 ConvertUTFResultContainer(sourceIllegal)
1074 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1075 "\xfc\x80\x80\x80\x80\x80"));
1077 // Other overlong sequences.
1078 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1079 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1080 "\xc0\xbf"));
1081 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1082 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1083 "\xc1\x80"));
1084 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1085 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1086 "\xc1\xbf"));
1087 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1088 ConvertUTFResultContainer(sourceIllegal)
1089 .withScalars(0xfffd, 0xfffd, 0xfffd),
1090 "\xe0\x9f\xbf"));
1091 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1092 ConvertUTFResultContainer(sourceIllegal)
1093 .withScalars(0xfffd, 0xfffd, 0xfffd),
1094 "\xed\xa0\x80"));
1095 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1096 ConvertUTFResultContainer(sourceIllegal)
1097 .withScalars(0xfffd, 0xfffd, 0xfffd),
1098 "\xed\xbf\xbf"));
1099 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1100 ConvertUTFResultContainer(sourceIllegal)
1101 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1102 "\xf0\x8f\x80\x80"));
1103 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1104 ConvertUTFResultContainer(sourceIllegal)
1105 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1106 "\xf0\x8f\xbf\xbf"));
1107 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1108 ConvertUTFResultContainer(sourceIllegal)
1109 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1110 "\xf8\x87\xbf\xbf\xbf"));
1111 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1112 ConvertUTFResultContainer(sourceIllegal)
1113 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1114 "\xfc\x83\xbf\xbf\xbf\xbf"));
1117 // Isolated surrogates
1120 // Unicode 6.3.0:
1122 // D71. High-surrogate code point: A Unicode code point in the range
1123 // U+D800 to U+DBFF.
1125 // D73. Low-surrogate code point: A Unicode code point in the range
1126 // U+DC00 to U+DFFF.
1128 // Note: U+E0100 is <DB40 DD00> in UTF16.
1130 // High surrogates
1132 // U+D800
1133 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1134 ConvertUTFResultContainer(sourceIllegal)
1135 .withScalars(0xfffd, 0xfffd, 0xfffd),
1136 "\xed\xa0\x80"));
1138 // U+DB40
1139 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1140 ConvertUTFResultContainer(sourceIllegal)
1141 .withScalars(0xfffd, 0xfffd, 0xfffd),
1142 "\xed\xac\xa0"));
1144 // U+DBFF
1145 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1146 ConvertUTFResultContainer(sourceIllegal)
1147 .withScalars(0xfffd, 0xfffd, 0xfffd),
1148 "\xed\xaf\xbf"));
1150 // Low surrogates
1152 // U+DC00
1153 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1154 ConvertUTFResultContainer(sourceIllegal)
1155 .withScalars(0xfffd, 0xfffd, 0xfffd),
1156 "\xed\xb0\x80"));
1158 // U+DD00
1159 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1160 ConvertUTFResultContainer(sourceIllegal)
1161 .withScalars(0xfffd, 0xfffd, 0xfffd),
1162 "\xed\xb4\x80"));
1164 // U+DFFF
1165 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1166 ConvertUTFResultContainer(sourceIllegal)
1167 .withScalars(0xfffd, 0xfffd, 0xfffd),
1168 "\xed\xbf\xbf"));
1170 // Surrogate pairs
1172 // U+D800 U+DC00
1173 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1174 ConvertUTFResultContainer(sourceIllegal)
1175 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1176 "\xed\xa0\x80\xed\xb0\x80"));
1178 // U+D800 U+DD00
1179 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1180 ConvertUTFResultContainer(sourceIllegal)
1181 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1182 "\xed\xa0\x80\xed\xb4\x80"));
1184 // U+D800 U+DFFF
1185 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1186 ConvertUTFResultContainer(sourceIllegal)
1187 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1188 "\xed\xa0\x80\xed\xbf\xbf"));
1190 // U+DB40 U+DC00
1191 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1192 ConvertUTFResultContainer(sourceIllegal)
1193 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1194 "\xed\xac\xa0\xed\xb0\x80"));
1196 // U+DB40 U+DD00
1197 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1198 ConvertUTFResultContainer(sourceIllegal)
1199 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1200 "\xed\xac\xa0\xed\xb4\x80"));
1202 // U+DB40 U+DFFF
1203 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1204 ConvertUTFResultContainer(sourceIllegal)
1205 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1206 "\xed\xac\xa0\xed\xbf\xbf"));
1208 // U+DBFF U+DC00
1209 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1210 ConvertUTFResultContainer(sourceIllegal)
1211 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1212 "\xed\xaf\xbf\xed\xb0\x80"));
1214 // U+DBFF U+DD00
1215 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1216 ConvertUTFResultContainer(sourceIllegal)
1217 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1218 "\xed\xaf\xbf\xed\xb4\x80"));
1220 // U+DBFF U+DFFF
1221 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1222 ConvertUTFResultContainer(sourceIllegal)
1223 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1224 "\xed\xaf\xbf\xed\xbf\xbf"));
1227 // Noncharacters
1230 // Unicode 6.3.0:
1232 // D14. Noncharacter: A code point that is permanently reserved for
1233 // internal use and that should never be interchanged. Noncharacters
1234 // consist of the values U+nFFFE and U+nFFFF (where n is from 0x0 to 0x10)
1235 // and the values U+FDD0..U+FDEF.
1237 // U+FFFE
1238 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1239 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1240 "\xef\xbf\xbe"));
1242 // U+FFFF
1243 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1244 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1245 "\xef\xbf\xbf"));
1247 // U+1FFFE
1248 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1249 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1250 "\xf0\x9f\xbf\xbe"));
1252 // U+1FFFF
1253 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1254 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1255 "\xf0\x9f\xbf\xbf"));
1257 // U+2FFFE
1258 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1259 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1260 "\xf0\xaf\xbf\xbe"));
1262 // U+2FFFF
1263 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1264 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1265 "\xf0\xaf\xbf\xbf"));
1267 // U+3FFFE
1268 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1269 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1270 "\xf0\xbf\xbf\xbe"));
1272 // U+3FFFF
1273 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1274 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1275 "\xf0\xbf\xbf\xbf"));
1277 // U+4FFFE
1278 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1279 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1280 "\xf1\x8f\xbf\xbe"));
1282 // U+4FFFF
1283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1284 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1285 "\xf1\x8f\xbf\xbf"));
1287 // U+5FFFE
1288 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1289 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1290 "\xf1\x9f\xbf\xbe"));
1292 // U+5FFFF
1293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1294 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1295 "\xf1\x9f\xbf\xbf"));
1297 // U+6FFFE
1298 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1299 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1300 "\xf1\xaf\xbf\xbe"));
1302 // U+6FFFF
1303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1304 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1305 "\xf1\xaf\xbf\xbf"));
1307 // U+7FFFE
1308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1309 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1310 "\xf1\xbf\xbf\xbe"));
1312 // U+7FFFF
1313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1314 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1315 "\xf1\xbf\xbf\xbf"));
1317 // U+8FFFE
1318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1319 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1320 "\xf2\x8f\xbf\xbe"));
1322 // U+8FFFF
1323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1324 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1325 "\xf2\x8f\xbf\xbf"));
1327 // U+9FFFE
1328 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1329 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1330 "\xf2\x9f\xbf\xbe"));
1332 // U+9FFFF
1333 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1334 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1335 "\xf2\x9f\xbf\xbf"));
1337 // U+AFFFE
1338 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1339 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1340 "\xf2\xaf\xbf\xbe"));
1342 // U+AFFFF
1343 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1344 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1345 "\xf2\xaf\xbf\xbf"));
1347 // U+BFFFE
1348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1349 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1350 "\xf2\xbf\xbf\xbe"));
1352 // U+BFFFF
1353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1354 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1355 "\xf2\xbf\xbf\xbf"));
1357 // U+CFFFE
1358 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1359 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1360 "\xf3\x8f\xbf\xbe"));
1362 // U+CFFFF
1363 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1364 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1365 "\xf3\x8f\xbf\xbf"));
1367 // U+DFFFE
1368 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1369 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1370 "\xf3\x9f\xbf\xbe"));
1372 // U+DFFFF
1373 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1374 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1375 "\xf3\x9f\xbf\xbf"));
1377 // U+EFFFE
1378 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1379 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1380 "\xf3\xaf\xbf\xbe"));
1382 // U+EFFFF
1383 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1384 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1385 "\xf3\xaf\xbf\xbf"));
1387 // U+FFFFE
1388 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1389 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1390 "\xf3\xbf\xbf\xbe"));
1392 // U+FFFFF
1393 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1394 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1395 "\xf3\xbf\xbf\xbf"));
1397 // U+10FFFE
1398 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1399 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1400 "\xf4\x8f\xbf\xbe"));
1402 // U+10FFFF
1403 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1404 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1405 "\xf4\x8f\xbf\xbf"));
1407 // U+FDD0
1408 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1409 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1410 "\xef\xb7\x90"));
1412 // U+FDD1
1413 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1414 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1415 "\xef\xb7\x91"));
1417 // U+FDD2
1418 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1419 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1420 "\xef\xb7\x92"));
1422 // U+FDD3
1423 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1424 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1425 "\xef\xb7\x93"));
1427 // U+FDD4
1428 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1429 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1430 "\xef\xb7\x94"));
1432 // U+FDD5
1433 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1434 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1435 "\xef\xb7\x95"));
1437 // U+FDD6
1438 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1439 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1440 "\xef\xb7\x96"));
1442 // U+FDD7
1443 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1444 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1445 "\xef\xb7\x97"));
1447 // U+FDD8
1448 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1449 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1450 "\xef\xb7\x98"));
1452 // U+FDD9
1453 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1454 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1455 "\xef\xb7\x99"));
1457 // U+FDDA
1458 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1459 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1460 "\xef\xb7\x9a"));
1462 // U+FDDB
1463 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1464 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1465 "\xef\xb7\x9b"));
1467 // U+FDDC
1468 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1469 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1470 "\xef\xb7\x9c"));
1472 // U+FDDD
1473 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1474 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1475 "\xef\xb7\x9d"));
1477 // U+FDDE
1478 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1479 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1480 "\xef\xb7\x9e"));
1482 // U+FDDF
1483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1484 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1485 "\xef\xb7\x9f"));
1487 // U+FDE0
1488 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1489 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1490 "\xef\xb7\xa0"));
1492 // U+FDE1
1493 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1494 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1495 "\xef\xb7\xa1"));
1497 // U+FDE2
1498 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1499 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1500 "\xef\xb7\xa2"));
1502 // U+FDE3
1503 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1504 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1505 "\xef\xb7\xa3"));
1507 // U+FDE4
1508 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1509 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1510 "\xef\xb7\xa4"));
1512 // U+FDE5
1513 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1514 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1515 "\xef\xb7\xa5"));
1517 // U+FDE6
1518 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1519 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1520 "\xef\xb7\xa6"));
1522 // U+FDE7
1523 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1524 ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1525 "\xef\xb7\xa7"));
1527 // U+FDE8
1528 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1529 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1530 "\xef\xb7\xa8"));
1532 // U+FDE9
1533 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1534 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1535 "\xef\xb7\xa9"));
1537 // U+FDEA
1538 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1539 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1540 "\xef\xb7\xaa"));
1542 // U+FDEB
1543 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1544 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1545 "\xef\xb7\xab"));
1547 // U+FDEC
1548 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1549 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1550 "\xef\xb7\xac"));
1552 // U+FDED
1553 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1554 ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1555 "\xef\xb7\xad"));
1557 // U+FDEE
1558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1559 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1560 "\xef\xb7\xae"));
1562 // U+FDEF
1563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1564 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1565 "\xef\xb7\xaf"));
1567 // U+FDF0
1568 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1569 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1570 "\xef\xb7\xb0"));
1572 // U+FDF1
1573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1574 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1575 "\xef\xb7\xb1"));
1577 // U+FDF2
1578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1579 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1580 "\xef\xb7\xb2"));
1582 // U+FDF3
1583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1584 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1585 "\xef\xb7\xb3"));
1587 // U+FDF4
1588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1589 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1590 "\xef\xb7\xb4"));
1592 // U+FDF5
1593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1594 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1595 "\xef\xb7\xb5"));
1597 // U+FDF6
1598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1599 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1600 "\xef\xb7\xb6"));
1602 // U+FDF7
1603 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1604 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1605 "\xef\xb7\xb7"));
1607 // U+FDF8
1608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1609 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1610 "\xef\xb7\xb8"));
1612 // U+FDF9
1613 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1614 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1615 "\xef\xb7\xb9"));
1617 // U+FDFA
1618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1619 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1620 "\xef\xb7\xba"));
1622 // U+FDFB
1623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1624 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1625 "\xef\xb7\xbb"));
1627 // U+FDFC
1628 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1629 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1630 "\xef\xb7\xbc"));
1632 // U+FDFD
1633 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1634 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1635 "\xef\xb7\xbd"));
1637 // U+FDFE
1638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1639 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1640 "\xef\xb7\xbe"));
1642 // U+FDFF
1643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1644 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1645 "\xef\xb7\xbf"));
1648 TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1649 // U+0041 LATIN CAPITAL LETTER A
1650 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1651 ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1652 "\x41", true));
1655 // Sequences with one continuation byte missing
1658 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1659 ConvertUTFResultContainer(sourceExhausted),
1660 "\xc2", true));
1661 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1662 ConvertUTFResultContainer(sourceExhausted),
1663 "\xdf", true));
1664 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1665 ConvertUTFResultContainer(sourceExhausted),
1666 "\xe0\xa0", true));
1667 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1668 ConvertUTFResultContainer(sourceExhausted),
1669 "\xe0\xbf", true));
1670 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1671 ConvertUTFResultContainer(sourceExhausted),
1672 "\xe1\x80", true));
1673 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1674 ConvertUTFResultContainer(sourceExhausted),
1675 "\xec\xbf", true));
1676 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1677 ConvertUTFResultContainer(sourceExhausted),
1678 "\xed\x80", true));
1679 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1680 ConvertUTFResultContainer(sourceExhausted),
1681 "\xed\x9f", true));
1682 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1683 ConvertUTFResultContainer(sourceExhausted),
1684 "\xee\x80", true));
1685 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1686 ConvertUTFResultContainer(sourceExhausted),
1687 "\xef\xbf", true));
1688 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1689 ConvertUTFResultContainer(sourceExhausted),
1690 "\xf0\x90\x80", true));
1691 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1692 ConvertUTFResultContainer(sourceExhausted),
1693 "\xf0\xbf\xbf", true));
1694 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1695 ConvertUTFResultContainer(sourceExhausted),
1696 "\xf1\x80\x80", true));
1697 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1698 ConvertUTFResultContainer(sourceExhausted),
1699 "\xf3\xbf\xbf", true));
1700 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1701 ConvertUTFResultContainer(sourceExhausted),
1702 "\xf4\x80\x80", true));
1703 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1704 ConvertUTFResultContainer(sourceExhausted),
1705 "\xf4\x8f\xbf", true));
1707 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1708 ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1709 "\x41\xc2", true));