// Copyright (C) 2020-2025 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3.  If not see
// <http://www.gnu.org/licenses/>.
#include <algorithm>
#include <cstddef>
#include <cwchar>
#include <iterator>
#include <locale>
#include <string>
#include <testsuite_hooks.h>
// One test case for a conversion that is expected to succeed completely.
// in_size is the number of input code units handed to the facet and
// out_size is the number of output code units made available to it; the
// tests expect both ranges to be consumed/filled entirely.
struct test_offsets_ok
{
  size_t in_size;
  size_t out_size;
};
// One test case for a conversion that is expected to stop early with
// codecvt_base::partial.  in_size/out_size are the sizes of the ranges
// handed to the facet; expected_in_next/expected_out_next are the offsets
// (from the start of each range) at which the facet is expected to stop.
struct test_offsets_partial
{
  size_t in_size;
  size_t out_size;
  size_t expected_in_next;
  size_t expected_out_next;
};
// One test case for a conversion that is expected to fail with
// codecvt_base::error.  Before the conversion runs, the input code unit at
// index replace_pos is overwritten with replace_char to corrupt the
// sequence.  in_size/out_size are the sizes of the ranges handed to the
// facet; expected_in_next/expected_out_next are the offsets at which the
// facet is expected to stop.
//
// The member order matters: the tests aggregate-initialise this type as
// {in_size, out_size, expected_in_next, expected_out_next, replace_char,
//  replace_pos}.
template <class CharT> struct test_offsets_error
{
  size_t in_size;
  size_t out_size;
  size_t expected_in_next;
  size_t expected_out_next;
  CharT replace_char;
  size_t replace_pos;
};
// Returns the number of elements of a built-in array.  Usable in constant
// expressions (the tests use it in static_assert and as an array bound).
// For a string literal the count includes the terminating null.
template <class T, size_t N>
auto constexpr array_size (const T (&)[N]) -> size_t
{
  return N;
}
44 template <class InternT
, class ExternT
>
46 utf8_to_utf32_in_ok (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
49 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
50 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
51 const char32_t expected
[] = U
"b\u0448\uAAAA\U0010AAAA";
52 static_assert (array_size (input
) == 11, "");
53 static_assert (array_size (expected
) == 5, "");
55 ExternT in
[array_size (input
)];
56 InternT exp
[array_size (expected
)];
57 copy (begin (input
), end (input
), begin (in
));
58 copy (begin (expected
), end (expected
), begin (exp
));
59 VERIFY (char_traits
<ExternT
>::length (in
) == 10);
60 VERIFY (char_traits
<InternT
>::length (exp
) == 4);
62 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}};
63 for (auto t
: offsets
)
65 InternT out
[array_size (exp
) - 1] = {};
66 VERIFY (t
.in_size
<= array_size (in
));
67 VERIFY (t
.out_size
<= array_size (out
));
68 auto state
= mbstate_t{};
69 auto in_next
= (const ExternT
*) nullptr;
70 auto out_next
= (InternT
*) nullptr;
71 auto res
= codecvt_base::result ();
73 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
75 VERIFY (res
== cvt
.ok
);
76 VERIFY (in_next
== in
+ t
.in_size
);
77 VERIFY (out_next
== out
+ t
.out_size
);
78 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.out_size
) == 0);
79 if (t
.out_size
< array_size (out
))
80 VERIFY (out
[t
.out_size
] == 0);
83 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
85 VERIFY (static_cast<size_t> (len
) == t
.in_size
);
88 for (auto t
: offsets
)
90 InternT out
[array_size (exp
)] = {};
91 VERIFY (t
.in_size
<= array_size (in
));
92 VERIFY (t
.out_size
<= array_size (out
));
93 auto state
= mbstate_t{};
94 auto in_next
= (const ExternT
*) nullptr;
95 auto out_next
= (InternT
*) nullptr;
96 auto res
= codecvt_base::result ();
99 = cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, end (out
), out_next
);
100 VERIFY (res
== cvt
.ok
);
101 VERIFY (in_next
== in
+ t
.in_size
);
102 VERIFY (out_next
== out
+ t
.out_size
);
103 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.out_size
) == 0);
104 if (t
.out_size
< array_size (out
))
105 VERIFY (out
[t
.out_size
] == 0);
108 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, array_size (out
));
110 VERIFY (static_cast<size_t> (len
) == t
.in_size
);
114 template <class InternT
, class ExternT
>
116 utf8_to_utf32_in_partial (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
119 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
120 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
121 const char32_t expected
[] = U
"b\u0448\uAAAA\U0010AAAA";
122 static_assert (array_size (input
) == 11, "");
123 static_assert (array_size (expected
) == 5, "");
125 ExternT in
[array_size (input
)];
126 InternT exp
[array_size (expected
)];
127 copy (begin (input
), end (input
), begin (in
));
128 copy (begin (expected
), end (expected
), begin (exp
));
129 VERIFY (char_traits
<ExternT
>::length (in
) == 10);
130 VERIFY (char_traits
<InternT
>::length (exp
) == 4);
132 test_offsets_partial offsets
[] = {
133 {1, 0, 0, 0}, // no space for first CP
135 {3, 1, 1, 1}, // no space for second CP
136 {2, 2, 1, 1}, // incomplete second CP
137 {2, 1, 1, 1}, // incomplete second CP, and no space for it
139 {6, 2, 3, 2}, // no space for third CP
140 {4, 3, 3, 2}, // incomplete third CP
141 {5, 3, 3, 2}, // incomplete third CP
142 {4, 2, 3, 2}, // incomplete third CP, and no space for it
143 {5, 2, 3, 2}, // incomplete third CP, and no space for it
145 {10, 3, 6, 3}, // no space for fourth CP
146 {7, 4, 6, 3}, // incomplete fourth CP
147 {8, 4, 6, 3}, // incomplete fourth CP
148 {9, 4, 6, 3}, // incomplete fourth CP
149 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
150 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
151 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
154 for (auto t
: offsets
)
156 InternT out
[array_size (exp
) - 1] = {};
157 VERIFY (t
.in_size
<= array_size (in
));
158 VERIFY (t
.out_size
<= array_size (out
));
159 VERIFY (t
.expected_in_next
<= t
.in_size
);
160 VERIFY (t
.expected_out_next
<= t
.out_size
);
161 auto state
= mbstate_t{};
162 auto in_next
= (const ExternT
*) nullptr;
163 auto out_next
= (InternT
*) nullptr;
164 auto res
= codecvt_base::result ();
166 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
168 VERIFY (res
== cvt
.partial
);
169 VERIFY (in_next
== in
+ t
.expected_in_next
);
170 VERIFY (out_next
== out
+ t
.expected_out_next
);
171 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.expected_out_next
)
173 if (t
.expected_out_next
< array_size (out
))
174 VERIFY (out
[t
.expected_out_next
] == 0);
177 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
179 VERIFY (static_cast<size_t> (len
) == t
.expected_in_next
);
183 template <class InternT
, class ExternT
>
185 utf8_to_utf32_in_error (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
188 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
189 const unsigned char input
[] = "b\u0448\uD700\U0010AAAA";
190 const char32_t expected
[] = U
"b\u0448\uD700\U0010AAAA";
191 static_assert (array_size (input
) == 11, "");
192 static_assert (array_size (expected
) == 5, "");
194 ExternT in
[array_size (input
)];
195 InternT exp
[array_size (expected
)];
196 copy (begin (input
), end (input
), begin (in
));
197 copy (begin (expected
), end (expected
), begin (exp
));
198 VERIFY (char_traits
<ExternT
>::length (in
) == 10);
199 VERIFY (char_traits
<InternT
>::length (exp
) == 4);
201 // There are 5 classes of errors in UTF-8 decoding
202 // 1. Missing leading byte
203 // 2. Missing trailing byte
205 // 4. Overlong sequence
206 // 5. CP out of Unicode range
207 test_offsets_error
<unsigned char> offsets
[] = {
209 // 1. Missing leading byte. We will replace the leading byte with
210 // non-leading byte, such as a byte that is always invalid or a trailing
213 // replace leading byte with invalid byte
214 {1, 4, 0, 0, 0xFF, 0},
215 {3, 4, 1, 1, 0xFF, 1},
216 {6, 4, 3, 2, 0xFF, 3},
217 {10, 4, 6, 3, 0xFF, 6},
219 // replace leading byte with trailing byte
220 {1, 4, 0, 0, 0b10101010, 0},
221 {3, 4, 1, 1, 0b10101010, 1},
222 {6, 4, 3, 2, 0b10101010, 3},
223 {10, 4, 6, 3, 0b10101010, 6},
225 // 2. Missing trailing byte. We will replace the trailing byte with
226 // non-trailing byte, such as a byte that is always invalid or a leading
227 // byte (simple ASCII byte in our case).
229 // replace first trailing byte with ASCII byte
230 {3, 4, 1, 1, 'z', 2},
231 {6, 4, 3, 2, 'z', 4},
232 {10, 4, 6, 3, 'z', 7},
234 // replace first trailing byte with invalid byte
235 {3, 4, 1, 1, 0xFF, 2},
236 {6, 4, 3, 2, 0xFF, 4},
237 {10, 4, 6, 3, 0xFF, 7},
239 // replace second trailing byte with ASCII byte
240 {6, 4, 3, 2, 'z', 5},
241 {10, 4, 6, 3, 'z', 8},
243 // replace second trailing byte with invalid byte
244 {6, 4, 3, 2, 0xFF, 5},
245 {10, 4, 6, 3, 0xFF, 8},
247 // replace third trailing byte
248 {10, 4, 6, 3, 'z', 9},
249 {10, 4, 6, 3, 0xFF, 9},
251 // 2.1 The following test-cases raise doubt whether error or partial should
252 // be returned. For example, we have 4-byte sequence with valid leading
253 // byte. If we hide the last byte we need to return partial. But, if the
254 // second or third byte, which are visible to the call to codecvt, are
255 // malformed then error should be returned.
257 // replace first trailing byte with ASCII byte, also incomplete at end
258 {5, 4, 3, 2, 'z', 4},
259 {8, 4, 6, 3, 'z', 7},
260 {9, 4, 6, 3, 'z', 7},
262 // replace first trailing byte with invalid byte, also incomplete at end
263 {5, 4, 3, 2, 0xFF, 4},
264 {8, 4, 6, 3, 0xFF, 7},
265 {9, 4, 6, 3, 0xFF, 7},
267 // replace second trailing byte with ASCII byte, also incomplete at end
268 {9, 4, 6, 3, 'z', 8},
270 // replace second trailing byte with invalid byte, also incomplete at end
271 {9, 4, 6, 3, 0xFF, 8},
273 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
275 {6, 4, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
276 {6, 4, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
277 {6, 4, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
278 {6, 4, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
280 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
281 // just the leading byte is enough to make them overlong, i.e. for the
282 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
284 {3, 4, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
285 {3, 4, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
286 {6, 4, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
287 {10, 4, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
290 // turn U+10AAAA into U+14AAAA by changing its leading byte
291 {10, 4, 6, 3, 0b11110101, 6},
292 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
293 {10, 4, 6, 3, 0b10011010, 7},
295 for (auto t
: offsets
)
297 InternT out
[array_size (exp
) - 1] = {};
298 VERIFY (t
.in_size
<= array_size (in
));
299 VERIFY (t
.out_size
<= array_size (out
));
300 VERIFY (t
.expected_in_next
<= t
.in_size
);
301 VERIFY (t
.expected_out_next
<= t
.out_size
);
302 auto old_char
= in
[t
.replace_pos
];
303 in
[t
.replace_pos
] = t
.replace_char
;
305 auto state
= mbstate_t{};
306 auto in_next
= (const ExternT
*) nullptr;
307 auto out_next
= (InternT
*) nullptr;
308 auto res
= codecvt_base::result ();
310 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
312 VERIFY (res
== cvt
.error
);
313 VERIFY (in_next
== in
+ t
.expected_in_next
);
314 VERIFY (out_next
== out
+ t
.expected_out_next
);
315 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.expected_out_next
)
317 if (t
.expected_out_next
< array_size (out
))
318 VERIFY (out
[t
.expected_out_next
] == 0);
321 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
323 VERIFY (static_cast<size_t> (len
) == t
.expected_in_next
);
325 in
[t
.replace_pos
] = old_char
;
// Runs all UTF-8 to UTF-32 decoding tests (ok, partial and error cases)
// against the given facet.
template <class InternT, class ExternT>
void
utf8_to_utf32_in (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
{
  utf8_to_utf32_in_ok (cvt);
  utf8_to_utf32_in_partial (cvt);
  utf8_to_utf32_in_error (cvt);
}
338 template <class InternT
, class ExternT
>
340 utf32_to_utf8_out_ok (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
343 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
344 const char32_t input
[] = U
"b\u0448\uAAAA\U0010AAAA";
345 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
346 static_assert (array_size (input
) == 5, "");
347 static_assert (array_size (expected
) == 11, "");
349 InternT in
[array_size (input
)];
350 ExternT exp
[array_size (expected
)];
351 copy (begin (input
), end (input
), begin (in
));
352 copy (begin (expected
), end (expected
), begin (exp
));
353 VERIFY (char_traits
<InternT
>::length (in
) == 4);
354 VERIFY (char_traits
<ExternT
>::length (exp
) == 10);
356 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}};
357 for (auto t
: offsets
)
359 ExternT out
[array_size (exp
) - 1] = {};
360 VERIFY (t
.in_size
<= array_size (in
));
361 VERIFY (t
.out_size
<= array_size (out
));
362 auto state
= mbstate_t{};
363 auto in_next
= (const InternT
*) nullptr;
364 auto out_next
= (ExternT
*) nullptr;
365 auto res
= codecvt_base::result ();
367 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
369 VERIFY (res
== cvt
.ok
);
370 VERIFY (in_next
== in
+ t
.in_size
);
371 VERIFY (out_next
== out
+ t
.out_size
);
372 VERIFY (char_traits
<ExternT
>::compare (out
, exp
, t
.out_size
) == 0);
373 if (t
.out_size
< array_size (out
))
374 VERIFY (out
[t
.out_size
] == 0);
378 template <class InternT
, class ExternT
>
380 utf32_to_utf8_out_partial (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
383 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
384 const char32_t input
[] = U
"b\u0448\uAAAA\U0010AAAA";
385 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
386 static_assert (array_size (input
) == 5, "");
387 static_assert (array_size (expected
) == 11, "");
389 InternT in
[array_size (input
)];
390 ExternT exp
[array_size (expected
)];
391 copy (begin (input
), end (input
), begin (in
));
392 copy (begin (expected
), end (expected
), begin (exp
));
393 VERIFY (char_traits
<InternT
>::length (in
) == 4);
394 VERIFY (char_traits
<ExternT
>::length (exp
) == 10);
396 test_offsets_partial offsets
[] = {
397 {1, 0, 0, 0}, // no space for first CP
399 {2, 1, 1, 1}, // no space for second CP
400 {2, 2, 1, 1}, // no space for second CP
402 {3, 3, 2, 3}, // no space for third CP
403 {3, 4, 2, 3}, // no space for third CP
404 {3, 5, 2, 3}, // no space for third CP
406 {4, 6, 3, 6}, // no space for fourth CP
407 {4, 7, 3, 6}, // no space for fourth CP
408 {4, 8, 3, 6}, // no space for fourth CP
409 {4, 9, 3, 6}, // no space for fourth CP
411 for (auto t
: offsets
)
413 ExternT out
[array_size (exp
) - 1] = {};
414 VERIFY (t
.in_size
<= array_size (in
));
415 VERIFY (t
.out_size
<= array_size (out
));
416 VERIFY (t
.expected_in_next
<= t
.in_size
);
417 VERIFY (t
.expected_out_next
<= t
.out_size
);
418 auto state
= mbstate_t{};
419 auto in_next
= (const InternT
*) nullptr;
420 auto out_next
= (ExternT
*) nullptr;
421 auto res
= codecvt_base::result ();
423 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
425 VERIFY (res
== cvt
.partial
);
426 VERIFY (in_next
== in
+ t
.expected_in_next
);
427 VERIFY (out_next
== out
+ t
.expected_out_next
);
428 VERIFY (char_traits
<ExternT
>::compare (out
, exp
, t
.expected_out_next
)
430 if (t
.expected_out_next
< array_size (out
))
431 VERIFY (out
[t
.expected_out_next
] == 0);
435 template <class InternT
, class ExternT
>
437 utf32_to_utf8_out_error (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
440 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
441 const char32_t input
[] = U
"b\u0448\uAAAA\U0010AAAA";
442 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
443 static_assert (array_size (input
) == 5, "");
444 static_assert (array_size (expected
) == 11, "");
446 InternT in
[array_size (input
)];
447 ExternT exp
[array_size (expected
)];
448 copy (begin (input
), end (input
), begin (in
));
449 copy (begin (expected
), end (expected
), begin (exp
));
450 VERIFY (char_traits
<InternT
>::length (in
) == 4);
451 VERIFY (char_traits
<ExternT
>::length (exp
) == 10);
453 test_offsets_error
<InternT
> offsets
[] = {
456 {4, 10, 0, 0, 0xD800, 0},
457 {4, 10, 1, 1, 0xDBFF, 1},
458 {4, 10, 2, 3, 0xDC00, 2},
459 {4, 10, 3, 6, 0xDFFF, 3},
462 {4, 10, 0, 0, 0x00110000, 0},
463 {4, 10, 1, 1, 0x00110000, 1},
464 {4, 10, 2, 3, 0x00110000, 2},
465 {4, 10, 3, 6, 0x00110000, 3}};
467 for (auto t
: offsets
)
469 ExternT out
[array_size (exp
) - 1] = {};
470 VERIFY (t
.in_size
<= array_size (in
));
471 VERIFY (t
.out_size
<= array_size (out
));
472 VERIFY (t
.expected_in_next
<= t
.in_size
);
473 VERIFY (t
.expected_out_next
<= t
.out_size
);
474 auto old_char
= in
[t
.replace_pos
];
475 in
[t
.replace_pos
] = t
.replace_char
;
477 auto state
= mbstate_t{};
478 auto in_next
= (const InternT
*) nullptr;
479 auto out_next
= (ExternT
*) nullptr;
480 auto res
= codecvt_base::result ();
482 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
484 VERIFY (res
== cvt
.error
);
485 VERIFY (in_next
== in
+ t
.expected_in_next
);
486 VERIFY (out_next
== out
+ t
.expected_out_next
);
487 VERIFY (char_traits
<ExternT
>::compare (out
, exp
, t
.expected_out_next
)
489 if (t
.expected_out_next
< array_size (out
))
490 VERIFY (out
[t
.expected_out_next
] == 0);
492 in
[t
.replace_pos
] = old_char
;
// Runs all UTF-32 to UTF-8 encoding tests (ok, partial and error cases)
// against the given facet.
template <class InternT, class ExternT>
void
utf32_to_utf8_out (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
{
  utf32_to_utf8_out_ok (cvt);
  utf32_to_utf8_out_partial (cvt);
  utf32_to_utf8_out_error (cvt);
}
// Entry point for testing a UTF-8 <-> UTF-32 facet: runs the decoding
// (in) tests followed by the encoding (out) tests.
template <class InternT, class ExternT>
void
test_utf8_utf32_cvt (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
{
  utf8_to_utf32_in (cvt);
  utf32_to_utf8_out (cvt);
}
513 template <class InternT
, class ExternT
>
515 utf8_to_utf16_in_ok (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
518 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
519 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
520 const char16_t expected
[] = u
"b\u0448\uAAAA\U0010AAAA";
521 static_assert (array_size (input
) == 11, "");
522 static_assert (array_size (expected
) == 6, "");
524 ExternT in
[array_size (input
)];
525 InternT exp
[array_size (expected
)];
526 copy (begin (input
), end (input
), begin (in
));
527 copy (begin (expected
), end (expected
), begin (exp
));
528 VERIFY (char_traits
<ExternT
>::length (in
) == 10);
529 VERIFY (char_traits
<InternT
>::length (exp
) == 5);
531 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 5}};
532 for (auto t
: offsets
)
534 InternT out
[array_size (exp
) - 1] = {};
535 VERIFY (t
.in_size
<= array_size (in
));
536 VERIFY (t
.out_size
<= array_size (out
));
537 auto state
= mbstate_t{};
538 auto in_next
= (const ExternT
*) nullptr;
539 auto out_next
= (InternT
*) nullptr;
540 auto res
= codecvt_base::result ();
542 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
544 VERIFY (res
== cvt
.ok
);
545 VERIFY (in_next
== in
+ t
.in_size
);
546 VERIFY (out_next
== out
+ t
.out_size
);
547 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.out_size
) == 0);
548 if (t
.out_size
< array_size (out
))
549 VERIFY (out
[t
.out_size
] == 0);
552 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
554 VERIFY (static_cast<size_t> (len
) == t
.in_size
);
557 for (auto t
: offsets
)
559 InternT out
[array_size (exp
)] = {};
560 VERIFY (t
.in_size
<= array_size (in
));
561 VERIFY (t
.out_size
<= array_size (out
));
562 auto state
= mbstate_t{};
563 auto in_next
= (const ExternT
*) nullptr;
564 auto out_next
= (InternT
*) nullptr;
565 auto res
= codecvt_base::result ();
568 = cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, end (out
), out_next
);
569 VERIFY (res
== cvt
.ok
);
570 VERIFY (in_next
== in
+ t
.in_size
);
571 VERIFY (out_next
== out
+ t
.out_size
);
572 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.out_size
) == 0);
573 if (t
.out_size
< array_size (out
))
574 VERIFY (out
[t
.out_size
] == 0);
577 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, array_size (out
));
579 VERIFY (static_cast<size_t> (len
) == t
.in_size
);
583 template <class InternT
, class ExternT
>
585 utf8_to_utf16_in_partial (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
588 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
589 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
590 const char16_t expected
[] = u
"b\u0448\uAAAA\U0010AAAA";
591 static_assert (array_size (input
) == 11, "");
592 static_assert (array_size (expected
) == 6, "");
594 ExternT in
[array_size (input
)];
595 InternT exp
[array_size (expected
)];
596 copy (begin (input
), end (input
), begin (in
));
597 copy (begin (expected
), end (expected
), begin (exp
));
598 VERIFY (char_traits
<ExternT
>::length (in
) == 10);
599 VERIFY (char_traits
<InternT
>::length (exp
) == 5);
601 test_offsets_partial offsets
[] = {
602 {1, 0, 0, 0}, // no space for first CP
604 {3, 1, 1, 1}, // no space for second CP
605 {2, 2, 1, 1}, // incomplete second CP
606 {2, 1, 1, 1}, // incomplete second CP, and no space for it
608 {6, 2, 3, 2}, // no space for third CP
609 {4, 3, 3, 2}, // incomplete third CP
610 {5, 3, 3, 2}, // incomplete third CP
611 {4, 2, 3, 2}, // incomplete third CP, and no space for it
612 {5, 2, 3, 2}, // incomplete third CP, and no space for it
614 {10, 3, 6, 3}, // no space for fourth CP
615 {10, 4, 6, 3}, // no space for fourth CP
616 {7, 5, 6, 3}, // incomplete fourth CP
617 {8, 5, 6, 3}, // incomplete fourth CP
618 {9, 5, 6, 3}, // incomplete fourth CP
619 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
620 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
621 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
622 {7, 4, 6, 3}, // incomplete fourth CP, and no space for it
623 {8, 4, 6, 3}, // incomplete fourth CP, and no space for it
624 {9, 4, 6, 3}, // incomplete fourth CP, and no space for it
628 for (auto t
: offsets
)
630 InternT out
[array_size (exp
) - 1] = {};
631 VERIFY (t
.in_size
<= array_size (in
));
632 VERIFY (t
.out_size
<= array_size (out
));
633 VERIFY (t
.expected_in_next
<= t
.in_size
);
634 VERIFY (t
.expected_out_next
<= t
.out_size
);
635 auto state
= mbstate_t{};
636 auto in_next
= (const ExternT
*) nullptr;
637 auto out_next
= (InternT
*) nullptr;
638 auto res
= codecvt_base::result ();
640 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
642 VERIFY (res
== cvt
.partial
);
643 VERIFY (in_next
== in
+ t
.expected_in_next
);
644 VERIFY (out_next
== out
+ t
.expected_out_next
);
645 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.expected_out_next
)
647 if (t
.expected_out_next
< array_size (out
))
648 VERIFY (out
[t
.expected_out_next
] == 0);
651 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
653 VERIFY (static_cast<size_t> (len
) == t
.expected_in_next
);
657 template <class InternT
, class ExternT
>
659 utf8_to_utf16_in_error (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
662 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
663 const unsigned char input
[] = "b\u0448\uD700\U0010AAAA";
664 const char16_t expected
[] = u
"b\u0448\uD700\U0010AAAA";
665 static_assert (array_size (input
) == 11, "");
666 static_assert (array_size (expected
) == 6, "");
668 ExternT in
[array_size (input
)];
669 InternT exp
[array_size (expected
)];
670 copy (begin (input
), end (input
), begin (in
));
671 copy (begin (expected
), end (expected
), begin (exp
));
672 VERIFY (char_traits
<ExternT
>::length (in
) == 10);
673 VERIFY (char_traits
<InternT
>::length (exp
) == 5);
675 // There are 5 classes of errors in UTF-8 decoding
676 // 1. Missing leading byte
677 // 2. Missing trailing byte
679 // 4. Overlong sequence
680 // 5. CP out of Unicode range
681 test_offsets_error
<unsigned char> offsets
[] = {
683 // 1. Missing leading byte. We will replace the leading byte with
684 // non-leading byte, such as a byte that is always invalid or a trailing
687 // replace leading byte with invalid byte
688 {1, 5, 0, 0, 0xFF, 0},
689 {3, 5, 1, 1, 0xFF, 1},
690 {6, 5, 3, 2, 0xFF, 3},
691 {10, 5, 6, 3, 0xFF, 6},
693 // replace leading byte with trailing byte
694 {1, 5, 0, 0, 0b10101010, 0},
695 {3, 5, 1, 1, 0b10101010, 1},
696 {6, 5, 3, 2, 0b10101010, 3},
697 {10, 5, 6, 3, 0b10101010, 6},
699 // 2. Missing trailing byte. We will replace the trailing byte with
700 // non-trailing byte, such as a byte that is always invalid or a leading
701 // byte (simple ASCII byte in our case).
703 // replace first trailing byte with ASCII byte
704 {3, 5, 1, 1, 'z', 2},
705 {6, 5, 3, 2, 'z', 4},
706 {10, 5, 6, 3, 'z', 7},
708 // replace first trailing byte with invalid byte
709 {3, 5, 1, 1, 0xFF, 2},
710 {6, 5, 3, 2, 0xFF, 4},
711 {10, 5, 6, 3, 0xFF, 7},
713 // replace second trailing byte with ASCII byte
714 {6, 5, 3, 2, 'z', 5},
715 {10, 5, 6, 3, 'z', 8},
717 // replace second trailing byte with invalid byte
718 {6, 5, 3, 2, 0xFF, 5},
719 {10, 5, 6, 3, 0xFF, 8},
721 // replace third trailing byte
722 {10, 5, 6, 3, 'z', 9},
723 {10, 5, 6, 3, 0xFF, 9},
725 // 2.1 The following test-cases raise doubt whether error or partial should
726 // be returned. For example, we have 4-byte sequence with valid leading
727 // byte. If we hide the last byte we need to return partial. But, if the
728 // second or third byte, which are visible to the call to codecvt, are
729 // malformed then error should be returned.
731 // replace first trailing byte with ASCII byte, also incomplete at end
732 {5, 5, 3, 2, 'z', 4},
733 {8, 5, 6, 3, 'z', 7},
734 {9, 5, 6, 3, 'z', 7},
736 // replace first trailing byte with invalid byte, also incomplete at end
737 {5, 5, 3, 2, 0xFF, 4},
738 {8, 5, 6, 3, 0xFF, 7},
739 {9, 5, 6, 3, 0xFF, 7},
741 // replace second trailing byte with ASCII byte, also incomplete at end
742 {9, 5, 6, 3, 'z', 8},
744 // replace second trailing byte with invalid byte, also incomplete at end
745 {9, 5, 6, 3, 0xFF, 8},
747 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
749 {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
750 {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
751 {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
752 {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
754 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
755 // just the leading byte is enough to make them overlong, i.e. for the
756 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
758 {3, 5, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
759 {3, 5, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
760 {6, 5, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
761 {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
764 // turn U+10AAAA into U+14AAAA by changing its leading byte
765 {10, 5, 6, 3, 0b11110101, 6},
766 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
767 {10, 5, 6, 3, 0b10011010, 7},
769 for (auto t
: offsets
)
771 InternT out
[array_size (exp
) - 1] = {};
772 VERIFY (t
.in_size
<= array_size (in
));
773 VERIFY (t
.out_size
<= array_size (out
));
774 VERIFY (t
.expected_in_next
<= t
.in_size
);
775 VERIFY (t
.expected_out_next
<= t
.out_size
);
776 auto old_char
= in
[t
.replace_pos
];
777 in
[t
.replace_pos
] = t
.replace_char
;
779 auto state
= mbstate_t{};
780 auto in_next
= (const ExternT
*) nullptr;
781 auto out_next
= (InternT
*) nullptr;
782 auto res
= codecvt_base::result ();
784 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
786 VERIFY (res
== cvt
.error
);
787 VERIFY (in_next
== in
+ t
.expected_in_next
);
788 VERIFY (out_next
== out
+ t
.expected_out_next
);
789 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.expected_out_next
)
791 if (t
.expected_out_next
< array_size (out
))
792 VERIFY (out
[t
.expected_out_next
] == 0);
795 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
797 VERIFY (static_cast<size_t> (len
) == t
.expected_in_next
);
799 in
[t
.replace_pos
] = old_char
;
// Runs all UTF-8 to UTF-16 decoding tests (ok, partial and error cases)
// against the given facet.
template <class InternT, class ExternT>
void
utf8_to_utf16_in (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
{
  utf8_to_utf16_in_ok (cvt);
  utf8_to_utf16_in_partial (cvt);
  utf8_to_utf16_in_error (cvt);
}
812 template <class InternT
, class ExternT
>
814 utf16_to_utf8_out_ok (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
817 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
818 const char16_t input
[] = u
"b\u0448\uAAAA\U0010AAAA";
819 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
820 static_assert (array_size (input
) == 6, "");
821 static_assert (array_size (expected
) == 11, "");
823 InternT in
[array_size (input
)];
824 ExternT exp
[array_size (expected
)];
825 copy (begin (input
), end (input
), begin (in
));
826 copy (begin (expected
), end (expected
), begin (exp
));
827 VERIFY (char_traits
<InternT
>::length (in
) == 5);
828 VERIFY (char_traits
<ExternT
>::length (exp
) == 10);
830 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}};
831 for (auto t
: offsets
)
833 ExternT out
[array_size (exp
) - 1] = {};
834 VERIFY (t
.in_size
<= array_size (in
));
835 VERIFY (t
.out_size
<= array_size (out
));
836 auto state
= mbstate_t{};
837 auto in_next
= (const InternT
*) nullptr;
838 auto out_next
= (ExternT
*) nullptr;
839 auto res
= codecvt_base::result ();
841 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
843 VERIFY (res
== cvt
.ok
);
844 VERIFY (in_next
== in
+ t
.in_size
);
845 VERIFY (out_next
== out
+ t
.out_size
);
846 VERIFY (char_traits
<ExternT
>::compare (out
, exp
, t
.out_size
) == 0);
847 if (t
.out_size
< array_size (out
))
848 VERIFY (out
[t
.out_size
] == 0);
852 template <class InternT
, class ExternT
>
854 utf16_to_utf8_out_partial (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
857 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
858 const char16_t input
[] = u
"b\u0448\uAAAA\U0010AAAA";
859 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
860 static_assert (array_size (input
) == 6, "");
861 static_assert (array_size (expected
) == 11, "");
863 InternT in
[array_size (input
)];
864 ExternT exp
[array_size (expected
)];
865 copy (begin (input
), end (input
), begin (in
));
866 copy (begin (expected
), end (expected
), begin (exp
));
867 VERIFY (char_traits
<InternT
>::length (in
) == 5);
868 VERIFY (char_traits
<ExternT
>::length (exp
) == 10);
870 test_offsets_partial offsets
[] = {
871 {1, 0, 0, 0}, // no space for first CP
873 {2, 1, 1, 1}, // no space for second CP
874 {2, 2, 1, 1}, // no space for second CP
876 {3, 3, 2, 3}, // no space for third CP
877 {3, 4, 2, 3}, // no space for third CP
878 {3, 5, 2, 3}, // no space for third CP
880 {5, 6, 3, 6}, // no space for fourth CP
881 {5, 7, 3, 6}, // no space for fourth CP
882 {5, 8, 3, 6}, // no space for fourth CP
883 {5, 9, 3, 6}, // no space for fourth CP
885 {4, 10, 3, 6}, // incomplete fourth CP
887 {4, 6, 3, 6}, // incomplete fourth CP, and no space for it
888 {4, 7, 3, 6}, // incomplete fourth CP, and no space for it
889 {4, 8, 3, 6}, // incomplete fourth CP, and no space for it
890 {4, 9, 3, 6}, // incomplete fourth CP, and no space for it
892 for (auto t
: offsets
)
894 ExternT out
[array_size (exp
) - 1] = {};
895 VERIFY (t
.in_size
<= array_size (in
));
896 VERIFY (t
.out_size
<= array_size (out
));
897 VERIFY (t
.expected_in_next
<= t
.in_size
);
898 VERIFY (t
.expected_out_next
<= t
.out_size
);
899 auto state
= mbstate_t{};
900 auto in_next
= (const InternT
*) nullptr;
901 auto out_next
= (ExternT
*) nullptr;
902 auto res
= codecvt_base::result ();
904 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
906 VERIFY (res
== cvt
.partial
);
907 VERIFY (in_next
== in
+ t
.expected_in_next
);
908 VERIFY (out_next
== out
+ t
.expected_out_next
);
909 VERIFY (char_traits
<ExternT
>::compare (out
, exp
, t
.expected_out_next
)
911 if (t
.expected_out_next
< array_size (out
))
912 VERIFY (out
[t
.expected_out_next
] == 0);
916 template <class InternT
, class ExternT
>
918 utf16_to_utf8_out_error (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
921 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
922 const char16_t input
[] = u
"b\u0448\uAAAA\U0010AAAA";
923 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
924 static_assert (array_size (input
) == 6, "");
925 static_assert (array_size (expected
) == 11, "");
927 InternT in
[array_size (input
)];
928 ExternT exp
[array_size (expected
)];
929 copy (begin (input
), end (input
), begin (in
));
930 copy (begin (expected
), end (expected
), begin (exp
));
931 VERIFY (char_traits
<InternT
>::length (in
) == 5);
932 VERIFY (char_traits
<ExternT
>::length (exp
) == 10);
934 // The only possible error in UTF-16 is unpaired surrogate code units.
935 // So we replace valid code points (scalar values) with lone surrogate CU.
936 test_offsets_error
<InternT
> offsets
[] = {
937 {5, 10, 0, 0, 0xD800, 0},
938 {5, 10, 0, 0, 0xDBFF, 0},
939 {5, 10, 0, 0, 0xDC00, 0},
940 {5, 10, 0, 0, 0xDFFF, 0},
942 {5, 10, 1, 1, 0xD800, 1},
943 {5, 10, 1, 1, 0xDBFF, 1},
944 {5, 10, 1, 1, 0xDC00, 1},
945 {5, 10, 1, 1, 0xDFFF, 1},
947 {5, 10, 2, 3, 0xD800, 2},
948 {5, 10, 2, 3, 0xDBFF, 2},
949 {5, 10, 2, 3, 0xDC00, 2},
950 {5, 10, 2, 3, 0xDFFF, 2},
952 // make the leading surrogate a trailing one
953 {5, 10, 3, 6, 0xDC00, 3},
954 {5, 10, 3, 6, 0xDFFF, 3},
956 // make the trailing surrogate a leading one
957 {5, 10, 3, 6, 0xD800, 4},
958 {5, 10, 3, 6, 0xDBFF, 4},
960 // make the trailing surrogate a BMP char
961 {5, 10, 3, 6, u
'z', 4},
964 for (auto t
: offsets
)
966 ExternT out
[array_size (exp
) - 1] = {};
967 VERIFY (t
.in_size
<= array_size (in
));
968 VERIFY (t
.out_size
<= array_size (out
));
969 VERIFY (t
.expected_in_next
<= t
.in_size
);
970 VERIFY (t
.expected_out_next
<= t
.out_size
);
971 auto old_char
= in
[t
.replace_pos
];
972 in
[t
.replace_pos
] = t
.replace_char
;
974 auto state
= mbstate_t{};
975 auto in_next
= (const InternT
*) nullptr;
976 auto out_next
= (ExternT
*) nullptr;
977 auto res
= codecvt_base::result ();
979 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
981 VERIFY (res
== cvt
.error
);
982 VERIFY (in_next
== in
+ t
.expected_in_next
);
983 VERIFY (out_next
== out
+ t
.expected_out_next
);
984 VERIFY (char_traits
<ExternT
>::compare (out
, exp
, t
.expected_out_next
)
986 if (t
.expected_out_next
< array_size (out
))
987 VERIFY (out
[t
.expected_out_next
] == 0);
989 in
[t
.replace_pos
] = old_char
;
// Runs the three UTF-16 -> UTF-8 (codecvt::out) test groups on CVT:
// successful conversions, partial conversions and ill-formed input.
template <class InternT, class ExternT>
void
utf16_to_utf8_out (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
{
  utf16_to_utf8_out_ok (cvt);
  utf16_to_utf8_out_partial (cvt);
  utf16_to_utf8_out_error (cvt);
}
// Entry point for a UTF-8 <-> UTF-16 facet: exercises the decoding
// direction (in) first and the encoding direction (out) second.
template <class InternT, class ExternT>
void
test_utf8_utf16_cvt (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
{
  utf8_to_utf16_in (cvt);
  utf16_to_utf8_out (cvt);
}
1010 template <class InternT
, class ExternT
>
1012 utf8_to_ucs2_in_ok (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
1014 using namespace std
;
1015 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1016 const unsigned char input
[] = "b\u0448\uAAAA";
1017 const char16_t expected
[] = u
"b\u0448\uAAAA";
1018 static_assert (array_size (input
) == 7, "");
1019 static_assert (array_size (expected
) == 4, "");
1021 ExternT in
[array_size (input
)];
1022 InternT exp
[array_size (expected
)];
1023 copy (begin (input
), end (input
), begin (in
));
1024 copy (begin (expected
), end (expected
), begin (exp
));
1025 VERIFY (char_traits
<ExternT
>::length (in
) == 6);
1026 VERIFY (char_traits
<InternT
>::length (exp
) == 3);
1028 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}};
1029 for (auto t
: offsets
)
1031 InternT out
[array_size (exp
) - 1] = {};
1032 VERIFY (t
.in_size
<= array_size (in
));
1033 VERIFY (t
.out_size
<= array_size (out
));
1034 auto state
= mbstate_t{};
1035 auto in_next
= (const ExternT
*) nullptr;
1036 auto out_next
= (InternT
*) nullptr;
1037 auto res
= codecvt_base::result ();
1039 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1041 VERIFY (res
== cvt
.ok
);
1042 VERIFY (in_next
== in
+ t
.in_size
);
1043 VERIFY (out_next
== out
+ t
.out_size
);
1044 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.out_size
) == 0);
1045 if (t
.out_size
< array_size (out
))
1046 VERIFY (out
[t
.out_size
] == 0);
1049 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
1051 VERIFY (static_cast<size_t> (len
) == t
.in_size
);
1054 for (auto t
: offsets
)
1056 InternT out
[array_size (exp
)] = {};
1057 VERIFY (t
.in_size
<= array_size (in
));
1058 VERIFY (t
.out_size
<= array_size (out
));
1059 auto state
= mbstate_t{};
1060 auto in_next
= (const ExternT
*) nullptr;
1061 auto out_next
= (InternT
*) nullptr;
1062 auto res
= codecvt_base::result ();
1065 = cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, end (out
), out_next
);
1066 VERIFY (res
== cvt
.ok
);
1067 VERIFY (in_next
== in
+ t
.in_size
);
1068 VERIFY (out_next
== out
+ t
.out_size
);
1069 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.out_size
) == 0);
1070 if (t
.out_size
< array_size (out
))
1071 VERIFY (out
[t
.out_size
] == 0);
1074 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, array_size (out
));
1076 VERIFY (static_cast<size_t> (len
) == t
.in_size
);
1080 template <class InternT
, class ExternT
>
1082 utf8_to_ucs2_in_partial (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
1084 using namespace std
;
1085 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1086 const unsigned char input
[] = "b\u0448\uAAAA";
1087 const char16_t expected
[] = u
"b\u0448\uAAAA";
1088 static_assert (array_size (input
) == 7, "");
1089 static_assert (array_size (expected
) == 4, "");
1091 ExternT in
[array_size (input
)];
1092 InternT exp
[array_size (expected
)];
1093 copy (begin (input
), end (input
), begin (in
));
1094 copy (begin (expected
), end (expected
), begin (exp
));
1095 VERIFY (char_traits
<ExternT
>::length (in
) == 6);
1096 VERIFY (char_traits
<InternT
>::length (exp
) == 3);
1098 test_offsets_partial offsets
[] = {
1099 {1, 0, 0, 0}, // no space for first CP
1101 {3, 1, 1, 1}, // no space for second CP
1102 {2, 2, 1, 1}, // incomplete second CP
1103 {2, 1, 1, 1}, // incomplete second CP, and no space for it
1105 {6, 2, 3, 2}, // no space for third CP
1106 {4, 3, 3, 2}, // incomplete third CP
1107 {5, 3, 3, 2}, // incomplete third CP
1108 {4, 2, 3, 2}, // incomplete third CP, and no space for it
1109 {5, 2, 3, 2}, // incomplete third CP, and no space for it
1112 for (auto t
: offsets
)
1114 InternT out
[array_size (exp
) - 1] = {};
1115 VERIFY (t
.in_size
<= array_size (in
));
1116 VERIFY (t
.out_size
<= array_size (out
));
1117 VERIFY (t
.expected_in_next
<= t
.in_size
);
1118 VERIFY (t
.expected_out_next
<= t
.out_size
);
1119 auto state
= mbstate_t{};
1120 auto in_next
= (const ExternT
*) nullptr;
1121 auto out_next
= (InternT
*) nullptr;
1122 auto res
= codecvt_base::result ();
1124 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1126 VERIFY (res
== cvt
.partial
);
1127 VERIFY (in_next
== in
+ t
.expected_in_next
);
1128 VERIFY (out_next
== out
+ t
.expected_out_next
);
1129 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.expected_out_next
)
1131 if (t
.expected_out_next
< array_size (out
))
1132 VERIFY (out
[t
.expected_out_next
] == 0);
1135 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
1137 VERIFY (static_cast<size_t> (len
) == t
.expected_in_next
);
1141 template <class InternT
, class ExternT
>
1143 utf8_to_ucs2_in_error (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
1145 using namespace std
;
1146 const unsigned char input
[] = "b\u0448\uD700\U0010AAAA";
1147 const char16_t expected
[] = u
"b\u0448\uD700\U0010AAAA";
1148 static_assert (array_size (input
) == 11, "");
1149 static_assert (array_size (expected
) == 6, "");
1151 ExternT in
[array_size (input
)];
1152 InternT exp
[array_size (expected
)];
1153 copy (begin (input
), end (input
), begin (in
));
1154 copy (begin (expected
), end (expected
), begin (exp
));
1155 VERIFY (char_traits
<ExternT
>::length (in
) == 10);
1156 VERIFY (char_traits
<InternT
>::length (exp
) == 5);
1158 // There are 5 classes of errors in UTF-8 decoding
1159 // 1. Missing leading byte
1160 // 2. Missing trailing byte
1162 // 4. Overlong sequence
1163 // 5. CP out of Unicode range
1164 test_offsets_error
<unsigned char> offsets
[] = {
1166 // 1. Missing leading byte. We will replace the leading byte with
1167 // non-leading byte, such as a byte that is always invalid or a trailing
1170 // replace leading byte with invalid byte
1171 {1, 5, 0, 0, 0xFF, 0},
1172 {3, 5, 1, 1, 0xFF, 1},
1173 {6, 5, 3, 2, 0xFF, 3},
1174 {10, 5, 6, 3, 0xFF, 6},
1176 // replace leading byte with trailing byte
1177 {1, 5, 0, 0, 0b10101010, 0},
1178 {3, 5, 1, 1, 0b10101010, 1},
1179 {6, 5, 3, 2, 0b10101010, 3},
1180 {10, 5, 6, 3, 0b10101010, 6},
1182 // 2. Missing trailing byte. We will replace the trailing byte with
1183 // non-trailing byte, such as a byte that is always invalid or a leading
1184 // byte (simple ASCII byte in our case).
1186 // replace first trailing byte with ASCII byte
1187 {3, 5, 1, 1, 'z', 2},
1188 {6, 5, 3, 2, 'z', 4},
1189 {10, 5, 6, 3, 'z', 7},
1191 // replace first trailing byte with invalid byte
1192 {3, 5, 1, 1, 0xFF, 2},
1193 {6, 5, 3, 2, 0xFF, 4},
1194 {10, 5, 6, 3, 0xFF, 7},
1196 // replace second trailing byte with ASCII byte
1197 {6, 5, 3, 2, 'z', 5},
1198 {10, 5, 6, 3, 'z', 8},
1200 // replace second trailing byte with invalid byte
1201 {6, 5, 3, 2, 0xFF, 5},
1202 {10, 5, 6, 3, 0xFF, 8},
1204 // replace third trailing byte
1205 {10, 5, 6, 3, 'z', 9},
1206 {10, 5, 6, 3, 0xFF, 9},
1208 // 2.1 The following test-cases raise doubt whether error or partial should
1209 // be returned. For example, we have 4-byte sequence with valid leading
1210 // byte. If we hide the last byte we need to return partial. But, if the
1211 // second or third byte, which are visible to the call to codecvt, are
1212 // malformed then error should be returned.
1214 // replace first trailing byte with ASCII byte, also incomplete at end
1215 {5, 5, 3, 2, 'z', 4},
1216 {8, 5, 6, 3, 'z', 7},
1217 {9, 5, 6, 3, 'z', 7},
1219 // replace first trailing byte with invalid byte, also incomplete at end
1220 {5, 5, 3, 2, 0xFF, 4},
1221 {8, 5, 6, 3, 0xFF, 7},
1222 {9, 5, 6, 3, 0xFF, 7},
1224 // replace second trailing byte with ASCII byte, also incomplete at end
1225 {9, 5, 6, 3, 'z', 8},
1227 // replace second trailing byte with invalid byte, also incomplete at end
1228 {9, 5, 6, 3, 0xFF, 8},
1230 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
1232 {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
1233 {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
1234 {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
1235 {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
1237 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
1238 // just the leading byte is enough to make them overlong, i.e. for the
1239 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
1241 {3, 5, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
1242 {3, 5, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
1243 {6, 5, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
1244 {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
1246 // 5. CP above range
1247 // turn U+10AAAA into U+14AAAA by changing its leading byte
1248 {10, 5, 6, 3, 0b11110101, 6},
1249 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
1250 {10, 5, 6, 3, 0b10011010, 7},
1251 // Don't replace anything, show full 4-byte CP U+10AAAA
1252 {10, 4, 6, 3, 'b', 0},
1253 {10, 5, 6, 3, 'b', 0},
1254 // Don't replace anything, show incomplete 4-byte CP at the end. It's still
1255 // out of UCS2 range just by seeing the first byte.
1256 {7, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1257 {8, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1258 {9, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1259 {7, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1260 {8, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1261 {9, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1263 for (auto t
: offsets
)
1265 InternT out
[array_size (exp
) - 1] = {};
1266 VERIFY (t
.in_size
<= array_size (in
));
1267 VERIFY (t
.out_size
<= array_size (out
));
1268 VERIFY (t
.expected_in_next
<= t
.in_size
);
1269 VERIFY (t
.expected_out_next
<= t
.out_size
);
1270 auto old_char
= in
[t
.replace_pos
];
1271 in
[t
.replace_pos
] = t
.replace_char
;
1273 auto state
= mbstate_t{};
1274 auto in_next
= (const ExternT
*) nullptr;
1275 auto out_next
= (InternT
*) nullptr;
1276 auto res
= codecvt_base::result ();
1278 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1280 VERIFY (res
== cvt
.error
);
1281 VERIFY (in_next
== in
+ t
.expected_in_next
);
1282 VERIFY (out_next
== out
+ t
.expected_out_next
);
1283 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.expected_out_next
)
1285 if (t
.expected_out_next
< array_size (out
))
1286 VERIFY (out
[t
.expected_out_next
] == 0);
1289 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
1291 VERIFY (static_cast<size_t> (len
) == t
.expected_in_next
);
1293 in
[t
.replace_pos
] = old_char
;
// Runs the three UTF-8 -> UCS-2 (codecvt::in) test groups on CVT:
// successful conversions, partial conversions and erroneous input.
template <class InternT, class ExternT>
void
utf8_to_ucs2_in (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
{
  utf8_to_ucs2_in_ok (cvt);
  utf8_to_ucs2_in_partial (cvt);
  utf8_to_ucs2_in_error (cvt);
}
1306 template <class InternT
, class ExternT
>
1308 ucs2_to_utf8_out_ok (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
1310 using namespace std
;
1311 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1312 const char16_t input
[] = u
"b\u0448\uAAAA";
1313 const unsigned char expected
[] = "b\u0448\uAAAA";
1314 static_assert (array_size (input
) == 4, "");
1315 static_assert (array_size (expected
) == 7, "");
1317 InternT in
[array_size (input
)];
1318 ExternT exp
[array_size (expected
)];
1319 copy (begin (input
), end (input
), begin (in
));
1320 copy (begin (expected
), end (expected
), begin (exp
));
1321 VERIFY (char_traits
<InternT
>::length (in
) == 3);
1322 VERIFY (char_traits
<ExternT
>::length (exp
) == 6);
1324 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}};
1325 for (auto t
: offsets
)
1327 ExternT out
[array_size (exp
) - 1] = {};
1328 VERIFY (t
.in_size
<= array_size (in
));
1329 VERIFY (t
.out_size
<= array_size (out
));
1330 auto state
= mbstate_t{};
1331 auto in_next
= (const InternT
*) nullptr;
1332 auto out_next
= (ExternT
*) nullptr;
1333 auto res
= codecvt_base::result ();
1335 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1337 VERIFY (res
== cvt
.ok
);
1338 VERIFY (in_next
== in
+ t
.in_size
);
1339 VERIFY (out_next
== out
+ t
.out_size
);
1340 VERIFY (char_traits
<ExternT
>::compare (out
, exp
, t
.out_size
) == 0);
1341 if (t
.out_size
< array_size (out
))
1342 VERIFY (out
[t
.out_size
] == 0);
1346 template <class InternT
, class ExternT
>
1348 ucs2_to_utf8_out_partial (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
1350 using namespace std
;
1351 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1352 const char16_t input
[] = u
"b\u0448\uAAAA";
1353 const unsigned char expected
[] = "b\u0448\uAAAA";
1354 static_assert (array_size (input
) == 4, "");
1355 static_assert (array_size (expected
) == 7, "");
1357 InternT in
[array_size (input
)];
1358 ExternT exp
[array_size (expected
)];
1359 copy (begin (input
), end (input
), begin (in
));
1360 copy (begin (expected
), end (expected
), begin (exp
));
1361 VERIFY (char_traits
<InternT
>::length (in
) == 3);
1362 VERIFY (char_traits
<ExternT
>::length (exp
) == 6);
1364 test_offsets_partial offsets
[] = {
1365 {1, 0, 0, 0}, // no space for first CP
1367 {2, 1, 1, 1}, // no space for second CP
1368 {2, 2, 1, 1}, // no space for second CP
1370 {3, 3, 2, 3}, // no space for third CP
1371 {3, 4, 2, 3}, // no space for third CP
1372 {3, 5, 2, 3}, // no space for third CP
1374 for (auto t
: offsets
)
1376 ExternT out
[array_size (exp
) - 1] = {};
1377 VERIFY (t
.in_size
<= array_size (in
));
1378 VERIFY (t
.out_size
<= array_size (out
));
1379 VERIFY (t
.expected_in_next
<= t
.in_size
);
1380 VERIFY (t
.expected_out_next
<= t
.out_size
);
1381 auto state
= mbstate_t{};
1382 auto in_next
= (const InternT
*) nullptr;
1383 auto out_next
= (ExternT
*) nullptr;
1384 auto res
= codecvt_base::result ();
1386 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1388 VERIFY (res
== cvt
.partial
);
1389 VERIFY (in_next
== in
+ t
.expected_in_next
);
1390 VERIFY (out_next
== out
+ t
.expected_out_next
);
1391 VERIFY (char_traits
<ExternT
>::compare (out
, exp
, t
.expected_out_next
)
1393 if (t
.expected_out_next
< array_size (out
))
1394 VERIFY (out
[t
.expected_out_next
] == 0);
1398 template <class InternT
, class ExternT
>
1400 ucs2_to_utf8_out_error (const std::codecvt
<InternT
, ExternT
, mbstate_t> &cvt
)
1402 using namespace std
;
1403 const char16_t input
[] = u
"b\u0448\uAAAA\U0010AAAA";
1404 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
1405 static_assert (array_size (input
) == 6, "");
1406 static_assert (array_size (expected
) == 11, "");
1408 InternT in
[array_size (input
)];
1409 ExternT exp
[array_size (expected
)];
1410 copy (begin (input
), end (input
), begin (in
));
1411 copy (begin (expected
), end (expected
), begin (exp
));
1412 VERIFY (char_traits
<InternT
>::length (in
) == 5);
1413 VERIFY (char_traits
<ExternT
>::length (exp
) == 10);
1415 test_offsets_error
<InternT
> offsets
[] = {
1416 {3, 6, 0, 0, 0xD800, 0},
1417 {3, 6, 0, 0, 0xDBFF, 0},
1418 {3, 6, 0, 0, 0xDC00, 0},
1419 {3, 6, 0, 0, 0xDFFF, 0},
1421 {3, 6, 1, 1, 0xD800, 1},
1422 {3, 6, 1, 1, 0xDBFF, 1},
1423 {3, 6, 1, 1, 0xDC00, 1},
1424 {3, 6, 1, 1, 0xDFFF, 1},
1426 {3, 6, 2, 3, 0xD800, 2},
1427 {3, 6, 2, 3, 0xDBFF, 2},
1428 {3, 6, 2, 3, 0xDC00, 2},
1429 {3, 6, 2, 3, 0xDFFF, 2},
1431 // make the leading surrogate a trailing one
1432 {5, 10, 3, 6, 0xDC00, 3},
1433 {5, 10, 3, 6, 0xDFFF, 3},
1435 // make the trailing surrogate a leading one
1436 {5, 10, 3, 6, 0xD800, 4},
1437 {5, 10, 3, 6, 0xDBFF, 4},
1439 // make the trailing surrogate a BMP char
1440 {5, 10, 3, 6, u
'z', 4},
1442 // don't replace anything in the test cases bellow, just show the surrogate
1443 // pair (fourth CP) fully or partially
1444 {5, 10, 3, 6, u
'b', 0},
1445 {5, 7, 3, 6, u
'b', 0}, // no space for fourth CP
1446 {5, 8, 3, 6, u
'b', 0}, // no space for fourth CP
1447 {5, 9, 3, 6, u
'b', 0}, // no space for fourth CP
1449 {4, 10, 3, 6, u
'b', 0}, // incomplete fourth CP
1450 {4, 7, 3, 6, u
'b', 0}, // incomplete fourth CP, and no space for it
1451 {4, 8, 3, 6, u
'b', 0}, // incomplete fourth CP, and no space for it
1452 {4, 9, 3, 6, u
'b', 0}, // incomplete fourth CP, and no space for it
1455 for (auto t
: offsets
)
1457 ExternT out
[array_size (exp
) - 1] = {};
1458 VERIFY (t
.in_size
<= array_size (in
));
1459 VERIFY (t
.out_size
<= array_size (out
));
1460 VERIFY (t
.expected_in_next
<= t
.in_size
);
1461 VERIFY (t
.expected_out_next
<= t
.out_size
);
1462 auto old_char
= in
[t
.replace_pos
];
1463 in
[t
.replace_pos
] = t
.replace_char
;
1465 auto state
= mbstate_t{};
1466 auto in_next
= (const InternT
*) nullptr;
1467 auto out_next
= (ExternT
*) nullptr;
1468 auto res
= codecvt_base::result ();
1470 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1472 VERIFY (res
== cvt
.error
);
1473 VERIFY (in_next
== in
+ t
.expected_in_next
);
1474 VERIFY (out_next
== out
+ t
.expected_out_next
);
1475 VERIFY (char_traits
<ExternT
>::compare (out
, exp
, t
.expected_out_next
)
1477 if (t
.expected_out_next
< array_size (out
))
1478 VERIFY (out
[t
.expected_out_next
] == 0);
1480 in
[t
.replace_pos
] = old_char
;
// Runs the three UCS-2 -> UTF-8 (codecvt::out) test groups on CVT:
// successful conversions, partial conversions and erroneous input.
template <class InternT, class ExternT>
void
ucs2_to_utf8_out (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
{
  ucs2_to_utf8_out_ok (cvt);
  ucs2_to_utf8_out_partial (cvt);
  ucs2_to_utf8_out_error (cvt);
}
// Entry point for a UTF-8 <-> UCS-2 facet: exercises the decoding
// direction (in) first and the encoding direction (out) second.
template <class InternT, class ExternT>
void
test_utf8_ucs2_cvt (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
{
  utf8_to_ucs2_in (cvt);
  ucs2_to_utf8_out (cvt);
}
1501 enum utf16_endianess
1507 template <class Iter1
, class Iter2
>
1509 utf16_to_bytes (Iter1 f
, Iter1 l
, Iter2 o
, utf16_endianess e
)
1511 if (e
== utf16_big_endian
)
1514 *o
++ = (*f
>> 8) & 0xFF;
1521 *o
++ = (*f
>> 8) & 0xFF;
1526 template <class InternT
>
1528 utf16_to_utf32_in_ok (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
1529 utf16_endianess endianess
)
1531 using namespace std
;
1532 const char16_t input
[] = u
"b\u0448\uAAAA\U0010AAAA";
1533 const char32_t expected
[] = U
"b\u0448\uAAAA\U0010AAAA";
1534 static_assert (array_size (input
) == 6, "");
1535 static_assert (array_size (expected
) == 5, "");
1537 char in
[array_size (input
) * 2];
1538 InternT exp
[array_size (expected
)];
1539 utf16_to_bytes (begin (input
), end (input
), begin (in
), endianess
);
1540 copy (begin (expected
), end (expected
), begin (exp
));
1542 test_offsets_ok offsets
[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}, {10, 4}};
1543 for (auto t
: offsets
)
1545 InternT out
[array_size (exp
) - 1] = {};
1546 VERIFY (t
.in_size
<= array_size (in
));
1547 VERIFY (t
.out_size
<= array_size (out
));
1548 auto state
= mbstate_t{};
1549 auto in_next
= (const char *) nullptr;
1550 auto out_next
= (InternT
*) nullptr;
1551 auto res
= codecvt_base::result ();
1553 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1555 VERIFY (res
== cvt
.ok
);
1556 VERIFY (in_next
== in
+ t
.in_size
);
1557 VERIFY (out_next
== out
+ t
.out_size
);
1558 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.out_size
) == 0);
1559 if (t
.out_size
< array_size (out
))
1560 VERIFY (out
[t
.out_size
] == 0);
1563 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
1565 VERIFY (static_cast<size_t> (len
) == t
.in_size
);
1568 for (auto t
: offsets
)
1570 InternT out
[array_size (exp
)] = {};
1571 VERIFY (t
.in_size
<= array_size (in
));
1572 VERIFY (t
.out_size
<= array_size (out
));
1573 auto state
= mbstate_t{};
1574 auto in_next
= (const char *) nullptr;
1575 auto out_next
= (InternT
*) nullptr;
1576 auto res
= codecvt_base::result ();
1579 = cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, end (out
), out_next
);
1580 VERIFY (res
== cvt
.ok
);
1581 VERIFY (in_next
== in
+ t
.in_size
);
1582 VERIFY (out_next
== out
+ t
.out_size
);
1583 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.out_size
) == 0);
1584 if (t
.out_size
< array_size (out
))
1585 VERIFY (out
[t
.out_size
] == 0);
1588 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, array_size (out
));
1590 VERIFY (static_cast<size_t> (len
) == t
.in_size
);
1594 template <class InternT
>
1596 utf16_to_utf32_in_partial (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
1597 utf16_endianess endianess
)
1599 using namespace std
;
1600 const char16_t input
[] = u
"b\u0448\uAAAA\U0010AAAA";
1601 const char32_t expected
[] = U
"b\u0448\uAAAA\U0010AAAA";
1602 static_assert (array_size (input
) == 6, "");
1603 static_assert (array_size (expected
) == 5, "");
1605 char in
[array_size (input
) * 2];
1606 InternT exp
[array_size (expected
)];
1607 utf16_to_bytes (begin (input
), end (input
), begin (in
), endianess
);
1608 copy (begin (expected
), end (expected
), begin (exp
));
1610 test_offsets_partial offsets
[] = {
1611 {2, 0, 0, 0}, // no space for first CP
1612 {1, 1, 0, 0}, // incomplete first CP
1613 {1, 0, 0, 0}, // incomplete first CP, and no space for it
1615 {4, 1, 2, 1}, // no space for second CP
1616 {3, 2, 2, 1}, // incomplete second CP
1617 {3, 1, 2, 1}, // incomplete second CP, and no space for it
1619 {6, 2, 4, 2}, // no space for third CP
1620 {5, 3, 4, 2}, // incomplete third CP
1621 {5, 2, 4, 2}, // incomplete third CP, and no space for it
1623 {10, 3, 6, 3}, // no space for fourth CP
1624 {7, 4, 6, 3}, // incomplete fourth CP
1625 {8, 4, 6, 3}, // incomplete fourth CP
1626 {9, 4, 6, 3}, // incomplete fourth CP
1627 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
1628 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
1629 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
1632 for (auto t
: offsets
)
1634 InternT out
[array_size (exp
) - 1] = {};
1635 VERIFY (t
.in_size
<= array_size (in
));
1636 VERIFY (t
.out_size
<= array_size (out
));
1637 VERIFY (t
.expected_in_next
<= t
.in_size
);
1638 VERIFY (t
.expected_out_next
<= t
.out_size
);
1639 auto state
= mbstate_t{};
1640 auto in_next
= (const char *) nullptr;
1641 auto out_next
= (InternT
*) nullptr;
1642 auto res
= codecvt_base::result ();
1644 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1646 VERIFY (res
== cvt
.partial
);
1647 VERIFY (in_next
== in
+ t
.expected_in_next
);
1648 VERIFY (out_next
== out
+ t
.expected_out_next
);
1649 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.expected_out_next
)
1651 if (t
.expected_out_next
< array_size (out
))
1652 VERIFY (out
[t
.expected_out_next
] == 0);
1655 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
1657 VERIFY (static_cast<size_t> (len
) == t
.expected_in_next
);
1661 template <class InternT
>
1663 utf16_to_utf32_in_error (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
1664 utf16_endianess endianess
)
1666 using namespace std
;
1667 char16_t input
[] = u
"b\u0448\uAAAA\U0010AAAA";
1668 const char32_t expected
[] = U
"b\u0448\uAAAA\U0010AAAA";
1669 static_assert (array_size (input
) == 6, "");
1670 static_assert (array_size (expected
) == 5, "");
1672 InternT exp
[array_size (expected
)];
1673 copy (begin (expected
), end (expected
), begin (exp
));
1675 // The only possible error in UTF-16 is unpaired surrogate code units.
1676 // So we replace valid code points (scalar values) with lone surrogate CU.
1677 test_offsets_error
<char16_t
> offsets
[] = {
1678 {10, 4, 0, 0, 0xD800, 0},
1679 {10, 4, 0, 0, 0xDBFF, 0},
1680 {10, 4, 0, 0, 0xDC00, 0},
1681 {10, 4, 0, 0, 0xDFFF, 0},
1683 {10, 4, 2, 1, 0xD800, 1},
1684 {10, 4, 2, 1, 0xDBFF, 1},
1685 {10, 4, 2, 1, 0xDC00, 1},
1686 {10, 4, 2, 1, 0xDFFF, 1},
1688 {10, 4, 4, 2, 0xD800, 2},
1689 {10, 4, 4, 2, 0xDBFF, 2},
1690 {10, 4, 4, 2, 0xDC00, 2},
1691 {10, 4, 4, 2, 0xDFFF, 2},
1693 // make the leading surrogate a trailing one
1694 {10, 4, 6, 3, 0xDC00, 3},
1695 {10, 4, 6, 3, 0xDFFF, 3},
1697 // make the trailing surrogate a leading one
1698 {10, 4, 6, 3, 0xD800, 4},
1699 {10, 4, 6, 3, 0xDBFF, 4},
1701 // make the trailing surrogate a BMP char
1702 {10, 4, 6, 3, u
'z', 4},
1705 for (auto t
: offsets
)
1707 char in
[array_size (input
) * 2];
1708 InternT out
[array_size (exp
) - 1] = {};
1709 VERIFY (t
.in_size
<= array_size (in
));
1710 VERIFY (t
.out_size
<= array_size (out
));
1711 VERIFY (t
.expected_in_next
<= t
.in_size
);
1712 VERIFY (t
.expected_out_next
<= t
.out_size
);
1713 auto old_char
= input
[t
.replace_pos
];
1714 input
[t
.replace_pos
] = t
.replace_char
; // replace in input, not in in
1715 utf16_to_bytes (begin (input
), end (input
), begin (in
), endianess
);
1717 auto state
= mbstate_t{};
1718 auto in_next
= (const char *) nullptr;
1719 auto out_next
= (InternT
*) nullptr;
1720 auto res
= codecvt_base::result ();
1722 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1724 VERIFY (res
== cvt
.error
);
1725 VERIFY (in_next
== in
+ t
.expected_in_next
);
1726 VERIFY (out_next
== out
+ t
.expected_out_next
);
1727 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.expected_out_next
)
1729 if (t
.expected_out_next
< array_size (out
))
1730 VERIFY (out
[t
.expected_out_next
] == 0);
1733 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
1735 VERIFY (static_cast<size_t> (len
) == t
.expected_in_next
);
1737 input
[t
.replace_pos
] = old_char
;
1741 template <class InternT
>
1743 utf32_to_utf16_out_ok (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
1744 utf16_endianess endianess
)
1746 using namespace std
;
1747 const char32_t input
[] = U
"b\u0448\uAAAA\U0010AAAA";
1748 const char16_t expected
[] = u
"b\u0448\uAAAA\U0010AAAA";
1749 static_assert (array_size (input
) == 5, "");
1750 static_assert (array_size (expected
) == 6, "");
1752 InternT in
[array_size (input
)];
1753 char exp
[array_size (expected
) * 2];
1754 copy (begin (input
), end (input
), begin (in
));
1755 utf16_to_bytes (begin (expected
), end (expected
), begin (exp
), endianess
);
1757 test_offsets_ok offsets
[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}, {4, 10}};
1758 for (auto t
: offsets
)
1760 char out
[array_size (exp
) - 2] = {};
1761 VERIFY (t
.in_size
<= array_size (in
));
1762 VERIFY (t
.out_size
<= array_size (out
));
1763 auto state
= mbstate_t{};
1764 auto in_next
= (const InternT
*) nullptr;
1765 auto out_next
= (char *) nullptr;
1766 auto res
= codecvt_base::result ();
1768 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1770 VERIFY (res
== cvt
.ok
);
1771 VERIFY (in_next
== in
+ t
.in_size
);
1772 VERIFY (out_next
== out
+ t
.out_size
);
1773 VERIFY (char_traits
<char>::compare (out
, exp
, t
.out_size
) == 0);
1774 if (t
.out_size
< array_size (out
))
1775 VERIFY (out
[t
.out_size
] == 0);
1779 template <class InternT
>
1781 utf32_to_utf16_out_partial (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
1782 utf16_endianess endianess
)
1784 using namespace std
;
1785 const char32_t input
[] = U
"b\u0448\uAAAA\U0010AAAA";
1786 const char16_t expected
[] = u
"b\u0448\uAAAA\U0010AAAA";
1787 static_assert (array_size (input
) == 5, "");
1788 static_assert (array_size (expected
) == 6, "");
1790 InternT in
[array_size (input
)];
1791 char exp
[array_size (expected
) * 2];
1792 copy (begin (input
), end (input
), begin (in
));
1793 utf16_to_bytes (begin (expected
), end (expected
), begin (exp
), endianess
);
1795 test_offsets_partial offsets
[] = {
1796 {1, 0, 0, 0}, // no space for first CP
1797 {1, 1, 0, 0}, // no space for first CP
1799 {2, 2, 1, 2}, // no space for second CP
1800 {2, 3, 1, 2}, // no space for second CP
1802 {3, 4, 2, 4}, // no space for third CP
1803 {3, 5, 2, 4}, // no space for third CP
1805 {4, 6, 3, 6}, // no space for fourth CP
1806 {4, 7, 3, 6}, // no space for fourth CP
1807 {4, 8, 3, 6}, // no space for fourth CP
1808 {4, 9, 3, 6}, // no space for fourth CP
1810 for (auto t
: offsets
)
1812 char out
[array_size (exp
) - 2] = {};
1813 VERIFY (t
.in_size
<= array_size (in
));
1814 VERIFY (t
.out_size
<= array_size (out
));
1815 VERIFY (t
.expected_in_next
<= t
.in_size
);
1816 VERIFY (t
.expected_out_next
<= t
.out_size
);
1817 auto state
= mbstate_t{};
1818 auto in_next
= (const InternT
*) nullptr;
1819 auto out_next
= (char *) nullptr;
1820 auto res
= codecvt_base::result ();
1822 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1824 VERIFY (res
== cvt
.partial
);
1825 VERIFY (in_next
== in
+ t
.expected_in_next
);
1826 VERIFY (out_next
== out
+ t
.expected_out_next
);
1827 VERIFY (char_traits
<char>::compare (out
, exp
, t
.expected_out_next
) == 0);
1828 if (t
.expected_out_next
< array_size (out
))
1829 VERIFY (out
[t
.expected_out_next
] == 0);
1833 template <class InternT
>
1835 utf32_to_utf16_out_error (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
1836 utf16_endianess endianess
)
1838 using namespace std
;
1839 const char32_t input
[] = U
"b\u0448\uAAAA\U0010AAAA";
1840 const char16_t expected
[] = u
"b\u0448\uAAAA\U0010AAAA";
1841 static_assert (array_size (input
) == 5, "");
1842 static_assert (array_size (expected
) == 6, "");
1844 InternT in
[array_size (input
)];
1845 char exp
[array_size (expected
) * 2];
1846 copy (begin (input
), end (input
), begin (in
));
1847 utf16_to_bytes (begin (expected
), end (expected
), begin (exp
), endianess
);
1849 test_offsets_error
<InternT
> offsets
[] = {
1852 {4, 10, 0, 0, 0xD800, 0},
1853 {4, 10, 1, 2, 0xDBFF, 1},
1854 {4, 10, 2, 4, 0xDC00, 2},
1855 {4, 10, 3, 6, 0xDFFF, 3},
1858 {4, 10, 0, 0, 0x00110000, 0},
1859 {4, 10, 1, 2, 0x00110000, 1},
1860 {4, 10, 2, 4, 0x00110000, 2},
1861 {4, 10, 3, 6, 0x00110000, 3}};
1863 for (auto t
: offsets
)
1865 char out
[array_size (exp
) - 2] = {};
1866 VERIFY (t
.in_size
<= array_size (in
));
1867 VERIFY (t
.out_size
<= array_size (out
));
1868 VERIFY (t
.expected_in_next
<= t
.in_size
);
1869 VERIFY (t
.expected_out_next
<= t
.out_size
);
1870 auto old_char
= in
[t
.replace_pos
];
1871 in
[t
.replace_pos
] = t
.replace_char
;
1873 auto state
= mbstate_t{};
1874 auto in_next
= (const InternT
*) nullptr;
1875 auto out_next
= (char *) nullptr;
1876 auto res
= codecvt_base::result ();
1878 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1880 VERIFY (res
== cvt
.error
);
1881 VERIFY (in_next
== in
+ t
.expected_in_next
);
1882 VERIFY (out_next
== out
+ t
.expected_out_next
);
1883 VERIFY (char_traits
<char>::compare (out
, exp
, t
.expected_out_next
) == 0);
1884 if (t
.expected_out_next
< array_size (out
))
1885 VERIFY (out
[t
.expected_out_next
] == 0);
1887 in
[t
.replace_pos
] = old_char
;
1891 template <class InternT
>
1893 test_utf16_utf32_cvt (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
1894 utf16_endianess endianess
)
1896 utf16_to_utf32_in_ok (cvt
, endianess
);
1897 utf16_to_utf32_in_partial (cvt
, endianess
);
1898 utf16_to_utf32_in_error (cvt
, endianess
);
1899 utf32_to_utf16_out_ok (cvt
, endianess
);
1900 utf32_to_utf16_out_partial (cvt
, endianess
);
1901 utf32_to_utf16_out_error (cvt
, endianess
);
1904 template <class InternT
>
1906 utf16_to_ucs2_in_ok (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
1907 utf16_endianess endianess
)
1909 using namespace std
;
1910 const char16_t input
[] = u
"b\u0448\uAAAA";
1911 const char16_t expected
[] = u
"b\u0448\uAAAA";
1912 static_assert (array_size (input
) == 4, "");
1913 static_assert (array_size (expected
) == 4, "");
1915 char in
[array_size (input
) * 2];
1916 InternT exp
[array_size (expected
)];
1917 utf16_to_bytes (begin (input
), end (input
), begin (in
), endianess
);
1918 copy (begin (expected
), end (expected
), begin (exp
));
1920 test_offsets_ok offsets
[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}};
1921 for (auto t
: offsets
)
1923 InternT out
[array_size (exp
) - 1] = {};
1924 VERIFY (t
.in_size
<= array_size (in
));
1925 VERIFY (t
.out_size
<= array_size (out
));
1926 auto state
= mbstate_t{};
1927 auto in_next
= (const char *) nullptr;
1928 auto out_next
= (InternT
*) nullptr;
1929 auto res
= codecvt_base::result ();
1931 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
1933 VERIFY (res
== cvt
.ok
);
1934 VERIFY (in_next
== in
+ t
.in_size
);
1935 VERIFY (out_next
== out
+ t
.out_size
);
1936 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.out_size
) == 0);
1937 if (t
.out_size
< array_size (out
))
1938 VERIFY (out
[t
.out_size
] == 0);
1941 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
1943 VERIFY (static_cast<size_t> (len
) == t
.in_size
);
1946 for (auto t
: offsets
)
1948 InternT out
[array_size (exp
)] = {};
1949 VERIFY (t
.in_size
<= array_size (in
));
1950 VERIFY (t
.out_size
<= array_size (out
));
1951 auto state
= mbstate_t{};
1952 auto in_next
= (const char *) nullptr;
1953 auto out_next
= (InternT
*) nullptr;
1954 auto res
= codecvt_base::result ();
1957 = cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, end (out
), out_next
);
1958 VERIFY (res
== cvt
.ok
);
1959 VERIFY (in_next
== in
+ t
.in_size
);
1960 VERIFY (out_next
== out
+ t
.out_size
);
1961 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.out_size
) == 0);
1962 if (t
.out_size
< array_size (out
))
1963 VERIFY (out
[t
.out_size
] == 0);
1966 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, array_size (out
));
1968 VERIFY (static_cast<size_t> (len
) == t
.in_size
);
1972 template <class InternT
>
1974 utf16_to_ucs2_in_partial (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
1975 utf16_endianess endianess
)
1977 using namespace std
;
1978 const char16_t input
[] = u
"b\u0448\uAAAA";
1979 const char16_t expected
[] = u
"b\u0448\uAAAA";
1980 static_assert (array_size (input
) == 4, "");
1981 static_assert (array_size (expected
) == 4, "");
1983 char in
[array_size (input
) * 2];
1984 InternT exp
[array_size (expected
)];
1985 utf16_to_bytes (begin (input
), end (input
), begin (in
), endianess
);
1986 copy (begin (expected
), end (expected
), begin (exp
));
1988 test_offsets_partial offsets
[] = {
1989 {2, 0, 0, 0}, // no space for first CP
1990 {1, 1, 0, 0}, // incomplete first CP
1991 {1, 0, 0, 0}, // incomplete first CP, and no space for it
1993 {4, 1, 2, 1}, // no space for second CP
1994 {3, 2, 2, 1}, // incomplete second CP
1995 {3, 1, 2, 1}, // incomplete second CP, and no space for it
1997 {6, 2, 4, 2}, // no space for third CP
1998 {5, 3, 4, 2}, // incomplete third CP
1999 {5, 2, 4, 2}, // incomplete third CP, and no space for it
2002 for (auto t
: offsets
)
2004 InternT out
[array_size (exp
) - 1] = {};
2005 VERIFY (t
.in_size
<= array_size (in
));
2006 VERIFY (t
.out_size
<= array_size (out
));
2007 VERIFY (t
.expected_in_next
<= t
.in_size
);
2008 VERIFY (t
.expected_out_next
<= t
.out_size
);
2009 auto state
= mbstate_t{};
2010 auto in_next
= (const char *) nullptr;
2011 auto out_next
= (InternT
*) nullptr;
2012 auto res
= codecvt_base::result ();
2014 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
2016 VERIFY (res
== cvt
.partial
);
2017 VERIFY (in_next
== in
+ t
.expected_in_next
);
2018 VERIFY (out_next
== out
+ t
.expected_out_next
);
2019 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.expected_out_next
)
2021 if (t
.expected_out_next
< array_size (out
))
2022 VERIFY (out
[t
.expected_out_next
] == 0);
2025 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
2027 VERIFY (static_cast<size_t> (len
) == t
.expected_in_next
);
2031 template <class InternT
>
2033 utf16_to_ucs2_in_error (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
2034 utf16_endianess endianess
)
2036 using namespace std
;
2037 char16_t input
[] = u
"b\u0448\uAAAA\U0010AAAA";
2038 const char16_t expected
[] = u
"b\u0448\uAAAA\U0010AAAA";
2039 static_assert (array_size (input
) == 6, "");
2040 static_assert (array_size (expected
) == 6, "");
2042 InternT exp
[array_size (expected
)];
2043 copy (begin (expected
), end (expected
), begin (exp
));
2045 // The only possible error in UTF-16 is unpaired surrogate code units.
2046 // Additionally, because the target encoding is UCS-2, a proper pair of
2047 // surrogates is also error. Simply, any surrogate CU is error.
2048 test_offsets_error
<char16_t
> offsets
[] = {
2049 {6, 3, 0, 0, 0xD800, 0},
2050 {6, 3, 0, 0, 0xDBFF, 0},
2051 {6, 3, 0, 0, 0xDC00, 0},
2052 {6, 3, 0, 0, 0xDFFF, 0},
2054 {6, 3, 2, 1, 0xD800, 1},
2055 {6, 3, 2, 1, 0xDBFF, 1},
2056 {6, 3, 2, 1, 0xDC00, 1},
2057 {6, 3, 2, 1, 0xDFFF, 1},
2059 {6, 3, 4, 2, 0xD800, 2},
2060 {6, 3, 4, 2, 0xDBFF, 2},
2061 {6, 3, 4, 2, 0xDC00, 2},
2062 {6, 3, 4, 2, 0xDFFF, 2},
2064 // make the leading surrogate a trailing one
2065 {10, 5, 6, 3, 0xDC00, 3},
2066 {10, 5, 6, 3, 0xDFFF, 3},
2068 // make the trailing surrogate a leading one
2069 {10, 5, 6, 3, 0xD800, 4},
2070 {10, 5, 6, 3, 0xDBFF, 4},
2072 // make the trailing surrogate a BMP char
2073 {10, 5, 6, 3, u
'z', 4},
2075 // don't replace anything in the test cases bellow, just show the surrogate
2076 // pair (fourth CP) fully or partially (just the first surrogate)
2077 {10, 5, 6, 3, u
'b', 0},
2078 {8, 5, 6, 3, u
'b', 0},
2079 {9, 5, 6, 3, u
'b', 0},
2081 {10, 4, 6, 3, u
'b', 0},
2082 {8, 4, 6, 3, u
'b', 0},
2083 {9, 4, 6, 3, u
'b', 0},
2086 for (auto t
: offsets
)
2088 char in
[array_size (input
) * 2];
2089 InternT out
[array_size (exp
) - 1] = {};
2090 VERIFY (t
.in_size
<= array_size (in
));
2091 VERIFY (t
.out_size
<= array_size (out
));
2092 VERIFY (t
.expected_in_next
<= t
.in_size
);
2093 VERIFY (t
.expected_out_next
<= t
.out_size
);
2094 auto old_char
= input
[t
.replace_pos
];
2095 input
[t
.replace_pos
] = t
.replace_char
; // replace in input, not in in
2096 utf16_to_bytes (begin (input
), end (input
), begin (in
), endianess
);
2098 auto state
= mbstate_t{};
2099 auto in_next
= (const char *) nullptr;
2100 auto out_next
= (InternT
*) nullptr;
2101 auto res
= codecvt_base::result ();
2103 res
= cvt
.in (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
2105 VERIFY (res
== cvt
.error
);
2106 VERIFY (in_next
== in
+ t
.expected_in_next
);
2107 VERIFY (out_next
== out
+ t
.expected_out_next
);
2108 VERIFY (char_traits
<InternT
>::compare (out
, exp
, t
.expected_out_next
)
2110 if (t
.expected_out_next
< array_size (out
))
2111 VERIFY (out
[t
.expected_out_next
] == 0);
2114 auto len
= cvt
.length (state
, in
, in
+ t
.in_size
, t
.out_size
);
2116 VERIFY (static_cast<size_t> (len
) == t
.expected_in_next
);
2118 input
[t
.replace_pos
] = old_char
;
2122 template <class InternT
>
2124 ucs2_to_utf16_out_ok (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
2125 utf16_endianess endianess
)
2127 using namespace std
;
2128 const char16_t input
[] = u
"b\u0448\uAAAA";
2129 const char16_t expected
[] = u
"b\u0448\uAAAA";
2130 static_assert (array_size (input
) == 4, "");
2131 static_assert (array_size (expected
) == 4, "");
2133 InternT in
[array_size (input
)];
2134 char exp
[array_size (expected
) * 2];
2135 copy (begin (input
), end (input
), begin (in
));
2136 utf16_to_bytes (begin (expected
), end (expected
), begin (exp
), endianess
);
2138 test_offsets_ok offsets
[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}};
2139 for (auto t
: offsets
)
2141 char out
[array_size (exp
) - 2] = {};
2142 VERIFY (t
.in_size
<= array_size (in
));
2143 VERIFY (t
.out_size
<= array_size (out
));
2144 auto state
= mbstate_t{};
2145 auto in_next
= (const InternT
*) nullptr;
2146 auto out_next
= (char *) nullptr;
2147 auto res
= codecvt_base::result ();
2149 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
2151 VERIFY (res
== cvt
.ok
);
2152 VERIFY (in_next
== in
+ t
.in_size
);
2153 VERIFY (out_next
== out
+ t
.out_size
);
2154 VERIFY (char_traits
<char>::compare (out
, exp
, t
.out_size
) == 0);
2155 if (t
.out_size
< array_size (out
))
2156 VERIFY (out
[t
.out_size
] == 0);
2160 template <class InternT
>
2162 ucs2_to_utf16_out_partial (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
2163 utf16_endianess endianess
)
2165 using namespace std
;
2166 const char16_t input
[] = u
"b\u0448\uAAAA";
2167 const char16_t expected
[] = u
"b\u0448\uAAAA";
2168 static_assert (array_size (input
) == 4, "");
2169 static_assert (array_size (expected
) == 4, "");
2171 InternT in
[array_size (input
)];
2172 char exp
[array_size (expected
) * 2];
2173 copy (begin (input
), end (input
), begin (in
));
2174 utf16_to_bytes (begin (expected
), end (expected
), begin (exp
), endianess
);
2176 test_offsets_partial offsets
[] = {
2177 {1, 0, 0, 0}, // no space for first CP
2178 {1, 1, 0, 0}, // no space for first CP
2180 {2, 2, 1, 2}, // no space for second CP
2181 {2, 3, 1, 2}, // no space for second CP
2183 {3, 4, 2, 4}, // no space for third CP
2184 {3, 5, 2, 4}, // no space for third CP
2186 for (auto t
: offsets
)
2188 char out
[array_size (exp
) - 2] = {};
2189 VERIFY (t
.in_size
<= array_size (in
));
2190 VERIFY (t
.out_size
<= array_size (out
));
2191 VERIFY (t
.expected_in_next
<= t
.in_size
);
2192 VERIFY (t
.expected_out_next
<= t
.out_size
);
2193 auto state
= mbstate_t{};
2194 auto in_next
= (const InternT
*) nullptr;
2195 auto out_next
= (char *) nullptr;
2196 auto res
= codecvt_base::result ();
2198 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
2200 VERIFY (res
== cvt
.partial
);
2201 VERIFY (in_next
== in
+ t
.expected_in_next
);
2202 VERIFY (out_next
== out
+ t
.expected_out_next
);
2203 VERIFY (char_traits
<char>::compare (out
, exp
, t
.expected_out_next
) == 0);
2204 if (t
.expected_out_next
< array_size (out
))
2205 VERIFY (out
[t
.expected_out_next
] == 0);
2209 template <class InternT
>
2211 ucs2_to_utf16_out_error (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
2212 utf16_endianess endianess
)
2214 using namespace std
;
2215 const char16_t input
[] = u
"b\u0448\uAAAA\U0010AAAA";
2216 const char16_t expected
[] = u
"b\u0448\uAAAA\U0010AAAA";
2217 static_assert (array_size (input
) == 6, "");
2218 static_assert (array_size (expected
) == 6, "");
2220 InternT in
[array_size (input
)];
2221 char exp
[array_size (expected
) * 2];
2222 copy (begin (input
), end (input
), begin (in
));
2223 utf16_to_bytes (begin (expected
), end (expected
), begin (exp
), endianess
);
2225 test_offsets_error
<InternT
> offsets
[] = {
2226 {3, 6, 0, 0, 0xD800, 0},
2227 {3, 6, 0, 0, 0xDBFF, 0},
2228 {3, 6, 0, 0, 0xDC00, 0},
2229 {3, 6, 0, 0, 0xDFFF, 0},
2231 {3, 6, 1, 2, 0xD800, 1},
2232 {3, 6, 1, 2, 0xDBFF, 1},
2233 {3, 6, 1, 2, 0xDC00, 1},
2234 {3, 6, 1, 2, 0xDFFF, 1},
2236 {3, 6, 2, 4, 0xD800, 2},
2237 {3, 6, 2, 4, 0xDBFF, 2},
2238 {3, 6, 2, 4, 0xDC00, 2},
2239 {3, 6, 2, 4, 0xDFFF, 2},
2241 // make the leading surrogate a trailing one
2242 {5, 10, 3, 6, 0xDC00, 3},
2243 {5, 10, 3, 6, 0xDFFF, 3},
2245 // make the trailing surrogate a leading one
2246 {5, 10, 3, 6, 0xD800, 4},
2247 {5, 10, 3, 6, 0xDBFF, 4},
2249 // make the trailing surrogate a BMP char
2250 {5, 10, 3, 6, u
'z', 4},
2252 // don't replace anything in the test cases bellow, just show the surrogate
2253 // pair (fourth CP) fully or partially (just the first surrogate)
2254 {5, 10, 3, 6, u
'b', 0},
2255 {5, 8, 3, 6, u
'b', 0},
2256 {5, 9, 3, 6, u
'b', 0},
2258 {4, 10, 3, 6, u
'b', 0},
2259 {4, 8, 3, 6, u
'b', 0},
2260 {4, 9, 3, 6, u
'b', 0},
2263 for (auto t
: offsets
)
2265 char out
[array_size (exp
) - 2] = {};
2266 VERIFY (t
.in_size
<= array_size (in
));
2267 VERIFY (t
.out_size
<= array_size (out
));
2268 VERIFY (t
.expected_in_next
<= t
.in_size
);
2269 VERIFY (t
.expected_out_next
<= t
.out_size
);
2270 auto old_char
= in
[t
.replace_pos
];
2271 in
[t
.replace_pos
] = t
.replace_char
;
2273 auto state
= mbstate_t{};
2274 auto in_next
= (const InternT
*) nullptr;
2275 auto out_next
= (char *) nullptr;
2276 auto res
= codecvt_base::result ();
2278 res
= cvt
.out (state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
,
2280 VERIFY (res
== cvt
.error
);
2281 VERIFY (in_next
== in
+ t
.expected_in_next
);
2282 VERIFY (out_next
== out
+ t
.expected_out_next
);
2283 VERIFY (char_traits
<char>::compare (out
, exp
, t
.expected_out_next
) == 0);
2284 if (t
.expected_out_next
< array_size (out
))
2285 VERIFY (out
[t
.expected_out_next
] == 0);
2287 in
[t
.replace_pos
] = old_char
;
2291 template <class InternT
>
2293 test_utf16_ucs2_cvt (const std::codecvt
<InternT
, char, mbstate_t> &cvt
,
2294 utf16_endianess endianess
)
2296 utf16_to_ucs2_in_ok (cvt
, endianess
);
2297 utf16_to_ucs2_in_partial (cvt
, endianess
);
2298 utf16_to_ucs2_in_error (cvt
, endianess
);
2299 ucs2_to_utf16_out_ok (cvt
, endianess
);
2300 ucs2_to_utf16_out_partial (cvt
, endianess
);
2301 ucs2_to_utf16_out_error (cvt
, endianess
);