1 //===----------------------------------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT
11 // Requires the fix in 390840f.
12 // XFAIL: using-built-library-before-llvm-18
19 #include "test_macros.h"
21 struct test_offsets_ok
{
25 struct test_offsets_partial
{
28 size_t expected_in_next
;
29 size_t expected_out_next
;
32 template <class CharT
>
33 struct test_offsets_error
{
36 size_t expected_in_next
;
37 size_t expected_out_next
;
42 #define array_size(x) (sizeof(x) / sizeof(x)[0])
45 using std::char_traits
;
46 using std::codecvt_base
;
50 template <class InternT
, class ExternT
>
51 void utf8_to_utf32_in_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
52 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
53 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
54 const char32_t expected
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
55 static_assert(array_size(input
) == 11, "");
56 static_assert(array_size(expected
) == 5, "");
58 ExternT in
[array_size(input
)];
59 InternT exp
[array_size(expected
)];
60 copy(begin(input
), end(input
), begin(in
));
61 copy(begin(expected
), end(expected
), begin(exp
));
62 assert(char_traits
<ExternT
>::length(in
) == 10);
63 assert(char_traits
<InternT
>::length(exp
) == 4);
64 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}};
65 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
66 test_offsets_ok t
= *it
;
67 InternT out
[array_size(exp
) - 1] = {};
68 assert(t
.in_size
<= array_size(in
));
69 assert(t
.out_size
<= array_size(out
));
71 const ExternT
* in_next
= nullptr;
72 InternT
* out_next
= nullptr;
73 codecvt_base::result res
= codecvt_base::ok
;
75 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
76 assert(res
== cvt
.ok
);
77 assert(in_next
== in
+ t
.in_size
);
78 assert(out_next
== out
+ t
.out_size
);
79 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
80 if (t
.out_size
< array_size(out
))
81 assert(out
[t
.out_size
] == 0);
84 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
86 assert(static_cast<size_t>(len
) == t
.in_size
);
89 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
90 test_offsets_ok t
= *it
;
91 InternT out
[array_size(exp
)] = {};
92 assert(t
.in_size
<= array_size(in
));
93 assert(t
.out_size
<= array_size(out
));
95 const ExternT
* in_next
= nullptr;
96 InternT
* out_next
= nullptr;
97 codecvt_base::result res
= codecvt_base::ok
;
99 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, end(out
), out_next
);
100 assert(res
== cvt
.ok
);
101 assert(in_next
== in
+ t
.in_size
);
102 assert(out_next
== out
+ t
.out_size
);
103 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
104 if (t
.out_size
< array_size(out
))
105 assert(out
[t
.out_size
] == 0);
108 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, array_size(out
));
110 assert(static_cast<size_t>(len
) == t
.in_size
);
114 template <class InternT
, class ExternT
>
115 void utf8_to_utf32_in_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
116 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
117 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
118 const char32_t expected
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
119 static_assert(array_size(input
) == 11, "");
120 static_assert(array_size(expected
) == 5, "");
122 ExternT in
[array_size(input
)];
123 InternT exp
[array_size(expected
)];
124 copy(begin(input
), end(input
), begin(in
));
125 copy(begin(expected
), end(expected
), begin(exp
));
126 assert(char_traits
<ExternT
>::length(in
) == 10);
127 assert(char_traits
<InternT
>::length(exp
) == 4);
129 test_offsets_partial offsets
[] = {
130 {1, 0, 0, 0}, // no space for first CP
132 {3, 1, 1, 1}, // no space for second CP
133 {2, 2, 1, 1}, // incomplete second CP
134 {2, 1, 1, 1}, // incomplete second CP, and no space for it
136 {6, 2, 3, 2}, // no space for third CP
137 {4, 3, 3, 2}, // incomplete third CP
138 {5, 3, 3, 2}, // incomplete third CP
139 {4, 2, 3, 2}, // incomplete third CP, and no space for it
140 {5, 2, 3, 2}, // incomplete third CP, and no space for it
142 {10, 3, 6, 3}, // no space for fourth CP
143 {7, 4, 6, 3}, // incomplete fourth CP
144 {8, 4, 6, 3}, // incomplete fourth CP
145 {9, 4, 6, 3}, // incomplete fourth CP
146 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
147 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
148 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
151 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
152 test_offsets_partial t
= *it
;
153 InternT out
[array_size(exp
) - 1] = {};
154 assert(t
.in_size
<= array_size(in
));
155 assert(t
.out_size
<= array_size(out
));
156 assert(t
.expected_in_next
<= t
.in_size
);
157 assert(t
.expected_out_next
<= t
.out_size
);
158 mbstate_t state
= {};
159 const ExternT
* in_next
= nullptr;
160 InternT
* out_next
= nullptr;
161 codecvt_base::result res
= codecvt_base::ok
;
163 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
164 assert(res
== cvt
.partial
);
165 assert(in_next
== in
+ t
.expected_in_next
);
166 assert(out_next
== out
+ t
.expected_out_next
);
167 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
168 if (t
.expected_out_next
< array_size(out
))
169 assert(out
[t
.expected_out_next
] == 0);
172 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
174 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
178 template <class InternT
, class ExternT
>
179 void utf8_to_utf32_in_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
180 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
181 const unsigned char input
[] = "b\u0448\uD700\U0010AAAA";
182 const char32_t expected
[] = {'b', 0x0448, 0xD700, 0x10AAAA, 0};
183 static_assert(array_size(input
) == 11, "");
184 static_assert(array_size(expected
) == 5, "");
186 ExternT in
[array_size(input
)];
187 InternT exp
[array_size(expected
)];
188 copy(begin(input
), end(input
), begin(in
));
189 copy(begin(expected
), end(expected
), begin(exp
));
190 assert(char_traits
<ExternT
>::length(in
) == 10);
191 assert(char_traits
<InternT
>::length(exp
) == 4);
193 // There are 5 classes of errors in UTF-8 decoding
194 // 1. Missing leading byte
195 // 2. Missing trailing byte
197 // 4. Overlong sequence
198 // 5. CP out of Unicode range
199 test_offsets_error
<unsigned char> offsets
[] = {
201 // 1. Missing leading byte. We will replace the leading byte with
202 // non-leading byte, such as a byte that is always invalid or a trailing
205 // replace leading byte with invalid byte
206 {1, 4, 0, 0, 0xFF, 0},
207 {3, 4, 1, 1, 0xFF, 1},
208 {6, 4, 3, 2, 0xFF, 3},
209 {10, 4, 6, 3, 0xFF, 6},
211 // replace leading byte with trailing byte
212 {1, 4, 0, 0, 0b10101010, 0},
213 {3, 4, 1, 1, 0b10101010, 1},
214 {6, 4, 3, 2, 0b10101010, 3},
215 {10, 4, 6, 3, 0b10101010, 6},
217 // 2. Missing trailing byte. We will replace the trailing byte with
218 // non-trailing byte, such as a byte that is always invalid or a leading
219 // byte (simple ASCII byte in our case).
221 // replace first trailing byte with ASCII byte
222 {3, 4, 1, 1, 'z', 2},
223 {6, 4, 3, 2, 'z', 4},
224 {10, 4, 6, 3, 'z', 7},
226 // replace first trailing byte with invalid byte
227 {3, 4, 1, 1, 0xFF, 2},
228 {6, 4, 3, 2, 0xFF, 4},
229 {10, 4, 6, 3, 0xFF, 7},
231 // replace second trailing byte with ASCII byte
232 {6, 4, 3, 2, 'z', 5},
233 {10, 4, 6, 3, 'z', 8},
235 // replace second trailing byte with invalid byte
236 {6, 4, 3, 2, 0xFF, 5},
237 {10, 4, 6, 3, 0xFF, 8},
239 // replace third trailing byte
240 {10, 4, 6, 3, 'z', 9},
241 {10, 4, 6, 3, 0xFF, 9},
243 // 2.1 The following test-cases raise doubt whether error or partial should
244 // be returned. For example, we have 4-byte sequence with valid leading
245 // byte. If we hide the last byte we need to return partial. But, if the
246 // second or third byte, which are visible to the call to codecvt, are
247 // malformed then error should be returned.
249 // replace first trailing byte with ASCII byte, also incomplete at end
250 {5, 4, 3, 2, 'z', 4},
251 {8, 4, 6, 3, 'z', 7},
252 {9, 4, 6, 3, 'z', 7},
254 // replace first trailing byte with invalid byte, also incomplete at end
255 {5, 4, 3, 2, 0xFF, 4},
256 {8, 4, 6, 3, 0xFF, 7},
257 {9, 4, 6, 3, 0xFF, 7},
259 // replace second trailing byte with ASCII byte, also incomplete at end
260 {9, 4, 6, 3, 'z', 8},
262 // replace second trailing byte with invalid byte, also incomplete at end
263 {9, 4, 6, 3, 0xFF, 8},
265 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
267 {6, 4, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
268 {6, 4, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
269 {6, 4, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
270 {6, 4, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
272 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
273 // just the leading byte is enough to make them overlong, i.e. for the
274 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
276 {3, 4, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
277 {3, 4, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
278 {6, 4, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
279 {10, 4, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
282 // turn U+10AAAA into U+14AAAA by changing its leading byte
283 {10, 4, 6, 3, 0b11110101, 6},
284 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
285 {10, 4, 6, 3, 0b10011010, 7},
287 for (test_offsets_error
<unsigned char>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
288 test_offsets_error
<unsigned char> t
= *it
;
289 InternT out
[array_size(exp
) - 1] = {};
290 assert(t
.in_size
<= array_size(in
));
291 assert(t
.out_size
<= array_size(out
));
292 assert(t
.expected_in_next
<= t
.in_size
);
293 assert(t
.expected_out_next
<= t
.out_size
);
294 ExternT old_char
= in
[t
.replace_pos
];
295 in
[t
.replace_pos
] = t
.replace_char
;
297 mbstate_t state
= {};
298 const ExternT
* in_next
= nullptr;
299 InternT
* out_next
= nullptr;
300 codecvt_base::result res
= codecvt_base::ok
;
302 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
303 assert(res
== cvt
.error
);
304 assert(in_next
== in
+ t
.expected_in_next
);
305 assert(out_next
== out
+ t
.expected_out_next
);
306 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
307 if (t
.expected_out_next
< array_size(out
))
308 assert(out
[t
.expected_out_next
] == 0);
311 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
313 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
315 in
[t
.replace_pos
] = old_char
;
319 template <class InternT
, class ExternT
>
320 void utf8_to_utf32_in(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
321 utf8_to_utf32_in_ok(cvt
);
322 utf8_to_utf32_in_partial(cvt
);
323 utf8_to_utf32_in_error(cvt
);
326 template <class InternT
, class ExternT
>
327 void utf32_to_utf8_out_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
328 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
329 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
330 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
331 static_assert(array_size(input
) == 5, "");
332 static_assert(array_size(expected
) == 11, "");
334 InternT in
[array_size(input
)];
335 ExternT exp
[array_size(expected
)];
336 copy(begin(input
), end(input
), begin(in
));
337 copy(begin(expected
), end(expected
), begin(exp
));
338 assert(char_traits
<InternT
>::length(in
) == 4);
339 assert(char_traits
<ExternT
>::length(exp
) == 10);
341 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}};
342 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
343 test_offsets_ok t
= *it
;
344 ExternT out
[array_size(exp
) - 1] = {};
345 assert(t
.in_size
<= array_size(in
));
346 assert(t
.out_size
<= array_size(out
));
347 mbstate_t state
= {};
348 const InternT
* in_next
= nullptr;
349 ExternT
* out_next
= nullptr;
350 codecvt_base::result res
= codecvt_base::ok
;
352 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
353 assert(res
== cvt
.ok
);
354 assert(in_next
== in
+ t
.in_size
);
355 assert(out_next
== out
+ t
.out_size
);
356 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.out_size
) == 0);
357 if (t
.out_size
< array_size(out
))
358 assert(out
[t
.out_size
] == 0);
362 template <class InternT
, class ExternT
>
363 void utf32_to_utf8_out_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
364 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
365 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
366 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
367 static_assert(array_size(input
) == 5, "");
368 static_assert(array_size(expected
) == 11, "");
370 InternT in
[array_size(input
)];
371 ExternT exp
[array_size(expected
)];
372 copy(begin(input
), end(input
), begin(in
));
373 copy(begin(expected
), end(expected
), begin(exp
));
374 assert(char_traits
<InternT
>::length(in
) == 4);
375 assert(char_traits
<ExternT
>::length(exp
) == 10);
377 test_offsets_partial offsets
[] = {
378 {1, 0, 0, 0}, // no space for first CP
380 {2, 1, 1, 1}, // no space for second CP
381 {2, 2, 1, 1}, // no space for second CP
383 {3, 3, 2, 3}, // no space for third CP
384 {3, 4, 2, 3}, // no space for third CP
385 {3, 5, 2, 3}, // no space for third CP
387 {4, 6, 3, 6}, // no space for fourth CP
388 {4, 7, 3, 6}, // no space for fourth CP
389 {4, 8, 3, 6}, // no space for fourth CP
390 {4, 9, 3, 6}, // no space for fourth CP
392 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
393 test_offsets_partial t
= *it
;
394 ExternT out
[array_size(exp
) - 1] = {};
395 assert(t
.in_size
<= array_size(in
));
396 assert(t
.out_size
<= array_size(out
));
397 assert(t
.expected_in_next
<= t
.in_size
);
398 assert(t
.expected_out_next
<= t
.out_size
);
399 mbstate_t state
= {};
400 const InternT
* in_next
= nullptr;
401 ExternT
* out_next
= nullptr;
402 codecvt_base::result res
= codecvt_base::ok
;
404 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
405 assert(res
== cvt
.partial
);
406 assert(in_next
== in
+ t
.expected_in_next
);
407 assert(out_next
== out
+ t
.expected_out_next
);
408 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
409 if (t
.expected_out_next
< array_size(out
))
410 assert(out
[t
.expected_out_next
] == 0);
414 template <class InternT
, class ExternT
>
415 void utf32_to_utf8_out_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
416 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
417 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
418 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
419 static_assert(array_size(input
) == 5, "");
420 static_assert(array_size(expected
) == 11, "");
422 InternT in
[array_size(input
)];
423 ExternT exp
[array_size(expected
)];
424 copy(begin(input
), end(input
), begin(in
));
425 copy(begin(expected
), end(expected
), begin(exp
));
426 assert(char_traits
<InternT
>::length(in
) == 4);
427 assert(char_traits
<ExternT
>::length(exp
) == 10);
429 test_offsets_error
<InternT
> offsets
[] = {
432 {4, 10, 0, 0, 0xD800, 0},
433 {4, 10, 1, 1, 0xDBFF, 1},
434 {4, 10, 2, 3, 0xDC00, 2},
435 {4, 10, 3, 6, 0xDFFF, 3},
438 {4, 10, 0, 0, 0x00110000, 0},
439 {4, 10, 1, 1, 0x00110000, 1},
440 {4, 10, 2, 3, 0x00110000, 2},
441 {4, 10, 3, 6, 0x00110000, 3}};
443 for (test_offsets_error
<InternT
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
444 test_offsets_error
<InternT
> t
= *it
;
445 ExternT out
[array_size(exp
) - 1] = {};
446 assert(t
.in_size
<= array_size(in
));
447 assert(t
.out_size
<= array_size(out
));
448 assert(t
.expected_in_next
<= t
.in_size
);
449 assert(t
.expected_out_next
<= t
.out_size
);
450 InternT old_char
= in
[t
.replace_pos
];
451 in
[t
.replace_pos
] = t
.replace_char
;
453 mbstate_t state
= {};
454 const InternT
* in_next
= nullptr;
455 ExternT
* out_next
= nullptr;
456 codecvt_base::result res
= codecvt_base::ok
;
458 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
459 assert(res
== cvt
.error
);
460 assert(in_next
== in
+ t
.expected_in_next
);
461 assert(out_next
== out
+ t
.expected_out_next
);
462 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
463 if (t
.expected_out_next
< array_size(out
))
464 assert(out
[t
.expected_out_next
] == 0);
466 in
[t
.replace_pos
] = old_char
;
470 template <class InternT
, class ExternT
>
471 void utf32_to_utf8_out(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
472 utf32_to_utf8_out_ok(cvt
);
473 utf32_to_utf8_out_partial(cvt
);
474 utf32_to_utf8_out_error(cvt
);
477 template <class InternT
, class ExternT
>
478 void test_utf8_utf32_cvt(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
479 utf8_to_utf32_in(cvt
);
480 utf32_to_utf8_out(cvt
);
483 template <class InternT
, class ExternT
>
484 void utf8_to_utf16_in_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
485 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
486 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
487 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
488 static_assert(array_size(input
) == 11, "");
489 static_assert(array_size(expected
) == 6, "");
491 ExternT in
[array_size(input
)];
492 InternT exp
[array_size(expected
)];
493 copy(begin(input
), end(input
), begin(in
));
494 copy(begin(expected
), end(expected
), begin(exp
));
495 assert(char_traits
<ExternT
>::length(in
) == 10);
496 assert(char_traits
<InternT
>::length(exp
) == 5);
498 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 5}};
499 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
500 test_offsets_ok t
= *it
;
501 InternT out
[array_size(exp
) - 1] = {};
502 assert(t
.in_size
<= array_size(in
));
503 assert(t
.out_size
<= array_size(out
));
504 mbstate_t state
= {};
505 const ExternT
* in_next
= nullptr;
506 InternT
* out_next
= nullptr;
507 codecvt_base::result res
= codecvt_base::ok
;
509 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
510 assert(res
== cvt
.ok
);
511 assert(in_next
== in
+ t
.in_size
);
512 assert(out_next
== out
+ t
.out_size
);
513 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
514 if (t
.out_size
< array_size(out
))
515 assert(out
[t
.out_size
] == 0);
518 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
520 assert(static_cast<size_t>(len
) == t
.in_size
);
523 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
524 test_offsets_ok t
= *it
;
525 InternT out
[array_size(exp
)] = {};
526 assert(t
.in_size
<= array_size(in
));
527 assert(t
.out_size
<= array_size(out
));
528 mbstate_t state
= {};
529 const ExternT
* in_next
= nullptr;
530 InternT
* out_next
= nullptr;
531 codecvt_base::result res
= codecvt_base::ok
;
533 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, end(out
), out_next
);
534 assert(res
== cvt
.ok
);
535 assert(in_next
== in
+ t
.in_size
);
536 assert(out_next
== out
+ t
.out_size
);
537 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
538 if (t
.out_size
< array_size(out
))
539 assert(out
[t
.out_size
] == 0);
542 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, array_size(out
));
544 assert(static_cast<size_t>(len
) == t
.in_size
);
548 template <class InternT
, class ExternT
>
549 void utf8_to_utf16_in_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
550 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
551 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
552 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
553 static_assert(array_size(input
) == 11, "");
554 static_assert(array_size(expected
) == 6, "");
556 ExternT in
[array_size(input
)];
557 InternT exp
[array_size(expected
)];
558 copy(begin(input
), end(input
), begin(in
));
559 copy(begin(expected
), end(expected
), begin(exp
));
560 assert(char_traits
<ExternT
>::length(in
) == 10);
561 assert(char_traits
<InternT
>::length(exp
) == 5);
563 test_offsets_partial offsets
[] = {
564 {1, 0, 0, 0}, // no space for first CP
566 {3, 1, 1, 1}, // no space for second CP
567 {2, 2, 1, 1}, // incomplete second CP
568 {2, 1, 1, 1}, // incomplete second CP, and no space for it
570 {6, 2, 3, 2}, // no space for third CP
571 {4, 3, 3, 2}, // incomplete third CP
572 {5, 3, 3, 2}, // incomplete third CP
573 {4, 2, 3, 2}, // incomplete third CP, and no space for it
574 {5, 2, 3, 2}, // incomplete third CP, and no space for it
576 {10, 3, 6, 3}, // no space for fourth CP
577 {10, 4, 6, 3}, // no space for fourth CP
578 {7, 5, 6, 3}, // incomplete fourth CP
579 {8, 5, 6, 3}, // incomplete fourth CP
580 {9, 5, 6, 3}, // incomplete fourth CP
581 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
582 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
583 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
584 {7, 4, 6, 3}, // incomplete fourth CP, and no space for it
585 {8, 4, 6, 3}, // incomplete fourth CP, and no space for it
586 {9, 4, 6, 3}, // incomplete fourth CP, and no space for it
590 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
591 test_offsets_partial t
= *it
;
592 InternT out
[array_size(exp
) - 1] = {};
593 assert(t
.in_size
<= array_size(in
));
594 assert(t
.out_size
<= array_size(out
));
595 assert(t
.expected_in_next
<= t
.in_size
);
596 assert(t
.expected_out_next
<= t
.out_size
);
597 mbstate_t state
= {};
598 const ExternT
* in_next
= nullptr;
599 InternT
* out_next
= nullptr;
600 codecvt_base::result res
= codecvt_base::ok
;
602 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
603 assert(res
== cvt
.partial
);
604 assert(in_next
== in
+ t
.expected_in_next
);
605 assert(out_next
== out
+ t
.expected_out_next
);
606 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
607 if (t
.expected_out_next
< array_size(out
))
608 assert(out
[t
.expected_out_next
] == 0);
611 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
613 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
617 template <class InternT
, class ExternT
>
618 void utf8_to_utf16_in_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
619 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
620 const unsigned char input
[] = "b\u0448\uD700\U0010AAAA";
621 const char16_t expected
[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
622 static_assert(array_size(input
) == 11, "");
623 static_assert(array_size(expected
) == 6, "");
625 ExternT in
[array_size(input
)];
626 InternT exp
[array_size(expected
)];
627 copy(begin(input
), end(input
), begin(in
));
628 copy(begin(expected
), end(expected
), begin(exp
));
629 assert(char_traits
<ExternT
>::length(in
) == 10);
630 assert(char_traits
<InternT
>::length(exp
) == 5);
632 // There are 5 classes of errors in UTF-8 decoding
633 // 1. Missing leading byte
634 // 2. Missing trailing byte
636 // 4. Overlong sequence
637 // 5. CP out of Unicode range
638 test_offsets_error
<unsigned char> offsets
[] = {
640 // 1. Missing leading byte. We will replace the leading byte with
641 // non-leading byte, such as a byte that is always invalid or a trailing
644 // replace leading byte with invalid byte
645 {1, 5, 0, 0, 0xFF, 0},
646 {3, 5, 1, 1, 0xFF, 1},
647 {6, 5, 3, 2, 0xFF, 3},
648 {10, 5, 6, 3, 0xFF, 6},
650 // replace leading byte with trailing byte
651 {1, 5, 0, 0, 0b10101010, 0},
652 {3, 5, 1, 1, 0b10101010, 1},
653 {6, 5, 3, 2, 0b10101010, 3},
654 {10, 5, 6, 3, 0b10101010, 6},
656 // 2. Missing trailing byte. We will replace the trailing byte with
657 // non-trailing byte, such as a byte that is always invalid or a leading
658 // byte (simple ASCII byte in our case).
660 // replace first trailing byte with ASCII byte
661 {3, 5, 1, 1, 'z', 2},
662 {6, 5, 3, 2, 'z', 4},
663 {10, 5, 6, 3, 'z', 7},
665 // replace first trailing byte with invalid byte
666 {3, 5, 1, 1, 0xFF, 2},
667 {6, 5, 3, 2, 0xFF, 4},
668 {10, 5, 6, 3, 0xFF, 7},
670 // replace second trailing byte with ASCII byte
671 {6, 5, 3, 2, 'z', 5},
672 {10, 5, 6, 3, 'z', 8},
674 // replace second trailing byte with invalid byte
675 {6, 5, 3, 2, 0xFF, 5},
676 {10, 5, 6, 3, 0xFF, 8},
678 // replace third trailing byte
679 {10, 5, 6, 3, 'z', 9},
680 {10, 5, 6, 3, 0xFF, 9},
682 // 2.1 The following test-cases raise doubt whether error or partial should
683 // be returned. For example, we have 4-byte sequence with valid leading
684 // byte. If we hide the last byte we need to return partial. But, if the
685 // second or third byte, which are visible to the call to codecvt, are
686 // malformed then error should be returned.
688 // replace first trailing byte with ASCII byte, also incomplete at end
689 {5, 5, 3, 2, 'z', 4},
690 {8, 5, 6, 3, 'z', 7},
691 {9, 5, 6, 3, 'z', 7},
693 // replace first trailing byte with invalid byte, also incomplete at end
694 {5, 5, 3, 2, 0xFF, 4},
695 {8, 5, 6, 3, 0xFF, 7},
696 {9, 5, 6, 3, 0xFF, 7},
698 // replace second trailing byte with ASCII byte, also incomplete at end
699 {9, 5, 6, 3, 'z', 8},
701 // replace second trailing byte with invalid byte, also incomplete at end
702 {9, 5, 6, 3, 0xFF, 8},
704 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
706 {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
707 {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
708 {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
709 {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
711 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
712 // just the leading byte is enough to make them overlong, i.e. for the
713 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
715 {3, 5, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
716 {3, 5, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
717 {6, 5, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
718 {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
721 // turn U+10AAAA into U+14AAAA by changing its leading byte
722 {10, 5, 6, 3, 0b11110101, 6},
723 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
724 {10, 5, 6, 3, 0b10011010, 7},
726 for (test_offsets_error
<unsigned char>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
727 test_offsets_error
<unsigned char> t
= *it
;
728 InternT out
[array_size(exp
) - 1] = {};
729 assert(t
.in_size
<= array_size(in
));
730 assert(t
.out_size
<= array_size(out
));
731 assert(t
.expected_in_next
<= t
.in_size
);
732 assert(t
.expected_out_next
<= t
.out_size
);
733 ExternT old_char
= in
[t
.replace_pos
];
734 in
[t
.replace_pos
] = t
.replace_char
;
736 mbstate_t state
= {};
737 const ExternT
* in_next
= nullptr;
738 InternT
* out_next
= nullptr;
739 codecvt_base::result res
= codecvt_base::ok
;
741 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
742 assert(res
== cvt
.error
);
743 assert(in_next
== in
+ t
.expected_in_next
);
744 assert(out_next
== out
+ t
.expected_out_next
);
745 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
746 if (t
.expected_out_next
< array_size(out
))
747 assert(out
[t
.expected_out_next
] == 0);
750 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
752 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
754 in
[t
.replace_pos
] = old_char
;
758 template <class InternT
, class ExternT
>
759 void utf8_to_utf16_in(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
760 utf8_to_utf16_in_ok(cvt
);
761 utf8_to_utf16_in_partial(cvt
);
762 utf8_to_utf16_in_error(cvt
);
765 template <class InternT
, class ExternT
>
766 void utf16_to_utf8_out_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
767 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
768 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
769 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
770 static_assert(array_size(input
) == 6, "");
771 static_assert(array_size(expected
) == 11, "");
773 InternT in
[array_size(input
)];
774 ExternT exp
[array_size(expected
)];
775 copy(begin(input
), end(input
), begin(in
));
776 copy(begin(expected
), end(expected
), begin(exp
));
777 assert(char_traits
<InternT
>::length(in
) == 5);
778 assert(char_traits
<ExternT
>::length(exp
) == 10);
780 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}};
781 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
782 test_offsets_ok t
= *it
;
783 ExternT out
[array_size(exp
) - 1] = {};
784 assert(t
.in_size
<= array_size(in
));
785 assert(t
.out_size
<= array_size(out
));
786 mbstate_t state
= {};
787 const InternT
* in_next
= nullptr;
788 ExternT
* out_next
= nullptr;
789 codecvt_base::result res
= codecvt_base::ok
;
791 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
792 assert(res
== cvt
.ok
);
793 assert(in_next
== in
+ t
.in_size
);
794 assert(out_next
== out
+ t
.out_size
);
795 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.out_size
) == 0);
796 if (t
.out_size
< array_size(out
))
797 assert(out
[t
.out_size
] == 0);
801 template <class InternT
, class ExternT
>
802 void utf16_to_utf8_out_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
803 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
804 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
805 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
806 static_assert(array_size(input
) == 6, "");
807 static_assert(array_size(expected
) == 11, "");
809 InternT in
[array_size(input
)];
810 ExternT exp
[array_size(expected
)];
811 copy(begin(input
), end(input
), begin(in
));
812 copy(begin(expected
), end(expected
), begin(exp
));
813 assert(char_traits
<InternT
>::length(in
) == 5);
814 assert(char_traits
<ExternT
>::length(exp
) == 10);
816 test_offsets_partial offsets
[] = {
817 {1, 0, 0, 0}, // no space for first CP
819 {2, 1, 1, 1}, // no space for second CP
820 {2, 2, 1, 1}, // no space for second CP
822 {3, 3, 2, 3}, // no space for third CP
823 {3, 4, 2, 3}, // no space for third CP
824 {3, 5, 2, 3}, // no space for third CP
826 {5, 6, 3, 6}, // no space for fourth CP
827 {5, 7, 3, 6}, // no space for fourth CP
828 {5, 8, 3, 6}, // no space for fourth CP
829 {5, 9, 3, 6}, // no space for fourth CP
831 {4, 10, 3, 6}, // incomplete fourth CP
833 {4, 6, 3, 6}, // incomplete fourth CP, and no space for it
834 {4, 7, 3, 6}, // incomplete fourth CP, and no space for it
835 {4, 8, 3, 6}, // incomplete fourth CP, and no space for it
836 {4, 9, 3, 6}, // incomplete fourth CP, and no space for it
838 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
839 test_offsets_partial t
= *it
;
840 ExternT out
[array_size(exp
) - 1] = {};
841 assert(t
.in_size
<= array_size(in
));
842 assert(t
.out_size
<= array_size(out
));
843 assert(t
.expected_in_next
<= t
.in_size
);
844 assert(t
.expected_out_next
<= t
.out_size
);
845 mbstate_t state
= {};
846 const InternT
* in_next
= nullptr;
847 ExternT
* out_next
= nullptr;
848 codecvt_base::result res
= codecvt_base::ok
;
850 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
851 assert(res
== cvt
.partial
);
852 assert(in_next
== in
+ t
.expected_in_next
);
853 assert(out_next
== out
+ t
.expected_out_next
);
854 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
855 if (t
.expected_out_next
< array_size(out
))
856 assert(out
[t
.expected_out_next
] == 0);
860 template <class InternT
, class ExternT
>
861 void utf16_to_utf8_out_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
862 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
863 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
864 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
865 static_assert(array_size(input
) == 6, "");
866 static_assert(array_size(expected
) == 11, "");
868 InternT in
[array_size(input
)];
869 ExternT exp
[array_size(expected
)];
870 copy(begin(input
), end(input
), begin(in
));
871 copy(begin(expected
), end(expected
), begin(exp
));
872 assert(char_traits
<InternT
>::length(in
) == 5);
873 assert(char_traits
<ExternT
>::length(exp
) == 10);
875 // The only possible error in UTF-16 is unpaired surrogate code units.
876 // So we replace valid code points (scalar values) with lone surrogate CU.
877 test_offsets_error
<InternT
> offsets
[] = {
878 {5, 10, 0, 0, 0xD800, 0},
879 {5, 10, 0, 0, 0xDBFF, 0},
880 {5, 10, 0, 0, 0xDC00, 0},
881 {5, 10, 0, 0, 0xDFFF, 0},
883 {5, 10, 1, 1, 0xD800, 1},
884 {5, 10, 1, 1, 0xDBFF, 1},
885 {5, 10, 1, 1, 0xDC00, 1},
886 {5, 10, 1, 1, 0xDFFF, 1},
888 {5, 10, 2, 3, 0xD800, 2},
889 {5, 10, 2, 3, 0xDBFF, 2},
890 {5, 10, 2, 3, 0xDC00, 2},
891 {5, 10, 2, 3, 0xDFFF, 2},
893 // make the leading surrogate a trailing one
894 {5, 10, 3, 6, 0xDC00, 3},
895 {5, 10, 3, 6, 0xDFFF, 3},
897 // make the trailing surrogate a leading one
898 {5, 10, 3, 6, 0xD800, 4},
899 {5, 10, 3, 6, 0xDBFF, 4},
901 // make the trailing surrogate a BMP char
902 {5, 10, 3, 6, 'z', 4},
905 for (test_offsets_error
<InternT
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
906 test_offsets_error
<InternT
> t
= *it
;
907 ExternT out
[array_size(exp
) - 1] = {};
908 assert(t
.in_size
<= array_size(in
));
909 assert(t
.out_size
<= array_size(out
));
910 assert(t
.expected_in_next
<= t
.in_size
);
911 assert(t
.expected_out_next
<= t
.out_size
);
912 InternT old_char
= in
[t
.replace_pos
];
913 in
[t
.replace_pos
] = t
.replace_char
;
915 mbstate_t state
= {};
916 const InternT
* in_next
= nullptr;
917 ExternT
* out_next
= nullptr;
918 codecvt_base::result res
= codecvt_base::ok
;
920 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
921 assert(res
== cvt
.error
);
922 assert(in_next
== in
+ t
.expected_in_next
);
923 assert(out_next
== out
+ t
.expected_out_next
);
924 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
925 if (t
.expected_out_next
< array_size(out
))
926 assert(out
[t
.expected_out_next
] == 0);
928 in
[t
.replace_pos
] = old_char
;
// Runs every UTF-16 -> UTF-8 out() scenario against `cvt`: conversions that
// succeed, conversions that stop with `partial`, and conversions that stop
// with `error`.
template <class InternT, class ExternT>
void utf16_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf16_to_utf8_out_ok(cvt);
  utf16_to_utf8_out_partial(cvt);
  utf16_to_utf8_out_error(cvt);
}
// Entry point for a UTF-8 <-> UTF-16 facet: exercises the in() direction
// (UTF-8 to UTF-16) first and the out() direction (UTF-16 to UTF-8) second.
template <class InternT, class ExternT>
void test_utf8_utf16_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in(cvt);
  utf16_to_utf8_out(cvt);
}
945 template <class InternT
, class ExternT
>
946 void utf8_to_ucs2_in_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
947 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
948 const unsigned char input
[] = "b\u0448\uAAAA";
949 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
950 static_assert(array_size(input
) == 7, "");
951 static_assert(array_size(expected
) == 4, "");
953 ExternT in
[array_size(input
)];
954 InternT exp
[array_size(expected
)];
955 copy(begin(input
), end(input
), begin(in
));
956 copy(begin(expected
), end(expected
), begin(exp
));
957 assert(char_traits
<ExternT
>::length(in
) == 6);
958 assert(char_traits
<InternT
>::length(exp
) == 3);
960 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}};
961 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
962 test_offsets_ok t
= *it
;
963 InternT out
[array_size(exp
) - 1] = {};
964 assert(t
.in_size
<= array_size(in
));
965 assert(t
.out_size
<= array_size(out
));
966 mbstate_t state
= {};
967 const ExternT
* in_next
= nullptr;
968 InternT
* out_next
= nullptr;
969 codecvt_base::result res
= codecvt_base::ok
;
971 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
972 assert(res
== cvt
.ok
);
973 assert(in_next
== in
+ t
.in_size
);
974 assert(out_next
== out
+ t
.out_size
);
975 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
976 if (t
.out_size
< array_size(out
))
977 assert(out
[t
.out_size
] == 0);
980 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
982 assert(static_cast<size_t>(len
) == t
.in_size
);
985 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
986 test_offsets_ok t
= *it
;
987 InternT out
[array_size(exp
)] = {};
988 assert(t
.in_size
<= array_size(in
));
989 assert(t
.out_size
<= array_size(out
));
990 mbstate_t state
= {};
991 const ExternT
* in_next
= nullptr;
992 InternT
* out_next
= nullptr;
993 codecvt_base::result res
= codecvt_base::ok
;
995 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, end(out
), out_next
);
996 assert(res
== cvt
.ok
);
997 assert(in_next
== in
+ t
.in_size
);
998 assert(out_next
== out
+ t
.out_size
);
999 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
1000 if (t
.out_size
< array_size(out
))
1001 assert(out
[t
.out_size
] == 0);
1003 state
= mbstate_t();
1004 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, array_size(out
));
1006 assert(static_cast<size_t>(len
) == t
.in_size
);
1010 template <class InternT
, class ExternT
>
1011 void utf8_to_ucs2_in_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
1012 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1013 const unsigned char input
[] = "b\u0448\uAAAA";
1014 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
1015 static_assert(array_size(input
) == 7, "");
1016 static_assert(array_size(expected
) == 4, "");
1018 ExternT in
[array_size(input
)];
1019 InternT exp
[array_size(expected
)];
1020 copy(begin(input
), end(input
), begin(in
));
1021 copy(begin(expected
), end(expected
), begin(exp
));
1022 assert(char_traits
<ExternT
>::length(in
) == 6);
1023 assert(char_traits
<InternT
>::length(exp
) == 3);
1025 test_offsets_partial offsets
[] = {
1026 {1, 0, 0, 0}, // no space for first CP
1028 {3, 1, 1, 1}, // no space for second CP
1029 {2, 2, 1, 1}, // incomplete second CP
1030 {2, 1, 1, 1}, // incomplete second CP, and no space for it
1032 {6, 2, 3, 2}, // no space for third CP
1033 {4, 3, 3, 2}, // incomplete third CP
1034 {5, 3, 3, 2}, // incomplete third CP
1035 {4, 2, 3, 2}, // incomplete third CP, and no space for it
1036 {5, 2, 3, 2}, // incomplete third CP, and no space for it
1039 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1040 test_offsets_partial t
= *it
;
1041 InternT out
[array_size(exp
) - 1] = {};
1042 assert(t
.in_size
<= array_size(in
));
1043 assert(t
.out_size
<= array_size(out
));
1044 assert(t
.expected_in_next
<= t
.in_size
);
1045 assert(t
.expected_out_next
<= t
.out_size
);
1046 mbstate_t state
= {};
1047 const ExternT
* in_next
= nullptr;
1048 InternT
* out_next
= nullptr;
1049 codecvt_base::result res
= codecvt_base::ok
;
1051 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1052 assert(res
== cvt
.partial
);
1053 assert(in_next
== in
+ t
.expected_in_next
);
1054 assert(out_next
== out
+ t
.expected_out_next
);
1055 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1056 if (t
.expected_out_next
< array_size(out
))
1057 assert(out
[t
.expected_out_next
] == 0);
1059 state
= mbstate_t();
1060 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1062 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1066 template <class InternT
, class ExternT
>
1067 void utf8_to_ucs2_in_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
1068 const unsigned char input
[] = "b\u0448\uD700\U0010AAAA";
1069 const char16_t expected
[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
1070 static_assert(array_size(input
) == 11, "");
1071 static_assert(array_size(expected
) == 6, "");
1073 ExternT in
[array_size(input
)];
1074 InternT exp
[array_size(expected
)];
1075 copy(begin(input
), end(input
), begin(in
));
1076 copy(begin(expected
), end(expected
), begin(exp
));
1077 assert(char_traits
<ExternT
>::length(in
) == 10);
1078 assert(char_traits
<InternT
>::length(exp
) == 5);
1080 // There are 5 classes of errors in UTF-8 decoding
1081 // 1. Missing leading byte
1082 // 2. Missing trailing byte
1084 // 4. Overlong sequence
1085 // 5. CP out of Unicode range
1086 test_offsets_error
<unsigned char> offsets
[] = {
1088 // 1. Missing leading byte. We will replace the leading byte with
1089 // non-leading byte, such as a byte that is always invalid or a trailing
1092 // replace leading byte with invalid byte
1093 {1, 5, 0, 0, 0xFF, 0},
1094 {3, 5, 1, 1, 0xFF, 1},
1095 {6, 5, 3, 2, 0xFF, 3},
1096 {10, 5, 6, 3, 0xFF, 6},
1098 // replace leading byte with trailing byte
1099 {1, 5, 0, 0, 0b10101010, 0},
1100 {3, 5, 1, 1, 0b10101010, 1},
1101 {6, 5, 3, 2, 0b10101010, 3},
1102 {10, 5, 6, 3, 0b10101010, 6},
1104 // 2. Missing trailing byte. We will replace the trailing byte with
1105 // non-trailing byte, such as a byte that is always invalid or a leading
1106 // byte (simple ASCII byte in our case).
1108 // replace first trailing byte with ASCII byte
1109 {3, 5, 1, 1, 'z', 2},
1110 {6, 5, 3, 2, 'z', 4},
1111 {10, 5, 6, 3, 'z', 7},
1113 // replace first trailing byte with invalid byte
1114 {3, 5, 1, 1, 0xFF, 2},
1115 {6, 5, 3, 2, 0xFF, 4},
1116 {10, 5, 6, 3, 0xFF, 7},
1118 // replace second trailing byte with ASCII byte
1119 {6, 5, 3, 2, 'z', 5},
1120 {10, 5, 6, 3, 'z', 8},
1122 // replace second trailing byte with invalid byte
1123 {6, 5, 3, 2, 0xFF, 5},
1124 {10, 5, 6, 3, 0xFF, 8},
1126 // replace third trailing byte
1127 {10, 5, 6, 3, 'z', 9},
1128 {10, 5, 6, 3, 0xFF, 9},
1130 // 2.1 The following test-cases raise doubt whether error or partial should
1131 // be returned. For example, we have 4-byte sequence with valid leading
1132 // byte. If we hide the last byte we need to return partial. But, if the
1133 // second or third byte, which are visible to the call to codecvt, are
1134 // malformed then error should be returned.
1136 // replace first trailing byte with ASCII byte, also incomplete at end
1137 {5, 5, 3, 2, 'z', 4},
1138 {8, 5, 6, 3, 'z', 7},
1139 {9, 5, 6, 3, 'z', 7},
1141 // replace first trailing byte with invalid byte, also incomplete at end
1142 {5, 5, 3, 2, 0xFF, 4},
1143 {8, 5, 6, 3, 0xFF, 7},
1144 {9, 5, 6, 3, 0xFF, 7},
1146 // replace second trailing byte with ASCII byte, also incomplete at end
1147 {9, 5, 6, 3, 'z', 8},
1149 // replace second trailing byte with invalid byte, also incomplete at end
1150 {9, 5, 6, 3, 0xFF, 8},
1152 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
1154 {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
1155 {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
1156 {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
1157 {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
1159 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
1160 // just the leading byte is enough to make them overlong, i.e. for the
1161 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
1163 {3, 5, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
1164 {3, 5, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
1165 {6, 5, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
1166 {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
1168 // 5. CP above range
1169 // turn U+10AAAA into U+14AAAA by changing its leading byte
1170 {10, 5, 6, 3, 0b11110101, 6},
1171 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
1172 {10, 5, 6, 3, 0b10011010, 7},
1173 // Don't replace anything, show full 4-byte CP U+10AAAA
1174 {10, 4, 6, 3, 'b', 0},
1175 {10, 5, 6, 3, 'b', 0},
1176 // Don't replace anything, show incomplete 4-byte CP at the end. It's still
1177 // out of UCS2 range just by seeing the first byte.
1178 {7, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1179 {8, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1180 {9, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1181 {7, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1182 {8, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1183 {9, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1185 for (test_offsets_error
<unsigned char>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1186 test_offsets_error
<unsigned char> t
= *it
;
1187 InternT out
[array_size(exp
) - 1] = {};
1188 assert(t
.in_size
<= array_size(in
));
1189 assert(t
.out_size
<= array_size(out
));
1190 assert(t
.expected_in_next
<= t
.in_size
);
1191 assert(t
.expected_out_next
<= t
.out_size
);
1192 ExternT old_char
= in
[t
.replace_pos
];
1193 in
[t
.replace_pos
] = t
.replace_char
;
1195 mbstate_t state
= {};
1196 const ExternT
* in_next
= nullptr;
1197 InternT
* out_next
= nullptr;
1198 codecvt_base::result res
= codecvt_base::ok
;
1200 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1201 assert(res
== cvt
.error
);
1202 assert(in_next
== in
+ t
.expected_in_next
);
1203 assert(out_next
== out
+ t
.expected_out_next
);
1204 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1205 if (t
.expected_out_next
< array_size(out
))
1206 assert(out
[t
.expected_out_next
] == 0);
1208 state
= mbstate_t();
1209 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1211 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1213 in
[t
.replace_pos
] = old_char
;
// Runs every UTF-8 -> UCS-2 in() scenario against `cvt`: conversions that
// succeed, conversions that stop with `partial`, and conversions that stop
// with `error`.
template <class InternT, class ExternT>
void utf8_to_ucs2_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in_ok(cvt);
  utf8_to_ucs2_in_partial(cvt);
  utf8_to_ucs2_in_error(cvt);
}
1224 template <class InternT
, class ExternT
>
1225 void ucs2_to_utf8_out_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
1226 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1227 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1228 const unsigned char expected
[] = "b\u0448\uAAAA";
1229 static_assert(array_size(input
) == 4, "");
1230 static_assert(array_size(expected
) == 7, "");
1232 InternT in
[array_size(input
)];
1233 ExternT exp
[array_size(expected
)];
1234 copy(begin(input
), end(input
), begin(in
));
1235 copy(begin(expected
), end(expected
), begin(exp
));
1236 assert(char_traits
<InternT
>::length(in
) == 3);
1237 assert(char_traits
<ExternT
>::length(exp
) == 6);
1239 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}};
1240 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1241 test_offsets_ok t
= *it
;
1242 ExternT out
[array_size(exp
) - 1] = {};
1243 assert(t
.in_size
<= array_size(in
));
1244 assert(t
.out_size
<= array_size(out
));
1245 mbstate_t state
= {};
1246 const InternT
* in_next
= nullptr;
1247 ExternT
* out_next
= nullptr;
1248 codecvt_base::result res
= codecvt_base::ok
;
1250 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1251 assert(res
== cvt
.ok
);
1252 assert(in_next
== in
+ t
.in_size
);
1253 assert(out_next
== out
+ t
.out_size
);
1254 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.out_size
) == 0);
1255 if (t
.out_size
< array_size(out
))
1256 assert(out
[t
.out_size
] == 0);
1260 template <class InternT
, class ExternT
>
1261 void ucs2_to_utf8_out_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
1262 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1263 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1264 const unsigned char expected
[] = "b\u0448\uAAAA";
1265 static_assert(array_size(input
) == 4, "");
1266 static_assert(array_size(expected
) == 7, "");
1268 InternT in
[array_size(input
)];
1269 ExternT exp
[array_size(expected
)];
1270 copy(begin(input
), end(input
), begin(in
));
1271 copy(begin(expected
), end(expected
), begin(exp
));
1272 assert(char_traits
<InternT
>::length(in
) == 3);
1273 assert(char_traits
<ExternT
>::length(exp
) == 6);
1275 test_offsets_partial offsets
[] = {
1276 {1, 0, 0, 0}, // no space for first CP
1278 {2, 1, 1, 1}, // no space for second CP
1279 {2, 2, 1, 1}, // no space for second CP
1281 {3, 3, 2, 3}, // no space for third CP
1282 {3, 4, 2, 3}, // no space for third CP
1283 {3, 5, 2, 3}, // no space for third CP
1285 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1286 test_offsets_partial t
= *it
;
1287 ExternT out
[array_size(exp
) - 1] = {};
1288 assert(t
.in_size
<= array_size(in
));
1289 assert(t
.out_size
<= array_size(out
));
1290 assert(t
.expected_in_next
<= t
.in_size
);
1291 assert(t
.expected_out_next
<= t
.out_size
);
1292 mbstate_t state
= {};
1293 const InternT
* in_next
= nullptr;
1294 ExternT
* out_next
= nullptr;
1295 codecvt_base::result res
= codecvt_base::ok
;
1297 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1298 assert(res
== cvt
.partial
);
1299 assert(in_next
== in
+ t
.expected_in_next
);
1300 assert(out_next
== out
+ t
.expected_out_next
);
1301 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1302 if (t
.expected_out_next
< array_size(out
))
1303 assert(out
[t
.expected_out_next
] == 0);
1307 template <class InternT
, class ExternT
>
1308 void ucs2_to_utf8_out_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
1309 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1310 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
1311 static_assert(array_size(input
) == 6, "");
1312 static_assert(array_size(expected
) == 11, "");
1314 InternT in
[array_size(input
)];
1315 ExternT exp
[array_size(expected
)];
1316 copy(begin(input
), end(input
), begin(in
));
1317 copy(begin(expected
), end(expected
), begin(exp
));
1318 assert(char_traits
<InternT
>::length(in
) == 5);
1319 assert(char_traits
<ExternT
>::length(exp
) == 10);
1321 test_offsets_error
<InternT
> offsets
[] = {
1322 {3, 6, 0, 0, 0xD800, 0},
1323 {3, 6, 0, 0, 0xDBFF, 0},
1324 {3, 6, 0, 0, 0xDC00, 0},
1325 {3, 6, 0, 0, 0xDFFF, 0},
1327 {3, 6, 1, 1, 0xD800, 1},
1328 {3, 6, 1, 1, 0xDBFF, 1},
1329 {3, 6, 1, 1, 0xDC00, 1},
1330 {3, 6, 1, 1, 0xDFFF, 1},
1332 {3, 6, 2, 3, 0xD800, 2},
1333 {3, 6, 2, 3, 0xDBFF, 2},
1334 {3, 6, 2, 3, 0xDC00, 2},
1335 {3, 6, 2, 3, 0xDFFF, 2},
1337 // make the leading surrogate a trailing one
1338 {5, 10, 3, 6, 0xDC00, 3},
1339 {5, 10, 3, 6, 0xDFFF, 3},
1341 // make the trailing surrogate a leading one
1342 {5, 10, 3, 6, 0xD800, 4},
1343 {5, 10, 3, 6, 0xDBFF, 4},
1345 // make the trailing surrogate a BMP char
1346 {5, 10, 3, 6, 'z', 4},
1348 // don't replace anything in the test cases bellow, just show the surrogate
1349 // pair (fourth CP) fully or partially
1350 {5, 10, 3, 6, 'b', 0},
1351 {5, 7, 3, 6, 'b', 0}, // no space for fourth CP
1352 {5, 8, 3, 6, 'b', 0}, // no space for fourth CP
1353 {5, 9, 3, 6, 'b', 0}, // no space for fourth CP
1355 {4, 10, 3, 6, 'b', 0}, // incomplete fourth CP
1356 {4, 7, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1357 {4, 8, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1358 {4, 9, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1361 for (test_offsets_error
<InternT
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1362 test_offsets_error
<InternT
> t
= *it
;
1363 ExternT out
[array_size(exp
) - 1] = {};
1364 assert(t
.in_size
<= array_size(in
));
1365 assert(t
.out_size
<= array_size(out
));
1366 assert(t
.expected_in_next
<= t
.in_size
);
1367 assert(t
.expected_out_next
<= t
.out_size
);
1368 InternT old_char
= in
[t
.replace_pos
];
1369 in
[t
.replace_pos
] = t
.replace_char
;
1371 mbstate_t state
= {};
1372 const InternT
* in_next
= nullptr;
1373 ExternT
* out_next
= nullptr;
1374 codecvt_base::result res
= codecvt_base::ok
;
1376 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1377 assert(res
== cvt
.error
);
1378 assert(in_next
== in
+ t
.expected_in_next
);
1379 assert(out_next
== out
+ t
.expected_out_next
);
1380 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1381 if (t
.expected_out_next
< array_size(out
))
1382 assert(out
[t
.expected_out_next
] == 0);
1384 in
[t
.replace_pos
] = old_char
;
// Runs every UCS-2 -> UTF-8 out() scenario against `cvt`: conversions that
// succeed, conversions that stop with `partial`, and conversions that stop
// with `error`.
template <class InternT, class ExternT>
void ucs2_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  ucs2_to_utf8_out_ok(cvt);
  ucs2_to_utf8_out_partial(cvt);
  ucs2_to_utf8_out_error(cvt);
}
// Entry point for a UTF-8 <-> UCS-2 facet: exercises the in() direction
// (UTF-8 to UCS-2) first and the out() direction (UCS-2 to UTF-8) second.
template <class InternT, class ExternT>
void test_utf8_ucs2_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in(cvt);
  ucs2_to_utf8_out(cvt);
}
// Byte order used when serialising UTF-16 code units into a byte buffer.
enum utf16_endianess { utf16_big_endian, utf16_little_endian };

// Serialises the 16-bit code units in [f, l) to bytes through `o`, writing two
// bytes per code unit in the byte order selected by `e`:
//   - utf16_big_endian:    high byte first, then low byte
//   - any other value:     low byte first, then high byte
// Returns the output iterator advanced past the last byte written, so the
// caller needs room for 2 * (l - f) bytes.
template <class Iter1, class Iter2>
Iter2 utf16_to_bytes(Iter1 f, Iter1 l, Iter2 o, utf16_endianess e) {
  for (; f != l; ++f) {
    if (e == utf16_big_endian) {
      *o++ = (*f >> 8) & 0xFF;
      *o++ = *f & 0xFF;
    } else {
      *o++ = *f & 0xFF;
      *o++ = (*f >> 8) & 0xFF;
    }
  }
  return o;
}
1418 template <class InternT
>
1419 void utf16_to_utf32_in_ok(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1420 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1421 const char32_t expected
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1422 static_assert(array_size(input
) == 6, "");
1423 static_assert(array_size(expected
) == 5, "");
1425 char in
[array_size(input
) * 2];
1426 InternT exp
[array_size(expected
)];
1427 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1428 copy(begin(expected
), end(expected
), begin(exp
));
1430 test_offsets_ok offsets
[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}, {10, 4}};
1431 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1432 test_offsets_ok t
= *it
;
1433 InternT out
[array_size(exp
) - 1] = {};
1434 assert(t
.in_size
<= array_size(in
));
1435 assert(t
.out_size
<= array_size(out
));
1436 mbstate_t state
= {};
1437 const char* in_next
= nullptr;
1438 InternT
* out_next
= nullptr;
1439 codecvt_base::result res
= codecvt_base::ok
;
1441 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1442 assert(res
== cvt
.ok
);
1443 assert(in_next
== in
+ t
.in_size
);
1444 assert(out_next
== out
+ t
.out_size
);
1445 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
1446 if (t
.out_size
< array_size(out
))
1447 assert(out
[t
.out_size
] == 0);
1449 state
= mbstate_t();
1450 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1452 assert(static_cast<size_t>(len
) == t
.in_size
);
1455 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1456 test_offsets_ok t
= *it
;
1457 InternT out
[array_size(exp
)] = {};
1458 assert(t
.in_size
<= array_size(in
));
1459 assert(t
.out_size
<= array_size(out
));
1460 mbstate_t state
= {};
1461 const char* in_next
= nullptr;
1462 InternT
* out_next
= nullptr;
1463 codecvt_base::result res
= codecvt_base::ok
;
1465 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, end(out
), out_next
);
1466 assert(res
== cvt
.ok
);
1467 assert(in_next
== in
+ t
.in_size
);
1468 assert(out_next
== out
+ t
.out_size
);
1469 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
1470 if (t
.out_size
< array_size(out
))
1471 assert(out
[t
.out_size
] == 0);
1473 state
= mbstate_t();
1474 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, array_size(out
));
1476 assert(static_cast<size_t>(len
) == t
.in_size
);
1480 template <class InternT
>
1481 void utf16_to_utf32_in_partial(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1482 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1483 const char32_t expected
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1484 static_assert(array_size(input
) == 6, "");
1485 static_assert(array_size(expected
) == 5, "");
1487 char in
[array_size(input
) * 2];
1488 InternT exp
[array_size(expected
)];
1489 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1490 copy(begin(expected
), end(expected
), begin(exp
));
1492 test_offsets_partial offsets
[] = {
1493 {2, 0, 0, 0}, // no space for first CP
1494 {1, 1, 0, 0}, // incomplete first CP
1495 {1, 0, 0, 0}, // incomplete first CP, and no space for it
1497 {4, 1, 2, 1}, // no space for second CP
1498 {3, 2, 2, 1}, // incomplete second CP
1499 {3, 1, 2, 1}, // incomplete second CP, and no space for it
1501 {6, 2, 4, 2}, // no space for third CP
1502 {5, 3, 4, 2}, // incomplete third CP
1503 {5, 2, 4, 2}, // incomplete third CP, and no space for it
1505 {10, 3, 6, 3}, // no space for fourth CP
1506 {7, 4, 6, 3}, // incomplete fourth CP
1507 {8, 4, 6, 3}, // incomplete fourth CP
1508 {9, 4, 6, 3}, // incomplete fourth CP
1509 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
1510 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
1511 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
1514 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1515 test_offsets_partial t
= *it
;
1516 InternT out
[array_size(exp
) - 1] = {};
1517 assert(t
.in_size
<= array_size(in
));
1518 assert(t
.out_size
<= array_size(out
));
1519 assert(t
.expected_in_next
<= t
.in_size
);
1520 assert(t
.expected_out_next
<= t
.out_size
);
1521 mbstate_t state
= {};
1522 const char* in_next
= nullptr;
1523 InternT
* out_next
= nullptr;
1524 codecvt_base::result res
= codecvt_base::ok
;
1526 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1527 assert(res
== cvt
.partial
);
1528 assert(in_next
== in
+ t
.expected_in_next
);
1529 assert(out_next
== out
+ t
.expected_out_next
);
1530 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1531 if (t
.expected_out_next
< array_size(out
))
1532 assert(out
[t
.expected_out_next
] == 0);
1534 state
= mbstate_t();
1535 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1537 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1541 template <class InternT
>
1542 void utf16_to_utf32_in_error(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1543 char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1544 const char32_t expected
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1545 static_assert(array_size(input
) == 6, "");
1546 static_assert(array_size(expected
) == 5, "");
1548 InternT exp
[array_size(expected
)];
1549 copy(begin(expected
), end(expected
), begin(exp
));
1551 // The only possible error in UTF-16 is unpaired surrogate code units.
1552 // So we replace valid code points (scalar values) with lone surrogate CU.
1553 test_offsets_error
<char16_t
> offsets
[] = {
1554 {10, 4, 0, 0, 0xD800, 0},
1555 {10, 4, 0, 0, 0xDBFF, 0},
1556 {10, 4, 0, 0, 0xDC00, 0},
1557 {10, 4, 0, 0, 0xDFFF, 0},
1559 {10, 4, 2, 1, 0xD800, 1},
1560 {10, 4, 2, 1, 0xDBFF, 1},
1561 {10, 4, 2, 1, 0xDC00, 1},
1562 {10, 4, 2, 1, 0xDFFF, 1},
1564 {10, 4, 4, 2, 0xD800, 2},
1565 {10, 4, 4, 2, 0xDBFF, 2},
1566 {10, 4, 4, 2, 0xDC00, 2},
1567 {10, 4, 4, 2, 0xDFFF, 2},
1569 // make the leading surrogate a trailing one
1570 {10, 4, 6, 3, 0xDC00, 3},
1571 {10, 4, 6, 3, 0xDFFF, 3},
1573 // make the trailing surrogate a leading one
1574 {10, 4, 6, 3, 0xD800, 4},
1575 {10, 4, 6, 3, 0xDBFF, 4},
1577 // make the trailing surrogate a BMP char
1578 {10, 4, 6, 3, 'z', 4},
1581 for (test_offsets_error
<char16_t
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1582 test_offsets_error
<char16_t
> t
= *it
;
1583 char in
[array_size(input
) * 2];
1584 InternT out
[array_size(exp
) - 1] = {};
1585 assert(t
.in_size
<= array_size(in
));
1586 assert(t
.out_size
<= array_size(out
));
1587 assert(t
.expected_in_next
<= t
.in_size
);
1588 assert(t
.expected_out_next
<= t
.out_size
);
1589 char16_t old_char
= input
[t
.replace_pos
];
1590 input
[t
.replace_pos
] = t
.replace_char
; // replace in input, not in in
1591 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1593 mbstate_t state
= {};
1594 const char* in_next
= nullptr;
1595 InternT
* out_next
= nullptr;
1596 codecvt_base::result res
= codecvt_base::ok
;
1598 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1599 assert(res
== cvt
.error
);
1600 assert(in_next
== in
+ t
.expected_in_next
);
1601 assert(out_next
== out
+ t
.expected_out_next
);
1602 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1603 if (t
.expected_out_next
< array_size(out
))
1604 assert(out
[t
.expected_out_next
] == 0);
1606 state
= mbstate_t();
1607 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1609 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1611 input
[t
.replace_pos
] = old_char
;
1615 template <class InternT
>
1616 void utf32_to_utf16_out_ok(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1617 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1618 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1619 static_assert(array_size(input
) == 5, "");
1620 static_assert(array_size(expected
) == 6, "");
1622 InternT in
[array_size(input
)];
1623 char exp
[array_size(expected
) * 2];
1624 copy(begin(input
), end(input
), begin(in
));
1625 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
1627 test_offsets_ok offsets
[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}, {4, 10}};
1628 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1629 test_offsets_ok t
= *it
;
1630 char out
[array_size(exp
) - 2] = {};
1631 assert(t
.in_size
<= array_size(in
));
1632 assert(t
.out_size
<= array_size(out
));
1633 mbstate_t state
= {};
1634 const InternT
* in_next
= nullptr;
1635 char* out_next
= nullptr;
1636 codecvt_base::result res
= codecvt_base::ok
;
1638 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1639 assert(res
== cvt
.ok
);
1640 assert(in_next
== in
+ t
.in_size
);
1641 assert(out_next
== out
+ t
.out_size
);
1642 assert(char_traits
<char>::compare(out
, exp
, t
.out_size
) == 0);
1643 if (t
.out_size
< array_size(out
))
1644 assert(out
[t
.out_size
] == 0);
1648 template <class InternT
>
1649 void utf32_to_utf16_out_partial(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1650 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1651 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1652 static_assert(array_size(input
) == 5, "");
1653 static_assert(array_size(expected
) == 6, "");
1655 InternT in
[array_size(input
)];
1656 char exp
[array_size(expected
) * 2];
1657 copy(begin(input
), end(input
), begin(in
));
1658 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
1660 test_offsets_partial offsets
[] = {
1661 {1, 0, 0, 0}, // no space for first CP
1662 {1, 1, 0, 0}, // no space for first CP
1664 {2, 2, 1, 2}, // no space for second CP
1665 {2, 3, 1, 2}, // no space for second CP
1667 {3, 4, 2, 4}, // no space for third CP
1668 {3, 5, 2, 4}, // no space for third CP
1670 {4, 6, 3, 6}, // no space for fourth CP
1671 {4, 7, 3, 6}, // no space for fourth CP
1672 {4, 8, 3, 6}, // no space for fourth CP
1673 {4, 9, 3, 6}, // no space for fourth CP
1675 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1676 test_offsets_partial t
= *it
;
1677 char out
[array_size(exp
) - 2] = {};
1678 assert(t
.in_size
<= array_size(in
));
1679 assert(t
.out_size
<= array_size(out
));
1680 assert(t
.expected_in_next
<= t
.in_size
);
1681 assert(t
.expected_out_next
<= t
.out_size
);
1682 mbstate_t state
= {};
1683 const InternT
* in_next
= nullptr;
1684 char* out_next
= nullptr;
1685 codecvt_base::result res
= codecvt_base::ok
;
1687 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1688 assert(res
== cvt
.partial
);
1689 assert(in_next
== in
+ t
.expected_in_next
);
1690 assert(out_next
== out
+ t
.expected_out_next
);
1691 assert(char_traits
<char>::compare(out
, exp
, t
.expected_out_next
) == 0);
1692 if (t
.expected_out_next
< array_size(out
))
1693 assert(out
[t
.expected_out_next
] == 0);
1697 template <class InternT
>
1698 void utf32_to_utf16_out_error(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1699 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1700 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1701 static_assert(array_size(input
) == 5, "");
1702 static_assert(array_size(expected
) == 6, "");
1704 InternT in
[array_size(input
)];
1705 char exp
[array_size(expected
) * 2];
1706 copy(begin(input
), end(input
), begin(in
));
1707 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
1709 test_offsets_error
<InternT
> offsets
[] = {
1712 {4, 10, 0, 0, 0xD800, 0},
1713 {4, 10, 1, 2, 0xDBFF, 1},
1714 {4, 10, 2, 4, 0xDC00, 2},
1715 {4, 10, 3, 6, 0xDFFF, 3},
1718 {4, 10, 0, 0, 0x00110000, 0},
1719 {4, 10, 1, 2, 0x00110000, 1},
1720 {4, 10, 2, 4, 0x00110000, 2},
1721 {4, 10, 3, 6, 0x00110000, 3}};
1723 for (test_offsets_error
<InternT
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1724 test_offsets_error
<InternT
> t
= *it
;
1725 char out
[array_size(exp
) - 2] = {};
1726 assert(t
.in_size
<= array_size(in
));
1727 assert(t
.out_size
<= array_size(out
));
1728 assert(t
.expected_in_next
<= t
.in_size
);
1729 assert(t
.expected_out_next
<= t
.out_size
);
1730 InternT old_char
= in
[t
.replace_pos
];
1731 in
[t
.replace_pos
] = t
.replace_char
;
1733 mbstate_t state
= {};
1734 const InternT
* in_next
= nullptr;
1735 char* out_next
= nullptr;
1736 codecvt_base::result res
= codecvt_base::ok
;
1738 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1739 assert(res
== cvt
.error
);
1740 assert(in_next
== in
+ t
.expected_in_next
);
1741 assert(out_next
== out
+ t
.expected_out_next
);
1742 assert(char_traits
<char>::compare(out
, exp
, t
.expected_out_next
) == 0);
1743 if (t
.expected_out_next
< array_size(out
))
1744 assert(out
[t
.expected_out_next
] == 0);
1746 in
[t
.replace_pos
] = old_char
;
1750 template <class InternT
>
1751 void test_utf16_utf32_cvt(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1752 utf16_to_utf32_in_ok(cvt
, endianess
);
1753 utf16_to_utf32_in_partial(cvt
, endianess
);
1754 utf16_to_utf32_in_error(cvt
, endianess
);
1755 utf32_to_utf16_out_ok(cvt
, endianess
);
1756 utf32_to_utf16_out_partial(cvt
, endianess
);
1757 utf32_to_utf16_out_error(cvt
, endianess
);
1760 template <class InternT
>
1761 void utf16_to_ucs2_in_ok(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1762 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1763 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
1764 static_assert(array_size(input
) == 4, "");
1765 static_assert(array_size(expected
) == 4, "");
1767 char in
[array_size(input
) * 2];
1768 InternT exp
[array_size(expected
)];
1769 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1770 copy(begin(expected
), end(expected
), begin(exp
));
1772 test_offsets_ok offsets
[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}};
1773 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1774 test_offsets_ok t
= *it
;
1775 InternT out
[array_size(exp
) - 1] = {};
1776 assert(t
.in_size
<= array_size(in
));
1777 assert(t
.out_size
<= array_size(out
));
1778 mbstate_t state
= {};
1779 const char* in_next
= nullptr;
1780 InternT
* out_next
= nullptr;
1781 codecvt_base::result res
= codecvt_base::ok
;
1783 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1784 assert(res
== cvt
.ok
);
1785 assert(in_next
== in
+ t
.in_size
);
1786 assert(out_next
== out
+ t
.out_size
);
1787 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
1788 if (t
.out_size
< array_size(out
))
1789 assert(out
[t
.out_size
] == 0);
1791 state
= mbstate_t();
1792 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1794 assert(static_cast<size_t>(len
) == t
.in_size
);
1797 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1798 test_offsets_ok t
= *it
;
1799 InternT out
[array_size(exp
)] = {};
1800 assert(t
.in_size
<= array_size(in
));
1801 assert(t
.out_size
<= array_size(out
));
1802 mbstate_t state
= {};
1803 const char* in_next
= nullptr;
1804 InternT
* out_next
= nullptr;
1805 codecvt_base::result res
= codecvt_base::ok
;
1807 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, end(out
), out_next
);
1808 assert(res
== cvt
.ok
);
1809 assert(in_next
== in
+ t
.in_size
);
1810 assert(out_next
== out
+ t
.out_size
);
1811 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
1812 if (t
.out_size
< array_size(out
))
1813 assert(out
[t
.out_size
] == 0);
1815 state
= mbstate_t();
1816 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, array_size(out
));
1818 assert(static_cast<size_t>(len
) == t
.in_size
);
1822 template <class InternT
>
1823 void utf16_to_ucs2_in_partial(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1824 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1825 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
1826 static_assert(array_size(input
) == 4, "");
1827 static_assert(array_size(expected
) == 4, "");
1829 char in
[array_size(input
) * 2];
1830 InternT exp
[array_size(expected
)];
1831 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1832 copy(begin(expected
), end(expected
), begin(exp
));
1834 test_offsets_partial offsets
[] = {
1835 {2, 0, 0, 0}, // no space for first CP
1836 {1, 1, 0, 0}, // incomplete first CP
1837 {1, 0, 0, 0}, // incomplete first CP, and no space for it
1839 {4, 1, 2, 1}, // no space for second CP
1840 {3, 2, 2, 1}, // incomplete second CP
1841 {3, 1, 2, 1}, // incomplete second CP, and no space for it
1843 {6, 2, 4, 2}, // no space for third CP
1844 {5, 3, 4, 2}, // incomplete third CP
1845 {5, 2, 4, 2}, // incomplete third CP, and no space for it
1848 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1849 test_offsets_partial t
= *it
;
1850 InternT out
[array_size(exp
) - 1] = {};
1851 assert(t
.in_size
<= array_size(in
));
1852 assert(t
.out_size
<= array_size(out
));
1853 assert(t
.expected_in_next
<= t
.in_size
);
1854 assert(t
.expected_out_next
<= t
.out_size
);
1855 mbstate_t state
= {};
1856 const char* in_next
= nullptr;
1857 InternT
* out_next
= nullptr;
1858 codecvt_base::result res
= codecvt_base::ok
;
1860 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1861 assert(res
== cvt
.partial
);
1862 assert(in_next
== in
+ t
.expected_in_next
);
1863 assert(out_next
== out
+ t
.expected_out_next
);
1864 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1865 if (t
.expected_out_next
< array_size(out
))
1866 assert(out
[t
.expected_out_next
] == 0);
1868 state
= mbstate_t();
1869 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1871 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1875 template <class InternT
>
1876 void utf16_to_ucs2_in_error(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1877 char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1878 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1879 static_assert(array_size(input
) == 6, "");
1880 static_assert(array_size(expected
) == 6, "");
1882 InternT exp
[array_size(expected
)];
1883 copy(begin(expected
), end(expected
), begin(exp
));
1885 // The only possible error in UTF-16 is unpaired surrogate code units.
1886 // Additionally, because the target encoding is UCS-2, a proper pair of
1887 // surrogates is also error. Simply, any surrogate CU is error.
1888 test_offsets_error
<char16_t
> offsets
[] = {
1889 {6, 3, 0, 0, 0xD800, 0},
1890 {6, 3, 0, 0, 0xDBFF, 0},
1891 {6, 3, 0, 0, 0xDC00, 0},
1892 {6, 3, 0, 0, 0xDFFF, 0},
1894 {6, 3, 2, 1, 0xD800, 1},
1895 {6, 3, 2, 1, 0xDBFF, 1},
1896 {6, 3, 2, 1, 0xDC00, 1},
1897 {6, 3, 2, 1, 0xDFFF, 1},
1899 {6, 3, 4, 2, 0xD800, 2},
1900 {6, 3, 4, 2, 0xDBFF, 2},
1901 {6, 3, 4, 2, 0xDC00, 2},
1902 {6, 3, 4, 2, 0xDFFF, 2},
1904 // make the leading surrogate a trailing one
1905 {10, 5, 6, 3, 0xDC00, 3},
1906 {10, 5, 6, 3, 0xDFFF, 3},
1908 // make the trailing surrogate a leading one
1909 {10, 5, 6, 3, 0xD800, 4},
1910 {10, 5, 6, 3, 0xDBFF, 4},
1912 // make the trailing surrogate a BMP char
1913 {10, 5, 6, 3, 'z', 4},
1915 // don't replace anything in the test cases bellow, just show the surrogate
1916 // pair (fourth CP) fully or partially (just the first surrogate)
1917 {10, 5, 6, 3, 'b', 0},
1918 {8, 5, 6, 3, 'b', 0},
1919 {9, 5, 6, 3, 'b', 0},
1921 {10, 4, 6, 3, 'b', 0},
1922 {8, 4, 6, 3, 'b', 0},
1923 {9, 4, 6, 3, 'b', 0},
1926 for (test_offsets_error
<char16_t
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1927 test_offsets_error
<char16_t
> t
= *it
;
1928 char in
[array_size(input
) * 2];
1929 InternT out
[array_size(exp
) - 1] = {};
1930 assert(t
.in_size
<= array_size(in
));
1931 assert(t
.out_size
<= array_size(out
));
1932 assert(t
.expected_in_next
<= t
.in_size
);
1933 assert(t
.expected_out_next
<= t
.out_size
);
1934 char16_t old_char
= input
[t
.replace_pos
];
1935 input
[t
.replace_pos
] = t
.replace_char
; // replace in input, not in in
1936 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1938 mbstate_t state
= {};
1939 const char* in_next
= nullptr;
1940 InternT
* out_next
= nullptr;
1941 codecvt_base::result res
= codecvt_base::ok
;
1943 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1944 assert(res
== cvt
.error
);
1945 assert(in_next
== in
+ t
.expected_in_next
);
1946 assert(out_next
== out
+ t
.expected_out_next
);
1947 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1948 if (t
.expected_out_next
< array_size(out
))
1949 assert(out
[t
.expected_out_next
] == 0);
1951 state
= mbstate_t();
1952 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1954 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1956 input
[t
.replace_pos
] = old_char
;
1960 template <class InternT
>
1961 void ucs2_to_utf16_out_ok(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1962 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1963 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
1964 static_assert(array_size(input
) == 4, "");
1965 static_assert(array_size(expected
) == 4, "");
1967 InternT in
[array_size(input
)];
1968 char exp
[array_size(expected
) * 2];
1969 copy(begin(input
), end(input
), begin(in
));
1970 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
1972 test_offsets_ok offsets
[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}};
1973 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1974 test_offsets_ok t
= *it
;
1975 char out
[array_size(exp
) - 2] = {};
1976 assert(t
.in_size
<= array_size(in
));
1977 assert(t
.out_size
<= array_size(out
));
1978 mbstate_t state
= {};
1979 const InternT
* in_next
= nullptr;
1980 char* out_next
= nullptr;
1981 codecvt_base::result res
= codecvt_base::ok
;
1983 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1984 assert(res
== cvt
.ok
);
1985 assert(in_next
== in
+ t
.in_size
);
1986 assert(out_next
== out
+ t
.out_size
);
1987 assert(char_traits
<char>::compare(out
, exp
, t
.out_size
) == 0);
1988 if (t
.out_size
< array_size(out
))
1989 assert(out
[t
.out_size
] == 0);
1993 template <class InternT
>
1994 void ucs2_to_utf16_out_partial(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1995 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1996 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
1997 static_assert(array_size(input
) == 4, "");
1998 static_assert(array_size(expected
) == 4, "");
2000 InternT in
[array_size(input
)];
2001 char exp
[array_size(expected
) * 2];
2002 copy(begin(input
), end(input
), begin(in
));
2003 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
2005 test_offsets_partial offsets
[] = {
2006 {1, 0, 0, 0}, // no space for first CP
2007 {1, 1, 0, 0}, // no space for first CP
2009 {2, 2, 1, 2}, // no space for second CP
2010 {2, 3, 1, 2}, // no space for second CP
2012 {3, 4, 2, 4}, // no space for third CP
2013 {3, 5, 2, 4}, // no space for third CP
2015 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
2016 test_offsets_partial t
= *it
;
2017 char out
[array_size(exp
) - 2] = {};
2018 assert(t
.in_size
<= array_size(in
));
2019 assert(t
.out_size
<= array_size(out
));
2020 assert(t
.expected_in_next
<= t
.in_size
);
2021 assert(t
.expected_out_next
<= t
.out_size
);
2022 mbstate_t state
= {};
2023 const InternT
* in_next
= nullptr;
2024 char* out_next
= nullptr;
2025 codecvt_base::result res
= codecvt_base::ok
;
2027 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
2028 assert(res
== cvt
.partial
);
2029 assert(in_next
== in
+ t
.expected_in_next
);
2030 assert(out_next
== out
+ t
.expected_out_next
);
2031 assert(char_traits
<char>::compare(out
, exp
, t
.expected_out_next
) == 0);
2032 if (t
.expected_out_next
< array_size(out
))
2033 assert(out
[t
.expected_out_next
] == 0);
2037 template <class InternT
>
2038 void ucs2_to_utf16_out_error(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
2039 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2040 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2041 static_assert(array_size(input
) == 6, "");
2042 static_assert(array_size(expected
) == 6, "");
2044 InternT in
[array_size(input
)];
2045 char exp
[array_size(expected
) * 2];
2046 copy(begin(input
), end(input
), begin(in
));
2047 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
2049 test_offsets_error
<InternT
> offsets
[] = {
2050 {3, 6, 0, 0, 0xD800, 0},
2051 {3, 6, 0, 0, 0xDBFF, 0},
2052 {3, 6, 0, 0, 0xDC00, 0},
2053 {3, 6, 0, 0, 0xDFFF, 0},
2055 {3, 6, 1, 2, 0xD800, 1},
2056 {3, 6, 1, 2, 0xDBFF, 1},
2057 {3, 6, 1, 2, 0xDC00, 1},
2058 {3, 6, 1, 2, 0xDFFF, 1},
2060 {3, 6, 2, 4, 0xD800, 2},
2061 {3, 6, 2, 4, 0xDBFF, 2},
2062 {3, 6, 2, 4, 0xDC00, 2},
2063 {3, 6, 2, 4, 0xDFFF, 2},
2065 // make the leading surrogate a trailing one
2066 {5, 10, 3, 6, 0xDC00, 3},
2067 {5, 10, 3, 6, 0xDFFF, 3},
2069 // make the trailing surrogate a leading one
2070 {5, 10, 3, 6, 0xD800, 4},
2071 {5, 10, 3, 6, 0xDBFF, 4},
2073 // make the trailing surrogate a BMP char
2074 {5, 10, 3, 6, 'z', 4},
2076 // don't replace anything in the test cases bellow, just show the surrogate
2077 // pair (fourth CP) fully or partially (just the first surrogate)
2078 {5, 10, 3, 6, 'b', 0},
2079 {5, 8, 3, 6, 'b', 0},
2080 {5, 9, 3, 6, 'b', 0},
2082 {4, 10, 3, 6, 'b', 0},
2083 {4, 8, 3, 6, 'b', 0},
2084 {4, 9, 3, 6, 'b', 0},
2087 for (test_offsets_error
<InternT
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
2088 test_offsets_error
<InternT
> t
= *it
;
2089 char out
[array_size(exp
) - 2] = {};
2090 assert(t
.in_size
<= array_size(in
));
2091 assert(t
.out_size
<= array_size(out
));
2092 assert(t
.expected_in_next
<= t
.in_size
);
2093 assert(t
.expected_out_next
<= t
.out_size
);
2094 InternT old_char
= in
[t
.replace_pos
];
2095 in
[t
.replace_pos
] = t
.replace_char
;
2097 mbstate_t state
= {};
2098 const InternT
* in_next
= nullptr;
2099 char* out_next
= nullptr;
2100 codecvt_base::result res
= codecvt_base::ok
;
2102 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
2103 assert(res
== cvt
.error
);
2104 assert(in_next
== in
+ t
.expected_in_next
);
2105 assert(out_next
== out
+ t
.expected_out_next
);
2106 assert(char_traits
<char>::compare(out
, exp
, t
.expected_out_next
) == 0);
2107 if (t
.expected_out_next
< array_size(out
))
2108 assert(out
[t
.expected_out_next
] == 0);
2110 in
[t
.replace_pos
] = old_char
;
2114 template <class InternT
>
2115 void test_utf16_ucs2_cvt(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
2116 utf16_to_ucs2_in_ok(cvt
, endianess
);
2117 utf16_to_ucs2_in_partial(cvt
, endianess
);
2118 utf16_to_ucs2_in_error(cvt
, endianess
);
2119 ucs2_to_utf16_out_ok(cvt
, endianess
);
2120 ucs2_to_utf16_out_partial(cvt
, endianess
);
2121 ucs2_to_utf16_out_error(cvt
, endianess
);
2125 using std::codecvt_utf16
;
2126 using std::codecvt_utf8
;
2127 using std::codecvt_utf8_utf16
;
2128 using std::has_facet
;
2130 using std::use_facet
;
2132 void test_utf8_utf32_codecvts() {
2133 typedef codecvt
<char32_t
, char, mbstate_t> codecvt_c32
;
2134 const locale
& loc_c
= locale::classic();
2135 assert(has_facet
<codecvt_c32
>(loc_c
));
2137 const codecvt_c32
& cvt
= use_facet
<codecvt_c32
>(loc_c
);
2138 test_utf8_utf32_cvt(cvt
);
2140 codecvt_utf8
<char32_t
> cvt2
;
2141 test_utf8_utf32_cvt(cvt2
);
2143 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2144 codecvt_utf8
<wchar_t> cvt3
;
2145 test_utf8_utf32_cvt(cvt3
);
2148 #ifndef TEST_HAS_NO_CHAR8_T
2149 typedef codecvt
<char32_t
, char8_t
, mbstate_t> codecvt_c32_c8
;
2150 assert(has_facet
<codecvt_c32_c8
>(loc_c
));
2151 const codecvt_c32_c8
& cvt4
= use_facet
<codecvt_c32_c8
>(loc_c
);
2152 test_utf8_utf32_cvt(cvt4
);
2156 void test_utf8_utf16_codecvts() {
2157 typedef codecvt
<char16_t
, char, mbstate_t> codecvt_c16
;
2158 const locale
& loc_c
= locale::classic();
2159 assert(has_facet
<codecvt_c16
>(loc_c
));
2161 const codecvt_c16
& cvt
= use_facet
<codecvt_c16
>(loc_c
);
2162 test_utf8_utf16_cvt(cvt
);
2164 codecvt_utf8_utf16
<char16_t
> cvt2
;
2165 test_utf8_utf16_cvt(cvt2
);
2167 codecvt_utf8_utf16
<char32_t
> cvt3
;
2168 test_utf8_utf16_cvt(cvt3
);
2170 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
2171 codecvt_utf8_utf16
<wchar_t> cvt4
;
2172 test_utf8_utf16_cvt(cvt4
);
2175 #ifndef TEST_HAS_NO_CHAR8_T
2176 typedef codecvt
<char16_t
, char8_t
, mbstate_t> codecvt_c16_c8
;
2177 assert(has_facet
<codecvt_c16_c8
>(loc_c
));
2178 const codecvt_c16_c8
& cvt5
= use_facet
<codecvt_c16_c8
>(loc_c
);
2179 test_utf8_utf16_cvt(cvt5
);
2183 void test_utf8_ucs2_codecvts() {
2184 codecvt_utf8
<char16_t
> cvt
;
2185 test_utf8_ucs2_cvt(cvt
);
2187 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2188 codecvt_utf8
<wchar_t> cvt2
;
2189 test_utf8_ucs2_cvt(cvt2
);
2193 void test_utf16_utf32_codecvts() {
2194 codecvt_utf16
<char32_t
> cvt
;
2195 test_utf16_utf32_cvt(cvt
, utf16_big_endian
);
2197 codecvt_utf16
<char32_t
, 0x10FFFF, std::little_endian
> cvt2
;
2198 test_utf16_utf32_cvt(cvt2
, utf16_little_endian
);
2200 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2201 codecvt_utf16
<wchar_t> cvt3
;
2202 test_utf16_utf32_cvt(cvt3
, utf16_big_endian
);
2204 codecvt_utf16
<wchar_t, 0x10FFFF, std::little_endian
> cvt4
;
2205 test_utf16_utf32_cvt(cvt4
, utf16_little_endian
);
2209 void test_utf16_ucs2_codecvts() {
2210 codecvt_utf16
<char16_t
> cvt
;
2211 test_utf16_ucs2_cvt(cvt
, utf16_big_endian
);
2213 codecvt_utf16
<char16_t
, 0x10FFFF, std::little_endian
> cvt2
;
2214 test_utf16_ucs2_cvt(cvt2
, utf16_little_endian
);
2216 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2217 codecvt_utf16
<wchar_t> cvt3
;
2218 test_utf16_ucs2_cvt(cvt3
, utf16_big_endian
);
2220 codecvt_utf16
<wchar_t, 0x10FFFF, std::little_endian
> cvt4
;
2221 test_utf16_ucs2_cvt(cvt4
, utf16_little_endian
);
2226 test_utf8_utf32_codecvts();
2227 test_utf8_utf16_codecvts();
2228 test_utf8_ucs2_codecvts();
2229 test_utf16_utf32_codecvts();
2230 test_utf16_ucs2_codecvts();