1 //===----------------------------------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT
10 // XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0|12.0|13.0}}
17 #include "test_macros.h"
// One test case for a conversion that is expected to succeed completely.
// Both values are element counts (not bytes) measured from the start of the
// respective buffer.
struct test_offsets_ok {
  size_t in_size;  // number of source elements made visible to the facet
  size_t out_size; // number of destination elements made available
};
// One test case for a conversion that is expected to stop early with
// `partial`. All values are element counts from the start of the buffers.
struct test_offsets_partial {
  size_t in_size;           // number of source elements made visible
  size_t out_size;          // number of destination elements made available
  size_t expected_in_next;  // where in_next must point after the call
  size_t expected_out_next; // where out_next must point after the call
};
// One test case for a conversion that is expected to fail with `error`.
// Before the call, the source element at `replace_pos` is overwritten with
// `replace_char` to make the input ill-formed. CharT is the type of that
// replacement element.
template <class CharT>
struct test_offsets_error {
  size_t in_size;           // number of source elements made visible
  size_t out_size;          // number of destination elements made available
  size_t expected_in_next;  // where in_next must point after the call
  size_t expected_out_next; // where out_next must point after the call
  CharT replace_char;       // value written into the source to corrupt it
  size_t replace_pos;       // index in the source that gets overwritten
};
// Number of elements in a built-in array. Must only be applied to real arrays
// (not pointers); the result is a constant expression usable as an array bound.
#define array_size(x) (sizeof(x) / sizeof(x)[0])
43 using std::char_traits
;
44 using std::codecvt_base
;
48 template <class InternT
, class ExternT
>
49 void utf8_to_utf32_in_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
50 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
51 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
52 const char32_t expected
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
53 static_assert(array_size(input
) == 11, "");
54 static_assert(array_size(expected
) == 5, "");
56 ExternT in
[array_size(input
)];
57 InternT exp
[array_size(expected
)];
58 copy(begin(input
), end(input
), begin(in
));
59 copy(begin(expected
), end(expected
), begin(exp
));
60 assert(char_traits
<ExternT
>::length(in
) == 10);
61 assert(char_traits
<InternT
>::length(exp
) == 4);
62 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}};
63 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
64 test_offsets_ok t
= *it
;
65 InternT out
[array_size(exp
) - 1] = {};
66 assert(t
.in_size
<= array_size(in
));
67 assert(t
.out_size
<= array_size(out
));
69 const ExternT
* in_next
= nullptr;
70 InternT
* out_next
= nullptr;
71 codecvt_base::result res
= codecvt_base::ok
;
73 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
74 assert(res
== cvt
.ok
);
75 assert(in_next
== in
+ t
.in_size
);
76 assert(out_next
== out
+ t
.out_size
);
77 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
78 if (t
.out_size
< array_size(out
))
79 assert(out
[t
.out_size
] == 0);
82 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
84 assert(static_cast<size_t>(len
) == t
.in_size
);
87 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
88 test_offsets_ok t
= *it
;
89 InternT out
[array_size(exp
)] = {};
90 assert(t
.in_size
<= array_size(in
));
91 assert(t
.out_size
<= array_size(out
));
93 const ExternT
* in_next
= nullptr;
94 InternT
* out_next
= nullptr;
95 codecvt_base::result res
= codecvt_base::ok
;
97 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, end(out
), out_next
);
98 assert(res
== cvt
.ok
);
99 assert(in_next
== in
+ t
.in_size
);
100 assert(out_next
== out
+ t
.out_size
);
101 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
102 if (t
.out_size
< array_size(out
))
103 assert(out
[t
.out_size
] == 0);
106 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, array_size(out
));
108 assert(static_cast<size_t>(len
) == t
.in_size
);
112 template <class InternT
, class ExternT
>
113 void utf8_to_utf32_in_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
114 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
115 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
116 const char32_t expected
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
117 static_assert(array_size(input
) == 11, "");
118 static_assert(array_size(expected
) == 5, "");
120 ExternT in
[array_size(input
)];
121 InternT exp
[array_size(expected
)];
122 copy(begin(input
), end(input
), begin(in
));
123 copy(begin(expected
), end(expected
), begin(exp
));
124 assert(char_traits
<ExternT
>::length(in
) == 10);
125 assert(char_traits
<InternT
>::length(exp
) == 4);
127 test_offsets_partial offsets
[] = {
128 {1, 0, 0, 0}, // no space for first CP
130 {3, 1, 1, 1}, // no space for second CP
131 {2, 2, 1, 1}, // incomplete second CP
132 {2, 1, 1, 1}, // incomplete second CP, and no space for it
134 {6, 2, 3, 2}, // no space for third CP
135 {4, 3, 3, 2}, // incomplete third CP
136 {5, 3, 3, 2}, // incomplete third CP
137 {4, 2, 3, 2}, // incomplete third CP, and no space for it
138 {5, 2, 3, 2}, // incomplete third CP, and no space for it
140 {10, 3, 6, 3}, // no space for fourth CP
141 {7, 4, 6, 3}, // incomplete fourth CP
142 {8, 4, 6, 3}, // incomplete fourth CP
143 {9, 4, 6, 3}, // incomplete fourth CP
144 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
145 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
146 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
149 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
150 test_offsets_partial t
= *it
;
151 InternT out
[array_size(exp
) - 1] = {};
152 assert(t
.in_size
<= array_size(in
));
153 assert(t
.out_size
<= array_size(out
));
154 assert(t
.expected_in_next
<= t
.in_size
);
155 assert(t
.expected_out_next
<= t
.out_size
);
156 mbstate_t state
= {};
157 const ExternT
* in_next
= nullptr;
158 InternT
* out_next
= nullptr;
159 codecvt_base::result res
= codecvt_base::ok
;
161 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
162 assert(res
== cvt
.partial
);
163 assert(in_next
== in
+ t
.expected_in_next
);
164 assert(out_next
== out
+ t
.expected_out_next
);
165 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
166 if (t
.expected_out_next
< array_size(out
))
167 assert(out
[t
.expected_out_next
] == 0);
170 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
172 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
176 template <class InternT
, class ExternT
>
177 void utf8_to_utf32_in_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
178 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
179 const unsigned char input
[] = "b\u0448\uD700\U0010AAAA";
180 const char32_t expected
[] = {'b', 0x0448, 0xD700, 0x10AAAA, 0};
181 static_assert(array_size(input
) == 11, "");
182 static_assert(array_size(expected
) == 5, "");
184 ExternT in
[array_size(input
)];
185 InternT exp
[array_size(expected
)];
186 copy(begin(input
), end(input
), begin(in
));
187 copy(begin(expected
), end(expected
), begin(exp
));
188 assert(char_traits
<ExternT
>::length(in
) == 10);
189 assert(char_traits
<InternT
>::length(exp
) == 4);
191 // There are 5 classes of errors in UTF-8 decoding
192 // 1. Missing leading byte
193 // 2. Missing trailing byte
195 // 4. Overlong sequence
196 // 5. CP out of Unicode range
197 test_offsets_error
<unsigned char> offsets
[] = {
199 // 1. Missing leading byte. We will replace the leading byte with
200 // non-leading byte, such as a byte that is always invalid or a trailing
203 // replace leading byte with invalid byte
204 {1, 4, 0, 0, 0xFF, 0},
205 {3, 4, 1, 1, 0xFF, 1},
206 {6, 4, 3, 2, 0xFF, 3},
207 {10, 4, 6, 3, 0xFF, 6},
209 // replace leading byte with trailing byte
210 {1, 4, 0, 0, 0b10101010, 0},
211 {3, 4, 1, 1, 0b10101010, 1},
212 {6, 4, 3, 2, 0b10101010, 3},
213 {10, 4, 6, 3, 0b10101010, 6},
215 // 2. Missing trailing byte. We will replace the trailing byte with
216 // non-trailing byte, such as a byte that is always invalid or a leading
217 // byte (simple ASCII byte in our case).
219 // replace first trailing byte with ASCII byte
220 {3, 4, 1, 1, 'z', 2},
221 {6, 4, 3, 2, 'z', 4},
222 {10, 4, 6, 3, 'z', 7},
224 // replace first trailing byte with invalid byte
225 {3, 4, 1, 1, 0xFF, 2},
226 {6, 4, 3, 2, 0xFF, 4},
227 {10, 4, 6, 3, 0xFF, 7},
229 // replace second trailing byte with ASCII byte
230 {6, 4, 3, 2, 'z', 5},
231 {10, 4, 6, 3, 'z', 8},
233 // replace second trailing byte with invalid byte
234 {6, 4, 3, 2, 0xFF, 5},
235 {10, 4, 6, 3, 0xFF, 8},
237 // replace third trailing byte
238 {10, 4, 6, 3, 'z', 9},
239 {10, 4, 6, 3, 0xFF, 9},
241 // 2.1 The following test-cases raise doubt whether error or partial should
242 // be returned. For example, we have 4-byte sequence with valid leading
243 // byte. If we hide the last byte we need to return partial. But, if the
244 // second or third byte, which are visible to the call to codecvt, are
245 // malformed then error should be returned.
247 // replace first trailing byte with ASCII byte, also incomplete at end
248 {5, 4, 3, 2, 'z', 4},
249 {8, 4, 6, 3, 'z', 7},
250 {9, 4, 6, 3, 'z', 7},
252 // replace first trailing byte with invalid byte, also incomplete at end
253 {5, 4, 3, 2, 0xFF, 4},
254 {8, 4, 6, 3, 0xFF, 7},
255 {9, 4, 6, 3, 0xFF, 7},
257 // replace second trailing byte with ASCII byte, also incomplete at end
258 {9, 4, 6, 3, 'z', 8},
260 // replace second trailing byte with invalid byte, also incomplete at end
261 {9, 4, 6, 3, 0xFF, 8},
263 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
265 {6, 4, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
266 {6, 4, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
267 {6, 4, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
268 {6, 4, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
270 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
271 // just the leading byte is enough to make them overlong, i.e. for the
272 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
274 {3, 4, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
275 {3, 4, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
276 {6, 4, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
277 {10, 4, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
280 // turn U+10AAAA into U+14AAAA by changing its leading byte
281 {10, 4, 6, 3, 0b11110101, 6},
282 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
283 {10, 4, 6, 3, 0b10011010, 7},
285 for (test_offsets_error
<unsigned char>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
286 test_offsets_error
<unsigned char> t
= *it
;
287 InternT out
[array_size(exp
) - 1] = {};
288 assert(t
.in_size
<= array_size(in
));
289 assert(t
.out_size
<= array_size(out
));
290 assert(t
.expected_in_next
<= t
.in_size
);
291 assert(t
.expected_out_next
<= t
.out_size
);
292 ExternT old_char
= in
[t
.replace_pos
];
293 in
[t
.replace_pos
] = t
.replace_char
;
295 mbstate_t state
= {};
296 const ExternT
* in_next
= nullptr;
297 InternT
* out_next
= nullptr;
298 codecvt_base::result res
= codecvt_base::ok
;
300 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
301 assert(res
== cvt
.error
);
302 assert(in_next
== in
+ t
.expected_in_next
);
303 assert(out_next
== out
+ t
.expected_out_next
);
304 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
305 if (t
.expected_out_next
< array_size(out
))
306 assert(out
[t
.expected_out_next
] == 0);
309 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
311 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
313 in
[t
.replace_pos
] = old_char
;
// Runs all UTF-8 -> UTF-32 tests for cvt.in(): the `ok`, `partial` and
// `error` outcomes, in that order.
template <class InternT, class ExternT>
void utf8_to_utf32_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf32_in_ok(cvt);
  utf8_to_utf32_in_partial(cvt);
  utf8_to_utf32_in_error(cvt);
}
324 template <class InternT
, class ExternT
>
325 void utf32_to_utf8_out_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
326 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
327 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
328 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
329 static_assert(array_size(input
) == 5, "");
330 static_assert(array_size(expected
) == 11, "");
332 InternT in
[array_size(input
)];
333 ExternT exp
[array_size(expected
)];
334 copy(begin(input
), end(input
), begin(in
));
335 copy(begin(expected
), end(expected
), begin(exp
));
336 assert(char_traits
<InternT
>::length(in
) == 4);
337 assert(char_traits
<ExternT
>::length(exp
) == 10);
339 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}};
340 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
341 test_offsets_ok t
= *it
;
342 ExternT out
[array_size(exp
) - 1] = {};
343 assert(t
.in_size
<= array_size(in
));
344 assert(t
.out_size
<= array_size(out
));
345 mbstate_t state
= {};
346 const InternT
* in_next
= nullptr;
347 ExternT
* out_next
= nullptr;
348 codecvt_base::result res
= codecvt_base::ok
;
350 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
351 assert(res
== cvt
.ok
);
352 assert(in_next
== in
+ t
.in_size
);
353 assert(out_next
== out
+ t
.out_size
);
354 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.out_size
) == 0);
355 if (t
.out_size
< array_size(out
))
356 assert(out
[t
.out_size
] == 0);
360 template <class InternT
, class ExternT
>
361 void utf32_to_utf8_out_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
362 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
363 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
364 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
365 static_assert(array_size(input
) == 5, "");
366 static_assert(array_size(expected
) == 11, "");
368 InternT in
[array_size(input
)];
369 ExternT exp
[array_size(expected
)];
370 copy(begin(input
), end(input
), begin(in
));
371 copy(begin(expected
), end(expected
), begin(exp
));
372 assert(char_traits
<InternT
>::length(in
) == 4);
373 assert(char_traits
<ExternT
>::length(exp
) == 10);
375 test_offsets_partial offsets
[] = {
376 {1, 0, 0, 0}, // no space for first CP
378 {2, 1, 1, 1}, // no space for second CP
379 {2, 2, 1, 1}, // no space for second CP
381 {3, 3, 2, 3}, // no space for third CP
382 {3, 4, 2, 3}, // no space for third CP
383 {3, 5, 2, 3}, // no space for third CP
385 {4, 6, 3, 6}, // no space for fourth CP
386 {4, 7, 3, 6}, // no space for fourth CP
387 {4, 8, 3, 6}, // no space for fourth CP
388 {4, 9, 3, 6}, // no space for fourth CP
390 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
391 test_offsets_partial t
= *it
;
392 ExternT out
[array_size(exp
) - 1] = {};
393 assert(t
.in_size
<= array_size(in
));
394 assert(t
.out_size
<= array_size(out
));
395 assert(t
.expected_in_next
<= t
.in_size
);
396 assert(t
.expected_out_next
<= t
.out_size
);
397 mbstate_t state
= {};
398 const InternT
* in_next
= nullptr;
399 ExternT
* out_next
= nullptr;
400 codecvt_base::result res
= codecvt_base::ok
;
402 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
403 assert(res
== cvt
.partial
);
404 assert(in_next
== in
+ t
.expected_in_next
);
405 assert(out_next
== out
+ t
.expected_out_next
);
406 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
407 if (t
.expected_out_next
< array_size(out
))
408 assert(out
[t
.expected_out_next
] == 0);
412 template <class InternT
, class ExternT
>
413 void utf32_to_utf8_out_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
414 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
415 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
416 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
417 static_assert(array_size(input
) == 5, "");
418 static_assert(array_size(expected
) == 11, "");
420 InternT in
[array_size(input
)];
421 ExternT exp
[array_size(expected
)];
422 copy(begin(input
), end(input
), begin(in
));
423 copy(begin(expected
), end(expected
), begin(exp
));
424 assert(char_traits
<InternT
>::length(in
) == 4);
425 assert(char_traits
<ExternT
>::length(exp
) == 10);
427 test_offsets_error
<InternT
> offsets
[] = {
430 {4, 10, 0, 0, 0xD800, 0},
431 {4, 10, 1, 1, 0xDBFF, 1},
432 {4, 10, 2, 3, 0xDC00, 2},
433 {4, 10, 3, 6, 0xDFFF, 3},
436 {4, 10, 0, 0, 0x00110000, 0},
437 {4, 10, 1, 1, 0x00110000, 1},
438 {4, 10, 2, 3, 0x00110000, 2},
439 {4, 10, 3, 6, 0x00110000, 3}};
441 for (test_offsets_error
<InternT
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
442 test_offsets_error
<InternT
> t
= *it
;
443 ExternT out
[array_size(exp
) - 1] = {};
444 assert(t
.in_size
<= array_size(in
));
445 assert(t
.out_size
<= array_size(out
));
446 assert(t
.expected_in_next
<= t
.in_size
);
447 assert(t
.expected_out_next
<= t
.out_size
);
448 InternT old_char
= in
[t
.replace_pos
];
449 in
[t
.replace_pos
] = t
.replace_char
;
451 mbstate_t state
= {};
452 const InternT
* in_next
= nullptr;
453 ExternT
* out_next
= nullptr;
454 codecvt_base::result res
= codecvt_base::ok
;
456 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
457 assert(res
== cvt
.error
);
458 assert(in_next
== in
+ t
.expected_in_next
);
459 assert(out_next
== out
+ t
.expected_out_next
);
460 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
461 if (t
.expected_out_next
< array_size(out
))
462 assert(out
[t
.expected_out_next
] == 0);
464 in
[t
.replace_pos
] = old_char
;
// Runs all UTF-32 -> UTF-8 tests for cvt.out(): the `ok`, `partial` and
// `error` outcomes, in that order.
template <class InternT, class ExternT>
void utf32_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf32_to_utf8_out_ok(cvt);
  utf32_to_utf8_out_partial(cvt);
  utf32_to_utf8_out_error(cvt);
}
// Exercises a UTF-8 <-> UTF-32 codecvt facet in both directions: first
// decoding (in), then encoding (out).
template <class InternT, class ExternT>
void test_utf8_utf32_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf32_in(cvt);
  utf32_to_utf8_out(cvt);
}
481 template <class InternT
, class ExternT
>
482 void utf8_to_utf16_in_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
483 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
484 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
485 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
486 static_assert(array_size(input
) == 11, "");
487 static_assert(array_size(expected
) == 6, "");
489 ExternT in
[array_size(input
)];
490 InternT exp
[array_size(expected
)];
491 copy(begin(input
), end(input
), begin(in
));
492 copy(begin(expected
), end(expected
), begin(exp
));
493 assert(char_traits
<ExternT
>::length(in
) == 10);
494 assert(char_traits
<InternT
>::length(exp
) == 5);
496 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 5}};
497 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
498 test_offsets_ok t
= *it
;
499 InternT out
[array_size(exp
) - 1] = {};
500 assert(t
.in_size
<= array_size(in
));
501 assert(t
.out_size
<= array_size(out
));
502 mbstate_t state
= {};
503 const ExternT
* in_next
= nullptr;
504 InternT
* out_next
= nullptr;
505 codecvt_base::result res
= codecvt_base::ok
;
507 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
508 assert(res
== cvt
.ok
);
509 assert(in_next
== in
+ t
.in_size
);
510 assert(out_next
== out
+ t
.out_size
);
511 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
512 if (t
.out_size
< array_size(out
))
513 assert(out
[t
.out_size
] == 0);
516 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
518 assert(static_cast<size_t>(len
) == t
.in_size
);
521 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
522 test_offsets_ok t
= *it
;
523 InternT out
[array_size(exp
)] = {};
524 assert(t
.in_size
<= array_size(in
));
525 assert(t
.out_size
<= array_size(out
));
526 mbstate_t state
= {};
527 const ExternT
* in_next
= nullptr;
528 InternT
* out_next
= nullptr;
529 codecvt_base::result res
= codecvt_base::ok
;
531 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, end(out
), out_next
);
532 assert(res
== cvt
.ok
);
533 assert(in_next
== in
+ t
.in_size
);
534 assert(out_next
== out
+ t
.out_size
);
535 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
536 if (t
.out_size
< array_size(out
))
537 assert(out
[t
.out_size
] == 0);
540 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, array_size(out
));
542 assert(static_cast<size_t>(len
) == t
.in_size
);
546 template <class InternT
, class ExternT
>
547 void utf8_to_utf16_in_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
548 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
549 const unsigned char input
[] = "b\u0448\uAAAA\U0010AAAA";
550 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
551 static_assert(array_size(input
) == 11, "");
552 static_assert(array_size(expected
) == 6, "");
554 ExternT in
[array_size(input
)];
555 InternT exp
[array_size(expected
)];
556 copy(begin(input
), end(input
), begin(in
));
557 copy(begin(expected
), end(expected
), begin(exp
));
558 assert(char_traits
<ExternT
>::length(in
) == 10);
559 assert(char_traits
<InternT
>::length(exp
) == 5);
561 test_offsets_partial offsets
[] = {
562 {1, 0, 0, 0}, // no space for first CP
564 {3, 1, 1, 1}, // no space for second CP
565 {2, 2, 1, 1}, // incomplete second CP
566 {2, 1, 1, 1}, // incomplete second CP, and no space for it
568 {6, 2, 3, 2}, // no space for third CP
569 {4, 3, 3, 2}, // incomplete third CP
570 {5, 3, 3, 2}, // incomplete third CP
571 {4, 2, 3, 2}, // incomplete third CP, and no space for it
572 {5, 2, 3, 2}, // incomplete third CP, and no space for it
574 {10, 3, 6, 3}, // no space for fourth CP
575 {10, 4, 6, 3}, // no space for fourth CP
576 {7, 5, 6, 3}, // incomplete fourth CP
577 {8, 5, 6, 3}, // incomplete fourth CP
578 {9, 5, 6, 3}, // incomplete fourth CP
579 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
580 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
581 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
582 {7, 4, 6, 3}, // incomplete fourth CP, and no space for it
583 {8, 4, 6, 3}, // incomplete fourth CP, and no space for it
584 {9, 4, 6, 3}, // incomplete fourth CP, and no space for it
588 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
589 test_offsets_partial t
= *it
;
590 InternT out
[array_size(exp
) - 1] = {};
591 assert(t
.in_size
<= array_size(in
));
592 assert(t
.out_size
<= array_size(out
));
593 assert(t
.expected_in_next
<= t
.in_size
);
594 assert(t
.expected_out_next
<= t
.out_size
);
595 mbstate_t state
= {};
596 const ExternT
* in_next
= nullptr;
597 InternT
* out_next
= nullptr;
598 codecvt_base::result res
= codecvt_base::ok
;
600 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
601 assert(res
== cvt
.partial
);
602 assert(in_next
== in
+ t
.expected_in_next
);
603 assert(out_next
== out
+ t
.expected_out_next
);
604 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
605 if (t
.expected_out_next
< array_size(out
))
606 assert(out
[t
.expected_out_next
] == 0);
609 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
611 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
615 template <class InternT
, class ExternT
>
616 void utf8_to_utf16_in_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
617 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
618 const unsigned char input
[] = "b\u0448\uD700\U0010AAAA";
619 const char16_t expected
[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
620 static_assert(array_size(input
) == 11, "");
621 static_assert(array_size(expected
) == 6, "");
623 ExternT in
[array_size(input
)];
624 InternT exp
[array_size(expected
)];
625 copy(begin(input
), end(input
), begin(in
));
626 copy(begin(expected
), end(expected
), begin(exp
));
627 assert(char_traits
<ExternT
>::length(in
) == 10);
628 assert(char_traits
<InternT
>::length(exp
) == 5);
630 // There are 5 classes of errors in UTF-8 decoding
631 // 1. Missing leading byte
632 // 2. Missing trailing byte
634 // 4. Overlong sequence
635 // 5. CP out of Unicode range
636 test_offsets_error
<unsigned char> offsets
[] = {
638 // 1. Missing leading byte. We will replace the leading byte with
639 // non-leading byte, such as a byte that is always invalid or a trailing
642 // replace leading byte with invalid byte
643 {1, 5, 0, 0, 0xFF, 0},
644 {3, 5, 1, 1, 0xFF, 1},
645 {6, 5, 3, 2, 0xFF, 3},
646 {10, 5, 6, 3, 0xFF, 6},
648 // replace leading byte with trailing byte
649 {1, 5, 0, 0, 0b10101010, 0},
650 {3, 5, 1, 1, 0b10101010, 1},
651 {6, 5, 3, 2, 0b10101010, 3},
652 {10, 5, 6, 3, 0b10101010, 6},
654 // 2. Missing trailing byte. We will replace the trailing byte with
655 // non-trailing byte, such as a byte that is always invalid or a leading
656 // byte (simple ASCII byte in our case).
658 // replace first trailing byte with ASCII byte
659 {3, 5, 1, 1, 'z', 2},
660 {6, 5, 3, 2, 'z', 4},
661 {10, 5, 6, 3, 'z', 7},
663 // replace first trailing byte with invalid byte
664 {3, 5, 1, 1, 0xFF, 2},
665 {6, 5, 3, 2, 0xFF, 4},
666 {10, 5, 6, 3, 0xFF, 7},
668 // replace second trailing byte with ASCII byte
669 {6, 5, 3, 2, 'z', 5},
670 {10, 5, 6, 3, 'z', 8},
672 // replace second trailing byte with invalid byte
673 {6, 5, 3, 2, 0xFF, 5},
674 {10, 5, 6, 3, 0xFF, 8},
676 // replace third trailing byte
677 {10, 5, 6, 3, 'z', 9},
678 {10, 5, 6, 3, 0xFF, 9},
680 // 2.1 The following test-cases raise doubt whether error or partial should
681 // be returned. For example, we have 4-byte sequence with valid leading
682 // byte. If we hide the last byte we need to return partial. But, if the
683 // second or third byte, which are visible to the call to codecvt, are
684 // malformed then error should be returned.
686 // replace first trailing byte with ASCII byte, also incomplete at end
687 {5, 5, 3, 2, 'z', 4},
688 {8, 5, 6, 3, 'z', 7},
689 {9, 5, 6, 3, 'z', 7},
691 // replace first trailing byte with invalid byte, also incomplete at end
692 {5, 5, 3, 2, 0xFF, 4},
693 {8, 5, 6, 3, 0xFF, 7},
694 {9, 5, 6, 3, 0xFF, 7},
696 // replace second trailing byte with ASCII byte, also incomplete at end
697 {9, 5, 6, 3, 'z', 8},
699 // replace second trailing byte with invalid byte, also incomplete at end
700 {9, 5, 6, 3, 0xFF, 8},
702 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
704 {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
705 {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
706 {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
707 {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
709 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
710 // just the leading byte is enough to make them overlong, i.e. for the
711 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
713 {3, 5, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
714 {3, 5, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
715 {6, 5, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
716 {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
719 // turn U+10AAAA into U+14AAAA by changing its leading byte
720 {10, 5, 6, 3, 0b11110101, 6},
721 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
722 {10, 5, 6, 3, 0b10011010, 7},
724 for (test_offsets_error
<unsigned char>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
725 test_offsets_error
<unsigned char> t
= *it
;
726 InternT out
[array_size(exp
) - 1] = {};
727 assert(t
.in_size
<= array_size(in
));
728 assert(t
.out_size
<= array_size(out
));
729 assert(t
.expected_in_next
<= t
.in_size
);
730 assert(t
.expected_out_next
<= t
.out_size
);
731 ExternT old_char
= in
[t
.replace_pos
];
732 in
[t
.replace_pos
] = t
.replace_char
;
734 mbstate_t state
= {};
735 const ExternT
* in_next
= nullptr;
736 InternT
* out_next
= nullptr;
737 codecvt_base::result res
= codecvt_base::ok
;
739 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
740 assert(res
== cvt
.error
);
741 assert(in_next
== in
+ t
.expected_in_next
);
742 assert(out_next
== out
+ t
.expected_out_next
);
743 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
744 if (t
.expected_out_next
< array_size(out
))
745 assert(out
[t
.expected_out_next
] == 0);
748 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
750 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
752 in
[t
.replace_pos
] = old_char
;
// Runs all UTF-8 -> UTF-16 tests for cvt.in(): the `ok`, `partial` and
// `error` outcomes, in that order.
template <class InternT, class ExternT>
void utf8_to_utf16_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in_ok(cvt);
  utf8_to_utf16_in_partial(cvt);
  utf8_to_utf16_in_error(cvt);
}
763 template <class InternT
, class ExternT
>
764 void utf16_to_utf8_out_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
765 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
766 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
767 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
768 static_assert(array_size(input
) == 6, "");
769 static_assert(array_size(expected
) == 11, "");
771 InternT in
[array_size(input
)];
772 ExternT exp
[array_size(expected
)];
773 copy(begin(input
), end(input
), begin(in
));
774 copy(begin(expected
), end(expected
), begin(exp
));
775 assert(char_traits
<InternT
>::length(in
) == 5);
776 assert(char_traits
<ExternT
>::length(exp
) == 10);
778 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}};
779 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
780 test_offsets_ok t
= *it
;
781 ExternT out
[array_size(exp
) - 1] = {};
782 assert(t
.in_size
<= array_size(in
));
783 assert(t
.out_size
<= array_size(out
));
784 mbstate_t state
= {};
785 const InternT
* in_next
= nullptr;
786 ExternT
* out_next
= nullptr;
787 codecvt_base::result res
= codecvt_base::ok
;
789 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
790 assert(res
== cvt
.ok
);
791 assert(in_next
== in
+ t
.in_size
);
792 assert(out_next
== out
+ t
.out_size
);
793 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.out_size
) == 0);
794 if (t
.out_size
< array_size(out
))
795 assert(out
[t
.out_size
] == 0);
799 template <class InternT
, class ExternT
>
800 void utf16_to_utf8_out_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
801 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
802 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
803 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
804 static_assert(array_size(input
) == 6, "");
805 static_assert(array_size(expected
) == 11, "");
807 InternT in
[array_size(input
)];
808 ExternT exp
[array_size(expected
)];
809 copy(begin(input
), end(input
), begin(in
));
810 copy(begin(expected
), end(expected
), begin(exp
));
811 assert(char_traits
<InternT
>::length(in
) == 5);
812 assert(char_traits
<ExternT
>::length(exp
) == 10);
814 test_offsets_partial offsets
[] = {
815 {1, 0, 0, 0}, // no space for first CP
817 {2, 1, 1, 1}, // no space for second CP
818 {2, 2, 1, 1}, // no space for second CP
820 {3, 3, 2, 3}, // no space for third CP
821 {3, 4, 2, 3}, // no space for third CP
822 {3, 5, 2, 3}, // no space for third CP
824 {5, 6, 3, 6}, // no space for fourth CP
825 {5, 7, 3, 6}, // no space for fourth CP
826 {5, 8, 3, 6}, // no space for fourth CP
827 {5, 9, 3, 6}, // no space for fourth CP
829 {4, 10, 3, 6}, // incomplete fourth CP
831 {4, 6, 3, 6}, // incomplete fourth CP, and no space for it
832 {4, 7, 3, 6}, // incomplete fourth CP, and no space for it
833 {4, 8, 3, 6}, // incomplete fourth CP, and no space for it
834 {4, 9, 3, 6}, // incomplete fourth CP, and no space for it
836 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
837 test_offsets_partial t
= *it
;
838 ExternT out
[array_size(exp
) - 1] = {};
839 assert(t
.in_size
<= array_size(in
));
840 assert(t
.out_size
<= array_size(out
));
841 assert(t
.expected_in_next
<= t
.in_size
);
842 assert(t
.expected_out_next
<= t
.out_size
);
843 mbstate_t state
= {};
844 const InternT
* in_next
= nullptr;
845 ExternT
* out_next
= nullptr;
846 codecvt_base::result res
= codecvt_base::ok
;
848 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
849 assert(res
== cvt
.partial
);
850 assert(in_next
== in
+ t
.expected_in_next
);
851 assert(out_next
== out
+ t
.expected_out_next
);
852 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
853 if (t
.expected_out_next
< array_size(out
))
854 assert(out
[t
.expected_out_next
] == 0);
858 template <class InternT
, class ExternT
>
859 void utf16_to_utf8_out_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
860 // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
861 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
862 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
863 static_assert(array_size(input
) == 6, "");
864 static_assert(array_size(expected
) == 11, "");
866 InternT in
[array_size(input
)];
867 ExternT exp
[array_size(expected
)];
868 copy(begin(input
), end(input
), begin(in
));
869 copy(begin(expected
), end(expected
), begin(exp
));
870 assert(char_traits
<InternT
>::length(in
) == 5);
871 assert(char_traits
<ExternT
>::length(exp
) == 10);
873 // The only possible error in UTF-16 is unpaired surrogate code units.
874 // So we replace valid code points (scalar values) with lone surrogate CU.
875 test_offsets_error
<InternT
> offsets
[] = {
876 {5, 10, 0, 0, 0xD800, 0},
877 {5, 10, 0, 0, 0xDBFF, 0},
878 {5, 10, 0, 0, 0xDC00, 0},
879 {5, 10, 0, 0, 0xDFFF, 0},
881 {5, 10, 1, 1, 0xD800, 1},
882 {5, 10, 1, 1, 0xDBFF, 1},
883 {5, 10, 1, 1, 0xDC00, 1},
884 {5, 10, 1, 1, 0xDFFF, 1},
886 {5, 10, 2, 3, 0xD800, 2},
887 {5, 10, 2, 3, 0xDBFF, 2},
888 {5, 10, 2, 3, 0xDC00, 2},
889 {5, 10, 2, 3, 0xDFFF, 2},
891 // make the leading surrogate a trailing one
892 {5, 10, 3, 6, 0xDC00, 3},
893 {5, 10, 3, 6, 0xDFFF, 3},
895 // make the trailing surrogate a leading one
896 {5, 10, 3, 6, 0xD800, 4},
897 {5, 10, 3, 6, 0xDBFF, 4},
899 // make the trailing surrogate a BMP char
900 {5, 10, 3, 6, 'z', 4},
903 for (test_offsets_error
<InternT
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
904 test_offsets_error
<InternT
> t
= *it
;
905 ExternT out
[array_size(exp
) - 1] = {};
906 assert(t
.in_size
<= array_size(in
));
907 assert(t
.out_size
<= array_size(out
));
908 assert(t
.expected_in_next
<= t
.in_size
);
909 assert(t
.expected_out_next
<= t
.out_size
);
910 InternT old_char
= in
[t
.replace_pos
];
911 in
[t
.replace_pos
] = t
.replace_char
;
913 mbstate_t state
= {};
914 const InternT
* in_next
= nullptr;
915 ExternT
* out_next
= nullptr;
916 codecvt_base::result res
= codecvt_base::ok
;
918 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
919 assert(res
== cvt
.error
);
920 assert(in_next
== in
+ t
.expected_in_next
);
921 assert(out_next
== out
+ t
.expected_out_next
);
922 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
923 if (t
.expected_out_next
< array_size(out
))
924 assert(out
[t
.expected_out_next
] == 0);
926 in
[t
.replace_pos
] = old_char
;
// Runs the complete UTF-16 -> UTF-8 suite for cvt.out(): the ok cases first,
// then the partial cases, then the error cases.
template <class InternT, class ExternT>
void utf16_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf16_to_utf8_out_ok(cvt);
  utf16_to_utf8_out_partial(cvt);
  utf16_to_utf8_out_error(cvt);
}
// Entry point for a UTF-8 <-> UTF-16 facet: checks the decoding direction
// (cvt.in) and then the encoding direction (cvt.out).
template <class InternT, class ExternT>
void test_utf8_utf16_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in(cvt);
  utf16_to_utf8_out(cvt);
}
943 template <class InternT
, class ExternT
>
944 void utf8_to_ucs2_in_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
945 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
946 const unsigned char input
[] = "b\u0448\uAAAA";
947 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
948 static_assert(array_size(input
) == 7, "");
949 static_assert(array_size(expected
) == 4, "");
951 ExternT in
[array_size(input
)];
952 InternT exp
[array_size(expected
)];
953 copy(begin(input
), end(input
), begin(in
));
954 copy(begin(expected
), end(expected
), begin(exp
));
955 assert(char_traits
<ExternT
>::length(in
) == 6);
956 assert(char_traits
<InternT
>::length(exp
) == 3);
958 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}};
959 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
960 test_offsets_ok t
= *it
;
961 InternT out
[array_size(exp
) - 1] = {};
962 assert(t
.in_size
<= array_size(in
));
963 assert(t
.out_size
<= array_size(out
));
964 mbstate_t state
= {};
965 const ExternT
* in_next
= nullptr;
966 InternT
* out_next
= nullptr;
967 codecvt_base::result res
= codecvt_base::ok
;
969 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
970 assert(res
== cvt
.ok
);
971 assert(in_next
== in
+ t
.in_size
);
972 assert(out_next
== out
+ t
.out_size
);
973 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
974 if (t
.out_size
< array_size(out
))
975 assert(out
[t
.out_size
] == 0);
978 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
980 assert(static_cast<size_t>(len
) == t
.in_size
);
983 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
984 test_offsets_ok t
= *it
;
985 InternT out
[array_size(exp
)] = {};
986 assert(t
.in_size
<= array_size(in
));
987 assert(t
.out_size
<= array_size(out
));
988 mbstate_t state
= {};
989 const ExternT
* in_next
= nullptr;
990 InternT
* out_next
= nullptr;
991 codecvt_base::result res
= codecvt_base::ok
;
993 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, end(out
), out_next
);
994 assert(res
== cvt
.ok
);
995 assert(in_next
== in
+ t
.in_size
);
996 assert(out_next
== out
+ t
.out_size
);
997 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
998 if (t
.out_size
< array_size(out
))
999 assert(out
[t
.out_size
] == 0);
1001 state
= mbstate_t();
1002 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, array_size(out
));
1004 assert(static_cast<size_t>(len
) == t
.in_size
);
1008 template <class InternT
, class ExternT
>
1009 void utf8_to_ucs2_in_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
1010 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1011 const unsigned char input
[] = "b\u0448\uAAAA";
1012 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
1013 static_assert(array_size(input
) == 7, "");
1014 static_assert(array_size(expected
) == 4, "");
1016 ExternT in
[array_size(input
)];
1017 InternT exp
[array_size(expected
)];
1018 copy(begin(input
), end(input
), begin(in
));
1019 copy(begin(expected
), end(expected
), begin(exp
));
1020 assert(char_traits
<ExternT
>::length(in
) == 6);
1021 assert(char_traits
<InternT
>::length(exp
) == 3);
1023 test_offsets_partial offsets
[] = {
1024 {1, 0, 0, 0}, // no space for first CP
1026 {3, 1, 1, 1}, // no space for second CP
1027 {2, 2, 1, 1}, // incomplete second CP
1028 {2, 1, 1, 1}, // incomplete second CP, and no space for it
1030 {6, 2, 3, 2}, // no space for third CP
1031 {4, 3, 3, 2}, // incomplete third CP
1032 {5, 3, 3, 2}, // incomplete third CP
1033 {4, 2, 3, 2}, // incomplete third CP, and no space for it
1034 {5, 2, 3, 2}, // incomplete third CP, and no space for it
1037 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1038 test_offsets_partial t
= *it
;
1039 InternT out
[array_size(exp
) - 1] = {};
1040 assert(t
.in_size
<= array_size(in
));
1041 assert(t
.out_size
<= array_size(out
));
1042 assert(t
.expected_in_next
<= t
.in_size
);
1043 assert(t
.expected_out_next
<= t
.out_size
);
1044 mbstate_t state
= {};
1045 const ExternT
* in_next
= nullptr;
1046 InternT
* out_next
= nullptr;
1047 codecvt_base::result res
= codecvt_base::ok
;
1049 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1050 assert(res
== cvt
.partial
);
1051 assert(in_next
== in
+ t
.expected_in_next
);
1052 assert(out_next
== out
+ t
.expected_out_next
);
1053 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1054 if (t
.expected_out_next
< array_size(out
))
1055 assert(out
[t
.expected_out_next
] == 0);
1057 state
= mbstate_t();
1058 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1060 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1064 template <class InternT
, class ExternT
>
1065 void utf8_to_ucs2_in_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
1066 const unsigned char input
[] = "b\u0448\uD700\U0010AAAA";
1067 const char16_t expected
[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
1068 static_assert(array_size(input
) == 11, "");
1069 static_assert(array_size(expected
) == 6, "");
1071 ExternT in
[array_size(input
)];
1072 InternT exp
[array_size(expected
)];
1073 copy(begin(input
), end(input
), begin(in
));
1074 copy(begin(expected
), end(expected
), begin(exp
));
1075 assert(char_traits
<ExternT
>::length(in
) == 10);
1076 assert(char_traits
<InternT
>::length(exp
) == 5);
1078 // There are 5 classes of errors in UTF-8 decoding
1079 // 1. Missing leading byte
1080 // 2. Missing trailing byte
1082 // 4. Overlong sequence
1083 // 5. CP out of Unicode range
1084 test_offsets_error
<unsigned char> offsets
[] = {
1086 // 1. Missing leading byte. We will replace the leading byte with
1087 // non-leading byte, such as a byte that is always invalid or a trailing
1090 // replace leading byte with invalid byte
1091 {1, 5, 0, 0, 0xFF, 0},
1092 {3, 5, 1, 1, 0xFF, 1},
1093 {6, 5, 3, 2, 0xFF, 3},
1094 {10, 5, 6, 3, 0xFF, 6},
1096 // replace leading byte with trailing byte
1097 {1, 5, 0, 0, 0b10101010, 0},
1098 {3, 5, 1, 1, 0b10101010, 1},
1099 {6, 5, 3, 2, 0b10101010, 3},
1100 {10, 5, 6, 3, 0b10101010, 6},
1102 // 2. Missing trailing byte. We will replace the trailing byte with
1103 // non-trailing byte, such as a byte that is always invalid or a leading
1104 // byte (simple ASCII byte in our case).
1106 // replace first trailing byte with ASCII byte
1107 {3, 5, 1, 1, 'z', 2},
1108 {6, 5, 3, 2, 'z', 4},
1109 {10, 5, 6, 3, 'z', 7},
1111 // replace first trailing byte with invalid byte
1112 {3, 5, 1, 1, 0xFF, 2},
1113 {6, 5, 3, 2, 0xFF, 4},
1114 {10, 5, 6, 3, 0xFF, 7},
1116 // replace second trailing byte with ASCII byte
1117 {6, 5, 3, 2, 'z', 5},
1118 {10, 5, 6, 3, 'z', 8},
1120 // replace second trailing byte with invalid byte
1121 {6, 5, 3, 2, 0xFF, 5},
1122 {10, 5, 6, 3, 0xFF, 8},
1124 // replace third trailing byte
1125 {10, 5, 6, 3, 'z', 9},
1126 {10, 5, 6, 3, 0xFF, 9},
1128 // 2.1 The following test-cases raise doubt whether error or partial should
1129 // be returned. For example, we have 4-byte sequence with valid leading
1130 // byte. If we hide the last byte we need to return partial. But, if the
1131 // second or third byte, which are visible to the call to codecvt, are
1132 // malformed then error should be returned.
1134 // replace first trailing byte with ASCII byte, also incomplete at end
1135 {5, 5, 3, 2, 'z', 4},
1136 {8, 5, 6, 3, 'z', 7},
1137 {9, 5, 6, 3, 'z', 7},
1139 // replace first trailing byte with invalid byte, also incomplete at end
1140 {5, 5, 3, 2, 0xFF, 4},
1141 {8, 5, 6, 3, 0xFF, 7},
1142 {9, 5, 6, 3, 0xFF, 7},
1144 // replace second trailing byte with ASCII byte, also incomplete at end
1145 {9, 5, 6, 3, 'z', 8},
1147 // replace second trailing byte with invalid byte, also incomplete at end
1148 {9, 5, 6, 3, 0xFF, 8},
1150 // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
1152 {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
1153 {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
1154 {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
1155 {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
1157 // 4. Overlong sequence. The CPs in the input are chosen such as modifying
1158 // just the leading byte is enough to make them overlong, i.e. for the
1159 // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
1161 {3, 5, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
1162 {3, 5, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
1163 {6, 5, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
1164 {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
1166 // 5. CP above range
1167 // turn U+10AAAA into U+14AAAA by changing its leading byte
1168 {10, 5, 6, 3, 0b11110101, 6},
1169 // turn U+10AAAA into U+11AAAA by changing its 2nd byte
1170 {10, 5, 6, 3, 0b10011010, 7},
1171 // Don't replace anything, show full 4-byte CP U+10AAAA
1172 {10, 4, 6, 3, 'b', 0},
1173 {10, 5, 6, 3, 'b', 0},
1174 // Don't replace anything, show incomplete 4-byte CP at the end. It's still
1175 // out of UCS2 range just by seeing the first byte.
1176 {7, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1177 {8, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1178 {9, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1179 {7, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1180 {8, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1181 {9, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1183 for (test_offsets_error
<unsigned char>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1184 test_offsets_error
<unsigned char> t
= *it
;
1185 InternT out
[array_size(exp
) - 1] = {};
1186 assert(t
.in_size
<= array_size(in
));
1187 assert(t
.out_size
<= array_size(out
));
1188 assert(t
.expected_in_next
<= t
.in_size
);
1189 assert(t
.expected_out_next
<= t
.out_size
);
1190 ExternT old_char
= in
[t
.replace_pos
];
1191 in
[t
.replace_pos
] = t
.replace_char
;
1193 mbstate_t state
= {};
1194 const ExternT
* in_next
= nullptr;
1195 InternT
* out_next
= nullptr;
1196 codecvt_base::result res
= codecvt_base::ok
;
1198 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1199 assert(res
== cvt
.error
);
1200 assert(in_next
== in
+ t
.expected_in_next
);
1201 assert(out_next
== out
+ t
.expected_out_next
);
1202 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1203 if (t
.expected_out_next
< array_size(out
))
1204 assert(out
[t
.expected_out_next
] == 0);
1206 state
= mbstate_t();
1207 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1209 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1211 in
[t
.replace_pos
] = old_char
;
// Runs the complete UTF-8 -> UCS-2 suite for cvt.in(): the ok cases first,
// then the partial cases, then the error cases.
template <class InternT, class ExternT>
void utf8_to_ucs2_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in_ok(cvt);
  utf8_to_ucs2_in_partial(cvt);
  utf8_to_ucs2_in_error(cvt);
}
1222 template <class InternT
, class ExternT
>
1223 void ucs2_to_utf8_out_ok(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
1224 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1225 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1226 const unsigned char expected
[] = "b\u0448\uAAAA";
1227 static_assert(array_size(input
) == 4, "");
1228 static_assert(array_size(expected
) == 7, "");
1230 InternT in
[array_size(input
)];
1231 ExternT exp
[array_size(expected
)];
1232 copy(begin(input
), end(input
), begin(in
));
1233 copy(begin(expected
), end(expected
), begin(exp
));
1234 assert(char_traits
<InternT
>::length(in
) == 3);
1235 assert(char_traits
<ExternT
>::length(exp
) == 6);
1237 test_offsets_ok offsets
[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}};
1238 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1239 test_offsets_ok t
= *it
;
1240 ExternT out
[array_size(exp
) - 1] = {};
1241 assert(t
.in_size
<= array_size(in
));
1242 assert(t
.out_size
<= array_size(out
));
1243 mbstate_t state
= {};
1244 const InternT
* in_next
= nullptr;
1245 ExternT
* out_next
= nullptr;
1246 codecvt_base::result res
= codecvt_base::ok
;
1248 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1249 assert(res
== cvt
.ok
);
1250 assert(in_next
== in
+ t
.in_size
);
1251 assert(out_next
== out
+ t
.out_size
);
1252 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.out_size
) == 0);
1253 if (t
.out_size
< array_size(out
))
1254 assert(out
[t
.out_size
] == 0);
1258 template <class InternT
, class ExternT
>
1259 void ucs2_to_utf8_out_partial(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
1260 // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1261 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1262 const unsigned char expected
[] = "b\u0448\uAAAA";
1263 static_assert(array_size(input
) == 4, "");
1264 static_assert(array_size(expected
) == 7, "");
1266 InternT in
[array_size(input
)];
1267 ExternT exp
[array_size(expected
)];
1268 copy(begin(input
), end(input
), begin(in
));
1269 copy(begin(expected
), end(expected
), begin(exp
));
1270 assert(char_traits
<InternT
>::length(in
) == 3);
1271 assert(char_traits
<ExternT
>::length(exp
) == 6);
1273 test_offsets_partial offsets
[] = {
1274 {1, 0, 0, 0}, // no space for first CP
1276 {2, 1, 1, 1}, // no space for second CP
1277 {2, 2, 1, 1}, // no space for second CP
1279 {3, 3, 2, 3}, // no space for third CP
1280 {3, 4, 2, 3}, // no space for third CP
1281 {3, 5, 2, 3}, // no space for third CP
1283 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1284 test_offsets_partial t
= *it
;
1285 ExternT out
[array_size(exp
) - 1] = {};
1286 assert(t
.in_size
<= array_size(in
));
1287 assert(t
.out_size
<= array_size(out
));
1288 assert(t
.expected_in_next
<= t
.in_size
);
1289 assert(t
.expected_out_next
<= t
.out_size
);
1290 mbstate_t state
= {};
1291 const InternT
* in_next
= nullptr;
1292 ExternT
* out_next
= nullptr;
1293 codecvt_base::result res
= codecvt_base::ok
;
1295 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1296 assert(res
== cvt
.partial
);
1297 assert(in_next
== in
+ t
.expected_in_next
);
1298 assert(out_next
== out
+ t
.expected_out_next
);
1299 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1300 if (t
.expected_out_next
< array_size(out
))
1301 assert(out
[t
.expected_out_next
] == 0);
1305 template <class InternT
, class ExternT
>
1306 void ucs2_to_utf8_out_error(const std::codecvt
<InternT
, ExternT
, mbstate_t>& cvt
) {
1307 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1308 const unsigned char expected
[] = "b\u0448\uAAAA\U0010AAAA";
1309 static_assert(array_size(input
) == 6, "");
1310 static_assert(array_size(expected
) == 11, "");
1312 InternT in
[array_size(input
)];
1313 ExternT exp
[array_size(expected
)];
1314 copy(begin(input
), end(input
), begin(in
));
1315 copy(begin(expected
), end(expected
), begin(exp
));
1316 assert(char_traits
<InternT
>::length(in
) == 5);
1317 assert(char_traits
<ExternT
>::length(exp
) == 10);
1319 test_offsets_error
<InternT
> offsets
[] = {
1320 {3, 6, 0, 0, 0xD800, 0},
1321 {3, 6, 0, 0, 0xDBFF, 0},
1322 {3, 6, 0, 0, 0xDC00, 0},
1323 {3, 6, 0, 0, 0xDFFF, 0},
1325 {3, 6, 1, 1, 0xD800, 1},
1326 {3, 6, 1, 1, 0xDBFF, 1},
1327 {3, 6, 1, 1, 0xDC00, 1},
1328 {3, 6, 1, 1, 0xDFFF, 1},
1330 {3, 6, 2, 3, 0xD800, 2},
1331 {3, 6, 2, 3, 0xDBFF, 2},
1332 {3, 6, 2, 3, 0xDC00, 2},
1333 {3, 6, 2, 3, 0xDFFF, 2},
1335 // make the leading surrogate a trailing one
1336 {5, 10, 3, 6, 0xDC00, 3},
1337 {5, 10, 3, 6, 0xDFFF, 3},
1339 // make the trailing surrogate a leading one
1340 {5, 10, 3, 6, 0xD800, 4},
1341 {5, 10, 3, 6, 0xDBFF, 4},
1343 // make the trailing surrogate a BMP char
1344 {5, 10, 3, 6, 'z', 4},
1346 // don't replace anything in the test cases bellow, just show the surrogate
1347 // pair (fourth CP) fully or partially
1348 {5, 10, 3, 6, 'b', 0},
1349 {5, 7, 3, 6, 'b', 0}, // no space for fourth CP
1350 {5, 8, 3, 6, 'b', 0}, // no space for fourth CP
1351 {5, 9, 3, 6, 'b', 0}, // no space for fourth CP
1353 {4, 10, 3, 6, 'b', 0}, // incomplete fourth CP
1354 {4, 7, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1355 {4, 8, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1356 {4, 9, 3, 6, 'b', 0}, // incomplete fourth CP, and no space for it
1359 for (test_offsets_error
<InternT
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1360 test_offsets_error
<InternT
> t
= *it
;
1361 ExternT out
[array_size(exp
) - 1] = {};
1362 assert(t
.in_size
<= array_size(in
));
1363 assert(t
.out_size
<= array_size(out
));
1364 assert(t
.expected_in_next
<= t
.in_size
);
1365 assert(t
.expected_out_next
<= t
.out_size
);
1366 InternT old_char
= in
[t
.replace_pos
];
1367 in
[t
.replace_pos
] = t
.replace_char
;
1369 mbstate_t state
= {};
1370 const InternT
* in_next
= nullptr;
1371 ExternT
* out_next
= nullptr;
1372 codecvt_base::result res
= codecvt_base::ok
;
1374 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1375 assert(res
== cvt
.error
);
1376 assert(in_next
== in
+ t
.expected_in_next
);
1377 assert(out_next
== out
+ t
.expected_out_next
);
1378 assert(char_traits
<ExternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1379 if (t
.expected_out_next
< array_size(out
))
1380 assert(out
[t
.expected_out_next
] == 0);
1382 in
[t
.replace_pos
] = old_char
;
// Runs the complete UCS-2 -> UTF-8 suite for cvt.out(): the ok cases first,
// then the partial cases, then the error cases.
template <class InternT, class ExternT>
void ucs2_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  ucs2_to_utf8_out_ok(cvt);
  ucs2_to_utf8_out_partial(cvt);
  ucs2_to_utf8_out_error(cvt);
}
// Entry point for a UTF-8 <-> UCS-2 facet: checks the decoding direction
// (cvt.in) and then the encoding direction (cvt.out).
template <class InternT, class ExternT>
void test_utf8_ucs2_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in(cvt);
  ucs2_to_utf8_out(cvt);
}
// Byte order used when a sequence of UTF-16 code units is serialized to bytes.
enum utf16_endianess { utf16_big_endian, utf16_little_endian };

// Serializes the UTF-16 code units in [f, l) to the output iterator `o`,
// writing two bytes per code unit.
//
//   f, l  input range of 16-bit code units
//   o     output iterator receiving the bytes
//   e     utf16_big_endian writes the high byte first; any other value
//         (i.e. utf16_little_endian) writes the low byte first
//
// Returns the output iterator advanced past the last byte written, so that
// callers can chain writes or compute how many bytes were produced.
template <class Iter1, class Iter2>
Iter2 utf16_to_bytes(Iter1 f, Iter1 l, Iter2 o, utf16_endianess e) {
  for (; f != l; ++f) {
    const unsigned char hi = static_cast<unsigned char>((*f >> 8) & 0xFF);
    const unsigned char lo = static_cast<unsigned char>(*f & 0xFF);
    if (e == utf16_big_endian) {
      *o++ = hi;
      *o++ = lo;
    } else {
      *o++ = lo;
      *o++ = hi;
    }
  }
  return o;
}
1416 template <class InternT
>
1417 void utf16_to_utf32_in_ok(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1418 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1419 const char32_t expected
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1420 static_assert(array_size(input
) == 6, "");
1421 static_assert(array_size(expected
) == 5, "");
1423 char in
[array_size(input
) * 2];
1424 InternT exp
[array_size(expected
)];
1425 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1426 copy(begin(expected
), end(expected
), begin(exp
));
1428 test_offsets_ok offsets
[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}, {10, 4}};
1429 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1430 test_offsets_ok t
= *it
;
1431 InternT out
[array_size(exp
) - 1] = {};
1432 assert(t
.in_size
<= array_size(in
));
1433 assert(t
.out_size
<= array_size(out
));
1434 mbstate_t state
= {};
1435 const char* in_next
= nullptr;
1436 InternT
* out_next
= nullptr;
1437 codecvt_base::result res
= codecvt_base::ok
;
1439 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1440 assert(res
== cvt
.ok
);
1441 assert(in_next
== in
+ t
.in_size
);
1442 assert(out_next
== out
+ t
.out_size
);
1443 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
1444 if (t
.out_size
< array_size(out
))
1445 assert(out
[t
.out_size
] == 0);
1447 state
= mbstate_t();
1448 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1450 assert(static_cast<size_t>(len
) == t
.in_size
);
1453 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1454 test_offsets_ok t
= *it
;
1455 InternT out
[array_size(exp
)] = {};
1456 assert(t
.in_size
<= array_size(in
));
1457 assert(t
.out_size
<= array_size(out
));
1458 mbstate_t state
= {};
1459 const char* in_next
= nullptr;
1460 InternT
* out_next
= nullptr;
1461 codecvt_base::result res
= codecvt_base::ok
;
1463 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, end(out
), out_next
);
1464 assert(res
== cvt
.ok
);
1465 assert(in_next
== in
+ t
.in_size
);
1466 assert(out_next
== out
+ t
.out_size
);
1467 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
1468 if (t
.out_size
< array_size(out
))
1469 assert(out
[t
.out_size
] == 0);
1471 state
= mbstate_t();
1472 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, array_size(out
));
1474 assert(static_cast<size_t>(len
) == t
.in_size
);
1478 template <class InternT
>
1479 void utf16_to_utf32_in_partial(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1480 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1481 const char32_t expected
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1482 static_assert(array_size(input
) == 6, "");
1483 static_assert(array_size(expected
) == 5, "");
1485 char in
[array_size(input
) * 2];
1486 InternT exp
[array_size(expected
)];
1487 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1488 copy(begin(expected
), end(expected
), begin(exp
));
1490 test_offsets_partial offsets
[] = {
1491 {2, 0, 0, 0}, // no space for first CP
1492 {1, 1, 0, 0}, // incomplete first CP
1493 {1, 0, 0, 0}, // incomplete first CP, and no space for it
1495 {4, 1, 2, 1}, // no space for second CP
1496 {3, 2, 2, 1}, // incomplete second CP
1497 {3, 1, 2, 1}, // incomplete second CP, and no space for it
1499 {6, 2, 4, 2}, // no space for third CP
1500 {5, 3, 4, 2}, // incomplete third CP
1501 {5, 2, 4, 2}, // incomplete third CP, and no space for it
1503 {10, 3, 6, 3}, // no space for fourth CP
1504 {7, 4, 6, 3}, // incomplete fourth CP
1505 {8, 4, 6, 3}, // incomplete fourth CP
1506 {9, 4, 6, 3}, // incomplete fourth CP
1507 {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
1508 {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
1509 {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
1512 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1513 test_offsets_partial t
= *it
;
1514 InternT out
[array_size(exp
) - 1] = {};
1515 assert(t
.in_size
<= array_size(in
));
1516 assert(t
.out_size
<= array_size(out
));
1517 assert(t
.expected_in_next
<= t
.in_size
);
1518 assert(t
.expected_out_next
<= t
.out_size
);
1519 mbstate_t state
= {};
1520 const char* in_next
= nullptr;
1521 InternT
* out_next
= nullptr;
1522 codecvt_base::result res
= codecvt_base::ok
;
1524 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1525 assert(res
== cvt
.partial
);
1526 assert(in_next
== in
+ t
.expected_in_next
);
1527 assert(out_next
== out
+ t
.expected_out_next
);
1528 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1529 if (t
.expected_out_next
< array_size(out
))
1530 assert(out
[t
.expected_out_next
] == 0);
1532 state
= mbstate_t();
1533 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1535 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1539 template <class InternT
>
1540 void utf16_to_utf32_in_error(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1541 char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1542 const char32_t expected
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1543 static_assert(array_size(input
) == 6, "");
1544 static_assert(array_size(expected
) == 5, "");
1546 InternT exp
[array_size(expected
)];
1547 copy(begin(expected
), end(expected
), begin(exp
));
1549 // The only possible error in UTF-16 is unpaired surrogate code units.
1550 // So we replace valid code points (scalar values) with lone surrogate CU.
1551 test_offsets_error
<char16_t
> offsets
[] = {
1552 {10, 4, 0, 0, 0xD800, 0},
1553 {10, 4, 0, 0, 0xDBFF, 0},
1554 {10, 4, 0, 0, 0xDC00, 0},
1555 {10, 4, 0, 0, 0xDFFF, 0},
1557 {10, 4, 2, 1, 0xD800, 1},
1558 {10, 4, 2, 1, 0xDBFF, 1},
1559 {10, 4, 2, 1, 0xDC00, 1},
1560 {10, 4, 2, 1, 0xDFFF, 1},
1562 {10, 4, 4, 2, 0xD800, 2},
1563 {10, 4, 4, 2, 0xDBFF, 2},
1564 {10, 4, 4, 2, 0xDC00, 2},
1565 {10, 4, 4, 2, 0xDFFF, 2},
1567 // make the leading surrogate a trailing one
1568 {10, 4, 6, 3, 0xDC00, 3},
1569 {10, 4, 6, 3, 0xDFFF, 3},
1571 // make the trailing surrogate a leading one
1572 {10, 4, 6, 3, 0xD800, 4},
1573 {10, 4, 6, 3, 0xDBFF, 4},
1575 // make the trailing surrogate a BMP char
1576 {10, 4, 6, 3, 'z', 4},
1579 for (test_offsets_error
<char16_t
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1580 test_offsets_error
<char16_t
> t
= *it
;
1581 char in
[array_size(input
) * 2];
1582 InternT out
[array_size(exp
) - 1] = {};
1583 assert(t
.in_size
<= array_size(in
));
1584 assert(t
.out_size
<= array_size(out
));
1585 assert(t
.expected_in_next
<= t
.in_size
);
1586 assert(t
.expected_out_next
<= t
.out_size
);
1587 char16_t old_char
= input
[t
.replace_pos
];
1588 input
[t
.replace_pos
] = t
.replace_char
; // replace in input, not in in
1589 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1591 mbstate_t state
= {};
1592 const char* in_next
= nullptr;
1593 InternT
* out_next
= nullptr;
1594 codecvt_base::result res
= codecvt_base::ok
;
1596 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1597 assert(res
== cvt
.error
);
1598 assert(in_next
== in
+ t
.expected_in_next
);
1599 assert(out_next
== out
+ t
.expected_out_next
);
1600 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1601 if (t
.expected_out_next
< array_size(out
))
1602 assert(out
[t
.expected_out_next
] == 0);
1604 state
= mbstate_t();
1605 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1607 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1609 input
[t
.replace_pos
] = old_char
;
1613 template <class InternT
>
1614 void utf32_to_utf16_out_ok(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1615 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1616 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1617 static_assert(array_size(input
) == 5, "");
1618 static_assert(array_size(expected
) == 6, "");
1620 InternT in
[array_size(input
)];
1621 char exp
[array_size(expected
) * 2];
1622 copy(begin(input
), end(input
), begin(in
));
1623 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
1625 test_offsets_ok offsets
[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}, {4, 10}};
1626 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1627 test_offsets_ok t
= *it
;
1628 char out
[array_size(exp
) - 2] = {};
1629 assert(t
.in_size
<= array_size(in
));
1630 assert(t
.out_size
<= array_size(out
));
1631 mbstate_t state
= {};
1632 const InternT
* in_next
= nullptr;
1633 char* out_next
= nullptr;
1634 codecvt_base::result res
= codecvt_base::ok
;
1636 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1637 assert(res
== cvt
.ok
);
1638 assert(in_next
== in
+ t
.in_size
);
1639 assert(out_next
== out
+ t
.out_size
);
1640 assert(char_traits
<char>::compare(out
, exp
, t
.out_size
) == 0);
1641 if (t
.out_size
< array_size(out
))
1642 assert(out
[t
.out_size
] == 0);
1646 template <class InternT
>
1647 void utf32_to_utf16_out_partial(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1648 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1649 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1650 static_assert(array_size(input
) == 5, "");
1651 static_assert(array_size(expected
) == 6, "");
1653 InternT in
[array_size(input
)];
1654 char exp
[array_size(expected
) * 2];
1655 copy(begin(input
), end(input
), begin(in
));
1656 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
1658 test_offsets_partial offsets
[] = {
1659 {1, 0, 0, 0}, // no space for first CP
1660 {1, 1, 0, 0}, // no space for first CP
1662 {2, 2, 1, 2}, // no space for second CP
1663 {2, 3, 1, 2}, // no space for second CP
1665 {3, 4, 2, 4}, // no space for third CP
1666 {3, 5, 2, 4}, // no space for third CP
1668 {4, 6, 3, 6}, // no space for fourth CP
1669 {4, 7, 3, 6}, // no space for fourth CP
1670 {4, 8, 3, 6}, // no space for fourth CP
1671 {4, 9, 3, 6}, // no space for fourth CP
1673 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1674 test_offsets_partial t
= *it
;
1675 char out
[array_size(exp
) - 2] = {};
1676 assert(t
.in_size
<= array_size(in
));
1677 assert(t
.out_size
<= array_size(out
));
1678 assert(t
.expected_in_next
<= t
.in_size
);
1679 assert(t
.expected_out_next
<= t
.out_size
);
1680 mbstate_t state
= {};
1681 const InternT
* in_next
= nullptr;
1682 char* out_next
= nullptr;
1683 codecvt_base::result res
= codecvt_base::ok
;
1685 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1686 assert(res
== cvt
.partial
);
1687 assert(in_next
== in
+ t
.expected_in_next
);
1688 assert(out_next
== out
+ t
.expected_out_next
);
1689 assert(char_traits
<char>::compare(out
, exp
, t
.expected_out_next
) == 0);
1690 if (t
.expected_out_next
< array_size(out
))
1691 assert(out
[t
.expected_out_next
] == 0);
1695 template <class InternT
>
1696 void utf32_to_utf16_out_error(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1697 const char32_t input
[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1698 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1699 static_assert(array_size(input
) == 5, "");
1700 static_assert(array_size(expected
) == 6, "");
1702 InternT in
[array_size(input
)];
1703 char exp
[array_size(expected
) * 2];
1704 copy(begin(input
), end(input
), begin(in
));
1705 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
1707 test_offsets_error
<InternT
> offsets
[] = {
1710 {4, 10, 0, 0, 0xD800, 0},
1711 {4, 10, 1, 2, 0xDBFF, 1},
1712 {4, 10, 2, 4, 0xDC00, 2},
1713 {4, 10, 3, 6, 0xDFFF, 3},
1716 {4, 10, 0, 0, 0x00110000, 0},
1717 {4, 10, 1, 2, 0x00110000, 1},
1718 {4, 10, 2, 4, 0x00110000, 2},
1719 {4, 10, 3, 6, 0x00110000, 3}};
1721 for (test_offsets_error
<InternT
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1722 test_offsets_error
<InternT
> t
= *it
;
1723 char out
[array_size(exp
) - 2] = {};
1724 assert(t
.in_size
<= array_size(in
));
1725 assert(t
.out_size
<= array_size(out
));
1726 assert(t
.expected_in_next
<= t
.in_size
);
1727 assert(t
.expected_out_next
<= t
.out_size
);
1728 InternT old_char
= in
[t
.replace_pos
];
1729 in
[t
.replace_pos
] = t
.replace_char
;
1731 mbstate_t state
= {};
1732 const InternT
* in_next
= nullptr;
1733 char* out_next
= nullptr;
1734 codecvt_base::result res
= codecvt_base::ok
;
1736 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1737 assert(res
== cvt
.error
);
1738 assert(in_next
== in
+ t
.expected_in_next
);
1739 assert(out_next
== out
+ t
.expected_out_next
);
1740 assert(char_traits
<char>::compare(out
, exp
, t
.expected_out_next
) == 0);
1741 if (t
.expected_out_next
< array_size(out
))
1742 assert(out
[t
.expected_out_next
] == 0);
1744 in
[t
.replace_pos
] = old_char
;
1748 template <class InternT
>
1749 void test_utf16_utf32_cvt(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1750 utf16_to_utf32_in_ok(cvt
, endianess
);
1751 utf16_to_utf32_in_partial(cvt
, endianess
);
1752 utf16_to_utf32_in_error(cvt
, endianess
);
1753 utf32_to_utf16_out_ok(cvt
, endianess
);
1754 utf32_to_utf16_out_partial(cvt
, endianess
);
1755 utf32_to_utf16_out_error(cvt
, endianess
);
1758 template <class InternT
>
1759 void utf16_to_ucs2_in_ok(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1760 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1761 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
1762 static_assert(array_size(input
) == 4, "");
1763 static_assert(array_size(expected
) == 4, "");
1765 char in
[array_size(input
) * 2];
1766 InternT exp
[array_size(expected
)];
1767 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1768 copy(begin(expected
), end(expected
), begin(exp
));
1770 test_offsets_ok offsets
[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}};
1771 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1772 test_offsets_ok t
= *it
;
1773 InternT out
[array_size(exp
) - 1] = {};
1774 assert(t
.in_size
<= array_size(in
));
1775 assert(t
.out_size
<= array_size(out
));
1776 mbstate_t state
= {};
1777 const char* in_next
= nullptr;
1778 InternT
* out_next
= nullptr;
1779 codecvt_base::result res
= codecvt_base::ok
;
1781 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1782 assert(res
== cvt
.ok
);
1783 assert(in_next
== in
+ t
.in_size
);
1784 assert(out_next
== out
+ t
.out_size
);
1785 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
1786 if (t
.out_size
< array_size(out
))
1787 assert(out
[t
.out_size
] == 0);
1789 state
= mbstate_t();
1790 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1792 assert(static_cast<size_t>(len
) == t
.in_size
);
1795 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1796 test_offsets_ok t
= *it
;
1797 InternT out
[array_size(exp
)] = {};
1798 assert(t
.in_size
<= array_size(in
));
1799 assert(t
.out_size
<= array_size(out
));
1800 mbstate_t state
= {};
1801 const char* in_next
= nullptr;
1802 InternT
* out_next
= nullptr;
1803 codecvt_base::result res
= codecvt_base::ok
;
1805 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, end(out
), out_next
);
1806 assert(res
== cvt
.ok
);
1807 assert(in_next
== in
+ t
.in_size
);
1808 assert(out_next
== out
+ t
.out_size
);
1809 assert(char_traits
<InternT
>::compare(out
, exp
, t
.out_size
) == 0);
1810 if (t
.out_size
< array_size(out
))
1811 assert(out
[t
.out_size
] == 0);
1813 state
= mbstate_t();
1814 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, array_size(out
));
1816 assert(static_cast<size_t>(len
) == t
.in_size
);
1820 template <class InternT
>
1821 void utf16_to_ucs2_in_partial(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1822 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1823 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
1824 static_assert(array_size(input
) == 4, "");
1825 static_assert(array_size(expected
) == 4, "");
1827 char in
[array_size(input
) * 2];
1828 InternT exp
[array_size(expected
)];
1829 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1830 copy(begin(expected
), end(expected
), begin(exp
));
1832 test_offsets_partial offsets
[] = {
1833 {2, 0, 0, 0}, // no space for first CP
1834 {1, 1, 0, 0}, // incomplete first CP
1835 {1, 0, 0, 0}, // incomplete first CP, and no space for it
1837 {4, 1, 2, 1}, // no space for second CP
1838 {3, 2, 2, 1}, // incomplete second CP
1839 {3, 1, 2, 1}, // incomplete second CP, and no space for it
1841 {6, 2, 4, 2}, // no space for third CP
1842 {5, 3, 4, 2}, // incomplete third CP
1843 {5, 2, 4, 2}, // incomplete third CP, and no space for it
1846 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1847 test_offsets_partial t
= *it
;
1848 InternT out
[array_size(exp
) - 1] = {};
1849 assert(t
.in_size
<= array_size(in
));
1850 assert(t
.out_size
<= array_size(out
));
1851 assert(t
.expected_in_next
<= t
.in_size
);
1852 assert(t
.expected_out_next
<= t
.out_size
);
1853 mbstate_t state
= {};
1854 const char* in_next
= nullptr;
1855 InternT
* out_next
= nullptr;
1856 codecvt_base::result res
= codecvt_base::ok
;
1858 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1859 assert(res
== cvt
.partial
);
1860 assert(in_next
== in
+ t
.expected_in_next
);
1861 assert(out_next
== out
+ t
.expected_out_next
);
1862 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1863 if (t
.expected_out_next
< array_size(out
))
1864 assert(out
[t
.expected_out_next
] == 0);
1866 state
= mbstate_t();
1867 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1869 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1873 template <class InternT
>
1874 void utf16_to_ucs2_in_error(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1875 char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1876 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1877 static_assert(array_size(input
) == 6, "");
1878 static_assert(array_size(expected
) == 6, "");
1880 InternT exp
[array_size(expected
)];
1881 copy(begin(expected
), end(expected
), begin(exp
));
1883 // The only possible error in UTF-16 is unpaired surrogate code units.
1884 // Additionally, because the target encoding is UCS-2, a proper pair of
1885 // surrogates is also error. Simply, any surrogate CU is error.
1886 test_offsets_error
<char16_t
> offsets
[] = {
1887 {6, 3, 0, 0, 0xD800, 0},
1888 {6, 3, 0, 0, 0xDBFF, 0},
1889 {6, 3, 0, 0, 0xDC00, 0},
1890 {6, 3, 0, 0, 0xDFFF, 0},
1892 {6, 3, 2, 1, 0xD800, 1},
1893 {6, 3, 2, 1, 0xDBFF, 1},
1894 {6, 3, 2, 1, 0xDC00, 1},
1895 {6, 3, 2, 1, 0xDFFF, 1},
1897 {6, 3, 4, 2, 0xD800, 2},
1898 {6, 3, 4, 2, 0xDBFF, 2},
1899 {6, 3, 4, 2, 0xDC00, 2},
1900 {6, 3, 4, 2, 0xDFFF, 2},
1902 // make the leading surrogate a trailing one
1903 {10, 5, 6, 3, 0xDC00, 3},
1904 {10, 5, 6, 3, 0xDFFF, 3},
1906 // make the trailing surrogate a leading one
1907 {10, 5, 6, 3, 0xD800, 4},
1908 {10, 5, 6, 3, 0xDBFF, 4},
1910 // make the trailing surrogate a BMP char
1911 {10, 5, 6, 3, 'z', 4},
1913 // don't replace anything in the test cases bellow, just show the surrogate
1914 // pair (fourth CP) fully or partially (just the first surrogate)
1915 {10, 5, 6, 3, 'b', 0},
1916 {8, 5, 6, 3, 'b', 0},
1917 {9, 5, 6, 3, 'b', 0},
1919 {10, 4, 6, 3, 'b', 0},
1920 {8, 4, 6, 3, 'b', 0},
1921 {9, 4, 6, 3, 'b', 0},
1924 for (test_offsets_error
<char16_t
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1925 test_offsets_error
<char16_t
> t
= *it
;
1926 char in
[array_size(input
) * 2];
1927 InternT out
[array_size(exp
) - 1] = {};
1928 assert(t
.in_size
<= array_size(in
));
1929 assert(t
.out_size
<= array_size(out
));
1930 assert(t
.expected_in_next
<= t
.in_size
);
1931 assert(t
.expected_out_next
<= t
.out_size
);
1932 char16_t old_char
= input
[t
.replace_pos
];
1933 input
[t
.replace_pos
] = t
.replace_char
; // replace in input, not in in
1934 utf16_to_bytes(begin(input
), end(input
), begin(in
), endianess
);
1936 mbstate_t state
= {};
1937 const char* in_next
= nullptr;
1938 InternT
* out_next
= nullptr;
1939 codecvt_base::result res
= codecvt_base::ok
;
1941 res
= cvt
.in(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1942 assert(res
== cvt
.error
);
1943 assert(in_next
== in
+ t
.expected_in_next
);
1944 assert(out_next
== out
+ t
.expected_out_next
);
1945 assert(char_traits
<InternT
>::compare(out
, exp
, t
.expected_out_next
) == 0);
1946 if (t
.expected_out_next
< array_size(out
))
1947 assert(out
[t
.expected_out_next
] == 0);
1949 state
= mbstate_t();
1950 int len
= cvt
.length(state
, in
, in
+ t
.in_size
, t
.out_size
);
1952 assert(static_cast<size_t>(len
) == t
.expected_in_next
);
1954 input
[t
.replace_pos
] = old_char
;
1958 template <class InternT
>
1959 void ucs2_to_utf16_out_ok(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1960 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1961 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
1962 static_assert(array_size(input
) == 4, "");
1963 static_assert(array_size(expected
) == 4, "");
1965 InternT in
[array_size(input
)];
1966 char exp
[array_size(expected
) * 2];
1967 copy(begin(input
), end(input
), begin(in
));
1968 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
1970 test_offsets_ok offsets
[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}};
1971 for (test_offsets_ok
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
1972 test_offsets_ok t
= *it
;
1973 char out
[array_size(exp
) - 2] = {};
1974 assert(t
.in_size
<= array_size(in
));
1975 assert(t
.out_size
<= array_size(out
));
1976 mbstate_t state
= {};
1977 const InternT
* in_next
= nullptr;
1978 char* out_next
= nullptr;
1979 codecvt_base::result res
= codecvt_base::ok
;
1981 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
1982 assert(res
== cvt
.ok
);
1983 assert(in_next
== in
+ t
.in_size
);
1984 assert(out_next
== out
+ t
.out_size
);
1985 assert(char_traits
<char>::compare(out
, exp
, t
.out_size
) == 0);
1986 if (t
.out_size
< array_size(out
))
1987 assert(out
[t
.out_size
] == 0);
1991 template <class InternT
>
1992 void ucs2_to_utf16_out_partial(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
1993 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0};
1994 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0};
1995 static_assert(array_size(input
) == 4, "");
1996 static_assert(array_size(expected
) == 4, "");
1998 InternT in
[array_size(input
)];
1999 char exp
[array_size(expected
) * 2];
2000 copy(begin(input
), end(input
), begin(in
));
2001 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
2003 test_offsets_partial offsets
[] = {
2004 {1, 0, 0, 0}, // no space for first CP
2005 {1, 1, 0, 0}, // no space for first CP
2007 {2, 2, 1, 2}, // no space for second CP
2008 {2, 3, 1, 2}, // no space for second CP
2010 {3, 4, 2, 4}, // no space for third CP
2011 {3, 5, 2, 4}, // no space for third CP
2013 for (test_offsets_partial
* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
2014 test_offsets_partial t
= *it
;
2015 char out
[array_size(exp
) - 2] = {};
2016 assert(t
.in_size
<= array_size(in
));
2017 assert(t
.out_size
<= array_size(out
));
2018 assert(t
.expected_in_next
<= t
.in_size
);
2019 assert(t
.expected_out_next
<= t
.out_size
);
2020 mbstate_t state
= {};
2021 const InternT
* in_next
= nullptr;
2022 char* out_next
= nullptr;
2023 codecvt_base::result res
= codecvt_base::ok
;
2025 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
2026 assert(res
== cvt
.partial
);
2027 assert(in_next
== in
+ t
.expected_in_next
);
2028 assert(out_next
== out
+ t
.expected_out_next
);
2029 assert(char_traits
<char>::compare(out
, exp
, t
.expected_out_next
) == 0);
2030 if (t
.expected_out_next
< array_size(out
))
2031 assert(out
[t
.expected_out_next
] == 0);
2035 template <class InternT
>
2036 void ucs2_to_utf16_out_error(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
2037 const char16_t input
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2038 const char16_t expected
[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2039 static_assert(array_size(input
) == 6, "");
2040 static_assert(array_size(expected
) == 6, "");
2042 InternT in
[array_size(input
)];
2043 char exp
[array_size(expected
) * 2];
2044 copy(begin(input
), end(input
), begin(in
));
2045 utf16_to_bytes(begin(expected
), end(expected
), begin(exp
), endianess
);
2047 test_offsets_error
<InternT
> offsets
[] = {
2048 {3, 6, 0, 0, 0xD800, 0},
2049 {3, 6, 0, 0, 0xDBFF, 0},
2050 {3, 6, 0, 0, 0xDC00, 0},
2051 {3, 6, 0, 0, 0xDFFF, 0},
2053 {3, 6, 1, 2, 0xD800, 1},
2054 {3, 6, 1, 2, 0xDBFF, 1},
2055 {3, 6, 1, 2, 0xDC00, 1},
2056 {3, 6, 1, 2, 0xDFFF, 1},
2058 {3, 6, 2, 4, 0xD800, 2},
2059 {3, 6, 2, 4, 0xDBFF, 2},
2060 {3, 6, 2, 4, 0xDC00, 2},
2061 {3, 6, 2, 4, 0xDFFF, 2},
2063 // make the leading surrogate a trailing one
2064 {5, 10, 3, 6, 0xDC00, 3},
2065 {5, 10, 3, 6, 0xDFFF, 3},
2067 // make the trailing surrogate a leading one
2068 {5, 10, 3, 6, 0xD800, 4},
2069 {5, 10, 3, 6, 0xDBFF, 4},
2071 // make the trailing surrogate a BMP char
2072 {5, 10, 3, 6, 'z', 4},
2074 // don't replace anything in the test cases bellow, just show the surrogate
2075 // pair (fourth CP) fully or partially (just the first surrogate)
2076 {5, 10, 3, 6, 'b', 0},
2077 {5, 8, 3, 6, 'b', 0},
2078 {5, 9, 3, 6, 'b', 0},
2080 {4, 10, 3, 6, 'b', 0},
2081 {4, 8, 3, 6, 'b', 0},
2082 {4, 9, 3, 6, 'b', 0},
2085 for (test_offsets_error
<InternT
>* it
= begin(offsets
); it
!= end(offsets
); ++it
) {
2086 test_offsets_error
<InternT
> t
= *it
;
2087 char out
[array_size(exp
) - 2] = {};
2088 assert(t
.in_size
<= array_size(in
));
2089 assert(t
.out_size
<= array_size(out
));
2090 assert(t
.expected_in_next
<= t
.in_size
);
2091 assert(t
.expected_out_next
<= t
.out_size
);
2092 InternT old_char
= in
[t
.replace_pos
];
2093 in
[t
.replace_pos
] = t
.replace_char
;
2095 mbstate_t state
= {};
2096 const InternT
* in_next
= nullptr;
2097 char* out_next
= nullptr;
2098 codecvt_base::result res
= codecvt_base::ok
;
2100 res
= cvt
.out(state
, in
, in
+ t
.in_size
, in_next
, out
, out
+ t
.out_size
, out_next
);
2101 assert(res
== cvt
.error
);
2102 assert(in_next
== in
+ t
.expected_in_next
);
2103 assert(out_next
== out
+ t
.expected_out_next
);
2104 assert(char_traits
<char>::compare(out
, exp
, t
.expected_out_next
) == 0);
2105 if (t
.expected_out_next
< array_size(out
))
2106 assert(out
[t
.expected_out_next
] == 0);
2108 in
[t
.replace_pos
] = old_char
;
2112 template <class InternT
>
2113 void test_utf16_ucs2_cvt(const std::codecvt
<InternT
, char, mbstate_t>& cvt
, utf16_endianess endianess
) {
2114 utf16_to_ucs2_in_ok(cvt
, endianess
);
2115 utf16_to_ucs2_in_partial(cvt
, endianess
);
2116 utf16_to_ucs2_in_error(cvt
, endianess
);
2117 ucs2_to_utf16_out_ok(cvt
, endianess
);
2118 ucs2_to_utf16_out_partial(cvt
, endianess
);
2119 ucs2_to_utf16_out_error(cvt
, endianess
);
2123 using std::codecvt_utf16
;
2124 using std::codecvt_utf8
;
2125 using std::codecvt_utf8_utf16
;
2126 using std::has_facet
;
2128 using std::use_facet
;
2130 void test_utf8_utf32_codecvts() {
2131 typedef codecvt
<char32_t
, char, mbstate_t> codecvt_c32
;
2132 const locale
& loc_c
= locale::classic();
2133 assert(has_facet
<codecvt_c32
>(loc_c
));
2135 const codecvt_c32
& cvt
= use_facet
<codecvt_c32
>(loc_c
);
2136 test_utf8_utf32_cvt(cvt
);
2138 codecvt_utf8
<char32_t
> cvt2
;
2139 test_utf8_utf32_cvt(cvt2
);
2141 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2142 codecvt_utf8
<wchar_t> cvt3
;
2143 test_utf8_utf32_cvt(cvt3
);
2146 #ifndef TEST_HAS_NO_CHAR8_T
2147 typedef codecvt
<char32_t
, char8_t
, mbstate_t> codecvt_c32_c8
;
2148 assert(has_facet
<codecvt_c32_c8
>(loc_c
));
2149 const codecvt_c32_c8
& cvt4
= use_facet
<codecvt_c32_c8
>(loc_c
);
2150 test_utf8_utf32_cvt(cvt4
);
2154 void test_utf8_utf16_codecvts() {
2155 typedef codecvt
<char16_t
, char, mbstate_t> codecvt_c16
;
2156 const locale
& loc_c
= locale::classic();
2157 assert(has_facet
<codecvt_c16
>(loc_c
));
2159 const codecvt_c16
& cvt
= use_facet
<codecvt_c16
>(loc_c
);
2160 test_utf8_utf16_cvt(cvt
);
2162 codecvt_utf8_utf16
<char16_t
> cvt2
;
2163 test_utf8_utf16_cvt(cvt2
);
2165 codecvt_utf8_utf16
<char32_t
> cvt3
;
2166 test_utf8_utf16_cvt(cvt3
);
2168 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
2169 codecvt_utf8_utf16
<wchar_t> cvt4
;
2170 test_utf8_utf16_cvt(cvt4
);
2173 #ifndef TEST_HAS_NO_CHAR8_T
2174 typedef codecvt
<char16_t
, char8_t
, mbstate_t> codecvt_c16_c8
;
2175 assert(has_facet
<codecvt_c16_c8
>(loc_c
));
2176 const codecvt_c16_c8
& cvt5
= use_facet
<codecvt_c16_c8
>(loc_c
);
2177 test_utf8_utf16_cvt(cvt5
);
2181 void test_utf8_ucs2_codecvts() {
2182 codecvt_utf8
<char16_t
> cvt
;
2183 test_utf8_ucs2_cvt(cvt
);
2185 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2186 codecvt_utf8
<wchar_t> cvt2
;
2187 test_utf8_ucs2_cvt(cvt2
);
2191 void test_utf16_utf32_codecvts() {
2192 codecvt_utf16
<char32_t
> cvt
;
2193 test_utf16_utf32_cvt(cvt
, utf16_big_endian
);
2195 codecvt_utf16
<char32_t
, 0x10FFFF, std::little_endian
> cvt2
;
2196 test_utf16_utf32_cvt(cvt2
, utf16_little_endian
);
2198 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2199 codecvt_utf16
<wchar_t> cvt3
;
2200 test_utf16_utf32_cvt(cvt3
, utf16_big_endian
);
2202 codecvt_utf16
<wchar_t, 0x10FFFF, std::little_endian
> cvt4
;
2203 test_utf16_utf32_cvt(cvt4
, utf16_little_endian
);
2207 void test_utf16_ucs2_codecvts() {
2208 codecvt_utf16
<char16_t
> cvt
;
2209 test_utf16_ucs2_cvt(cvt
, utf16_big_endian
);
2211 codecvt_utf16
<char16_t
, 0x10FFFF, std::little_endian
> cvt2
;
2212 test_utf16_ucs2_cvt(cvt2
, utf16_little_endian
);
2214 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2215 codecvt_utf16
<wchar_t> cvt3
;
2216 test_utf16_ucs2_cvt(cvt3
, utf16_big_endian
);
2218 codecvt_utf16
<wchar_t, 0x10FFFF, std::little_endian
> cvt4
;
2219 test_utf16_ucs2_cvt(cvt4
, utf16_little_endian
);
2224 test_utf8_utf32_codecvts();
2225 test_utf8_utf16_codecvts();
2226 test_utf8_ucs2_codecvts();
2227 test_utf16_utf32_codecvts();
2228 test_utf16_ucs2_codecvts();