1 // Written in the D programming language.
4 Encode and decode UTF-8, UTF-16 and UTF-32 strings.
6 UTF character support is restricted to
7 $(D '\u0000' <= character <= '\U0010FFFF').
9 $(SCRIPT inhibitQuickIndex = 1;)
12 $(TR $(TH Category) $(TH Functions))
13 $(TR $(TD Decode) $(TD
17 $(TR $(TD Lazy decode) $(TD
24 $(TR $(TD Encode) $(TD
32 $(TR $(TD Length) $(TD
42 $(TR $(TD Validation) $(TD
44 $(LREF isValidCodepoint)
47 $(TR $(TD Miscellaneous) $(TD
48 $(LREF replacementDchar)
49 $(LREF UseReplacementDchar)
54 $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
55 $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
56 $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
57 Copyright: Copyright The D Language Foundation 2000 - 2012.
58 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
59 Authors: $(HTTP digitalmars.com, Walter Bright) and
60 $(HTTP jmdavisprog.com, Jonathan M Davis)
61 Source: $(PHOBOSSRC std/utf.d)
65 import std
.exception
: basicExceptionCtors
;
66 import core
.exception
: UnicodeException
;
67 import std
.meta
: AliasSeq
;
68 import std
.range
.primitives
;
69 import std
.traits
: isAutodecodableString
, isPointer
, isSomeChar
,
70 isSomeString
, isStaticArray
, Unqual
, isConvertibleToString
;
71 import std
.typecons
: Flag
, Yes
, No
;
75 Exception thrown on errors in std.utf functions.
77 class UTFException
: UnicodeException
79 import core
.internal
.string
: unsignedToTempString
, UnsignedStringBuf
;
84 @safe pure nothrow @nogc
85 UTFException
setSequence(scope uint[] data
...) return
87 assert(data
.length
<= 4);
89 len
= data
.length
< 4 ? data
.length
: 4;
90 sequence
[0 .. len
] = data
[0 .. len
];
95 // FIXME: Use std.exception.basicExceptionCtors here once
96 // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed
99 Standard exception constructors.
101 this(string msg
, string file
= __FILE__
, size_t line
= __LINE__
,
102 Throwable next
= null) @nogc @safe pure nothrow
104 super(msg
, 0, file
, line
, next
);
107 this(string msg
, size_t index
, string file
= __FILE__
,
108 size_t line
= __LINE__
, Throwable next
= null) @safe pure nothrow
110 UnsignedStringBuf buf
= void;
111 msg
~= " (at index " ~ unsignedToTempString(index
, buf
) ~ ")";
112 super(msg
, index
, file
, line
, next
);
117 A `string` detailing the invalid UTF sequence.
119 override string
toString() const
123 /* Exception.toString() is not marked as const, although
124 * it is const-compatible.
126 //return super.toString();
127 auto e
= () @trusted { return cast(Exception
) super; } ();
131 string result
= "Invalid UTF sequence:";
133 foreach (i
; sequence
[0 .. len
])
135 UnsignedStringBuf buf
= void;
137 auto h
= unsignedToTempString
!16(i
, buf
);
144 if (super.msg
.length
> 0)
157 import std
.exception
: assertThrown
;
160 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
161 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
162 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
163 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
164 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
168 Provide array of invalidly encoded UTF strings. Useful for testing.
171 Char = char, wchar, or dchar
174 an array of invalidly encoded UTF strings
177 package auto invalidUTFstrings(Char
)() @safe pure @nogc nothrow
180 static if (is(Char
== char))
182 enum x
= 0xDC00; // invalid surrogate value
183 enum y
= 0x110000; // out of range
185 static immutable string
[8] result
=
187 "\x80", // not a start byte
189 "\xC0\xC0", // invalid continuation
190 "\xF0\x82\x82\xAC", // overlong
193 0x80 |
((x
>> 6) & 0x3F),
197 cast(char)(0xF0 |
(y
>> 18)),
198 cast(char)(0x80 |
((y
>> 12) & 0x3F)),
199 cast(char)(0x80 |
((y
>> 6) & 0x3F)),
200 cast(char)(0x80 |
(y
& 0x3F))
203 cast(char)(0xF8 |
3), // 5 byte encoding
204 cast(char)(0x80 |
3),
205 cast(char)(0x80 |
3),
206 cast(char)(0x80 |
3),
207 cast(char)(0x80 |
3),
210 cast(char)(0xFC |
3), // 6 byte encoding
211 cast(char)(0x80 |
3),
212 cast(char)(0x80 |
3),
213 cast(char)(0x80 |
3),
214 cast(char)(0x80 |
3),
215 cast(char)(0x80 |
3),
221 else static if (is(Char
== wchar))
223 static immutable wstring
[5] result
=
246 else static if (is(Char
== dchar))
248 static immutable dstring
[3] result
=
250 [ cast(dchar) 0x110000 ],
251 [ cast(dchar) 0x00D800 ],
252 [ cast(dchar) 0x00DFFF ],
262 Check whether the given Unicode code point is valid.
265 c = code point to check
268 `true` if and only if `c` is a valid Unicode code point
271 `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
272 as they are permitted for internal use by an application, but they are
273 not allowed for interchange by the Unicode standard.
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // Anything below the surrogate block is a valid code point.
    if (c < 0xD800)
        return true;

    // U+D800 .. U+DFFF are surrogates and never valid on their own.
    if (c <= 0xDFFF)
        return false;

    // Above the surrogates, only values up to U+10FFFF are accepted.
    return c <= 0x10FFFF;
}
281 @safe @nogc pure nothrow unittest
283 assert( isValidDchar(cast(dchar) 0x41));
284 assert( isValidDchar(cast(dchar) 0x00));
285 assert(!isValidDchar(cast(dchar) 0xD800));
286 assert(!isValidDchar(cast(dchar) 0x11FFFF));
289 pure nothrow @safe @nogc unittest
291 import std
.exception
;
295 assert( isValidDchar(cast(dchar)'a') == true);
296 assert( isValidDchar(cast(dchar) 0x1FFFFF) == false);
298 assert(!isValidDchar(cast(dchar) 0x00D800));
299 assert(!isValidDchar(cast(dchar) 0x00DBFF));
300 assert(!isValidDchar(cast(dchar) 0x00DC00));
301 assert(!isValidDchar(cast(dchar) 0x00DFFF));
302 assert( isValidDchar(cast(dchar) 0x00FFFE));
303 assert( isValidDchar(cast(dchar) 0x00FFFF));
304 assert( isValidDchar(cast(dchar) 0x01FFFF));
305 assert( isValidDchar(cast(dchar) 0x10FFFF));
306 assert(!isValidDchar(cast(dchar) 0x110000));
311 Checks if a single character forms a valid code point.
313 When standing alone, some characters are invalid code points. For
314 example the `wchar` `0xD800` is a so called high surrogate, which can
315 only be interpreted together with a low surrogate following it. As a
316 standalone character it is considered invalid.
318 See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
319 Unicode Standard, D90, D91 and D92) for more details.
322 c = character to test
323 Char = character type of `c`
326 `true`, if `c` forms a valid code point.
328 bool isValidCodepoint(Char
)(Char c
)
331 alias UChar
= Unqual
!Char
;
332 static if (is(UChar
== char))
336 else static if (is(UChar
== wchar))
338 return c
<= 0xD7FF || c
>= 0xE000;
340 else static if (is(UChar
== dchar))
342 return isValidDchar(c
);
345 static assert(false, "unknown character type: `" ~ Char
.stringof
~ "`");
349 @safe pure nothrow unittest
351 assert( isValidCodepoint(cast(char) 0x40));
352 assert(!isValidCodepoint(cast(char) 0x80));
353 assert( isValidCodepoint(cast(wchar) 0x1234));
354 assert(!isValidCodepoint(cast(wchar) 0xD800));
355 assert( isValidCodepoint(cast(dchar) 0x0010FFFF));
356 assert(!isValidCodepoint(cast(dchar) 0x12345678));
360 Calculate the length of the UTF sequence starting at `index`
364 str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
365 of UTF code units. Must be random access if `index` is passed
366 index = starting index of UTF sequence (default: `0`)
369 The number of code units in the UTF sequence. For UTF-8, this is a
370 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
371 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
374 May throw a `UTFException` if `str[index]` is not the start of a
378 `stride` will only analyze the first `str[index]` element. It
379 will not fully verify the validity of the UTF sequence, nor even verify
380 the presence of the sequence: it will not actually guarantee that
381 $(D index + stride(str, index) <= str.length).
383 uint stride(S
)(auto ref S
str, size_t index
)
384 if (is(S
: const char[]) ||
385 (isRandomAccessRange
!S
&& is(immutable ElementType
!S
== immutable char)))
387 static if (is(typeof(str.length
) : ulong))
388 assert(index
< str.length
, "Past the end of the UTF-8 sequence");
389 immutable c
= str[index
];
394 return strideImpl(c
, index
);
398 uint stride(S
)(auto ref S
str)
399 if (is(S
: const char[]) ||
400 (isInputRange
!S
&& is(immutable ElementType
!S
== immutable char)))
402 static if (is(S
: const char[]))
403 immutable c
= str[0];
405 immutable c
= str.front
;
410 return strideImpl(c
, 0);
415 import core
.exception
: AssertError
;
416 import std
.conv
: to
;
417 import std
.exception
;
418 import std
.string
: format
;
419 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
420 static void test(string s
, dchar c
, size_t i
= 0, size_t line
= __LINE__
)
422 enforce(stride(s
, i
) == codeLength
!char(c
),
423 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
425 enforce(stride(RandomCU
!char(s
), i
) == codeLength
!char(c
),
426 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
428 auto refRandom
= new RefRandomCU
!char(s
);
429 immutable randLen
= refRandom
.length
;
430 enforce(stride(refRandom
, i
) == codeLength
!char(c
),
431 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
432 enforce(refRandom
.length
== randLen
,
433 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
437 enforce(stride(s
) == codeLength
!char(c
),
438 new AssertError(format("Unit test failure string 0: %s", s
), __FILE__
, line
));
440 enforce(stride(InputCU
!char(s
)) == codeLength
!char(c
),
441 new AssertError(format("Unit test failure range 0: %s", s
), __FILE__
, line
));
443 auto refBidir
= new RefBidirCU
!char(s
);
444 immutable bidirLen
= refBidir
.length
;
445 enforce(stride(refBidir
) == codeLength
!char(c
),
446 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
447 enforce(refBidir
.length
== bidirLen
,
448 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
456 test("\u2029", '\u2029'); //paraSep
457 test("\u0100", '\u0100');
458 test("\u0430", '\u0430');
459 test("\U00010143", '\U00010143');
460 test("abcdefcdef", 'a');
461 test("hello\U00010143\u0100\U00010143", 'h', 0);
462 test("hello\U00010143\u0100\U00010143", 'e', 1);
463 test("hello\U00010143\u0100\U00010143", 'l', 2);
464 test("hello\U00010143\u0100\U00010143", 'l', 3);
465 test("hello\U00010143\u0100\U00010143", 'o', 4);
466 test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
467 test("hello\U00010143\u0100\U00010143", '\u0100', 9);
468 test("hello\U00010143\u0100\U00010143", '\U00010143', 11);
470 foreach (S
; AliasSeq
!(char[], const char[], string
))
472 enum str = to
!S("hello world");
473 static assert(isSafe
!({ stride(str, 0); }));
474 static assert(isSafe
!({ stride(str); }));
475 static assert((functionAttributes
!({ stride(str, 0); }) & FunctionAttribute
.pure_
) != 0);
476 static assert((functionAttributes
!({ stride(str); }) & FunctionAttribute
.pure_
) != 0);
481 @safe unittest // invalid start bytes
483 import std
.exception
: assertThrown
;
484 immutable char[] invalidStartBytes
= [
485 0b1111_1000, // indicating a sequence length of 5
489 0b1000_0000, // continuation byte
491 foreach (c
; invalidStartBytes
)
492 assertThrown
!UTFException(stride([c
]));
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // The bounds check is only possible when the source exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    // Only the code unit at `index` is inspected: a value in
    // 0xD800 .. 0xDBFF (high surrogate) yields 2, anything else yields 1.
    immutable uint unit = str[index];
    immutable bool opensPair = unit >= 0xD800 && unit <= 0xDBFF;
    return opensPair ? 2 : 1;
}
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // Forwards to the indexed overload, always starting at index 0.
    enum size_t firstUnit = 0;
    return stride(str, firstUnit);
}
uint stride(S)(auto ref S str)
if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
    !is(S : const wchar[]))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Only the front code unit is inspected; the range is not advanced.
    immutable uint lead = str.front;

    // Anything outside 0xD800 .. 0xDBFF yields a stride of 1.
    if (lead < 0xD800 || lead > 0xDBFF)
        return 1;

    // A high surrogate yields a stride of 2.
    return 2;
}
525 import core
.exception
: AssertError
;
526 import std
.conv
: to
;
527 import std
.exception
;
528 import std
.string
: format
;
529 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
530 static void test(wstring s
, dchar c
, size_t i
= 0, size_t line
= __LINE__
)
532 enforce(stride(s
, i
) == codeLength
!wchar(c
),
533 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
535 enforce(stride(RandomCU
!wchar(s
), i
) == codeLength
!wchar(c
),
536 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
538 auto refRandom
= new RefRandomCU
!wchar(s
);
539 immutable randLen
= refRandom
.length
;
540 enforce(stride(refRandom
, i
) == codeLength
!wchar(c
),
541 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
542 enforce(refRandom
.length
== randLen
,
543 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
547 enforce(stride(s
) == codeLength
!wchar(c
),
548 new AssertError(format("Unit test failure string 0: %s", s
), __FILE__
, line
));
550 enforce(stride(InputCU
!wchar(s
)) == codeLength
!wchar(c
),
551 new AssertError(format("Unit test failure range 0: %s", s
), __FILE__
, line
));
553 auto refBidir
= new RefBidirCU
!wchar(s
);
554 immutable bidirLen
= refBidir
.length
;
555 enforce(stride(refBidir
) == codeLength
!wchar(c
),
556 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
557 enforce(refBidir
.length
== bidirLen
,
558 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
566 test("\u2029", '\u2029'); //paraSep
567 test("\u0100", '\u0100');
568 test("\u0430", '\u0430');
569 test("\U00010143", '\U00010143');
570 test("abcdefcdef", 'a');
571 test("hello\U00010143\u0100\U00010143", 'h', 0);
572 test("hello\U00010143\u0100\U00010143", 'e', 1);
573 test("hello\U00010143\u0100\U00010143", 'l', 2);
574 test("hello\U00010143\u0100\U00010143", 'l', 3);
575 test("hello\U00010143\u0100\U00010143", 'o', 4);
576 test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
577 test("hello\U00010143\u0100\U00010143", '\u0100', 7);
578 test("hello\U00010143\u0100\U00010143", '\U00010143', 8);
580 foreach (S
; AliasSeq
!(wchar[], const wchar[], wstring
))
582 enum str = to
!S("hello world");
583 static assert(isSafe
!(() => stride(str, 0)));
584 static assert(isSafe
!(() => stride(str) ));
585 static assert((functionAttributes
!(() => stride(str, 0)) & FunctionAttribute
.pure_
) != 0);
586 static assert((functionAttributes
!(() => stride(str) ) & FunctionAttribute
.pure_
) != 0);
592 uint stride(S
)(auto ref S
str, size_t index
= 0)
593 if (is(S
: const dchar[]) ||
594 (isInputRange
!S
&& is(immutable ElementEncodingType
!S
== immutable dchar)))
596 static if (is(typeof(str.length
) : ulong))
597 assert(index
< str.length
, "Past the end of the UTF-32 sequence");
599 assert(!str.empty
, "UTF-32 sequence is empty.");
606 assert("a".stride
== 1);
607 assert("λ".stride
== 2);
608 assert("aλ".stride
== 1);
609 assert("aλ".stride(1) == 2);
610 assert("𐐷".stride
== 4);
615 import core
.exception
: AssertError
;
616 import std
.conv
: to
;
617 import std
.exception
;
618 import std
.string
: format
;
619 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
620 static void test(dstring s
, dchar c
, size_t i
= 0, size_t line
= __LINE__
)
622 enforce(stride(s
, i
) == codeLength
!dchar(c
),
623 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
625 enforce(stride(RandomCU
!dchar(s
), i
) == codeLength
!dchar(c
),
626 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
628 auto refRandom
= new RefRandomCU
!dchar(s
);
629 immutable randLen
= refRandom
.length
;
630 enforce(stride(refRandom
, i
) == codeLength
!dchar(c
),
631 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
632 enforce(refRandom
.length
== randLen
,
633 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
637 enforce(stride(s
) == codeLength
!dchar(c
),
638 new AssertError(format("Unit test failure string 0: %s", s
), __FILE__
, line
));
640 enforce(stride(InputCU
!dchar(s
)) == codeLength
!dchar(c
),
641 new AssertError(format("Unit test failure range 0: %s", s
), __FILE__
, line
));
643 auto refBidir
= new RefBidirCU
!dchar(s
);
644 immutable bidirLen
= refBidir
.length
;
645 enforce(stride(refBidir
) == codeLength
!dchar(c
),
646 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
647 enforce(refBidir
.length
== bidirLen
,
648 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
656 test("\u2029", '\u2029'); //paraSep
657 test("\u0100", '\u0100');
658 test("\u0430", '\u0430');
659 test("\U00010143", '\U00010143');
660 test("abcdefcdef", 'a');
661 test("hello\U00010143\u0100\U00010143", 'h', 0);
662 test("hello\U00010143\u0100\U00010143", 'e', 1);
663 test("hello\U00010143\u0100\U00010143", 'l', 2);
664 test("hello\U00010143\u0100\U00010143", 'l', 3);
665 test("hello\U00010143\u0100\U00010143", 'o', 4);
666 test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
667 test("hello\U00010143\u0100\U00010143", '\u0100', 6);
668 test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
670 foreach (S
; AliasSeq
!(dchar[], const dchar[], dstring
))
672 enum str = to
!S("hello world");
673 static assert(isSafe
!(() => stride(str, 0)));
674 static assert(isSafe
!(() => stride(str) ));
675 static assert((functionAttributes
!(() => stride(str, 0)) & FunctionAttribute
.pure_
) != 0);
676 static assert((functionAttributes
!(() => stride(str) ) & FunctionAttribute
.pure_
) != 0);
681 private uint strideImpl(char c
, size_t index
) @trusted pure
682 in { assert(c
& 0x80); }
685 import core
.bitop
: bsr;
686 immutable msbs
= 7 - bsr((~uint(c
)) & 0xFF);
687 if (c
== 0xFF || msbs
< 2 || msbs
> 4)
688 throw new UTFException("Invalid UTF-8 sequence", index
);
693 Calculate the length of the UTF sequence ending one code unit before
697 str = bidirectional range of UTF code units. Must be random access if
699 index = index one past end of UTF sequence (default: `str.length`)
702 The number of code units in the UTF sequence. For UTF-8, this is a
703 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
704 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
707 May throw a `UTFException` if `index` is not one past the
708 end of a valid UTF sequence.
711 `strideBack` will only analyze the element at $(D str[index - 1]).
712 It will not fully verify the validity of the UTF sequence, nor
713 even verify the presence of the sequence: it will not actually
714 guarantee that $(D strideBack(str, index) <= index).
716 uint strideBack(S
)(auto ref S
str, size_t index
)
717 if (is(S
: const char[]) ||
718 (isRandomAccessRange
!S
&& is(immutable ElementType
!S
== immutable char)))
720 static if (is(typeof(str.length
) : ulong))
721 assert(index
<= str.length
, "Past the end of the UTF-8 sequence");
722 assert(index
> 0, "Not the end of the UTF-8 sequence");
724 if ((str[index
-1] & 0b1100_0000) != 0b1000_0000)
727 if (index
>= 4) //single verification for most common case
729 static foreach (i
; 2 .. 5)
731 if ((str[index
-i
] & 0b1100_0000) != 0b1000_0000)
737 static foreach (i
; 2 .. 4)
739 if (index
>= i
&& (str[index
-i
] & 0b1100_0000) != 0b1000_0000)
743 throw new UTFException("Not the end of the UTF sequence", index
);
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char)))
{
    // Forwards to the indexed overload, passing the source's length as the
    // one-past-the-end index.
    immutable pastEnd = str.length;
    return strideBack(str, pastEnd);
}
755 uint strideBack(S
)(auto ref S
str)
756 if (isBidirectionalRange
!S
&& is(immutable ElementType
!S
== immutable char) && !isRandomAccessRange
!S
)
758 assert(!str.empty
, "Past the end of the UTF-8 sequence");
759 auto temp
= str.save
;
760 foreach (i
; AliasSeq
!(1, 2, 3, 4))
762 if ((temp
.back
& 0b1100_0000) != 0b1000_0000)
768 throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
773 import core
.exception
: AssertError
;
774 import std
.conv
: to
;
775 import std
.exception
;
776 import std
.string
: format
;
777 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
778 static void test(string s
, dchar c
, size_t i
= size_t
.max
, size_t line
= __LINE__
)
780 enforce(strideBack(s
, i
== size_t
.max ? s
.length
: i
) == codeLength
!char(c
),
781 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
783 enforce(strideBack(RandomCU
!char(s
), i
== size_t
.max ? s
.length
: i
) == codeLength
!char(c
),
784 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
786 auto refRandom
= new RefRandomCU
!char(s
);
787 immutable randLen
= refRandom
.length
;
788 enforce(strideBack(refRandom
, i
== size_t
.max ? s
.length
: i
) == codeLength
!char(c
),
789 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
790 enforce(refRandom
.length
== randLen
,
791 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
795 enforce(strideBack(s
) == codeLength
!char(c
),
796 new AssertError(format("Unit test failure string code length: %s", s
), __FILE__
, line
));
798 enforce(strideBack(BidirCU
!char(s
)) == codeLength
!char(c
),
799 new AssertError(format("Unit test failure range code length: %s", s
), __FILE__
, line
));
801 auto refBidir
= new RefBidirCU
!char(s
);
802 immutable bidirLen
= refBidir
.length
;
803 enforce(strideBack(refBidir
) == codeLength
!char(c
),
804 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
805 enforce(refBidir
.length
== bidirLen
,
806 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
814 test("\u2029", '\u2029'); //paraSep
815 test("\u0100", '\u0100');
816 test("\u0430", '\u0430');
817 test("\U00010143", '\U00010143');
818 test("abcdefcdef", 'f');
819 test("\U00010143\u0100\U00010143hello", 'o', 15);
820 test("\U00010143\u0100\U00010143hello", 'l', 14);
821 test("\U00010143\u0100\U00010143hello", 'l', 13);
822 test("\U00010143\u0100\U00010143hello", 'e', 12);
823 test("\U00010143\u0100\U00010143hello", 'h', 11);
824 test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
825 test("\U00010143\u0100\U00010143hello", '\u0100', 6);
826 test("\U00010143\u0100\U00010143hello", '\U00010143', 4);
828 foreach (S
; AliasSeq
!(char[], const char[], string
))
830 enum str = to
!S("hello world");
831 static assert(isSafe
!({ strideBack(str, 0); }));
832 static assert(isSafe
!({ strideBack(str); }));
833 static assert((functionAttributes
!({ strideBack(str, 0); }) & FunctionAttribute
.pure_
) != 0);
834 static assert((functionAttributes
!({ strideBack(str); }) & FunctionAttribute
.pure_
) != 0);
839 //UTF-16 is self synchronizing: The length of strideBack can be found from
840 //the value of a single wchar
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // The upper bound can only be checked when the source exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    // Only the code unit just before `index` is examined.
    immutable trailing = str[index - 1];

    // Outside the low-surrogate range 0xDC00 .. 0xDFFF the stride is 1.
    if (trailing < 0xDC00 || trailing >= 0xE000)
        return 1;

    // A low surrogate yields a stride of 2.
    return 2;
}
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Fetch the last code unit: index arrays directly, otherwise use the
    // bidirectional range primitive. The source is not modified.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // Low (trailing) surrogates occupy 0xDC00 .. 0xDFFF inclusive. The upper
    // bound must be exclusive of 0xE000: U+E000 is an ordinary private-use
    // BMP character encoded as a single code unit, so it must yield 1.
    // This matches the indexed overload of strideBack.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}
871 import core
.exception
: AssertError
;
872 import std
.conv
: to
;
873 import std
.exception
;
874 import std
.string
: format
;
875 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
876 static void test(wstring s
, dchar c
, size_t i
= size_t
.max
, size_t line
= __LINE__
)
878 enforce(strideBack(s
, i
== size_t
.max ? s
.length
: i
) == codeLength
!wchar(c
),
879 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
881 enforce(strideBack(RandomCU
!wchar(s
), i
== size_t
.max ? s
.length
: i
) == codeLength
!wchar(c
),
882 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
884 auto refRandom
= new RefRandomCU
!wchar(s
);
885 immutable randLen
= refRandom
.length
;
886 enforce(strideBack(refRandom
, i
== size_t
.max ? s
.length
: i
) == codeLength
!wchar(c
),
887 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
888 enforce(refRandom
.length
== randLen
,
889 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
893 enforce(strideBack(s
) == codeLength
!wchar(c
),
894 new AssertError(format("Unit test failure string code length: %s", s
), __FILE__
, line
));
896 enforce(strideBack(BidirCU
!wchar(s
)) == codeLength
!wchar(c
),
897 new AssertError(format("Unit test failure range code length: %s", s
), __FILE__
, line
));
899 auto refBidir
= new RefBidirCU
!wchar(s
);
900 immutable bidirLen
= refBidir
.length
;
901 enforce(strideBack(refBidir
) == codeLength
!wchar(c
),
902 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
903 enforce(refBidir
.length
== bidirLen
,
904 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
912 test("\u2029", '\u2029'); //paraSep
913 test("\u0100", '\u0100');
914 test("\u0430", '\u0430');
915 test("\U00010143", '\U00010143');
916 test("abcdefcdef", 'f');
917 test("\U00010143\u0100\U00010143hello", 'o', 10);
918 test("\U00010143\u0100\U00010143hello", 'l', 9);
919 test("\U00010143\u0100\U00010143hello", 'l', 8);
920 test("\U00010143\u0100\U00010143hello", 'e', 7);
921 test("\U00010143\u0100\U00010143hello", 'h', 6);
922 test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
923 test("\U00010143\u0100\U00010143hello", '\u0100', 3);
924 test("\U00010143\u0100\U00010143hello", '\U00010143', 2);
926 foreach (S
; AliasSeq
!(wchar[], const wchar[], wstring
))
928 enum str = to
!S("hello world");
929 static assert(isSafe
!(() => strideBack(str, 0)));
930 static assert(isSafe
!(() => strideBack(str) ));
931 static assert((functionAttributes
!(() => strideBack(str, 0)) & FunctionAttribute
.pure_
) != 0);
932 static assert((functionAttributes
!(() => strideBack(str) ) & FunctionAttribute
.pure_
) != 0);
938 uint strideBack(S
)(auto ref S
str, size_t index
)
939 if (isRandomAccessRange
!S
&& is(immutable ElementEncodingType
!S
== immutable dchar))
941 static if (is(typeof(str.length
) : ulong))
942 assert(index
<= str.length
, "Past the end of the UTF-32 sequence");
943 assert(index
> 0, "Not the end of the UTF-32 sequence");
948 uint strideBack(S
)(auto ref S
str)
949 if (isBidirectionalRange
!S
&& is(immutable ElementEncodingType
!S
== immutable dchar))
951 assert(!str.empty
, "Empty UTF-32 sequence");
958 assert("a".strideBack
== 1);
959 assert("λ".strideBack
== 2);
960 assert("aλ".strideBack
== 2);
961 assert("aλ".strideBack(1) == 1);
962 assert("𐐷".strideBack
== 4);
967 import core
.exception
: AssertError
;
968 import std
.conv
: to
;
969 import std
.exception
;
970 import std
.string
: format
;
971 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
972 static void test(dstring s
, dchar c
, size_t i
= size_t
.max
, size_t line
= __LINE__
)
974 enforce(strideBack(s
, i
== size_t
.max ? s
.length
: i
) == codeLength
!dchar(c
),
975 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
977 enforce(strideBack(RandomCU
!dchar(s
), i
== size_t
.max ? s
.length
: i
) == codeLength
!dchar(c
),
978 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
980 auto refRandom
= new RefRandomCU
!dchar(s
);
981 immutable randLen
= refRandom
.length
;
982 enforce(strideBack(refRandom
, i
== size_t
.max ? s
.length
: i
) == codeLength
!dchar(c
),
983 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
984 enforce(refRandom
.length
== randLen
,
985 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
989 enforce(strideBack(s
) == codeLength
!dchar(c
),
990 new AssertError(format("Unit test failure string code length: %s", s
), __FILE__
, line
));
992 enforce(strideBack(BidirCU
!dchar(s
)) == codeLength
!dchar(c
),
993 new AssertError(format("Unit test failure range code length: %s", s
), __FILE__
, line
));
995 auto refBidir
= new RefBidirCU
!dchar(s
);
996 immutable bidirLen
= refBidir
.length
;
997 enforce(strideBack(refBidir
) == codeLength
!dchar(c
),
998 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
999 enforce(refBidir
.length
== bidirLen
,
1000 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
1008 test("\u2029", '\u2029'); //paraSep
1009 test("\u0100", '\u0100');
1010 test("\u0430", '\u0430');
1011 test("\U00010143", '\U00010143');
1012 test("abcdefcdef", 'f');
1013 test("\U00010143\u0100\U00010143hello", 'o', 8);
1014 test("\U00010143\u0100\U00010143hello", 'l', 7);
1015 test("\U00010143\u0100\U00010143hello", 'l', 6);
1016 test("\U00010143\u0100\U00010143hello", 'e', 5);
1017 test("\U00010143\u0100\U00010143hello", 'h', 4);
1018 test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
1019 test("\U00010143\u0100\U00010143hello", '\u0100', 2);
1020 test("\U00010143\u0100\U00010143hello", '\U00010143', 1);
1022 foreach (S
; AliasSeq
!(dchar[], const dchar[], dstring
))
1024 enum str = to
!S("hello world");
1025 static assert(isSafe
!(() => strideBack(str, 0)));
1026 static assert(isSafe
!(() => strideBack(str) ));
1027 static assert((functionAttributes
!(() => strideBack(str, 0)) & FunctionAttribute
.pure_
) != 0);
1028 static assert((functionAttributes
!(() => strideBack(str) ) & FunctionAttribute
.pure_
) != 0);
1035 Given `index` into `str` and assuming that `index` is at the start
1036 of a UTF sequence, `toUCSindex` determines the number of UCS characters
1037 up to `index`. So, `index` is the index of a code unit at the
1038 beginning of a code point, and the return value is how many code points into
1039 the string that that code point is.
1041 size_t
toUCSindex(C
)(const(C
)[] str, size_t index
) @safe pure
1044 static if (is(immutable C
== immutable dchar))
1051 for (; j
< index
; ++n
)
1052 j
+= stride(str, j
);
1056 static if (is(immutable C
== immutable char))
1057 throw new UTFException("Invalid UTF-8 sequence", index
);
1059 throw new UTFException("Invalid UTF-16 sequence", index
);
1069 assert(toUCSindex(`hello world`, 7) == 7);
1070 assert(toUCSindex(`hello world`w
, 7) == 7);
1071 assert(toUCSindex(`hello world`d
, 7) == 7);
1073 assert(toUCSindex(`Ma Chérie`, 7) == 6);
1074 assert(toUCSindex(`Ma Chérie`w
, 7) == 7);
1075 assert(toUCSindex(`Ma Chérie`d
, 7) == 7);
1077 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
1078 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w
, 9) == 9);
1079 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d
, 9) == 9);
1084 Given a UCS index `n` into `str`, returns the UTF index.
1085 So, `n` is how many code points into the string the code point is, and
1086 the array index of the code unit is returned.
1088 size_t
toUTFindex(C
)(const(C
)[] str, size_t n
) @safe pure
1091 static if (is(immutable C
== immutable dchar))
1100 i
+= stride(str, i
);
1109 assert(toUTFindex(`hello world`, 7) == 7);
1110 assert(toUTFindex(`hello world`w
, 7) == 7);
1111 assert(toUTFindex(`hello world`d
, 7) == 7);
1113 assert(toUTFindex(`Ma Chérie`, 6) == 7);
1114 assert(toUTFindex(`Ma Chérie`w
, 7) == 7);
1115 assert(toUTFindex(`Ma Chérie`d
, 7) == 7);
1117 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9);
1118 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w
, 9) == 9);
1119 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d
, 9) == 9);
1123 /* =================== Decode ======================= */
1125 /// Whether or not to replace invalid UTF with $(LREF replacementDchar)
1126 alias UseReplacementDchar
= Flag
!"useReplacementDchar";
1129 Decodes and returns the code point starting at `str[index]`. `index`
1130 is advanced to one past the decoded code point. If the code point is not
1131 well-formed, then a `UTFException` is thrown and `index` remains
1134 decode will only work with strings and random access ranges of code units
1135 with length and slicing, whereas $(LREF decodeFront) will work with any
1136 input range of code units.
1139 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
1140 str = input string or indexable Range
1141 index = starting index into `str`; incremented by the number of code units processed
1147 $(LREF UTFException) if `str[index]` is not the start of a valid UTF
1148 sequence and useReplacementDchar is `No.useReplacementDchar`
1150 dchar decode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(auto ref S
str, ref size_t index
)
1151 if (!isSomeString
!S
&&
1152 isRandomAccessRange
!S
&& hasSlicing
!S
&& hasLength
!S
&& isSomeChar
!(ElementType
!S
))
1155 assert(index
< str.length
, "Attempted to decode past the end of a string");
1159 assert(isValidDchar(result
));
1163 if (str[index
] < codeUnitLimit
!S
)
1164 return str[index
++];
1166 return decodeImpl
!(true, useReplacementDchar
)(str, index
);
1170 dchar decode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1171 auto ref S
str, ref size_t index
) @trusted pure
1175 assert(index
< str.length
, "Attempted to decode past the end of a string");
1179 assert(isValidDchar(result
));
1183 if (str[index
] < codeUnitLimit
!S
)
1184 return str[index
++];
1185 else static if (is(immutable S
== immutable C
[], C
))
1186 return decodeImpl
!(true, useReplacementDchar
)(cast(const(C
)[]) str, index
);
1194 assert("a".decode(i
) == 'a' && i
== 1);
1196 assert("å".decode(i
) == 'å' && i
== 2);
1198 assert("aå".decode(i
) == 'å' && i
== 3);
1200 assert("å"w
.decode(i
) == 'å' && i
== 1);
1202 // ë as a multi-code point grapheme
1204 assert("e\u0308".decode(i
) == 'e' && i
== 1);
1205 // ë as a single code point grapheme
1207 assert("ë".decode(i
) == 'ë' && i
== 2);
1209 assert("ë"w
.decode(i
) == 'ë' && i
== 1);
1213 `decodeFront` is a variant of $(LREF decode) which specifically decodes
1214 the first code point. Unlike $(LREF decode), `decodeFront` accepts any
1215 $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
1216 of code units (rather than just a string or random access
1217 range). It also takes the range by `ref` and pops off the elements as it
1218 decodes them. If `numCodeUnits` is passed in, it gets set to the number
1219 of code units which were in the code point which was decoded.
1222 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
1223 str = input string or input range of code units
1224 numCodeUnits = set to number of code units processed
1230 $(LREF UTFException) if `str.front` is not the start of a valid UTF
1231 sequence. If an exception is thrown, then there is no guarantee as to
1232 the number of code units which were popped off, as it depends on the
1233 type of range being used and how many code units had to be popped off
1234 before the code point was determined to be invalid.
1236 dchar decodeFront(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1237 ref S
str, out size_t numCodeUnits
)
1238 if (!isSomeString
!S
&& isInputRange
!S
&& isSomeChar
!(ElementType
!S
))
1245 assert(isValidDchar(result
));
1249 immutable fst = str.front
;
1251 if (fst < codeUnitLimit
!S
)
1259 // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
1260 // done outside of decodeImpl, which is undesirable, since not all
1261 // overloads of decodeImpl need it. So, it should be moved back into
1262 // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521
1264 enum canIndex
= is(S
: const char[]) || isRandomAccessRange
!S
&& hasSlicing
!S
&& hasLength
!S
;
1265 immutable retval
= decodeImpl
!(canIndex
, useReplacementDchar
)(str, numCodeUnits
);
1267 // The other range types were already popped by decodeImpl.
1268 static if (isRandomAccessRange
!S
&& hasSlicing
!S
&& hasLength
!S
)
1269 str = str[numCodeUnits
.. str.length
];
1276 dchar decodeFront(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1277 ref S
str, out size_t numCodeUnits
) @trusted pure
1285 assert(isValidDchar(result
));
1289 if (str[0] < codeUnitLimit
!S
)
1292 immutable retval
= str[0];
1296 else static if (is(immutable S
== immutable C
[], C
))
1298 immutable retval
= decodeImpl
!(true, useReplacementDchar
)(cast(const(C
)[]) str, numCodeUnits
);
1299 str = str[numCodeUnits
.. $];
1305 dchar decodeFront(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(ref S
str)
1306 if (isInputRange
!S
&& isSomeChar
!(ElementType
!S
))
1308 size_t numCodeUnits
;
1309 return decodeFront
!useReplacementDchar(str, numCodeUnits
);
1315 import std
.range
.primitives
;
1316 string
str = "Hello, World!";
1318 assert(str.decodeFront
== 'H' && str == "ello, World!");
1320 assert(str.decodeFront
== 'å' && str.empty
);
1323 assert(str.decodeFront(i
) == 'å' && i
== 2 && str.empty
);
1327 `decodeBack` is a variant of $(LREF decode) which specifically decodes
1328 the last code point. Unlike $(LREF decode), `decodeBack` accepts any
1329 bidirectional range of code units (rather than just a string or random access
1330 range). It also takes the range by `ref` and pops off the elements as it
1331 decodes them. If `numCodeUnits` is passed in, it gets set to the number
1332 of code units which were in the code point which was decoded.
1335 useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
1336 str = input string or bidirectional Range
1337 numCodeUnits = gives the number of code units processed
1340 A decoded UTF character.
1343 $(LREF UTFException) if `str.back` is not the end of a valid UTF
1344 sequence. If an exception is thrown, the `str` itself remains unchanged,
1345 but there is no guarantee as to the value of `numCodeUnits` (when passed).
1347 dchar decodeBack(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1348 ref S
str, out size_t numCodeUnits
)
1356 assert(isValidDchar(result
));
1360 if (str[$ - 1] < codeUnitLimit
!S
)
1363 immutable retval
= str[$ - 1];
1364 str = str[0 .. $ - 1];
1367 else static if (is(immutable S
== immutable C
[], C
))
1369 numCodeUnits
= strideBack(str);
1370 immutable newLength
= str.length
- numCodeUnits
;
1371 size_t index
= newLength
;
1372 immutable retval
= decodeImpl
!(true, useReplacementDchar
)(cast(const(C
)[]) str, index
);
1373 str = str[0 .. newLength
];
1379 dchar decodeBack(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1380 ref S
str, out size_t numCodeUnits
)
1381 if (!isSomeString
!S
&& isSomeChar
!(ElementType
!S
) && isBidirectionalRange
!S
1382 && ((isRandomAccessRange
!S
&& hasLength
!S
) ||
!isRandomAccessRange
!S
))
1389 assert(isValidDchar(result
));
1393 if (str.back
< codeUnitLimit
!S
)
1396 immutable retval
= str.back
;
1402 numCodeUnits
= strideBack(str);
1403 static if (isRandomAccessRange
!S
)
1405 size_t index
= str.length
- numCodeUnits
;
1406 immutable retval
= decodeImpl
!(true, useReplacementDchar
)(str, index
);
1407 str.popBackExactly(numCodeUnits
);
1412 alias Char
= Unqual
!(ElementType
!S
);
1415 for (size_t i
= numCodeUnits
; i
> 0; )
1417 codeUnits
[--i
] = tmp
.back
;
1420 const Char
[] codePoint
= codeUnits
[0 .. numCodeUnits
];
1422 immutable retval
= decodeImpl
!(true, useReplacementDchar
)(codePoint
, index
);
1430 dchar decodeBack(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(ref S
str)
1432 ||
(isRandomAccessRange
!S
&& hasLength
!S
&& isSomeChar
!(ElementType
!S
))
1433 ||
(!isRandomAccessRange
!S
&& isBidirectionalRange
!S
&& isSomeChar
!(ElementType
!S
)))
1440 assert(isValidDchar(result
));
1444 size_t numCodeUnits
;
1445 return decodeBack
!useReplacementDchar(str, numCodeUnits
);
1449 @system pure unittest
1451 import std
.range
.primitives
;
1452 string
str = "Hello, World!";
1454 assert(str.decodeBack
== '!' && str == "Hello, World");
1456 assert(str.decodeBack
== 'å' && str.empty
);
1459 assert(str.decodeBack(i
) == 'å' && i
== 2 && str.empty
);
// Exclusive upper bound on code unit values that, for the given range type,
// are guaranteed to be valid single-code-unit encodings of a code point.
// The constant has the same character type as the range's code units:
// 0x80 for char, 0xD800 for wchar and for dchar.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    // Qualifier-normalised code unit type, so that mutable, const and
    // immutable element types all select the same branch.
    private alias Unit = immutable(ElementEncodingType!S);

    static if (is(Unit == immutable char))
    {
        enum char codeUnitLimit = 0x80;
    }
    else static if (is(Unit == immutable wchar))
    {
        enum wchar codeUnitLimit = 0xD800;
    }
    else
    {
        enum dchar codeUnitLimit = 0xD800;
    }
}
1476 * For strings, this function does its own bounds checking to give a
1477 * more useful error message when attempting to decode past the end of a string.
1478 * Subsequently it uses a pointer instead of an array to avoid
1479 * redundant bounds checking.
1481 * The three overloads of this operate on chars, wchars, and dchars.
1484 * canIndex = if S is indexable
1485 * useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
1486 * str = input string or Range
1487 * index = starting index into `str`; incremented by the number of code units processed
1492 private dchar decodeImpl(bool canIndex
, UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1493 auto ref S
str, ref size_t index
)
1495 is(S
: const char[]) ||
(isInputRange
!S
&& is(immutable ElementEncodingType
!S
== immutable char)))
1497 /* The following encodings are valid, except for the 5 and 6 byte
1501 * 1110xxxx 10xxxxxx 10xxxxxx
1502 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1503 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1504 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1507 /* Dchar bitmask for different numbers of UTF-8 code units.
1509 alias bitMask
= AliasSeq
!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);
1511 static if (is(S
: const char[]))
1512 auto pstr
= str.ptr
+ index
; // this is what makes decodeImpl() @system code
1513 else static if (isRandomAccessRange
!S
&& hasSlicing
!S
&& hasLength
!S
)
1514 auto pstr
= str[index
.. str.length
];
1518 // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
1519 // outside of decodeImpl
1520 //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);
1522 static if (canIndex
)
1524 immutable length
= str.length
- index
;
1525 ubyte fst = pstr
[0];
1529 ubyte fst = pstr
.front
;
1533 static if (!useReplacementDchar
)
1535 static if (canIndex
)
1537 static UTFException
exception(S
)(S
str, string msg
)
1539 uint[4] sequence
= void;
1544 sequence
[i
] = str[i
];
1545 } while (++i
< str.length
&& i
< 4 && (str[i
] & 0xC0) == 0x80);
1547 return new UTFException(msg
, i
).setSequence(sequence
[0 .. i
]);
1551 UTFException
invalidUTF()
1553 static if (canIndex
)
1554 return exception(pstr
[0 .. length
], "Invalid UTF-8 sequence");
1557 //We can't include the invalid sequence with input strings without
1558 //saving each of the code units along the way, and we can't do it with
1559 //forward ranges without saving the entire range. Both would incur a
1560 //cost for the decoding of every character just to provide a better
1561 //error message for the (hopefully) rare case when an invalid UTF-8
1562 //sequence is encountered, so we don't bother trying to include the
1563 //invalid sequence here, unlike with strings and sliceable ranges.
1564 return new UTFException("Invalid UTF-8 sequence");
1568 UTFException
outOfBounds()
1570 static if (canIndex
)
1571 return exception(pstr
[0 .. length
], "Attempted to decode past the end of a string");
1573 return new UTFException("Attempted to decode past the end of a string");
1577 if ((fst & 0b1100_0000) != 0b1100_0000)
1579 static if (useReplacementDchar
)
1581 ++index
; // always consume bad input to avoid infinite loops
1582 return replacementDchar
;
1585 throw invalidUTF(); // starter must have at least 2 first bits set
1588 dchar d
= fst; // upper control bits are masked out later
1591 foreach (i
; AliasSeq
!(1, 2, 3))
1594 static if (canIndex
)
1598 static if (useReplacementDchar
)
1601 return replacementDchar
;
1604 throw outOfBounds();
1611 static if (useReplacementDchar
)
1614 return replacementDchar
;
1617 throw outOfBounds();
1621 static if (canIndex
)
1629 if ((tmp
& 0xC0) != 0x80)
1631 static if (useReplacementDchar
)
1634 return replacementDchar
;
1640 d
= (d
<< 6) |
(tmp
& 0x3F);
1643 if (!(fst & 0x80)) // no more bytes
1645 d
&= bitMask
[i
]; // mask out control bits
1647 // overlong, could have been encoded with i bytes
1648 if ((d
& ~bitMask
[i
- 1]) == 0)
1650 static if (useReplacementDchar
)
1653 return replacementDchar
;
1659 // check for surrogates only needed for 3 bytes
1662 if (!isValidDchar(d
))
1664 static if (useReplacementDchar
)
1667 return replacementDchar
;
1679 static if (useReplacementDchar
)
1680 d
= replacementDchar
;
1689 static if (useReplacementDchar
)
1691 index
+= 4; // read 4 chars by now
1692 return replacementDchar
;
1698 @safe pure @nogc nothrow
1701 // Add tests for useReplacementDchar == yes path
1705 @safe pure @nogc nothrow:
1706 this(string s
) { this.s
= s
; }
1707 @property bool empty() { return idx
== s
.length
; }
1708 @property char front() { return s
[idx
]; }
1709 void popFront() { ++idx
; }
1714 foreach (s
; invalidUTFstrings
!char())
1718 dchar dc
= decodeImpl
!(false, Yes
.useReplacementDchar
)(r
, index
);
1719 assert(dc
== replacementDchar
);
1720 assert(1 <= index
&& index
<= s
.length
);
1724 private dchar decodeImpl(bool canIndex
, UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)
1725 (auto ref S
str, ref size_t index
)
1726 if (is(S
: const wchar[]) ||
(isInputRange
!S
&& is(immutable ElementEncodingType
!S
== immutable wchar)))
1728 static if (is(S
: const wchar[]))
1729 auto pstr
= str.ptr
+ index
;
1730 else static if (isRandomAccessRange
!S
&& hasSlicing
!S
&& hasLength
!S
)
1731 auto pstr
= str[index
.. str.length
];
1735 // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
1736 // outside of decodeImpl
1737 //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);
1739 static if (canIndex
)
1741 immutable length
= str.length
- index
;
1746 uint u
= pstr
.front
;
1750 static if (!useReplacementDchar
)
1752 UTFException
exception(string msg
)
1754 static if (canIndex
)
1755 return new UTFException(msg
).setSequence(pstr
[0]);
1757 return new UTFException(msg
);
1761 // The < case must be taken care of before decodeImpl is called.
1762 assert(u
>= 0xD800);
1766 static if (canIndex
)
1767 immutable onlyOneCodeUnit
= length
== 1;
1769 immutable onlyOneCodeUnit
= pstr
.empty
;
1771 if (onlyOneCodeUnit
)
1773 static if (useReplacementDchar
)
1776 return replacementDchar
;
1779 throw exception("surrogate UTF-16 high value past end of string");
1782 static if (canIndex
)
1783 immutable uint u2
= pstr
[1];
1786 immutable uint u2
= pstr
.front
;
1790 if (u2
< 0xDC00 || u2
> 0xDFFF)
1792 static if (useReplacementDchar
)
1793 u
= replacementDchar
;
1795 throw exception("surrogate UTF-16 low value out of range");
1798 u
= ((u
- 0xD7C0) << 10) + (u2
- 0xDC00);
1801 else if (u
>= 0xDC00 && u
<= 0xDFFF)
1803 static if (useReplacementDchar
)
1804 u
= replacementDchar
;
1806 throw exception("unpaired surrogate UTF-16 value");
1810 // Note: u+FFFE and u+FFFF are specifically permitted by the
1811 // Unicode standard for application internal use (see isValidDchar)
1813 return cast(dchar) u
;
1816 @safe pure @nogc nothrow
1819 // Add tests for useReplacementDchar == true path
1823 @safe pure @nogc nothrow:
1824 this(wstring s
) { this.s
= s
; }
1825 @property bool empty() { return idx
== s
.length
; }
1826 @property wchar front() { return s
[idx
]; }
1827 void popFront() { ++idx
; }
1832 foreach (s
; invalidUTFstrings
!wchar())
1836 dchar dc
= decodeImpl
!(false, Yes
.useReplacementDchar
)(r
, index
);
1837 assert(dc
== replacementDchar
);
1838 assert(1 <= index
&& index
<= s
.length
);
1842 private dchar decodeImpl(bool canIndex
, UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1843 auto ref S
str, ref size_t index
)
1844 if (is(S
: const dchar[]) ||
(isInputRange
!S
&& is(immutable ElementEncodingType
!S
== immutable dchar)))
1846 static if (is(S
: const dchar[]))
1847 auto pstr
= str.ptr
;
1851 static if (is(S
: const dchar[]) || isRandomAccessRange
!S
)
1853 dchar dc
= pstr
[index
];
1854 if (!isValidDchar(dc
))
1856 static if (useReplacementDchar
)
1857 dc
= replacementDchar
;
1859 throw new UTFException("Invalid UTF-32 value").setSequence(dc
);
1866 dchar dc
= pstr
.front
;
1867 if (!isValidDchar(dc
))
1869 static if (useReplacementDchar
)
1870 dc
= replacementDchar
;
1872 throw new UTFException("Invalid UTF-32 value").setSequence(dc
);
1880 @safe pure @nogc nothrow
1883 // Add tests for useReplacementDchar == true path
1887 @safe pure @nogc nothrow:
1888 this(dstring s
) { this.s
= s
; }
1889 @property bool empty() { return idx
== s
.length
; }
1890 @property dchar front() { return s
[idx
]; }
1891 void popFront() { ++idx
; }
1896 foreach (s
; invalidUTFstrings
!dchar())
1900 dchar dc
= decodeImpl
!(false, Yes
.useReplacementDchar
)(r
, index
);
1901 assert(dc
== replacementDchar
);
1902 assert(1 <= index
&& index
<= s
.length
);
1907 version (StdUnittest
) private void testDecode(R
)(R range
,
1910 size_t expectedIndex
,
1911 size_t line
= __LINE__
)
1913 import core
.exception
: AssertError
;
1914 import std
.exception
: enforce
;
1915 import std
.string
: format
;
1916 import std
.traits
: isNarrowString
;
1918 static if (hasLength
!R
)
1919 immutable lenBefore
= range
.length
;
1921 static if (isRandomAccessRange
!R
&& !isNarrowString
!R
)
1924 immutable result
= decode(range
, index
);
1925 enforce(result
== expectedChar
,
1926 new AssertError(format("decode: Wrong character: %s", result
), __FILE__
, line
));
1927 enforce(index
== expectedIndex
,
1928 new AssertError(format("decode: Wrong index: %s", index
), __FILE__
, line
));
1929 static if (hasLength
!R
)
1931 enforce(range
.length
== lenBefore
,
1932 new AssertError(format("decode: length changed: %s", range
.length
), __FILE__
, line
));
1938 version (StdUnittest
) private void testDecodeFront(R
)(ref R range
,
1940 size_t expectedNumCodeUnits
,
1941 size_t line
= __LINE__
)
1943 import core
.exception
: AssertError
;
1944 import std
.exception
: enforce
;
1945 import std
.string
: format
;
1947 static if (hasLength
!R
)
1948 immutable lenBefore
= range
.length
;
1950 size_t numCodeUnits
;
1951 immutable result
= decodeFront(range
, numCodeUnits
);
1952 enforce(result
== expectedChar
,
1953 new AssertError(format("decodeFront: Wrong character: %s", result
), __FILE__
, line
));
1954 enforce(numCodeUnits
== expectedNumCodeUnits
,
1955 new AssertError(format("decodeFront: Wrong numCodeUnits: %s", numCodeUnits
), __FILE__
, line
));
1957 static if (hasLength
!R
)
1959 enforce(range
.length
== lenBefore
- numCodeUnits
,
1960 new AssertError(format("decodeFront: wrong length: %s", range
.length
), __FILE__
, line
));
1964 version (StdUnittest
) private void testDecodeBack(R
)(ref R range
,
1966 size_t expectedNumCodeUnits
,
1967 size_t line
= __LINE__
)
1969 // This condition is to allow unit testing all `decode` functions together
1970 static if (!isBidirectionalRange
!R
)
1974 import core
.exception
: AssertError
;
1975 import std
.exception
: enforce
;
1976 import std
.string
: format
;
1978 static if (hasLength
!R
)
1979 immutable lenBefore
= range
.length
;
1981 size_t numCodeUnits
;
1982 immutable result
= decodeBack(range
, numCodeUnits
);
1983 enforce(result
== expectedChar
,
1984 new AssertError(format("decodeBack: Wrong character: %s", result
), __FILE__
, line
));
1985 enforce(numCodeUnits
== expectedNumCodeUnits
,
1986 new AssertError(format("decodeBack: Wrong numCodeUnits: %s", numCodeUnits
), __FILE__
, line
));
1988 static if (hasLength
!R
)
1990 enforce(range
.length
== lenBefore
- numCodeUnits
,
1991 new AssertError(format("decodeBack: wrong length: %s", range
.length
), __FILE__
, line
));
1996 version (StdUnittest
) private void testAllDecode(R
)(R range
,
1998 size_t expectedIndex
,
1999 size_t line
= __LINE__
)
2001 testDecode(range
, 0, expectedChar
, expectedIndex
, line
);
2002 static if (isBidirectionalRange
!R
)
2004 auto rangeCopy
= range
.save
;
2005 testDecodeBack(rangeCopy
, expectedChar
, expectedIndex
, line
);
2007 testDecodeFront(range
, expectedChar
, expectedIndex
, line
);
// Unit-test helper: checks that decoding `range` at `index` is rejected.
//
// For random access ranges, `decode` must throw a UTFException, leave `index`
// untouched and (when the range has a length) leave the length unchanged.
// When `index` is 0, `decodeFront` must throw as well.
//
// Params:
//     range = code unit range expected to hold invalid UTF at `index`
//     index = code unit index at which decoding is attempted
//     line  = caller's line number, reported in failure messages
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The message needs a format specifier for its argument; without
            // one, `format` itself throws on the orphan argument and the
            // intended AssertError is never produced.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
// Unit-test helper: checks that `decodeBack` rejects the end of `range`.
//
// Does nothing for ranges that are not bidirectional. For random access
// ranges, `decodeBack` must throw a UTFException and (when the range has a
// length) leave the length unchanged.
//
// Params:
//     range = code unit range expected to end in invalid UTF
//     line  = caller's line number, reported in failure messages
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The message needs a format specifier for its argument;
                // without one, `format` itself throws on the orphan argument
                // and the intended AssertError is never produced.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
2065 import std
.conv
: to
;
2066 import std
.exception
;
2070 foreach (S
; AliasSeq
!(to
!string
, InputCU
!char, RandomCU
!char,
2071 (string s
) => new RefBidirCU
!char(s
),
2072 (string s
) => new RefRandomCU
!char(s
)))
2074 enum sHasLength
= hasLength
!(typeof(S("abcd")));
2077 auto range
= S("abcd");
2078 testDecode(range
, 0, 'a', 1);
2079 testDecode(range
, 1, 'b', 2);
2080 testDecodeFront(range
, 'a', 1);
2081 testDecodeFront(range
, 'b', 1);
2082 assert(decodeFront(range
) == 'c');
2083 assert(decodeFront(range
) == 'd');
2087 auto range
= S("ウェブサイト");
2088 testDecode(range
, 0, 'ウ', 3);
2089 testDecode(range
, 3, 'ェ', 6);
2090 testDecodeFront(range
, 'ウ', 3);
2091 testDecodeFront(range
, 'ェ', 3);
2092 assert(decodeFront(range
) == 'ブ');
2093 assert(decodeFront(range
) == 'サ');
2097 auto range
= S("abcd");
2098 testDecodeBack(range
, 'd', 1);
2099 testDecodeBack(range
, 'c', 1);
2100 testDecodeBack(range
, 'b', 1);
2101 testDecodeBack(range
, 'a', 1);
2105 auto range
= S("ウェブサイト");
2106 testDecodeBack(range
, 'ト', 3);
2107 testDecodeBack(range
, 'イ', 3);
2108 testDecodeBack(range
, 'サ', 3);
2109 testDecodeBack(range
, 'ブ', 3);
2112 testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
2113 testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);
2115 foreach (str; ["\xE2\x89", // too short
2119 "\xF8\x80\x80\x80\x8A",
2120 "\xFC\x80\x80\x80\x80\x8A"])
2122 testBadDecode(S(str), 0);
2123 testBadDecode(S(str), 1);
2124 testBadDecodeBack(S(str));
2127 //Invalid UTF-8 sequence where the first code unit is valid.
2128 testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
2129 testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);
2131 //Invalid UTF-8 sequence where the first code unit isn't valid.
2132 foreach (str; ["\xED\xA0\x80",
2140 testBadDecode(S(str), 0);
2141 testBadDecodeBack(S(str));
2149 import std
.exception
;
2152 foreach (S
; AliasSeq
!((wstring s
) => s
, InputCU
!wchar, RandomCU
!wchar,
2153 (wstring s
) => new RefBidirCU
!wchar(s
),
2154 (wstring s
) => new RefRandomCU
!wchar(s
)))
2156 testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
2157 testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
2158 testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
2159 testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
2160 testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);
2162 testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
2163 testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);
2165 testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
2166 testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));
2169 auto range
= S("ウェブサイト");
2170 testDecode(range
, 0, 'ウ', 1);
2171 testDecode(range
, 1, 'ェ', 2);
2172 testDecodeFront(range
, 'ウ', 1);
2173 testDecodeFront(range
, 'ェ', 1);
2174 assert(decodeFront(range
) == 'ブ');
2175 assert(decodeFront(range
) == 'サ');
2179 auto range
= S("ウェブサイト");
2180 testDecodeBack(range
, 'ト', 1);
2181 testDecodeBack(range
, 'イ', 1);
2182 testDecodeBack(range
, 'サ', 1);
2183 testDecodeBack(range
, 'ブ', 1);
2187 foreach (S
; AliasSeq
!((wchar[] s
) => s
.idup
, RandomCU
!wchar, (wstring s
) => new RefRandomCU
!wchar(s
)))
2189 auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
2191 cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
2192 testDecode(str, 0, cast(dchar) 0x10000, 2);
2193 testDecode(str, 2, cast(dchar) 0x1400, 3);
2194 testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
2195 testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
2196 testDecodeBack(str, cast(dchar) 0x1400, 1);
2197 testDecodeBack(str, cast(dchar) 0x10000, 2);
2204 import std
.exception
;
2207 foreach (S
; AliasSeq
!((dstring s
) => s
, RandomCU
!dchar, InputCU
!dchar,
2208 (dstring s
) => new RefBidirCU
!dchar(s
),
2209 (dstring s
) => new RefRandomCU
!dchar(s
)))
2211 testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
2212 testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
2213 testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
2214 testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
2215 testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);
2217 testBadDecode(S([cast(dchar) 0xD800]), 0);
2218 testBadDecode(S([cast(dchar) 0xDFFE]), 0);
2219 testBadDecode(S([cast(dchar) 0x110000]), 0);
2221 testBadDecodeBack(S([cast(dchar) 0xD800]));
2222 testBadDecodeBack(S([cast(dchar) 0xDFFE]));
2223 testBadDecodeBack(S([cast(dchar) 0x110000]));
2226 auto range
= S("ウェブサイト");
2227 testDecode(range
, 0, 'ウ', 1);
2228 testDecode(range
, 1, 'ェ', 2);
2229 testDecodeFront(range
, 'ウ', 1);
2230 testDecodeFront(range
, 'ェ', 1);
2231 assert(decodeFront(range
) == 'ブ');
2232 assert(decodeFront(range
) == 'サ');
2236 auto range
= S("ウェブサイト");
2237 testDecodeBack(range
, 'ト', 1);
2238 testDecodeBack(range
, 'イ', 1);
2239 testDecodeBack(range
, 'サ', 1);
2240 testDecodeBack(range
, 'ブ', 1);
2244 foreach (S
; AliasSeq
!((dchar[] s
) => s
.idup
, RandomCU
!dchar, (dstring s
) => new RefRandomCU
!dchar(s
)))
2246 auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
2247 testDecode(str, 0, 0x10000, 1);
2248 testDecode(str, 1, 0x1400, 2);
2249 testDecode(str, 2, 0xB9DDE, 3);
2250 testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
2251 testDecodeBack(str, cast(dchar) 0x1400, 1);
2252 testDecodeBack(str, cast(dchar) 0x10000, 1);
2259 import std
.exception
;
2260 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
2263 foreach (S
; AliasSeq
!( char[], const( char)[], string
,
2264 wchar[], const(wchar)[], wstring
,
2265 dchar[], const(dchar)[], dstring
))
2267 static assert(isSafe
!({ S
str; size_t i
= 0; decode(str, i
); }));
2268 static assert(isSafe
!({ S
str; size_t i
= 0; decodeFront(str, i
); }));
2269 static assert(isSafe
!({ S
str; decodeFront(str); }));
2270 static assert((functionAttributes
!({ S
str; size_t i
= 0; decode(str, i
); }) & FunctionAttribute
.pure_
) != 0);
2271 static assert((functionAttributes
!({
2272 S
str; size_t i
= 0; decodeFront(str, i
);
2273 }) & FunctionAttribute
.pure_
) != 0);
2274 static assert((functionAttributes
!({ S
str; decodeFront(str); }) & FunctionAttribute
.pure_
) != 0);
2275 static assert((functionAttributes
!({
2276 S
str; size_t i
= 0; decodeBack(str, i
);
2277 }) & FunctionAttribute
.pure_
) != 0);
2278 static assert((functionAttributes
!({ S
str; decodeBack(str); }) & FunctionAttribute
.pure_
) != 0);
2285 import std
.exception
;
2287 val
[0] = 0b1111_0111;
2288 val
[1] = 0b1011_1111;
2289 val
[2] = 0b1011_1111;
2290 val
[3] = 0b1011_1111;
2292 assertThrown
!UTFException((){ dchar ch
= decode(val
[], i
); }());
2294 /* =================== Encode ======================= */
// Shared failure path for the encoders. Depending on the compile-time flag it
// either yields `replacementDchar` for the caller to encode instead, or
// throws a UTFException carrying `msg` with `c` recorded as the offending
// sequence.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (useReplacementDchar)
    {
        return replacementDchar;
    }
    else
    {
        auto failure = new UTFException(msg);
        throw failure.setSequence(c);
    }
}
2305 Encodes `c` into the static array, `buf`, and returns the actual
2306 length of the encoded character (a number between `1` and `4` for
2307 `char[4]` buffers and a number between `1` and `2` for
2308 `wchar[2]` buffers).
2311 `UTFException` if `c` is not a valid UTF code point.
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // One code unit: 0xxxxxxx
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }

    // Two code units: 110xxxxx 10xxxxxx
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    // Four code units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        return 4;
    }

    // What is left is either the three-unit range or a value beyond
    // U+10FFFF. Both failure paths go through _utfException, which throws or
    // hands back the replacement character; whatever `c` holds afterwards is
    // written out as three code units.
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
    }

    // Three code units: 1110xxxx 10xxxxxx 10xxxxxx
    buf[0] = cast(char)(0xE0 | (c >> 12));
    buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[2] = cast(char)(0x80 | (c & 0x3F));
    return 3;
}
2359 import std
.exception
: assertThrown
;
2360 import std
.typecons
: Yes
;
2364 assert(encode(buf
, '\u0000') == 1 && buf
[0 .. 1] == "\u0000");
2365 assert(encode(buf
, '\u007F') == 1 && buf
[0 .. 1] == "\u007F");
2366 assert(encode(buf
, '\u0080') == 2 && buf
[0 .. 2] == "\u0080");
2367 assert(encode(buf
, '\uE000') == 3 && buf
[0 .. 3] == "\uE000");
2368 assert(encode(buf
, 0xFFFE) == 3 && buf
[0 .. 3] == "\xEF\xBF\xBE");
2369 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2371 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2373 assert(slice
.decodeFront
== replacementDchar
);
2379 import std
.exception
: assertThrown
;
2380 import std
.typecons
: Yes
;
2384 assert(encode(buf
, '\u0000') == 1 && buf
[0 .. 1] == "\u0000");
2385 assert(encode(buf
, '\uD7FF') == 1 && buf
[0 .. 1] == "\uD7FF");
2386 assert(encode(buf
, '\uE000') == 1 && buf
[0 .. 1] == "\uE000");
2387 assert(encode(buf
, '\U00010000') == 2 && buf
[0 .. 2] == "\U00010000");
2388 assert(encode(buf
, '\U0010FFFF') == 2 && buf
[0 .. 2] == "\U0010FFFF");
2389 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2391 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2393 assert(slice
.decodeFront
== replacementDchar
);
2399 import std
.exception
: assertThrown
;
2400 import std
.typecons
: Yes
;
2404 assert(encode(buf
, '\u0000') == 1 && buf
[0] == '\u0000');
2405 assert(encode(buf
, '\uD7FF') == 1 && buf
[0] == '\uD7FF');
2406 assert(encode(buf
, '\uE000') == 1 && buf
[0] == '\uE000');
2407 assert(encode(buf
, '\U0010FFFF') == 1 && buf
[0] == '\U0010FFFF');
2408 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2410 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2411 assert(buf
[0] == replacementDchar
);
2416 import std
.exception
;
2421 assert(encode(buf
, '\u0000') == 1 && buf
[0 .. 1] == "\u0000");
2422 assert(encode(buf
, '\u007F') == 1 && buf
[0 .. 1] == "\u007F");
2423 assert(encode(buf
, '\u0080') == 2 && buf
[0 .. 2] == "\u0080");
2424 assert(encode(buf
, '\u07FF') == 2 && buf
[0 .. 2] == "\u07FF");
2425 assert(encode(buf
, '\u0800') == 3 && buf
[0 .. 3] == "\u0800");
2426 assert(encode(buf
, '\uD7FF') == 3 && buf
[0 .. 3] == "\uD7FF");
2427 assert(encode(buf
, '\uE000') == 3 && buf
[0 .. 3] == "\uE000");
2428 assert(encode(buf
, 0xFFFE) == 3 && buf
[0 .. 3] == "\xEF\xBF\xBE");
2429 assert(encode(buf
, 0xFFFF) == 3 && buf
[0 .. 3] == "\xEF\xBF\xBF");
2430 assert(encode(buf
, '\U00010000') == 4 && buf
[0 .. 4] == "\U00010000");
2431 assert(encode(buf
, '\U0010FFFF') == 4 && buf
[0 .. 4] == "\U0010FFFF");
2433 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2434 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2435 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2436 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2437 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2439 assert(encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000) == buf
.stride
);
2440 enum replacementDcharString
= "\uFFFD";
2441 assert(buf
[0 .. replacementDcharString
.length
] == replacementDcharString
);
2447 size_t
encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2448 out wchar[2] buf
, dchar c
) @safe pure
2452 if (0xD800 <= c
&& c
<= 0xDFFF)
2453 c
= _utfException
!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c
);
2455 assert(isValidDchar(c
));
2457 buf
[0] = cast(wchar) c
;
2462 assert(isValidDchar(c
));
2463 buf
[0] = cast(wchar)((((c
- 0x10000) >> 10) & 0x3FF) + 0xD800);
2464 buf
[1] = cast(wchar)(((c
- 0x10000) & 0x3FF) + 0xDC00);
2468 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-16", c
);
2474 import std
.exception
;
2479 assert(encode(buf
, '\u0000') == 1 && buf
[0 .. 1] == "\u0000");
2480 assert(encode(buf
, '\uD7FF') == 1 && buf
[0 .. 1] == "\uD7FF");
2481 assert(encode(buf
, '\uE000') == 1 && buf
[0 .. 1] == "\uE000");
2482 assert(encode(buf
, 0xFFFE) == 1 && buf
[0] == 0xFFFE);
2483 assert(encode(buf
, 0xFFFF) == 1 && buf
[0] == 0xFFFF);
2484 assert(encode(buf
, '\U00010000') == 2 && buf
[0 .. 2] == "\U00010000");
2485 assert(encode(buf
, '\U0010FFFF') == 2 && buf
[0 .. 2] == "\U0010FFFF");
2487 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2488 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2489 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2490 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2491 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2493 assert(encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000) == buf
.stride
);
2494 assert(buf
.front
== replacementDchar
);
2500 size_t
encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2501 out dchar[1] buf
, dchar c
) @safe pure
2503 if ((0xD800 <= c
&& c
<= 0xDFFF) ||
0x10FFFF < c
)
2504 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-32", c
);
2506 assert(isValidDchar(c
));
2513 import std
.exception
;
2518 encode(buf
, '\u0000'); assert(buf
[0] == '\u0000');
2519 encode(buf
, '\uD7FF'); assert(buf
[0] == '\uD7FF');
2520 encode(buf
, '\uE000'); assert(buf
[0] == '\uE000');
2521 encode(buf
, 0xFFFE ); assert(buf
[0] == 0xFFFE);
2522 encode(buf
, 0xFFFF ); assert(buf
[0] == 0xFFFF);
2523 encode(buf
, '\U0010FFFF'); assert(buf
[0] == '\U0010FFFF');
2525 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2526 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2527 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2528 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2529 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2531 assert(encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000) == buf
.stride
);
2532 assert(buf
.front
== replacementDchar
);
2538 Encodes `c` in `str`'s encoding and appends it to `str`.
2541 `UTFException` if `c` is not a valid UTF code point.
2543 void encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2544 ref char[] str, dchar c
) @safe pure
2550 assert(isValidDchar(c
));
2560 assert(isValidDchar(c
));
2561 buf
[0] = cast(char)(0xC0 |
(c
>> 6));
2562 buf
[1] = cast(char)(0x80 |
(c
& 0x3F));
2565 else if (c
<= 0xFFFF)
2567 if (0xD800 <= c
&& c
<= 0xDFFF)
2568 c
= _utfException
!useReplacementDchar("Encoding a surrogate code point in UTF-8", c
);
2570 assert(isValidDchar(c
));
2572 buf
[0] = cast(char)(0xE0 |
(c
>> 12));
2573 buf
[1] = cast(char)(0x80 |
((c
>> 6) & 0x3F));
2574 buf
[2] = cast(char)(0x80 |
(c
& 0x3F));
2577 else if (c
<= 0x10FFFF)
2579 assert(isValidDchar(c
));
2580 buf
[0] = cast(char)(0xF0 |
(c
>> 18));
2581 buf
[1] = cast(char)(0x80 |
((c
>> 12) & 0x3F));
2582 buf
[2] = cast(char)(0x80 |
((c
>> 6) & 0x3F));
2583 buf
[3] = cast(char)(0x80 |
(c
& 0x3F));
2588 assert(!isValidDchar(c
));
2589 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-8", c
);
2600 char[] s
= "abcd".dup
;
2605 assert(s
.length
== 5);
2606 assert(s
== "abcda");
2608 assert(s
.length
== 7);
2609 assert(s
== "abcdaø");
2614 import std
.exception
;
2618 char[] s
= "abcd".dup
;
2619 encode(s
, cast(dchar)'a');
2620 assert(s
.length
== 5);
2621 assert(s
== "abcda");
2623 encode(s
, cast(dchar)'\u00A9');
2624 assert(s
.length
== 7);
2625 assert(s
== "abcda\xC2\xA9");
2626 //assert(s == "abcda\u00A9"); // BUG: fix compiler
2628 encode(s
, cast(dchar)'\u2260');
2629 assert(s
.length
== 10);
2630 assert(s
== "abcda\xC2\xA9\xE2\x89\xA0");
2636 import std
.exception
;
2641 encode(buf
, '\u0000'); assert(buf
[0 .. $] == "\u0000");
2642 encode(buf
, '\u007F'); assert(buf
[1 .. $] == "\u007F");
2643 encode(buf
, '\u0080'); assert(buf
[2 .. $] == "\u0080");
2644 encode(buf
, '\u07FF'); assert(buf
[4 .. $] == "\u07FF");
2645 encode(buf
, '\u0800'); assert(buf
[6 .. $] == "\u0800");
2646 encode(buf
, '\uD7FF'); assert(buf
[9 .. $] == "\uD7FF");
2647 encode(buf
, '\uE000'); assert(buf
[12 .. $] == "\uE000");
2648 encode(buf
, 0xFFFE); assert(buf
[15 .. $] == "\xEF\xBF\xBE");
2649 encode(buf
, 0xFFFF); assert(buf
[18 .. $] == "\xEF\xBF\xBF");
2650 encode(buf
, '\U00010000'); assert(buf
[21 .. $] == "\U00010000");
2651 encode(buf
, '\U0010FFFF'); assert(buf
[25 .. $] == "\U0010FFFF");
2653 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2654 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2655 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2656 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2657 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2659 enum replacementDcharString
= "\uFFFD";
2660 enum rdcslen
= replacementDcharString
.length
;
2661 assert(buf
[$ - rdcslen
.. $] != replacementDcharString
);
2662 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2663 assert(buf
[$ - rdcslen
.. $] == replacementDcharString
);
2668 void encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2669 ref wchar[] str, dchar c
) @safe pure
2675 if (0xD800 <= c
&& c
<= 0xDFFF)
2676 c
= _utfException
!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c
);
2678 assert(isValidDchar(c
));
2682 else if (c
<= 0x10FFFF)
2686 assert(isValidDchar(c
));
2687 buf
[0] = cast(wchar)((((c
- 0x10000) >> 10) & 0x3FF) + 0xD800);
2688 buf
[1] = cast(wchar)(((c
- 0x10000) & 0x3FF) + 0xDC00);
2693 assert(!isValidDchar(c
));
2694 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-16", c
);
2703 import std
.exception
;
2708 encode(buf
, '\u0000'); assert(buf
[0] == '\u0000');
2709 encode(buf
, '\uD7FF'); assert(buf
[1] == '\uD7FF');
2710 encode(buf
, '\uE000'); assert(buf
[2] == '\uE000');
2711 encode(buf
, 0xFFFE); assert(buf
[3] == 0xFFFE);
2712 encode(buf
, 0xFFFF); assert(buf
[4] == 0xFFFF);
2713 encode(buf
, '\U00010000'); assert(buf
[5 .. $] == "\U00010000");
2714 encode(buf
, '\U0010FFFF'); assert(buf
[7 .. $] == "\U0010FFFF");
2716 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2717 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2718 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2719 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2720 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2722 assert(buf
.back
!= replacementDchar
);
2723 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2724 assert(buf
.back
== replacementDchar
);
2729 void encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2730 ref dchar[] str, dchar c
) @safe pure
2732 if ((0xD800 <= c
&& c
<= 0xDFFF) ||
0x10FFFF < c
)
2733 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-32", c
);
2735 assert(isValidDchar(c
));
2741 import std
.exception
;
2746 encode(buf
, '\u0000'); assert(buf
[0] == '\u0000');
2747 encode(buf
, '\uD7FF'); assert(buf
[1] == '\uD7FF');
2748 encode(buf
, '\uE000'); assert(buf
[2] == '\uE000');
2749 encode(buf
, 0xFFFE ); assert(buf
[3] == 0xFFFE);
2750 encode(buf
, 0xFFFF ); assert(buf
[4] == 0xFFFF);
2751 encode(buf
, '\U0010FFFF'); assert(buf
[5] == '\U0010FFFF');
2753 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2754 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2755 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2756 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2757 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2759 assert(buf
.back
!= replacementDchar
);
2760 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2761 assert(buf
.back
== replacementDchar
);
2767 Returns the number of code units that are required to encode the code point
2768 `c` when `C` is the character type used to encode it.
2770 ubyte codeLength(C
)(dchar c
) @safe pure nothrow @nogc
2773 static if (C
.sizeof
== 1)
2775 if (c
<= 0x7F) return 1;
2776 if (c
<= 0x7FF) return 2;
2777 if (c
<= 0xFFFF) return 3;
2778 if (c
<= 0x10FFFF) return 4;
2781 else static if (C
.sizeof
== 2)
2783 return c
<= 0xFFFF ?
1 : 2;
2787 static assert(C
.sizeof
== 4);
2793 @safe pure nothrow @nogc unittest
2795 assert(codeLength
!char('a') == 1);
2796 assert(codeLength
!wchar('a') == 1);
2797 assert(codeLength
!dchar('a') == 1);
2799 assert(codeLength
!char('\U0010FFFF') == 4);
2800 assert(codeLength
!wchar('\U0010FFFF') == 2);
2801 assert(codeLength
!dchar('\U0010FFFF') == 1);
2806 Returns the number of code units that are required to encode `input`
2807 in a string whose character type is `C`. This is particularly useful
2808 when slicing one string with the length of another and the two string
2809 types use different character types.
2812 C = the character type to get the encoding length for
2813 input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
2814 to calculate the encoding length from
2816 The number of code units in `input` when encoded to `C`
2818 size_t
codeLength(C
, InputRange
)(InputRange input
)
2819 if (isInputRange
!InputRange
&& !isInfinite
!InputRange
&& isSomeChar
!(ElementType
!InputRange
))
2821 alias EncType
= Unqual
!(ElementEncodingType
!InputRange
);
2822 static if (isSomeString
!InputRange
&& is(EncType
== C
) && is(typeof(input
.length
)))
2823 return input
.length
;
2828 foreach (c
; input
.byDchar
)
2829 total
+= codeLength
!C(c
);
2838 assert(codeLength
!char("hello world") ==
2839 "hello world".length
);
2840 assert(codeLength
!wchar("hello world") ==
2841 "hello world"w
.length
);
2842 assert(codeLength
!dchar("hello world") ==
2843 "hello world"d
.length
);
2845 assert(codeLength
!char(`プログラミング`) ==
2847 assert(codeLength
!wchar(`プログラミング`) ==
2849 assert(codeLength
!dchar(`プログラミング`) ==
2852 string haystack
= `Être sans la verité, ça, ce ne serait pas bien.`;
2853 wstring needle
= `Être sans la verité`;
2854 assert(haystack
[codeLength
!char(needle
) .. $] ==
2855 `, ça, ce ne serait pas bien.`);
2860 import std
.algorithm
.iteration
: filter
;
2861 import std
.conv
: to
;
2862 import std
.exception
;
2866 foreach (S
; AliasSeq
!( char[], const char[], string
,
2867 wchar[], const wchar[], wstring
,
2868 dchar[], const dchar[], dstring
))
2870 foreach (C
; AliasSeq
!(char, wchar, dchar))
2872 assert(codeLength
!C(to
!S("Walter Bright")) == to
!(C
[])("Walter Bright").length
);
2873 assert(codeLength
!C(to
!S(`言語`)) == to
!(C
[])(`言語`).length
);
2874 assert(codeLength
!C(to
!S(`ウェブサイト@La_Verité.com`)) ==
2875 to
!(C
[])(`ウェブサイト@La_Verité.com`).length
);
2876 assert(codeLength
!C(to
!S(`ウェブサイト@La_Verité.com`).filter
!(x
=> true)()) ==
2877 to
!(C
[])(`ウェブサイト@La_Verité.com`).length
);
2884 Internal helper function:
2886 Returns true if it is safe to search for the Codepoint `c` inside
2887 code units, without decoding.
2889 This is a runtime check that is used as an optimization in various functions,
2890 particularly, in `std.string`.
2892 package bool canSearchInCodeUnits(C
)(dchar c
)
2895 static if (C
.sizeof
== 1)
2897 else static if (C
.sizeof
== 2)
2898 return c
<= 0xD7FF ||
(0xE000 <= c
&& c
<= 0xFFFF);
2899 else static if (C
.sizeof
== 4)
2906 assert( canSearchInCodeUnits
! char('a'));
2907 assert( canSearchInCodeUnits
!wchar('a'));
2908 assert( canSearchInCodeUnits
!dchar('a'));
2909 assert(!canSearchInCodeUnits
! char('ö')); //Important test: ö <= 0xFF
2910 assert(!canSearchInCodeUnits
! char(cast(char)'ö')); //Important test: ö <= 0xFF
2911 assert( canSearchInCodeUnits
!wchar('ö'));
2912 assert( canSearchInCodeUnits
!dchar('ö'));
2913 assert(!canSearchInCodeUnits
! char('日'));
2914 assert( canSearchInCodeUnits
!wchar('日'));
2915 assert( canSearchInCodeUnits
!dchar('日'));
2916 assert(!canSearchInCodeUnits
!wchar(cast(wchar) 0xDA00));
2917 assert( canSearchInCodeUnits
!dchar(cast(dchar) 0xDA00));
2918 assert(!canSearchInCodeUnits
! char('\U00010001'));
2919 assert(!canSearchInCodeUnits
!wchar('\U00010001'));
2920 assert( canSearchInCodeUnits
!dchar('\U00010001'));
2923 /* =================== Validation ======================= */
2926 Checks to see if `str` is well-formed unicode or not.
2929 `UTFException` if `str` is not well-formed.
2931 void validate(S
)(in S
str) @safe pure
2934 immutable len
= str.length
;
2935 for (size_t i
= 0; i
< len
; )
2944 import std
.exception
: assertThrown
;
2945 char[] a
= [167, 133, 175];
2946 assertThrown
!UTFException(validate(a
));
2949 // https://issues.dlang.org/show_bug.cgi?id=12923
2952 import std
.exception
;
2954 char[3]a
=[167, 133, 175];
2960 * Encodes the elements of `s` to UTF-8 and returns a newly allocated
2961 * string of the elements.
2964 * s = the range to encode
2968 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
2970 string
toUTF8(S
)(S s
)
2971 if (isInputRange
!S
&& !isInfinite
!S
&& isSomeChar
!(ElementEncodingType
!S
))
2973 return toUTFImpl
!string(s
);
2979 import std
.algorithm
.comparison
: equal
;
2981 // The ø is represented by two UTF-8 code units (0xC3 0xB8)
2982 assert("Hellø"w
.toUTF8
.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
2984 // 𐐷 is four code units in UTF-8
2985 assert("𐐷"d
.toUTF8
.equal([0xF0, 0x90, 0x90, 0xB7]));
2988 @system pure unittest
2990 import std
.algorithm
.comparison
: equal
;
2991 import std
.internal
.test.dummyrange
: ReferenceInputRange
;
2993 alias RT
= ReferenceInputRange
!(ElementType
!(string
));
2994 auto r1
= new RT("Hellø");
2995 auto r2
= new RT("𐐷");
2997 assert(r1
.toUTF8
.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
2998 assert(r2
.toUTF8
.equal([0xF0, 0x90, 0x90, 0xB7]));
3002 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
3003 * `wstring` of the elements.
3006 * s = the range to encode
3010 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
3012 wstring
toUTF16(S
)(S s
)
3013 if (isInputRange
!S
&& !isInfinite
!S
&& isSomeChar
!(ElementEncodingType
!S
))
3015 return toUTFImpl
!wstring(s
);
3021 import std
.algorithm
.comparison
: equal
;
3023 // these code points are two code units in UTF-16 and one in UTF-32
3024 assert("𤭢"d
.length
== 1);
3025 assert("𐐷"d
.length
== 1);
3027 assert("𤭢"d
.toUTF16
.equal([0xD852, 0xDF62]));
3028 assert("𐐷"d
.toUTF16
.equal([0xD801, 0xDC37]));
3031 @system pure unittest
3033 import std
.algorithm
.comparison
: equal
;
3034 import std
.internal
.test.dummyrange
: ReferenceInputRange
;
3036 alias RT
= ReferenceInputRange
!(ElementType
!(string
));
3037 auto r1
= new RT("𤭢");
3038 auto r2
= new RT("𐐷");
3040 assert(r1
.toUTF16
.equal([0xD852, 0xDF62]));
3041 assert(r2
.toUTF16
.equal([0xD801, 0xDC37]));
3046 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
3047 * `dstring` of the elements.
3050 * s = the range to encode
3054 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
3056 dstring
toUTF32(S
)(scope S s
)
3057 if (isInputRange
!S
&& !isInfinite
!S
&& isSomeChar
!(ElementEncodingType
!S
))
3059 return toUTFImpl
!dstring(s
);
3065 import std
.algorithm
.comparison
: equal
;
3067 // these code points are two code units in UTF-16 and one in UTF-32
3068 assert("𤭢"w
.length
== 2);
3069 assert("𐐷"w
.length
== 2);
3071 assert("𤭢"w
.toUTF32
.equal([0x00024B62]));
3072 assert("𐐷"w
.toUTF32
.equal([0x00010437]));
3075 private T
toUTFImpl(T
, S
)(scope S s
)
3077 static if (is(S
: T
))
3083 import std
.array
: appender
;
3084 auto app
= appender
!T();
3086 static if (is(S
== C
[], C
) || hasLength
!S
)
3087 app
.reserve(s
.length
);
3089 foreach (c
; s
.byUTF
!(Unqual
!(ElementEncodingType
!T
)))
3096 /* =================== toUTFz ======================= */
3099 Returns a C-style zero-terminated string equivalent to `str`. `str`
3100 must not contain embedded `'\0'`'s as any C function will treat the first
3101 `'\0'` that it sees as the end of the string. If `str.empty` is
3102 `true`, then a string containing only `'\0'` is returned.
3104 `toUTFz` accepts any type of string and is templated on the type of
3105 character pointer that you wish to convert to. It will avoid allocating a
3106 new string if it can, but there's a decent chance that it will end up having
3107 to allocate a new string - particularly when dealing with character types
3110 $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then if
3111 anything alters the character one past the end of `str` (which is the
3112 `'\0'` character terminating the string), then the string won't be
3113 zero-terminated anymore. The most likely scenarios for that are if you
3114 append to `str` and no reallocation takes place or when `str` is a
3115 slice of a larger array, and you alter the character in the larger array
3116 which is one character past the end of `str`. Another case where it could
3117 occur would be if you had a mutable character array immediately after
3118 `str` in memory (for example, if they're member variables in a
3119 user-defined type with one declared right after the other) and that
3120 character array happened to start with `'\0'`. Such scenarios will never
3121 occur if you immediately use the zero-terminated string after calling
3122 `toUTFz` and the C function using it doesn't keep a reference to it.
3123 Also, they are unlikely to occur even if you save the zero-terminated string
3124 (the cases above would be among the few examples of where it could happen).
3125 However, if you save the zero-terminated string and want to be absolutely
3126 certain that the string stays zero-terminated, then simply append a
3127 `'\0'` to the string and use its `ptr` property rather than calling
3130 $(RED Warning 2:) When passing a character pointer to a C function, and the
3131 C function keeps it around for any reason, make sure that you keep a
3132 reference to it in your D code. Otherwise, it may go away during a garbage
3133 collection cycle and cause a nasty bug when the C code tries to use it.
3136 if (isPointer
!P
&& isSomeChar
!(typeof(*P
.init
)))
3138 P
toUTFz(S
)(S
str) @safe pure
3141 return toUTFzImpl
!(P
, S
)(str);
3148 auto p1
= toUTFz
!(char*)("hello world");
3149 auto p2
= toUTFz
!(const(char)*)("hello world");
3150 auto p3
= toUTFz
!(immutable(char)*)("hello world");
3151 auto p4
= toUTFz
!(char*)("hello world"d
);
3152 auto p5
= toUTFz
!(const(wchar)*)("hello world");
3153 auto p6
= toUTFz
!(immutable(dchar)*)("hello world"w
);
3156 private P
toUTFzImpl(P
, S
)(return scope S
str) @safe pure
3157 if (is(immutable typeof(*P
.init
) == typeof(str[0])))
3158 //immutable(C)[] -> C*, const(C)*, or immutable(C)*
3162 typeof(*P
.init
)[] retval
= ['\0'];
3164 auto trustedPtr() @trusted { return retval
.ptr
; }
3165 return trustedPtr();
3168 alias C
= Unqual
!(ElementEncodingType
!S
);
3170 //If the P is mutable, then we have to make a copy.
3171 static if (is(Unqual
!(typeof(*P
.init
)) == typeof(*P
.init
)))
3173 return toUTFzImpl
!(P
, const(C
)[])(cast(const(C
)[])str);
3179 auto trustedPtrAdd(S s
) @trusted { return s
.ptr
+ s
.length
; }
3180 immutable p
= trustedPtrAdd(str);
3182 // Peek past end of str, if it's 0, no conversion necessary.
3183 // Note that the compiler will put a 0 past the end of static
3184 // strings, and the storage allocator will put a 0 past the end
3185 // of newly allocated char[]'s.
3186 // Is p dereferenceable? A simple test: if p points to an
3187 // address that is a multiple of 4, then conservatively assume the pointer
3188 // might be pointing to a new block of memory, which might be
3189 // unreadable. Otherwise, it's definitely pointing to valid
3191 if ((cast(size_t
) p
& 3) && *p
== '\0')
3195 return toUTFzImpl
!(P
, const(C
)[])(cast(const(C
)[])str);
3199 private P
toUTFzImpl(P
, S
)(return scope S
str) @safe pure
3200 if (is(typeof(str[0]) C
) && is(immutable typeof(*P
.init
) == immutable C
) && !is(C
== immutable))
3201 //C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
3203 alias InChar
= typeof(str[0]);
3204 alias OutChar
= typeof(*P
.init
);
3206 //const(C)[] -> const(C)* or
3207 //C[] -> C* or const(C)*
3208 static if (( is(const(Unqual
!InChar
) == InChar
) && is(const(Unqual
!OutChar
) == OutChar
)) ||
3209 (!is(const(Unqual
!InChar
) == InChar
) && !is(immutable(Unqual
!OutChar
) == OutChar
)))
3213 auto trustedPtrAdd(S s
) @trusted { return s
.ptr
+ s
.length
; }
3214 auto p
= trustedPtrAdd(str);
3216 if ((cast(size_t
) p
& 3) && *p
== '\0')
3223 //const(C)[] -> C* or immutable(C)* or
3224 //C[] -> immutable(C)*
3227 import std
.array
: uninitializedArray
;
3228 auto copy
= uninitializedArray
!(Unqual
!OutChar
[])(str.length
+ 1);
3229 copy
[0 .. $ - 1] = str[];
3232 auto trustedCast(typeof(copy
) c
) @trusted { return cast(P
) c
.ptr
; }
3233 return trustedCast(copy
);
3237 private P
toUTFzImpl(P
, S
)(S
str) @safe pure
3238 if (!is(immutable typeof(*P
.init
) == immutable typeof(str[0])))
3239 //C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
3241 import std
.array
: appender
;
3242 auto retval
= appender
!(typeof(*P
.init
)[])();
3244 foreach (dchar c
; str)
3248 return () @trusted { return cast(P
) retval
.data
.ptr
; } ();
3253 import core
.exception
: AssertError
;
3254 import std
.algorithm
;
3255 import std
.conv
: to
;
3256 import std
.exception
;
3257 import std
.string
: format
;
3261 foreach (S
; AliasSeq
!(string
, wstring
, dstring
))
3263 alias C
= Unqual
!(ElementEncodingType
!S
);
3265 auto s1
= to
!S("hello\U00010143\u0100\U00010143");
3266 auto temp
= new C
[](s1
.length
+ 1);
3267 temp
[0 .. $ - 1] = s1
[0 .. $];
3270 auto trustedAssumeUnique(T
)(T t
) @trusted { return assumeUnique(t
); }
3271 auto s2
= trustedAssumeUnique(temp
);
3274 void trustedCStringAssert(P
, S
)(S s
) @trusted
3276 auto p
= toUTFz
!P(s
);
3277 assert(p
[0 .. s
.length
] == s
);
3278 assert(p
[s
.length
] == '\0');
3281 foreach (P
; AliasSeq
!(C
*, const(C
)*, immutable(C
)*))
3283 trustedCStringAssert
!P(s1
);
3284 trustedCStringAssert
!P(s2
);
3289 static void test(P
, S
)(S s
, size_t line
= __LINE__
) @trusted
3291 static size_t
zeroLen(C
)(const(C
)* ptr
) @trusted
3294 while (*ptr
!= '\0') { ++ptr
; ++len
; }
3298 auto p
= toUTFz
!P(s
);
3299 immutable len
= zeroLen(p
);
3300 enforce(cmp(s
, p
[0 .. len
]) == 0,
3301 new AssertError(format("Unit test failed: %s %s", P
.stringof
, S
.stringof
),
3307 foreach (P
; AliasSeq
!(wchar*, const(wchar)*, immutable(wchar)*,
3308 dchar*, const(dchar)*, immutable(dchar)*))
3310 test!P("hello\U00010143\u0100\U00010143");
3312 foreach (P
; AliasSeq
!( char*, const( char)*, immutable( char)*,
3313 dchar*, const(dchar)*, immutable(dchar)*))
3315 test!P("hello\U00010143\u0100\U00010143"w
);
3317 foreach (P
; AliasSeq
!( char*, const( char)*, immutable( char)*,
3318 wchar*, const(wchar)*, immutable(wchar)*))
3320 test!P("hello\U00010143\u0100\U00010143"d
);
3322 foreach (S
; AliasSeq
!( char[], const( char)[],
3323 wchar[], const(wchar)[],
3324 dchar[], const(dchar)[]))
3326 auto s
= to
!S("hello\U00010143\u0100\U00010143");
3328 foreach (P
; AliasSeq
!( char*, const( char)*, immutable( char)*,
3329 wchar*, const(wchar)*, immutable(wchar)*,
3330 dchar*, const(dchar)*, immutable(dchar)*))
3340 `toUTF16z` is a convenience function for `toUTFz!(const(wchar)*)`.
3342 Encodes string `str` into UTF-16 and returns a pointer to the zero-terminated encoded string.
3343 `toUTF16z` is suitable for calling the 'W' functions in the Win32 API
3344 that take an `LPCWSTR` argument.
3346 const(wchar)* toUTF16z(C
)(const(C
)[] str) @safe pure
3349 return toUTFz
!(const(wchar)*)(str);
3355 string
str = "Hello, World!";
3356 const(wchar)* p
= str.toUTF16z
;
3357 assert(p
[str.length
] == '\0');
3362 import std
.conv
: to
;
3363 //toUTFz is already thoroughly tested, so this will just verify that
3364 //toUTF16z compiles properly for the various string types.
3365 foreach (S
; AliasSeq
!(string
, wstring
, dstring
))
3366 assert(toUTF16z(to
!S("hello world")) !is null);
3370 /* ================================ tests ================================== */
3374 import std
.exception
;
3378 assert(toUTF16("hello"c
) == "hello");
3379 assert(toUTF32("hello"c
) == "hello");
3380 assert(toUTF8 ("hello"w
) == "hello");
3381 assert(toUTF32("hello"w
) == "hello");
3382 assert(toUTF8 ("hello"d
) == "hello");
3383 assert(toUTF16("hello"d
) == "hello");
3385 assert(toUTF16("hel\u1234o"c
) == "hel\u1234o");
3386 assert(toUTF32("hel\u1234o"c
) == "hel\u1234o");
3387 assert(toUTF8 ("hel\u1234o"w
) == "hel\u1234o");
3388 assert(toUTF32("hel\u1234o"w
) == "hel\u1234o");
3389 assert(toUTF8 ("hel\u1234o"d
) == "hel\u1234o");
3390 assert(toUTF16("hel\u1234o"d
) == "hel\u1234o");
3392 assert(toUTF16("he\U0010AAAAllo"c
) == "he\U0010AAAAllo");
3393 assert(toUTF32("he\U0010AAAAllo"c
) == "he\U0010AAAAllo");
3394 assert(toUTF8 ("he\U0010AAAAllo"w
) == "he\U0010AAAAllo");
3395 assert(toUTF32("he\U0010AAAAllo"w
) == "he\U0010AAAAllo");
3396 assert(toUTF8 ("he\U0010AAAAllo"d
) == "he\U0010AAAAllo");
3397 assert(toUTF16("he\U0010AAAAllo"d
) == "he\U0010AAAAllo");
3403 Returns the total number of code points encoded in `str`.
3405 Supersedes: This function supersedes $(LREF toUCSindex).
3407 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
3410 Nothing. `count` is declared `nothrow`, so ill-formed input in `str` cannot raise `UTFException`.
3412 size_t
count(C
)(const(C
)[] str) @safe pure nothrow @nogc
3415 return walkLength(str.byDchar
);
3419 @safe pure nothrow @nogc unittest
3421 assert(count("") == 0);
3422 assert(count("a") == 1);
3423 assert(count("abc") == 3);
3424 assert(count("\u20AC100") == 4);
3427 @safe pure nothrow @nogc unittest
3429 import std
.exception
;
3432 assert(count("") == 0);
3433 assert(count("a") == 1);
3434 assert(count("abc") == 3);
3435 assert(count("\u20AC100") == 4);
3440 // Ranges of code units for testing.
3441 version (StdUnittest
)
3446 import std
.conv
: to
;
3447 @property bool empty() { return _str
.empty
; }
3448 @property C
front() { return _str
[0]; }
3449 void popFront() { _str
= _str
[1 .. $]; }
3451 this(inout(C
)[] str)
3453 _str
= to
!(C
[])(str);
3461 import std
.conv
: to
;
3462 @property bool empty() { return _str
.empty
; }
3463 @property C
front() { return _str
[0]; }
3464 void popFront() { _str
= _str
[1 .. $]; }
3465 @property C
back() { return _str
[$ - 1]; }
3466 void popBack() { _str
= _str
[0 .. $ - 1]; }
3467 @property auto save() { return BidirCU(_str
); }
3468 @property size_t
length() { return _str
.length
; }
3470 this(inout(C
)[] str)
3472 _str
= to
!(C
[])(str);
3480 import std
.conv
: to
;
3481 @property bool empty() { return _str
.empty
; }
3482 @property C
front() { return _str
[0]; }
3483 void popFront() { _str
= _str
[1 .. $]; }
3484 @property C
back() { return _str
[$ - 1]; }
3485 void popBack() { _str
= _str
[0 .. $ - 1]; }
3486 @property auto save() { return RandomCU(_str
); }
3487 @property size_t
length() { return _str
.length
; }
3488 C
opIndex(size_t i
) { return _str
[i
]; }
3489 auto opSlice(size_t i
, size_t j
) { return RandomCU(_str
[i
.. j
]); }
3491 this(inout(C
)[] str)
3493 _str
= to
!(C
[])(str);
3501 import std
.conv
: to
;
3502 @property bool empty() { return _str
.empty
; }
3503 @property C
front() { return _str
[0]; }
3504 void popFront() { _str
= _str
[1 .. $]; }
3505 @property C
back() { return _str
[$ - 1]; }
3506 void popBack() { _str
= _str
[0 .. $ - 1]; }
3507 @property auto save() { return new RefBidirCU(_str
); }
3508 @property size_t
length() { return _str
.length
; }
3510 this(inout(C
)[] str)
3512 _str
= to
!(C
[])(str);
3518 class RefRandomCU(C
)
3520 import std
.conv
: to
;
3521 @property bool empty() { return _str
.empty
; }
3522 @property C
front() { return _str
[0]; }
3523 void popFront() { _str
= _str
[1 .. $]; }
3524 @property C
back() { return _str
[$ - 1]; }
3525 void popBack() { _str
= _str
[0 .. $ - 1]; }
3526 @property auto save() { return new RefRandomCU(_str
); }
3527 @property size_t
length() { return _str
.length
; }
3528 C
opIndex(size_t i
) { return _str
[i
]; }
3529 auto opSlice(size_t i
, size_t j
) { return new RefRandomCU(_str
[i
.. j
]); }
3531 this(inout(C
)[] str)
3533 _str
= to
!(C
[])(str);
3542 * Inserted in place of invalid UTF sequences.
3545 * $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
3547 enum dchar replacementDchar
= '\uFFFD';
3549 /********************************************
3550 * Iterate a range of char, wchar, or dchars by code unit.
3552 * The purpose is to bypass the special case decoding that
3553 * $(REF front, std,range,primitives) does to character arrays. As a result,
3554 * using ranges with `byCodeUnit` can be `nothrow` while
3555 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
3558 * A code unit is a building block of the UTF encodings. Generally, an
3559 * individual code unit does not represent what's perceived as a full
3560 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
3561 * are encoded with multiple code units. For example, the UTF-8 code units for
3562 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
3563 * often does not form a character on its own. Attempting to treat it as
3564 * one while iterating over the resulting range will give nonsensical results.
3567 * r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
3568 * of characters (including strings) or a type that implicitly converts to a string type.
3570 * If `r` is not an auto-decodable string (i.e. a narrow string or a
3571 * user-defined type that implicitly converts to a string type), then `r`
3574 * Otherwise, `r` is converted to its corresponding string type (if it's
3575 * not already a string) and wrapped in a random-access range where the
3576 * element encoding type of the string (its code unit) is the element type
3577 * of the range, and that range is returned. The range has slicing.
3579 * If `r` is quirky enough to be a struct or class which is an input range
3580 * of characters on its own (i.e. it has the input range API as member
3581 * functions), $(I and) it's implicitly convertible to a string type, then
3582 * `r` is returned, and no implicit conversion takes place.
3584 * If `r` is wrapped in a new range, then that range has a `source`
3585 * property for returning the string that's currently contained within that
3589 * Refer to the $(MREF std, uni) docs for a reference on Unicode
3592 * For a range that iterates by grapheme cluster (written character) see
3593 * $(REF byGrapheme, std,uni).
// Dispatches at compile time on the kind of `R`:
//   1. an auto-decodable string type that does not itself declare
//      `empty`/`front`/`popFront` members -> wrapped in `ByCodeUnitImpl`;
//   2. a type that is not an input range, or that converts to
//      `const dchar[]` without declaring those members -> cast to its
//      string type and returned;
//   3. anything else (see the trailing comment of this block).
// The constraint accepts types convertible to a string (static arrays
// excluded) and input ranges whose element encoding type is a character.
3595 auto byCodeUnit(R
)(R r
)
3596 if ((isConvertibleToString
!R
&& !isStaticArray
!R
) ||
3597 (isInputRange
!R
&& isSomeChar
!(ElementEncodingType
!R
)))
3599 import std
.traits
: StringTypeOf
;
// Case 1. The hasMember checks exclude user types that implement the range
// primitives themselves even though they also convert to a string.
3600 static if (// This would be cleaner if we had a way to check whether a type
3601 // was a range without any implicit conversions.
(isAutodecodableString
!R
&& !__traits(hasMember
, R
, "empty") &&
3603 !__traits(hasMember
, R
, "front") && !__traits(hasMember
, R
, "popFront")))
// Wrapper whose primitives index and slice `source` directly, so every
// element is a single code unit of the string.
3605 static struct ByCodeUnitImpl
// Attribute label: applies to every member declared after it.
3607 @safe pure nothrow @nogc:
3609 @property bool empty() const { return source
.length
== 0; }
// `auto ref` + `inout`: the element is returned by reference with the
// qualifier of the wrapper it is called on.
3610 @property auto ref front() inout { return source
[0]; }
3611 void popFront() { source
= source
[1 .. $]; }
3613 @property auto save() { return ByCodeUnitImpl(source
.save
); }
3615 @property auto ref back() inout { return source
[$ - 1]; }
3616 void popBack() { source
= source
[0 .. $-1]; }
3618 auto ref opIndex(size_t index
) inout { return source
[index
]; }
// Slicing re-wraps the sub-slice so the result keeps the same range type.
3619 auto opSlice(size_t lower
, size_t upper
) { return ByCodeUnitImpl(source
[lower
.. upper
]); }
3621 @property size_t
length() const { return source
.length
; }
// Lets `$` be used inside index and slice expressions on the wrapper.
3622 alias opDollar
= length
;
// The wrapped string; holds whatever part has not been popped yet.
3624 StringTypeOf
!R source
;
// Compile-time guarantee that the wrapper satisfies the random-access range API.
3627 static assert(isRandomAccessRange
!ByCodeUnitImpl
);
// `r` is passed as the `source` field; for a user-defined type this is
// where its conversion to `StringTypeOf!R` is applied.
3629 return ByCodeUnitImpl(r
);
// Case 2. No wrapper is needed here; the value is only converted to its
// string type.
3631 else static if (!isInputRange
!R ||
3632 (is(R
: const dchar[]) && !__traits(hasMember
, R
, "empty") &&
3633 !__traits(hasMember
, R
, "front") && !__traits(hasMember
, R
, "popFront")))
3635 return cast(StringTypeOf
!R
) r
;
3639 // byCodeUnit for ranges and dchar[] is a no-op
3647 import std
.range
.primitives
;
3648 import std
.traits
: isAutodecodableString
;
3650 auto r
= "Hello, World!".byCodeUnit();
3651 static assert(hasLength
!(typeof(r
)));
3652 static assert(hasSlicing
!(typeof(r
)));
3653 static assert(isRandomAccessRange
!(typeof(r
)));
3654 static assert(is(ElementType
!(typeof(r
)) == immutable char));
3656 // contrast with the range capabilities of standard strings (with or
3657 // without autodecoding enabled).
3658 auto s
= "Hello, World!";
3659 static assert(isBidirectionalRange
!(typeof(r
)));
3660 static if (isAutodecodableString
!(typeof(s
)))
3662 // with autodecoding enabled, strings are non-random-access ranges of
3664 static assert(is(ElementType
!(typeof(s
)) == dchar));
3665 static assert(!isRandomAccessRange
!(typeof(s
)));
3666 static assert(!hasSlicing
!(typeof(s
)));
3667 static assert(!hasLength
!(typeof(s
)));
3671 // without autodecoding, strings are normal arrays.
3672 static assert(is(ElementType
!(typeof(s
)) == immutable char));
3673 static assert(isRandomAccessRange
!(typeof(s
)));
3674 static assert(hasSlicing
!(typeof(s
)));
3675 static assert(hasLength
!(typeof(s
)));
3679 /// `byCodeUnit` does no Unicode decoding
3682 string noel1
= "noe\u0308l"; // noël using e + combining diaeresis
3683 assert(noel1
.byCodeUnit
[2] != 'ë');
3684 assert(noel1
.byCodeUnit
[2] == 'e');
3686 string noel2
= "no\u00EBl"; // noël using a precomposed ë character
3687 // Because string is UTF-8, the code unit at index 2 is just
3688 // the first of a sequence that encodes 'ë'
3689 assert(noel2
.byCodeUnit
[2] != 'ë');
3692 /// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
3695 import std
.algorithm
.comparison
: equal
;
3696 import std
.range
: popFrontN
;
3697 import std
.traits
: isAutodecodableString
;
3699 auto range
= byCodeUnit("hello world");
3701 assert(equal(range
.save
, "lo world"));
3702 static if (isAutodecodableString
!string
) // only enabled with autodecoding
3704 string
str = range
.source
;
3705 assert(str == "lo world");
3708 // source only exists if the range was wrapped
3710 auto range
= byCodeUnit("hello world"d
);
3711 static assert(!__traits(compiles
, range
.source
));
3715 @safe pure nothrow @nogc unittest
3719 enum testStr
= "𐁄𐂌𐃯 hello ディラン";
3720 char[testStr
.length
] s
;
3722 foreach (c
; testStr
.byCodeUnit().byCodeUnit())
3726 assert(s
== testStr
);
3729 enum testStr
= "𐁄𐂌𐃯 hello ディラン"w
;
3730 wchar[testStr
.length
] s
;
3732 foreach (c
; testStr
.byCodeUnit().byCodeUnit())
3736 assert(s
== testStr
);
3739 enum testStr
= "𐁄𐂌𐃯 hello ディラン"d
;
3740 dchar[testStr
.length
] s
;
3742 foreach (c
; testStr
.byCodeUnit().byCodeUnit())
3746 assert(s
== testStr
);
3749 auto bcu
= "hello".byCodeUnit();
3750 assert(bcu
.length
== 5);
3751 assert(bcu
[3] == 'l');
3752 assert(bcu
[2 .. 4][1] == 'l');
3755 char[5] orig
= "hello";
3756 auto bcu
= orig
[].byCodeUnit();
3758 assert(bcu
.front
== 'H');
3760 assert(bcu
[1] == 'E');
3763 auto bcu
= "hello".byCodeUnit().byCodeUnit();
3764 static assert(isForwardRange
!(typeof(bcu
)));
3765 static assert(is(typeof(bcu
) == struct) == isAutodecodableString
!string
);
3768 assert(s
.front
== 'h');
3771 auto bcu
= "hello".byCodeUnit();
3772 static assert(hasSlicing
!(typeof(bcu
)));
3773 static assert(isBidirectionalRange
!(typeof(bcu
)));
3774 static assert(is(typeof(bcu
) == struct) == isAutodecodableString
!string
);
3775 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3776 auto ret = bcu
.retro
;
3777 assert(ret.front
== 'o');
3779 assert(ret.front
== 'l');
3782 auto bcu
= "κόσμε"w
.byCodeUnit();
3783 static assert(hasSlicing
!(typeof(bcu
)));
3784 static assert(isBidirectionalRange
!(typeof(bcu
)));
3785 static assert(is(typeof(bcu
) == struct) == isAutodecodableString
!wstring
);
3786 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3787 auto ret = bcu
.retro
;
3788 assert(ret.front
== 'ε');
3790 assert(ret.front
== 'μ');
3793 static struct Stringish
3799 auto orig
= Stringish("\U0010fff8 𐁊 foo 𐂓");
3800 auto bcu
= orig
.byCodeUnit();
3801 static assert(is(typeof(bcu
) == struct));
3802 static assert(!is(typeof(bcu
) == Stringish
) == isAutodecodableString
!Stringish
);
3803 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3804 static assert(is(ElementType
!(typeof(bcu
)) == immutable char));
3805 assert(bcu
.front
== cast(char) 244);
3808 static struct WStringish
3814 auto orig
= WStringish("\U0010fff8 𐁊 foo 𐂓"w
);
3815 auto bcu
= orig
.byCodeUnit();
3816 static assert(is(typeof(bcu
) == struct));
3817 static assert(!is(typeof(bcu
) == WStringish
) == isAutodecodableString
!WStringish
);
3818 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3819 static assert(is(ElementType
!(typeof(bcu
)) == immutable wchar));
3820 assert(bcu
.front
== cast(wchar) 56319);
3823 static struct DStringish
3829 auto orig
= DStringish("\U0010fff8 𐁊 foo 𐂓"d
);
3830 auto bcu
= orig
.byCodeUnit();
3831 static assert(is(typeof(bcu
) == dstring
));
3832 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3833 static assert(is(ElementType
!(typeof(bcu
)) == immutable dchar));
3834 assert(bcu
.front
== cast(dchar) 1114104);
3837 static struct FuncStringish
3840 string
s() pure nothrow @nogc { return str; }
3844 auto orig
= FuncStringish("\U0010fff8 𐁊 foo 𐂓");
3845 auto bcu
= orig
.byCodeUnit();
3846 static if (isAutodecodableString
!FuncStringish
)
3847 static assert(is(typeof(bcu
) == struct));
3849 static assert(is(typeof(bcu
) == string
));
3850 static assert(!is(typeof(bcu
) == FuncStringish
));
3851 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3852 static assert(is(ElementType
!(typeof(bcu
)) == immutable char));
3853 assert(bcu
.front
== cast(char) 244);
3859 bool empty() pure nothrow @nogc { return data
.empty
; }
3860 char front() pure nothrow @nogc { return data
[0]; }
3861 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3864 auto orig
= Range("\U0010fff8 𐁊 foo 𐂓");
3865 auto bcu
= orig
.byCodeUnit();
3866 static assert(is(typeof(bcu
) == Range
));
3867 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3868 static assert(is(ElementType
!(typeof(bcu
)) == char));
3869 assert(bcu
.front
== cast(char) 244);
3872 static struct WRange
3875 bool empty() pure nothrow @nogc { return data
.empty
; }
3876 wchar front() pure nothrow @nogc { return data
[0]; }
3877 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3880 auto orig
= WRange("\U0010fff8 𐁊 foo 𐂓"w
);
3881 auto bcu
= orig
.byCodeUnit();
3882 static assert(is(typeof(bcu
) == WRange
));
3883 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3884 static assert(is(ElementType
!(typeof(bcu
)) == wchar));
3885 assert(bcu
.front
== 56319);
3888 static struct DRange
3891 bool empty() pure nothrow @nogc { return data
.empty
; }
3892 dchar front() pure nothrow @nogc { return data
[0]; }
3893 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3896 auto orig
= DRange("\U0010fff8 𐁊 foo 𐂓"d
);
3897 auto bcu
= orig
.byCodeUnit();
3898 static assert(is(typeof(bcu
) == DRange
));
3899 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3900 static assert(is(ElementType
!(typeof(bcu
)) == dchar));
3901 assert(bcu
.front
== 1114104);
3904 static struct RangeAndStringish
3906 bool empty() pure nothrow @nogc { return data
.empty
; }
3907 char front() pure nothrow @nogc { return data
[0]; }
3908 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3915 auto orig
= RangeAndStringish("test.d", "other");
3916 auto bcu
= orig
.byCodeUnit();
3917 static assert(is(typeof(bcu
) == RangeAndStringish
));
3918 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3919 static assert(is(ElementType
!(typeof(bcu
)) == char));
3920 assert(bcu
.front
== 't');
3923 static struct WRangeAndStringish
3925 bool empty() pure nothrow @nogc { return data
.empty
; }
3926 wchar front() pure nothrow @nogc { return data
[0]; }
3927 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3934 auto orig
= WRangeAndStringish("test.d"w
, "other"w
);
3935 auto bcu
= orig
.byCodeUnit();
3936 static assert(is(typeof(bcu
) == WRangeAndStringish
));
3937 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3938 static assert(is(ElementType
!(typeof(bcu
)) == wchar));
3939 assert(bcu
.front
== 't');
3942 static struct DRangeAndStringish
3944 bool empty() pure nothrow @nogc { return data
.empty
; }
3945 dchar front() pure nothrow @nogc { return data
[0]; }
3946 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3953 auto orig
= DRangeAndStringish("test.d"d
, "other"d
);
3954 auto bcu
= orig
.byCodeUnit();
3955 static assert(is(typeof(bcu
) == DRangeAndStringish
));
3956 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3957 static assert(is(ElementType
!(typeof(bcu
)) == dchar));
3958 assert(bcu
.front
== 't');
3961 enum Enum
: string
{ a
= "test.d" }
3964 auto bcu
= orig
.byCodeUnit();
3965 static assert(!is(typeof(bcu
) == Enum
));
3966 static if (isAutodecodableString
!Enum
)
3967 static assert(is(typeof(bcu
) == struct));
3969 static assert(is(typeof(bcu
) == string
));
3970 static assert(is(ElementType
!(typeof(bcu
)) == immutable char));
3971 assert(bcu
.front
== 't');
3974 enum WEnum
: wstring
{ a
= "test.d"w
}
3976 auto orig
= WEnum
.a
;
3977 auto bcu
= orig
.byCodeUnit();
3978 static assert(!is(typeof(bcu
) == WEnum
));
3979 static if (isAutodecodableString
!WEnum
)
3980 static assert(is(typeof(bcu
) == struct));
3982 static assert(is(typeof(bcu
) == wstring
));
3983 static assert(is(ElementType
!(typeof(bcu
)) == immutable wchar));
3984 assert(bcu
.front
== 't');
3987 enum DEnum
: dstring
{ a
= "test.d"d
}
3989 auto orig
= DEnum
.a
;
3990 auto bcu
= orig
.byCodeUnit();
3991 static assert(is(typeof(bcu
) == dstring
));
3992 static assert(is(ElementType
!(typeof(bcu
)) == immutable dchar));
3993 assert(bcu
.front
== 't');
3996 static if (autodecodeStrings
)
3998 static assert(!is(typeof(byCodeUnit("hello")) == string
));
3999 static assert(!is(typeof(byCodeUnit("hello"w
)) == wstring
));
4003 static assert(is(typeof(byCodeUnit("hello")) == string
));
4004 static assert(is(typeof(byCodeUnit("hello"w
)) == wstring
));
4006 static assert(is(typeof(byCodeUnit("hello"d
)) == dstring
));
4008 static assert(!__traits(compiles
, byCodeUnit((char[5]).init
)));
4009 static assert(!__traits(compiles
, byCodeUnit((wchar[5]).init
)));
4010 static assert(!__traits(compiles
, byCodeUnit((dchar[5]).init
)));
4012 enum SEnum
: char[5] { a
= "hello" }
4013 enum WSEnum
: wchar[5] { a
= "hello"w
}
4014 enum DSEnum
: dchar[5] { a
= "hello"d
}
4016 static assert(!__traits(compiles
, byCodeUnit(SEnum
.a
)));
4017 static assert(!__traits(compiles
, byCodeUnit(WSEnum
.a
)));
4018 static assert(!__traits(compiles
, byCodeUnit(DSEnum
.a
)));
4021 /****************************
4022 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4023 * of characters by char, wchar, or dchar.
4024 * These aliases simply forward to $(LREF byUTF) with the
4025 * corresponding C argument.
4028 * r = input range of characters, or array of characters
// Each alias names `byUTF` instantiated with one target code unit type.
// Only the first template argument is given, so `useReplacementDchar`
// keeps its declared default.

// Target encoding UTF-8 (`char` code units).
4030 alias byChar
= byUTF
!char;
// Target encoding UTF-16 (`wchar` code units).
4033 alias byWchar
= byUTF
!wchar;
// Target encoding UTF-32 (`dchar` code units).
4036 alias byDchar
= byUTF
!dchar;
4038 @safe pure nothrow @nogc unittest
4043 foreach (c
; "hello".byChar
.byChar())
4045 //writefln("[%d] '%c'", i, c);
4048 assert(s
== "hello");
4051 char[5+2+3+4+3+3] s
;
4054 a
[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d
;
4055 a
[8] = 0xD800; // invalid
4056 a
[9] = cast(dchar) 0x110000; // invalid
4057 foreach (c
; a
[].byChar())
4059 //writefln("[%d] '%c'", i, c);
4062 assert(s
== "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
4065 auto r
= "hello"w
.byChar();
4068 assert(r
.front
== 'l');
4071 auto r
= "hello"d
.byChar();
4074 assert(r
.front
== 'l');
4077 auto r
= "hello"d
.byChar();
4078 assert(isForwardRange
!(typeof(r
)));
4081 assert(s
.front
== 'h');
4085 @safe pure nothrow @nogc unittest
4091 a
[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d
;
4092 a
[8] = 0xD800; // invalid
4093 a
[9] = cast(dchar) 0x110000; // invalid
4094 foreach (c
; a
[].byWchar())
4096 //writefln("[%d] '%c' x%x", i, c, c);
4099 foreach (j
, wchar c
; "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w
)
4101 //writefln("[%d] '%c' x%x", j, c, c);
4103 assert(s
== "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w
);
4107 auto r
= "hello".byWchar();
4110 assert(r
.front
== 'l');
4113 auto r
= "hello"d
.byWchar();
4116 assert(r
.front
== 'l');
4119 auto r
= "hello"d
.byWchar();
4120 assert(isForwardRange
!(typeof(r
)));
4123 assert(s
.front
== 'h');
4127 @safe pure nothrow @nogc unittest
4132 string a
= "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
4133 foreach (c
; a
.byDchar())
4137 assert(s
== "hello\u07FF\uD7FF\U00010000\U0010FFFF"d
);
4140 foreach (s
; invalidUTFstrings
!char())
4142 auto r
= s
.byDchar();
4144 assert(r
.front
== r
.front
);
4146 assert(c
== replacementDchar
);
4150 auto r
= "hello".byDchar();
4153 assert(r
.front
== 'l');
4159 wstring a
= "hello\u07FF\uD7FF\U0010FFFF"w
;
4160 foreach (c
; a
.byDchar())
4162 //writefln("[%d] '%c' x%x", i, c, c);
4165 assert(s
== "hello\u07FF\uD7FF\U0010FFFF"d
);
4168 foreach (s
; invalidUTFstrings
!wchar())
4170 auto r
= s
.byDchar();
4172 assert(r
.front
== r
.front
);
4174 assert(c
== replacementDchar
);
4180 ws
[1] = 0xDD00; // correct surrogate pair
4181 auto r
= ws
[].byDchar();
4183 assert(r
.front
== r
.front
);
4185 assert(c
== '\U00010100');
4188 auto r
= "hello"w
.byDchar();
4191 assert(r
.front
== 'l');
4197 dstring a
= "hello"d
;
4198 foreach (c
; a
.byDchar
.byDchar())
4200 //writefln("[%d] '%c' x%x", i, c, c);
4203 assert(s
== "hello"d
);
4206 auto r
= "hello".byDchar();
4207 assert(isForwardRange
!(typeof(r
)));
4210 assert(s
.front
== 'h');
4213 auto r
= "hello"w
.byDchar();
4214 assert(isForwardRange
!(typeof(r
)));
4217 assert(s
.front
== 'h');
4221 // test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar,
4222 // which needs to support ranges with and without those attributes
// Compile-time check: the test itself is marked pure/@safe/nothrow/@nogc,
// so it only compiles if iterating a `dchar[]` slice through each of the
// three aliases infers all four attributes. The loop bodies are
// intentionally empty.
4224 pure @safe nothrow @nogc unittest
4226 dchar[5] s
= "hello"d
;
4227 foreach (c
; s
[].byChar()) { }
4228 foreach (c
; s
[].byWchar()) { }
4229 foreach (c
; s
[].byDchar()) { }
4232 version (StdUnittest
)
4233 private int impureVariable
;
4237 static struct ImpureThrowingSystemRange(Char
)
4239 @property bool empty() const { return true; }
4240 @property Char
front() const { return Char
.init
; }
4244 throw new Exception("only for testing nothrow");
4248 foreach (Char
; AliasSeq
!(char, wchar, dchar))
4250 ImpureThrowingSystemRange
!Char range
;
4251 foreach (c
; range
.byChar()) { }
4252 foreach (c
; range
.byWchar()) { }
4253 foreach (c
; range
.byDchar()) { }
4257 /****************************
4258 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4259 * of characters by char type `C` by encoding the elements of the range.
4261 * UTF sequences that cannot be converted to the specified encoding are either
4262 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
4263 * of the Unicode Standard 6.2 or result in a thrown UTFException.
4264 * Hence byUTF is not symmetric.
4265 * This algorithm is lazy, and does not allocate memory.
4266 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
4270 * C = `char`, `wchar`, or `dchar`
4271 * useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
4272 * UseReplacementDchar.no means throw `UTFException` for invalid UTF
4275 * `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
4278 * Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
4281 * A bidirectional range if `R` is a bidirectional range and not auto-decodable,
4282 * as defined by $(REF isAutodecodableString, std, traits).
4284 * A forward range if `R` is a forward range and not auto-decodable.
4286 * Or, if `R` is a range and it is auto-decodable and
4287 * `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
4288 * to $(LREF byCodeUnit).
4290 * Otherwise, an input range of characters.
// Template taking the target code unit type `C` and the error policy.
// The policy defaults to `Yes.useReplacementDchar`.
4292 template byUTF(C
, UseReplacementDchar useReplacementDchar
= Yes
.useReplacementDchar
)
// When `C` is a qualified character type (it differs from `UC` only by
// qualifier), reuse the instantiation for the unqualified type `UC`.
// NOTE(review): `useReplacementDchar` is not passed along in this alias, so
// a qualified `C` appears to always get the default policy -- confirm
// whether that is intended.
4295 static if (is(immutable C
== immutable UC
, UC
) && !is(C
== UC
))
4296 alias byUTF
= byUTF
!UC
;
// Overload for auto-decodable strings: the string is first passed through
// `byCodeUnit`, and `byUTF` is called again on that result.
4299 auto ref byUTF(R
)(R r
)
4300 if (isAutodecodableString
!R
&& isInputRange
!R
&& isSomeChar
!(ElementEncodingType
!R
))
4302 return byUTF(r
.byCodeUnit());
// Overload for every character input range that is not an auto-decodable
// string.
4305 auto ref byUTF(R
)(R r
)
4306 if (!isAutodecodableString
!R
&& isInputRange
!R
&& isSomeChar
!(ElementEncodingType
!R
))
// `RC` is bound to the element encoding type of `R` with qualifiers
// ignored. If it already is `C`, nothing has to be transcoded and the
// result of `r.byCodeUnit()` is returned as is.
4308 static if (is(immutable ElementEncodingType
!R
== immutable RC
, RC
) && is(RC
== C
))
4310 return r
.byCodeUnit();
// Branch for a `dchar` target: each element of the result is one decoded
// code point, cached in a `uint` buffer.
4312 else static if (is(C
== dchar))
4314 static struct Result
// Sentinel stored in `buff`/`backBuff` while no decoded value is cached.
4316 enum Empty
= uint.max
; // range is empty or just constructed
4318 this(return scope R r
)
// Constructor that also restores a cached front value; `save` uses it.
4323 this(return scope R r
, uint buff
)
// The back-side members exist only when the source is bidirectional.
4329 static if (isBidirectionalRange
!R
)
4331 this(return scope R r
, uint frontBuff
, uint backBuff
)
4334 this.buff
= frontBuff
;
4335 this.backBuff
= backBuff
;
// Empty only when nothing is cached on either side AND the source is
// exhausted; a cached value still counts as a pending element.
4339 @property bool empty()
4341 static if (isBidirectionalRange
!R
)
4342 return buff
== Empty
&& backBuff
== Empty
&& r
.empty
;
4344 return buff
== Empty
&& r
.empty
;
// Returns the current code point. The visible statements store the decoded
// value in `buff` and return it from there.
4347 @property dchar front() scope // 'scope' required by call to decodeFront() below
// Threshold constant chosen by source code unit type.
// NOTE(review): presumably code units below `firstMulti` are taken as-is
// (the `cast(dchar) c` assignment) and everything else goes through
// `decodeFront` -- the comparison itself is not visible in this excerpt.
4353 static if (is(RC
== wchar))
4354 enum firstMulti
= 0xD800; // First high surrogate.
4356 enum firstMulti
= 0x80; // First non-ASCII.
4360 buff
= cast(dchar) c
;
// The decode call is wrapped in a `@trusted` lambda and is given the
// caller's `useReplacementDchar` policy.
4364 buff
= () @trusted { return decodeFront
!(useReplacementDchar
)(r
); }();
4367 return cast(dchar) buff
;
// `save` is offered only for forward ranges. The copy receives the saved
// source plus the currently cached value(s).
4377 static if (isForwardRange
!R
)
4379 @property auto save()
4381 static if (isBidirectionalRange
!R
)
4383 return Result(r
.save
, buff
, backBuff
);
4387 return Result(r
.save
, buff
);
4392 static if (isBidirectionalRange
!R
)
// Mirror image of `front`, using `backBuff` and `decodeBack`.
4394 @property dchar back() scope // 'scope' required by call to decodeBack() below
// A value cached by an earlier call is returned without decoding again.
4396 if (backBuff
!= Empty
)
4397 return cast(dchar) backBuff
;
4400 static if (is(RC
== wchar))
4401 enum firstMulti
= 0xD800; // First high surrogate.
4403 enum firstMulti
= 0x80; // First non-ASCII.
4407 backBuff
= cast(dchar) c
;
4411 backBuff
= () @trusted { return decodeBack
!useReplacementDchar(r
); }();
4413 return cast(dchar) backBuff
;
4419 if (backBuff
== Empty
)
// Cache fields; both start out as `Empty`.
4428 uint buff
= Empty
; // one character lookahead buffer
4429 static if (isBidirectionalRange
!R
)
4430 uint backBuff
= Empty
;
// Result type for the remaining target types. One code point at a time is
// encoded into the small array `buf` and handed out unit by unit;
// `pos`/`fill` track the front side, `backPos`/`backFill` the back side.
4437 static struct Result
4439 this(return scope R r
)
// Constructor restoring the front-side buffer state; `save` uses it.
// `C[4 / C.sizeof]` is 4 elements for 1-byte `C` and 2 for 2-byte `C`.
4444 this(return scope R r
, ushort pos
, ushort fill
, C
[4 / C
.sizeof
] buf
)
// The back-side members exist only when the source is bidirectional.
4452 static if (isBidirectionalRange
!R
)
4454 this(return scope R r
, ushort frontPos
, ushort frontFill
,
4455 ushort backPos
, ushort backFill
, C
[4 / C
.sizeof
] buf
)
4458 this.pos
= frontPos
;
4459 this.fill
= frontFill
;
4460 this.backPos
= backPos
;
4461 this.backFill
= backFill
;
// Empty only when both buffered sides are fully consumed
// (`pos == fill`, `backPos == backFill`) AND the source is exhausted.
4466 @property bool empty()
4468 static if (isBidirectionalRange
!R
)
4469 return pos
== fill
&& backPos
== backFill
&& r
.empty
;
4471 return pos
== fill
&& r
.empty
;
4474 @property auto front() scope // 'scope' required by call to decodeFront() below
// Threshold constant: 0xD800 when both source and target units are at least
// 2 bytes wide, otherwise 0x80.
// NOTE(review): presumably units below `firstMulti` are copied with a plain
// cast into `buf[pos]` and everything else is decoded and re-encoded -- the
// comparison itself is not visible in this excerpt.
4481 static if (C
.sizeof
>= 2 && RC
.sizeof
>= 2)
4482 enum firstMulti
= 0xD800; // First high surrogate.
4484 enum firstMulti
= 0x80; // First non-ASCII.
4489 buf
[pos
] = cast(C
) c
;
4493 static if (is(RC
== dchar))
// Decode one code point from the source, then encode it into `buf`;
// `encode`'s return value becomes the number of valid units (`fill`).
// Both calls receive the caller's `useReplacementDchar` policy.
4499 dchar dc
= () @trusted { return decodeFront
!(useReplacementDchar
)(r
); }();
4500 fill
= cast(ushort) encode
!(useReplacementDchar
)(buf
, dc
);
// `save` is offered only for forward ranges. The copy receives the saved
// source together with the current buffer contents and cursors.
4513 static if (isForwardRange
!R
)
4515 @property auto save()
4517 static if (isBidirectionalRange
!R
)
4519 return Result(r
.save
, pos
, fill
, backPos
, backFill
, buf
);
4523 return Result(r
.save
, pos
, fill
, buf
);
4528 static if (isBidirectionalRange
!R
)
// Mirror image of `front`. Units of the last code point are read from the
// end of the encoded sequence towards its start, hence the index
// `backFill - backPos - 1`.
// NOTE(review): `front` and `back` both write into the single `buf`
// declared below -- confirm that interleaving them on multi-unit code
// points cannot clobber pending units.
4530 @property auto back() scope // 'scope' required by call to decodeBack() below
// Units from a previously encoded code point are still pending.
4532 if (backPos
!= backFill
)
4533 return buf
[cast(ushort) (backFill
- backPos
- 1)];
4537 static if (C
.sizeof
>= 2 && RC
.sizeof
>= 2)
4538 enum firstMulti
= 0xD800; // First high surrogate.
4540 enum firstMulti
= 0x80; // First non-ASCII.
4545 buf
[cast(ushort) (backFill
- backPos
- 1)] = cast(C
) c
;
4549 static if (is(RC
== dchar))
4555 dchar dc
= () @trusted { return decodeBack
!(useReplacementDchar
)(r
); }();
4556 backFill
= cast(ushort) encode
!(useReplacementDchar
)(buf
, dc
);
4558 return buf
[cast(ushort) (backFill
- backPos
- 1)];
4563 if (backPos
== backFill
)
4573 static if (isBidirectionalRange
!R
)
4574 ushort backPos
, backFill
;
// `= void` skips default initialization of the buffer.
4575 C
[4 / C
.sizeof
] buf
= void;
4584 @safe pure nothrow unittest
4586 import std
.algorithm
.comparison
: equal
;
4588 // hellö as a range of `char`s, which are UTF-8
4589 assert("hell\u00F6".byUTF
!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6]));
4591 // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
4592 assert("hell\u00F6".byUTF
!wchar().equal(['h', 'e', 'l', 'l', 'ö']));
4594 // 𐐷 is four code units in UTF-8, two in UTF-16, and one in UTF-32
4595 assert("𐐷".byUTF
!char().equal([0xF0, 0x90, 0x90, 0xB7]));
4596 assert("𐐷".byUTF
!wchar().equal([0xD801, 0xDC37]));
4597 assert("𐐷".byUTF
!dchar().equal([0x00010437]));
4603 import std
.algorithm
.comparison
: equal
;
4604 import std
.exception
: assertThrown
;
4606 assert("hello\xF0betty".byChar
.byUTF
!(dchar, UseReplacementDchar
.yes
).equal("hello\uFFFDetty"));
4607 assertThrown
!UTFException("hello\xF0betty".byChar
.byUTF
!(dchar, UseReplacementDchar
.no
).equal("hello betty"));
4613 wchar[] s
= ['a', 'b', 0x219];
4614 auto r
= s
.byUTF
!char;
4615 assert(isBidirectionalRange
!(typeof(r
)));
4616 assert(r
.back
== 0x99);
4618 assert(r
.back
== 0xc8);
4620 assert(r
.back
== 'b');
4625 wchar[] s
= ['a', 'b', 0x219];
4626 auto r
= s
.byUTF
!wchar;
4628 assert(isBidirectionalRange
!(typeof(r
)));
4629 assert(r
.back
== 0x219);
4631 assert(r
.back
== 'b');
4635 wchar[] s
= ['a', 'b', 0x219];
4636 auto r
= s
.byUTF
!dchar;
4637 assert(isBidirectionalRange
!(typeof(r
)));
4638 assert(r
.back
== 0x219);
4640 assert(r
.back
== 'b');
4644 dchar[] s
= ['𐐷', '😁'];
4645 auto r
= s
.byUTF
!wchar;
4646 assert(r
.back
== 0xde01);
4648 assert(r
.back
== 0xd83d);
4650 assert(r
.back
== 0xdc37);
4652 assert(r
.back
== 0xd801);
4656 dchar[] s
= ['𐐷', '😁'];
4657 auto r
= s
.byUTF
!char;
4664 import std
.algorithm
.comparison
: equal
;
4665 assert(res
.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0]));
4670 auto r
= ['a', 'b', 'c', 'd', 'e'].byUTF
!dchar;
4676 import std
.algorithm
.comparison
: equal
;
4677 assert(res
.equal(['e', 'd', 'c', 'b', 'a']));
4681 //testing the save() function
4682 wchar[] s
= ['Ă','ț'];
4684 auto rc
= s
.byUTF
!char;
4686 auto rcCopy
= rc
.save
;
4687 assert(rc
.back
== rcCopy
.back
);
4688 assert(rcCopy
.back
== 0xc8);
4690 auto rd
= s
.byUTF
!dchar;
4692 auto rdCopy
= rd
.save
;
4693 assert(rd
.back
== rdCopy
.back
);
4694 assert(rdCopy
.back
== 'Ă');
4699 @safe pure nothrow unittest
4701 import std
.range
.primitives
;
4702 wchar[] s
= ['ă', 'î'];
4704 auto rc
= s
.byUTF
!char;
4705 static assert(isBidirectionalRange
!(typeof(rc
)));
4706 assert(rc
.back
== 0xae);
4708 assert(rc
.back
== 0xc3);
4710 assert(rc
.back
== 0x83);
4712 assert(rc
.back
== 0xc4);
4714 auto rw
= s
.byUTF
!wchar;
4715 static assert(isBidirectionalRange
!(typeof(rw
)));
4716 assert(rw
.back
== 'î');
4718 assert(rw
.back
== 'ă');
4720 auto rd
= s
.byUTF
!dchar;
4721 static assert(isBidirectionalRange
!(typeof(rd
)));
4722 assert(rd
.back
== 'î');
4724 assert(rd
.back
== 'ă');