1 // Written in the D programming language.
4 Encode and decode UTF-8, UTF-16 and UTF-32 strings.
6 UTF character support is restricted to
7 $(D '\u0000' <= character <= '\U0010FFFF').
9 $(SCRIPT inhibitQuickIndex = 1;)
12 $(TR $(TH Category) $(TH Functions))
13 $(TR $(TD Decode) $(TD
17 $(TR $(TD Lazy decode) $(TD
24 $(TR $(TD Encode) $(TD
32 $(TR $(TD Length) $(TD
42 $(TR $(TD Validation) $(TD
44 $(LREF isValidCodepoint)
47 $(TR $(TD Miscellaneous) $(TD
48 $(LREF replacementDchar)
49 $(LREF UseReplacementDchar)
54 $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
55 $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
56 $(LINK https://web.archive.org/web/20100113043530/https://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
57 Copyright: Copyright The D Language Foundation 2000 - 2012.
58 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
59 Authors: $(HTTP digitalmars.com, Walter Bright) and
60 $(HTTP jmdavisprog.com, Jonathan M Davis)
61 Source: $(PHOBOSSRC std/utf.d)
65 import std
.exception
: basicExceptionCtors
;
66 import core
.exception
: UnicodeException
;
67 import std
.meta
: AliasSeq
;
69 import std
.traits
: isAutodecodableString
, isConvertibleToString
,
70 isSomeChar
, isSomeString
, isStaticArray
, Unqual
;
71 import std
.typecons
: Flag
, Yes
, No
;
75 Exception thrown on errors in std.utf functions.
77 class UTFException
: UnicodeException
79 import core
.internal
.string
: unsignedToTempString
, UnsignedStringBuf
;
84 @safe pure nothrow @nogc
85 UTFException
setSequence(scope uint[] data
...) return
87 assert(data
.length
<= 4);
89 len
= data
.length
< 4 ? data
.length
: 4;
90 sequence
[0 .. len
] = data
[0 .. len
];
95 // FIXME: Use std.exception.basicExceptionCtors here once
96 // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed
99 Standard exception constructors.
101 this(string msg
, string file
= __FILE__
, size_t line
= __LINE__
,
102 Throwable next
= null) @nogc @safe pure nothrow
104 super(msg
, 0, file
, line
, next
);
107 this(string msg
, size_t index
, string file
= __FILE__
,
108 size_t line
= __LINE__
, Throwable next
= null) @safe pure nothrow
110 UnsignedStringBuf buf
= void;
111 msg
~= " (at index " ~ unsignedToTempString(index
, buf
) ~ ")";
112 super(msg
, index
, file
, line
, next
);
117 A `string` detailing the invalid UTF sequence.
119 override string
toString() const
123 /* Exception.toString() is not marked as const, although
124 * it is const-compatible.
126 //return super.toString();
127 auto e
= () @trusted { return cast(Exception
) super; } ();
131 string result
= "Invalid UTF sequence:";
133 foreach (i
; sequence
[0 .. len
])
135 UnsignedStringBuf buf
= void;
137 auto h
= unsignedToTempString
!16(i
, buf
);
144 if (super.msg
.length
> 0)
157 import std
.exception
: assertThrown
;
160 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
161 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
162 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
163 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
164 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
168 Provide array of invalidly encoded UTF strings. Useful for testing.
171 Char = char, wchar, or dchar
174 an array of invalidly encoded UTF strings
177 package auto invalidUTFstrings(Char
)() @safe pure @nogc nothrow
180 static if (is(Char
== char))
182 enum x
= 0xDC00; // invalid surrogate value
183 enum y
= 0x110000; // out of range
185 static immutable string
[8] result
=
187 "\x80", // not a start byte
189 "\xC0\xC0", // invalid continuation
190 "\xF0\x82\x82\xAC", // overlong
193 0x80 |
((x
>> 6) & 0x3F),
197 cast(char)(0xF0 |
(y
>> 18)),
198 cast(char)(0x80 |
((y
>> 12) & 0x3F)),
199 cast(char)(0x80 |
((y
>> 6) & 0x3F)),
200 cast(char)(0x80 |
(y
& 0x3F))
203 cast(char)(0xF8 |
3), // 5 byte encoding
204 cast(char)(0x80 |
3),
205 cast(char)(0x80 |
3),
206 cast(char)(0x80 |
3),
207 cast(char)(0x80 |
3),
210 cast(char)(0xFC |
3), // 6 byte encoding
211 cast(char)(0x80 |
3),
212 cast(char)(0x80 |
3),
213 cast(char)(0x80 |
3),
214 cast(char)(0x80 |
3),
215 cast(char)(0x80 |
3),
221 else static if (is(Char
== wchar))
223 static immutable wstring
[5] result
=
246 else static if (is(Char
== dchar))
248 static immutable dstring
[3] result
=
250 [ cast(dchar) 0x110000 ],
251 [ cast(dchar) 0x00D800 ],
252 [ cast(dchar) 0x00DFFF ],
262 Check whether the given Unicode code point is valid.
265 c = code point to check
268 `true` if and only if `c` is a valid Unicode code point
271 `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
272 as they are permitted for internal use by an application, but they are
273 not allowed for interchange by the Unicode standard.
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // Anything above the last Unicode code point (U+10FFFF) is rejected outright.
    if (c > 0x10FFFF)
        return false;

    // Inside the Unicode range, only the surrogate block U+D800 .. U+DFFF
    // is not a valid code point.
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    return !isSurrogate;
}
281 @safe @nogc pure nothrow unittest
283 assert( isValidDchar(cast(dchar) 0x41));
284 assert( isValidDchar(cast(dchar) 0x00));
285 assert(!isValidDchar(cast(dchar) 0xD800));
286 assert(!isValidDchar(cast(dchar) 0x11FFFF));
289 pure nothrow @safe @nogc unittest
291 import std
.exception
;
295 assert( isValidDchar(cast(dchar)'a') == true);
296 assert( isValidDchar(cast(dchar) 0x1FFFFF) == false);
298 assert(!isValidDchar(cast(dchar) 0x00D800));
299 assert(!isValidDchar(cast(dchar) 0x00DBFF));
300 assert(!isValidDchar(cast(dchar) 0x00DC00));
301 assert(!isValidDchar(cast(dchar) 0x00DFFF));
302 assert( isValidDchar(cast(dchar) 0x00FFFE));
303 assert( isValidDchar(cast(dchar) 0x00FFFF));
304 assert( isValidDchar(cast(dchar) 0x01FFFF));
305 assert( isValidDchar(cast(dchar) 0x10FFFF));
306 assert(!isValidDchar(cast(dchar) 0x110000));
311 Checks if a single character forms a valid code point.
313 When standing alone, some characters are invalid code points. For
314 example the `wchar` `0xD800` is a so called high surrogate, which can
315 only be interpreted together with a low surrogate following it. As a
316 standalone character it is considered invalid.
318 See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
319 Unicode Standard, D90, D91 and D92) for more details.
322 c = character to test
323 Char = character type of `c`
326 `true`, if `c` forms a valid code point.
328 bool isValidCodepoint(Char
)(Char c
)
331 alias UChar
= typeof(cast() c
);
332 static if (is(UChar
== char))
336 else static if (is(UChar
== wchar))
338 return c
<= 0xD7FF || c
>= 0xE000;
340 else static if (is(UChar
== dchar))
342 return isValidDchar(c
);
345 static assert(false, "unknown character type: `" ~ Char
.stringof
~ "`");
349 @safe pure nothrow unittest
351 assert( isValidCodepoint(cast(char) 0x40));
352 assert(!isValidCodepoint(cast(char) 0x80));
353 assert( isValidCodepoint(cast(wchar) 0x1234));
354 assert(!isValidCodepoint(cast(wchar) 0xD800));
355 assert( isValidCodepoint(cast(dchar) 0x0010FFFF));
356 assert(!isValidCodepoint(cast(dchar) 0x12345678));
360 Calculate the length of the UTF sequence starting at `index`
364 str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
365 of UTF code units. Must be random access if `index` is passed
366 index = starting index of UTF sequence (default: `0`)
369 The number of code units in the UTF sequence. For UTF-8, this is a
370 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
371 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
374 May throw a `UTFException` if `str[index]` is not the start of a
378 `stride` will only analyze the first `str[index]` element. It
379 will not fully verify the validity of the UTF sequence, nor even verify
380 the presence of the sequence: it will not actually guarantee that
381 $(D index + stride(str, index) <= str.length).
383 uint stride(S
)(auto ref S
str, size_t index
)
384 if (is(S
: const char[]) ||
385 (isRandomAccessRange
!S
&& is(immutable ElementType
!S
== immutable char)))
387 static if (is(typeof(str.length
) : ulong))
388 assert(index
< str.length
, "Past the end of the UTF-8 sequence");
389 immutable c
= str[index
];
394 return strideImpl(c
, index
);
398 uint stride(S
)(auto ref S
str)
399 if (is(S
: const char[]) ||
400 (isInputRange
!S
&& is(immutable ElementType
!S
== immutable char)))
402 static if (is(S
: const char[]))
403 immutable c
= str[0];
405 immutable c
= str.front
;
410 return strideImpl(c
, 0);
415 import core
.exception
: AssertError
;
416 import std
.conv
: to
;
417 import std
.exception
;
418 import std
.string
: format
;
419 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
420 static void test(string s
, dchar c
, size_t i
= 0, size_t line
= __LINE__
)
422 enforce(stride(s
, i
) == codeLength
!char(c
),
423 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
425 enforce(stride(RandomCU
!char(s
), i
) == codeLength
!char(c
),
426 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
428 auto refRandom
= new RefRandomCU
!char(s
);
429 immutable randLen
= refRandom
.length
;
430 enforce(stride(refRandom
, i
) == codeLength
!char(c
),
431 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
432 enforce(refRandom
.length
== randLen
,
433 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
437 enforce(stride(s
) == codeLength
!char(c
),
438 new AssertError(format("Unit test failure string 0: %s", s
), __FILE__
, line
));
440 enforce(stride(InputCU
!char(s
)) == codeLength
!char(c
),
441 new AssertError(format("Unit test failure range 0: %s", s
), __FILE__
, line
));
443 auto refBidir
= new RefBidirCU
!char(s
);
444 immutable bidirLen
= refBidir
.length
;
445 enforce(stride(refBidir
) == codeLength
!char(c
),
446 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
447 enforce(refBidir
.length
== bidirLen
,
448 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
456 test("\u2029", '\u2029'); //paraSep
457 test("\u0100", '\u0100');
458 test("\u0430", '\u0430');
459 test("\U00010143", '\U00010143');
460 test("abcdefcdef", 'a');
461 test("hello\U00010143\u0100\U00010143", 'h', 0);
462 test("hello\U00010143\u0100\U00010143", 'e', 1);
463 test("hello\U00010143\u0100\U00010143", 'l', 2);
464 test("hello\U00010143\u0100\U00010143", 'l', 3);
465 test("hello\U00010143\u0100\U00010143", 'o', 4);
466 test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
467 test("hello\U00010143\u0100\U00010143", '\u0100', 9);
468 test("hello\U00010143\u0100\U00010143", '\U00010143', 11);
470 foreach (S
; AliasSeq
!(char[], const char[], string
))
472 enum str = to
!S("hello world");
473 static assert(isSafe
!({ stride(str, 0); }));
474 static assert(isSafe
!({ stride(str); }));
475 static assert((functionAttributes
!({ stride(str, 0); }) & FunctionAttribute
.pure_
) != 0);
476 static assert((functionAttributes
!({ stride(str); }) & FunctionAttribute
.pure_
) != 0);
481 @safe unittest // invalid start bytes
483 import std
.exception
: assertThrown
;
484 immutable char[] invalidStartBytes
= [
485 0b1111_1000, // indicating a sequence length of 5
489 0b1000_0000, // continuation byte
491 foreach (c
; invalidStartBytes
)
492 assertThrown
!UTFException(stride([c
]));
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // UTF-16 overload taking an explicit index: only the code unit at
    // `str[index]` is inspected. The bounds check is compiled in only when
    // the range exposes a usable `length`.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    immutable uint unit = str[index];

    // A high (lead) surrogate 0xD800 .. 0xDBFF announces a two-unit sequence;
    // every other code unit is counted as a sequence of length one.
    immutable bool isLeadSurrogate = 0xD800 <= unit && unit <= 0xDBFF;
    return isLeadSurrogate ? 2 : 1;
}

/// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // Array-like UTF-16 input: same as the indexed overload at position 0.
    enum size_t firstUnit = 0;
    return stride(str, firstUnit);
}

/// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
    !is(S : const wchar[]))
{
    // Generic input range of `wchar`: only `front` is examined, and the
    // range is not advanced.
    assert(!str.empty, "UTF-16 sequence is empty");

    immutable uint unit = str.front;
    immutable bool isLeadSurrogate = 0xD800 <= unit && unit <= 0xDBFF;
    return isLeadSurrogate ? 2 : 1;
}
525 import core
.exception
: AssertError
;
526 import std
.conv
: to
;
527 import std
.exception
;
528 import std
.string
: format
;
529 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
530 static void test(wstring s
, dchar c
, size_t i
= 0, size_t line
= __LINE__
)
532 enforce(stride(s
, i
) == codeLength
!wchar(c
),
533 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
535 enforce(stride(RandomCU
!wchar(s
), i
) == codeLength
!wchar(c
),
536 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
538 auto refRandom
= new RefRandomCU
!wchar(s
);
539 immutable randLen
= refRandom
.length
;
540 enforce(stride(refRandom
, i
) == codeLength
!wchar(c
),
541 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
542 enforce(refRandom
.length
== randLen
,
543 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
547 enforce(stride(s
) == codeLength
!wchar(c
),
548 new AssertError(format("Unit test failure string 0: %s", s
), __FILE__
, line
));
550 enforce(stride(InputCU
!wchar(s
)) == codeLength
!wchar(c
),
551 new AssertError(format("Unit test failure range 0: %s", s
), __FILE__
, line
));
553 auto refBidir
= new RefBidirCU
!wchar(s
);
554 immutable bidirLen
= refBidir
.length
;
555 enforce(stride(refBidir
) == codeLength
!wchar(c
),
556 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
557 enforce(refBidir
.length
== bidirLen
,
558 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
566 test("\u2029", '\u2029'); //paraSep
567 test("\u0100", '\u0100');
568 test("\u0430", '\u0430');
569 test("\U00010143", '\U00010143');
570 test("abcdefcdef", 'a');
571 test("hello\U00010143\u0100\U00010143", 'h', 0);
572 test("hello\U00010143\u0100\U00010143", 'e', 1);
573 test("hello\U00010143\u0100\U00010143", 'l', 2);
574 test("hello\U00010143\u0100\U00010143", 'l', 3);
575 test("hello\U00010143\u0100\U00010143", 'o', 4);
576 test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
577 test("hello\U00010143\u0100\U00010143", '\u0100', 7);
578 test("hello\U00010143\u0100\U00010143", '\U00010143', 8);
580 foreach (S
; AliasSeq
!(wchar[], const wchar[], wstring
))
582 enum str = to
!S("hello world");
583 static assert(isSafe
!(() => stride(str, 0)));
584 static assert(isSafe
!(() => stride(str) ));
585 static assert((functionAttributes
!(() => stride(str, 0)) & FunctionAttribute
.pure_
) != 0);
586 static assert((functionAttributes
!(() => stride(str) ) & FunctionAttribute
.pure_
) != 0);
592 uint stride(S
)(auto ref S
str, size_t index
= 0)
593 if (is(S
: const dchar[]) ||
594 (isInputRange
!S
&& is(immutable ElementEncodingType
!S
== immutable dchar)))
596 static if (is(typeof(str.length
) : ulong))
597 assert(index
< str.length
, "Past the end of the UTF-32 sequence");
599 assert(!str.empty
, "UTF-32 sequence is empty.");
606 assert("a".stride
== 1);
607 assert("λ".stride
== 2);
608 assert("aλ".stride
== 1);
609 assert("aλ".stride(1) == 2);
610 assert("𐐷".stride
== 4);
615 import core
.exception
: AssertError
;
616 import std
.conv
: to
;
617 import std
.exception
;
618 import std
.string
: format
;
619 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
620 static void test(dstring s
, dchar c
, size_t i
= 0, size_t line
= __LINE__
)
622 enforce(stride(s
, i
) == codeLength
!dchar(c
),
623 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
625 enforce(stride(RandomCU
!dchar(s
), i
) == codeLength
!dchar(c
),
626 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
628 auto refRandom
= new RefRandomCU
!dchar(s
);
629 immutable randLen
= refRandom
.length
;
630 enforce(stride(refRandom
, i
) == codeLength
!dchar(c
),
631 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
632 enforce(refRandom
.length
== randLen
,
633 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
637 enforce(stride(s
) == codeLength
!dchar(c
),
638 new AssertError(format("Unit test failure string 0: %s", s
), __FILE__
, line
));
640 enforce(stride(InputCU
!dchar(s
)) == codeLength
!dchar(c
),
641 new AssertError(format("Unit test failure range 0: %s", s
), __FILE__
, line
));
643 auto refBidir
= new RefBidirCU
!dchar(s
);
644 immutable bidirLen
= refBidir
.length
;
645 enforce(stride(refBidir
) == codeLength
!dchar(c
),
646 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
647 enforce(refBidir
.length
== bidirLen
,
648 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
656 test("\u2029", '\u2029'); //paraSep
657 test("\u0100", '\u0100');
658 test("\u0430", '\u0430');
659 test("\U00010143", '\U00010143');
660 test("abcdefcdef", 'a');
661 test("hello\U00010143\u0100\U00010143", 'h', 0);
662 test("hello\U00010143\u0100\U00010143", 'e', 1);
663 test("hello\U00010143\u0100\U00010143", 'l', 2);
664 test("hello\U00010143\u0100\U00010143", 'l', 3);
665 test("hello\U00010143\u0100\U00010143", 'o', 4);
666 test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
667 test("hello\U00010143\u0100\U00010143", '\u0100', 6);
668 test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
670 foreach (S
; AliasSeq
!(dchar[], const dchar[], dstring
))
672 enum str = to
!S("hello world");
673 static assert(isSafe
!(() => stride(str, 0)));
674 static assert(isSafe
!(() => stride(str) ));
675 static assert((functionAttributes
!(() => stride(str, 0)) & FunctionAttribute
.pure_
) != 0);
676 static assert((functionAttributes
!(() => stride(str) ) & FunctionAttribute
.pure_
) != 0);
681 private uint strideImpl(char c
, size_t index
) @trusted pure
682 in { assert(c
& 0x80); }
685 import core
.bitop
: bsr;
686 immutable msbs
= 7 - bsr((~uint(c
)) & 0xFF);
687 if (c
== 0xFF || msbs
< 2 || msbs
> 4)
688 throw new UTFException("Invalid UTF-8 sequence", index
);
693 Calculate the length of the UTF sequence ending one code unit before
697 str = bidirectional range of UTF code units. Must be random access if
699 index = index one past end of UTF sequence (default: `str.length`)
702 The number of code units in the UTF sequence. For UTF-8, this is a
703 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
704 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
707 May throw a `UTFException` if `index` is not one past the
708 end of a valid UTF sequence.
711 `strideBack` will only analyze the element at $(D str[index - 1]).
712 It will not fully verify the validity of the UTF sequence, nor
713 even verify the presence of the sequence: it will not actually
714 guarantee that $(D strideBack(str, index) <= index).
716 uint strideBack(S
)(auto ref S
str, size_t index
)
717 if (is(S
: const char[]) ||
718 (isRandomAccessRange
!S
&& is(immutable ElementType
!S
== immutable char)))
720 static if (is(typeof(str.length
) : ulong))
721 assert(index
<= str.length
, "Past the end of the UTF-8 sequence");
722 assert(index
> 0, "Not the end of the UTF-8 sequence");
724 if ((str[index
-1] & 0b1100_0000) != 0b1000_0000)
727 if (index
>= 4) //single verification for most common case
729 static foreach (i
; 2 .. 5)
731 if ((str[index
-i
] & 0b1100_0000) != 0b1000_0000)
737 static foreach (i
; 2 .. 4)
739 if (index
>= i
&& (str[index
-i
] & 0b1100_0000) != 0b1000_0000)
743 throw new UTFException("Not the end of the UTF sequence", index
);
747 uint strideBack(S
)(auto ref S
str)
748 if (is(S
: const char[]) ||
749 (isRandomAccessRange
!S
&& hasLength
!S
&& is(immutable ElementType
!S
== immutable char)))
751 return strideBack(str, str.length
);
755 uint strideBack(S
)(auto ref S
str)
756 if (isBidirectionalRange
!S
&& is(immutable ElementType
!S
== immutable char) && !isRandomAccessRange
!S
)
758 assert(!str.empty
, "Past the end of the UTF-8 sequence");
759 auto temp
= str.save
;
760 foreach (i
; AliasSeq
!(1, 2, 3, 4))
762 if ((temp
.back
& 0b1100_0000) != 0b1000_0000)
768 throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
773 import core
.exception
: AssertError
;
774 import std
.conv
: to
;
775 import std
.exception
;
776 import std
.string
: format
;
777 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
778 static void test(string s
, dchar c
, size_t i
= size_t
.max
, size_t line
= __LINE__
)
780 enforce(strideBack(s
, i
== size_t
.max ? s
.length
: i
) == codeLength
!char(c
),
781 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
783 enforce(strideBack(RandomCU
!char(s
), i
== size_t
.max ? s
.length
: i
) == codeLength
!char(c
),
784 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
786 auto refRandom
= new RefRandomCU
!char(s
);
787 immutable randLen
= refRandom
.length
;
788 enforce(strideBack(refRandom
, i
== size_t
.max ? s
.length
: i
) == codeLength
!char(c
),
789 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
790 enforce(refRandom
.length
== randLen
,
791 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
795 enforce(strideBack(s
) == codeLength
!char(c
),
796 new AssertError(format("Unit test failure string code length: %s", s
), __FILE__
, line
));
798 enforce(strideBack(BidirCU
!char(s
)) == codeLength
!char(c
),
799 new AssertError(format("Unit test failure range code length: %s", s
), __FILE__
, line
));
801 auto refBidir
= new RefBidirCU
!char(s
);
802 immutable bidirLen
= refBidir
.length
;
803 enforce(strideBack(refBidir
) == codeLength
!char(c
),
804 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
805 enforce(refBidir
.length
== bidirLen
,
806 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
814 test("\u2029", '\u2029'); //paraSep
815 test("\u0100", '\u0100');
816 test("\u0430", '\u0430');
817 test("\U00010143", '\U00010143');
818 test("abcdefcdef", 'f');
819 test("\U00010143\u0100\U00010143hello", 'o', 15);
820 test("\U00010143\u0100\U00010143hello", 'l', 14);
821 test("\U00010143\u0100\U00010143hello", 'l', 13);
822 test("\U00010143\u0100\U00010143hello", 'e', 12);
823 test("\U00010143\u0100\U00010143hello", 'h', 11);
824 test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
825 test("\U00010143\u0100\U00010143hello", '\u0100', 6);
826 test("\U00010143\u0100\U00010143hello", '\U00010143', 4);
828 foreach (S
; AliasSeq
!(char[], const char[], string
))
830 enum str = to
!S("hello world");
831 static assert(isSafe
!({ strideBack(str, 0); }));
832 static assert(isSafe
!({ strideBack(str); }));
833 static assert((functionAttributes
!({ strideBack(str, 0); }) & FunctionAttribute
.pure_
) != 0);
834 static assert((functionAttributes
!({ strideBack(str); }) & FunctionAttribute
.pure_
) != 0);
839 //UTF-16 is self synchronizing: The length of strideBack can be found from
840 //the value of a single wchar
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // UTF-16 overload taking an explicit index: `index` is one past the end
    // of the sequence, so only the code unit at `str[index - 1]` is inspected.
    // The upper-bound check is compiled in only when `length` is available.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    immutable c2 = str[index-1];

    // A low (trail) surrogate lies in 0xDC00 .. 0xDFFF and ends a two-unit
    // sequence. The upper bound is exclusive: 0xE000 is an ordinary BMP
    // code point, not a surrogate.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}

/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // UTF-16 overload without an index: inspects the last code unit of `str`.
    assert(!str.empty, "UTF-16 sequence is empty");

    // Arrays are indexed directly; other bidirectional ranges use `back`.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // Same low-surrogate test as the indexed overload. The previous
    // `c2 <= 0xE000` wrongly treated U+E000 as a trail surrogate and
    // reported a stride of 2 for it.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}
871 import core
.exception
: AssertError
;
872 import std
.conv
: to
;
873 import std
.exception
;
874 import std
.string
: format
;
875 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
876 static void test(wstring s
, dchar c
, size_t i
= size_t
.max
, size_t line
= __LINE__
)
878 enforce(strideBack(s
, i
== size_t
.max ? s
.length
: i
) == codeLength
!wchar(c
),
879 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
881 enforce(strideBack(RandomCU
!wchar(s
), i
== size_t
.max ? s
.length
: i
) == codeLength
!wchar(c
),
882 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
884 auto refRandom
= new RefRandomCU
!wchar(s
);
885 immutable randLen
= refRandom
.length
;
886 enforce(strideBack(refRandom
, i
== size_t
.max ? s
.length
: i
) == codeLength
!wchar(c
),
887 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
888 enforce(refRandom
.length
== randLen
,
889 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
893 enforce(strideBack(s
) == codeLength
!wchar(c
),
894 new AssertError(format("Unit test failure string code length: %s", s
), __FILE__
, line
));
896 enforce(strideBack(BidirCU
!wchar(s
)) == codeLength
!wchar(c
),
897 new AssertError(format("Unit test failure range code length: %s", s
), __FILE__
, line
));
899 auto refBidir
= new RefBidirCU
!wchar(s
);
900 immutable bidirLen
= refBidir
.length
;
901 enforce(strideBack(refBidir
) == codeLength
!wchar(c
),
902 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
903 enforce(refBidir
.length
== bidirLen
,
904 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
912 test("\u2029", '\u2029'); //paraSep
913 test("\u0100", '\u0100');
914 test("\u0430", '\u0430');
915 test("\U00010143", '\U00010143');
916 test("abcdefcdef", 'f');
917 test("\U00010143\u0100\U00010143hello", 'o', 10);
918 test("\U00010143\u0100\U00010143hello", 'l', 9);
919 test("\U00010143\u0100\U00010143hello", 'l', 8);
920 test("\U00010143\u0100\U00010143hello", 'e', 7);
921 test("\U00010143\u0100\U00010143hello", 'h', 6);
922 test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
923 test("\U00010143\u0100\U00010143hello", '\u0100', 3);
924 test("\U00010143\u0100\U00010143hello", '\U00010143', 2);
926 foreach (S
; AliasSeq
!(wchar[], const wchar[], wstring
))
928 enum str = to
!S("hello world");
929 static assert(isSafe
!(() => strideBack(str, 0)));
930 static assert(isSafe
!(() => strideBack(str) ));
931 static assert((functionAttributes
!(() => strideBack(str, 0)) & FunctionAttribute
.pure_
) != 0);
932 static assert((functionAttributes
!(() => strideBack(str) ) & FunctionAttribute
.pure_
) != 0);
938 uint strideBack(S
)(auto ref S
str, size_t index
)
939 if (isRandomAccessRange
!S
&& is(immutable ElementEncodingType
!S
== immutable dchar))
941 static if (is(typeof(str.length
) : ulong))
942 assert(index
<= str.length
, "Past the end of the UTF-32 sequence");
943 assert(index
> 0, "Not the end of the UTF-32 sequence");
948 uint strideBack(S
)(auto ref S
str)
949 if (isBidirectionalRange
!S
&& is(immutable ElementEncodingType
!S
== immutable dchar))
951 assert(!str.empty
, "Empty UTF-32 sequence");
958 assert("a".strideBack
== 1);
959 assert("λ".strideBack
== 2);
960 assert("aλ".strideBack
== 2);
961 assert("aλ".strideBack(1) == 1);
962 assert("𐐷".strideBack
== 4);
967 import core
.exception
: AssertError
;
968 import std
.conv
: to
;
969 import std
.exception
;
970 import std
.string
: format
;
971 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
972 static void test(dstring s
, dchar c
, size_t i
= size_t
.max
, size_t line
= __LINE__
)
974 enforce(strideBack(s
, i
== size_t
.max ? s
.length
: i
) == codeLength
!dchar(c
),
975 new AssertError(format("Unit test failure string: %s", s
), __FILE__
, line
));
977 enforce(strideBack(RandomCU
!dchar(s
), i
== size_t
.max ? s
.length
: i
) == codeLength
!dchar(c
),
978 new AssertError(format("Unit test failure range: %s", s
), __FILE__
, line
));
980 auto refRandom
= new RefRandomCU
!dchar(s
);
981 immutable randLen
= refRandom
.length
;
982 enforce(strideBack(refRandom
, i
== size_t
.max ? s
.length
: i
) == codeLength
!dchar(c
),
983 new AssertError(format("Unit test failure rand ref range: %s", s
), __FILE__
, line
));
984 enforce(refRandom
.length
== randLen
,
985 new AssertError(format("Unit test failure rand ref range length: %s", s
), __FILE__
, line
));
989 enforce(strideBack(s
) == codeLength
!dchar(c
),
990 new AssertError(format("Unit test failure string code length: %s", s
), __FILE__
, line
));
992 enforce(strideBack(BidirCU
!dchar(s
)) == codeLength
!dchar(c
),
993 new AssertError(format("Unit test failure range code length: %s", s
), __FILE__
, line
));
995 auto refBidir
= new RefBidirCU
!dchar(s
);
996 immutable bidirLen
= refBidir
.length
;
997 enforce(strideBack(refBidir
) == codeLength
!dchar(c
),
998 new AssertError(format("Unit test failure bidir ref range code length: %s", s
), __FILE__
, line
));
999 enforce(refBidir
.length
== bidirLen
,
1000 new AssertError(format("Unit test failure bidir ref range length: %s", s
), __FILE__
, line
));
1008 test("\u2029", '\u2029'); //paraSep
1009 test("\u0100", '\u0100');
1010 test("\u0430", '\u0430');
1011 test("\U00010143", '\U00010143');
1012 test("abcdefcdef", 'f');
1013 test("\U00010143\u0100\U00010143hello", 'o', 8);
1014 test("\U00010143\u0100\U00010143hello", 'l', 7);
1015 test("\U00010143\u0100\U00010143hello", 'l', 6);
1016 test("\U00010143\u0100\U00010143hello", 'e', 5);
1017 test("\U00010143\u0100\U00010143hello", 'h', 4);
1018 test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
1019 test("\U00010143\u0100\U00010143hello", '\u0100', 2);
1020 test("\U00010143\u0100\U00010143hello", '\U00010143', 1);
1022 foreach (S
; AliasSeq
!(dchar[], const dchar[], dstring
))
1024 enum str = to
!S("hello world");
1025 static assert(isSafe
!(() => strideBack(str, 0)));
1026 static assert(isSafe
!(() => strideBack(str) ));
1027 static assert((functionAttributes
!(() => strideBack(str, 0)) & FunctionAttribute
.pure_
) != 0);
1028 static assert((functionAttributes
!(() => strideBack(str) ) & FunctionAttribute
.pure_
) != 0);
1035 Given `index` into `str` and assuming that `index` is at the start
1036 of a UTF sequence, `toUCSindex` determines the number of UCS characters
1037 up to `index`. So, `index` is the index of a code unit at the
1038 beginning of a code point, and the return value is how many code points into
1039 the string that that code point is.
1041 size_t
toUCSindex(C
)(const(C
)[] str, size_t index
) @safe pure
1044 static if (is(immutable C
== immutable dchar))
1051 for (; j
< index
; ++n
)
1052 j
+= stride(str, j
);
1056 static if (is(immutable C
== immutable char))
1057 throw new UTFException("Invalid UTF-8 sequence", index
);
1059 throw new UTFException("Invalid UTF-16 sequence", index
);
1069 assert(toUCSindex(`hello world`, 7) == 7);
1070 assert(toUCSindex(`hello world`w
, 7) == 7);
1071 assert(toUCSindex(`hello world`d
, 7) == 7);
1073 assert(toUCSindex(`Ma Chérie`, 7) == 6);
1074 assert(toUCSindex(`Ma Chérie`w
, 7) == 7);
1075 assert(toUCSindex(`Ma Chérie`d
, 7) == 7);
1077 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
1078 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w
, 9) == 9);
1079 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d
, 9) == 9);
1084 Given a UCS index `n` into `str`, returns the UTF index.
1085 So, `n` is how many code points into the string the code point is, and
1086 the array index of the code unit is returned.
1088 size_t
toUTFindex(C
)(const(C
)[] str, size_t n
) @safe pure
1091 static if (is(immutable C
== immutable dchar))
1100 i
+= stride(str, i
);
1109 assert(toUTFindex(`hello world`, 7) == 7);
1110 assert(toUTFindex(`hello world`w
, 7) == 7);
1111 assert(toUTFindex(`hello world`d
, 7) == 7);
1113 assert(toUTFindex(`Ma Chérie`, 6) == 7);
1114 assert(toUTFindex(`Ma Chérie`w
, 7) == 7);
1115 assert(toUTFindex(`Ma Chérie`d
, 7) == 7);
1117 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9);
1118 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w
, 9) == 9);
1119 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d
, 9) == 9);
1123 /* =================== Decode ======================= */
1125 /// Whether or not to replace invalid UTF with $(LREF replacementDchar)
1126 alias UseReplacementDchar
= Flag
!"useReplacementDchar";
1129 Decodes and returns the code point starting at `str[index]`. `index`
1130 is advanced to one past the decoded code point. If the code point is not
1131 well-formed, then a `UTFException` is thrown and `index` remains
1134 decode will only work with strings and random access ranges of code units
1135 with length and slicing, whereas $(LREF decodeFront) will work with any
1136 input range of code units.
1139 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
1140 str = input string or indexable Range
        index = starting index into str[]; incremented by number of code units processed
1147 $(LREF UTFException) if `str[index]` is not the start of a valid UTF
1148 sequence and useReplacementDchar is `No.useReplacementDchar`
1150 dchar decode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(auto ref S
str, ref size_t index
)
1151 if (!isSomeString
!S
&&
1152 isRandomAccessRange
!S
&& hasSlicing
!S
&& hasLength
!S
&& isSomeChar
!(ElementType
!S
))
1155 assert(index
< str.length
, "Attempted to decode past the end of a string");
1159 assert(isValidDchar(result
));
1163 if (str[index
] < codeUnitLimit
!S
)
1164 return str[index
++];
1166 return decodeImpl
!(true, useReplacementDchar
)(str, index
);
1170 dchar decode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1171 auto ref scope S
str, ref size_t index
) @trusted pure
1175 assert(index
< str.length
, "Attempted to decode past the end of a string");
1179 assert(isValidDchar(result
));
1183 if (str[index
] < codeUnitLimit
!S
)
1184 return str[index
++];
1185 else static if (is(immutable S
== immutable C
[], C
))
1186 return decodeImpl
!(true, useReplacementDchar
)(cast(const(C
)[]) str, index
);
1194 assert("a".decode(i
) == 'a' && i
== 1);
1196 assert("å".decode(i
) == 'å' && i
== 2);
1198 assert("aå".decode(i
) == 'å' && i
== 3);
1200 assert("å"w
.decode(i
) == 'å' && i
== 1);
1202 // ë as a multi-code point grapheme
1204 assert("e\u0308".decode(i
) == 'e' && i
== 1);
1205 // ë as a single code point grapheme
1207 assert("ë".decode(i
) == 'ë' && i
== 2);
1209 assert("ë"w
.decode(i
) == 'ë' && i
== 1);
1212 @safe pure unittest // https://issues.dlang.org/show_bug.cgi?id=22867
1214 import std
.conv
: hexString
;
1215 string data
= hexString
!"f787a598";
1217 try data
.decode(offset
);
1218 catch (UTFException ex
) assert(offset
== 0);
1222 `decodeFront` is a variant of $(LREF decode) which specifically decodes
1223 the first code point. Unlike $(LREF decode), `decodeFront` accepts any
1224 $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
1225 of code units (rather than just a string or random access
1226 range). It also takes the range by `ref` and pops off the elements as it
1227 decodes them. If `numCodeUnits` is passed in, it gets set to the number
1228 of code units which were in the code point which was decoded.
1231 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or input range of code units
1233 numCodeUnits = set to number of code units processed
1239 $(LREF UTFException) if `str.front` is not the start of a valid UTF
1240 sequence. If an exception is thrown, then there is no guarantee as to
1241 the number of code units which were popped off, as it depends on the
1242 type of range being used and how many code units had to be popped off
1243 before the code point was determined to be invalid.
1245 dchar decodeFront(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1246 ref S
str, out size_t numCodeUnits
)
1247 if (!isSomeString
!S
&& isInputRange
!S
&& isSomeChar
!(ElementType
!S
))
1254 assert(isValidDchar(result
));
1258 immutable fst = str.front
;
1260 if (fst < codeUnitLimit
!S
)
1268 // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
1269 // done outside of decodeImpl, which is undesirable, since not all
1270 // overloads of decodeImpl need it. So, it should be moved back into
1271 // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521
1273 enum canIndex
= is(S
: const char[]) || isRandomAccessRange
!S
&& hasSlicing
!S
&& hasLength
!S
;
1274 immutable retval
= decodeImpl
!(canIndex
, useReplacementDchar
)(str, numCodeUnits
);
1276 // The other range types were already popped by decodeImpl.
1277 static if (isRandomAccessRange
!S
&& hasSlicing
!S
&& hasLength
!S
)
1278 str = str[numCodeUnits
.. str.length
];
1285 dchar decodeFront(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1286 ref scope S
str, out size_t numCodeUnits
) @trusted pure
1294 assert(isValidDchar(result
));
1298 if (str[0] < codeUnitLimit
!S
)
1301 immutable retval
= str[0];
1305 else static if (is(immutable S
== immutable C
[], C
))
1307 immutable retval
= decodeImpl
!(true, useReplacementDchar
)(cast(const(C
)[]) str, numCodeUnits
);
1308 str = str[numCodeUnits
.. $];
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // Convenience overload for callers that do not need the number of code
    // units consumed: forward to the two-argument overload and discard the
    // count it reports.
    size_t discardedCount;
    return decodeFront!useReplacementDchar(str, discardedCount);
}
1324 import std
.range
.primitives
;
1325 string
str = "Hello, World!";
1327 assert(str.decodeFront
== 'H' && str == "ello, World!");
1329 assert(str.decodeFront
== 'å' && str.empty
);
1332 assert(str.decodeFront(i
) == 'å' && i
== 2 && str.empty
);
1336 `decodeBack` is a variant of $(LREF decode) which specifically decodes
1337 the last code point. Unlike $(LREF decode), `decodeBack` accepts any
1338 bidirectional range of code units (rather than just a string or random access
1339 range). It also takes the range by `ref` and pops off the elements as it
1340 decodes them. If `numCodeUnits` is passed in, it gets set to the number
1341 of code units which were in the code point which was decoded.
1344 useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
1345 str = input string or bidirectional Range
1346 numCodeUnits = gives the number of code units processed
1349 A decoded UTF character.
1352 $(LREF UTFException) if `str.back` is not the end of a valid UTF
1353 sequence. If an exception is thrown, the `str` itself remains unchanged,
1354 but there is no guarantee as to the value of `numCodeUnits` (when passed).
1356 dchar decodeBack(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1357 ref S
str, out size_t numCodeUnits
)
1365 assert(isValidDchar(result
));
1369 if (str[$ - 1] < codeUnitLimit
!S
)
1372 immutable retval
= str[$ - 1];
1373 str = str[0 .. $ - 1];
1376 else static if (is(immutable S
== immutable C
[], C
))
1378 numCodeUnits
= strideBack(str);
1379 immutable newLength
= str.length
- numCodeUnits
;
1380 size_t index
= newLength
;
1381 immutable retval
= decodeImpl
!(true, useReplacementDchar
)(cast(const(C
)[]) str, index
);
1382 str = str[0 .. newLength
];
1388 dchar decodeBack(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1389 ref S
str, out size_t numCodeUnits
)
1390 if (!isSomeString
!S
&& isSomeChar
!(ElementType
!S
) && isBidirectionalRange
!S
1391 && ((isRandomAccessRange
!S
&& hasLength
!S
) ||
!isRandomAccessRange
!S
))
1398 assert(isValidDchar(result
));
1402 if (str.back
< codeUnitLimit
!S
)
1405 immutable retval
= str.back
;
1411 numCodeUnits
= strideBack(str);
1412 static if (isRandomAccessRange
!S
)
1414 size_t index
= str.length
- numCodeUnits
;
1415 immutable retval
= decodeImpl
!(true, useReplacementDchar
)(str, index
);
1416 str.popBackExactly(numCodeUnits
);
1421 alias Char
= typeof(cast() ElementType
!S
.init
);
1422 Char
[4] codeUnits
= void;
1424 for (size_t i
= numCodeUnits
; i
> 0; )
1426 codeUnits
[--i
] = tmp
.back
;
1429 const Char
[] codePoint
= codeUnits
[0 .. numCodeUnits
];
1431 immutable retval
= decodeImpl
!(true, useReplacementDchar
)(codePoint
, index
);
1439 dchar decodeBack(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(ref S
str)
1441 ||
(isRandomAccessRange
!S
&& hasLength
!S
&& isSomeChar
!(ElementType
!S
))
1442 ||
(!isRandomAccessRange
!S
&& isBidirectionalRange
!S
&& isSomeChar
!(ElementType
!S
)))
1449 assert(isValidDchar(result
));
1453 size_t numCodeUnits
;
1454 return decodeBack
!useReplacementDchar(str, numCodeUnits
);
1458 @system pure unittest
1460 import std
.range
.primitives
;
1461 string
str = "Hello, World!";
1463 assert(str.decodeBack
== '!' && str == "Hello, World");
1465 assert(str.decodeBack
== 'å' && str.empty
);
1468 assert(str.decodeBack(i
) == 'å' && i
== 2 && str.empty
);
// Upper bound (exclusive) of the "fast path" for a range of code units:
// any code unit of `S` whose value is below `codeUnitLimit!S` is guaranteed
// to be a complete, valid code point on its own.
// The constant has the same type as the range's code unit
// (char, wchar or dchar).
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    alias CodeUnit = immutable(ElementEncodingType!S);

    static if (is(CodeUnit == immutable char))
    {
        // UTF-8: everything below 0x80 is a single code unit.
        enum char codeUnitLimit = 0x80;
    }
    else static if (is(CodeUnit == immutable wchar))
    {
        // UTF-16: everything below the surrogate range is a single code unit.
        enum wchar codeUnitLimit = 0xD800;
    }
    else
    {
        // UTF-32: everything below the surrogate range is valid as-is.
        enum dchar codeUnitLimit = 0xD800;
    }
}
1485 * For strings, this function does its own bounds checking to give a
1486 * more useful error message when attempting to decode past the end of a string.
1487 * Subsequently it uses a pointer instead of an array to avoid
1488 * redundant bounds checking.
1490 * The three overloads of this operate on chars, wchars, and dchars.
1493 * canIndex = if S is indexable
1494 * useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
1495 * str = input string or Range
 *     index = starting index into str[]; incremented by number of code units processed
1501 private dchar decodeImpl(bool canIndex
, UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1502 auto ref S
str, ref size_t index
)
1504 is(S
: const char[]) ||
(isInputRange
!S
&& is(immutable ElementEncodingType
!S
== immutable char)))
1506 /* The following encodings are valid, except for the 5 and 6 byte
1510 * 1110xxxx 10xxxxxx 10xxxxxx
1511 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1512 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1513 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1516 /* Dchar bitmask for different numbers of UTF-8 code units.
1518 alias bitMask
= AliasSeq
!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);
1520 static if (is(S
: const char[]))
1521 auto pstr
= str.ptr
+ index
; // this is what makes decodeImpl() @system code
1522 else static if (isRandomAccessRange
!S
&& hasSlicing
!S
&& hasLength
!S
)
1523 auto pstr
= str[index
.. str.length
];
1527 // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
1528 // outside of decodeImpl
1529 //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);
1531 static if (canIndex
)
1533 immutable length
= str.length
- index
;
1534 ubyte fst = pstr
[0];
1538 ubyte fst = pstr
.front
;
1542 static if (!useReplacementDchar
)
1544 static if (canIndex
)
1546 static UTFException
exception(S
)(S
str, string msg
)
1548 uint[4] sequence
= void;
1553 sequence
[i
] = str[i
];
1554 } while (++i
< str.length
&& i
< 4 && (str[i
] & 0xC0) == 0x80);
1556 return new UTFException(msg
, i
).setSequence(sequence
[0 .. i
]);
1560 UTFException
invalidUTF()
1562 static if (canIndex
)
1563 return exception(pstr
[0 .. length
], "Invalid UTF-8 sequence");
1566 //We can't include the invalid sequence with input strings without
1567 //saving each of the code units along the way, and we can't do it with
1568 //forward ranges without saving the entire range. Both would incur a
1569 //cost for the decoding of every character just to provide a better
1570 //error message for the (hopefully) rare case when an invalid UTF-8
1571 //sequence is encountered, so we don't bother trying to include the
1572 //invalid sequence here, unlike with strings and sliceable ranges.
1573 return new UTFException("Invalid UTF-8 sequence");
1577 UTFException
outOfBounds()
1579 static if (canIndex
)
1580 return exception(pstr
[0 .. length
], "Attempted to decode past the end of a string");
1582 return new UTFException("Attempted to decode past the end of a string");
1586 if ((fst & 0b1100_0000) != 0b1100_0000)
1588 static if (useReplacementDchar
)
1590 ++index
; // always consume bad input to avoid infinite loops
1591 return replacementDchar
;
1594 throw invalidUTF(); // starter must have at least 2 first bits set
1597 dchar d
= fst; // upper control bits are masked out later
1600 foreach (i
; AliasSeq
!(1, 2, 3))
1603 static if (canIndex
)
1607 static if (useReplacementDchar
)
1610 return replacementDchar
;
1613 throw outOfBounds();
1620 static if (useReplacementDchar
)
1623 return replacementDchar
;
1626 throw outOfBounds();
1630 static if (canIndex
)
1638 if ((tmp
& 0xC0) != 0x80)
1640 static if (useReplacementDchar
)
1643 return replacementDchar
;
1649 d
= (d
<< 6) |
(tmp
& 0x3F);
1652 if (!(fst & 0x80)) // no more bytes
1654 d
&= bitMask
[i
]; // mask out control bits
1656 // overlong, could have been encoded with i bytes
1657 if ((d
& ~bitMask
[i
- 1]) == 0)
1659 static if (useReplacementDchar
)
1662 return replacementDchar
;
1668 // check for surrogates only needed for 3 bytes
1671 if (!isValidDchar(d
))
1673 static if (useReplacementDchar
)
1676 return replacementDchar
;
1687 static if (useReplacementDchar
)
1688 d
= replacementDchar
;
1699 static if (useReplacementDchar
)
1701 index
+= 4; // read 4 chars by now
1702 return replacementDchar
;
1708 @safe pure @nogc nothrow
    // Add tests for useReplacementDchar == yes path
1715 @safe pure @nogc nothrow:
1716 this(string s
) { this.s
= s
; }
1717 @property bool empty() { return idx
== s
.length
; }
1718 @property char front() { return s
[idx
]; }
1719 void popFront() { ++idx
; }
1724 foreach (s
; invalidUTFstrings
!char())
1728 dchar dc
= decodeImpl
!(false, Yes
.useReplacementDchar
)(r
, index
);
1729 assert(dc
== replacementDchar
);
1730 assert(1 <= index
&& index
<= s
.length
);
1734 private dchar decodeImpl(bool canIndex
, UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)
1735 (auto ref S
str, ref size_t index
)
1736 if (is(S
: const wchar[]) ||
(isInputRange
!S
&& is(immutable ElementEncodingType
!S
== immutable wchar)))
1738 static if (is(S
: const wchar[]))
1739 auto pstr
= str.ptr
+ index
;
1740 else static if (isRandomAccessRange
!S
&& hasSlicing
!S
&& hasLength
!S
)
1741 auto pstr
= str[index
.. str.length
];
1745 // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
1746 // outside of decodeImpl
1747 //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);
1749 static if (canIndex
)
1751 immutable length
= str.length
- index
;
1756 uint u
= pstr
.front
;
1760 static if (!useReplacementDchar
)
1762 UTFException
exception(string msg
)
1764 static if (canIndex
)
1765 return new UTFException(msg
).setSequence(pstr
[0]);
1767 return new UTFException(msg
);
1771 // The < case must be taken care of before decodeImpl is called.
1772 assert(u
>= 0xD800);
1776 static if (canIndex
)
1777 immutable onlyOneCodeUnit
= length
== 1;
1779 immutable onlyOneCodeUnit
= pstr
.empty
;
1781 if (onlyOneCodeUnit
)
1783 static if (useReplacementDchar
)
1786 return replacementDchar
;
1789 throw exception("surrogate UTF-16 high value past end of string");
1792 static if (canIndex
)
1793 immutable uint u2
= pstr
[1];
1796 immutable uint u2
= pstr
.front
;
1800 if (u2
< 0xDC00 || u2
> 0xDFFF)
1802 static if (useReplacementDchar
)
1803 u
= replacementDchar
;
1805 throw exception("surrogate UTF-16 low value out of range");
1808 u
= ((u
- 0xD7C0) << 10) + (u2
- 0xDC00);
1811 else if (u
>= 0xDC00 && u
<= 0xDFFF)
1813 static if (useReplacementDchar
)
1814 u
= replacementDchar
;
1816 throw exception("unpaired surrogate UTF-16 value");
1820 // Note: u+FFFE and u+FFFF are specifically permitted by the
1821 // Unicode standard for application internal use (see isValidDchar)
1823 return cast(dchar) u
;
1826 @safe pure @nogc nothrow
    // Add tests for useReplacementDchar == true path
1833 @safe pure @nogc nothrow:
1834 this(wstring s
) { this.s
= s
; }
1835 @property bool empty() { return idx
== s
.length
; }
1836 @property wchar front() { return s
[idx
]; }
1837 void popFront() { ++idx
; }
1842 foreach (s
; invalidUTFstrings
!wchar())
1846 dchar dc
= decodeImpl
!(false, Yes
.useReplacementDchar
)(r
, index
);
1847 assert(dc
== replacementDchar
);
1848 assert(1 <= index
&& index
<= s
.length
);
1852 private dchar decodeImpl(bool canIndex
, UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
, S
)(
1853 auto ref S
str, ref size_t index
)
1854 if (is(S
: const dchar[]) ||
(isInputRange
!S
&& is(immutable ElementEncodingType
!S
== immutable dchar)))
1856 static if (is(S
: const dchar[]))
1857 auto pstr
= str.ptr
;
1861 static if (is(S
: const dchar[]) || isRandomAccessRange
!S
)
1863 dchar dc
= pstr
[index
];
1864 if (!isValidDchar(dc
))
1866 static if (useReplacementDchar
)
1867 dc
= replacementDchar
;
1869 throw new UTFException("Invalid UTF-32 value").setSequence(dc
);
1876 dchar dc
= pstr
.front
;
1877 if (!isValidDchar(dc
))
1879 static if (useReplacementDchar
)
1880 dc
= replacementDchar
;
1882 throw new UTFException("Invalid UTF-32 value").setSequence(dc
);
1890 @safe pure @nogc nothrow
    // Add tests for useReplacementDchar == true path
1897 @safe pure @nogc nothrow:
1898 this(dstring s
) { this.s
= s
; }
1899 @property bool empty() { return idx
== s
.length
; }
1900 @property dchar front() { return s
[idx
]; }
1901 void popFront() { ++idx
; }
1906 foreach (s
; invalidUTFstrings
!dchar())
1910 dchar dc
= decodeImpl
!(false, Yes
.useReplacementDchar
)(r
, index
);
1911 assert(dc
== replacementDchar
);
1912 assert(1 <= index
&& index
<= s
.length
);
1917 version (StdUnittest
) private void testDecode(R
)(R range
,
1920 size_t expectedIndex
,
1921 size_t line
= __LINE__
)
1923 import core
.exception
: AssertError
;
1924 import std
.exception
: enforce
;
1925 import std
.string
: format
;
1926 import std
.traits
: isNarrowString
;
1928 static if (hasLength
!R
)
1929 immutable lenBefore
= range
.length
;
1931 static if (isRandomAccessRange
!R
&& !isNarrowString
!R
)
1934 immutable result
= decode(range
, index
);
1935 enforce(result
== expectedChar
,
1936 new AssertError(format("decode: Wrong character: %s", result
), __FILE__
, line
));
1937 enforce(index
== expectedIndex
,
1938 new AssertError(format("decode: Wrong index: %s", index
), __FILE__
, line
));
1939 static if (hasLength
!R
)
1941 enforce(range
.length
== lenBefore
,
1942 new AssertError(format("decode: length changed: %s", range
.length
), __FILE__
, line
));
1948 version (StdUnittest
) private void testDecodeFront(R
)(ref R range
,
1950 size_t expectedNumCodeUnits
,
1951 size_t line
= __LINE__
)
1953 import core
.exception
: AssertError
;
1954 import std
.exception
: enforce
;
1955 import std
.string
: format
;
1957 static if (hasLength
!R
)
1958 immutable lenBefore
= range
.length
;
1960 size_t numCodeUnits
;
1961 immutable result
= decodeFront(range
, numCodeUnits
);
1962 enforce(result
== expectedChar
,
1963 new AssertError(format("decodeFront: Wrong character: %s", result
), __FILE__
, line
));
1964 enforce(numCodeUnits
== expectedNumCodeUnits
,
1965 new AssertError(format("decodeFront: Wrong numCodeUnits: %s", numCodeUnits
), __FILE__
, line
));
1967 static if (hasLength
!R
)
1969 enforce(range
.length
== lenBefore
- numCodeUnits
,
1970 new AssertError(format("decodeFront: wrong length: %s", range
.length
), __FILE__
, line
));
1974 version (StdUnittest
) private void testDecodeBack(R
)(ref R range
,
1976 size_t expectedNumCodeUnits
,
1977 size_t line
= __LINE__
)
1979 // This condition is to allow unit testing all `decode` functions together
1980 static if (!isBidirectionalRange
!R
)
1984 import core
.exception
: AssertError
;
1985 import std
.exception
: enforce
;
1986 import std
.string
: format
;
1988 static if (hasLength
!R
)
1989 immutable lenBefore
= range
.length
;
1991 size_t numCodeUnits
;
1992 immutable result
= decodeBack(range
, numCodeUnits
);
1993 enforce(result
== expectedChar
,
1994 new AssertError(format("decodeBack: Wrong character: %s", result
), __FILE__
, line
));
1995 enforce(numCodeUnits
== expectedNumCodeUnits
,
1996 new AssertError(format("decodeBack: Wrong numCodeUnits: %s", numCodeUnits
), __FILE__
, line
));
1998 static if (hasLength
!R
)
2000 enforce(range
.length
== lenBefore
- numCodeUnits
,
2001 new AssertError(format("decodeBack: wrong length: %s", range
.length
), __FILE__
, line
));
2006 version (StdUnittest
) private void testAllDecode(R
)(R range
,
2008 size_t expectedIndex
,
2009 size_t line
= __LINE__
)
2011 testDecode(range
, 0, expectedChar
, expectedIndex
, line
);
2012 static if (isBidirectionalRange
!R
)
2014 auto rangeCopy
= range
.save
;
2015 testDecodeBack(rangeCopy
, expectedChar
, expectedIndex
, line
);
2017 testDecodeFront(range
, expectedChar
, expectedIndex
, line
);
// Unittest helper: checks that decoding `range` starting at `index` is
// rejected with a UTFException.
//
// Params:
//     range = code-unit range (or string) holding the malformed sequence
//     index = code-unit index at which decoding is attempted
//     line  = caller's line, reported in any AssertError raised here
//
// For random-access ranges, `decode` must throw, must leave `index` at its
// initial value and (when the range has a length) must not change the length.
// When the initial index is 0, `decodeFront` must throw as well.
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a "%s" for range.length: without it,
            // format() throws a FormatException for the orphan argument and
            // masks the AssertError this check is meant to raise.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    // `index` doubles as decodeFront's numCodeUnits out-parameter here; its
    // value is not inspected afterwards.
    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
// Unittest helper: checks that `decodeBack` rejects the malformed sequence at
// the end of `range` with a UTFException.
//
// Params:
//     range = code-unit range (or string) ending in a malformed sequence
//     line  = caller's line, reported in any AssertError raised here
//
// The check is only performed for random-access ranges; when the range has a
// length, that length must be unchanged after the failed decode.
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The format string needs a "%s" for range.length: without
                // it, format() throws a FormatException for the orphan
                // argument and masks the AssertError this check is meant to
                // raise.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
2075 import std
.conv
: to
;
2076 import std
.exception
;
2080 foreach (S
; AliasSeq
!(to
!string
, InputCU
!char, RandomCU
!char,
2081 (string s
) => new RefBidirCU
!char(s
),
2082 (string s
) => new RefRandomCU
!char(s
)))
2084 enum sHasLength
= hasLength
!(typeof(S("abcd")));
2087 auto range
= S("abcd");
2088 testDecode(range
, 0, 'a', 1);
2089 testDecode(range
, 1, 'b', 2);
2090 testDecodeFront(range
, 'a', 1);
2091 testDecodeFront(range
, 'b', 1);
2092 assert(decodeFront(range
) == 'c');
2093 assert(decodeFront(range
) == 'd');
2097 auto range
= S("ウェブサイト");
2098 testDecode(range
, 0, 'ウ', 3);
2099 testDecode(range
, 3, 'ェ', 6);
2100 testDecodeFront(range
, 'ウ', 3);
2101 testDecodeFront(range
, 'ェ', 3);
2102 assert(decodeFront(range
) == 'ブ');
2103 assert(decodeFront(range
) == 'サ');
2107 auto range
= S("abcd");
2108 testDecodeBack(range
, 'd', 1);
2109 testDecodeBack(range
, 'c', 1);
2110 testDecodeBack(range
, 'b', 1);
2111 testDecodeBack(range
, 'a', 1);
2115 auto range
= S("ウェブサイト");
2116 testDecodeBack(range
, 'ト', 3);
2117 testDecodeBack(range
, 'イ', 3);
2118 testDecodeBack(range
, 'サ', 3);
2119 testDecodeBack(range
, 'ブ', 3);
2122 testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
2123 testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);
2125 foreach (str; ["\xE2\x89", // too short
2129 "\xF8\x80\x80\x80\x8A",
2130 "\xFC\x80\x80\x80\x80\x8A"])
2132 testBadDecode(S(str), 0);
2133 testBadDecode(S(str), 1);
2134 testBadDecodeBack(S(str));
        // U+FFFE and U+FFFF are permitted (see isValidDchar), so these sequences must decode successfully.
2138 testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
2139 testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);
2141 //Invalid UTF-8 sequence where the first code unit isn't valid.
2142 foreach (str; ["\xED\xA0\x80",
2150 testBadDecode(S(str), 0);
2151 testBadDecodeBack(S(str));
2159 import std
.exception
;
2162 foreach (S
; AliasSeq
!((wstring s
) => s
, InputCU
!wchar, RandomCU
!wchar,
2163 (wstring s
) => new RefBidirCU
!wchar(s
),
2164 (wstring s
) => new RefRandomCU
!wchar(s
)))
2166 testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
2167 testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
2168 testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
2169 testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
2170 testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);
2172 testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
2173 testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);
2175 testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
2176 testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));
2179 auto range
= S("ウェブサイト");
2180 testDecode(range
, 0, 'ウ', 1);
2181 testDecode(range
, 1, 'ェ', 2);
2182 testDecodeFront(range
, 'ウ', 1);
2183 testDecodeFront(range
, 'ェ', 1);
2184 assert(decodeFront(range
) == 'ブ');
2185 assert(decodeFront(range
) == 'サ');
2189 auto range
= S("ウェブサイト");
2190 testDecodeBack(range
, 'ト', 1);
2191 testDecodeBack(range
, 'イ', 1);
2192 testDecodeBack(range
, 'サ', 1);
2193 testDecodeBack(range
, 'ブ', 1);
2197 foreach (S
; AliasSeq
!((wchar[] s
) => s
.idup
, RandomCU
!wchar, (wstring s
) => new RefRandomCU
!wchar(s
)))
2199 auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
2201 cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
2202 testDecode(str, 0, cast(dchar) 0x10000, 2);
2203 testDecode(str, 2, cast(dchar) 0x1400, 3);
2204 testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
2205 testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
2206 testDecodeBack(str, cast(dchar) 0x1400, 1);
2207 testDecodeBack(str, cast(dchar) 0x10000, 2);
2214 import std
.exception
;
2217 foreach (S
; AliasSeq
!((dstring s
) => s
, RandomCU
!dchar, InputCU
!dchar,
2218 (dstring s
) => new RefBidirCU
!dchar(s
),
2219 (dstring s
) => new RefRandomCU
!dchar(s
)))
2221 testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
2222 testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
2223 testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
2224 testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
2225 testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);
2227 testBadDecode(S([cast(dchar) 0xD800]), 0);
2228 testBadDecode(S([cast(dchar) 0xDFFE]), 0);
2229 testBadDecode(S([cast(dchar) 0x110000]), 0);
2231 testBadDecodeBack(S([cast(dchar) 0xD800]));
2232 testBadDecodeBack(S([cast(dchar) 0xDFFE]));
2233 testBadDecodeBack(S([cast(dchar) 0x110000]));
2236 auto range
= S("ウェブサイト");
2237 testDecode(range
, 0, 'ウ', 1);
2238 testDecode(range
, 1, 'ェ', 2);
2239 testDecodeFront(range
, 'ウ', 1);
2240 testDecodeFront(range
, 'ェ', 1);
2241 assert(decodeFront(range
) == 'ブ');
2242 assert(decodeFront(range
) == 'サ');
2246 auto range
= S("ウェブサイト");
2247 testDecodeBack(range
, 'ト', 1);
2248 testDecodeBack(range
, 'イ', 1);
2249 testDecodeBack(range
, 'サ', 1);
2250 testDecodeBack(range
, 'ブ', 1);
2254 foreach (S
; AliasSeq
!((dchar[] s
) => s
.idup
, RandomCU
!dchar, (dstring s
) => new RefRandomCU
!dchar(s
)))
2256 auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
2257 testDecode(str, 0, 0x10000, 1);
2258 testDecode(str, 1, 0x1400, 2);
2259 testDecode(str, 2, 0xB9DDE, 3);
2260 testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
2261 testDecodeBack(str, cast(dchar) 0x1400, 1);
2262 testDecodeBack(str, cast(dchar) 0x10000, 1);
2269 import std
.exception
;
2270 import std
.traits
: FunctionAttribute
, functionAttributes
, isSafe
;
2273 foreach (S
; AliasSeq
!( char[], const( char)[], string
,
2274 wchar[], const(wchar)[], wstring
,
2275 dchar[], const(dchar)[], dstring
))
2277 static assert(isSafe
!({ S
str; size_t i
= 0; decode(str, i
); }));
2278 static assert(isSafe
!({ S
str; size_t i
= 0; decodeFront(str, i
); }));
2279 static assert(isSafe
!({ S
str; decodeFront(str); }));
2280 static assert((functionAttributes
!({ S
str; size_t i
= 0; decode(str, i
); }) & FunctionAttribute
.pure_
) != 0);
2281 static assert((functionAttributes
!({
2282 S
str; size_t i
= 0; decodeFront(str, i
);
2283 }) & FunctionAttribute
.pure_
) != 0);
2284 static assert((functionAttributes
!({ S
str; decodeFront(str); }) & FunctionAttribute
.pure_
) != 0);
2285 static assert((functionAttributes
!({
2286 S
str; size_t i
= 0; decodeBack(str, i
);
2287 }) & FunctionAttribute
.pure_
) != 0);
2288 static assert((functionAttributes
!({ S
str; decodeBack(str); }) & FunctionAttribute
.pure_
) != 0);
2295 import std
.exception
;
2297 val
[0] = 0b1111_0111;
2298 val
[1] = 0b1011_1111;
2299 val
[2] = 0b1011_1111;
2300 val
[3] = 0b1011_1111;
2302 assertThrown
!UTFException((){ dchar ch
= decode(val
[], i
); }());
2304 /* =================== Encode ======================= */
// Shared error path for the encoders.
//
// Depending on the compile-time flag, either yields `replacementDchar` so the
// caller can continue with a substitute code point, or throws a UTFException
// carrying `msg` with `c` recorded as the offending sequence.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
    {
        throw new UTFException(msg).setSequence(c);
    }
    else
    {
        return replacementDchar;
    }
}
2315 Encodes `c` into the static array, `buf`, and returns the actual
2316 length of the encoded character (a number between `1` and `4` for
2317 `char[4]` buffers and a number between `1` and `2` for
2318 `wchar[2]` buffers).
2321 `UTFException` if `c` is not a valid UTF code point.
2323 size_t
encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2324 out char[4] buf
, dchar c
) @safe pure
2328 assert(isValidDchar(c
));
2329 buf
[0] = cast(char) c
;
2334 assert(isValidDchar(c
));
2335 buf
[0] = cast(char)(0xC0 |
(c
>> 6));
2336 buf
[1] = cast(char)(0x80 |
(c
& 0x3F));
2341 if (0xD800 <= c
&& c
<= 0xDFFF)
2342 c
= _utfException
!useReplacementDchar("Encoding a surrogate code point in UTF-8", c
);
2344 assert(isValidDchar(c
));
2346 buf
[0] = cast(char)(0xE0 |
(c
>> 12));
2347 buf
[1] = cast(char)(0x80 |
((c
>> 6) & 0x3F));
2348 buf
[2] = cast(char)(0x80 |
(c
& 0x3F));
2353 assert(isValidDchar(c
));
2354 buf
[0] = cast(char)(0xF0 |
(c
>> 18));
2355 buf
[1] = cast(char)(0x80 |
((c
>> 12) & 0x3F));
2356 buf
[2] = cast(char)(0x80 |
((c
>> 6) & 0x3F));
2357 buf
[3] = cast(char)(0x80 |
(c
& 0x3F));
2361 assert(!isValidDchar(c
));
2362 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-8", c
);
2369 import std
.exception
: assertThrown
;
2370 import std
.typecons
: Yes
;
2374 assert(encode(buf
, '\u0000') == 1 && buf
[0 .. 1] == "\u0000");
2375 assert(encode(buf
, '\u007F') == 1 && buf
[0 .. 1] == "\u007F");
2376 assert(encode(buf
, '\u0080') == 2 && buf
[0 .. 2] == "\u0080");
2377 assert(encode(buf
, '\uE000') == 3 && buf
[0 .. 3] == "\uE000");
2378 assert(encode(buf
, 0xFFFE) == 3 && buf
[0 .. 3] == "\xEF\xBF\xBE");
2379 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2381 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2383 assert(slice
.decodeFront
== replacementDchar
);
2389 import std
.exception
: assertThrown
;
2390 import std
.typecons
: Yes
;
2394 assert(encode(buf
, '\u0000') == 1 && buf
[0 .. 1] == "\u0000");
2395 assert(encode(buf
, '\uD7FF') == 1 && buf
[0 .. 1] == "\uD7FF");
2396 assert(encode(buf
, '\uE000') == 1 && buf
[0 .. 1] == "\uE000");
2397 assert(encode(buf
, '\U00010000') == 2 && buf
[0 .. 2] == "\U00010000");
2398 assert(encode(buf
, '\U0010FFFF') == 2 && buf
[0 .. 2] == "\U0010FFFF");
2399 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2401 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2403 assert(slice
.decodeFront
== replacementDchar
);
2409 import std
.exception
: assertThrown
;
2410 import std
.typecons
: Yes
;
2414 assert(encode(buf
, '\u0000') == 1 && buf
[0] == '\u0000');
2415 assert(encode(buf
, '\uD7FF') == 1 && buf
[0] == '\uD7FF');
2416 assert(encode(buf
, '\uE000') == 1 && buf
[0] == '\uE000');
2417 assert(encode(buf
, '\U0010FFFF') == 1 && buf
[0] == '\U0010FFFF');
2418 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2420 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2421 assert(buf
[0] == replacementDchar
);
2426 import std
.exception
;
2431 assert(encode(buf
, '\u0000') == 1 && buf
[0 .. 1] == "\u0000");
2432 assert(encode(buf
, '\u007F') == 1 && buf
[0 .. 1] == "\u007F");
2433 assert(encode(buf
, '\u0080') == 2 && buf
[0 .. 2] == "\u0080");
2434 assert(encode(buf
, '\u07FF') == 2 && buf
[0 .. 2] == "\u07FF");
2435 assert(encode(buf
, '\u0800') == 3 && buf
[0 .. 3] == "\u0800");
2436 assert(encode(buf
, '\uD7FF') == 3 && buf
[0 .. 3] == "\uD7FF");
2437 assert(encode(buf
, '\uE000') == 3 && buf
[0 .. 3] == "\uE000");
2438 assert(encode(buf
, 0xFFFE) == 3 && buf
[0 .. 3] == "\xEF\xBF\xBE");
2439 assert(encode(buf
, 0xFFFF) == 3 && buf
[0 .. 3] == "\xEF\xBF\xBF");
2440 assert(encode(buf
, '\U00010000') == 4 && buf
[0 .. 4] == "\U00010000");
2441 assert(encode(buf
, '\U0010FFFF') == 4 && buf
[0 .. 4] == "\U0010FFFF");
2443 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2444 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2445 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2446 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2447 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2449 assert(encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000) == buf
.stride
);
2450 enum replacementDcharString
= "\uFFFD";
2451 assert(buf
[0 .. replacementDcharString
.length
] == replacementDcharString
);
2457 size_t
encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2458 out wchar[2] buf
, dchar c
) @safe pure
2462 if (0xD800 <= c
&& c
<= 0xDFFF)
2463 c
= _utfException
!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c
);
2465 assert(isValidDchar(c
));
2467 buf
[0] = cast(wchar) c
;
2472 assert(isValidDchar(c
));
2473 buf
[0] = cast(wchar)((((c
- 0x10000) >> 10) & 0x3FF) + 0xD800);
2474 buf
[1] = cast(wchar)(((c
- 0x10000) & 0x3FF) + 0xDC00);
2478 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-16", c
);
2484 import std
.exception
;
2489 assert(encode(buf
, '\u0000') == 1 && buf
[0 .. 1] == "\u0000");
2490 assert(encode(buf
, '\uD7FF') == 1 && buf
[0 .. 1] == "\uD7FF");
2491 assert(encode(buf
, '\uE000') == 1 && buf
[0 .. 1] == "\uE000");
2492 assert(encode(buf
, 0xFFFE) == 1 && buf
[0] == 0xFFFE);
2493 assert(encode(buf
, 0xFFFF) == 1 && buf
[0] == 0xFFFF);
2494 assert(encode(buf
, '\U00010000') == 2 && buf
[0 .. 2] == "\U00010000");
2495 assert(encode(buf
, '\U0010FFFF') == 2 && buf
[0 .. 2] == "\U0010FFFF");
2497 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2498 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2499 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2500 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2501 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2503 assert(encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000) == buf
.stride
);
2504 assert(buf
.front
== replacementDchar
);
2510 size_t
encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2511 out dchar[1] buf
, dchar c
) @safe pure
2513 if ((0xD800 <= c
&& c
<= 0xDFFF) ||
0x10FFFF < c
)
2514 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-32", c
);
2516 assert(isValidDchar(c
));
2523 import std
.exception
;
2528 encode(buf
, '\u0000'); assert(buf
[0] == '\u0000');
2529 encode(buf
, '\uD7FF'); assert(buf
[0] == '\uD7FF');
2530 encode(buf
, '\uE000'); assert(buf
[0] == '\uE000');
2531 encode(buf
, 0xFFFE); assert(buf
[0] == 0xFFFE);
2532 encode(buf
, 0xFFFF); assert(buf
[0] == 0xFFFF);
2533 encode(buf
, '\U0010FFFF'); assert(buf
[0] == '\U0010FFFF');
2535 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2536 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2537 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2538 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2539 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2541 assert(encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000) == buf
.stride
);
2542 assert(buf
.front
== replacementDchar
);
2548 Encodes `c` in `str`'s encoding and appends it to `str`.
2551 `UTFException` if `c` is not a valid UTF code point.
2553 void encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2554 ref scope char[] str, dchar c
) @safe pure
2558 assert(isValidDchar(c
));
2559 str ~= cast(char) c
;
2568 assert(isValidDchar(c
));
2569 buf
[0] = cast(char)(0xC0 |
(c
>> 6));
2570 buf
[1] = cast(char)(0x80 |
(c
& 0x3F));
2573 else if (c
<= 0xFFFF)
2575 if (0xD800 <= c
&& c
<= 0xDFFF)
2576 c
= _utfException
!useReplacementDchar("Encoding a surrogate code point in UTF-8", c
);
2578 assert(isValidDchar(c
));
2580 buf
[0] = cast(char)(0xE0 |
(c
>> 12));
2581 buf
[1] = cast(char)(0x80 |
((c
>> 6) & 0x3F));
2582 buf
[2] = cast(char)(0x80 |
(c
& 0x3F));
2585 else if (c
<= 0x10FFFF)
2587 assert(isValidDchar(c
));
2588 buf
[0] = cast(char)(0xF0 |
(c
>> 18));
2589 buf
[1] = cast(char)(0x80 |
((c
>> 12) & 0x3F));
2590 buf
[2] = cast(char)(0x80 |
((c
>> 6) & 0x3F));
2591 buf
[3] = cast(char)(0x80 |
(c
& 0x3F));
2596 assert(!isValidDchar(c
));
2597 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-8", c
);
2607 char[] s
= "abcd".dup
;
2612 assert(s
.length
== 5);
2613 assert(s
== "abcda");
2615 assert(s
.length
== 7);
2616 assert(s
== "abcdaø");
2621 import std
.exception
;
2625 char[] s
= "abcd".dup
;
2626 encode(s
, cast(dchar)'a');
2627 assert(s
.length
== 5);
2628 assert(s
== "abcda");
2630 encode(s
, cast(dchar)'\u00A9');
2631 assert(s
.length
== 7);
2632 assert(s
== "abcda\xC2\xA9");
2633 //assert(s == "abcda\u00A9"); // BUG: fix compiler
2635 encode(s
, cast(dchar)'\u2260');
2636 assert(s
.length
== 10);
2637 assert(s
== "abcda\xC2\xA9\xE2\x89\xA0");
2643 import std
.exception
;
2648 encode(buf
, '\u0000'); assert(buf
[0 .. $] == "\u0000");
2649 encode(buf
, '\u007F'); assert(buf
[1 .. $] == "\u007F");
2650 encode(buf
, '\u0080'); assert(buf
[2 .. $] == "\u0080");
2651 encode(buf
, '\u07FF'); assert(buf
[4 .. $] == "\u07FF");
2652 encode(buf
, '\u0800'); assert(buf
[6 .. $] == "\u0800");
2653 encode(buf
, '\uD7FF'); assert(buf
[9 .. $] == "\uD7FF");
2654 encode(buf
, '\uE000'); assert(buf
[12 .. $] == "\uE000");
2655 encode(buf
, 0xFFFE); assert(buf
[15 .. $] == "\xEF\xBF\xBE");
2656 encode(buf
, 0xFFFF); assert(buf
[18 .. $] == "\xEF\xBF\xBF");
2657 encode(buf
, '\U00010000'); assert(buf
[21 .. $] == "\U00010000");
2658 encode(buf
, '\U0010FFFF'); assert(buf
[25 .. $] == "\U0010FFFF");
2660 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2661 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2662 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2663 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2664 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2666 enum replacementDcharString
= "\uFFFD";
2667 enum rdcslen
= replacementDcharString
.length
;
2668 assert(buf
[$ - rdcslen
.. $] != replacementDcharString
);
2669 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2670 assert(buf
[$ - rdcslen
.. $] == replacementDcharString
);
2675 void encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2676 ref scope wchar[] str, dchar c
) @safe pure
2680 if (0xD800 <= c
&& c
<= 0xDFFF)
2681 c
= _utfException
!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c
);
2683 assert(isValidDchar(c
));
2685 str ~= cast(wchar) c
;
2687 else if (c
<= 0x10FFFF)
2691 assert(isValidDchar(c
));
2692 buf
[0] = cast(wchar)((((c
- 0x10000) >> 10) & 0x3FF) + 0xD800);
2693 buf
[1] = cast(wchar)(((c
- 0x10000) & 0x3FF) + 0xDC00);
2698 assert(!isValidDchar(c
));
2699 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-16", c
);
2706 import std
.exception
;
2711 encode(buf
, '\u0000'); assert(buf
[0] == '\u0000');
2712 encode(buf
, '\uD7FF'); assert(buf
[1] == '\uD7FF');
2713 encode(buf
, '\uE000'); assert(buf
[2] == '\uE000');
2714 encode(buf
, 0xFFFE); assert(buf
[3] == 0xFFFE);
2715 encode(buf
, 0xFFFF); assert(buf
[4] == 0xFFFF);
2716 encode(buf
, '\U00010000'); assert(buf
[5 .. $] == "\U00010000");
2717 encode(buf
, '\U0010FFFF'); assert(buf
[7 .. $] == "\U0010FFFF");
2719 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2720 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2721 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2722 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2723 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2725 assert(buf
.back
!= replacementDchar
);
2726 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2727 assert(buf
.back
== replacementDchar
);
2732 void encode(UseReplacementDchar useReplacementDchar
= No
.useReplacementDchar
)(
2733 ref scope dchar[] str, dchar c
) @safe pure
2735 if ((0xD800 <= c
&& c
<= 0xDFFF) ||
0x10FFFF < c
)
2736 c
= _utfException
!useReplacementDchar("Encoding an invalid code point in UTF-32", c
);
2738 assert(isValidDchar(c
));
2744 import std
.exception
;
2749 encode(buf
, '\u0000'); assert(buf
[0] == '\u0000');
2750 encode(buf
, '\uD7FF'); assert(buf
[1] == '\uD7FF');
2751 encode(buf
, '\uE000'); assert(buf
[2] == '\uE000');
2752 encode(buf
, 0xFFFE); assert(buf
[3] == 0xFFFE);
2753 encode(buf
, 0xFFFF); assert(buf
[4] == 0xFFFF);
2754 encode(buf
, '\U0010FFFF'); assert(buf
[5] == '\U0010FFFF');
2756 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xD800));
2757 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDBFF));
2758 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDC00));
2759 assertThrown
!UTFException(encode(buf
, cast(dchar) 0xDFFF));
2760 assertThrown
!UTFException(encode(buf
, cast(dchar) 0x110000));
2762 assert(buf
.back
!= replacementDchar
);
2763 encode
!(Yes
.useReplacementDchar
)(buf
, cast(dchar) 0x110000);
2764 assert(buf
.back
== replacementDchar
);
2770 Returns the number of code units that are required to encode the code point
2771 `c` when `C` is the character type used to encode it.
2773 ubyte codeLength(C
)(dchar c
) @safe pure nothrow @nogc
2776 static if (C
.sizeof
== 1)
2778 if (c
<= 0x7F) return 1;
2779 if (c
<= 0x7FF) return 2;
2780 if (c
<= 0xFFFF) return 3;
2781 if (c
<= 0x10FFFF) return 4;
2784 else static if (C
.sizeof
== 2)
2786 return c
<= 0xFFFF ?
1 : 2;
2790 static assert(C
.sizeof
== 4);
2796 @safe pure nothrow @nogc unittest
2798 assert(codeLength
!char('a') == 1);
2799 assert(codeLength
!wchar('a') == 1);
2800 assert(codeLength
!dchar('a') == 1);
2802 assert(codeLength
!char('\U0010FFFF') == 4);
2803 assert(codeLength
!wchar('\U0010FFFF') == 2);
2804 assert(codeLength
!dchar('\U0010FFFF') == 1);
2809 Returns the number of code units that are required to encode `input`
2810 in a string whose character type is `C`. This is particularly useful
2811 when slicing one string with the length of another and the two string
2812 types use different character types.
2815 C = the character type to get the encoding length for
2816 input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
2817 to calculate the encoding length from
2819 The number of code units in `input` when encoded to `C`
2821 size_t
codeLength(C
, InputRange
)(InputRange input
)
2822 if (isSomeFiniteCharInputRange
!InputRange
)
2824 alias EncType
= typeof(cast() ElementEncodingType
!InputRange
.init
);
2825 static if (isSomeString
!InputRange
&& is(EncType
== C
) && is(typeof(input
.length
)))
2826 return input
.length
;
2831 foreach (c
; input
.byDchar
)
2832 total
+= codeLength
!C(c
);
2841 assert(codeLength
!char("hello world") ==
2842 "hello world".length
);
2843 assert(codeLength
!wchar("hello world") ==
2844 "hello world"w
.length
);
2845 assert(codeLength
!dchar("hello world") ==
2846 "hello world"d
.length
);
2848 assert(codeLength
!char(`プログラミング`) ==
2850 assert(codeLength
!wchar(`プログラミング`) ==
2852 assert(codeLength
!dchar(`プログラミング`) ==
2855 string haystack
= `Être sans la verité, ça, ce ne serait pas bien.`;
2856 wstring needle
= `Être sans la verité`;
2857 assert(haystack
[codeLength
!char(needle
) .. $] ==
2858 `, ça, ce ne serait pas bien.`);
2863 import std
.algorithm
.iteration
: filter
;
2864 import std
.conv
: to
;
2865 import std
.exception
;
2869 foreach (S
; AliasSeq
!( char[], const char[], string
,
2870 wchar[], const wchar[], wstring
,
2871 dchar[], const dchar[], dstring
))
2873 foreach (C
; AliasSeq
!(char, wchar, dchar))
2875 assert(codeLength
!C(to
!S("Walter Bright")) == to
!(C
[])("Walter Bright").length
);
2876 assert(codeLength
!C(to
!S(`言語`)) == to
!(C
[])(`言語`).length
);
2877 assert(codeLength
!C(to
!S(`ウェブサイト@La_Verité.com`)) ==
2878 to
!(C
[])(`ウェブサイト@La_Verité.com`).length
);
2879 assert(codeLength
!C(to
!S(`ウェブサイト@La_Verité.com`).filter
!(x
=> true)()) ==
2880 to
!(C
[])(`ウェブサイト@La_Verité.com`).length
);
2887 Internal helper function:
2889 Returns true if it is safe to search for the Codepoint `c` inside
2890 code units, without decoding.
2892 This is a runtime check that is used as an optimization in various functions,
2893 particularly, in `std.string`.
2895 package bool canSearchInCodeUnits(C
)(dchar c
)
2898 static if (C
.sizeof
== 1)
2900 else static if (C
.sizeof
== 2)
2901 return c
<= 0xD7FF ||
(0xE000 <= c
&& c
<= 0xFFFF);
2902 else static if (C
.sizeof
== 4)
2909 assert( canSearchInCodeUnits
! char('a'));
2910 assert( canSearchInCodeUnits
!wchar('a'));
2911 assert( canSearchInCodeUnits
!dchar('a'));
2912 assert(!canSearchInCodeUnits
! char('ö')); //Important test: ö <= 0xFF
2913 assert(!canSearchInCodeUnits
! char(cast(char)'ö')); //Important test: ö <= 0xFF
2914 assert( canSearchInCodeUnits
!wchar('ö'));
2915 assert( canSearchInCodeUnits
!dchar('ö'));
2916 assert(!canSearchInCodeUnits
! char('日'));
2917 assert( canSearchInCodeUnits
!wchar('日'));
2918 assert( canSearchInCodeUnits
!dchar('日'));
2919 assert(!canSearchInCodeUnits
!wchar(cast(wchar) 0xDA00));
2920 assert( canSearchInCodeUnits
!dchar(cast(dchar) 0xDA00));
2921 assert(!canSearchInCodeUnits
! char('\U00010001'));
2922 assert(!canSearchInCodeUnits
!wchar('\U00010001'));
2923 assert( canSearchInCodeUnits
!dchar('\U00010001'));
2926 /* =================== Validation ======================= */
2929 Checks to see if `str` is well-formed unicode or not.
2932 `UTFException` if `str` is not well-formed.
2934 void validate(S
)(in S
str) @safe pure
2937 immutable len
= str.length
;
2938 for (size_t i
= 0; i
< len
; )
2947 import std
.exception
: assertThrown
;
2948 char[] a
= [167, 133, 175];
2949 assertThrown
!UTFException(validate(a
));
2952 // https://issues.dlang.org/show_bug.cgi?id=12923
2955 import std
.exception
;
2957 char[3]a
=[167, 133, 175];
2963 * Encodes the elements of `s` to UTF-8 and returns a newly allocated
2964 * string of the elements.
2967 * s = the string to encode
2971 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
2973 string
toUTF8(S
)(S s
)
2974 if (isSomeFiniteCharInputRange
!S
)
2976 return toUTFImpl
!string(s
);
2982 import std
.algorithm
.comparison
: equal
;
2984 // The ø is represented by two UTF-8 code units
2985 assert("Hellø"w
.toUTF8
.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
2987 // 𐐷 is four code units in UTF-8
2988 assert("𐐷"d
.toUTF8
.equal([0xF0, 0x90, 0x90, 0xB7]));
2991 @system pure unittest
2993 import std
.algorithm
.comparison
: equal
;
2994 import std
.internal
.test.dummyrange
: ReferenceInputRange
;
2996 alias RT
= ReferenceInputRange
!(ElementType
!(string
));
2997 auto r1
= new RT("Hellø");
2998 auto r2
= new RT("𐐷");
3000 assert(r1
.toUTF8
.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
3001 assert(r2
.toUTF8
.equal([0xF0, 0x90, 0x90, 0xB7]));
3005 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
3006 * `wstring` of the elements.
3009 * s = the range to encode
3013 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
3015 wstring
toUTF16(S
)(S s
)
3016 if (isSomeFiniteCharInputRange
!S
)
3018 return toUTFImpl
!wstring(s
);
3024 import std
.algorithm
.comparison
: equal
;
3026 // these graphemes are two code units in UTF-16 and one in UTF-32
3027 assert("𤭢"d
.length
== 1);
3028 assert("𐐷"d
.length
== 1);
3030 assert("𤭢"d
.toUTF16
.equal([0xD852, 0xDF62]));
3031 assert("𐐷"d
.toUTF16
.equal([0xD801, 0xDC37]));
3034 @system pure unittest
3036 import std
.algorithm
.comparison
: equal
;
3037 import std
.internal
.test.dummyrange
: ReferenceInputRange
;
3039 alias RT
= ReferenceInputRange
!(ElementType
!(string
));
3040 auto r1
= new RT("𤭢");
3041 auto r2
= new RT("𐐷");
3043 assert(r1
.toUTF16
.equal([0xD852, 0xDF62]));
3044 assert(r2
.toUTF16
.equal([0xD801, 0xDC37]));
3049 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
3050 * `dstring` of the elements.
3053 * s = the range to encode
3057 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
3059 dstring
toUTF32(S
)(scope S s
)
3060 if (isSomeFiniteCharInputRange
!S
)
3062 return toUTFImpl
!dstring(s
);
3068 import std
.algorithm
.comparison
: equal
;
3070 // these graphemes are two code units in UTF-16 and one in UTF-32
3071 assert("𤭢"w
.length
== 2);
3072 assert("𐐷"w
.length
== 2);
3074 assert("𤭢"w
.toUTF32
.equal([0x00024B62]));
3075 assert("𐐷"w
.toUTF32
.equal([0x00010437]));
3078 private T
toUTFImpl(T
, S
)(scope S s
)
3080 static if (is(S
: T
))
3086 import std
.array
: appender
;
3087 auto app
= appender
!T();
3089 static if (is(S
== C
[], C
) || hasLength
!S
)
3090 app
.reserve(s
.length
);
3092 ElementEncodingType
!T e
= void;
3093 foreach (c
; s
.byUTF
!(typeof(cast() ElementEncodingType
!T
.init
)))
3100 /* =================== toUTFz ======================= */
3103 Returns a C-style zero-terminated string equivalent to `str`. `str`
3104 must not contain embedded `'\0'`'s as any C function will treat the first
3105 `'\0'` that it sees as the end of the string. If `str.empty` is
3106 `true`, then a string containing only `'\0'` is returned.
3108 `toUTFz` accepts any type of string and is templated on the type of
3109 character pointer that you wish to convert to. It will avoid allocating a
3110 new string if it can, but there's a decent chance that it will end up having
3111 to allocate a new string - particularly when dealing with character types
3114 $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then if
3115 anything alters the character one past the end of `str` (which is the
3116 `'\0'` character terminating the string), then the string won't be
3117 zero-terminated anymore. The most likely scenarios for that are if you
3118 append to `str` and no reallocation takes place or when `str` is a
3119 slice of a larger array, and you alter the character in the larger array
3120 which is one character past the end of `str`. Another case where it could
3121 occur would be if you had a mutable character array immediately after
3122 `str` in memory (for example, if they're member variables in a
3123 user-defined type with one declared right after the other) and that
3124 character array happened to start with `'\0'`. Such scenarios will never
3125 occur if you immediately use the zero-terminated string after calling
3126 `toUTFz` and the C function using it doesn't keep a reference to it.
3127 Also, they are unlikely to occur even if you save the zero-terminated string
3128 (the cases above would be among the few examples of where it could happen).
3129 However, if you save the zero-terminated string and want to be absolutely
3130 certain that the string stays zero-terminated, then simply append a
3131 `'\0'` to the string and use its `ptr` property rather than calling
3134 $(RED Warning 2:) When passing a character pointer to a C function, and the
3135 C function keeps it around for any reason, make sure that you keep a
3136 reference to it in your D code. Otherwise, it may go away during a garbage
3137 collection cycle and cause a nasty bug when the C code tries to use it.
3140 if (is(P
== C
*, C
) && isSomeChar
!C
)
3142 P
toUTFz(S
)(S
str) @safe pure
3145 return toUTFzImpl
!(P
, S
)(str);
3152 auto p1
= toUTFz
!(char*)("hello world");
3153 auto p2
= toUTFz
!(const(char)*)("hello world");
3154 auto p3
= toUTFz
!(immutable(char)*)("hello world");
3155 auto p4
= toUTFz
!(char*)("hello world"d
);
3156 auto p5
= toUTFz
!(const(wchar)*)("hello world");
3157 auto p6
= toUTFz
!(immutable(dchar)*)("hello world"w
);
3160 private P
toUTFzImpl(P
, S
)(return scope S
str) @safe pure
3161 if (is(immutable typeof(*P
.init
) == typeof(str[0])))
3162 //immutable(C)[] -> C*, const(C)*, or immutable(C)*
3166 typeof(*P
.init
)[] retval
= ['\0'];
3168 auto trustedPtr() @trusted { return retval
.ptr
; }
3169 return trustedPtr();
3172 alias C
= typeof(cast() ElementEncodingType
!S
.init
);
3174 //If the P is mutable, then we have to make a copy.
3175 static if (is(typeof(cast() *P
.init
) == typeof(*P
.init
)))
3177 return toUTFzImpl
!(P
, const(C
)[])(cast(const(C
)[])str);
3183 auto trustedPtrAdd(S s
) @trusted { return s
.ptr
+ s
.length
; }
3184 immutable p
= trustedPtrAdd(str);
3186 // Peek past end of str, if it's 0, no conversion necessary.
3187 // Note that the compiler will put a 0 past the end of static
3188 // strings, and the storage allocator will put a 0 past the end
3189 // of newly allocated char[]'s.
3190 // Is p dereferenceable? A simple test: if p points to an
3191 // address that is a multiple of 4, then conservatively assume the pointer
3192 // might be pointing to a new block of memory, which might be
3193 // unreadable. Otherwise, it's definitely pointing to valid
3195 if ((cast(size_t
) p
& 3) && *p
== '\0')
3199 return toUTFzImpl
!(P
, const(C
)[])(cast(const(C
)[])str);
3203 private P
toUTFzImpl(P
, S
)(return scope S
str) @safe pure
3204 if (is(typeof(str[0]) C
) && is(immutable typeof(*P
.init
) == immutable C
) && !is(C
== immutable))
3205 //C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
3207 alias InChar
= typeof(str[0]);
3208 alias UInChar
= typeof(cast() str[0]); // unqualified version of InChar
3209 alias OutChar
= typeof(*P
.init
);
3210 alias UOutChar
= typeof(cast() *P
.init
); // unqualified version
3212 //const(C)[] -> const(C)* or
3213 //C[] -> C* or const(C)*
3214 static if (( is(const(UInChar
) == InChar
) && is( const(UOutChar
) == OutChar
)) ||
3215 (!is(const(UInChar
) == InChar
) && !is(immutable(UOutChar
) == OutChar
)))
3219 auto trustedPtrAdd(S s
) @trusted { return s
.ptr
+ s
.length
; }
3220 auto p
= trustedPtrAdd(str);
3222 if ((cast(size_t
) p
& 3) && *p
== '\0')
3229 //const(C)[] -> C* or immutable(C)* or
3230 //C[] -> immutable(C)*
3233 import std
.array
: uninitializedArray
;
3234 auto copy
= uninitializedArray
!(UOutChar
[])(str.length
+ 1);
3235 copy
[0 .. $ - 1] = str[];
3238 auto trustedCast(typeof(copy
) c
) @trusted { return cast(P
) c
.ptr
; }
3239 return trustedCast(copy
);
3243 private P
toUTFzImpl(P
, S
)(S
str) @safe pure
3244 if (!is(immutable typeof(*P
.init
) == immutable typeof(str[0])))
3245 //C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
3247 import std
.array
: appender
;
3248 auto retval
= appender
!(typeof(*P
.init
)[])();
3250 foreach (dchar c
; str)
3254 return () @trusted { return cast(P
) retval
.data
.ptr
; } ();
3259 import core
.exception
: AssertError
;
3260 import std
.algorithm
;
3261 import std
.conv
: to
;
3262 import std
.exception
;
3263 import std
.string
: format
;
3267 foreach (S
; AliasSeq
!(string
, wstring
, dstring
))
3269 alias C
= Unqual
!(ElementEncodingType
!S
);
3271 auto s1
= to
!S("hello\U00010143\u0100\U00010143");
3272 auto temp
= new C
[](s1
.length
+ 1);
3273 temp
[0 .. $ - 1] = s1
[0 .. $];
3276 auto trustedAssumeUnique(T
)(T t
) @trusted { return assumeUnique(t
); }
3277 auto s2
= trustedAssumeUnique(temp
);
3280 void trustedCStringAssert(P
, S
)(S s
) @trusted
3282 auto p
= toUTFz
!P(s
);
3283 assert(p
[0 .. s
.length
] == s
);
3284 assert(p
[s
.length
] == '\0');
3287 foreach (P
; AliasSeq
!(C
*, const(C
)*, immutable(C
)*))
3289 trustedCStringAssert
!P(s1
);
3290 trustedCStringAssert
!P(s2
);
3295 static void test(P
, S
)(S s
, size_t line
= __LINE__
) @trusted
3297 static size_t
zeroLen(C
)(const(C
)* ptr
) @trusted
3300 while (*ptr
!= '\0') { ++ptr
; ++len
; }
3304 auto p
= toUTFz
!P(s
);
3305 immutable len
= zeroLen(p
);
3306 enforce(cmp(s
, p
[0 .. len
]) == 0,
3307 new AssertError(format("Unit test failed: %s %s", P
.stringof
, S
.stringof
),
3313 foreach (P
; AliasSeq
!(wchar*, const(wchar)*, immutable(wchar)*,
3314 dchar*, const(dchar)*, immutable(dchar)*))
3316 test!P("hello\U00010143\u0100\U00010143");
3318 foreach (P
; AliasSeq
!( char*, const( char)*, immutable( char)*,
3319 dchar*, const(dchar)*, immutable(dchar)*))
3321 test!P("hello\U00010143\u0100\U00010143"w
);
3323 foreach (P
; AliasSeq
!( char*, const( char)*, immutable( char)*,
3324 wchar*, const(wchar)*, immutable(wchar)*))
3326 test!P("hello\U00010143\u0100\U00010143"d
);
3328 foreach (S
; AliasSeq
!( char[], const( char)[],
3329 wchar[], const(wchar)[],
3330 dchar[], const(dchar)[]))
3332 auto s
= to
!S("hello\U00010143\u0100\U00010143");
3334 foreach (P
; AliasSeq
!( char*, const( char)*, immutable( char)*,
3335 wchar*, const(wchar)*, immutable(wchar)*,
3336 dchar*, const(dchar)*, immutable(dchar)*))
3346 `toUTF16z` is a convenience function for `toUTFz!(const(wchar)*)`.
3348 Encodes string `str` into UTF-16 and returns the encoded string.
3349 `toUTF16z` is suitable for calling the 'W' functions in the Win32 API
3350 that take an `LPCWSTR` argument.
3352 const(wchar)* toUTF16z(C
)(const(C
)[] str) @safe pure
3355 return toUTFz
!(const(wchar)*)(str);
3361 string
str = "Hello, World!";
3362 const(wchar)* p
= str.toUTF16z
;
3363 assert(p
[str.length
] == '\0');
3368 import std
.conv
: to
;
3369 //toUTFz is already thoroughly tested, so this will just verify that
3370 //toUTF16z compiles properly for the various string types.
3371 foreach (S
; AliasSeq
!(string
, wstring
, dstring
))
3372 assert(toUTF16z(to
!S("hello world")) !is null);
3376 /* ================================ tests ================================== */
3380 import std
.exception
;
3384 assert(toUTF16("hello"c
) == "hello");
3385 assert(toUTF32("hello"c
) == "hello");
3386 assert(toUTF8 ("hello"w
) == "hello");
3387 assert(toUTF32("hello"w
) == "hello");
3388 assert(toUTF8 ("hello"d
) == "hello");
3389 assert(toUTF16("hello"d
) == "hello");
3391 assert(toUTF16("hel\u1234o"c
) == "hel\u1234o");
3392 assert(toUTF32("hel\u1234o"c
) == "hel\u1234o");
3393 assert(toUTF8 ("hel\u1234o"w
) == "hel\u1234o");
3394 assert(toUTF32("hel\u1234o"w
) == "hel\u1234o");
3395 assert(toUTF8 ("hel\u1234o"d
) == "hel\u1234o");
3396 assert(toUTF16("hel\u1234o"d
) == "hel\u1234o");
3398 assert(toUTF16("he\U0010AAAAllo"c
) == "he\U0010AAAAllo");
3399 assert(toUTF32("he\U0010AAAAllo"c
) == "he\U0010AAAAllo");
3400 assert(toUTF8 ("he\U0010AAAAllo"w
) == "he\U0010AAAAllo");
3401 assert(toUTF32("he\U0010AAAAllo"w
) == "he\U0010AAAAllo");
3402 assert(toUTF8 ("he\U0010AAAAllo"d
) == "he\U0010AAAAllo");
3403 assert(toUTF16("he\U0010AAAAllo"d
) == "he\U0010AAAAllo");
3409 Returns the total number of code points encoded in `str`.
3411 Supersedes: This function supersedes $(LREF toUCSindex).
3413 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
3416 `UTFException` if `str` is not well-formed.
3418 size_t
count(C
)(const(C
)[] str) @safe pure nothrow @nogc
3421 return walkLength(str.byDchar
);
3425 @safe pure nothrow @nogc unittest
3427 assert(count("") == 0);
3428 assert(count("a") == 1);
3429 assert(count("abc") == 3);
3430 assert(count("\u20AC100") == 4);
3433 @safe pure nothrow @nogc unittest
3435 import std
.exception
;
3438 assert(count("") == 0);
3439 assert(count("a") == 1);
3440 assert(count("abc") == 3);
3441 assert(count("\u20AC100") == 4);
3446 // Ranges of code units for testing.
3447 version (StdUnittest
)
3452 import std
.conv
: to
;
3453 @property bool empty() { return _str
.empty
; }
3454 @property C
front() { return _str
[0]; }
3455 void popFront() { _str
= _str
[1 .. $]; }
3457 this(inout(C
)[] str)
3459 _str
= to
!(C
[])(str);
3467 import std
.conv
: to
;
3468 @property bool empty() { return _str
.empty
; }
3469 @property C
front() { return _str
[0]; }
3470 void popFront() { _str
= _str
[1 .. $]; }
3471 @property C
back() { return _str
[$ - 1]; }
3472 void popBack() { _str
= _str
[0 .. $ - 1]; }
3473 @property auto save() { return BidirCU(_str
); }
3474 @property size_t
length() { return _str
.length
; }
3476 this(inout(C
)[] str)
3478 _str
= to
!(C
[])(str);
3486 import std
.conv
: to
;
3487 @property bool empty() { return _str
.empty
; }
3488 @property C
front() { return _str
[0]; }
3489 void popFront() { _str
= _str
[1 .. $]; }
3490 @property C
back() { return _str
[$ - 1]; }
3491 void popBack() { _str
= _str
[0 .. $ - 1]; }
3492 @property auto save() { return RandomCU(_str
); }
3493 @property size_t
length() { return _str
.length
; }
3494 C
opIndex(size_t i
) { return _str
[i
]; }
3495 auto opSlice(size_t i
, size_t j
) { return RandomCU(_str
[i
.. j
]); }
3497 this(inout(C
)[] str)
3499 _str
= to
!(C
[])(str);
3507 import std
.conv
: to
;
3508 @property bool empty() { return _str
.empty
; }
3509 @property C
front() { return _str
[0]; }
3510 void popFront() { _str
= _str
[1 .. $]; }
3511 @property C
back() { return _str
[$ - 1]; }
3512 void popBack() { _str
= _str
[0 .. $ - 1]; }
3513 @property auto save() { return new RefBidirCU(_str
); }
3514 @property size_t
length() { return _str
.length
; }
3516 this(inout(C
)[] str)
3518 _str
= to
!(C
[])(str);
3524 class RefRandomCU(C
)
3526 import std
.conv
: to
;
3527 @property bool empty() { return _str
.empty
; }
3528 @property C
front() { return _str
[0]; }
3529 void popFront() { _str
= _str
[1 .. $]; }
3530 @property C
back() { return _str
[$ - 1]; }
3531 void popBack() { _str
= _str
[0 .. $ - 1]; }
3532 @property auto save() { return new RefRandomCU(_str
); }
3533 @property size_t
length() { return _str
.length
; }
3534 C
opIndex(size_t i
) { return _str
[i
]; }
3535 auto opSlice(size_t i
, size_t j
) { return new RefRandomCU(_str
[i
.. j
]); }
3537 this(inout(C
)[] str)
3539 _str
= to
!(C
[])(str);
3548 * Inserted in place of invalid UTF sequences.
3551 * $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
3553 enum dchar replacementDchar
= '\uFFFD';
3555 /********************************************
3556 * Iterate a range of char, wchar, or dchars by code unit.
3558 * The purpose is to bypass the special case decoding that
3559 * $(REF front, std,range,primitives) does to character arrays. As a result,
3560 * using ranges with `byCodeUnit` can be `nothrow` while
3561 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
3564 * A code unit is a building block of the UTF encodings. Generally, an
3565 * individual code unit does not represent what's perceived as a full
3566 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
3567 * are encoded with multiple code units. For example, the UTF-8 code units for
3568 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
3569 * often does not form a character on its own. Attempting to treat it as
3570 * one while iterating over the resulting range will give nonsensical results.
3573 * r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
3574 * of characters (including strings) or a type that implicitly converts to a string type.
3576 * If `r` is not an auto-decodable string (i.e. a narrow string or a
3577 * user-defined type that implicitly converts to a string type), then `r`
3580 * Otherwise, `r` is converted to its corresponding string type (if it's
3581 * not already a string) and wrapped in a random-access range where the
3582 * element encoding type of the string (its code unit) is the element type
3583 * of the range, and that range is returned. The range has slicing.
3585 * If `r` is quirky enough to be a struct or class which is an input range
3586 * of characters on its own (i.e. it has the input range API as member
3587 * functions), $(I and) it's implicitly convertible to a string type, then
3588 * `r` is returned, and no implicit conversion takes place.
3590 * If `r` is wrapped in a new range, then that range has a `source`
3591 * property for returning the string that's currently contained within that
3595 * Refer to the $(MREF std, uni) docs for a reference on Unicode
3598 * For a range that iterates by grapheme cluster (written character) see
3599 * $(REF byGrapheme, std,uni).
3601 auto byCodeUnit(R
)(R r
)
3602 if ((isConvertibleToString
!R
&& !isStaticArray
!R
) ||
3603 (isInputRange
!R
&& isSomeChar
!(ElementEncodingType
!R
)))
3605 import std
.traits
: StringTypeOf
;
3606 static if (// This would be cleaner if we had a way to check whether a type
3607 // was a range without any implicit conversions.
3608 (isAutodecodableString
!R
&& !__traits(hasMember
, R
, "empty") &&
3609 !__traits(hasMember
, R
, "front") && !__traits(hasMember
, R
, "popFront")))
3611 static struct ByCodeUnitImpl
3613 @safe pure nothrow @nogc:
// Range primitive: true when `source` holds no code units.
@property bool empty() const
{
    return !source.length;
}
// Range primitive: the first code unit of `source` (no decoding is performed).
@property auto ref front() inout
{
    return source[0];
}
// Range primitive: advances by exactly one code unit.
void popFront()
{
    source = source[1 .. source.length];
}
// Forward-range primitive: wraps `source.save` in a new `ByCodeUnitImpl`.
@property auto save()
{
    auto snapshot = source.save;
    return ByCodeUnitImpl(snapshot);
}
// Bidirectional-range primitive: the last code unit of `source`.
@property auto ref back() inout
{
    return source[source.length - 1];
}
// Bidirectional-range primitive: drops exactly one code unit from the end.
void popBack()
{
    source = source[0 .. source.length - 1];
}
// Random access: the code unit at position `index` of `source`.
auto ref opIndex(size_t index) inout
{
    return source[index];
}
// Slicing: wraps `source[lower .. upper]` in a new `ByCodeUnitImpl`.
auto opSlice(size_t lower, size_t upper)
{
    auto window = source[lower .. upper];
    return ByCodeUnitImpl(window);
}
// Number of code units remaining in `source`.
@property size_t length() const
{
    return source.length;
}
3628 alias opDollar
= length
;
3630 StringTypeOf
!R source
;
3633 static assert(isRandomAccessRange
!ByCodeUnitImpl
);
3635 return ByCodeUnitImpl(r
);
3637 else static if (!isInputRange
!R ||
3638 (is(R
: const dchar[]) && !__traits(hasMember
, R
, "empty") &&
3639 !__traits(hasMember
, R
, "front") && !__traits(hasMember
, R
, "popFront")))
3641 return cast(StringTypeOf
!R
) r
;
3645 // byCodeUnit for ranges and dchar[] is a no-op
3653 import std
.range
.primitives
;
3654 import std
.traits
: isAutodecodableString
;
3656 auto r
= "Hello, World!".byCodeUnit();
3657 static assert(hasLength
!(typeof(r
)));
3658 static assert(hasSlicing
!(typeof(r
)));
3659 static assert(isRandomAccessRange
!(typeof(r
)));
3660 static assert(is(ElementType
!(typeof(r
)) == immutable char));
3662 // contrast with the range capabilities of standard strings (with or
3663 // without autodecoding enabled).
3664 auto s
= "Hello, World!";
3665 static assert(isBidirectionalRange
!(typeof(r
)));
3666 static if (isAutodecodableString
!(typeof(s
)))
3668 // with autodecoding enabled, strings are non-random-access ranges of
3670 static assert(is(ElementType
!(typeof(s
)) == dchar));
3671 static assert(!isRandomAccessRange
!(typeof(s
)));
3672 static assert(!hasSlicing
!(typeof(s
)));
3673 static assert(!hasLength
!(typeof(s
)));
3677 // without autodecoding, strings are normal arrays.
3678 static assert(is(ElementType
!(typeof(s
)) == immutable char));
3679 static assert(isRandomAccessRange
!(typeof(s
)));
3680 static assert(hasSlicing
!(typeof(s
)));
3681 static assert(hasLength
!(typeof(s
)));
3685 /// `byCodeUnit` does no Unicode decoding
3688 string noel1
= "noe\u0308l"; // noël using e + combining diaeresis
3689 assert(noel1
.byCodeUnit
[2] != 'ë');
3690 assert(noel1
.byCodeUnit
[2] == 'e');
3692 string noel2
= "no\u00EBl"; // noël using a precomposed ë character
3693 // Because string is UTF-8, the code unit at index 2 is just
3694 // the first of a sequence that encodes 'ë'
3695 assert(noel2
.byCodeUnit
[2] != 'ë');
3698 /// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
3701 import std
.algorithm
.comparison
: equal
;
3702 import std
.range
: popFrontN
;
3703 import std
.traits
: isAutodecodableString
;
3705 auto range
= byCodeUnit("hello world");
3707 assert(equal(range
.save
, "lo world"));
3708 static if (isAutodecodableString
!string
) // only enabled with autodecoding
3710 string
str = range
.source
;
3711 assert(str == "lo world");
3714 // source only exists if the range was wrapped
3716 auto range
= byCodeUnit("hello world"d
);
3717 static assert(!__traits(compiles
, range
.source
));
3721 @safe pure nothrow @nogc unittest
3725 enum testStr
= "𐁄𐂌𐃯 hello ディラン";
3726 char[testStr
.length
] s
;
3728 foreach (c
; testStr
.byCodeUnit().byCodeUnit())
3732 assert(s
== testStr
);
3735 enum testStr
= "𐁄𐂌𐃯 hello ディラン"w
;
3736 wchar[testStr
.length
] s
;
3738 foreach (c
; testStr
.byCodeUnit().byCodeUnit())
3742 assert(s
== testStr
);
3745 enum testStr
= "𐁄𐂌𐃯 hello ディラン"d
;
3746 dchar[testStr
.length
] s
;
3748 foreach (c
; testStr
.byCodeUnit().byCodeUnit())
3752 assert(s
== testStr
);
3755 auto bcu
= "hello".byCodeUnit();
3756 assert(bcu
.length
== 5);
3757 assert(bcu
[3] == 'l');
3758 assert(bcu
[2 .. 4][1] == 'l');
3761 char[5] orig
= "hello";
3762 auto bcu
= orig
[].byCodeUnit();
3764 assert(bcu
.front
== 'H');
3766 assert(bcu
[1] == 'E');
3769 auto bcu
= "hello".byCodeUnit().byCodeUnit();
3770 static assert(isForwardRange
!(typeof(bcu
)));
3771 static assert(is(typeof(bcu
) == struct) == isAutodecodableString
!string
);
3774 assert(s
.front
== 'h');
3777 auto bcu
= "hello".byCodeUnit();
3778 static assert(hasSlicing
!(typeof(bcu
)));
3779 static assert(isBidirectionalRange
!(typeof(bcu
)));
3780 static assert(is(typeof(bcu
) == struct) == isAutodecodableString
!string
);
3781 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3782 auto ret = bcu
.retro
;
3783 assert(ret.front
== 'o');
3785 assert(ret.front
== 'l');
3788 auto bcu
= "κόσμε"w
.byCodeUnit();
3789 static assert(hasSlicing
!(typeof(bcu
)));
3790 static assert(isBidirectionalRange
!(typeof(bcu
)));
3791 static assert(is(typeof(bcu
) == struct) == isAutodecodableString
!wstring
);
3792 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3793 auto ret = bcu
.retro
;
3794 assert(ret.front
== 'ε');
3796 assert(ret.front
== 'μ');
3799 static struct Stringish
3805 auto orig
= Stringish("\U0010fff8 𐁊 foo 𐂓");
3806 auto bcu
= orig
.byCodeUnit();
3807 static assert(is(typeof(bcu
) == struct));
3808 static assert(!is(typeof(bcu
) == Stringish
) == isAutodecodableString
!Stringish
);
3809 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3810 static assert(is(ElementType
!(typeof(bcu
)) == immutable char));
3811 assert(bcu
.front
== cast(char) 244);
3814 static struct WStringish
3820 auto orig
= WStringish("\U0010fff8 𐁊 foo 𐂓"w
);
3821 auto bcu
= orig
.byCodeUnit();
3822 static assert(is(typeof(bcu
) == struct));
3823 static assert(!is(typeof(bcu
) == WStringish
) == isAutodecodableString
!WStringish
);
3824 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3825 static assert(is(ElementType
!(typeof(bcu
)) == immutable wchar));
3826 assert(bcu
.front
== cast(wchar) 56319);
3829 static struct DStringish
3835 auto orig
= DStringish("\U0010fff8 𐁊 foo 𐂓"d
);
3836 auto bcu
= orig
.byCodeUnit();
3837 static assert(is(typeof(bcu
) == dstring
));
3838 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3839 static assert(is(ElementType
!(typeof(bcu
)) == immutable dchar));
3840 assert(bcu
.front
== cast(dchar) 1114104);
3843 static struct FuncStringish
3846 string
s() pure nothrow @nogc { return str; }
3850 auto orig
= FuncStringish("\U0010fff8 𐁊 foo 𐂓");
3851 auto bcu
= orig
.byCodeUnit();
3852 static if (isAutodecodableString
!FuncStringish
)
3853 static assert(is(typeof(bcu
) == struct));
3855 static assert(is(typeof(bcu
) == string
));
3856 static assert(!is(typeof(bcu
) == FuncStringish
));
3857 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3858 static assert(is(ElementType
!(typeof(bcu
)) == immutable char));
3859 assert(bcu
.front
== cast(char) 244);
3865 bool empty() pure nothrow @nogc { return data
.empty
; }
3866 char front() pure nothrow @nogc { return data
[0]; }
3867 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3870 auto orig
= Range("\U0010fff8 𐁊 foo 𐂓");
3871 auto bcu
= orig
.byCodeUnit();
3872 static assert(is(typeof(bcu
) == Range
));
3873 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3874 static assert(is(ElementType
!(typeof(bcu
)) == char));
3875 assert(bcu
.front
== cast(char) 244);
3878 static struct WRange
3881 bool empty() pure nothrow @nogc { return data
.empty
; }
3882 wchar front() pure nothrow @nogc { return data
[0]; }
3883 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3886 auto orig
= WRange("\U0010fff8 𐁊 foo 𐂓"w
);
3887 auto bcu
= orig
.byCodeUnit();
3888 static assert(is(typeof(bcu
) == WRange
));
3889 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3890 static assert(is(ElementType
!(typeof(bcu
)) == wchar));
3891 assert(bcu
.front
== 56319);
3894 static struct DRange
3897 bool empty() pure nothrow @nogc { return data
.empty
; }
3898 dchar front() pure nothrow @nogc { return data
[0]; }
3899 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3902 auto orig
= DRange("\U0010fff8 𐁊 foo 𐂓"d
);
3903 auto bcu
= orig
.byCodeUnit();
3904 static assert(is(typeof(bcu
) == DRange
));
3905 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3906 static assert(is(ElementType
!(typeof(bcu
)) == dchar));
3907 assert(bcu
.front
== 1114104);
3910 static struct RangeAndStringish
3912 bool empty() pure nothrow @nogc { return data
.empty
; }
3913 char front() pure nothrow @nogc { return data
[0]; }
3914 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3921 auto orig
= RangeAndStringish("test.d", "other");
3922 auto bcu
= orig
.byCodeUnit();
3923 static assert(is(typeof(bcu
) == RangeAndStringish
));
3924 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3925 static assert(is(ElementType
!(typeof(bcu
)) == char));
3926 assert(bcu
.front
== 't');
3929 static struct WRangeAndStringish
3931 bool empty() pure nothrow @nogc { return data
.empty
; }
3932 wchar front() pure nothrow @nogc { return data
[0]; }
3933 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3940 auto orig
= WRangeAndStringish("test.d"w
, "other"w
);
3941 auto bcu
= orig
.byCodeUnit();
3942 static assert(is(typeof(bcu
) == WRangeAndStringish
));
3943 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3944 static assert(is(ElementType
!(typeof(bcu
)) == wchar));
3945 assert(bcu
.front
== 't');
3948 static struct DRangeAndStringish
3950 bool empty() pure nothrow @nogc { return data
.empty
; }
3951 dchar front() pure nothrow @nogc { return data
[0]; }
3952 void popFront() pure nothrow @nogc { data
= data
[1 .. $]; }
3959 auto orig
= DRangeAndStringish("test.d"d
, "other"d
);
3960 auto bcu
= orig
.byCodeUnit();
3961 static assert(is(typeof(bcu
) == DRangeAndStringish
));
3962 static assert(is(typeof(bcu
) == typeof(bcu
.byCodeUnit())));
3963 static assert(is(ElementType
!(typeof(bcu
)) == dchar));
3964 assert(bcu
.front
== 't');
3967 enum Enum
: string
{ a
= "test.d" }
3970 auto bcu
= orig
.byCodeUnit();
3971 static assert(!is(typeof(bcu
) == Enum
));
3972 static if (isAutodecodableString
!Enum
)
3973 static assert(is(typeof(bcu
) == struct));
3975 static assert(is(typeof(bcu
) == string
));
3976 static assert(is(ElementType
!(typeof(bcu
)) == immutable char));
3977 assert(bcu
.front
== 't');
3980 enum WEnum
: wstring
{ a
= "test.d"w
}
3982 auto orig
= WEnum
.a
;
3983 auto bcu
= orig
.byCodeUnit();
3984 static assert(!is(typeof(bcu
) == WEnum
));
3985 static if (isAutodecodableString
!WEnum
)
3986 static assert(is(typeof(bcu
) == struct));
3988 static assert(is(typeof(bcu
) == wstring
));
3989 static assert(is(ElementType
!(typeof(bcu
)) == immutable wchar));
3990 assert(bcu
.front
== 't');
3993 enum DEnum
: dstring
{ a
= "test.d"d
}
3995 auto orig
= DEnum
.a
;
3996 auto bcu
= orig
.byCodeUnit();
3997 static assert(is(typeof(bcu
) == dstring
));
3998 static assert(is(ElementType
!(typeof(bcu
)) == immutable dchar));
3999 assert(bcu
.front
== 't');
4002 static if (autodecodeStrings
)
4004 static assert(!is(typeof(byCodeUnit("hello")) == string
));
4005 static assert(!is(typeof(byCodeUnit("hello"w
)) == wstring
));
4009 static assert(is(typeof(byCodeUnit("hello")) == string
));
4010 static assert(is(typeof(byCodeUnit("hello"w
)) == wstring
));
4012 static assert(is(typeof(byCodeUnit("hello"d
)) == dstring
));
4014 static assert(!__traits(compiles
, byCodeUnit((char[5]).init
)));
4015 static assert(!__traits(compiles
, byCodeUnit((wchar[5]).init
)));
4016 static assert(!__traits(compiles
, byCodeUnit((dchar[5]).init
)));
4018 enum SEnum
: char[5] { a
= "hello" }
4019 enum WSEnum
: wchar[5] { a
= "hello"w
}
4020 enum DSEnum
: dchar[5] { a
= "hello"d
}
4022 static assert(!__traits(compiles
, byCodeUnit(SEnum
.a
)));
4023 static assert(!__traits(compiles
, byCodeUnit(WSEnum
.a
)));
4024 static assert(!__traits(compiles
, byCodeUnit(DSEnum
.a
)));
4027 /****************************
4028 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4029 * of characters by char, wchar, or dchar.
4030 * These aliases simply forward to $(LREF byUTF) with the
4031 * corresponding C argument.
4034 * r = input range of characters, or array of characters
// Shorthand for `byUTF` instantiated with `char`.
alias byChar = byUTF!(char);
// Shorthand for `byUTF` instantiated with `wchar`.
alias byWchar = byUTF!(wchar);
// Shorthand for `byUTF` instantiated with `dchar`.
alias byDchar = byUTF!(dchar);
4044 @safe pure nothrow @nogc unittest
4049 foreach (c
; "hello".byChar
.byChar())
4051 //writefln("[%d] '%c'", i, c);
4054 assert(s
== "hello");
4057 char[5+2+3+4+3+3] s
;
4060 a
[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d
;
4061 a
[8] = 0xD800; // invalid
4062 a
[9] = cast(dchar) 0x110000; // invalid
4063 foreach (c
; a
[].byChar())
4065 //writefln("[%d] '%c'", i, c);
4068 assert(s
== "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
4071 auto r
= "hello"w
.byChar();
4074 assert(r
.front
== 'l');
4077 auto r
= "hello"d
.byChar();
4080 assert(r
.front
== 'l');
4083 auto r
= "hello"d
.byChar();
4084 assert(isForwardRange
!(typeof(r
)));
4087 assert(s
.front
== 'h');
4091 @safe pure nothrow @nogc unittest
4097 a
[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d
;
4098 a
[8] = 0xD800; // invalid
4099 a
[9] = cast(dchar) 0x110000; // invalid
4100 foreach (c
; a
[].byWchar())
4102 //writefln("[%d] '%c' x%x", i, c, c);
4105 foreach (j
, wchar c
; "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w
)
4107 //writefln("[%d] '%c' x%x", j, c, c);
4109 assert(s
== "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w
);
4113 auto r
= "hello".byWchar();
4116 assert(r
.front
== 'l');
4119 auto r
= "hello"d
.byWchar();
4122 assert(r
.front
== 'l');
4125 auto r
= "hello"d
.byWchar();
4126 assert(isForwardRange
!(typeof(r
)));
4129 assert(s
.front
== 'h');
4133 @safe pure nothrow @nogc unittest
4138 string a
= "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
4139 foreach (c
; a
.byDchar())
4143 assert(s
== "hello\u07FF\uD7FF\U00010000\U0010FFFF"d
);
4146 foreach (s
; invalidUTFstrings
!char())
4148 auto r
= s
.byDchar();
4150 assert(r
.front
== r
.front
);
4152 assert(c
== replacementDchar
);
4156 auto r
= "hello".byDchar();
4159 assert(r
.front
== 'l');
4165 wstring a
= "hello\u07FF\uD7FF\U0010FFFF"w
;
4166 foreach (c
; a
.byDchar())
4168 //writefln("[%d] '%c' x%x", i, c, c);
4171 assert(s
== "hello\u07FF\uD7FF\U0010FFFF"d
);
4174 foreach (s
; invalidUTFstrings
!wchar())
4176 auto r
= s
.byDchar();
4178 assert(r
.front
== r
.front
);
4180 assert(c
== replacementDchar
);
4186 ws
[1] = 0xDD00; // correct surrogate pair
4187 auto r
= ws
[].byDchar();
4189 assert(r
.front
== r
.front
);
4191 assert(c
== '\U00010100');
4194 auto r
= "hello"w
.byDchar();
4197 assert(r
.front
== 'l');
4203 dstring a
= "hello"d
;
4204 foreach (c
; a
.byDchar
.byDchar())
4206 //writefln("[%d] '%c' x%x", i, c, c);
4209 assert(s
== "hello"d
);
4212 auto r
= "hello".byDchar();
4213 assert(isForwardRange
!(typeof(r
)));
4216 assert(s
.front
== 'h');
4219 auto r
= "hello"w
.byDchar();
4220 assert(isForwardRange
!(typeof(r
)));
4223 assert(s
.front
== 'h');
4227 // test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar,
4228 // which needs to support ranges with and without those attributes
4230 pure @safe nothrow @nogc unittest
4232 dchar[5] s
= "hello"d
;
4233 foreach (c
; s
[].byChar()) { }
4234 foreach (c
; s
[].byWchar()) { }
4235 foreach (c
; s
[].byDchar()) { }
4238 version (StdUnittest
)
4239 private int impureVariable
;
4243 static struct ImpureThrowingSystemRange(Char
)
4245 @property bool empty() const { return true; }
4246 @property Char
front() const { return Char
.init
; }
4250 throw new Exception("only for testing nothrow");
4254 foreach (Char
; AliasSeq
!(char, wchar, dchar))
4256 ImpureThrowingSystemRange
!Char range
;
4257 foreach (c
; range
.byChar()) { }
4258 foreach (c
; range
.byWchar()) { }
4259 foreach (c
; range
.byDchar()) { }
4263 /****************************
4264 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4265 * of characters by char type `C` by encoding the elements of the range.
4267 * UTF sequences that cannot be converted to the specified encoding are either
4268 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
4269 * of the Unicode Standard 6.2 or result in a thrown UTFException.
4270 * Hence byUTF is not symmetric.
4271 * This algorithm is lazy, and does not allocate memory.
4272 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
4276 * C = `char`, `wchar`, or `dchar`
4277 * useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
4278 * UseReplacementDchar.no means throw `UTFException` for invalid UTF
4281 * `UTFException` if an invalid UTF sequence is encountered and `useReplacementDchar` is set to `UseReplacementDchar.no`
4284 * Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
4287 * A bidirectional range if `R` is a bidirectional range and not auto-decodable,
4288 * as defined by $(REF isAutodecodableString, std, traits).
4290 * A forward range if `R` is a forward range and not auto-decodable.
4292 * Or, if `R` is a range and it is auto-decodable and
4293 * `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
4294 * to $(LREF byCodeUnit).
4296 * Otherwise, an input range of characters.
4298 template byUTF(C
, UseReplacementDchar useReplacementDchar
= Yes
.useReplacementDchar
)
// When `C` carries a type qualifier (e.g. `const char`, `immutable wchar`),
// forward to the instantiation for the unqualified character type `UC`.
// `useReplacementDchar` has to be passed along explicitly: with only `UC`
// given, the recursive instantiation would fall back to the default
// `Yes.useReplacementDchar`, so a caller asking for
// `byUTF!(const char, No.useReplacementDchar)` would silently get the
// replacing variant instead of the throwing one.
static if (is(immutable C == immutable UC, UC) && !is(C == UC))
    alias byUTF = byUTF!(UC, useReplacementDchar);
4305 auto ref byUTF(R
)(R r
)
4306 if (isAutodecodableString
!R
&& isInputRange
!R
&& isSomeChar
!(ElementEncodingType
!R
))
4308 return byUTF(r
.byCodeUnit());
4311 auto ref byUTF(R
)(R r
)
4312 if (!isAutodecodableString
!R
&& isInputRange
!R
&& isSomeChar
!(ElementEncodingType
!R
))
4314 static if (is(immutable ElementEncodingType
!R
== immutable RC
, RC
) && is(RC
== C
))
4316 return r
.byCodeUnit();
4318 else static if (is(C
== dchar))
4320 static struct Result
4322 enum Empty
= uint.max
; // range is empty or just constructed
4324 this(return scope R r
)
4329 this(return scope R r
, uint buff
)
4335 static if (isBidirectionalRange
!R
)
4337 this(return scope R r
, uint frontBuff
, uint backBuff
)
4340 this.buff
= frontBuff
;
4341 this.backBuff
= backBuff
;
4345 @property bool empty()
4347 static if (isBidirectionalRange
!R
)
4348 return buff
== Empty
&& backBuff
== Empty
&& r
.empty
;
4350 return buff
== Empty
&& r
.empty
;
4353 @property dchar front() scope // 'scope' required by call to decodeFront() below
4359 static if (is(RC
== wchar))
4360 enum firstMulti
= 0xD800; // First high surrogate.
4362 enum firstMulti
= 0x80; // First non-ASCII.
4366 buff
= cast(dchar) c
;
4370 buff
= () @trusted { return decodeFront
!(useReplacementDchar
)(r
); }();
4373 return cast(dchar) buff
;
4383 static if (isForwardRange
!R
)
4385 @property auto save()
4387 static if (isBidirectionalRange
!R
)
4389 return Result(r
.save
, buff
, backBuff
);
4393 return Result(r
.save
, buff
);
4398 static if (isBidirectionalRange
!R
)
4400 @property dchar back() scope // 'scope' required by call to decodeBack() below
4402 if (backBuff
!= Empty
)
4403 return cast(dchar) backBuff
;
4406 static if (is(RC
== wchar))
4407 enum firstMulti
= 0xD800; // First high surrogate.
4409 enum firstMulti
= 0x80; // First non-ASCII.
4413 backBuff
= cast(dchar) c
;
4417 backBuff
= () @trusted { return decodeBack
!useReplacementDchar(r
); }();
4419 return cast(dchar) backBuff
;
4425 if (backBuff
== Empty
)
4434 uint buff
= Empty
; // one character lookahead buffer
4435 static if (isBidirectionalRange
!R
)
4436 uint backBuff
= Empty
;
4443 static struct Result
4445 this(return scope R r
)
4450 this(return scope R r
, ushort pos
, ushort fill
, C
[4 / C
.sizeof
] buf
)
4458 static if (isBidirectionalRange
!R
)
4460 this(return scope R r
, ushort frontPos
, ushort frontFill
,
4461 ushort backPos
, ushort backFill
, C
[4 / C
.sizeof
] buf
)
4464 this.pos
= frontPos
;
4465 this.fill
= frontFill
;
4466 this.backPos
= backPos
;
4467 this.backFill
= backFill
;
4472 @property bool empty()
4474 static if (isBidirectionalRange
!R
)
4475 return pos
== fill
&& backPos
== backFill
&& r
.empty
;
4477 return pos
== fill
&& r
.empty
;
4480 @property auto front() scope // 'scope' required by call to decodeFront() below
4487 static if (C
.sizeof
>= 2 && RC
.sizeof
>= 2)
4488 enum firstMulti
= 0xD800; // First high surrogate.
4490 enum firstMulti
= 0x80; // First non-ASCII.
4495 buf
[pos
] = cast(C
) c
;
4499 static if (is(RC
== dchar))
4505 dchar dc
= () @trusted { return decodeFront
!(useReplacementDchar
)(r
); }();
4506 fill
= cast(ushort) encode
!(useReplacementDchar
)(buf
, dc
);
4519 static if (isForwardRange
!R
)
4521 @property auto save()
4523 static if (isBidirectionalRange
!R
)
4525 return Result(r
.save
, pos
, fill
, backPos
, backFill
, buf
);
4529 return Result(r
.save
, pos
, fill
, buf
);
4534 static if (isBidirectionalRange
!R
)
4536 @property auto back() scope // 'scope' required by call to decodeBack() below
4538 if (backPos
!= backFill
)
4539 return buf
[cast(ushort) (backFill
- backPos
- 1)];
4543 static if (C
.sizeof
>= 2 && RC
.sizeof
>= 2)
4544 enum firstMulti
= 0xD800; // First high surrogate.
4546 enum firstMulti
= 0x80; // First non-ASCII.
4551 buf
[cast(ushort) (backFill
- backPos
- 1)] = cast(C
) c
;
4555 static if (is(RC
== dchar))
4561 dchar dc
= () @trusted { return decodeBack
!(useReplacementDchar
)(r
); }();
4562 backFill
= cast(ushort) encode
!(useReplacementDchar
)(buf
, dc
);
4564 return buf
[cast(ushort) (backFill
- backPos
- 1)];
4569 if (backPos
== backFill
)
4579 static if (isBidirectionalRange
!R
)
4580 ushort backPos
, backFill
;
4581 C
[4 / C
.sizeof
] buf
= void;
4590 @safe pure nothrow unittest
4592 import std
.algorithm
.comparison
: equal
;
4594 // hellö as a range of `char`s, which are UTF-8
4595 assert("hell\u00F6".byUTF
!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6]));
4597 // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
4598 assert("hell\u00F6".byUTF
!wchar().equal(['h', 'e', 'l', 'l', 'ö']));
4600 // 𐐷 is four code units in UTF-8, two in UTF-16, and one in UTF-32
4601 assert("𐐷".byUTF
!char().equal([0xF0, 0x90, 0x90, 0xB7]));
4602 assert("𐐷".byUTF
!wchar().equal([0xD801, 0xDC37]));
4603 assert("𐐷".byUTF
!dchar().equal([0x00010437]));
4609 import std
.algorithm
.comparison
: equal
;
4610 import std
.exception
: assertThrown
;
4612 assert("hello\xF0betty".byChar
.byUTF
!(dchar, UseReplacementDchar
.yes
).equal("hello\uFFFDetty"));
4613 assertThrown
!UTFException("hello\xF0betty".byChar
.byUTF
!(dchar, UseReplacementDchar
.no
).equal("hello betty"));
4619 wchar[] s
= ['a', 'b', 0x219];
4620 auto r
= s
.byUTF
!char;
4621 assert(isBidirectionalRange
!(typeof(r
)));
4622 assert(r
.back
== 0x99);
4624 assert(r
.back
== 0xc8);
4626 assert(r
.back
== 'b');
4631 wchar[] s
= ['a', 'b', 0x219];
4632 auto r
= s
.byUTF
!wchar;
4634 assert(isBidirectionalRange
!(typeof(r
)));
4635 assert(r
.back
== 0x219);
4637 assert(r
.back
== 'b');
4641 wchar[] s
= ['a', 'b', 0x219];
4642 auto r
= s
.byUTF
!dchar;
4643 assert(isBidirectionalRange
!(typeof(r
)));
4644 assert(r
.back
== 0x219);
4646 assert(r
.back
== 'b');
4650 dchar[] s
= ['𐐷', '😁'];
4651 auto r
= s
.byUTF
!wchar;
4652 assert(r
.back
== 0xde01);
4654 assert(r
.back
== 0xd83d);
4656 assert(r
.back
== 0xdc37);
4658 assert(r
.back
== 0xd801);
4662 dchar[] s
= ['𐐷', '😁'];
4663 auto r
= s
.byUTF
!char;
4670 import std
.algorithm
.comparison
: equal
;
4671 assert(res
.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0]));
4676 auto r
= ['a', 'b', 'c', 'd', 'e'].byUTF
!dchar;
4682 import std
.algorithm
.comparison
: equal
;
4683 assert(res
.equal(['e', 'd', 'c', 'b', 'a']));
4687 //testing the save() function
4688 wchar[] s
= ['Ă','ț'];
4690 auto rc
= s
.byUTF
!char;
4692 auto rcCopy
= rc
.save
;
4693 assert(rc
.back
== rcCopy
.back
);
4694 assert(rcCopy
.back
== 0xc8);
4696 auto rd
= s
.byUTF
!dchar;
4698 auto rdCopy
= rd
.save
;
4699 assert(rd
.back
== rdCopy
.back
);
4700 assert(rdCopy
.back
== 'Ă');
4705 @safe pure nothrow unittest
4707 import std
.range
.primitives
;
4708 wchar[] s
= ['ă', 'î'];
4710 auto rc
= s
.byUTF
!char;
4711 static assert(isBidirectionalRange
!(typeof(rc
)));
4712 assert(rc
.back
== 0xae);
4714 assert(rc
.back
== 0xc3);
4716 assert(rc
.back
== 0x83);
4718 assert(rc
.back
== 0xc4);
4720 auto rw
= s
.byUTF
!wchar;
4721 static assert(isBidirectionalRange
!(typeof(rw
)));
4722 assert(rw
.back
== 'î');
4724 assert(rw
.back
== 'ă');
4726 auto rd
= s
.byUTF
!dchar;
4727 static assert(isBidirectionalRange
!(typeof(rd
)));
4728 assert(rd
.back
== 'î');
4730 assert(rd
.back
== 'ă');