c++: auto in trailing-return-type in parameter [PR117778]
[gcc.git] / libphobos / src / std / utf.d
blob7db120befd5ec74fdc0310ca511831f27b304e3a
1 // Written in the D programming language.
3 /++
4 Encode and decode UTF-8, UTF-16 and UTF-32 strings.
6 UTF character support is restricted to
7 $(D '\u0000' <= character <= '\U0010FFFF').
9 $(SCRIPT inhibitQuickIndex = 1;)
10 $(DIVC quickindex,
11 $(BOOKTABLE,
12 $(TR $(TH Category) $(TH Functions))
13 $(TR $(TD Decode) $(TD
14 $(LREF decode)
15 $(LREF decodeFront)
17 $(TR $(TD Lazy decode) $(TD
18 $(LREF byCodeUnit)
19 $(LREF byChar)
20 $(LREF byWchar)
21 $(LREF byDchar)
22 $(LREF byUTF)
24 $(TR $(TD Encode) $(TD
25 $(LREF encode)
26 $(LREF toUTF8)
27 $(LREF toUTF16)
28 $(LREF toUTF32)
29 $(LREF toUTFz)
30 $(LREF toUTF16z)
32 $(TR $(TD Length) $(TD
33 $(LREF codeLength)
34 $(LREF count)
35 $(LREF stride)
36 $(LREF strideBack)
38 $(TR $(TD Index) $(TD
39 $(LREF toUCSindex)
40 $(LREF toUTFindex)
42 $(TR $(TD Validation) $(TD
43 $(LREF isValidDchar)
44 $(LREF isValidCodepoint)
45 $(LREF validate)
47 $(TR $(TD Miscellaneous) $(TD
48 $(LREF replacementDchar)
49 $(LREF UseReplacementDchar)
50 $(LREF UTFException)
53 See_Also:
54 $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
55 $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
56 $(LINK https://web.archive.org/web/20100113043530/https://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
57 Copyright: Copyright The D Language Foundation 2000 - 2012.
58 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
59 Authors: $(HTTP digitalmars.com, Walter Bright) and
60 $(HTTP jmdavisprog.com, Jonathan M Davis)
61 Source: $(PHOBOSSRC std/utf.d)
63 module std.utf;
65 import std.exception : basicExceptionCtors;
66 import core.exception : UnicodeException;
67 import std.meta : AliasSeq;
68 import std.range;
69 import std.traits : isAutodecodableString, isConvertibleToString,
70 isSomeChar, isSomeString, isStaticArray, Unqual;
71 import std.typecons : Flag, Yes, No;
74 /++
75 Exception thrown on errors in std.utf functions.
class UTFException : UnicodeException
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    /// Code units of the offending sequence; only the first `len` entries are in use.
    uint[4] sequence;
    /// Number of entries of `sequence` that are in use.
    size_t  len;

    /**
    Records the code units of the invalid sequence.

    Params:
        data = the offending code units; at most four are stored

    Returns:
        `this`, so the call can be chained onto the constructor.
     */
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...) return
    {
        assert(data.length <= 4);

        // Clamp as well, so that at most four units are ever copied.
        if (data.length < 4)
            len = data.length;
        else
            len = 4;
        sequence[0 .. len] = data[0 .. len];

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once
    // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed

    /**
    Standard exception constructors.
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, 0, file, line, next);
    }

    /// ditto
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        // The index is appended to the message and also forwarded to the base class.
        UnsignedStringBuf indexDigits = void;
        msg ~= " (at index " ~ unsignedToTempString(index, indexDigits) ~ ")";
        super(msg, index, file, line, next);
    }

    /**
    Returns:
        A `string` detailing the invalid UTF sequence.
     */
    override string toString() const
    {
        if (len == 0)
        {
            // Exception.toString() is not marked as const, although it is
            // const-compatible, so the qualifier is cast away to call it.
            auto unqualified = () @trusted { return cast(Exception) super; } ();
            return unqualified.toString();
        }

        string text = "Invalid UTF sequence:";

        foreach (unit; sequence[0 .. len])
        {
            UnsignedStringBuf digits = void;
            auto hex = unsignedToTempString!16(unit, digits);
            text ~= ' ';
            if (hex.length == 1)
                text ~= '0'; // pad to at least two hex digits
            text ~= hex;
            text ~= 'x';
        }

        if (super.msg.length > 0)
        {
            text ~= " - ";
            text ~= super.msg;
        }

        return text;
    }
}
@safe unittest
{
    import std.exception : assertThrown;

    char[4] buf;
    // Both ends of the high and low surrogate ranges, then the first value
    // past U+10FFFF: none of them can be encoded.
    foreach (invalid; [0xD800, 0xDBFF, 0xDC00, 0xDFFF, 0x110000])
        assertThrown!UTFException(encode(buf, cast(dchar) invalid));
}
168 Provide array of invalidly encoded UTF strings. Useful for testing.
170 Params:
171 Char = char, wchar, or dchar
173 Returns:
174 an array of invalidly encoded UTF strings
package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
if (isSomeChar!Char)
{
    static if (is(Char == char))
    {
        enum x = 0xDC00;         // invalid surrogate value
        enum y = 0x110000;       // out of range

        // Each entry is one malformed UTF-8 string.
        static immutable string[8] result =
        [
            "\x80",             // not a start byte
            "\xC0",             // truncated
            "\xC0\xC0",         // invalid continuation
            "\xF0\x82\x82\xAC", // overlong
            [ // three-byte encoding of the surrogate value x
              0xE0 | (x >> 12),
              0x80 | ((x >> 6) & 0x3F),
              0x80 | (x & 0x3F)
            ],
            [ // four-byte encoding of the out-of-range value y
              cast(char)(0xF0 | (y >> 18)),
              cast(char)(0x80 | ((y >> 12) & 0x3F)),
              cast(char)(0x80 | ((y >> 6) & 0x3F)),
              cast(char)(0x80 | (y & 0x3F))
            ],
            [
              cast(char)(0xF8 | 3),     // 5 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
            [
              cast(char)(0xFC | 3),     // 6 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
        ];

        return result[];
    }
    else static if (is(Char == wchar))
    {
        // Each entry is one malformed UTF-16 string.
        static immutable wstring[5] result =
        [
            [
              cast(wchar) 0xDC00,
            ],
            [
              cast(wchar) 0xDFFF,
            ],
            [
              cast(wchar) 0xDBFF,
              cast(wchar) 0xDBFF,
            ],
            [
              cast(wchar) 0xDBFF,
              cast(wchar) 0xE000,
            ],
            [
              cast(wchar) 0xD800,
            ],
        ];

        return result[];
    }
    else static if (is(Char == dchar))
    {
        // Each entry is a single invalid UTF-32 code unit.
        static immutable dstring[3] result =
        [
            [ cast(dchar) 0x110000 ],
            [ cast(dchar) 0x00D800 ],
            [ cast(dchar) 0x00DFFF ],
        ];

        // NOTE(review): unlike the other branches this returns the static
        // array by value rather than a slice of it.
        return result;
    }
    else
        static assert(0);
}
262 Check whether the given Unicode code point is valid.
264 Params:
265 c = code point to check
267 Returns:
268 `true` if and only if `c` is a valid Unicode code point
270 Note:
271 `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
272 as they are permitted for internal use by an application, but they are
273 not allowed for interchange by the Unicode standard.
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // Anything above the last Unicode code point is invalid.
    if (c > 0x10FFFF)
        return false;
    // So is the UTF-16 surrogate block U+D800 .. U+DFFF.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;
    return true;
}
@safe @nogc pure nothrow unittest
{
    // Ordinary scalar values are accepted.
    assert(isValidDchar(cast(dchar) 0x00));
    assert(isValidDchar(cast(dchar) 0x41));
    // A surrogate and a value beyond U+10FFFF are rejected.
    assert(isValidDchar(cast(dchar) 0xD800) == false);
    assert(isValidDchar(cast(dchar) 0x11FFFF) == false);
}
pure nothrow @safe @nogc unittest
{
    import std.exception;

    assertCTFEable!(
    {
        // Values that must be accepted, including the noncharacters
        // U+FFFE / U+FFFF and the last code point U+10FFFF.
        static immutable dchar[5] accepted =
        [
            cast(dchar) 'a',
            cast(dchar) 0x00FFFE,
            cast(dchar) 0x00FFFF,
            cast(dchar) 0x01FFFF,
            cast(dchar) 0x10FFFF,
        ];
        // Values that must be rejected: the surrogate boundaries and
        // values past U+10FFFF.
        static immutable dchar[6] rejected =
        [
            cast(dchar) 0x1FFFFF,
            cast(dchar) 0x00D800,
            cast(dchar) 0x00DBFF,
            cast(dchar) 0x00DC00,
            cast(dchar) 0x00DFFF,
            cast(dchar) 0x110000,
        ];

        foreach (good; accepted)
            assert(isValidDchar(good) == true);
        foreach (bad; rejected)
            assert(isValidDchar(bad) == false);
    });
}
311 Checks if a single character forms a valid code point.
313 When standing alone, some characters are invalid code points. For
314 example the `wchar` `0xD800` is a so called high surrogate, which can
315 only be interpreted together with a low surrogate following it. As a
316 standalone character it is considered invalid.
318 See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
319 Unicode Standard, D90, D91 and D92) for more details.
321 Params:
322 c = character to test
323 Char = character type of `c`
325 Returns:
326 `true`, if `c` forms a valid code point.
bool isValidCodepoint(Char)(Char c)
if (isSomeChar!Char)
{
    // Strip top-level qualifiers so const/immutable characters pick a branch too.
    alias Bare = typeof(cast() c);

    static if (is(Bare == char))
    {
        // Only the ASCII range stands alone in UTF-8.
        return c < 0x80;
    }
    else static if (is(Bare == wchar))
    {
        // Everything outside the surrogate block U+D800 .. U+DFFF.
        return c < 0xD800 || c > 0xDFFF;
    }
    else static if (is(Bare == dchar))
    {
        return isValidDchar(c);
    }
    else
        static assert(false, "unknown character type: `" ~ Char.stringof ~ "`");
}
@safe pure nothrow unittest
{
    // char: an ASCII unit stands alone, a unit with the top bit set does not
    assert(isValidCodepoint(cast(char) 0x40));
    assert(isValidCodepoint(cast(char) 0x80) == false);

    // wchar: a lone surrogate is not a code point
    assert(isValidCodepoint(cast(wchar) 0x1234));
    assert(isValidCodepoint(cast(wchar) 0xD800) == false);

    // dchar: must lie inside the Unicode range
    assert(isValidCodepoint(cast(dchar) 0x0010FFFF));
    assert(isValidCodepoint(cast(dchar) 0x12345678) == false);
}
360 Calculate the length of the UTF sequence starting at `index`
361 in `str`.
363 Params:
364 str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
365 of UTF code units. Must be random access if `index` is passed
366 index = starting index of UTF sequence (default: `0`)
368 Returns:
369 The number of code units in the UTF sequence. For UTF-8, this is a
370 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
371 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
373 Throws:
374 May throw a `UTFException` if `str[index]` is not the start of a
375 valid UTF sequence.
377 Note:
378 `stride` will only analyze the first `str[index]` element. It
379 will not fully verify the validity of the UTF sequence, nor even verify
380 the presence of the sequence: it will not actually guarantee that
381 $(D index + stride(str, index) <= str.length).
uint stride(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    // Only ranges that expose a length can be bounds-checked here.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-8 sequence");

    immutable lead = str[index];

    // ASCII is a single code unit; anything else is classified by its lead byte.
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}
397 /// Ditto
uint stride(S)(auto ref S str)
if (is(S : const char[]) ||
    (isInputRange!S && is(immutable ElementType!S == immutable char)))
{
    // `static if` does not open a scope, so `c` is visible below either way.
    static if (is(S : const char[]))
    {
        immutable c = str[0];
    }
    else
    {
        // Same precondition as the UTF-16 and UTF-32 range overloads.
        assert(!str.empty, "UTF-8 sequence is empty");
        immutable c = str.front;
    }

    // ASCII is a single code unit; anything else is classified by its lead byte.
    if (c < 0x80)
        return 1;
    else
        return strideImpl(c, 0);
}
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Checks each `stride` overload against `codeLength` for the code point
    // `c` that starts at code unit `i` of `s`.
    static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        // Reports a failure at the caller's line.
        void expect(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line);
        }

        expect(stride(s, i) == codeLength!char(c), "string");
        expect(stride(RandomCU!char(s), i) == codeLength!char(c), "range");

        // A reference-type range must not be consumed by the call.
        auto randRef = new RefRandomCU!char(s);
        immutable randLen = randRef.length;
        expect(stride(randRef, i) == codeLength!char(c), "rand ref range");
        expect(randRef.length == randLen, "rand ref range length");

        // The index-free overloads only apply at the start of the string.
        if (i != 0)
            return;

        expect(stride(s) == codeLength!char(c), "string 0");
        expect(stride(InputCU!char(s)) == codeLength!char(c), "range 0");

        auto bidirRef = new RefBidirCU!char(s);
        immutable bidirLen = bidirRef.length;
        expect(stride(bidirRef) == codeLength!char(c), "bidir ref range code length");
        expect(bidirRef.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
        enum string mixed = "hello\U00010143\u0100\U00010143";

        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'a');
        test(mixed, 'h', 0);
        test(mixed, 'e', 1);
        test(mixed, 'l', 2);
        test(mixed, 'l', 3);
        test(mixed, 'o', 4);
        test(mixed, '\U00010143', 5);
        test(mixed, '\u0100', 9);
        test(mixed, '\U00010143', 11);

        foreach (S; AliasSeq!(char[], const char[], string))
        {
            enum str = to!S("hello world");
            static assert(isSafe!({ stride(str, 0); }));
            static assert(isSafe!({ stride(str);    }));
            static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!({ stride(str);    }) & FunctionAttribute.pure_) != 0);
        }
    });
}
@safe unittest // invalid start bytes
{
    import std.exception : assertThrown;

    // None of these may start a UTF-8 sequence.
    immutable char[] badLeads = [
        0b1111_1000, // indicating a sequence length of 5
        0b1111_1100, // 6
        0b1111_1110, // 7
        0b1111_1111, // 8
        0b1000_0000, // continuation byte
    ];

    foreach (lead; badLeads)
    {
        assertThrown!UTFException(stride([lead]));
    }
}
495 /// Ditto
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // Only ranges that expose a length can be bounds-checked here.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    immutable uint unit = str[index];

    // A high surrogate (U+D800 .. U+DBFF) starts a two-unit sequence.
    immutable bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return startsPair ? 2 : 1;
}
506 /// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // Arrays delegate to the indexed overload, looking at the first code unit.
    enum size_t firstUnit = 0;
    return stride(str, firstUnit);
}
513 /// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
    !is(S : const wchar[]))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    immutable uint unit = str.front;

    // A high surrogate (U+D800 .. U+DBFF) starts a two-unit sequence.
    immutable bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return startsPair ? 2 : 1;
}
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Checks each `stride` overload against `codeLength` for the code point
    // `c` that starts at code unit `i` of `s`.
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        // Reports a failure at the caller's line.
        void expect(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line);
        }

        expect(stride(s, i) == codeLength!wchar(c), "string");
        expect(stride(RandomCU!wchar(s), i) == codeLength!wchar(c), "range");

        // A reference-type range must not be consumed by the call.
        auto randRef = new RefRandomCU!wchar(s);
        immutable randLen = randRef.length;
        expect(stride(randRef, i) == codeLength!wchar(c), "rand ref range");
        expect(randRef.length == randLen, "rand ref range length");

        // The index-free overloads only apply at the start of the string.
        if (i != 0)
            return;

        expect(stride(s) == codeLength!wchar(c), "string 0");
        expect(stride(InputCU!wchar(s)) == codeLength!wchar(c), "range 0");

        auto bidirRef = new RefBidirCU!wchar(s);
        immutable bidirLen = bidirRef.length;
        expect(stride(bidirRef) == codeLength!wchar(c), "bidir ref range code length");
        expect(bidirRef.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
        enum wstring mixed = "hello\U00010143\u0100\U00010143";

        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'a');
        test(mixed, 'h', 0);
        test(mixed, 'e', 1);
        test(mixed, 'l', 2);
        test(mixed, 'l', 3);
        test(mixed, 'o', 4);
        test(mixed, '\U00010143', 5);
        test(mixed, '\u0100', 7);
        test(mixed, '\U00010143', 8);

        foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
        {
            enum str = to!S("hello world");
            static assert(isSafe!(() => stride(str, 0)));
            static assert(isSafe!(() => stride(str)   ));
            static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
        }
    });
}
591 /// Ditto
uint stride(S)(auto ref S str, size_t index = 0)
if (is(S : const dchar[]) ||
    (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    // Bounds-check against the length when there is one; otherwise only
    // emptiness can be checked.
    enum hasUsableLength = is(typeof(str.length) : ulong);

    static if (hasUsableLength)
    {
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    }
    else
    {
        assert(!str.empty, "UTF-32 sequence is empty.");
    }

    // Every UTF-32 code unit is a whole code point.
    return 1;
}
@safe unittest
{
    // One UTF-8 code unit for ASCII, two for 'λ', four for '𐐷'.
    assert(stride("a") == 1);
    assert(stride("λ") == 2);
    assert(stride("𐐷") == 4);

    // Only the sequence starting at the given index is measured.
    assert(stride("aλ") == 1);
    assert(stride("aλ", 1) == 2);
}
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Checks each `stride` overload against `codeLength` for the code point
    // `c` that starts at code unit `i` of `s`.
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        // Reports a failure at the caller's line.
        void expect(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line);
        }

        expect(stride(s, i) == codeLength!dchar(c), "string");
        expect(stride(RandomCU!dchar(s), i) == codeLength!dchar(c), "range");

        // A reference-type range must not be consumed by the call.
        auto randRef = new RefRandomCU!dchar(s);
        immutable randLen = randRef.length;
        expect(stride(randRef, i) == codeLength!dchar(c), "rand ref range");
        expect(randRef.length == randLen, "rand ref range length");

        // The index-free overloads only apply at the start of the string.
        if (i != 0)
            return;

        expect(stride(s) == codeLength!dchar(c), "string 0");
        expect(stride(InputCU!dchar(s)) == codeLength!dchar(c), "range 0");

        auto bidirRef = new RefBidirCU!dchar(s);
        immutable bidirLen = bidirRef.length;
        expect(stride(bidirRef) == codeLength!dchar(c), "bidir ref range code length");
        expect(bidirRef.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
        enum dstring mixed = "hello\U00010143\u0100\U00010143";

        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'a');
        test(mixed, 'h', 0);
        test(mixed, 'e', 1);
        test(mixed, 'l', 2);
        test(mixed, 'l', 3);
        test(mixed, 'o', 4);
        test(mixed, '\U00010143', 5);
        test(mixed, '\u0100', 6);
        test(mixed, '\U00010143', 7);

        foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
        {
            enum str = to!S("hello world");
            static assert(isSafe!(() => stride(str, 0)));
            static assert(isSafe!(() => stride(str)   ));
            static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
        }
    });
}
/*
Computes the length of a UTF-8 sequence from its non-ASCII lead byte.

Params:
    c = first code unit of the sequence; its top bit must be set
    index = position of `c`, used only for the exception message

Returns:
    The sequence length in code units: 2, 3 or 4.

Throws:
    `UTFException` if `c` is a continuation byte or announces a sequence
    longer than four code units.
 */
private uint strideImpl(char c, size_t index) @trusted pure
in { assert(c & 0x80); }
do
{
    import core.bitop : bsr;

    // 0xFF has no zero bit, so the bsr argument below would be 0, for which
    // the result of bsr is undefined. Reject it before calling bsr.
    if (c == 0xFF)
        throw new UTFException("Invalid UTF-8 sequence", index);

    // The number of leading one bits equals 7 minus the position of the
    // highest zero bit in the byte.
    immutable msbs = 7 - bsr((~uint(c)) & 0xFF);

    // 1 leading one bit is a continuation byte; more than 4 is not valid UTF-8.
    if (msbs < 2 || msbs > 4)
        throw new UTFException("Invalid UTF-8 sequence", index);
    return msbs;
}
693 Calculate the length of the UTF sequence ending one code unit before
694 `index` in `str`.
696 Params:
697 str = bidirectional range of UTF code units. Must be random access if
698 `index` is passed
699 index = index one past end of UTF sequence (default: `str.length`)
701 Returns:
702 The number of code units in the UTF sequence. For UTF-8, this is a
703 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
704 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
706 Throws:
707 May throw a `UTFException` if `str[index]` is not one past the
708 end of a valid UTF sequence.
710 Note:
711 `strideBack` will only analyze the element at $(D str[index - 1]).
712 It will not fully verify the validity of the UTF sequence, nor
713 even verify the presence of the sequence: it will not actually
714 guarantee that $(D strideBack(str, index) <= index).
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    // A UTF-8 continuation byte has the bit pattern 10xx_xxxx.
    enum topTwoBits = 0b1100_0000;
    enum continuationTag = 0b1000_0000;

    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-8 sequence");
    assert(index > 0, "Not the end of the UTF-8 sequence");

    // The preceding unit is not a continuation byte: a one-unit sequence.
    if ((str[index-1] & topTwoBits) != continuationTag)
        return 1;

    if (index < 4)
    {
        // Close to the start: every step back needs its own bounds check.
        static foreach (back; 2 .. 4)
        {
            if (index >= back && (str[index-back] & topTwoBits) != continuationTag)
                return back;
        }
    }
    else
    {
        // Common case: at least four units precede `index`, so no
        // per-step bounds check is needed.
        static foreach (back; 2 .. 5)
        {
            if ((str[index-back] & topTwoBits) != continuationTag)
                return back;
        }
    }
    throw new UTFException("Not the end of the UTF sequence", index);
}
746 /// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char)))
{
    // Measure the sequence that ends at the very end of `str`.
    immutable onePastEnd = str.length;
    return strideBack(str, onePastEnd);
}
754 /// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S)
{
    assert(!str.empty, "Past the end of the UTF-8 sequence");

    // Walk a saved copy so the caller's range is left untouched.
    auto cursor = str.save;

    // Look back over at most four code units for one that is not a
    // continuation byte (bit pattern 10xx_xxxx).
    for (uint units = 1; units <= 4; ++units)
    {
        if ((cursor.back & 0b1100_0000) != 0b1000_0000)
            return units;
        cursor.popBack();
        if (cursor.empty)
            break;
    }
    throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
}
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Checks each `strideBack` overload against `codeLength` for the code
    // point `c` that ends just before code unit `i` of `s`
    // (`size_t.max` means "at the end of `s`").
    static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        // Reports a failure at the caller's line.
        void expect(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line);
        }

        immutable at = i == size_t.max ? s.length : i;

        expect(strideBack(s, at) == codeLength!char(c), "string");
        expect(strideBack(RandomCU!char(s), at) == codeLength!char(c), "range");

        // A reference-type range must not be consumed by the call.
        auto randRef = new RefRandomCU!char(s);
        immutable randLen = randRef.length;
        expect(strideBack(randRef, at) == codeLength!char(c), "rand ref range");
        expect(randRef.length == randLen, "rand ref range length");

        // The index-free overloads only apply at the end of the string.
        if (i != size_t.max)
            return;

        expect(strideBack(s) == codeLength!char(c), "string code length");
        expect(strideBack(BidirCU!char(s)) == codeLength!char(c), "range code length");

        auto bidirRef = new RefBidirCU!char(s);
        immutable bidirLen = bidirRef.length;
        expect(strideBack(bidirRef) == codeLength!char(c), "bidir ref range code length");
        expect(bidirRef.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
        enum string mixed = "\U00010143\u0100\U00010143hello";

        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'f');
        test(mixed, 'o', 15);
        test(mixed, 'l', 14);
        test(mixed, 'l', 13);
        test(mixed, 'e', 12);
        test(mixed, 'h', 11);
        test(mixed, '\U00010143', 10);
        test(mixed, '\u0100', 6);
        test(mixed, '\U00010143', 4);

        foreach (S; AliasSeq!(char[], const char[], string))
        {
            enum str = to!S("hello world");
            static assert(isSafe!({ strideBack(str, 0); }));
            static assert(isSafe!({ strideBack(str);    }));
            static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
        }
    });
}
839 //UTF-16 is self synchronizing: The length of strideBack can be found from
840 //the value of a single wchar
841 /// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    immutable last = str[index-1];

    // A low surrogate (U+DC00 .. U+DFFF) is the second unit of a pair.
    immutable bool endsPair = last >= 0xDC00 && last < 0xE000;
    return endsPair ? 2 : 1;
}
854 /// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Arrays are indexed directly; other ranges go through `back`.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // Low surrogates are U+DC00 .. U+DFFF. The upper bound is exclusive:
    // U+E000 is an ordinary BMP character and forms a one-unit sequence.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Checks each `strideBack` overload against `codeLength` for the code
    // point `c` that ends just before code unit `i` of `s`
    // (`size_t.max` means "at the end of `s`").
    static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        // Reports a failure at the caller's line.
        void expect(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line);
        }

        immutable at = i == size_t.max ? s.length : i;

        expect(strideBack(s, at) == codeLength!wchar(c), "string");
        expect(strideBack(RandomCU!wchar(s), at) == codeLength!wchar(c), "range");

        // A reference-type range must not be consumed by the call.
        auto randRef = new RefRandomCU!wchar(s);
        immutable randLen = randRef.length;
        expect(strideBack(randRef, at) == codeLength!wchar(c), "rand ref range");
        expect(randRef.length == randLen, "rand ref range length");

        // The index-free overloads only apply at the end of the string.
        if (i != size_t.max)
            return;

        expect(strideBack(s) == codeLength!wchar(c), "string code length");
        expect(strideBack(BidirCU!wchar(s)) == codeLength!wchar(c), "range code length");

        auto bidirRef = new RefBidirCU!wchar(s);
        immutable bidirLen = bidirRef.length;
        expect(strideBack(bidirRef) == codeLength!wchar(c), "bidir ref range code length");
        expect(bidirRef.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
        enum wstring mixed = "\U00010143\u0100\U00010143hello";

        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'f');
        test(mixed, 'o', 10);
        test(mixed, 'l', 9);
        test(mixed, 'l', 8);
        test(mixed, 'e', 7);
        test(mixed, 'h', 6);
        test(mixed, '\U00010143', 5);
        test(mixed, '\u0100', 3);
        test(mixed, '\U00010143', 2);

        foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
        {
            enum str = to!S("hello world");
            static assert(isSafe!(() => strideBack(str, 0)));
            static assert(isSafe!(() => strideBack(str)   ));
            static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
        }
    });
}
937 /// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // Only the position is validated: `index` must lie within 1 .. length.
    enum hasUsableLength = is(typeof(str.length) : ulong);

    static if (hasUsableLength)
    {
        assert(index <= str.length, "Past the end of the UTF-32 sequence");
    }
    assert(index > 0, "Not the end of the UTF-32 sequence");

    // Every UTF-32 code unit is a whole code point.
    return 1;
}
947 /// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // There has to be a last element to measure.
    assert(!str.empty, "Empty UTF-32 sequence");

    // Every UTF-32 code unit is a whole code point.
    enum uint unitsPerCodePoint = 1;
    return unitsPerCodePoint;
}
@safe unittest
{
    // One UTF-8 code unit for ASCII, two for 'λ', four for '𐐷'.
    assert(strideBack("a") == 1);
    assert(strideBack("λ") == 2);
    assert(strideBack("𐐷") == 4);

    // Only the sequence ending just before the given index is measured.
    assert(strideBack("aλ") == 2);
    assert(strideBack("aλ", 1) == 1);
}
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Checks each `strideBack` overload against `codeLength` for the code
    // point `c` that ends just before code unit `i` of `s`
    // (`size_t.max` means "at the end of `s`").
    static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        // Reports a failure at the caller's line.
        void expect(bool ok, string what)
        {
            if (!ok)
                throw new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line);
        }

        immutable at = i == size_t.max ? s.length : i;

        expect(strideBack(s, at) == codeLength!dchar(c), "string");
        expect(strideBack(RandomCU!dchar(s), at) == codeLength!dchar(c), "range");

        // A reference-type range must not be consumed by the call.
        auto randRef = new RefRandomCU!dchar(s);
        immutable randLen = randRef.length;
        expect(strideBack(randRef, at) == codeLength!dchar(c), "rand ref range");
        expect(randRef.length == randLen, "rand ref range length");

        // The index-free overloads only apply at the end of the string.
        if (i != size_t.max)
            return;

        expect(strideBack(s) == codeLength!dchar(c), "string code length");
        expect(strideBack(BidirCU!dchar(s)) == codeLength!dchar(c), "range code length");

        auto bidirRef = new RefBidirCU!dchar(s);
        immutable bidirLen = bidirRef.length;
        expect(strideBack(bidirRef) == codeLength!dchar(c), "bidir ref range code length");
        expect(bidirRef.length == bidirLen, "bidir ref range length");
    }

    assertCTFEable!(
    {
        enum dstring mixed = "\U00010143\u0100\U00010143hello";

        test("a", 'a');
        test(" ", ' ');
        test("\u2029", '\u2029'); //paraSep
        test("\u0100", '\u0100');
        test("\u0430", '\u0430');
        test("\U00010143", '\U00010143');
        test("abcdefcdef", 'f');
        test(mixed, 'o', 8);
        test(mixed, 'l', 7);
        test(mixed, 'l', 6);
        test(mixed, 'e', 5);
        test(mixed, 'h', 4);
        test(mixed, '\U00010143', 3);
        test(mixed, '\u0100', 2);
        test(mixed, '\U00010143', 1);

        foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
        {
            enum str = to!S("hello world");
            static assert(isSafe!(() => strideBack(str, 0)));
            static assert(isSafe!(() => strideBack(str)   ));
            static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
            static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
        }
    });
}
/++
    Converts a code unit index into a code point (UCS) index.

    `index` must be the position in `str` of the first code unit of a code
    point. The result is the number of code points that precede that
    position, i.e. how many code points into the string that code point is.

    Params:
        str = the string to walk
        index = index of a code unit that starts a code point

    Returns:
        The number of code points found before `index`.

    Throws:
        $(LREF UTFException) if walking `str` one code point at a time steps
        over `index` instead of landing exactly on it.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // UTF-32: the code unit index already is the code point index.
        return index;
    }
    else
    {
        size_t codePoints = 0;
        size_t pos = 0;

        // Hop from code point to code point until `index` is reached or passed.
        while (pos < index)
        {
            pos += stride(str, pos);
            ++codePoints;
        }

        // Overshooting means `index` pointed into the middle of a sequence.
        if (pos > index)
        {
            static if (is(immutable C == immutable char))
                throw new UTFException("Invalid UTF-8 sequence", index);
            else
                throw new UTFException("Invalid UTF-16 sequence", index);
        }

        return codePoints;
    }
}
// Example for toUCSindex: the same code unit index is checked against
// UTF-8, UTF-16 (`w` suffix) and UTF-32 (`d` suffix) literals.
1067 @safe unittest
// ASCII text: one code unit per code point in every encoding.
1069 assert(toUCSindex(`hello world`, 7) == 7);
1070 assert(toUCSindex(`hello world`w, 7) == 7);
1071 assert(toUCSindex(`hello world`d, 7) == 7);
// `é` takes two UTF-8 code units, so only the UTF-8 result differs from the index.
1073 assert(toUCSindex(`Ma Chérie`, 7) == 6);
1074 assert(toUCSindex(`Ma Chérie`w, 7) == 7);
1075 assert(toUCSindex(`Ma Chérie`d, 7) == 7);
// The leading characters take three UTF-8 code units each: unit 9 is code point 3.
1077 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
1078 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
1079 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
/++
    Converts a code point (UCS) index into a code unit (UTF) index.

    Params:
        str = the string to walk
        n = how many code points into `str` the wanted code point is

    Returns:
        The array index of the code unit at which code point number `n`
        starts.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // UTF-32: one code unit per code point, nothing to convert.
        return n;
    }
    else
    {
        size_t unitIndex = 0;

        // Skip `n` whole code points, adding up their code unit lengths.
        for (size_t remaining = n; remaining != 0; --remaining)
            unitIndex += stride(str, unitIndex);

        return unitIndex;
    }
}
// Example for toUTFindex: the inverse direction of the toUCSindex example,
// checked against UTF-8, UTF-16 (`w`) and UTF-32 (`d`) literals.
1107 @safe unittest
// ASCII text: code point index and code unit index coincide.
1109 assert(toUTFindex(`hello world`, 7) == 7);
1110 assert(toUTFindex(`hello world`w, 7) == 7);
1111 assert(toUTFindex(`hello world`d, 7) == 7);
// In UTF-8 the two-unit `é` shifts code point 6 to code unit 7.
1113 assert(toUTFindex(`Ma Chérie`, 6) == 7);
1114 assert(toUTFindex(`Ma Chérie`w, 7) == 7);
1115 assert(toUTFindex(`Ma Chérie`d, 7) == 7);
// Three leading characters of three UTF-8 code units each: code point 3 starts at unit 9.
1117 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9);
1118 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
1119 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
1123 /* =================== Decode ======================= */
1125 /// Whether or not to replace invalid UTF with $(LREF replacementDchar)
// The decode functions in this module take this flag as their first template
// argument, defaulting to `No.useReplacementDchar`.
1126 alias UseReplacementDchar = Flag!"useReplacementDchar";
1129 Decodes and returns the code point starting at `str[index]`. `index`
1130 is advanced to one past the decoded code point. If the code point is not
1131 well-formed, then a `UTFException` is thrown and `index` remains
1132 unchanged.
1134 decode will only work with strings and random access ranges of code units
1135 with length and slicing, whereas $(LREF decodeFront) will work with any
1136 input range of code units.
1138 Params:
1139 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
1140 str = input string or indexable Range
1141 index = starting index into `str`; advanced by the number of code units processed
1143 Returns:
1144 decoded character
1146 Throws:
1147 $(LREF UTFException) if `str[index]` is not the start of a valid UTF
1148 sequence and useReplacementDchar is `No.useReplacementDchar`
// decode overload for non-string random-access ranges of code units that
// offer slicing and length.
1150 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index)
1151 if (!isSomeString!S &&
1152 isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S))
// Precondition: `index` must address an existing code unit.
1155 assert(index < str.length, "Attempted to decode past the end of a string");
// Postcondition: the returned value passes isValidDchar.
1157 out (result)
1159 assert(isValidDchar(result));
// Fast path: a code unit below codeUnitLimit!S is returned as the code point
// and `index` moves forward by one.
1163 if (str[index] < codeUnitLimit!S)
1164 return str[index++];
1165 else
// Everything else goes to decodeImpl, which receives `index` by reference.
1166 return decodeImpl!(true, useReplacementDchar)(str, index);
1169 /// ditto
// decode overload for built-in strings (isSomeString). Marked @trusted.
1170 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1171 auto ref scope S str, ref size_t index) @trusted pure
1172 if (isSomeString!S)
// Precondition: `index` must address an existing code unit.
1175 assert(index < str.length, "Attempted to decode past the end of a string");
// Postcondition: the returned value passes isValidDchar.
1177 out (result)
1179 assert(isValidDchar(result));
// Fast path: a code unit below codeUnitLimit!S is a complete code point.
1183 if (str[index] < codeUnitLimit!S)
1184 return str[index++];
// The `is` expression recovers the code unit type C of S so that the string
// can be handed to decodeImpl as a plain const(C)[].
1185 else static if (is(immutable S == immutable C[], C))
1186 return decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index);
// Example for decode: each assert checks the decoded code point and the
// value `index` has afterwards.
1190 @safe pure unittest
1192 size_t i;
// `a` is one UTF-8 code unit, `å` is two; in UTF-16 (`w`) `å` is one unit.
1194 assert("a".decode(i) == 'a' && i == 1);
1195 i = 0;
1196 assert("å".decode(i) == 'å' && i == 2);
// Decoding may start at any code point boundary, not only at 0.
1197 i = 1;
1198 assert("aå".decode(i) == 'å' && i == 3);
1199 i = 0;
1200 assert("å"w.decode(i) == 'å' && i == 1);
1202 // ë as a multi-code point grapheme
// Only the first code point (`e`) is decoded; the combining mark stays behind.
1203 i = 0;
1204 assert("e\u0308".decode(i) == 'e' && i == 1);
1205 // ë as a single code point grapheme
1206 i = 0;
1207 assert("ë".decode(i) == 'ë' && i == 2);
1208 i = 0;
1209 assert("ë"w.decode(i) == 'ë' && i == 1);
// Regression test: when decode throws, the index must be left untouched.
1212 @safe pure unittest // https://issues.dlang.org/show_bug.cgi?id=22867
1214 import std.conv : hexString;
// Four bytes starting with 0xF7 are used as input expected to be rejected.
1215 string data = hexString!"f787a598";
1216 size_t offset = 0;
// Only the exception path is checked: `offset` must still be 0 there.
1217 try data.decode(offset);
1218 catch (UTFException ex) assert(offset == 0);
1222 `decodeFront` is a variant of $(LREF decode) which specifically decodes
1223 the first code point. Unlike $(LREF decode), `decodeFront` accepts any
1224 $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
1225 of code units (rather than just a string or random access
1226 range). It also takes the range by `ref` and pops off the elements as it
1227 decodes them. If `numCodeUnits` is passed in, it gets set to the number
1228 of code units which were in the code point which was decoded.
1230 Params:
1231 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
1232 str = input string or input range of code units
1233 numCodeUnits = set to number of code units processed
1235 Returns:
1236 decoded character
1238 Throws:
1239 $(LREF UTFException) if `str.front` is not the start of a valid UTF
1240 sequence. If an exception is thrown, then there is no guarantee as to
1241 the number of code units which were popped off, as it depends on the
1242 type of range being used and how many code units had to be popped off
1243 before the code point was determined to be invalid.
// decodeFront overload for non-string input ranges of code units.
1245 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1246 ref S str, out size_t numCodeUnits)
1247 if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
// Precondition: the range must not be empty.
1250 assert(!str.empty);
// Postcondition: the returned value passes isValidDchar.
1252 out (result)
1254 assert(isValidDchar(result));
1258 immutable fst = str.front;
// Fast path: a code unit below codeUnitLimit!S is a whole code point;
// pop it and report one code unit consumed.
1260 if (fst < codeUnitLimit!S)
1262 str.popFront();
1263 numCodeUnits = 1;
1264 return fst;
1266 else
1268 // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
1269 // done outside of decodeImpl, which is undesirable, since not all
1270 // overloads of decodeImpl need it. So, it should be moved back into
1271 // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521
1272 // has been fixed.
1273 enum canIndex = is(S : const char[]) || isRandomAccessRange!S && hasSlicing!S && hasLength!S;
// `numCodeUnits` is passed as decodeImpl's `index` argument; as an `out`
// parameter it starts at 0, so afterwards it holds the units consumed.
1274 immutable retval = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits);
1276 // The other range types were already popped by decodeImpl.
1277 static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
1278 str = str[numCodeUnits .. str.length];
1280 return retval;
1284 /// ditto
// decodeFront overload for built-in strings; the decoded prefix is sliced off `str`.
1285 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1286 ref scope S str, out size_t numCodeUnits) @trusted pure
1287 if (isSomeString!S)
// Precondition: the string must not be empty.
1290 assert(!str.empty);
// Postcondition: the returned value passes isValidDchar.
1292 out (result)
1294 assert(isValidDchar(result));
// Fast path: the first code unit is a complete code point.
1298 if (str[0] < codeUnitLimit!S)
1300 numCodeUnits = 1;
1301 immutable retval = str[0];
1302 str = str[1 .. $];
1303 return retval;
// Otherwise recover the code unit type C and let decodeImpl decode from index 0;
// `numCodeUnits` serves as its index argument and ends up as the consumed length.
1305 else static if (is(immutable S == immutable C[], C))
1307 immutable retval = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, numCodeUnits);
// `str` is only advanced after decodeImpl has returned.
1308 str = str[numCodeUnits .. $];
1309 return retval;
1313 /++ Ditto +/
// Convenience overload for callers that do not need the code unit count.
1314 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
1315 if (isInputRange!S && isSomeChar!(ElementType!S))
// The count is collected in a local and discarded.
1317 size_t numCodeUnits;
1318 return decodeFront!useReplacementDchar(str, numCodeUnits);
// Example for decodeFront: the decoded code point is removed from the string.
1322 @safe pure unittest
1324 import std.range.primitives;
1325 string str = "Hello, World!";
1327 assert(str.decodeFront == 'H' && str == "ello, World!");
// A two-unit code point is removed as a whole.
1328 str = "å";
1329 assert(str.decodeFront == 'å' && str.empty);
// With the second argument the number of consumed code units is reported.
1330 str = "å";
1331 size_t i;
1332 assert(str.decodeFront(i) == 'å' && i == 2 && str.empty);
1336 `decodeBack` is a variant of $(LREF decode) which specifically decodes
1337 the last code point. Unlike $(LREF decode), `decodeBack` accepts any
1338 bidirectional range of code units (rather than just a string or random access
1339 range). It also takes the range by `ref` and pops off the elements as it
1340 decodes them. If `numCodeUnits` is passed in, it gets set to the number
1341 of code units which were in the code point which was decoded.
1343 Params:
1344 useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
1345 str = input string or bidirectional Range
1346 numCodeUnits = gives the number of code units processed
1348 Returns:
1349 A decoded UTF character.
1351 Throws:
1352 $(LREF UTFException) if `str.back` is not the end of a valid UTF
1353 sequence. If an exception is thrown, the `str` itself remains unchanged,
1354 but there is no guarantee as to the value of `numCodeUnits` (when passed).
// decodeBack overload for built-in strings; the decoded suffix is sliced off `str`.
1356 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1357 ref S str, out size_t numCodeUnits)
1358 if (isSomeString!S)
// Precondition: the string must not be empty.
1361 assert(!str.empty);
// Postcondition: the returned value passes isValidDchar.
1363 out (result)
1365 assert(isValidDchar(result));
// Fast path: the last code unit is a complete code point.
1369 if (str[$ - 1] < codeUnitLimit!S)
1371 numCodeUnits = 1;
1372 immutable retval = str[$ - 1];
1373 str = str[0 .. $ - 1];
1374 return retval;
1376 else static if (is(immutable S == immutable C[], C))
// strideBack supplies the code unit count of the last code point; decoding
// then starts that many code units before the end.
1378 numCodeUnits = strideBack(str);
1379 immutable newLength = str.length - numCodeUnits;
1380 size_t index = newLength;
1381 immutable retval = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index);
// `str` is shortened only after decodeImpl has returned.
1382 str = str[0 .. newLength];
1383 return retval;
1387 /++ Ditto +/
// decodeBack overload for non-string bidirectional ranges of code units.
// Random-access ranges are accepted only if they also have a length.
1388 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1389 ref S str, out size_t numCodeUnits)
1390 if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S
1391 && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S))
// Precondition: the range must not be empty.
1394 assert(!str.empty);
// Postcondition: the returned value passes isValidDchar.
1396 out (result)
1398 assert(isValidDchar(result));
// Fast path: the last code unit is a complete code point.
1402 if (str.back < codeUnitLimit!S)
1404 numCodeUnits = 1;
1405 immutable retval = str.back;
1406 str.popBack();
1407 return retval;
1409 else
// strideBack supplies the code unit count of the last code point.
1411 numCodeUnits = strideBack(str);
1412 static if (isRandomAccessRange!S)
// Random access: decode in place, then drop the decoded code units.
1414 size_t index = str.length - numCodeUnits;
1415 immutable retval = decodeImpl!(true, useReplacementDchar)(str, index);
1416 str.popBackExactly(numCodeUnits);
1417 return retval;
1419 else
// No random access: copy the trailing code units of a saved copy into a
// small buffer (filled back to front so it ends up in reading order) and
// decode from that buffer.
// NOTE(review): the buffer holds 4 code units; presumably strideBack never
// reports more than that - confirm against strideBack.
1421 alias Char = typeof(cast() ElementType!S.init);
1422 Char[4] codeUnits = void;
1423 S tmp = str.save;
1424 for (size_t i = numCodeUnits; i > 0; )
1426 codeUnits[--i] = tmp.back;
1427 tmp.popBack();
1429 const Char[] codePoint = codeUnits[0 .. numCodeUnits];
1430 size_t index = 0;
1431 immutable retval = decodeImpl!(true, useReplacementDchar)(codePoint, index);
// The shortened copy replaces `str` only after decodeImpl has returned.
1432 str = tmp;
1433 return retval;
1438 /++ Ditto +/
// Convenience overload for callers that do not need the code unit count.
// Its constraint is the union of the constraints of the two overloads it forwards to.
1439 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
1440 if (isSomeString!S
1441 || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S))
1442 || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S)))
// Precondition: the input must not be empty.
1445 assert(!str.empty);
// Postcondition: the returned value passes isValidDchar.
1447 out (result)
1449 assert(isValidDchar(result));
// The count is collected in a local and discarded.
1453 size_t numCodeUnits;
1454 return decodeBack!useReplacementDchar(str, numCodeUnits);
// Example for decodeBack: the decoded code point is removed from the end of the string.
1458 @system pure unittest
1460 import std.range.primitives;
1461 string str = "Hello, World!";
1463 assert(str.decodeBack == '!' && str == "Hello, World");
// A two-unit code point is removed as a whole.
1464 str = "å";
1465 assert(str.decodeBack == 'å' && str.empty);
// With the second argument the number of consumed code units is reported.
1466 str = "å";
1467 size_t i;
1468 assert(str.decodeBack(i) == 'å' && i == 2 && str.empty);
// Exclusive upper bound of the code unit values that, in the encoding of
// the given string or range type, are guaranteed to be valid
// single-codepoint encodings. The constant has the code unit type of the
// encoding: char for UTF-8, wchar for UTF-16, dchar otherwise.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    // Qualifier-normalised code unit type, computed once for both tests.
    alias Unit = immutable ElementEncodingType!S;

    static if (is(Unit == immutable char))
        enum char codeUnitLimit = 0x80;
    else static if (is(Unit == immutable wchar))
        enum wchar codeUnitLimit = 0xD800;
    else
        enum dchar codeUnitLimit = 0xD800;
}
1485 * For strings, this function does its own bounds checking to give a
1486 * more useful error message when attempting to decode past the end of a string.
1487 * Subsequently it uses a pointer instead of an array to avoid
1488 * redundant bounds checking.
1490 * The three overloads of this operate on chars, wchars, and dchars.
1492 * Params:
1493 * canIndex = if S is indexable
1494 * useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
1495 * str = input string or Range
1496 * index = starting index into `str`; advanced by the number of code units processed
1498 * Returns:
1499 * decoded character
// UTF-8 overload of decodeImpl: decodes one multi-byte sequence. The lead
// byte is expected to be >= 0x80 here; the public decode functions return
// smaller code units themselves before calling this.
1501 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1502 auto ref S str, ref size_t index)
1503 if (
1504 is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char)))
1506 /* The following encodings are valid, except for the 5 and 6 byte
1507 * combinations:
1508 * 0xxxxxxx
1509 * 110xxxxx 10xxxxxx
1510 * 1110xxxx 10xxxxxx 10xxxxxx
1511 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1512 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1513 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1516 /* Dchar bitmask for different numbers of UTF-8 code units.
1518 alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);
// `pstr` is the input as seen from `index`: a raw pointer for arrays, a
// slice for sliceable random-access ranges, or the range itself otherwise.
1520 static if (is(S : const char[]))
1521 auto pstr = str.ptr + index; // this is what makes decodeImpl() @system code
1522 else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
1523 auto pstr = str[index .. str.length];
1524 else
1525 alias pstr = str;
1527 // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
1528 // outside of decodeImpl
1529 //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);
// Read the lead byte. With indexing, `length` is the number of code units
// available from `index` on; without it the lead byte is popped off the range.
1531 static if (canIndex)
1533 immutable length = str.length - index;
1534 ubyte fst = pstr[0];
1536 else
1538 ubyte fst = pstr.front;
1539 pstr.popFront();
// The exception helpers below exist only in the throwing configuration.
1542 static if (!useReplacementDchar)
1544 static if (canIndex)
// Builds a UTFException carrying the offending code units: the first unit
// plus the continuation bytes (10xxxxxx) that follow it, four units at most.
1546 static UTFException exception(S)(S str, string msg)
1548 uint[4] sequence = void;
1549 size_t i;
1553 sequence[i] = str[i];
1554 } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);
1556 return new UTFException(msg, i).setSequence(sequence[0 .. i]);
1560 UTFException invalidUTF()
1562 static if (canIndex)
1563 return exception(pstr[0 .. length], "Invalid UTF-8 sequence");
1564 else
1566 //We can't include the invalid sequence with input strings without
1567 //saving each of the code units along the way, and we can't do it with
1568 //forward ranges without saving the entire range. Both would incur a
1569 //cost for the decoding of every character just to provide a better
1570 //error message for the (hopefully) rare case when an invalid UTF-8
1571 //sequence is encountered, so we don't bother trying to include the
1572 //invalid sequence here, unlike with strings and sliceable ranges.
1573 return new UTFException("Invalid UTF-8 sequence");
1577 UTFException outOfBounds()
1579 static if (canIndex)
1580 return exception(pstr[0 .. length], "Attempted to decode past the end of a string");
1581 else
1582 return new UTFException("Attempted to decode past the end of a string");
// A lead byte of a multi-byte sequence has its two top bits set; anything
// else (such as a stray continuation byte) is rejected.
1586 if ((fst & 0b1100_0000) != 0b1100_0000)
1588 static if (useReplacementDchar)
1590 ++index; // always consume bad input to avoid infinite loops
1591 return replacementDchar;
1593 else
1594 throw invalidUTF(); // starter must have at least 2 first bits set
1596 ubyte tmp = void;
1597 dchar d = fst; // upper control bits are masked out later
// Every left shift moves the next length bit of the lead byte into bit 7;
// it is tested after each continuation byte ("no more bytes" below).
1598 fst <<= 1;
// Compile-time unrolled: `i` is the position of the continuation byte being
// read, so a sequence completed in iteration `i` is `i + 1` code units long.
1600 foreach (i; AliasSeq!(1, 2, 3))
// First make sure a further code unit exists at all.
1603 static if (canIndex)
1605 if (i == length)
1607 static if (useReplacementDchar)
1609 index += i;
1610 return replacementDchar;
1612 else
1613 throw outOfBounds();
1616 else
1618 if (pstr.empty)
1620 static if (useReplacementDchar)
1622 index += i;
1623 return replacementDchar;
1625 else
1626 throw outOfBounds();
1630 static if (canIndex)
1631 tmp = pstr[i];
1632 else
1634 tmp = pstr.front;
1635 pstr.popFront();
// Continuation bytes must have the form 10xxxxxx.
1638 if ((tmp & 0xC0) != 0x80)
1640 static if (useReplacementDchar)
1642 index += i + 1;
1643 return replacementDchar;
1645 else
1646 throw invalidUTF();
// Append the six payload bits of the continuation byte.
1649 d = (d << 6) | (tmp & 0x3F);
1650 fst <<= 1;
1652 if (!(fst & 0x80)) // no more bytes
1654 d &= bitMask[i]; // mask out control bits
1656 // overlong, could have been encoded with i bytes
1657 if ((d & ~bitMask[i - 1]) == 0)
1659 static if (useReplacementDchar)
1661 index += i + 1;
1662 return replacementDchar;
1664 else
1665 throw invalidUTF();
1668 // check for surrogates only needed for 3 bytes
1669 static if (i == 2)
1671 if (!isValidDchar(d))
1673 static if (useReplacementDchar)
1675 index += i + 1;
1676 return replacementDchar;
1678 else
1679 throw invalidUTF();
// A four-byte sequence can encode values above the largest dchar.
1683 static if (i == 3)
1685 if (d > dchar.max)
1687 static if (useReplacementDchar)
1688 d = replacementDchar;
1689 else
1690 throw invalidUTF();
1694 index += i + 1;
1695 return d;
// Reaching this point means the lead byte still announced more bytes after
// four code units, i.e. one of the 5 or 6 byte forms listed at the top.
1699 static if (useReplacementDchar)
1701 index += 4; // read 4 chars by now
1702 return replacementDchar;
1704 else
1705 throw invalidUTF();
1708 @safe pure @nogc nothrow
1709 unittest
1711 // Tests for the useReplacementDchar == Yes path of the UTF-8 decodeImpl
// Minimal input range over a string: only empty/front/popFront, so
// decodeImpl is instantiated with canIndex == false.
1713 static struct R
1715 @safe pure @nogc nothrow:
1716 this(string s) { this.s = s; }
1717 @property bool empty() { return idx == s.length; }
1718 @property char front() { return s[idx]; }
1719 void popFront() { ++idx; }
1720 size_t idx;
1721 string s;
// Every invalid sample must yield replacementDchar and report between one
// code unit and the whole string as consumed.
1724 foreach (s; invalidUTFstrings!char())
1726 auto r = R(s);
1727 size_t index;
1728 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index);
1729 assert(dc == replacementDchar);
1730 assert(1 <= index && index <= s.length);
// UTF-16 overload of decodeImpl: handles code units >= 0xD800 (see the
// assert below); smaller code units are returned by the callers themselves.
1734 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
1735 (auto ref S str, ref size_t index)
1736 if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar)))
// `pstr` is the input as seen from `index`: a raw pointer for arrays, a
// slice for sliceable random-access ranges, or the range itself otherwise.
1738 static if (is(S : const wchar[]))
1739 auto pstr = str.ptr + index;
1740 else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
1741 auto pstr = str[index .. str.length];
1742 else
1743 alias pstr = str;
1745 // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
1746 // outside of decodeImpl
1747 //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);
// Read the first code unit; without indexing it is popped off the range.
1749 static if (canIndex)
1751 immutable length = str.length - index;
1752 uint u = pstr[0];
1754 else
1756 uint u = pstr.front;
1757 pstr.popFront();
// Exception helper, only present in the throwing configuration. The
// offending first code unit is attached when the input can be indexed.
1760 static if (!useReplacementDchar)
1762 UTFException exception(string msg)
1764 static if (canIndex)
1765 return new UTFException(msg).setSequence(pstr[0]);
1766 else
1767 return new UTFException(msg);
1771 // The < case must be taken care of before decodeImpl is called.
1772 assert(u >= 0xD800);
// 0xD800 .. 0xDBFF: high surrogate, a low surrogate has to follow.
1774 if (u <= 0xDBFF)
1776 static if (canIndex)
1777 immutable onlyOneCodeUnit = length == 1;
1778 else
1779 immutable onlyOneCodeUnit = pstr.empty;
1781 if (onlyOneCodeUnit)
1783 static if (useReplacementDchar)
1785 ++index;
1786 return replacementDchar;
1788 else
1789 throw exception("surrogate UTF-16 high value past end of string");
1792 static if (canIndex)
1793 immutable uint u2 = pstr[1];
1794 else
1796 immutable uint u2 = pstr.front;
1797 pstr.popFront();
1800 if (u2 < 0xDC00 || u2 > 0xDFFF)
1802 static if (useReplacementDchar)
1803 u = replacementDchar;
1804 else
1805 throw exception("surrogate UTF-16 low value out of range");
1807 else
// Combine the pair. 0xD7C0 is 0xD800 - 0x40, and 0x40 << 10 is 0x10000, so
// the subtraction also adds the 0x10000 offset of the supplementary planes.
1808 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
// Counts the second code unit. NOTE(review): in the replacement configuration
// this also counts a `u2` that was not a low surrogate as consumed.
1809 ++index;
// 0xDC00 .. 0xDFFF without a preceding high surrogate.
1811 else if (u >= 0xDC00 && u <= 0xDFFF)
1813 static if (useReplacementDchar)
1814 u = replacementDchar;
1815 else
1816 throw exception("unpaired surrogate UTF-16 value");
// Counts the first (or only) code unit.
1818 ++index;
1820 // Note: u+FFFE and u+FFFF are specifically permitted by the
1821 // Unicode standard for application internal use (see isValidDchar)
1823 return cast(dchar) u;
1826 @safe pure @nogc nothrow
1827 unittest
1829 // Tests for the useReplacementDchar == Yes path of the UTF-16 decodeImpl
// Minimal input range over a wstring: only empty/front/popFront, so
// decodeImpl is instantiated with canIndex == false.
1831 static struct R
1833 @safe pure @nogc nothrow:
1834 this(wstring s) { this.s = s; }
1835 @property bool empty() { return idx == s.length; }
1836 @property wchar front() { return s[idx]; }
1837 void popFront() { ++idx; }
1838 size_t idx;
1839 wstring s;
// Every invalid sample must yield replacementDchar and report between one
// code unit and the whole string as consumed.
1842 foreach (s; invalidUTFstrings!wchar())
1844 auto r = R(s);
1845 size_t index;
1846 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index);
1847 assert(dc == replacementDchar);
1848 assert(1 <= index && index <= s.length);
/*
 * UTF-32 overload of decodeImpl.
 *
 * Reads a single code unit, validates it with isValidDchar and advances
 * `index` by one. Arrays and random-access ranges are read in place at
 * `str[index]`; any other range is read from its front, which is then popped.
 *
 * Params:
 *     canIndex = not consulted by this overload
 *     useReplacementDchar = if the value is invalid, return replacementDchar
 *                           rather than throwing
 *     str = input string or range
 *     index = position to read from (indexable input); incremented by one
 *
 * Returns:
 *     the code point, or replacementDchar for an invalid value when
 *     useReplacementDchar is set
 *
 * Throws:
 *     UTFException for an invalid value when useReplacementDchar is not set;
 *     in that case neither `index` nor the range has been advanced.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    // Decided once: read by position, or consume from the front.
    enum bool readByIndex = is(S : const dchar[]) || isRandomAccessRange!S;

    static if (is(S : const dchar[]))
        auto src = str.ptr;
    else
        alias src = str;

    static if (readByIndex)
        dchar decoded = src[index];
    else
        dchar decoded = src.front;

    if (!isValidDchar(decoded))
    {
        static if (useReplacementDchar)
            decoded = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(decoded);
    }

    ++index;

    static if (!readByIndex)
        src.popFront();

    return decoded;
}
1890 @safe pure @nogc nothrow
1891 unittest
1893 // Tests for the useReplacementDchar == Yes path of the UTF-32 decodeImpl
// Minimal input range over a dstring: only empty/front/popFront, so the
// front-consuming branch of decodeImpl is the one being exercised.
1895 static struct R
1897 @safe pure @nogc nothrow:
1898 this(dstring s) { this.s = s; }
1899 @property bool empty() { return idx == s.length; }
1900 @property dchar front() { return s[idx]; }
1901 void popFront() { ++idx; }
1902 size_t idx;
1903 dstring s;
// Every invalid sample must yield replacementDchar and report between one
// code unit and the whole string as consumed.
1906 foreach (s; invalidUTFstrings!dchar())
1908 auto r = R(s);
1909 size_t index;
1910 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index);
1911 assert(dc == replacementDchar);
1912 assert(1 <= index && index <= s.length);
// Unittest helper: checks that decode(range, index) returns `expectedChar`,
// leaves `index` at `expectedIndex` and does not change the range's length.
// Failures are reported as AssertError at the caller's `line`.
1917 version (StdUnittest) private void testDecode(R)(R range,
1918 size_t index,
1919 dchar expectedChar,
1920 size_t expectedIndex,
1921 size_t line = __LINE__)
1923 import core.exception : AssertError;
1924 import std.exception : enforce;
1925 import std.string : format;
1926 import std.traits : isNarrowString;
1928 static if (hasLength!R)
1929 immutable lenBefore = range.length;
// Only random-access ranges that are not narrow strings are checked here;
// for every other type this helper does nothing.
1931 static if (isRandomAccessRange!R && !isNarrowString!R)
1934 immutable result = decode(range, index);
1935 enforce(result == expectedChar,
1936 new AssertError(format("decode: Wrong character: %s", result), __FILE__, line));
1937 enforce(index == expectedIndex,
1938 new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
1939 static if (hasLength!R)
1941 enforce(range.length == lenBefore,
1942 new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
// Unittest helper: checks that decodeFront returns `expectedChar`, reports
// `expectedNumCodeUnits` and, for ranges with length, shortens the range by
// exactly that many code units. The range is taken by ref, so the caller
// sees it advanced. Failures are reported as AssertError at the caller's `line`.
1948 version (StdUnittest) private void testDecodeFront(R)(ref R range,
1949 dchar expectedChar,
1950 size_t expectedNumCodeUnits,
1951 size_t line = __LINE__)
1953 import core.exception : AssertError;
1954 import std.exception : enforce;
1955 import std.string : format;
1957 static if (hasLength!R)
1958 immutable lenBefore = range.length;
1960 size_t numCodeUnits;
1961 immutable result = decodeFront(range, numCodeUnits);
1962 enforce(result == expectedChar,
1963 new AssertError(format("decodeFront: Wrong character: %s", result), __FILE__, line));
1964 enforce(numCodeUnits == expectedNumCodeUnits,
1965 new AssertError(format("decodeFront: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line));
1967 static if (hasLength!R)
1969 enforce(range.length == lenBefore - numCodeUnits,
1970 new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line));
// Unittest helper: the decodeBack counterpart of testDecodeFront. Checks the
// returned character, the reported code unit count and, for ranges with
// length, that the range shrank by exactly that count. The range is taken by
// ref, so the caller sees it shortened.
1974 version (StdUnittest) private void testDecodeBack(R)(ref R range,
1975 dchar expectedChar,
1976 size_t expectedNumCodeUnits,
1977 size_t line = __LINE__)
1979 // This condition is to allow unit testing all `decode` functions together
1980 static if (!isBidirectionalRange!R)
1981 return;
1982 else
1984 import core.exception : AssertError;
1985 import std.exception : enforce;
1986 import std.string : format;
1988 static if (hasLength!R)
1989 immutable lenBefore = range.length;
1991 size_t numCodeUnits;
1992 immutable result = decodeBack(range, numCodeUnits);
1993 enforce(result == expectedChar,
1994 new AssertError(format("decodeBack: Wrong character: %s", result), __FILE__, line));
1995 enforce(numCodeUnits == expectedNumCodeUnits,
1996 new AssertError(format("decodeBack: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line));
1998 static if (hasLength!R)
2000 enforce(range.length == lenBefore - numCodeUnits,
2001 new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line));
// Unittest helper for inputs holding exactly one code point: runs
// testDecode (from index 0), testDecodeBack and testDecodeFront with the
// same expectations; `expectedIndex` doubles as the expected code unit count.
2006 version (StdUnittest) private void testAllDecode(R)(R range,
2007 dchar expectedChar,
2008 size_t expectedIndex,
2009 size_t line = __LINE__)
2011 testDecode(range, 0, expectedChar, expectedIndex, line);
2012 static if (isBidirectionalRange!R)
// decodeBack runs on a saved copy rather than on `range`, which is still
// needed for the decodeFront check below.
2014 auto rangeCopy = range.save;
2015 testDecodeBack(rangeCopy, expectedChar, expectedIndex, line);
2017 testDecodeFront(range, expectedChar, expectedIndex, line);
/*
 * Unittest helper: checks that decoding `range` at `index` is rejected.
 *
 * For random-access ranges, decode must throw UTFException while leaving
 * `index` and (for ranges with length) the range's length unchanged.
 * When `index` is 0, decodeFront must throw UTFException as well.
 *
 * Params:
 *     range = code units expected to be invalid at `index`
 *     index = position at which decoding is attempted
 *     line = line reported in failure messages (defaults to the call site)
 */
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a %s for its argument: without one,
            // format() itself throws (orphan format argument) and the
            // intended AssertError is never produced.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
/*
 * Unittest helper: checks that decodeBack rejects `range`.
 *
 * Only random-access ranges are exercised: decodeBack must throw
 * UTFException and, for ranges with length, leave the length unchanged.
 * For ranges that are not bidirectional the helper does nothing.
 *
 * Params:
 *     range = code units whose last code point is expected to be invalid
 *     line = line reported in failure messages (defaults to the call site)
 */
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The format string needs a %s for its argument: without
                // one, format() itself throws (orphan format argument) and
                // the intended AssertError is never produced.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
// UTF-8 decoding tests, run at run time and in CTFE (assertCTFEable) over a
// string and several code unit range wrappers.
2073 @system unittest
2075 import std.conv : to;
2076 import std.exception;
2078 assertCTFEable!(
// `S` is a factory: it turns a string literal into the input type under test.
2080 foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
2081 (string s) => new RefBidirCU!char(s),
2082 (string s) => new RefRandomCU!char(s)))
// NOTE(review): sHasLength is not referenced anywhere else in this test.
2084 enum sHasLength = hasLength!(typeof(S("abcd")));
// Forward decoding of single-unit code points.
2087 auto range = S("abcd");
2088 testDecode(range, 0, 'a', 1);
2089 testDecode(range, 1, 'b', 2);
2090 testDecodeFront(range, 'a', 1);
2091 testDecodeFront(range, 'b', 1);
2092 assert(decodeFront(range) == 'c');
2093 assert(decodeFront(range) == 'd');
// Forward decoding of three-unit code points.
2097 auto range = S("ウェブサイト");
2098 testDecode(range, 0, 'ウ', 3);
2099 testDecode(range, 3, 'ェ', 6);
2100 testDecodeFront(range, 'ウ', 3);
2101 testDecodeFront(range, 'ェ', 3);
2102 assert(decodeFront(range) == 'ブ');
2103 assert(decodeFront(range) == 'サ');
// Backward decoding of single-unit code points.
2107 auto range = S("abcd");
2108 testDecodeBack(range, 'd', 1);
2109 testDecodeBack(range, 'c', 1);
2110 testDecodeBack(range, 'b', 1);
2111 testDecodeBack(range, 'a', 1);
// Backward decoding of three-unit code points.
2115 auto range = S("ウェブサイト");
2116 testDecodeBack(range, 'ト', 3);
2117 testDecodeBack(range, 'イ', 3);
2118 testDecodeBack(range, 'サ', 3);
2119 testDecodeBack(range, 'ブ', 3);
// Two- and three-unit sequences given as raw bytes.
2122 testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
2123 testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);
// Sequences that must be rejected: one truncated sequence, then lead bytes
// of increasing announced length followed by \x80 padding and \x8A.
2125 foreach (str; ["\xE2\x89", // too short
2126 "\xC0\x8A",
2127 "\xE0\x80\x8A",
2128 "\xF0\x80\x80\x8A",
2129 "\xF8\x80\x80\x80\x8A",
2130 "\xFC\x80\x80\x80\x80\x8A"])
2132 testBadDecode(S(str), 0);
2133 testBadDecode(S(str), 1);
2134 testBadDecodeBack(S(str));
2137 // The encodings of U+FFFE and U+FFFF are expected to decode successfully.
2138 testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
2139 testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);
2141 // UTF-8 encodings of surrogate code points (U+D800 .. U+DFFF) must be rejected.
2142 foreach (str; ["\xED\xA0\x80",
2143 "\xED\xAD\xBF",
2144 "\xED\xAE\x80",
2145 "\xED\xAF\xBF",
2146 "\xED\xB0\x80",
2147 "\xED\xBE\x80",
2148 "\xED\xBF\xBF"])
2150 testBadDecode(S(str), 0);
2151 testBadDecodeBack(S(str));
// UTF-16 decoding tests, run at run time and in CTFE (assertCTFEable) over a
// wstring and several code unit range wrappers.
2157 @system unittest
2159 import std.exception;
2160 assertCTFEable!(
// `S` is a factory: it turns a wstring into the input type under test.
2162 foreach (S; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar,
2163 (wstring s) => new RefBidirCU!wchar(s),
2164 (wstring s) => new RefRandomCU!wchar(s)))
// Single code units and surrogate pairs, including the lowest (U+10000) and
// highest (U+10FFFF) pair values.
2166 testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
2167 testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
2168 testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
2169 testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
2170 testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);
// A high surrogate at the end of input, or followed by a non-surrogate, must be rejected.
2172 testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
2173 testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);
2175 testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
2176 testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));
// Forward decoding: every character of this text is one UTF-16 code unit.
2179 auto range = S("ウェブサイト");
2180 testDecode(range, 0, 'ウ', 1);
2181 testDecode(range, 1, 'ェ', 2);
2182 testDecodeFront(range, 'ウ', 1);
2183 testDecodeFront(range, 'ェ', 1);
2184 assert(decodeFront(range) == 'ブ');
2185 assert(decodeFront(range) == 'サ');
// Backward decoding of the same text.
2189 auto range = S("ウェブサイト");
2190 testDecodeBack(range, 'ト', 1);
2191 testDecodeBack(range, 'イ', 1);
2192 testDecodeBack(range, 'サ', 1);
2193 testDecodeBack(range, 'ブ', 1);
// Mixed input (pair, single unit, pair), decoded at explicit indices and
// then from the back.
2197 foreach (S; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
2199 auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
2200 cast(wchar) 0x1400,
2201 cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
2202 testDecode(str, 0, cast(dchar) 0x10000, 2);
2203 testDecode(str, 2, cast(dchar) 0x1400, 3);
2204 testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
2205 testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
2206 testDecodeBack(str, cast(dchar) 0x1400, 1);
2207 testDecodeBack(str, cast(dchar) 0x10000, 2);
// UTF-32 decoding tests, run at run time and in CTFE (assertCTFEable) over a
// dstring and several code unit range wrappers.
2212 @system unittest
2214 import std.exception;
2215 assertCTFEable!(
// `S` is a factory: it turns a dstring into the input type under test.
2217 foreach (S; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar,
2218 (dstring s) => new RefBidirCU!dchar(s),
2219 (dstring s) => new RefRandomCU!dchar(s)))
// Valid values, including U+10FFFF, U+FFFE and U+FFFF.
2221 testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
2222 testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
2223 testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
2224 testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
2225 testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);
// Surrogate values and a value above U+10FFFF must be rejected.
2227 testBadDecode(S([cast(dchar) 0xD800]), 0);
2228 testBadDecode(S([cast(dchar) 0xDFFE]), 0);
2229 testBadDecode(S([cast(dchar) 0x110000]), 0);
2231 testBadDecodeBack(S([cast(dchar) 0xD800]));
2232 testBadDecodeBack(S([cast(dchar) 0xDFFE]));
2233 testBadDecodeBack(S([cast(dchar) 0x110000]));
// Forward decoding: one code unit per character.
2236 auto range = S("ウェブサイト");
2237 testDecode(range, 0, 'ウ', 1);
2238 testDecode(range, 1, 'ェ', 2);
2239 testDecodeFront(range, 'ウ', 1);
2240 testDecodeFront(range, 'ェ', 1);
2241 assert(decodeFront(range) == 'ブ');
2242 assert(decodeFront(range) == 'サ');
// Backward decoding of the same text.
2246 auto range = S("ウェブサイト");
2247 testDecodeBack(range, 'ト', 1);
2248 testDecodeBack(range, 'イ', 1);
2249 testDecodeBack(range, 'サ', 1);
2250 testDecodeBack(range, 'ブ', 1);
// Decoding at explicit indices, then from the back.
2254 foreach (S; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
2256 auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
2257 testDecode(str, 0, 0x10000, 1);
2258 testDecode(str, 1, 0x1400, 2);
2259 testDecode(str, 2, 0xB9DDE, 3);
2260 testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
2261 testDecodeBack(str, cast(dchar) 0x1400, 1);
2262 testDecodeBack(str, cast(dchar) 0x10000, 1);
// Compile-time attribute checks for the decoding functions.
// For each of the nine mutable/const/immutable char, wchar and dchar array
// types it statically asserts that decode and decodeFront are @safe, and that
// decode, decodeFront and decodeBack are pure.
2267 @safe unittest
2269 import std.exception;
2270 import std.traits : FunctionAttribute, functionAttributes, isSafe;
2271 assertCTFEable!(
2273 foreach (S; AliasSeq!( char[], const( char)[], string,
2274 wchar[], const(wchar)[], wstring,
2275 dchar[], const(dchar)[], dstring))
// @safe checks.
2277 static assert(isSafe!({ S str; size_t i = 0; decode(str, i); }));
2278 static assert(isSafe!({ S str; size_t i = 0; decodeFront(str, i); }));
2279 static assert(isSafe!({ S str; decodeFront(str); }));
// pure checks.
2280 static assert((functionAttributes!({ S str; size_t i = 0; decode(str, i); }) & FunctionAttribute.pure_) != 0);
2281 static assert((functionAttributes!({
2282 S str; size_t i = 0; decodeFront(str, i);
2283 }) & FunctionAttribute.pure_) != 0);
2284 static assert((functionAttributes!({ S str; decodeFront(str); }) & FunctionAttribute.pure_) != 0);
2285 static assert((functionAttributes!({
2286 S str; size_t i = 0; decodeBack(str, i);
2287 }) & FunctionAttribute.pure_) != 0);
2288 static assert((functionAttributes!({ S str; decodeBack(str); }) & FunctionAttribute.pure_) != 0);
// Decoding a 4-byte UTF-8 sequence whose payload bits are all set must throw.
// The bytes F7 BF BF BF carry the value 0x1FFFFF, which is above 0x10FFFF.
2293 @safe unittest
2295 import std.exception;
2296 char[4] val;
2297 val[0] = 0b1111_0111;
2298 val[1] = 0b1011_1111;
2299 val[2] = 0b1011_1111;
2300 val[3] = 0b1011_1111;
2301 size_t i = 0;
2302 assertThrown!UTFException((){ dchar ch = decode(val[], i); }());
2304 /* =================== Encode ======================= */
// Common failure path of the `encode` overloads.
//
// With `No.useReplacementDchar` a `UTFException` carrying `msg` and the
// offending value `c` is thrown; with `Yes.useReplacementDchar` the function
// returns `replacementDchar` so the caller can encode that instead.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
    {
        throw new UTFException(msg).setSequence(c);
    }
    else
    {
        return replacementDchar;
    }
}
/++
    Encodes `c` into the static array, `buf`, and returns the actual
    length of the encoded character (a number between `1` and `4` for
    `char[4]` buffers and a number between `1` and `2` for
    `wchar[2]` buffers).

    Params:
        useReplacementDchar = with `Yes.useReplacementDchar` an invalid `c`
            is encoded as `replacementDchar` instead of throwing
        buf = receives the code units; being an `out` parameter it is reset
            before anything is written
        c = the code point to encode

    Returns:
        The number of code units written to the front of `buf`.

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    // Reject invalid input before choosing between the 3- and 4-unit forms.
    // _utfException either throws or hands back replacementDchar, which is
    // then encoded by the ordinary 3-unit path below - no jump is needed.
    if (0xD800 <= c && c <= 0xDFFF)
    {
        c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);
    }
    else if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }
    assert(isValidDchar(c));

    if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        return 3;
    }

    buf[0] = cast(char)(0xF0 | (c >> 18));
    buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
    buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[3] = cast(char)(0x80 | (c & 0x3F));
    return 4;
}
// encode into a char[4] buffer: returned length and bytes for 1-, 2- and
// 3-unit code points, UTFException for 0x110000, and - with
// Yes.useReplacementDchar - a buffer that decodes to replacementDchar.
2367 @safe unittest
2369 import std.exception : assertThrown;
2370 import std.typecons : Yes;
2372 char[4] buf;
2374 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
2375 assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
2376 assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
2377 assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
2378 assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
2379 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2381 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2382 auto slice = buf[];
2383 assert(slice.decodeFront == replacementDchar);
// encode into a wchar[2] buffer: one unit for BMP code points on either side
// of the surrogate block, two units above 0xFFFF, UTFException for 0xD800,
// and replacementDchar output with Yes.useReplacementDchar.
2387 @safe unittest
2389 import std.exception : assertThrown;
2390 import std.typecons : Yes;
2392 wchar[2] buf;
2394 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
2395 assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
2396 assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
2397 assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
2398 assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");
2399 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2401 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2402 auto slice = buf[];
2403 assert(slice.decodeFront == replacementDchar);
// encode into a dchar[1] buffer: valid code points are stored unchanged with
// a returned length of 1, 0xD800 throws UTFException, and 0x110000 is stored
// as replacementDchar with Yes.useReplacementDchar.
2407 @safe unittest
2409 import std.exception : assertThrown;
2410 import std.typecons : Yes;
2412 dchar[1] buf;
2414 assert(encode(buf, '\u0000') == 1 && buf[0] == '\u0000');
2415 assert(encode(buf, '\uD7FF') == 1 && buf[0] == '\uD7FF');
2416 assert(encode(buf, '\uE000') == 1 && buf[0] == '\uE000');
2417 assert(encode(buf, '\U0010FFFF') == 1 && buf[0] == '\U0010FFFF');
2418 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2420 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2421 assert(buf[0] == replacementDchar);
// Boundary test for encode into char[4], wrapped in assertCTFEable.
// Covers the first and last code point of each encoded length, both ends of
// the surrogate block plus 0x110000 (all must throw), and the replacement
// path, whose output must equal the UTF-8 bytes of U+FFFD.
2424 @safe unittest
2426 import std.exception;
2427 assertCTFEable!(
2429 char[4] buf;
2431 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
2432 assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
2433 assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
2434 assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF");
2435 assert(encode(buf, '\u0800') == 3 && buf[0 .. 3] == "\u0800");
2436 assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF");
2437 assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
2438 assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
2439 assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF");
2440 assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000");
2441 assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF");
2443 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2444 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2445 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2446 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2447 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2449 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2450 enum replacementDcharString = "\uFFFD";
2451 assert(buf[0 .. replacementDcharString.length] == replacementDcharString);
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    // Validate first: _utfException either throws or returns
    // replacementDchar, a BMP value that the one-unit path below stores.
    if (0xD800 <= c && c <= 0xDFFF)
    {
        c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);
    }
    else if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    assert(isValidDchar(c));

    if (c <= 0xFFFF)
    {
        buf[0] = cast(wchar) c;
        return 1;
    }

    // Supplementary plane: split the 20-bit offset into a surrogate pair.
    buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
    buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
    return 2;
}
// Boundary test for encode into wchar[2], wrapped in assertCTFEable.
// Covers BMP values around the surrogate block, 0xFFFE/0xFFFF, the first and
// last supplementary code points, the throwing inputs, and the replacement
// path.
2482 @safe unittest
2484 import std.exception;
2485 assertCTFEable!(
2487 wchar[2] buf;
2489 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
2490 assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
2491 assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
2492 assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE);
2493 assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF);
2494 assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
2495 assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");
2497 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2498 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2499 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2500 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2501 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2503 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2504 assert(buf.front == replacementDchar);
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    // UTF-32 stores the code point as-is, so only validity needs checking.
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable bool aboveRange = 0x10FFFF < c;

    if (isSurrogate || aboveRange)
    {
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    }
    else
    {
        assert(isValidDchar(c));
    }

    buf[0] = c;
    return 1;
}
// Boundary test for encode into dchar[1], wrapped in assertCTFEable.
// Valid code points must be stored unchanged; surrogates and 0x110000 must
// throw; the replacement path must store replacementDchar.
2521 @safe unittest
2523 import std.exception;
2524 assertCTFEable!(
2526 dchar[1] buf;
2528 encode(buf, '\u0000'); assert(buf[0] == '\u0000');
2529 encode(buf, '\uD7FF'); assert(buf[0] == '\uD7FF');
2530 encode(buf, '\uE000'); assert(buf[0] == '\uE000');
2531 encode(buf, 0xFFFE); assert(buf[0] == 0xFFFE);
2532 encode(buf, 0xFFFF); assert(buf[0] == 0xFFFF);
2533 encode(buf, '\U0010FFFF'); assert(buf[0] == '\U0010FFFF');
2535 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2536 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2537 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2538 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2539 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2541 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2542 assert(buf.front == replacementDchar);
/++
    Encodes `c` in `str`'s encoding and appends it to `str`.

    Params:
        useReplacementDchar = with `Yes.useReplacementDchar` an invalid `c`
            is appended as `replacementDchar` instead of throwing
        str = the array to append to; left untouched when an exception is
            thrown
        c = the code point to encode

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope char[] str, dchar c) @safe pure
{
    // ASCII needs no staging buffer.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    // Everything else goes through the fixed-buffer overload, so validation,
    // error messages and the replacement path exist in exactly one place.
    char[4] buf;
    immutable len = encode!useReplacementDchar(buf, c);
    str ~= buf[0 .. len];
}
// Appending encode on char[]: an ASCII character adds one code unit and
// 'ø' adds two.
2605 @safe unittest
2607 char[] s = "abcd".dup;
2608 dchar d1 = 'a';
2609 dchar d2 = 'ø';
2611 encode(s, d1);
2612 assert(s.length == 5);
2613 assert(s == "abcda");
2614 encode(s, d2);
2615 assert(s.length == 7);
2616 assert(s == "abcdaø");
// Appending encode on char[], wrapped in assertCTFEable: appends a 1-, a 2-
// and a 3-unit code point and checks the resulting length and raw bytes
// after each step.
2619 @safe unittest
2621 import std.exception;
2623 assertCTFEable!(
2625 char[] s = "abcd".dup;
2626 encode(s, cast(dchar)'a');
2627 assert(s.length == 5);
2628 assert(s == "abcda");
2630 encode(s, cast(dchar)'\u00A9');
2631 assert(s.length == 7);
2632 assert(s == "abcda\xC2\xA9");
2633 //assert(s == "abcda\u00A9"); // BUG: fix compiler
2635 encode(s, cast(dchar)'\u2260');
2636 assert(s.length == 10);
2637 assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
// Boundary test for appending encode on char[], wrapped in assertCTFEable.
// Each call appends to the same buffer, so the slice start in every assert is
// the running total of code units appended so far. Invalid inputs must throw,
// and the replacement path must append the UTF-8 bytes of U+FFFD.
2641 @safe unittest
2643 import std.exception;
2644 assertCTFEable!(
2646 char[] buf;
2648 encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
2649 encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
2650 encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
2651 encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
2652 encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
2653 encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
2654 encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
2655 encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
2656 encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
2657 encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
2658 encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");
2660 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2661 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2662 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2663 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2664 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
// The tail must not already hold U+FFFD before the replacement call.
2666 enum replacementDcharString = "\uFFFD";
2667 enum rdcslen = replacementDcharString.length;
2668 assert(buf[$ - rdcslen .. $] != replacementDcharString);
2669 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2670 assert(buf[$ - rdcslen .. $] == replacementDcharString);
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope wchar[] str, dchar c) @safe pure
{
    // Stage the one or two code units in a fixed buffer via the wchar[2]
    // overload, so validation, error messages and the replacement path exist
    // in exactly one place. Nothing is appended if that call throws.
    wchar[2] buf;
    immutable len = encode!useReplacementDchar(buf, c);
    str ~= buf[0 .. len];
}
// Boundary test for appending encode on wchar[], wrapped in assertCTFEable.
// BMP values append one unit each and supplementary code points append two;
// invalid inputs must throw, and the replacement path must append
// replacementDchar.
2704 @safe unittest
2706 import std.exception;
2707 assertCTFEable!(
2709 wchar[] buf;
2711 encode(buf, '\u0000'); assert(buf[0] == '\u0000');
2712 encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
2713 encode(buf, '\uE000'); assert(buf[2] == '\uE000');
2714 encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
2715 encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
2716 encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
2717 encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");
2719 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2720 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2721 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2722 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2723 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2725 assert(buf.back != replacementDchar);
2726 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2727 assert(buf.back == replacementDchar);
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope dchar[] str, dchar c) @safe pure
{
    // UTF-32 appends the code point as-is, so only validity needs checking.
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable bool aboveRange = 0x10FFFF < c;

    if (isSurrogate || aboveRange)
    {
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    }
    else
    {
        assert(isValidDchar(c));
    }

    str ~= c;
}
// Boundary test for appending encode on dchar[], wrapped in assertCTFEable.
// Every valid code point appends exactly one element; invalid inputs must
// throw, and the replacement path must append replacementDchar.
2742 @safe unittest
2744 import std.exception;
2745 assertCTFEable!(
2747 dchar[] buf;
2749 encode(buf, '\u0000'); assert(buf[0] == '\u0000');
2750 encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
2751 encode(buf, '\uE000'); assert(buf[2] == '\uE000');
2752 encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
2753 encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
2754 encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');
2756 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2757 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2758 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2759 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2760 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2762 assert(buf.back != replacementDchar);
2763 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2764 assert(buf.back == replacementDchar);
/++
    Returns the number of code units that are required to encode the code point
    `c` when `C` is the character type used to encode it.

    Params:
        C = the character type whose encoding is being measured
        c = the code point to measure

    Returns:
        `1` for 4-byte `C`; `1` or `2` for 2-byte `C`; `1` to `4` for 1-byte
        `C`. For 1-byte `C`, a `c` above `0x10FFFF` hits `assert(false)`.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // Anything above the BMP needs a surrogate pair.
        return c > 0xFFFF ? 2 : 1;
    }
    else
    {
        static assert(C.sizeof == 1);
        if (c > 0x10FFFF)
            assert(false);
        if (c > 0xFFFF) return 4;
        if (c > 0x7FF) return 3;
        if (c > 0x7F) return 2;
        return 1;
    }
}
// codeLength for a single code point: 'a' needs one code unit in every
// encoding, while U+10FFFF needs 4 (char), 2 (wchar) and 1 (dchar).
2796 @safe pure nothrow @nogc unittest
2798 assert(codeLength!char('a') == 1);
2799 assert(codeLength!wchar('a') == 1);
2800 assert(codeLength!dchar('a') == 1);
2802 assert(codeLength!char('\U0010FFFF') == 4);
2803 assert(codeLength!wchar('\U0010FFFF') == 2);
2804 assert(codeLength!dchar('\U0010FFFF') == 1);
/++
    Returns the number of code units that are required to encode `input`
    in a string whose character type is `C`. This is particularly useful
    when slicing one string with the length of another and the two string
    types use different character types.

    Params:
        C = the character type to get the encoding length for
        input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isSomeFiniteCharInputRange!InputRange)
{
    alias EncType = typeof(cast() ElementEncodingType!InputRange.init);

    // A string already encoded as `C` can simply report its own length.
    enum bool alreadyEncoded =
        isSomeString!InputRange && is(EncType == C) && is(typeof(input.length));

    static if (alreadyEncoded)
    {
        return input.length;
    }
    else
    {
        // Otherwise add up the per-code-point lengths.
        size_t units = 0;
        foreach (codePoint; input.byDchar)
        {
            units += codeLength!C(codePoint);
        }
        return units;
    }
}
// codeLength on whole strings: the result must equal the length of the same
// text written as a char, wchar or dchar literal. The last assert uses
// codeLength!char of a wstring needle as a slice index into a UTF-8 haystack.
2839 @safe unittest
2841 assert(codeLength!char("hello world") ==
2842 "hello world".length);
2843 assert(codeLength!wchar("hello world") ==
2844 "hello world"w.length);
2845 assert(codeLength!dchar("hello world") ==
2846 "hello world"d.length);
2848 assert(codeLength!char(`プログラミング`) ==
2849 `プログラミング`.length);
2850 assert(codeLength!wchar(`プログラミング`) ==
2851 `プログラミング`w.length);
2852 assert(codeLength!dchar(`プログラミング`) ==
2853 `プログラミング`d.length);
2855 string haystack = `Être sans la verité, ça, ce ne serait pas bien.`;
2856 wstring needle = `Être sans la verité`;
2857 assert(haystack[codeLength!char(needle) .. $] ==
2858 `, ça, ce ne serait pas bien.`);
// codeLength over every source string type and target character type,
// wrapped in assertCTFEable. Each result is compared with the length of the
// same text converted by std.conv.to; the last assert passes a filtered
// range, i.e. a non-array input.
2861 @safe unittest
2863 import std.algorithm.iteration : filter;
2864 import std.conv : to;
2865 import std.exception;
2867 assertCTFEable!(
2869 foreach (S; AliasSeq!( char[], const char[], string,
2870 wchar[], const wchar[], wstring,
2871 dchar[], const dchar[], dstring))
2873 foreach (C; AliasSeq!(char, wchar, dchar))
2875 assert(codeLength!C(to!S("Walter Bright")) == to!(C[])("Walter Bright").length);
2876 assert(codeLength!C(to!S(`言語`)) == to!(C[])(`言語`).length);
2877 assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`)) ==
2878 to!(C[])(`ウェブサイト@La_Verité.com`).length);
2879 assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)()) ==
2880 to!(C[])(`ウェブサイト@La_Verité.com`).length);
/+
    Internal helper function:

    Returns true if it is safe to search for the code point `c` inside
    code units, without decoding.

    This is a runtime check that is used as an optimization in various
    functions, particularly in `std.string`.

    The check passes when `c` fits in a single code unit of type `C`:
    up to 0x7F for 1-byte `C`, any non-surrogate value up to 0xFFFF for
    2-byte `C`, and always for 4-byte `C`.
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        return true;
    }
    else static if (C.sizeof == 2)
    {
        immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
        return c <= 0xFFFF && !isSurrogate;
    }
    else static if (C.sizeof == 1)
    {
        return c <= 0x7F;
    }
    else
    {
        static assert(0);
    }
}
// canSearchInCodeUnits across encodings: ASCII is searchable everywhere,
// 'ö' and '日' are not searchable in char, a surrogate value is not
// searchable in wchar, and a supplementary code point is searchable only in
// dchar.
2907 @safe unittest
2909 assert( canSearchInCodeUnits! char('a'));
2910 assert( canSearchInCodeUnits!wchar('a'));
2911 assert( canSearchInCodeUnits!dchar('a'));
2912 assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF
2913 assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
2914 assert( canSearchInCodeUnits!wchar('ö'));
2915 assert( canSearchInCodeUnits!dchar('ö'));
2916 assert(!canSearchInCodeUnits! char('日'));
2917 assert( canSearchInCodeUnits!wchar('日'));
2918 assert( canSearchInCodeUnits!dchar('日'));
2919 assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
2920 assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
2921 assert(!canSearchInCodeUnits! char('\U00010001'));
2922 assert(!canSearchInCodeUnits!wchar('\U00010001'));
2923 assert( canSearchInCodeUnits!dchar('\U00010001'));
2926 /* =================== Validation ======================= */
/++
    Checks to see if `str` is well-formed unicode or not.

    The string is decoded from start to end with `decode`; the result of
    each step is discarded.

    Params:
        str = the string to check

    Throws:
        `UTFException` if `str` is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    immutable end = str.length;
    size_t pos = 0;

    // decode advances `pos` past each code point it reads.
    while (pos < end)
    {
        decode(str, pos);
    }
}
// validate must throw UTFException for a char[] holding the bytes
// 167, 133, 175.
2945 @safe unittest
2947 import std.exception : assertThrown;
2948 char[] a = [167, 133, 175];
2949 assertThrown!UTFException(validate(a));
2952 // https://issues.dlang.org/show_bug.cgi?id=12923
// Same invalid bytes as the previous test, but held in a static array and
// passed to validate as a slice; the call must throw.
2953 @safe unittest
2955 import std.exception;
2956 assertThrown((){
2957 char[3]a=[167, 133, 175];
2958 validate(a[]);
2959 }());
/**
 * Encodes the elements of `s` to UTF-8 and returns a newly allocated
 * `string` of the elements.
 *
 * Params:
 *     s = the string or character range to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
string toUTF8(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    // All eager conversions share one implementation, selected by result type.
    alias Result = string;
    return toUTFImpl!Result(s);
}
// toUTF8 on UTF-16 and UTF-32 literals, compared against the expected raw
// UTF-8 code units.
2980 @safe pure unittest
2982 import std.algorithm.comparison : equal;
2984 // The ø is represented by two UTF-8 code units
2985 assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
2987 // 𐐷 is four code units in UTF-8
2988 assert("𐐷"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
// toUTF8 fed from ReferenceInputRange objects instead of arrays; expects the
// same UTF-8 code units as the array-based test.
// NOTE(review): ReferenceInputRange comes from std.internal.test.dummyrange
// and is not visible here - presumably a class-based input range; confirm.
2991 @system pure unittest
2993 import std.algorithm.comparison : equal;
2994 import std.internal.test.dummyrange : ReferenceInputRange;
2996 alias RT = ReferenceInputRange!(ElementType!(string));
2997 auto r1 = new RT("Hellø");
2998 auto r2 = new RT("𐐷");
3000 assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
3001 assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
/**
 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
 * `wstring` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    // All eager conversions share one implementation, selected by result type.
    alias Result = wstring;
    return toUTFImpl!Result(s);
}
// toUTF16 on UTF-32 literals: each input is one dchar and must come out as
// the expected surrogate pair.
3022 @safe pure unittest
3024 import std.algorithm.comparison : equal;
3026 // these code points are two code units in UTF-16 and one in UTF-32
3027 assert("𤭢"d.length == 1);
3028 assert("𐐷"d.length == 1);
3030 assert("𤭢"d.toUTF16.equal([0xD852, 0xDF62]));
3031 assert("𐐷"d.toUTF16.equal([0xD801, 0xDC37]));
// toUTF16 fed from ReferenceInputRange objects instead of arrays; expects the
// same surrogate pairs as the array-based test.
// NOTE(review): ReferenceInputRange comes from std.internal.test.dummyrange
// and is not visible here - presumably a class-based input range; confirm.
3034 @system pure unittest
3036 import std.algorithm.comparison : equal;
3037 import std.internal.test.dummyrange : ReferenceInputRange;
3039 alias RT = ReferenceInputRange!(ElementType!(string));
3040 auto r1 = new RT("𤭢");
3041 auto r2 = new RT("𐐷");
3043 assert(r1.toUTF16.equal([0xD852, 0xDF62]));
3044 assert(r2.toUTF16.equal([0xD801, 0xDC37]));
/**
 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
 * `dstring` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(scope S s)
if (isSomeFiniteCharInputRange!S)
{
    // All eager conversions share one implementation, selected by result type.
    alias Result = dstring;
    return toUTFImpl!Result(s);
}
// toUTF32 on UTF-16 literals: each input is a surrogate pair and must come
// out as the expected single code point.
3066 @safe pure unittest
3068 import std.algorithm.comparison : equal;
3070 // these code points are two code units in UTF-16 and one in UTF-32
3071 assert("𤭢"w.length == 2);
3072 assert("𐐷"w.length == 2);
3074 assert("𤭢"w.toUTF32.equal([0x00024B62]));
3075 assert("𐐷"w.toUTF32.equal([0x00010437]));
// Shared implementation of toUTF8, toUTF16 and toUTF32.
//
// Params:
//     T = the string type to produce
//     s = the source string or character range
//
// Returns: `s.idup` when `S` already converts implicitly to `T`; otherwise a
// new `T` built by transcoding `s` through `byUTF` into an appender.
private T toUTFImpl(T, S)(scope S s)
{
    static if (is(S : T))
    {
        return s.idup;
    }
    else
    {
        import std.array : appender;
        auto app = appender!T();

        // Reserve up front whenever the source can report a length.
        static if (is(S == C[], C) || hasLength!S)
            app.reserve(s.length);

        foreach (c; s.byUTF!(typeof(cast() ElementEncodingType!T.init)))
            app.put(c);

        return app.data;
    }
}
3100 /* =================== toUTFz ======================= */
/++
    Returns a C-style zero-terminated string equivalent to `str`. `str`
    must not contain embedded `'\0'`'s as any C function will treat the first
    `'\0'` that it sees as the end of the string. If `str.empty` is
    `true`, then a string containing only `'\0'` is returned.

    `toUTFz` accepts any type of string and is templated on the type of
    character pointer that you wish to convert to. It will avoid allocating a
    new string if it can, but there's a decent chance that it will end up having
    to allocate a new string - particularly when dealing with character types
    other than `char`.

    $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then if
    anything alters the character one past the end of `str` (which is the
    `'\0'` character terminating the string), then the string won't be
    zero-terminated anymore. The most likely scenarios for that are if you
    append to `str` and no reallocation takes place or when `str` is a
    slice of a larger array, and you alter the character in the larger array
    which is one character past the end of `str`. Another case where it could
    occur would be if you had a mutable character array immediately after
    `str` in memory (for example, if they're member variables in a
    user-defined type with one declared right after the other) and that
    character array happened to start with `'\0'`. Such scenarios will never
    occur if you immediately use the zero-terminated string after calling
    `toUTFz` and the C function using it doesn't keep a reference to it.
    Also, they are unlikely to occur even if you save the zero-terminated string
    (the cases above would be among the few examples of where it could happen).
    However, if you save the zero-terminated string and want to be absolutely
    certain that the string stays zero-terminated, then simply append a
    `'\0'` to the string and use its `ptr` property rather than calling
    `toUTFz`.

    $(RED Warning 2:) When passing a character pointer to a C function, and the
    C function keeps it around for any reason, make sure that you keep a
    reference to it in your D code. Otherwise, it may go away during a garbage
    collection cycle and cause a nasty bug when the C code tries to use it.
  +/
template toUTFz(P)
if (is(P == C*, C) && isSomeChar!C)
{
    P toUTFz(S)(S str) @safe pure
    if (isSomeString!S)
    {
        // Overload resolution on (P, S) picks the matching implementation.
        auto zeroTerminated = toUTFzImpl!(P, S)(str);
        return zeroTerminated;
    }
}
// Instantiation test for toUTFz: calls it with several pointer types and
// source string types; the results are not inspected.
3150 @safe pure unittest
3152 auto p1 = toUTFz!(char*)("hello world");
3153 auto p2 = toUTFz!(const(char)*)("hello world");
3154 auto p3 = toUTFz!(immutable(char)*)("hello world");
3155 auto p4 = toUTFz!(char*)("hello world"d);
3156 auto p5 = toUTFz!(const(wchar)*)("hello world");
3157 auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
// toUTFz implementation for immutable(C)[] -> C*, const(C)*, or immutable(C)*
// (source and target share the same character width).
//
// An empty `str` yields a pointer to a fresh one-element array holding '\0'.
// For a const or immutable target the function first tries to return `str`'s
// own storage; in every other case it forwards to the const(C)[] overload.
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(immutable typeof(*P.init) == typeof(str[0])))
{
    if (str.empty)
    {
        typeof(*P.init)[] terminator = ['\0'];

        auto rawPtr() @trusted { return terminator.ptr; }
        return rawPtr();
    }

    alias C = typeof(cast() ElementEncodingType!S.init);

    // If P is mutable, then we have to make a copy, so the peek is skipped.
    enum bool targetIsMutable = is(typeof(cast() *P.init) == typeof(*P.init));

    static if (!targetIsMutable)
    {
        if (!__ctfe)
        {
            auto endOf(S s) @trusted { return s.ptr + s.length; }
            immutable pastEnd = endOf(str);

            // Peek past end of str, if it's 0, no conversion necessary.
            // Note that the compiler will put a 0 past the end of static
            // strings, and the storage allocator will put a 0 past the end
            // of newly allocated char[]'s.
            // Is pastEnd dereferenceable? A simple test: if it points to an
            // address multiple of 4, then conservatively assume the pointer
            // might be pointing to a new block of memory, which might be
            // unreadable. Otherwise, it's definitely pointing to valid
            // memory.
            if ((cast(size_t) pastEnd & 3) && *pastEnd == '\0')
                return &str[0];
        }
    }

    return toUTFzImpl!(P, const(C)[])(cast(const(C)[]) str);
}
// toUTFz implementation for C[] or const(C)[] -> C*, const(C)*, or
// immutable(C)* (source and target share the same character width).
//
// When the qualifiers allow the result to alias `str`, the function returns
// `str`'s own storage if a '\0' already follows it, and otherwise appends
// one. When they do not, it returns a fresh, terminated copy.
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(typeof(str[0]) C) && is(immutable typeof(*P.init) == immutable C) && !is(C == immutable))
{
    alias InChar   = typeof(str[0]);
    alias UInChar  = typeof(cast() str[0]); // unqualified version of InChar
    alias OutChar  = typeof(*P.init);
    alias UOutChar = typeof(cast() *P.init); // unqualified version

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if (( is(const(UInChar) == InChar) &&  is(    const(UOutChar) == OutChar)) ||
               (!is(const(UInChar) == InChar) && !is(immutable(UOutChar) == OutChar)))
    {
        // The peek only makes sense for a non-empty slice: `&str[0]` on an
        // empty one fails the bounds check even when a '\0' follows it, so
        // empty input always takes the append path below.
        if (!__ctfe && str.length != 0)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            auto p = trustedPtrAdd(str);

            // Only dereference when p is not 4-byte aligned, i.e. when it
            // cannot be the first byte of a new, possibly unreadable block.
            if ((cast(size_t) p & 3) && *p == '\0')
                return &str[0];
        }

        str ~= '\0';
        return &str[0];
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        import std.array : uninitializedArray;
        auto copy = uninitializedArray!(UOutChar[])(str.length + 1);
        copy[0 .. $ - 1] = str[];
        copy[$ - 1] = '\0';

        auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; }
        return trustedCast(copy);
    }
}
// toUTFz implementation for C1[], const(C1)[], or immutable(C1)[] -> C2*,
// const(C2)*, or immutable(C2)* (source and target widths differ).
//
// Always allocates: every code point of `str` is re-encoded into an appender
// of the target character type, followed by a terminating '\0'.
private P toUTFzImpl(P, S)(S str) @safe pure
if (!is(immutable typeof(*P.init) == immutable typeof(str[0])))
{
    import std.array : appender;

    alias OutChar = typeof(*P.init);
    auto sink = appender!(OutChar[])();

    foreach (dchar codePoint; str)
    {
        sink.put(codePoint);
    }
    sink.put('\0');

    return () @trusted { return cast(P) sink.data.ptr; } ();
}
// Exhaustive toUTFz test.
// Part 1 (assertCTFEable): for string, wstring and dstring, converts a
// literal and a copy of it that has '\n' - not '\0' - stored right after
// its end, and checks that every same-width pointer type yields the original
// contents followed by '\0'.
// Part 2 (assertCTFEable): the local `test` helper measures the result up to
// its terminator and compares it with the source via cmp, for every
// combination of source string type and target pointer type.
3257 @safe pure unittest
3259 import core.exception : AssertError;
3260 import std.algorithm;
3261 import std.conv : to;
3262 import std.exception;
3263 import std.string : format;
3265 assertCTFEable!(
3267 foreach (S; AliasSeq!(string, wstring, dstring))
3269 alias C = Unqual!(ElementEncodingType!S);
3271 auto s1 = to!S("hello\U00010143\u0100\U00010143");
// Build s2 with a '\n' immediately past its end by shrinking temp.
3272 auto temp = new C[](s1.length + 1);
3273 temp[0 .. $ - 1] = s1[0 .. $];
3274 temp[$ - 1] = '\n';
3275 --temp.length;
3276 auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
3277 auto s2 = trustedAssumeUnique(temp);
3278 assert(s1 == s2);
3280 void trustedCStringAssert(P, S)(S s) @trusted
3282 auto p = toUTFz!P(s);
3283 assert(p[0 .. s.length] == s);
3284 assert(p[s.length] == '\0');
3287 foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
3289 trustedCStringAssert!P(s1);
3290 trustedCStringAssert!P(s2);
// Helper for part 2: fails with an AssertError naming P and S.
3295 static void test(P, S)(S s, size_t line = __LINE__) @trusted
3297 static size_t zeroLen(C)(const(C)* ptr) @trusted
3299 size_t len = 0;
3300 while (*ptr != '\0') { ++ptr; ++len; }
3301 return len;
3304 auto p = toUTFz!P(s);
3305 immutable len = zeroLen(p);
3306 enforce(cmp(s, p[0 .. len]) == 0,
3307 new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
3308 __FILE__, line));
3311 assertCTFEable!(
3313 foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
3314 dchar*, const(dchar)*, immutable(dchar)*))
3316 test!P("hello\U00010143\u0100\U00010143");
3318 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
3319 dchar*, const(dchar)*, immutable(dchar)*))
3321 test!P("hello\U00010143\u0100\U00010143"w);
3323 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
3324 wchar*, const(wchar)*, immutable(wchar)*))
3326 test!P("hello\U00010143\u0100\U00010143"d);
3328 foreach (S; AliasSeq!( char[], const( char)[],
3329 wchar[], const(wchar)[],
3330 dchar[], const(dchar)[]))
3332 auto s = to!S("hello\U00010143\u0100\U00010143");
3334 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
3335 wchar*, const(wchar)*, immutable(wchar)*,
3336 dchar*, const(dchar)*, immutable(dchar)*))
3338 test!P(s);
/**
    `toUTF16z` is a convenience function for `toUTFz!(const(wchar)*)`.

    Encodes string `str` into UTF-16 and returns a pointer to the
    zero-terminated encoded string.
    `toUTF16z` is suitable for calling the 'W' functions in the Win32 API
    that take an `LPCWSTR` argument.

    Params:
        str = the string to encode

    Returns:
        The result of `toUTFz!(const(wchar)*)(str)`.
  */
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    return toUTFz!(const(wchar)*)(str);
}
///
@system unittest
{
    string str = "Hello, World!";
    const(wchar)* p = str.toUTF16z;
    // The ASCII input has one UTF-16 code unit per char, so the terminator
    // sits at index str.length.
    assert(p[str.length] == '\0');
}
@safe pure unittest
{
    import std.conv : to;
    //toUTFz is already thoroughly tested, so this will just verify that
    //toUTF16z compiles properly for the various string types.
    foreach (S; AliasSeq!(string, wstring, dstring))
        assert(toUTF16z(to!S("hello world")) !is null);
}
3376 /* ================================ tests ================================== */
// Cross-checks toUTF8/toUTF16/toUTF32 on ASCII, BMP and supplementary-plane
// text, at run time and in CTFE.
@safe pure unittest
{
    import std.exception;

    assertCTFEable!(
    {
    assert(toUTF16("hello"c) == "hello");
    assert(toUTF32("hello"c) == "hello");
    assert(toUTF8 ("hello"w) == "hello");
    assert(toUTF32("hello"w) == "hello");
    assert(toUTF8 ("hello"d) == "hello");
    assert(toUTF16("hello"d) == "hello");

    assert(toUTF16("hel\u1234o"c) == "hel\u1234o");
    assert(toUTF32("hel\u1234o"c) == "hel\u1234o");
    assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o");
    assert(toUTF32("hel\u1234o"w) == "hel\u1234o");
    assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o");
    assert(toUTF16("hel\u1234o"d) == "hel\u1234o");

    assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
    assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
    assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
    assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
    assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
    assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
    });
}
/++
    Returns the total number of code points encoded in `str`.

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Params:
        str = the string whose code points are counted

    Returns:
        The number of elements produced by walking `str.byDchar`.

    Note:
        This function is `nothrow` and does not raise `UTFException`.
        `str` is decoded through $(LREF byDchar) with its default
        `Yes.useReplacementDchar` policy, so ill-formed input contributes
        $(LREF replacementDchar) elements to the count instead of being
        reported as an error.
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    return walkLength(str.byDchar);
}
///
@safe pure nothrow @nogc unittest
{
    assert(count("") == 0);
    assert(count("a") == 1);
    assert(count("abc") == 3);
    assert(count("\u20AC100") == 4);
}
// Same checks as the documented example, additionally run in CTFE.
@safe pure nothrow @nogc unittest
{
    import std.exception;
    assertCTFEable!(
    {
    assert(count("") == 0);
    assert(count("a") == 1);
    assert(count("abc") == 3);
    assert(count("\u20AC100") == 4);
    });
}
// Ranges of code units for testing.
version (StdUnittest)
{
private:
    // Struct exposing only empty/front/popFront over a copy of the string.
    struct InputCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Struct adding back/popBack/save/length to the input primitives.
    struct BidirCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return BidirCU(_str); }
        @property size_t length() { return _str.length; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Struct adding opIndex and opSlice on top of the bidirectional API.
    struct RandomCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return RandomCU(_str); }
        @property size_t length() { return _str.length; }
        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Class (reference type) counterpart of BidirCU.
    class RefBidirCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return new RefBidirCU(_str); }
        @property size_t length() { return _str.length; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Class (reference type) counterpart of RandomCU.
    class RefRandomCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return new RefRandomCU(_str); }
        @property size_t length() { return _str.length; }
        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }
}
/**
 * Inserted in place of invalid UTF sequences.
 *
 * References:
 *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
 */
enum dchar replacementDchar = '\uFFFD';
/********************************************
 * Iterate a range of char, wchar, or dchars by code unit.
 *
 * The purpose is to bypass the special case decoding that
 * $(REF front, std,range,primitives) does to character arrays. As a result,
 * using ranges with `byCodeUnit` can be `nothrow` while
 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
 * sequences.
 *
 * A code unit is a building block of the UTF encodings. Generally, an
 * individual code unit does not represent what's perceived as a full
 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
 * are encoded with multiple code units. For example, the UTF-8 code units for
 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
 * often does not form a character on its own. Attempting to treat it as
 * one while iterating over the resulting range will give nonsensical results.
 *
 * Params:
 *      r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 *      of characters (including strings) or a type that implicitly converts to a string type.
 * Returns:
 *      If `r` is not an auto-decodable string (i.e. a narrow string or a
 *      user-defined type that implicitly converts to a string type), then `r`
 *      is returned.
 *
 *      Otherwise, `r` is converted to its corresponding string type (if it's
 *      not already a string) and wrapped in a random-access range where the
 *      element encoding type of the string (its code unit) is the element type
 *      of the range, and that range is returned. The range has slicing.
 *
 *      If `r` is quirky enough to be a struct or class which is an input range
 *      of characters on its own (i.e. it has the input range API as member
 *      functions), $(I and) it's implicitly convertible to a string type, then
 *      `r` is returned, and no implicit conversion takes place.
 *
 *      If `r` is wrapped in a new range, then that range has a `source`
 *      property for returning the string that's currently contained within that
 *      range.
 *
 * See_Also:
 *      Refer to the $(MREF std, uni) docs for a reference on Unicode
 *      terminology.
 *
 *      For a range that iterates by grapheme cluster (written character) see
 *      $(REF byGrapheme, std,uni).
 */
auto byCodeUnit(R)(R r)
if ((isConvertibleToString!R && !isStaticArray!R) ||
    (isInputRange!R && isSomeChar!(ElementEncodingType!R)))
{
    import std.traits : StringTypeOf;
    static if (// This would be cleaner if we had a way to check whether a type
               // was a range without any implicit conversions.
               (isAutodecodableString!R && !__traits(hasMember, R, "empty") &&
                !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront")))
    {
        // Auto-decodable string that does not itself declare the range
        // primitives: wrap it so its elements are the raw code units.
        static struct ByCodeUnitImpl
        {
        @safe pure nothrow @nogc:

            @property bool empty() const     { return source.length == 0; }
            @property auto ref front() inout { return source[0]; }
            void popFront()                  { source = source[1 .. $]; }

            @property auto save() { return ByCodeUnitImpl(source.save); }

            @property auto ref back() inout { return source[$ - 1]; }
            void popBack()                  { source = source[0 .. $-1]; }

            auto ref opIndex(size_t index) inout     { return source[index]; }
            auto opSlice(size_t lower, size_t upper) { return ByCodeUnitImpl(source[lower .. upper]); }

            @property size_t length() const { return source.length; }
            alias opDollar = length;

            // The string currently covered by this range.
            StringTypeOf!R source;
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    else static if (!isInputRange!R ||
                    (is(R : const dchar[]) && !__traits(hasMember, R, "empty") &&
                    !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront")))
    {
        // Not a range itself, or something converting to a dchar array:
        // hand back the underlying string type unwrapped.
        return cast(StringTypeOf!R) r;
    }
    else
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
}
///
@safe unittest
{
    import std.range.primitives;
    import std.traits : isAutodecodableString;

    auto r = "Hello, World!".byCodeUnit();
    static assert(hasLength!(typeof(r)));
    static assert(hasSlicing!(typeof(r)));
    static assert(isRandomAccessRange!(typeof(r)));
    static assert(is(ElementType!(typeof(r)) == immutable char));

    // contrast with the range capabilities of standard strings (with or
    // without autodecoding enabled).
    auto s = "Hello, World!";
    static assert(isBidirectionalRange!(typeof(r)));
    static if (isAutodecodableString!(typeof(s)))
    {
        // with autodecoding enabled, strings are non-random-access ranges of
        // dchar.
        static assert(is(ElementType!(typeof(s)) == dchar));
        static assert(!isRandomAccessRange!(typeof(s)));
        static assert(!hasSlicing!(typeof(s)));
        static assert(!hasLength!(typeof(s)));
    }
    else
    {
        // without autodecoding, strings are normal arrays.
        static assert(is(ElementType!(typeof(s)) == immutable char));
        static assert(isRandomAccessRange!(typeof(s)));
        static assert(hasSlicing!(typeof(s)));
        static assert(hasLength!(typeof(s)));
    }
}
/// `byCodeUnit` does no Unicode decoding
@safe unittest
{
    string noel1 = "noe\u0308l"; // noël using e + combining diaeresis
    assert(noel1.byCodeUnit[2] != 'ë');
    assert(noel1.byCodeUnit[2] == 'e');

    string noel2 = "no\u00EBl"; // noël using a precomposed ë character
    // Because string is UTF-8, the code unit at index 2 is just
    // the first of a sequence that encodes 'ë'
    assert(noel2.byCodeUnit[2] != 'ë');
}
/// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : popFrontN;
    import std.traits : isAutodecodableString;
    {
        auto range = byCodeUnit("hello world");
        range.popFrontN(3);
        assert(equal(range.save, "lo world"));
        static if (isAutodecodableString!string) // only enabled with autodecoding
        {
            string str = range.source;
            assert(str == "lo world");
        }
    }
    // source only exists if the range was wrapped
    {
        auto range = byCodeUnit("hello world"d);
        static assert(!__traits(compiles, range.source));
    }
}
// Checks byCodeUnit's result type and behaviour for strings, string-like
// structs, user-defined ranges, types that are both, and enums.
@safe pure nothrow @nogc unittest
{
    import std.range;
    {
        enum testStr = "𐁄𐂌𐃯 hello ディラン";
        char[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    {
        enum testStr = "𐁄𐂌𐃯 hello ディラン"w;
        wchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    {
        enum testStr = "𐁄𐂌𐃯 hello ディラン"d;
        dchar[testStr.length] s;
        int i;
        foreach (c; testStr.byCodeUnit().byCodeUnit())
        {
            s[i++] = c;
        }
        assert(s == testStr);
    }
    {
        auto bcu = "hello".byCodeUnit();
        assert(bcu.length == 5);
        assert(bcu[3] == 'l');
        assert(bcu[2 .. 4][1] == 'l');
    }
    {
        char[5] orig = "hello";
        auto bcu = orig[].byCodeUnit();
        bcu.front = 'H';
        assert(bcu.front == 'H');
        bcu[1] = 'E';
        assert(bcu[1] == 'E');
    }
    {
        auto bcu = "hello".byCodeUnit().byCodeUnit();
        static assert(isForwardRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
        auto s = bcu.save;
        bcu.popFront();
        assert(s.front == 'h');
    }
    {
        auto bcu = "hello".byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'o');
        ret.popFront();
        assert(ret.front == 'l');
    }
    {
        auto bcu = "κόσμε"w.byCodeUnit();
        static assert(hasSlicing!(typeof(bcu)));
        static assert(isBidirectionalRange!(typeof(bcu)));
        static assert(is(typeof(bcu) == struct) == isAutodecodableString!wstring);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        auto ret = bcu.retro;
        assert(ret.front == 'ε');
        ret.popFront();
        assert(ret.front == 'μ');
    }
    {
        static struct Stringish
        {
            string s;
            alias s this;
        }

        auto orig = Stringish("\U0010fff8 𐁊 foo 𐂓");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == Stringish) == isAutodecodableString!Stringish);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == cast(char) 244);
    }
    {
        static struct WStringish
        {
            wstring s;
            alias s this;
        }

        auto orig = WStringish("\U0010fff8 𐁊 foo 𐂓"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == struct));
        static assert(!is(typeof(bcu) == WStringish) == isAutodecodableString!WStringish);
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        assert(bcu.front == cast(wchar) 56319);
    }
    {
        static struct DStringish
        {
            dstring s;
            alias s this;
        }

        auto orig = DStringish("\U0010fff8 𐁊 foo 𐂓"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        assert(bcu.front == cast(dchar) 1114104);
    }
    {
        static struct FuncStringish
        {
            string str;
            string s() pure nothrow @nogc { return str; }
            alias s this;
        }

        auto orig = FuncStringish("\U0010fff8 𐁊 foo 𐂓");
        auto bcu = orig.byCodeUnit();
        static if (isAutodecodableString!FuncStringish)
            static assert(is(typeof(bcu) == struct));
        else
            static assert(is(typeof(bcu) == string));
        static assert(!is(typeof(bcu) == FuncStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == cast(char) 244);
    }
    {
        static struct Range
        {
            string data;
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = Range("\U0010fff8 𐁊 foo 𐂓");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == Range));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == cast(char) 244);
    }
    {
        static struct WRange
        {
            wstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = WRange("\U0010fff8 𐁊 foo 𐂓"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 56319);
    }
    {
        static struct DRange
        {
            dstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto orig = DRange("\U0010fff8 𐁊 foo 𐂓"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRange));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 1114104);
    }
    {
        static struct RangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            string data;
            string s;
            alias s this;
        }

        auto orig = RangeAndStringish("test.d", "other");
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == RangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == char));
        assert(bcu.front == 't');
    }
    {
        static struct WRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            wstring data;
            wstring s;
            alias s this;
        }

        auto orig = WRangeAndStringish("test.d"w, "other"w);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == WRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == wchar));
        assert(bcu.front == 't');
    }
    {
        static struct DRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            dstring data;
            dstring s;
            alias s this;
        }

        auto orig = DRangeAndStringish("test.d"d, "other"d);
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == DRangeAndStringish));
        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
        static assert(is(ElementType!(typeof(bcu)) == dchar));
        assert(bcu.front == 't');
    }
    {
        enum Enum : string { a = "test.d" }

        auto orig = Enum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == Enum));
        static if (isAutodecodableString!Enum)
            static assert(is(typeof(bcu) == struct));
        else
            static assert(is(typeof(bcu) == string));
        static assert(is(ElementType!(typeof(bcu)) == immutable char));
        assert(bcu.front == 't');
    }
    {
        enum WEnum : wstring { a = "test.d"w }

        auto orig = WEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(!is(typeof(bcu) == WEnum));
        static if (isAutodecodableString!WEnum)
            static assert(is(typeof(bcu) == struct));
        else
            static assert(is(typeof(bcu) == wstring));
        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
        assert(bcu.front == 't');
    }
    {
        enum DEnum : dstring { a = "test.d"d }

        auto orig = DEnum.a;
        auto bcu = orig.byCodeUnit();
        static assert(is(typeof(bcu) == dstring));
        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
        assert(bcu.front == 't');
    }

    static if (autodecodeStrings)
    {
        static assert(!is(typeof(byCodeUnit("hello")) == string));
        static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
    }
    else
    {
        static assert(is(typeof(byCodeUnit("hello")) == string));
        static assert(is(typeof(byCodeUnit("hello"w)) == wstring));
    }
    static assert(is(typeof(byCodeUnit("hello"d)) == dstring));

    static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));

    enum SEnum : char[5] { a = "hello" }
    enum WSEnum : wchar[5] { a = "hello"w }
    enum DSEnum : dchar[5] { a = "hello"d }

    static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
}
/****************************
 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * of characters by char, wchar, or dchar.
 * These aliases simply forward to $(LREF byUTF) with the
 * corresponding C argument.
 *
 * Params:
 *      r = input range of characters, or array of characters
 */
alias byChar = byUTF!char;

/// Ditto
alias byWchar = byUTF!wchar;

/// Ditto
alias byDchar = byUTF!dchar;
// byChar: identity on char input, replacement of invalid dchars, and
// popFront/save behaviour on wstring and dstring input.
@safe pure nothrow @nogc unittest
{
    {
        char[5] s;
        int i;
        foreach (c; "hello".byChar.byChar())
        {
            //writefln("[%d] '%c'", i, c);
            s[i++] = c;
        }
        assert(s == "hello");
    }
    {
        char[5+2+3+4+3+3] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byChar())
        {
            //writefln("[%d] '%c'", i, c);
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
    }
    {
        auto r = "hello"w.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byChar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
// byWchar: replacement of invalid dchars, and popFront/save behaviour on
// string and dstring input.
@safe pure nothrow @nogc unittest
{
    {
        wchar[11] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byWchar())
        {
            //writefln("[%d] '%c' x%x", i, c, c);
            s[i++] = c;
        }
        foreach (j, wchar c; "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w)
        {
            //writefln("[%d] '%c' x%x", j, c, c);
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
    }

    {
        auto r = "hello".byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    {
        auto r = "hello"d.byWchar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
// byDchar: decoding of string/wstring/dstring input, replacement of invalid
// sequences, surrogate pairs, and save behaviour.
@safe pure nothrow @nogc unittest
{
    {
        dchar[9] s;
        int i;
        string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
    }
    {
        foreach (s; invalidUTFstrings!char())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        auto r = "hello".byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    {
        dchar[8] s;
        int i;
        wstring a = "hello\u07FF\uD7FF\U0010FFFF"w;
        foreach (c; a.byDchar())
        {
            //writefln("[%d] '%c' x%x", i, c, c);
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF"d);
    }
    {
        foreach (s; invalidUTFstrings!wchar())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    {
        wchar[2] ws;
        ws[0] = 0xD800;
        ws[1] = 0xDD00;             // correct surrogate pair
        auto r = ws[].byDchar();
        assert(!r.empty);
        assert(r.front == r.front);
        dchar c = r.front;
        assert(c == '\U00010100');
    }
    {
        auto r = "hello"w.byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    {
        dchar[5] s;
        int i;
        dstring a = "hello"d;
        foreach (c; a.byDchar.byDchar())
        {
            //writefln("[%d] '%c' x%x", i, c, c);
            s[i++] = c;
        }
        assert(s == "hello"d);
    }
    {
        auto r = "hello".byDchar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
    {
        auto r = "hello"w.byDchar();
        assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
// test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar,
// which needs to support ranges with and without those attributes

pure @safe nothrow @nogc unittest
{
    dchar[5] s = "hello"d;
    foreach (c; s[].byChar())  { }
    foreach (c; s[].byWchar()) { }
    foreach (c; s[].byDchar()) { }
}
// Module-level mutable state for unittests; incrementing it from a range
// primitive makes that primitive impure.
version (StdUnittest)
private int impureVariable;
// byChar/byWchar/byDchar must still compile for a range whose popFront is
// impure, throwing and not @safe. The range is always empty, so popFront is
// never actually executed.
@system unittest
{
    static struct ImpureThrowingSystemRange(Char)
    {
        @property bool empty() const { return true; }
        @property Char front() const { return Char.init; }
        void popFront()
        {
            impureVariable++;
            throw new Exception("only for testing nothrow");
        }
    }

    foreach (Char; AliasSeq!(char, wchar, dchar))
    {
        ImpureThrowingSystemRange!Char range;
        foreach (c; range.byChar())  { }
        foreach (c; range.byWchar()) { }
        foreach (c; range.byDchar()) { }
    }
}
/****************************
 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * of characters by char type `C` by encoding the elements of the range.
 *
 * UTF sequences that cannot be converted to the specified encoding are either
 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
 * of the Unicode Standard 6.2 or result in a thrown UTFException.
 * Hence byUTF is not symmetric.
 * This algorithm is lazy, and does not allocate memory.
 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
 * `r` parameter.
 *
 * Params:
 *      C = `char`, `wchar`, or `dchar`; a qualified type such as `const(char)`
 *          is treated as its unqualified counterpart
 *      useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
 *                            UseReplacementDchar.no means throw `UTFException` for invalid UTF
 *
 * Throws:
 *      `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
 *
 * GC:
 *      Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
 *
 * Returns:
 *      A bidirectional range if `R` is a bidirectional range and not auto-decodable,
 *      as defined by $(REF isAutodecodableString, std, traits).
 *
 *      A forward range if `R` is a forward range and not auto-decodable.
 *
 *      Or, if `R` is a range and it is auto-decodable and
 *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
 *      to $(LREF byCodeUnit).
 *
 *      Otherwise, an input range of characters.
 */
template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar)
if (isSomeChar!C)
{
    // Qualified C: forward to the unqualified instantiation. The replacement
    // policy must be forwarded too, otherwise a caller asking for
    // No.useReplacementDchar would silently get the default.
    static if (is(immutable C == immutable UC, UC) && !is(C == UC))
        alias byUTF = byUTF!(UC, useReplacementDchar);
    else:

    // Auto-decodable strings are first turned into ranges of code units.
    auto ref byUTF(R)(R r)
        if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        return byUTF(r.byCodeUnit());
    }

    auto ref byUTF(R)(R r)
        if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        static if (is(immutable ElementEncodingType!R == immutable RC, RC) && is(RC == C))
        {
            // Source already has the requested code unit type.
            return r.byCodeUnit();
        }
        else static if (is(C == dchar))
        {
            // char/wchar source decoded to dchar: one decoded code point is
            // cached per end of the range.
            static struct Result
            {
                enum Empty = uint.max;  // range is empty or just constructed

                this(return scope R r)
                {
                    this.r = r;
                }

                this(return scope R r, uint buff)
                {
                    this.r = r;
                    this.buff = buff;
                }

                static if (isBidirectionalRange!R)
                {
                    this(return scope R r, uint frontBuff, uint backBuff)
                    {
                        this.r = r;
                        this.buff = frontBuff;
                        this.backBuff = backBuff;
                    }
                }

                @property bool empty()
                {
                    static if (isBidirectionalRange!R)
                        return buff == Empty && backBuff == Empty && r.empty;
                    else
                        return buff == Empty && r.empty;
                }

                @property dchar front() scope // 'scope' required by call to decodeFront() below
                {
                    if (buff == Empty)
                    {
                        auto c = r.front;

                        static if (is(RC == wchar))
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80; // First non-ASCII.
                        if (c < firstMulti)
                        {
                            r.popFront;
                            buff = cast(dchar) c;
                        }
                        else
                        {
                            buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                        }
                    }
                    return cast(dchar) buff;
                }

                void popFront()
                {
                    // front() consumes the code units from `r`; make sure that
                    // has happened before discarding the cached value.
                    if (buff == Empty)
                        front();
                    buff = Empty;
                }

                static if (isForwardRange!R)
                {
                    @property auto save()
                    {
                        static if (isBidirectionalRange!R)
                        {
                            return Result(r.save, buff, backBuff);
                        }
                        else
                        {
                            return Result(r.save, buff);
                        }
                    }
                }

                static if (isBidirectionalRange!R)
                {
                    // NOTE(review): when front() has already buffered the last
                    // code point, `r` is empty here although this range is not;
                    // r.back is then called on an empty source - confirm
                    // whether callers can hit this.
                    @property dchar back() scope // 'scope' required by call to decodeBack() below
                    {
                        if (backBuff != Empty)
                            return cast(dchar) backBuff;

                        auto c = r.back;
                        static if (is(RC == wchar))
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80; // First non-ASCII.
                        if (c < firstMulti)
                        {
                            r.popBack;
                            backBuff = cast(dchar) c;
                        }
                        else
                        {
                            backBuff = () @trusted { return decodeBack!useReplacementDchar(r); }();
                        }
                        return cast(dchar) backBuff;

                    }

                    void popBack()
                    {
                        if (backBuff == Empty)
                            back();
                        backBuff = Empty;
                    }
                }

            private:

                R r;
                uint buff = Empty;      // one character lookahead buffer
                static if (isBidirectionalRange!R)
                    uint backBuff = Empty;
            }

            return Result(r);
        }
        else
        {
            // Target is char or wchar and differs from the source type: each
            // code point is (decoded and) re-encoded into a small buffer that
            // is then handed out one code unit at a time.
            static struct Result
            {
                this(return scope R r)
                {
                    this.r = r;
                }

                this(return scope R r, ushort pos, ushort fill, C[4 / C.sizeof] buf)
                {
                    this.r = r;
                    this.pos = pos;
                    this.fill = fill;
                    this.buf = buf;
                }

                static if (isBidirectionalRange!R)
                {
                    this(return scope R r, ushort frontPos, ushort frontFill,
                         ushort backPos, ushort backFill, C[4 / C.sizeof] buf,
                         C[4 / C.sizeof] backBuf = (C[4 / C.sizeof]).init)
                    {
                        this.r = r;
                        this.pos = frontPos;
                        this.fill = frontFill;
                        this.backPos = backPos;
                        this.backFill = backFill;
                        this.buf = buf;
                        this.backBuf = backBuf;
                    }
                }

                @property bool empty()
                {
                    static if (isBidirectionalRange!R)
                        return pos == fill && backPos == backFill && r.empty;
                    else
                        return pos == fill && r.empty;
                }

                @property auto front() scope // 'scope' required by call to decodeFront() below
                {
                    if (pos == fill)
                    {
                        pos = 0;
                        auto c = r.front;

                        static if (C.sizeof >= 2 && RC.sizeof >= 2)
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80; // First non-ASCII.
                        if (c < firstMulti)
                        {
                            fill = 1;
                            r.popFront;
                            buf[pos] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                r.popFront;
                                dchar dc = c;
                            }
                            else
                                dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                            fill = cast(ushort) encode!(useReplacementDchar)(buf, dc);
                        }
                    }
                    return buf[pos];
                }

                void popFront()
                {
                    if (pos == fill)
                        front;
                    ++pos;
                }

                static if (isForwardRange!R)
                {
                    @property auto save()
                    {
                        static if (isBidirectionalRange!R)
                        {
                            return Result(r.save, pos, fill, backPos, backFill, buf, backBuf);
                        }
                        else
                        {
                            return Result(r.save, pos, fill, buf);
                        }
                    }
                }

                static if (isBidirectionalRange!R)
                {
                    // The back end has its own buffer: sharing `buf` with
                    // front() would let a call to back() overwrite the code
                    // units front() has cached and not yet handed out.
                    //
                    // NOTE(review): as in the dchar case, r.back is reached
                    // with an empty `r` when front() has already buffered the
                    // last code point - confirm whether callers can hit this.
                    @property auto back() scope // 'scope' required by call to decodeBack() below
                    {
                        if (backPos != backFill)
                            return backBuf[cast(ushort) (backFill - backPos - 1)];

                        backPos = 0;
                        auto c = r.back;
                        static if (C.sizeof >= 2 && RC.sizeof >= 2)
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80; // First non-ASCII.
                        if (c < firstMulti)
                        {
                            backFill = 1;
                            r.popBack;
                            backBuf[cast(ushort) (backFill - backPos - 1)] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                r.popBack;
                                dchar dc = c;
                            }
                            else
                                dchar dc = () @trusted { return decodeBack!(useReplacementDchar)(r); }();
                            backFill = cast(ushort) encode!(useReplacementDchar)(backBuf, dc);
                        }
                        return backBuf[cast(ushort) (backFill - backPos - 1)];
                    }

                    void popBack()
                    {
                        if (backPos == backFill)
                            back;
                        ++backPos;
                    }
                }

            private:

                R r;
                ushort pos, fill;       // read position / valid length in buf
                static if (isBidirectionalRange!R)
                {
                    ushort backPos, backFill; // same, counted from the end of backBuf
                    C[4 / C.sizeof] backBuf = void;
                }
                C[4 / C.sizeof] buf = void;
            }

            return Result(r);
        }
    }
}
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    string hello = "hell\u00F6";

    // As UTF-8 (`char`), the trailing ö of hellö takes two code units
    assert(equal(hello.byUTF!char(), ['h', 'e', 'l', 'l', 0xC3, 0xB6]));

    // As UTF-16 (`wchar`), the ö fits in a single code unit
    assert(equal(hello.byUTF!wchar(), ['h', 'e', 'l', 'l', 'ö']));

    // 𐐷 needs four code units in UTF-8, two in UTF-16, and one in UTF-32
    string deseret = "𐐷";
    assert(equal(deseret.byUTF!char(), [0xF0, 0x90, 0x90, 0xB7]));
    assert(equal(deseret.byUTF!wchar(), [0xD801, 0xDC37]));
    assert(equal(deseret.byUTF!dchar(), [0x00010437]));
}
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.exception : assertThrown;

    // With replacement enabled, the invalid "\xF0b" sequence is expected to
    // come out as a single U+FFFD.
    auto lenient = "hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.yes);
    assert(equal(lenient, "hello\uFFFDetty"));

    // With replacement disabled, walking the same input must throw. The
    // whole expression stays inside assertThrown so it is evaluated lazily.
    assertThrown!UTFException(
        equal("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.no), "hello betty"));
}
// Walks byUTF ranges from the back for every source/target width
// combination exercised below, and checks that save() preserves the
// back-iteration state.
@safe unittest
{
    {
        // wchar -> char: 0x219 is expected as the two code units C8 99
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!char;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x99);
        r.popBack;
        assert(r.back == 0xc8);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // wchar -> wchar
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!wchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // wchar -> dchar
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!dchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // dchar -> wchar: each code point is expected as a surrogate pair,
        // low surrogate first when walking from the back
        dchar[] s = ['𐐷', '😁'];
        auto r = s.byUTF!wchar;
        assert(r.back == 0xde01);
        r.popBack;
        assert(r.back == 0xd83d);
        r.popBack;
        assert(r.back == 0xdc37);
        r.popBack;
        assert(r.back == 0xd801);
    }

    {
        // dchar -> char, fully drained from the back
        dchar[] s = ['𐐷', '😁'];
        auto r = s.byUTF!char;
        char[] res;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        import std.algorithm.comparison : equal;
        assert(res.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0]));
    }

    {
        // ASCII-only input, fully drained from the back
        dchar[] res;
        auto r = ['a', 'b', 'c', 'd', 'e'].byUTF!dchar;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        import std.algorithm.comparison : equal;
        assert(res.equal(['e', 'd', 'c', 'b', 'a']));
    }

    {
        //testing the save() function
        wchar[] s = ['Ă','ț'];

        auto rc = s.byUTF!char;
        rc.popBack;
        auto rcCopy = rc.save;
        assert(rc.back == rcCopy.back);
        assert(rcCopy.back == 0xc8);

        auto rd = s.byUTF!dchar;
        rd.popBack;
        auto rdCopy = rd.save;
        assert(rd.back == rdCopy.back);
        assert(rdCopy.back == 'Ă');
    }
}
// Back-iteration over the same two-character input for each target width,
// under `pure nothrow`.
@safe pure nothrow unittest
{
    import std.range.primitives;
    wchar[] s = ['ă', 'î'];

    {
        // Code units expected from the back when re-encoding as `char`
        static immutable ubyte[4] fromBack = [0xae, 0xc3, 0x83, 0xc4];

        auto rc = s.byUTF!char;
        static assert(isBidirectionalRange!(typeof(rc)));
        foreach (step, unit; fromBack)
        {
            // pop only between checks; the last unit is inspected, not popped
            if (step != 0)
                rc.popBack;
            assert(rc.back == unit);
        }
    }

    {
        static immutable wchar[2] fromBack = ['î', 'ă'];

        auto rw = s.byUTF!wchar;
        static assert(isBidirectionalRange!(typeof(rw)));
        foreach (step, unit; fromBack)
        {
            if (step != 0)
                rw.popBack;
            assert(rw.back == unit);
        }
    }

    {
        static immutable dchar[2] fromBack = ['î', 'ă'];

        auto rd = s.byUTF!dchar;
        static assert(isBidirectionalRange!(typeof(rd)));
        foreach (step, unit; fromBack)
        {
            if (step != 0)
                rd.popBack;
            assert(rd.back == unit);
        }
    }
}