d: Merge upstream dmd 3982604c5, druntime bc58b1e9, phobos 12329adb6.
[official-gcc.git] / libphobos / src / std / utf.d
blob866ec48cbdccf7bb1e5d23f5919ccf4785ba39e7
1 // Written in the D programming language.
3 /++
4 Encode and decode UTF-8, UTF-16 and UTF-32 strings.
6 UTF character support is restricted to
7 $(D '\u0000' <= character <= '\U0010FFFF').
9 $(SCRIPT inhibitQuickIndex = 1;)
10 $(DIVC quickindex,
11 $(BOOKTABLE,
12 $(TR $(TH Category) $(TH Functions))
13 $(TR $(TD Decode) $(TD
14 $(LREF decode)
15 $(LREF decodeFront)
17 $(TR $(TD Lazy decode) $(TD
18 $(LREF byCodeUnit)
19 $(LREF byChar)
20 $(LREF byWchar)
21 $(LREF byDchar)
22 $(LREF byUTF)
24 $(TR $(TD Encode) $(TD
25 $(LREF encode)
26 $(LREF toUTF8)
27 $(LREF toUTF16)
28 $(LREF toUTF32)
29 $(LREF toUTFz)
30 $(LREF toUTF16z)
32 $(TR $(TD Length) $(TD
33 $(LREF codeLength)
34 $(LREF count)
35 $(LREF stride)
36 $(LREF strideBack)
38 $(TR $(TD Index) $(TD
39 $(LREF toUCSindex)
40 $(LREF toUTFindex)
42 $(TR $(TD Validation) $(TD
43 $(LREF isValidDchar)
44 $(LREF isValidCodepoint)
45 $(LREF validate)
47 $(TR $(TD Miscellaneous) $(TD
48 $(LREF replacementDchar)
49 $(LREF UseReplacementDchar)
50 $(LREF UTFException)
53 See_Also:
54 $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
55 $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
56 $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
57 Copyright: Copyright The D Language Foundation 2000 - 2012.
58 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
59 Authors: $(HTTP digitalmars.com, Walter Bright) and
60 $(HTTP jmdavisprog.com, Jonathan M Davis)
61 Source: $(PHOBOSSRC std/utf.d)
63 module std.utf;
65 import std.exception : basicExceptionCtors;
66 import core.exception : UnicodeException;
67 import std.meta : AliasSeq;
68 import std.range.primitives;
69 import std.traits : isAutodecodableString, isPointer, isSomeChar,
70 isSomeString, isStaticArray, Unqual, isConvertibleToString;
71 import std.typecons : Flag, Yes, No;
/++
    Exception thrown on errors in std.utf functions.

    Besides the message inherited from `UnicodeException`, an instance can
    carry up to four offending code units (see `setSequence`), which
    `toString` renders in hexadecimal.
  +/
class UTFException : UnicodeException
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    uint[4] sequence;
    size_t len;

    /++
        Records the code units of the invalid sequence in this exception.

        Params:
            data = the offending code units; at most four are stored

        Returns:
            `this`, so the call can be chained onto a `new` expression.
      +/
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...) return
    {
        assert(data.length <= 4);

        // Clamp the count so `sequence` is never overrun when asserts are off.
        len = data.length;
        if (len > 4)
            len = 4;

        foreach (pos; 0 .. len)
            sequence[pos] = data[pos];

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once
    // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed

    /**
    Standard exception constructors.
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, 0, file, line, next);
    }
    /// ditto
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        // Append the failing index to the message before handing it to the base class.
        UnsignedStringBuf buf = void;
        msg ~= " (at index " ~ unsignedToTempString(index, buf) ~ ")";
        super(msg, index, file, line, next);
    }

    /**
    Returns:
        A `string` detailing the invalid UTF sequence.
     */
    override string toString() const
    {
        if (len != 0)
        {
            string text = "Invalid UTF sequence:";

            // Each stored unit is printed as " hhx", zero-padded to two hex digits.
            foreach (unit; sequence[0 .. len])
            {
                UnsignedStringBuf scratch = void;
                auto hex = unsignedToTempString!16(unit, scratch);
                text ~= ' ';
                if (hex.length == 1)
                    text ~= '0';
                text ~= hex;
                text ~= 'x';
            }

            if (super.msg.length > 0)
            {
                text ~= " - ";
                text ~= super.msg;
            }

            return text;
        }

        /* No sequence recorded: fall back to the generic rendering.
         * Exception.toString() is not marked as const, although
         * it is const-compatible.
         */
        //return super.toString();
        auto e = () @trusted { return cast(Exception) super; } ();
        return e.toString();
    }
}
@safe unittest
{
    import std.exception : assertThrown;

    char[4] buf;

    // Both surrogate range boundaries and the first value past U+10FFFF
    // must be rejected by encode.
    foreach (immutable uint codePoint; [0xD800, 0xDBFF, 0xDC00, 0xDFFF, 0x110000])
        assertThrown!UTFException(encode(buf, cast(dchar) codePoint));
}
/++
    Provide array of invalidly encoded UTF strings. Useful for testing.

    Params:
        Char = char, wchar, or dchar

    Returns:
        an array of invalidly encoded UTF strings
 +/
package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
if (isSomeChar!Char)
{
    static if (is(Char == char))
    {
        enum surrogate = 0xDC00;   // invalid surrogate value
        enum tooLarge = 0x110000;  // out of range
        enum char tail = cast(char)(0x80 | 3); // continuation unit used by the 5/6 byte forms

        static immutable string[8] result =
        [
            "\x80",             // not a start byte
            "\xC0",             // truncated
            "\xC0\xC0",         // invalid continuation
            "\xF0\x82\x82\xAC", // overlong

            // three-byte encoding of a surrogate
            [
              0xE0 | (surrogate >> 12),
              0x80 | ((surrogate >> 6) & 0x3F),
              0x80 | (surrogate & 0x3F)
            ],

            // four-byte encoding of a value past U+10FFFF
            [
              cast(char)(0xF0 | (tooLarge >> 18)),
              cast(char)(0x80 | ((tooLarge >> 12) & 0x3F)),
              cast(char)(0x80 | ((tooLarge >> 6) & 0x3F)),
              cast(char)(0x80 | (tooLarge & 0x3F))
            ],

            // 5 byte encoding
            [ cast(char)(0xF8 | 3), tail, tail, tail, tail ],

            // 6 byte encoding
            [ cast(char)(0xFC | 3), tail, tail, tail, tail, tail ],
        ];

        return result[];
    }
    else static if (is(Char == wchar))
    {
        static immutable wstring[5] result =
        [
            [ cast(wchar) 0xDC00 ],
            [ cast(wchar) 0xDFFF ],
            [ cast(wchar) 0xDBFF, cast(wchar) 0xDBFF ],
            [ cast(wchar) 0xDBFF, cast(wchar) 0xE000 ],
            [ cast(wchar) 0xD800 ],
        ];

        return result[];
    }
    else static if (is(Char == dchar))
    {
        static immutable dstring[3] result =
        [
            [ cast(dchar) 0x110000 ],
            [ cast(dchar) 0x00D800 ],
            [ cast(dchar) 0x00DFFF ],
        ];

        // Unlike the other branches, this one hands back the static array itself.
        return result;
    }
    else
        static assert(0);
}
/++
    Check whether the given Unicode code point is valid.

    Params:
        c = code point to check

    Returns:
        `true` if and only if `c` is a valid Unicode code point

    Note:
    `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
    as they are permitted for internal use by an application, but they are
    not allowed for interchange by the Unicode standard.
  +/
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // The surrogate block is never a valid code point on its own.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    // Everything else is valid up to and including U+10FFFF.
    return c <= 0x10FFFF;
}
@safe @nogc pure nothrow unittest
{
    // Ordinary code points, including NUL, are accepted.
    assert( isValidDchar(cast(dchar) 0x00));
    assert( isValidDchar(cast(dchar) 0x41));

    // A surrogate and a value above U+10FFFF are rejected.
    assert(!isValidDchar(cast(dchar) 0x11FFFF));
    assert(!isValidDchar(cast(dchar) 0xD800));
}
pure nothrow @safe @nogc unittest
{
    import std.exception;

    // Runs the same checks at run time and during CTFE.
    assertCTFEable!(
    {
        // Values that must be accepted (noncharacters U+FFFE/U+FFFF included).
        foreach (good; AliasSeq!('a', 0x00FFFE, 0x00FFFF, 0x01FFFF, 0x10FFFF))
            assert(isValidDchar(cast(dchar) good) == true);

        // Values that must be rejected: out of range and the surrogate block edges.
        foreach (bad; AliasSeq!(0x1FFFFF, 0x00D800, 0x00DBFF, 0x00DC00, 0x00DFFF, 0x110000))
            assert(isValidDchar(cast(dchar) bad) == false);
    });
}
/**
Checks if a single character forms a valid code point.

When standing alone, some characters are invalid code points. For
example the `wchar` `0xD800` is a so called high surrogate, which can
only be interpreted together with a low surrogate following it. As a
standalone character it is considered invalid.

See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
Unicode Standard, D90, D91 and D92) for more details.

Params:
    c = character to test
    Char = character type of `c`

Returns:
    `true`, if `c` forms a valid code point.
 */
bool isValidCodepoint(Char)(Char c)
if (isSomeChar!Char)
{
    // Dispatch on the unqualified character type.
    alias Bare = Unqual!Char;

    static if (is(Bare == dchar))
    {
        return isValidDchar(c);
    }
    else static if (is(Bare == wchar))
    {
        // Anything outside the surrogate block 0xD800 .. 0xDFFF stands alone.
        return !(c > 0xD7FF && c < 0xE000);
    }
    else static if (is(Bare == char))
    {
        // Only 7-bit values are complete code points in UTF-8.
        return c <= 0x7F;
    }
    else
        static assert(false, "unknown character type: `" ~ Char.stringof ~ "`");
}
@safe pure nothrow unittest
{
    // One accepted and one rejected value per code unit width.
    assert( isValidCodepoint(cast(dchar) 0x0010FFFF));
    assert(!isValidCodepoint(cast(dchar) 0x12345678));

    assert( isValidCodepoint(cast(wchar) 0x1234));
    assert(!isValidCodepoint(cast(wchar) 0xD800));

    assert( isValidCodepoint(cast(char) 0x40));
    assert(!isValidCodepoint(cast(char) 0x80));
}
/++
    Calculate the length of the UTF sequence starting at `index`
    in `str`.

    Params:
        str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        of UTF code units. Must be random access if `index` is passed
        index = starting index of UTF sequence (default: `0`)

    Returns:
        The number of code units in the UTF sequence. For UTF-8, this is a
        value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.

    Throws:
        May throw a `UTFException` if `str[index]` is not the start of a
        valid UTF sequence.

    Note:
        `stride` will only analyze the first `str[index]` element. It
        will not fully verify the validity of the UTF sequence, nor even verify
        the presence of the sequence: it will not actually guarantee that
        $(D index + stride(str, index) <= str.length).
  +/
uint stride(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    // The bounds check is only possible when the range exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-8 sequence");

    immutable lead = str[index];

    // ASCII is a single unit; any other lead byte is decoded by strideImpl.
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}
/// Ditto
uint stride(S)(auto ref S str)
if (is(S : const char[]) ||
    (isInputRange!S && is(immutable ElementType!S == immutable char)))
{
    // Arrays are indexed directly; other ranges are read through `front`.
    static if (is(S : const char[]))
        immutable lead = str[0];
    else
        immutable lead = str.front;

    // ASCII is a single unit; any other lead byte is decoded by strideImpl.
    return lead < 0x80 ? 1 : strideImpl(lead, 0);
}
// Checks `stride` on UTF-8 input against `codeLength!char` for arrays and for
// the test helper ranges (RandomCU, RefRandomCU, InputCU, RefBidirCU).
413 @system unittest
415 import core.exception : AssertError;
416 import std.conv : to;
417 import std.exception;
418 import std.string : format;
419 import std.traits : FunctionAttribute, functionAttributes, isSafe;
// Helper: `c` is the code point expected to start at code unit index `i` of `s`.
420 static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
422 enforce(stride(s, i) == codeLength!char(c),
423 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
425 enforce(stride(RandomCU!char(s), i) == codeLength!char(c),
426 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
// Reference-type range: its length must be unchanged after the call.
428 auto refRandom = new RefRandomCU!char(s);
429 immutable randLen = refRandom.length;
430 enforce(stride(refRandom, i) == codeLength!char(c),
431 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
432 enforce(refRandom.length == randLen,
433 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
// The index-less overloads are only comparable when starting at the front.
435 if (i == 0)
437 enforce(stride(s) == codeLength!char(c),
438 new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
440 enforce(stride(InputCU!char(s)) == codeLength!char(c),
441 new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
443 auto refBidir = new RefBidirCU!char(s);
444 immutable bidirLen = refBidir.length;
445 enforce(stride(refBidir) == codeLength!char(c),
446 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
447 enforce(refBidir.length == bidirLen,
448 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
// Indices below are UTF-8 code unit offsets.
452 assertCTFEable!(
454 test("a", 'a');
455 test(" ", ' ');
456 test("\u2029", '\u2029'); //paraSep
457 test("\u0100", '\u0100');
458 test("\u0430", '\u0430');
459 test("\U00010143", '\U00010143');
460 test("abcdefcdef", 'a');
461 test("hello\U00010143\u0100\U00010143", 'h', 0);
462 test("hello\U00010143\u0100\U00010143", 'e', 1);
463 test("hello\U00010143\u0100\U00010143", 'l', 2);
464 test("hello\U00010143\u0100\U00010143", 'l', 3);
465 test("hello\U00010143\u0100\U00010143", 'o', 4);
466 test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
467 test("hello\U00010143\u0100\U00010143", '\u0100', 9);
468 test("hello\U00010143\u0100\U00010143", '\U00010143', 11);
// Both overloads must be inferred @safe and pure for every char array flavour.
470 foreach (S; AliasSeq!(char[], const char[], string))
472 enum str = to!S("hello world");
473 static assert(isSafe!({ stride(str, 0); }));
474 static assert(isSafe!({ stride(str); }));
475 static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
476 static assert((functionAttributes!({ stride(str); }) & FunctionAttribute.pure_) != 0);
@safe unittest // invalid start bytes
{
    import std.exception : assertThrown;

    // Lead bytes that cannot begin a UTF-8 sequence of 1 to 4 units.
    immutable char[] badLeads = [
        0xF8, // 1111_1000: indicating a sequence length of 5
        0xFC, // 1111_1100: 6
        0xFE, // 1111_1110: 7
        0xFF, // 1111_1111: 8
        0x80, // 1000_0000: continuation byte
    ];

    foreach (lead; badLeads)
        assertThrown!UTFException(stride([lead]));
}
/// Ditto
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // The bounds check is only possible when the range exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    immutable uint unit = str[index];

    // A high (leading) surrogate announces a two-unit sequence.
    immutable bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return startsPair ? 2 : 1;
}
/// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // Arrays simply forward to the indexed overload at position zero.
    immutable size_t start = 0;
    return stride(str, start);
}
/// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
    !is(S : const wchar[]))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    immutable uint unit = str.front;

    // A high (leading) surrogate announces a two-unit sequence.
    immutable bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return startsPair ? 2 : 1;
}
// Checks `stride` on UTF-16 input against `codeLength!wchar` for arrays and for
// the test helper ranges (RandomCU, RefRandomCU, InputCU, RefBidirCU).
523 @system unittest
525 import core.exception : AssertError;
526 import std.conv : to;
527 import std.exception;
528 import std.string : format;
529 import std.traits : FunctionAttribute, functionAttributes, isSafe;
// Helper: `c` is the code point expected to start at code unit index `i` of `s`.
530 static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
532 enforce(stride(s, i) == codeLength!wchar(c),
533 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
535 enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c),
536 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
// Reference-type range: its length must be unchanged after the call.
538 auto refRandom = new RefRandomCU!wchar(s);
539 immutable randLen = refRandom.length;
540 enforce(stride(refRandom, i) == codeLength!wchar(c),
541 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
542 enforce(refRandom.length == randLen,
543 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
// The index-less overloads are only comparable when starting at the front.
545 if (i == 0)
547 enforce(stride(s) == codeLength!wchar(c),
548 new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
550 enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c),
551 new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
553 auto refBidir = new RefBidirCU!wchar(s);
554 immutable bidirLen = refBidir.length;
555 enforce(stride(refBidir) == codeLength!wchar(c),
556 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
557 enforce(refBidir.length == bidirLen,
558 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
// Indices below are UTF-16 code unit offsets.
562 assertCTFEable!(
564 test("a", 'a');
565 test(" ", ' ');
566 test("\u2029", '\u2029'); //paraSep
567 test("\u0100", '\u0100');
568 test("\u0430", '\u0430');
569 test("\U00010143", '\U00010143');
570 test("abcdefcdef", 'a');
571 test("hello\U00010143\u0100\U00010143", 'h', 0);
572 test("hello\U00010143\u0100\U00010143", 'e', 1);
573 test("hello\U00010143\u0100\U00010143", 'l', 2);
574 test("hello\U00010143\u0100\U00010143", 'l', 3);
575 test("hello\U00010143\u0100\U00010143", 'o', 4);
576 test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
577 test("hello\U00010143\u0100\U00010143", '\u0100', 7);
578 test("hello\U00010143\u0100\U00010143", '\U00010143', 8);
// Both overloads must be inferred @safe and pure for every wchar array flavour.
580 foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
582 enum str = to!S("hello world");
583 static assert(isSafe!(() => stride(str, 0)));
584 static assert(isSafe!(() => stride(str) ));
585 static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
586 static assert((functionAttributes!(() => stride(str) ) & FunctionAttribute.pure_) != 0);
/// Ditto
uint stride(S)(auto ref S str, size_t index = 0)
if (is(S : const dchar[]) ||
    (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    // Pick the sanity check that the range type can support.
    enum bool hasKnownLength = is(typeof(str.length) : ulong);

    static if (hasKnownLength)
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    else
        assert(!str.empty, "UTF-32 sequence is empty.");

    // Every UTF-32 code unit is a complete code point.
    return 1;
}
@safe unittest
{
    // One, two and four unit UTF-8 sequences at the front of a string.
    assert(stride("a") == 1);
    assert(stride("λ") == 2);
    assert(stride("𐐷") == 4);

    // Mixed string: the ASCII letter first, then the two-unit letter at index 1.
    assert(stride("aλ") == 1);
    assert(stride("aλ", 1) == 2);
}
// Checks `stride` on UTF-32 input against `codeLength!dchar` for arrays and for
// the test helper ranges (RandomCU, RefRandomCU, InputCU, RefBidirCU).
613 @system unittest
615 import core.exception : AssertError;
616 import std.conv : to;
617 import std.exception;
618 import std.string : format;
619 import std.traits : FunctionAttribute, functionAttributes, isSafe;
// Helper: `c` is the code point expected at code unit index `i` of `s`.
620 static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
622 enforce(stride(s, i) == codeLength!dchar(c),
623 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
625 enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c),
626 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
// Reference-type range: its length must be unchanged after the call.
628 auto refRandom = new RefRandomCU!dchar(s);
629 immutable randLen = refRandom.length;
630 enforce(stride(refRandom, i) == codeLength!dchar(c),
631 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
632 enforce(refRandom.length == randLen,
633 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
// The index-less calls are only comparable when starting at the front.
635 if (i == 0)
637 enforce(stride(s) == codeLength!dchar(c),
638 new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
640 enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c),
641 new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
643 auto refBidir = new RefBidirCU!dchar(s);
644 immutable bidirLen = refBidir.length;
645 enforce(stride(refBidir) == codeLength!dchar(c),
646 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
647 enforce(refBidir.length == bidirLen,
648 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
// Indices below are UTF-32 code unit offsets.
652 assertCTFEable!(
654 test("a", 'a');
655 test(" ", ' ');
656 test("\u2029", '\u2029'); //paraSep
657 test("\u0100", '\u0100');
658 test("\u0430", '\u0430');
659 test("\U00010143", '\U00010143');
660 test("abcdefcdef", 'a');
661 test("hello\U00010143\u0100\U00010143", 'h', 0);
662 test("hello\U00010143\u0100\U00010143", 'e', 1);
663 test("hello\U00010143\u0100\U00010143", 'l', 2);
664 test("hello\U00010143\u0100\U00010143", 'l', 3);
665 test("hello\U00010143\u0100\U00010143", 'o', 4);
666 test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
667 test("hello\U00010143\u0100\U00010143", '\u0100', 6);
668 test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
// The calls must be inferred @safe and pure for every dchar array flavour.
670 foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
672 enum str = to!S("hello world");
673 static assert(isSafe!(() => stride(str, 0)));
674 static assert(isSafe!(() => stride(str) ));
675 static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
676 static assert((functionAttributes!(() => stride(str) ) & FunctionAttribute.pure_) != 0);
// Computes the UTF-8 sequence length announced by the non-ASCII lead byte `c`.
// `index` is only used for the error report. Throws UTFException when the byte
// does not announce a sequence of 2, 3 or 4 code units.
private uint strideImpl(char c, size_t index) @trusted pure
in { assert(c & 0x80); }
do
{
    import core.bitop : bsr;

    // Number of leading one bits: 7 minus the position of the highest zero bit.
    immutable leadingOnes = 7 - bsr((~uint(c)) & 0xFF);

    // 0xFF has no zero bit at all, so its bsr result must not be trusted.
    immutable bool acceptable = c != 0xFF && leadingOnes >= 2 && leadingOnes <= 4;
    if (!acceptable)
        throw new UTFException("Invalid UTF-8 sequence", index);

    return leadingOnes;
}
/++
    Calculate the length of the UTF sequence ending one code unit before
    `index` in `str`.

    Params:
        str = bidirectional range of UTF code units. Must be random access if
        `index` is passed
        index = index one past end of UTF sequence (default: `str.length`)

    Returns:
        The number of code units in the UTF sequence. For UTF-8, this is a
        value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.

    Throws:
        May throw a `UTFException` if `str[index]` is not one past the
        end of a valid UTF sequence.

    Note:
        `strideBack` will only analyze the element at $(D str[index - 1]).
        It will not fully verify the validity of the UTF sequence, nor
        even verify the presence of the sequence: it will not actually
        guarantee that $(D strideBack(str, index) <= index).
  +/
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    // A continuation byte has the bit pattern 10xx_xxxx.
    enum topTwoBits = 0b1100_0000;
    enum continuation = 0b1000_0000;

    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-8 sequence");
    assert(index > 0, "Not the end of the UTF-8 sequence");

    if ((str[index-1] & topTwoBits) != continuation)
        return 1;

    if (index < 4)
    {
        // Near the start: each step back must be checked against the index.
        static foreach (back; 2 .. 4)
        {
            if (index >= back && (str[index-back] & topTwoBits) != continuation)
                return back;
        }
    }
    else
    {
        //single verification for most common case
        static foreach (back; 2 .. 5)
        {
            if ((str[index-back] & topTwoBits) != continuation)
                return back;
        }
    }
    throw new UTFException("Not the end of the UTF sequence", index);
}
/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char)))
{
    // Measure the sequence that ends at the very end of `str`.
    immutable onePastEnd = str.length;
    return strideBack(str, onePastEnd);
}
/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S)
{
    assert(!str.empty, "Past the end of the UTF-8 sequence");

    // Walk a saved copy backwards so the caller's range is left untouched.
    auto cursor = str.save;
    for (uint units = 1; units <= 4; ++units)
    {
        // The first non-continuation byte found is the start of the sequence.
        if ((cursor.back & 0b1100_0000) != 0b1000_0000)
            return units;
        cursor.popBack();
        if (cursor.empty)
            break;
    }
    throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
}
// Checks `strideBack` on UTF-8 input against `codeLength!char` for arrays and
// for the test helper ranges (RandomCU, RefRandomCU, BidirCU, RefBidirCU).
771 @system unittest
773 import core.exception : AssertError;
774 import std.conv : to;
775 import std.exception;
776 import std.string : format;
777 import std.traits : FunctionAttribute, functionAttributes, isSafe;
// Helper: `c` is the code point expected to end just before index `i`;
// `size_t.max` stands for "end of the string".
778 static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
780 enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!char(c),
781 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
783 enforce(strideBack(RandomCU!char(s), i == size_t.max ? s.length : i) == codeLength!char(c),
784 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
// Reference-type range: its length must be unchanged after the call.
786 auto refRandom = new RefRandomCU!char(s);
787 immutable randLen = refRandom.length;
788 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!char(c),
789 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
790 enforce(refRandom.length == randLen,
791 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
// The index-less overloads are only comparable when measuring from the end.
793 if (i == size_t.max)
795 enforce(strideBack(s) == codeLength!char(c),
796 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));
798 enforce(strideBack(BidirCU!char(s)) == codeLength!char(c),
799 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));
801 auto refBidir = new RefBidirCU!char(s);
802 immutable bidirLen = refBidir.length;
803 enforce(strideBack(refBidir) == codeLength!char(c),
804 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
805 enforce(refBidir.length == bidirLen,
806 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
// Indices below are UTF-8 code unit offsets, one past the end of each sequence.
810 assertCTFEable!(
812 test("a", 'a');
813 test(" ", ' ');
814 test("\u2029", '\u2029'); //paraSep
815 test("\u0100", '\u0100');
816 test("\u0430", '\u0430');
817 test("\U00010143", '\U00010143');
818 test("abcdefcdef", 'f');
819 test("\U00010143\u0100\U00010143hello", 'o', 15);
820 test("\U00010143\u0100\U00010143hello", 'l', 14);
821 test("\U00010143\u0100\U00010143hello", 'l', 13);
822 test("\U00010143\u0100\U00010143hello", 'e', 12);
823 test("\U00010143\u0100\U00010143hello", 'h', 11);
824 test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
825 test("\U00010143\u0100\U00010143hello", '\u0100', 6);
826 test("\U00010143\u0100\U00010143hello", '\U00010143', 4);
// Both overloads must be inferred @safe and pure for every char array flavour.
828 foreach (S; AliasSeq!(char[], const char[], string))
830 enum str = to!S("hello world");
831 static assert(isSafe!({ strideBack(str, 0); }));
832 static assert(isSafe!({ strideBack(str); }));
833 static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
834 static assert((functionAttributes!({ strideBack(str); }) & FunctionAttribute.pure_) != 0);
//UTF-16 is self synchronizing: The length of strideBack can be found from
//the value of a single wchar
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // The upper bound check is only possible when the range exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    immutable lastUnit = str[index-1];

    // A low (trailing) surrogate, 0xDC00 .. 0xDFFF, closes a two-unit sequence.
    immutable bool endsPair = lastUnit >= 0xDC00 && lastUnit <= 0xDFFF;
    return endsPair ? 2 : 1;
}
/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Arrays are indexed directly; other ranges are read through `back`.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // Low (trailing) surrogates occupy 0xDC00 .. 0xDFFF. The upper bound is
    // exclusive: 0xE000 is an ordinary BMP code point with a stride of 1,
    // which also matches the indexed overload.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}
// Checks `strideBack` on UTF-16 input against `codeLength!wchar` for arrays and
// for the test helper ranges (RandomCU, RefRandomCU, BidirCU, RefBidirCU).
869 @system unittest
871 import core.exception : AssertError;
872 import std.conv : to;
873 import std.exception;
874 import std.string : format;
875 import std.traits : FunctionAttribute, functionAttributes, isSafe;
// Helper: `c` is the code point expected to end just before index `i`;
// `size_t.max` stands for "end of the string".
876 static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
878 enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!wchar(c),
879 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
881 enforce(strideBack(RandomCU!wchar(s), i == size_t.max ? s.length : i) == codeLength!wchar(c),
882 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
// Reference-type range: its length must be unchanged after the call.
884 auto refRandom = new RefRandomCU!wchar(s);
885 immutable randLen = refRandom.length;
886 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!wchar(c),
887 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
888 enforce(refRandom.length == randLen,
889 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
// The index-less overloads are only comparable when measuring from the end.
891 if (i == size_t.max)
893 enforce(strideBack(s) == codeLength!wchar(c),
894 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));
896 enforce(strideBack(BidirCU!wchar(s)) == codeLength!wchar(c),
897 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));
899 auto refBidir = new RefBidirCU!wchar(s);
900 immutable bidirLen = refBidir.length;
901 enforce(strideBack(refBidir) == codeLength!wchar(c),
902 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
903 enforce(refBidir.length == bidirLen,
904 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
// Indices below are UTF-16 code unit offsets, one past the end of each sequence.
908 assertCTFEable!(
910 test("a", 'a');
911 test(" ", ' ');
912 test("\u2029", '\u2029'); //paraSep
913 test("\u0100", '\u0100');
914 test("\u0430", '\u0430');
915 test("\U00010143", '\U00010143');
916 test("abcdefcdef", 'f');
917 test("\U00010143\u0100\U00010143hello", 'o', 10);
918 test("\U00010143\u0100\U00010143hello", 'l', 9);
919 test("\U00010143\u0100\U00010143hello", 'l', 8);
920 test("\U00010143\u0100\U00010143hello", 'e', 7);
921 test("\U00010143\u0100\U00010143hello", 'h', 6);
922 test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
923 test("\U00010143\u0100\U00010143hello", '\u0100', 3);
924 test("\U00010143\u0100\U00010143hello", '\U00010143', 2);
// Both overloads must be inferred @safe and pure for every wchar array flavour.
926 foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
928 enum str = to!S("hello world");
929 static assert(isSafe!(() => strideBack(str, 0)));
930 static assert(isSafe!(() => strideBack(str) ));
931 static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
932 static assert((functionAttributes!(() => strideBack(str) ) & FunctionAttribute.pure_) != 0);
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // The upper bound check is only possible when the range exposes a length.
    enum bool hasKnownLength = is(typeof(str.length) : ulong);

    static if (hasKnownLength)
        assert(index <= str.length, "Past the end of the UTF-32 sequence");
    assert(index > 0, "Not the end of the UTF-32 sequence");

    // Every UTF-32 code unit is a complete code point.
    return 1;
}
/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // Only emptiness needs checking: a UTF-32 sequence is always one unit long.
    assert(!str.empty, "Empty UTF-32 sequence");

    enum uint singleUnit = 1;
    return singleUnit;
}
@safe unittest
{
    // One, two and four unit UTF-8 sequences at the end of a string.
    assert(strideBack("a") == 1);
    assert(strideBack("λ") == 2);
    assert(strideBack("𐐷") == 4);

    // Mixed string: the two-unit letter ends the string, the ASCII letter ends at index 1.
    assert(strideBack("aλ") == 2);
    assert(strideBack("aλ", 1) == 1);
}
// Checks `strideBack` on UTF-32 input against `codeLength!dchar` for arrays and
// for the test helper ranges (RandomCU, RefRandomCU, BidirCU, RefBidirCU).
965 @system unittest
967 import core.exception : AssertError;
968 import std.conv : to;
969 import std.exception;
970 import std.string : format;
971 import std.traits : FunctionAttribute, functionAttributes, isSafe;
// Helper: `c` is the code point expected to end just before index `i`;
// `size_t.max` stands for "end of the string".
972 static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
974 enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!dchar(c),
975 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
977 enforce(strideBack(RandomCU!dchar(s), i == size_t.max ? s.length : i) == codeLength!dchar(c),
978 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
// Reference-type range: its length must be unchanged after the call.
980 auto refRandom = new RefRandomCU!dchar(s);
981 immutable randLen = refRandom.length;
982 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!dchar(c),
983 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
984 enforce(refRandom.length == randLen,
985 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
// The index-less overloads are only comparable when measuring from the end.
987 if (i == size_t.max)
989 enforce(strideBack(s) == codeLength!dchar(c),
990 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));
992 enforce(strideBack(BidirCU!dchar(s)) == codeLength!dchar(c),
993 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));
995 auto refBidir = new RefBidirCU!dchar(s);
996 immutable bidirLen = refBidir.length;
997 enforce(strideBack(refBidir) == codeLength!dchar(c),
998 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
999 enforce(refBidir.length == bidirLen,
1000 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
// Indices below are UTF-32 code unit offsets, one past the end of each sequence.
1004 assertCTFEable!(
1006 test("a", 'a');
1007 test(" ", ' ');
1008 test("\u2029", '\u2029'); //paraSep
1009 test("\u0100", '\u0100');
1010 test("\u0430", '\u0430');
1011 test("\U00010143", '\U00010143');
1012 test("abcdefcdef", 'f');
1013 test("\U00010143\u0100\U00010143hello", 'o', 8);
1014 test("\U00010143\u0100\U00010143hello", 'l', 7);
1015 test("\U00010143\u0100\U00010143hello", 'l', 6);
1016 test("\U00010143\u0100\U00010143hello", 'e', 5);
1017 test("\U00010143\u0100\U00010143hello", 'h', 4);
1018 test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
1019 test("\U00010143\u0100\U00010143hello", '\u0100', 2);
1020 test("\U00010143\u0100\U00010143hello", '\U00010143', 1);
// Both overloads must be inferred @safe and pure for every dchar array flavour.
1022 foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
1024 enum str = to!S("hello world");
1025 static assert(isSafe!(() => strideBack(str, 0)));
1026 static assert(isSafe!(() => strideBack(str) ));
1027 static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
1028 static assert((functionAttributes!(() => strideBack(str) ) & FunctionAttribute.pure_) != 0);
/++
    Converts a code unit index into a code point (UCS) index.

    `index` must refer to the first code unit of a code point in `str`. The
    result is the number of code points that precede that position, i.e. how
    many code points into the string the code point starting at `index` is.

    Params:
        str = the string to examine
        index = code unit index located at the start of a code point

    Returns:
        The number of code points encoded by `str[0 .. index]`. For UTF-32
        input this is `index` itself.

    Throws:
        $(LREF UTFException) if walking the code points from the start of
        `str` steps past `index`, i.e. `index` is not on a code point boundary.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // One code unit per code point: both indices coincide.
        return index;
    }
    else
    {
        size_t codePoints = 0;
        size_t unitPos = 0;

        while (unitPos < index)
        {
            unitPos += stride(str, unitPos);
            ++codePoints;
        }

        // Overshooting means `index` pointed into the middle of a sequence.
        if (unitPos > index)
        {
            static if (is(immutable C == immutable char))
                throw new UTFException("Invalid UTF-8 sequence", index);
            else
                throw new UTFException("Invalid UTF-16 sequence", index);
        }

        return codePoints;
    }
}
///
@safe unittest
{
    // ASCII only: every encoding uses one code unit per code point.
    assert(toUCSindex(`hello world`, 7) == 7);
    assert(toUCSindex(`hello world`w, 7) == 7);
    assert(toUCSindex(`hello world`d, 7) == 7);

    // 'é' takes two UTF-8 code units but a single UTF-16/UTF-32 code unit.
    assert(toUCSindex(`Ma Chérie`, 7) == 6);
    assert(toUCSindex(`Ma Chérie`w, 7) == 7);
    assert(toUCSindex(`Ma Chérie`d, 7) == 7);

    // Each of the leading kana takes three UTF-8 code units.
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}
/++
    Converts a code point (UCS) index into a code unit (UTF) index.

    Params:
        str = the string to examine
        n = how many code points into `str` the wanted code point is

    Returns:
        The array index of the first code unit of the `n`-th code point. For
        UTF-32 input this is `n` itself.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        return n;
    }
    else
    {
        size_t unitPos;

        // Step over `n` code points, accumulating their widths in code units.
        foreach (_; 0 .. n)
            unitPos += stride(str, unitPos);

        return unitPos;
    }
}
///
@safe unittest
{
    // ASCII only: code point index and code unit index are equal.
    assert(toUTFindex(`hello world`, 7) == 7);
    assert(toUTFindex(`hello world`w, 7) == 7);
    assert(toUTFindex(`hello world`d, 7) == 7);

    // 'é' occupies two UTF-8 code units, shifting later UTF-8 indices by one.
    assert(toUTFindex(`Ma Chérie`, 6) == 7);
    assert(toUTFindex(`Ma Chérie`w, 7) == 7);
    assert(toUTFindex(`Ma Chérie`d, 7) == 7);

    // Three kana occupy nine UTF-8 code units.
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9);
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}
/* =================== Decode ======================= */

/// Flag selecting whether invalid UTF is replaced with $(LREF replacementDchar)
/// rather than reported by throwing.
alias UseReplacementDchar = Flag!"useReplacementDchar";
/++
    Decodes the code point that begins at `str[index]` and returns it.
    On success `index` is moved to one past the decoded code point. If the
    code point is not well-formed, a `UTFException` is thrown and `index`
    is left unchanged.

    `decode` only works with strings and with random access ranges of code
    units that have length and slicing, whereas $(LREF decodeFront) works with
    any input range of code units.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable range
        index = starting index into `str`; incremented by the number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str[index]` is not the start of a valid UTF
        sequence and useReplacementDchar is `No.useReplacementDchar`
  +/
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index)
if (!isSomeString!S &&
    isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S))
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Anything at or above the limit needs the full decoder.
    if (!(str[index] < codeUnitLimit!S))
        return decodeImpl!(true, useReplacementDchar)(str, index);

    // Fast path: the code unit is a complete code point on its own.
    return str[index++];
}
/// ditto
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index) @trusted pure
if (isSomeString!S)
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[index] < codeUnitLimit!S)
    {
        // Fast path: a single code unit that is a whole code point.
        return str[index++];
    }
    else static if (is(immutable S == immutable C[], C))
    {
        // Strip qualifiers down to const(C)[] before handing over to the decoder.
        return decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index);
    }
}
///
@safe pure unittest
{
    size_t pos;

    // ASCII: one code unit.
    assert("a".decode(pos) == 'a');
    assert(pos == 1);

    // 'å' is two UTF-8 code units ...
    pos = 0;
    assert("å".decode(pos) == 'å');
    assert(pos == 2);

    // ... also when decoding starts in the middle of the string ...
    pos = 1;
    assert("aå".decode(pos) == 'å');
    assert(pos == 3);

    // ... but a single UTF-16 code unit.
    pos = 0;
    assert("å"w.decode(pos) == 'å');
    assert(pos == 1);

    // ë as a multi-code point grapheme
    pos = 0;
    assert("e\u0308".decode(pos) == 'e');
    assert(pos == 1);

    // ë as a single code point grapheme
    pos = 0;
    assert("ë".decode(pos) == 'ë');
    assert(pos == 2);

    pos = 0;
    assert("ë"w.decode(pos) == 'ë');
    assert(pos == 1);
}
/++
    `decodeFront` is a variant of $(LREF decode) which decodes the first
    code point of its argument. Unlike $(LREF decode), `decodeFront` accepts
    any $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
    of code units (rather than just a string or random access range). The
    range is taken by `ref` and the decoded code units are removed from its
    front. When `numCodeUnits` is supplied, it receives the number of code
    units that made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or input range of code units
        numCodeUnits = set to number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str.front` is not the start of a valid UTF
        sequence. If an exception is thrown, then there is no guarantee as to
        the number of code units which were popped off, as it depends on the
        type of range being used and how many code units had to be popped off
        before the code point was determined to be invalid.
  +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable first = str.front;

    // Fast path: the first code unit already is a complete code point.
    if (first < codeUnitLimit!S)
    {
        str.popFront();
        numCodeUnits = 1;
        return first;
    }

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
    // done outside of decodeImpl, which is undesirable, since not all
    // overloads of decodeImpl need it. So, it should be moved back into
    // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521
    // has been fixed.
    enum canIndex = is(S : const char[]) || isRandomAccessRange!S && hasSlicing!S && hasLength!S;
    immutable decoded = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits);

    // Sliceable ranges are only indexed by decodeImpl, so drop the consumed
    // units here; the other range types were already popped by decodeImpl.
    static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        str = str[numCodeUnits .. str.length];

    return decoded;
}
/// ditto
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits) @trusted pure
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[0] < codeUnitLimit!S)
    {
        // Fast path: a single code unit that is a whole code point.
        numCodeUnits = 1;
        immutable first = str[0];
        str = str[1 .. $];
        return first;
    }
    else static if (is(immutable S == immutable C[], C))
    {
        // decodeImpl reports the sequence length through numCodeUnits;
        // the slice below then drops exactly those units.
        immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, numCodeUnits);
        str = str[numCodeUnits .. $];
        return decoded;
    }
}
/++ Ditto +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // Convenience form: the number of consumed code units is discarded.
    size_t consumed;
    return decodeFront!useReplacementDchar(str, consumed);
}
///
@safe pure unittest
{
    import std.range.primitives;

    string str = "Hello, World!";

    // The decoded code point is removed from the front of the string.
    assert(str.decodeFront == 'H');
    assert(str == "ello, World!");

    str = "å";
    assert(str.decodeFront == 'å');
    assert(str.empty);

    // The two-argument form also reports how many code units were consumed.
    str = "å";
    size_t consumed;
    assert(str.decodeFront(consumed) == 'å');
    assert(consumed == 2);
    assert(str.empty);
}
/++
    `decodeBack` is a variant of $(LREF decode) which decodes the last code
    point of its argument. Unlike $(LREF decode), `decodeBack` accepts any
    bidirectional range of code units (rather than just a string or random
    access range). The range is taken by `ref` and the decoded code units are
    removed from its back. When `numCodeUnits` is supplied, it receives the
    number of code units that made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
        str = input string or bidirectional Range
        numCodeUnits = gives the number of code units processed

    Returns:
        A decoded UTF character.

    Throws:
        $(LREF UTFException) if `str.back` is not the end of a valid UTF
        sequence. If an exception is thrown, the `str` itself remains unchanged,
        but there is no guarantee as to the value of `numCodeUnits` (when passed).
  +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[$ - 1] < codeUnitLimit!S)
    {
        // Fast path: the last code unit is a whole code point.
        numCodeUnits = 1;
        immutable last = str[$ - 1];
        str = str[0 .. $ - 1];
        return last;
    }
    else static if (is(immutable S == immutable C[], C))
    {
        // Locate the start of the trailing sequence, then decode forwards from it.
        numCodeUnits = strideBack(str);
        immutable keep = str.length - numCodeUnits;
        size_t start = keep;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, start);
        str = str[0 .. keep];
        return decoded;
    }
}
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S
    && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Fast path: the last code unit is a whole code point.
    if (str.back < codeUnitLimit!S)
    {
        numCodeUnits = 1;
        immutable last = str.back;
        str.popBack();
        return last;
    }

    numCodeUnits = strideBack(str);

    static if (isRandomAccessRange!S)
    {
        // Decode in place, starting at the first unit of the trailing sequence.
        size_t start = str.length - numCodeUnits;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(str, start);
        str.popBackExactly(numCodeUnits);
        return decoded;
    }
    else
    {
        // No indexing available: copy the trailing sequence, in order, into a
        // small buffer using a saved copy of the range, and decode the buffer.
        alias Char = Unqual!(ElementType!S);
        Char[4] codeUnits;
        S rest = str.save;
        foreach_reverse (slot; 0 .. numCodeUnits)
        {
            codeUnits[slot] = rest.back;
            rest.popBack();
        }
        const Char[] codePoint = codeUnits[0 .. numCodeUnits];
        size_t start = 0;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(codePoint, start);

        // Only commit the shortened range once decoding has succeeded.
        str = rest;
        return decoded;
    }
}
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isSomeString!S
    || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S))
    || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S)))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Convenience form: the number of consumed code units is discarded.
    size_t consumed;
    return decodeBack!useReplacementDchar(str, consumed);
}
///
@system pure unittest
{
    import std.range.primitives;

    string str = "Hello, World!";

    // The decoded code point is removed from the back of the string.
    assert(str.decodeBack == '!');
    assert(str == "Hello, World");

    str = "å";
    assert(str.decodeBack == 'å');
    assert(str.empty);

    // The two-argument form also reports how many code units were consumed.
    str = "å";
    size_t consumed;
    assert(str.decodeBack(consumed) == 'å');
    assert(consumed == 2);
    assert(str.empty);
}
// Exclusive upper bound on the code units of range type `S` that can be
// returned directly: any code unit below this value is guaranteed to be a
// valid code point encoded in a single code unit.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    private alias Unit = immutable ElementEncodingType!S;

    static if (is(Unit == immutable char))
        enum char codeUnitLimit = 0x80;
    else static if (is(Unit == immutable wchar))
        enum wchar codeUnitLimit = 0xD800;
    else
        enum dchar codeUnitLimit = 0xD800;
}
/*
 * Decodes one multi-unit UTF-8 code point. Callers are expected to have
 * handled single-unit (ASCII) code points already.
 *
 * For strings, this function does its own bounds checking to give a
 * more useful error message when attempting to decode past the end of a string.
 * Subsequently it uses a pointer instead of an array to avoid
 * redundant bounds checking.
 *
 * The three overloads of this operate on chars, wchars, and dchars.
 *
 * Params:
 *      canIndex = if S is indexable
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into `str`; incremented by number of code units processed
 *
 * Returns:
 *      decoded character
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (
    is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char)))
{
    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /* Dchar bitmask for different numbers of UTF-8 code units.
     */
    alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    static if (is(S : const char[]))
        auto pstr = str.ptr + index;    // this is what makes decodeImpl() @system code
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    // Fetch the lead byte; non-indexable ranges are consumed as we go.
    static if (canIndex)
    {
        immutable avail = str.length - index;
        ubyte lead = pstr[0];
    }
    else
    {
        ubyte lead = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        static if (canIndex)
        {
            // Builds an exception carrying the offending bytes: the first
            // byte plus up to three following continuation bytes.
            static UTFException exception(T)(T bytes, string msg)
            {
                uint[4] sequence = void;
                size_t n;

                do
                {
                    sequence[n] = bytes[n];
                } while (++n < bytes.length && n < 4 && (bytes[n] & 0xC0) == 0x80);

                return new UTFException(msg, n).setSequence(sequence[0 .. n]);
            }
        }

        UTFException invalidUTF()
        {
            static if (canIndex)
               return exception(pstr[0 .. avail], "Invalid UTF-8 sequence");
            else
            {
                //We can't include the invalid sequence with input strings without
                //saving each of the code units along the way, and we can't do it with
                //forward ranges without saving the entire range. Both would incur a
                //cost for the decoding of every character just to provide a better
                //error message for the (hopefully) rare case when an invalid UTF-8
                //sequence is encountered, so we don't bother trying to include the
                //invalid sequence here, unlike with strings and sliceable ranges.
                return new UTFException("Invalid UTF-8 sequence");
            }
        }

        UTFException outOfBounds()
        {
            static if (canIndex)
               return exception(pstr[0 .. avail], "Attempted to decode past the end of a string");
            else
               return new UTFException("Attempted to decode past the end of a string");
        }
    }

    if ((lead & 0b1100_0000) != 0b1100_0000)
    {
        static if (useReplacementDchar)
        {
            ++index;            // always consume bad input to avoid infinite loops
            return replacementDchar;
        }
        else
            throw invalidUTF(); // starter must have at least 2 first bits set
    }

    ubyte trail = void;
    dchar code = lead; // upper control bits are masked out later
    lead <<= 1;        // from here on bit 7 of `lead` says whether more bytes follow

    foreach (i; AliasSeq!(1, 2, 3))
    {
        // Make sure a further byte exists, then read it.
        static if (canIndex)
        {
            if (i == avail)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }

            trail = pstr[i];
        }
        else
        {
            if (pstr.empty)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }

            trail = pstr.front;
            pstr.popFront();
        }

        // Every byte after the first must look like 10xxxxxx.
        if ((trail & 0xC0) != 0x80)
        {
            static if (useReplacementDchar)
            {
                index += i + 1;
                return replacementDchar;
            }
            else
                throw invalidUTF();
        }

        code = (code << 6) | (trail & 0x3F);
        lead <<= 1;

        if (!(lead & 0x80)) // no more bytes
        {
            code &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            if ((code & ~bitMask[i - 1]) == 0)
            {
                static if (useReplacementDchar)
                {
                    index += i + 1;
                    return replacementDchar;
                }
                else
                    throw invalidUTF();
            }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                if (!isValidDchar(code))
                {
                    static if (useReplacementDchar)
                    {
                        index += i + 1;
                        return replacementDchar;
                    }
                    else
                        throw invalidUTF();
                }
            }

            index += i + 1;

            // Only a 4-byte sequence can exceed the largest code point.
            static if (i == 3)
            {
                if (code > dchar.max)
                {
                    static if (useReplacementDchar)
                        code = replacementDchar;
                    else
                        throw invalidUTF();
                }
            }
            return code;
        }
    }

    // The lead byte announced more than four bytes (5/6 byte forms).
    static if (useReplacementDchar)
    {
        index += 4;             // read 4 chars by now
        return replacementDchar;
    }
    else
        throw invalidUTF();
}
@safe pure @nogc nothrow
unittest
{
    // Exercises the useReplacementDchar == Yes path with a plain input range.

    // Minimal non-indexable input range of UTF-8 code units.
    static struct R
    {
    @safe pure @nogc nothrow:
        this(string s) { this.s = s; }
        @property bool empty() { return idx == s.length; }
        @property char front() { return s[idx]; }
        void popFront() { ++idx; }
        size_t idx;
        string s;
    }

    foreach (bad; invalidUTFstrings!char())
    {
        auto input = R(bad);
        size_t consumed;
        dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);
        assert(dc == replacementDchar);
        // At least one code unit is consumed, never more than the input holds.
        assert(1 <= consumed && consumed <= bad.length);
    }
}
// UTF-16 overload of decodeImpl. Decodes the code point starting at `index`
// (indexable input) or at the front of `str` (other ranges) and advances
// `index` by the number of code units processed. The caller must already have
// handled code units below 0xD800.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar)))
{
    static if (is(S : const wchar[]))
        auto pstr = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    // Fetch the first code unit; non-indexable ranges are consumed as we go.
    static if (canIndex)
    {
        immutable avail = str.length - index;
        uint unit = pstr[0];
    }
    else
    {
        uint unit = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Attaches the offending code unit to the exception when it can be re-read.
        UTFException failure(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(pstr[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(unit >= 0xD800);

    if (unit <= 0xDBFF)
    {
        // High surrogate: a low surrogate has to follow.
        static if (canIndex)
            immutable atEnd = avail == 1;
        else
            immutable atEnd = pstr.empty;

        if (atEnd)
        {
            static if (useReplacementDchar)
            {
                ++index;
                return replacementDchar;
            }
            else
                throw failure("surrogate UTF-16 high value past end of string");
        }

        static if (canIndex)
            immutable uint low = pstr[1];
        else
        {
            immutable uint low = pstr.front;
            pstr.popFront();
        }

        if (low >= 0xDC00 && low <= 0xDFFF)
        {
            // Combine the pair into a single code point.
            unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
        }
        else
        {
            static if (useReplacementDchar)
                unit = replacementDchar;
            else
                throw failure("surrogate UTF-16 low value out of range");
        }

        // Accounts for the second code unit; the first is counted below.
        ++index;
    }
    else if (unit >= 0xDC00 && unit <= 0xDFFF)
    {
        // A low surrogate without a preceding high surrogate.
        static if (useReplacementDchar)
            unit = replacementDchar;
        else
            throw failure("unpaired surrogate UTF-16 value");
    }
    ++index;

    // Note: u+FFFE and u+FFFF are specifically permitted by the
    // Unicode standard for application internal use (see isValidDchar)

    return cast(dchar) unit;
}
@safe pure @nogc nothrow
unittest
{
    // Exercises the useReplacementDchar == Yes path with a plain input range.

    // Minimal non-indexable input range of UTF-16 code units.
    static struct R
    {
    @safe pure @nogc nothrow:
        this(wstring s) { this.s = s; }
        @property bool empty() { return idx == s.length; }
        @property wchar front() { return s[idx]; }
        void popFront() { ++idx; }
        size_t idx;
        wstring s;
    }

    foreach (bad; invalidUTFstrings!wchar())
    {
        auto input = R(bad);
        size_t consumed;
        dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);
        assert(dc == replacementDchar);
        // At least one code unit is consumed, never more than the input holds.
        assert(1 <= consumed && consumed <= bad.length);
    }
}
// UTF-32 overload of decodeImpl. Reads one code unit (at `index` for arrays
// and random access ranges, from the front otherwise), validates it and
// advances `index` by one.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    static if (is(S : const dchar[]))
        auto pstr = str.ptr;
    else
        alias pstr = str;

    enum indexed = is(S : const dchar[]) || isRandomAccessRange!S;

    static if (indexed)
        dchar unit = pstr[index];
    else
        dchar unit = pstr.front;

    if (!isValidDchar(unit))
    {
        static if (useReplacementDchar)
            unit = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(unit);
    }

    ++index;

    // Ranges that cannot be indexed are consumed instead.
    static if (!indexed)
        pstr.popFront();

    return unit;
}
@safe pure @nogc nothrow
unittest
{
    // Exercises the useReplacementDchar == Yes path with a plain input range.

    // Minimal non-indexable input range of UTF-32 code units.
    static struct R
    {
    @safe pure @nogc nothrow:
        this(dstring s) { this.s = s; }
        @property bool empty() { return idx == s.length; }
        @property dchar front() { return s[idx]; }
        void popFront() { ++idx; }
        size_t idx;
        dstring s;
    }

    foreach (bad; invalidUTFstrings!dchar())
    {
        auto input = R(bad);
        size_t consumed;
        dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);
        assert(dc == replacementDchar);
        // At least one code unit is consumed, never more than the input holds.
        assert(1 <= consumed && consumed <= bad.length);
    }
}
// Test helper: for random access ranges that are not narrow strings, checks
// that `decode(range, index)` yields `expectedChar`, leaves `index` at
// `expectedIndex` and does not change the length of `range`. Failures are
// reported as AssertErrors located at the caller's `line`. Other range types
// are accepted but not checked.
version (StdUnittest) private void testDecode(R)(R range,
                                             size_t index,
                                             dchar expectedChar,
                                             size_t expectedIndex,
                                             size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;
    import std.traits : isNarrowString;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R && !isNarrowString!R)
    {
        immutable result = decode(range, index);

        if (result != expectedChar)
            throw new AssertError(format("decode: Wrong character: %s", result), __FILE__, line);

        if (index != expectedIndex)
            throw new AssertError(format("decode: Wrong index: %s", index), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != lenBefore)
                throw new AssertError(format("decode: length changed: %s", range.length), __FILE__, line);
        }
    }
}
// Test helper: checks that `decodeFront(range, n)` yields `expectedChar`,
// sets `n` to `expectedNumCodeUnits` and, for ranges with length, shortens
// `range` by exactly that many code units. Failures are reported as
// AssertErrors located at the caller's `line`.
version (StdUnittest) private void testDecodeFront(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    size_t numCodeUnits;
    immutable result = decodeFront(range, numCodeUnits);

    if (result != expectedChar)
        throw new AssertError(format("decodeFront: Wrong character: %s", result), __FILE__, line);

    if (numCodeUnits != expectedNumCodeUnits)
        throw new AssertError(format("decodeFront: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line);

    static if (hasLength!R)
    {
        if (range.length != lenBefore - numCodeUnits)
            throw new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line);
    }
}
// Test helper: checks that `decodeBack(range, n)` yields `expectedChar`,
// sets `n` to `expectedNumCodeUnits` and, for ranges with length, shortens
// `range` by exactly that many code units. Failures are reported as
// AssertErrors located at the caller's `line`.
version (StdUnittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    // Ranges that are not bidirectional are silently accepted, which allows
    // unit testing all `decode` functions together.
    static if (isBidirectionalRange!R)
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        size_t numCodeUnits;
        immutable result = decodeBack(range, numCodeUnits);

        if (result != expectedChar)
            throw new AssertError(format("decodeBack: Wrong character: %s", result), __FILE__, line);

        if (numCodeUnits != expectedNumCodeUnits)
            throw new AssertError(format("decodeBack: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != lenBefore - numCodeUnits)
                throw new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line);
        }
    }
}
// Test helper: runs testDecode (from index 0), testDecodeBack (bidirectional
// ranges only, on a saved copy) and testDecodeFront against `range`, which is
// expected to hold exactly one code point of `expectedIndex` code units.
version (StdUnittest) private void testAllDecode(R)(R range,
                                                dchar expectedChar,
                                                size_t expectedIndex,
                                                size_t line = __LINE__)
{
    testDecode(range, 0, expectedChar, expectedIndex, line);

    static if (isBidirectionalRange!R)
    {
        // Work on a copy so that `range` is still intact for testDecodeFront.
        auto backCopy = range.save;
        testDecodeBack(backCopy, expectedChar, expectedIndex, line);
    }

    testDecodeFront(range, expectedChar, expectedIndex, line);
}
// Test helper: checks that decoding malformed input fails cleanly.
//
// For random access ranges, `decode(range, index)` must throw a UTFException
// while leaving `index` and the length of `range` untouched. When `index` is
// 0, `decodeFront` must throw as well. Failures are reported as AssertErrors
// located at the caller's `line`.
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a %s for the length argument; without
            // it `format` itself throws (orphan format argument) and hides
            // the real failure.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    // `index` doubles as the out parameter receiving the code unit count.
    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
// Test helper: for random access ranges, checks that `decodeBack(range)`
// throws a UTFException on malformed input and leaves the length of `range`
// untouched. Failures are reported as AssertErrors located at the caller's
// `line`.
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The format string needs a %s for the length argument;
                // without it `format` itself throws (orphan format argument)
                // and hides the real failure.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
// UTF-8: decode / decodeFront / decodeBack over strings and the test ranges,
// both at run time and in CTFE.
@system unittest
{
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                          (string s) => new RefBidirCU!char(s),
                          (string s) => new RefRandomCU!char(s)))
    {
        enum sHasLength = hasLength!(typeof(S("abcd")));

        // Forward decoding, single-unit code points.
        {
            auto range = S("abcd");
            testDecode(range, 0, 'a', 1);
            testDecode(range, 1, 'b', 2);
            testDecodeFront(range, 'a', 1);
            testDecodeFront(range, 'b', 1);
            assert(decodeFront(range) == 'c');
            assert(decodeFront(range) == 'd');
        }

        // Forward decoding, three-unit code points.
        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 3);
            testDecode(range, 3, 'ェ', 6);
            testDecodeFront(range, 'ウ', 3);
            testDecodeFront(range, 'ェ', 3);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        // Backward decoding, single-unit code points.
        {
            auto range = S("abcd");
            testDecodeBack(range, 'd', 1);
            testDecodeBack(range, 'c', 1);
            testDecodeBack(range, 'b', 1);
            testDecodeBack(range, 'a', 1);
        }

        // Backward decoding, three-unit code points.
        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 3);
            testDecodeBack(range, 'イ', 3);
            testDecodeBack(range, 'サ', 3);
            testDecodeBack(range, 'ブ', 3);
        }

        testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
        testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);

        // Truncated, overlong and 5/6 byte sequences must be rejected.
        foreach (str; ["\xE2\x89", // too short
                       "\xC0\x8A",
                       "\xE0\x80\x8A",
                       "\xF0\x80\x80\x8A",
                       "\xF8\x80\x80\x80\x8A",
                       "\xFC\x80\x80\x80\x80\x8A"])
        {
            testBadDecode(S(str), 0);
            testBadDecode(S(str), 1);
            testBadDecodeBack(S(str));
        }

        // The encodings of U+FFFE and U+FFFF are expected to decode successfully.
        testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
        testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

        // UTF-8 encoded surrogate code points (U+D800 .. U+DFFF) must be rejected.
        foreach (str; ["\xED\xA0\x80",
                       "\xED\xAD\xBF",
                       "\xED\xAE\x80",
                       "\xED\xAF\xBF",
                       "\xED\xB0\x80",
                       "\xED\xBE\x80",
                       "\xED\xBF\xBF"])
        {
            testBadDecode(S(str), 0);
            testBadDecodeBack(S(str));
        }
    }
    });
}
// UTF-16: decode / decodeFront / decodeBack over strings and the test ranges,
// both at run time and in CTFE.
@system unittest
{
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar,
                          (wstring s) => new RefBidirCU!wchar(s),
                          (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Single code units and surrogate pairs at the range boundaries.
        testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // A high surrogate that is alone or followed by a non-surrogate.
        testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        // Forward decoding.
        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        // Backward decoding.
        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    // Mixed surrogate pairs and single units, decoded from arbitrary indices.
    foreach (S; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                      cast(wchar) 0x1400,
                      cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
        testDecode(str, 0, cast(dchar) 0x10000, 2);
        testDecode(str, 2, cast(dchar) 0x1400, 3);
        testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 2);
    }
    });
}
// UTF-32: decode / decodeFront / decodeBack over strings and the test ranges,
// both at run time and in CTFE.
@system unittest
{
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar,
                          (dstring s) => new RefBidirCU!dchar(s),
                          (dstring s) => new RefRandomCU!dchar(s)))
    {
        // Values that are expected to decode unchanged.
        testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // Surrogate values and a value above 0x10FFFF must be rejected.
        testBadDecode(S([cast(dchar) 0xD800]), 0);
        testBadDecode(S([cast(dchar) 0xDFFE]), 0);
        testBadDecode(S([cast(dchar) 0x110000]), 0);

        testBadDecodeBack(S([cast(dchar) 0xD800]));
        testBadDecodeBack(S([cast(dchar) 0xDFFE]));
        testBadDecodeBack(S([cast(dchar) 0x110000]));

        // Forward decoding.
        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 1);
            testDecode(range, 1, 'ェ', 2);
            testDecodeFront(range, 'ウ', 1);
            testDecodeFront(range, 'ェ', 1);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        // Backward decoding.
        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 1);
            testDecodeBack(range, 'イ', 1);
            testDecodeBack(range, 'サ', 1);
            testDecodeBack(range, 'ブ', 1);
        }
    }

    // Decoding from arbitrary indices of a random access source.
    foreach (S; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
        testDecode(str, 0, 0x10000, 1);
        testDecode(str, 1, 0x1400, 2);
        testDecode(str, 2, 0xB9DDE, 3);
        testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(str, cast(dchar) 0x1400, 1);
        testDecodeBack(str, cast(dchar) 0x10000, 1);
    }
    });
}
2257 @safe unittest
2259 import std.exception;
2260 import std.traits : FunctionAttribute, functionAttributes, isSafe;
2261 assertCTFEable!(
2263 foreach (S; AliasSeq!( char[], const( char)[], string,
2264 wchar[], const(wchar)[], wstring,
2265 dchar[], const(dchar)[], dstring))
2267 static assert(isSafe!({ S str; size_t i = 0; decode(str, i); }));
2268 static assert(isSafe!({ S str; size_t i = 0; decodeFront(str, i); }));
2269 static assert(isSafe!({ S str; decodeFront(str); }));
2270 static assert((functionAttributes!({ S str; size_t i = 0; decode(str, i); }) & FunctionAttribute.pure_) != 0);
2271 static assert((functionAttributes!({
2272 S str; size_t i = 0; decodeFront(str, i);
2273 }) & FunctionAttribute.pure_) != 0);
2274 static assert((functionAttributes!({ S str; decodeFront(str); }) & FunctionAttribute.pure_) != 0);
2275 static assert((functionAttributes!({
2276 S str; size_t i = 0; decodeBack(str, i);
2277 }) & FunctionAttribute.pure_) != 0);
2278 static assert((functionAttributes!({ S str; decodeBack(str); }) & FunctionAttribute.pure_) != 0);
// Checks that `decode` rejects a four-byte UTF-8 sequence whose value is too large.
2283 @safe unittest
2285 import std.exception;
2286 char[4] val;
// Lead byte 0xF7 followed by three 0xBF continuation bytes: the payload bits
// are all ones, i.e. 0x1FFFFF, which is above the 0x10FFFF limit.
2287 val[0] = 0b1111_0111;
2288 val[1] = 0b1011_1111;
2289 val[2] = 0b1011_1111;
2290 val[3] = 0b1011_1111;
2291 size_t i = 0;
// The call is wrapped in a lambda so assertThrown can evaluate it lazily.
2292 assertThrown!UTFException((){ dchar ch = decode(val[], i); }());
2294 /* =================== Encode ======================= */
// Shared failure path of the encoders. Depending on the compile-time flag it
// either hands back `replacementDchar` (so the caller can encode that instead)
// or throws a `UTFException` carrying `msg` with `c` recorded as the offending
// sequence.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
    {
        auto failure = new UTFException(msg);
        throw failure.setSequence(c);
    }
    else
    {
        return replacementDchar;
    }
}
/++
    Encodes `c` into the static array, `buf`, and returns the actual
    length of the encoded character (a number between `1` and `4` for
    `char[4]` buffers and a number between `1` and `2` for
    `wchar[2]` buffers).

    Params:
        useReplacementDchar = if set, an invalid `c` is encoded as
            `replacementDchar` instead of raising an exception
        buf = receives the code units of the encoded character
        c = the code point to encode

    Returns:
        The number of code units written to `buf`.

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // One code unit: ASCII.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }

    // Two code units.
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    // Four code units: everything above the BMP that is still in range.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        return 4;
    }

    // What is left either is a three-unit BMP code point or is invalid; an
    // invalid value is swapped for whatever _utfException returns (or the
    // call throws), and the result is written with the three-unit form below.
    if (0x10FFFF < c)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
    }

    buf[0] = cast(char)(0xE0 | (c >> 12));
    buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[2] = cast(char)(0x80 | (c & 0x3F));
    return 3;
}
2357 @safe unittest
2359 import std.exception : assertThrown;
2360 import std.typecons : Yes;
2362 char[4] buf;
2364 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
2365 assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
2366 assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
2367 assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
2368 assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
2369 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2371 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2372 auto slice = buf[];
2373 assert(slice.decodeFront == replacementDchar);
2377 @safe unittest
2379 import std.exception : assertThrown;
2380 import std.typecons : Yes;
2382 wchar[2] buf;
2384 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
2385 assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
2386 assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
2387 assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
2388 assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");
2389 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2391 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2392 auto slice = buf[];
2393 assert(slice.decodeFront == replacementDchar);
2397 @safe unittest
2399 import std.exception : assertThrown;
2400 import std.typecons : Yes;
2402 dchar[1] buf;
2404 assert(encode(buf, '\u0000') == 1 && buf[0] == '\u0000');
2405 assert(encode(buf, '\uD7FF') == 1 && buf[0] == '\uD7FF');
2406 assert(encode(buf, '\uE000') == 1 && buf[0] == '\uE000');
2407 assert(encode(buf, '\U0010FFFF') == 1 && buf[0] == '\U0010FFFF');
2408 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2410 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2411 assert(buf[0] == replacementDchar);
2414 @safe unittest
2416 import std.exception;
2417 assertCTFEable!(
2419 char[4] buf;
2421 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
2422 assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
2423 assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
2424 assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF");
2425 assert(encode(buf, '\u0800') == 3 && buf[0 .. 3] == "\u0800");
2426 assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF");
2427 assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
2428 assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
2429 assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF");
2430 assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000");
2431 assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF");
2433 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2434 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2435 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2436 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2437 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2439 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2440 enum replacementDcharString = "\uFFFD";
2441 assert(buf[0 .. replacementDcharString.length] == replacementDcharString);
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    // Supplementary planes: split into a high/low surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        immutable offset = c - 0x10000;
        buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return 2;
    }

    // Remaining inputs end up as a single code unit; invalid ones are first
    // replaced by the result of _utfException (which may throw instead).
    if (0x10FFFF < c)
    {
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    buf[0] = cast(wchar) c;
    return 1;
}
2472 @safe unittest
2474 import std.exception;
2475 assertCTFEable!(
2477 wchar[2] buf;
2479 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
2480 assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
2481 assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
2482 assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE);
2483 assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF);
2484 assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
2485 assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");
2487 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2488 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2489 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2490 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2491 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2493 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2494 assert(buf.front == replacementDchar);
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    immutable isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable isTooLarge = 0x10FFFF < c;

    if (!isSurrogate && !isTooLarge)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    // UTF-32 always needs exactly one code unit.
    buf[0] = c;
    return 1;
}
2511 @safe unittest
2513 import std.exception;
2514 assertCTFEable!(
2516 dchar[1] buf;
2518 encode(buf, '\u0000'); assert(buf[0] == '\u0000');
2519 encode(buf, '\uD7FF'); assert(buf[0] == '\uD7FF');
2520 encode(buf, '\uE000'); assert(buf[0] == '\uE000');
2521 encode(buf, 0xFFFE ); assert(buf[0] == 0xFFFE);
2522 encode(buf, 0xFFFF ); assert(buf[0] == 0xFFFF);
2523 encode(buf, '\U0010FFFF'); assert(buf[0] == '\U0010FFFF');
2525 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2526 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2527 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2528 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2529 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2531 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2532 assert(buf.front == replacementDchar);
/++
    Encodes `c` in `str`'s encoding and appends it to `str`.

    Params:
        useReplacementDchar = if set, an invalid `c` is appended as
            `replacementDchar` instead of raising an exception
        str = the array that the encoded code units are appended to
        c = the code point to encode

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref char[] str, dchar c) @safe pure
{
    // ASCII is a single code unit; append it directly.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    // Everything else goes through the fixed-buffer overload, which performs
    // the validity checks (throwing or substituting) before anything is
    // appended to `str`.
    char[4] units;
    immutable len = encode!useReplacementDchar(units, c);
    str ~= units[0 .. len];
}
// Appending encode on a char[]: the array grows by the encoded length of each code point.
2598 @safe unittest
2600 char[] s = "abcd".dup;
2601 dchar d1 = 'a';
2602 dchar d2 = 'ø';
// An ASCII code point adds a single code unit (4 -> 5).
2604 encode(s, d1);
2605 assert(s.length == 5);
2606 assert(s == "abcda");
// 'ø' needs two UTF-8 code units (5 -> 7).
2607 encode(s, d2);
2608 assert(s.length == 7);
2609 assert(s == "abcdaø");
2612 @safe unittest
2614 import std.exception;
2616 assertCTFEable!(
2618 char[] s = "abcd".dup;
2619 encode(s, cast(dchar)'a');
2620 assert(s.length == 5);
2621 assert(s == "abcda");
2623 encode(s, cast(dchar)'\u00A9');
2624 assert(s.length == 7);
2625 assert(s == "abcda\xC2\xA9");
2626 //assert(s == "abcda\u00A9"); // BUG: fix compiler
2628 encode(s, cast(dchar)'\u2260');
2629 assert(s.length == 10);
2630 assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
2634 @safe unittest
2636 import std.exception;
2637 assertCTFEable!(
2639 char[] buf;
2641 encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
2642 encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
2643 encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
2644 encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
2645 encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
2646 encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
2647 encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
2648 encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
2649 encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
2650 encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
2651 encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");
2653 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2654 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2655 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2656 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2657 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2659 enum replacementDcharString = "\uFFFD";
2660 enum rdcslen = replacementDcharString.length;
2661 assert(buf[$ - rdcslen .. $] != replacementDcharString);
2662 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2663 assert(buf[$ - rdcslen .. $] == replacementDcharString);
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref wchar[] str, dchar c) @safe pure
{
    // The fixed-buffer overload validates `c` (throwing or substituting the
    // replacement character) and reports how many code units it produced;
    // nothing is appended if it throws.
    wchar[2] units;
    immutable len = encode!useReplacementDchar(units, c);
    str ~= units[0 .. len];
}
2701 @safe unittest
2703 import std.exception;
2704 assertCTFEable!(
2706 wchar[] buf;
2708 encode(buf, '\u0000'); assert(buf[0] == '\u0000');
2709 encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
2710 encode(buf, '\uE000'); assert(buf[2] == '\uE000');
2711 encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
2712 encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
2713 encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
2714 encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");
2716 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2717 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2718 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2719 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2720 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2722 assert(buf.back != replacementDchar);
2723 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2724 assert(buf.back == replacementDchar);
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref dchar[] str, dchar c) @safe pure
{
    immutable isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable isEncodable = c <= 0x10FFFF && !isSurrogate;

    if (isEncodable)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    str ~= c;
}
2739 @safe unittest
2741 import std.exception;
2742 assertCTFEable!(
2744 dchar[] buf;
2746 encode(buf, '\u0000'); assert(buf[0] == '\u0000');
2747 encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
2748 encode(buf, '\uE000'); assert(buf[2] == '\uE000');
2749 encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
2750 encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
2751 encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');
2753 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2754 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2755 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2756 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2757 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2759 assert(buf.back != replacementDchar);
2760 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2761 assert(buf.back == replacementDchar);
/++
    Returns the number of code units that are required to encode the code point
    `c` when `C` is the character type used to encode it.

    Params:
        C = the character type used for the encoding
        c = the code point to measure

    Returns:
        `1` to `4` for `char`, `1` or `2` for `wchar`, and always `1` for
        `dchar`. For `char`, a `c` above `0x10FFFF` hits `assert(false)`.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 1)
    {
        // Values beyond the Unicode range have no UTF-8 length.
        if (c > 0x10FFFF)
            assert(false);

        return c <= 0x7F ? 1
            : c <= 0x7FF ? 2
            : c <= 0xFFFF ? 3
            : 4;
    }
    else static if (C.sizeof == 2)
    {
        // Anything above the BMP needs a surrogate pair.
        if (c > 0xFFFF)
            return 2;
        return 1;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}
// Documents the per-encoding length of a single code point.
2793 @safe pure nothrow @nogc unittest
// ASCII takes one code unit in every encoding.
2795 assert(codeLength!char('a') == 1);
2796 assert(codeLength!wchar('a') == 1);
2797 assert(codeLength!dchar('a') == 1);
// The highest code point: four UTF-8 units, two UTF-16 units, one UTF-32 unit.
2799 assert(codeLength!char('\U0010FFFF') == 4);
2800 assert(codeLength!wchar('\U0010FFFF') == 2);
2801 assert(codeLength!dchar('\U0010FFFF') == 1);
/++
    Returns the number of code units that are required to encode `input`
    in a string whose character type is `C`. This is particularly useful
    when slicing one string with the length of another and the two string
    types use different character types.

    Params:
        C = the character type to get the encoding length for
        input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isInputRange!InputRange && !isInfinite!InputRange && isSomeChar!(ElementType!InputRange))
{
    alias EncType = Unqual!(ElementEncodingType!InputRange);
    static if (!(isSomeString!InputRange && is(EncType == C) && is(typeof(input.length))))
    {
        // Different encoding (or not a string): walk the code points and add
        // up how many `C` code units each one would need.
        size_t units = 0;

        for (auto decoded = input.byDchar; !decoded.empty; decoded.popFront())
            units += codeLength!C(decoded.front);

        return units;
    }
    else
    {
        // Already a string of `C`: its length is the answer.
        return input.length;
    }
}
2836 @safe unittest
2838 assert(codeLength!char("hello world") ==
2839 "hello world".length);
2840 assert(codeLength!wchar("hello world") ==
2841 "hello world"w.length);
2842 assert(codeLength!dchar("hello world") ==
2843 "hello world"d.length);
2845 assert(codeLength!char(`プログラミング`) ==
2846 `プログラミング`.length);
2847 assert(codeLength!wchar(`プログラミング`) ==
2848 `プログラミング`w.length);
2849 assert(codeLength!dchar(`プログラミング`) ==
2850 `プログラミング`d.length);
2852 string haystack = `Être sans la verité, ça, ce ne serait pas bien.`;
2853 wstring needle = `Être sans la verité`;
2854 assert(haystack[codeLength!char(needle) .. $] ==
2855 `, ça, ce ne serait pas bien.`);
2858 @safe unittest
2860 import std.algorithm.iteration : filter;
2861 import std.conv : to;
2862 import std.exception;
2864 assertCTFEable!(
2866 foreach (S; AliasSeq!( char[], const char[], string,
2867 wchar[], const wchar[], wstring,
2868 dchar[], const dchar[], dstring))
2870 foreach (C; AliasSeq!(char, wchar, dchar))
2872 assert(codeLength!C(to!S("Walter Bright")) == to!(C[])("Walter Bright").length);
2873 assert(codeLength!C(to!S(`言語`)) == to!(C[])(`言語`).length);
2874 assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`)) ==
2875 to!(C[])(`ウェブサイト@La_Verité.com`).length);
2876 assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)()) ==
2877 to!(C[])(`ウェブサイト@La_Verité.com`).length);
/+
    Internal helper function:

    Returns true if it is safe to search for the Codepoint `c` inside
    code units, without decoding.

    This is a runtime check that is used as an optimization in various functions,
    particularly, in `std.string`.
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        return true;
    }
    else static if (C.sizeof == 2)
    {
        // Only BMP values that are not surrogates qualify.
        immutable inSurrogateRange = 0xD800 <= c && c <= 0xDFFF;
        return c <= 0xFFFF && !inSurrogateRange;
    }
    else static if (C.sizeof == 1)
    {
        // Only ASCII qualifies.
        return c < 0x80;
    }
    else
        static assert(0);
}
2904 @safe unittest
2906 assert( canSearchInCodeUnits! char('a'));
2907 assert( canSearchInCodeUnits!wchar('a'));
2908 assert( canSearchInCodeUnits!dchar('a'));
2909 assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF
2910 assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
2911 assert( canSearchInCodeUnits!wchar('ö'));
2912 assert( canSearchInCodeUnits!dchar('ö'));
2913 assert(!canSearchInCodeUnits! char('日'));
2914 assert( canSearchInCodeUnits!wchar('日'));
2915 assert( canSearchInCodeUnits!dchar('日'));
2916 assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
2917 assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
2918 assert(!canSearchInCodeUnits! char('\U00010001'));
2919 assert(!canSearchInCodeUnits!wchar('\U00010001'));
2920 assert( canSearchInCodeUnits!dchar('\U00010001'));
2923 /* =================== Validation ======================= */
/++
    Checks to see if `str` is well-formed unicode or not.

    Params:
        str = the string to check

    Throws:
        `UTFException` if `str` is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    // `decode` advances `pos` past each sequence it accepts and throws on a
    // malformed one, so reaching the end means the whole string is valid.
    size_t pos = 0;
    immutable end = str.length;
    while (pos < end)
        decode(str, pos);
}
// `validate` must reject input that is not well-formed UTF-8.
2942 @safe unittest
2944 import std.exception : assertThrown;
// 167, 133 and 175 are 0xA7, 0x85 and 0xAF: all continuation bytes (10xxxxxx),
// so the sequence has no lead byte.
2945 char[] a = [167, 133, 175];
2946 assertThrown!UTFException(validate(a));
2949 // https://issues.dlang.org/show_bug.cgi?id=12923
// Same malformed bytes as the previous test, this time passed as a slice of a
// static array; the call must still throw.
2950 @safe unittest
2952 import std.exception;
2953 assertThrown((){
2954 char[3]a=[167, 133, 175];
2955 validate(a[]);
2956 }());
/**
 * Encodes the elements of `s` to UTF-8 and returns a newly allocated
 * string of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
string toUTF8(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    // All three toUTF* functions share one implementation that is
    // parameterised on the result string type.
    return toUTFImpl!(string, S)(s);
}
2977 @safe pure unittest
2979 import std.algorithm.comparison : equal;
2981 // The ø is represented by two UTF-8 code units (0xC3 0xB8)
2982 assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
2984 // 𐐷 is four code units in UTF-8
2985 assert("𐐷"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
2988 @system pure unittest
2990 import std.algorithm.comparison : equal;
2991 import std.internal.test.dummyrange : ReferenceInputRange;
2993 alias RT = ReferenceInputRange!(ElementType!(string));
2994 auto r1 = new RT("Hellø");
2995 auto r2 = new RT("𐐷");
2997 assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
2998 assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
/**
 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
 * `wstring` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    // Shared implementation, parameterised on the result string type.
    return toUTFImpl!(wstring, S)(s);
}
3019 @safe pure unittest
3021 import std.algorithm.comparison : equal;
3023 // these code points are two code units in UTF-16 and one in UTF-32
3024 assert("𤭢"d.length == 1);
3025 assert("𐐷"d.length == 1);
// Each one is converted to a high/low surrogate pair.
3027 assert("𤭢"d.toUTF16.equal([0xD852, 0xDF62]));
3028 assert("𐐷"d.toUTF16.equal([0xD801, 0xDC37]));
3031 @system pure unittest
3033 import std.algorithm.comparison : equal;
3034 import std.internal.test.dummyrange : ReferenceInputRange;
3036 alias RT = ReferenceInputRange!(ElementType!(string));
3037 auto r1 = new RT("𤭢");
3038 auto r2 = new RT("𐐷");
3040 assert(r1.toUTF16.equal([0xD852, 0xDF62]));
3041 assert(r2.toUTF16.equal([0xD801, 0xDC37]));
/**
 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
 * `dstring` of the elements.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(scope S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    // Shared implementation, parameterised on the result string type.
    return toUTFImpl!(dstring, S)(s);
}
3063 @safe pure unittest
3065 import std.algorithm.comparison : equal;
3067 // these code points are two code units in UTF-16 and one in UTF-32
3068 assert("𤭢"w.length == 2);
3069 assert("𐐷"w.length == 2);
// Each surrogate pair collapses to a single UTF-32 code unit.
3071 assert("𤭢"w.toUTF32.equal([0x00024B62]));
3072 assert("𐐷"w.toUTF32.equal([0x00010437]));
// Common implementation of toUTF8/toUTF16/toUTF32. `T` is the string type to
// produce and `s` the range to convert. Input that already converts
// implicitly to `T` is duplicated with `idup`; anything else is transcoded
// through `byUTF` into an appender.
private T toUTFImpl(T, S)(scope S s)
{
    static if (!is(S : T))
    {
        import std.array : appender;

        alias Unit = Unqual!(ElementEncodingType!T);
        auto sink = appender!T();

        // When the source length is available, reserve that many elements
        // up front.
        static if (is(S == C[], C) || hasLength!S)
            sink.reserve(s.length);

        foreach (unit; s.byUTF!Unit)
            sink.put(unit);

        return sink.data;
    }
    else
    {
        return s.idup;
    }
}
3096 /* =================== toUTFz ======================= */
3099 Returns a C-style zero-terminated string equivalent to `str`. `str`
3100 must not contain embedded `'\0'`'s as any C function will treat the first
3101 `'\0'` that it sees as the end of the string. If `str.empty` is
3102 `true`, then a string containing only `'\0'` is returned.
3104 `toUTFz` accepts any type of string and is templated on the type of
3105 character pointer that you wish to convert to. It will avoid allocating a
3106 new string if it can, but there's a decent chance that it will end up having
3107 to allocate a new string - particularly when dealing with character types
3108 other than `char`.
3110 $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then if
3111 anything alters the character one past the end of `str` (which is the
3112 `'\0'` character terminating the string), then the string won't be
3113 zero-terminated anymore. The most likely scenarios for that are if you
3114 append to `str` and no reallocation takes place or when `str` is a
3115 slice of a larger array, and you alter the character in the larger array
3116 which is one character past the end of `str`. Another case where it could
3117 occur would be if you had a mutable character array immediately after
3118 `str` in memory (for example, if they're member variables in a
3119 user-defined type with one declared right after the other) and that
3120 character array happened to start with `'\0'`. Such scenarios will never
3121 occur if you immediately use the zero-terminated string after calling
3122 `toUTFz` and the C function using it doesn't keep a reference to it.
3123 Also, they are unlikely to occur even if you save the zero-terminated string
3124 (the cases above would be among the few examples of where it could happen).
3125 However, if you save the zero-terminated string and want to be absolutely
3126 certain that the string stays zero-terminated, then simply append a
3127 `'\0'` to the string and use its `ptr` property rather than calling
3128 `toUTFz`.
3130 $(RED Warning 2:) When passing a character pointer to a C function, and the
3131 C function keeps it around for any reason, make sure that you keep a
3132 reference to it in your D code. Otherwise, it may go away during a garbage
3133 collection cycle and cause a nasty bug when the C code tries to use it.
// Eponymous template: `toUTFz!P` fixes the result pointer type `P`, which must
// be a pointer to some character type.
3135 template toUTFz(P)
3136 if (isPointer!P && isSomeChar!(typeof(*P.init)))
// The inner function template takes the source string type `S` from its
// argument, so callers write `toUTFz!(char*)(str)`.
3138 P toUTFz(S)(S str) @safe pure
3139 if (isSomeString!S)
// The toUTFzImpl overload is chosen from the (P, S) combination.
3141 return toUTFzImpl!(P, S)(str);
3146 @safe pure unittest
3148 auto p1 = toUTFz!(char*)("hello world");
3149 auto p2 = toUTFz!(const(char)*)("hello world");
3150 auto p3 = toUTFz!(immutable(char)*)("hello world");
3151 auto p4 = toUTFz!(char*)("hello world"d);
3152 auto p5 = toUTFz!(const(wchar)*)("hello world");
3153 auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
// toUTFzImpl overload for immutable input whose character type matches the
// one `P` points to. Returns a pointer to a zero-terminated string with the
// contents of `str`, reusing `str`'s own memory when a terminator is found
// right after it.
3156 private P toUTFzImpl(P, S)(return scope S str) @safe pure
3157 if (is(immutable typeof(*P.init) == typeof(str[0])))
3158 //immutable(C)[] -> C*, const(C)*, or immutable(C)*
// Empty input: return a pointer to a fresh one-element array holding '\0'.
3160 if (str.empty)
3162 typeof(*P.init)[] retval = ['\0'];
3164 auto trustedPtr() @trusted { return retval.ptr; }
3165 return trustedPtr();
3168 alias C = Unqual!(ElementEncodingType!S);
3170 // If P points to mutable characters, then we have to make a copy; the const(C)[] overload does that.
3171 static if (is(Unqual!(typeof(*P.init)) == typeof(*P.init)))
3173 return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
3175 else
// The peek below is skipped during compile-time evaluation.
3177 if (!__ctfe)
// Pointer arithmetic is not @safe, so it is isolated in a @trusted helper.
3179 auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
3180 immutable p = trustedPtrAdd(str);
3182 // Peek past end of str, if it's 0, no conversion necessary.
3183 // Note that the compiler will put a 0 past the end of static
3184 // strings, and the storage allocator will put a 0 past the end
3185 // of newly allocated char[]'s.
3186 // Is p dereferenceable? A simple test: if the p points to an
3187 // address multiple of 4, then conservatively assume the pointer
3188 // might be pointing to a new block of memory, which might be
3189 // unreadable. Otherwise, it's definitely pointing to valid
3190 // memory.
3191 if ((cast(size_t) p & 3) && *p == '\0')
3192 return &str[0];
// No usable terminator found (or running in CTFE): fall back to the const(C)[] overload.
3195 return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
// toUTFzImpl overload for mutable or const input whose character type matches
// the one `P` points to. Depending on the qualifier combination it either
// terminates `str` itself (peeking past its end, or appending '\0') or builds
// a separate zero-terminated copy.
3199 private P toUTFzImpl(P, S)(return scope S str) @safe pure
3200 if (is(typeof(str[0]) C) && is(immutable typeof(*P.init) == immutable C) && !is(C == immutable))
3201 //C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
3203 alias InChar = typeof(str[0]);
3204 alias OutChar = typeof(*P.init);
3206 //const(C)[] -> const(C)* or
3207 //C[] -> C* or const(C)*
// In these combinations a pointer into `str` itself can be returned.
3208 static if (( is(const(Unqual!InChar) == InChar) && is(const(Unqual!OutChar) == OutChar)) ||
3209 (!is(const(Unqual!InChar) == InChar) && !is(immutable(Unqual!OutChar) == OutChar)))
// The peek below is skipped during compile-time evaluation.
3211 if (!__ctfe)
3213 auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
3214 auto p = trustedPtrAdd(str);
// Reuse `str` when the element just past its end is already '\0'; the
// one-past-the-end address is only read when it is not a multiple of 4.
3216 if ((cast(size_t) p & 3) && *p == '\0')
3217 return &str[0];
// Otherwise append the terminator to the local slice and return a pointer
// to its first element.
3220 str ~= '\0';
3221 return &str[0];
3223 //const(C)[] -> C* or immutable(C)* or
3224 //C[] -> immutable(C)*
3225 else
// These combinations always get a fresh array with room for the terminator.
3227 import std.array : uninitializedArray;
3228 auto copy = uninitializedArray!(Unqual!OutChar[])(str.length + 1);
3229 copy[0 .. $ - 1] = str[];
3230 copy[$ - 1] = '\0';
// The cast of the copy's pointer to `P` is done inside a @trusted helper.
3232 auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; }
3233 return trustedCast(copy);
// toUTFzImpl overload for the case where the character type of `str` differs
// from the one `P` points to: the string is transcoded code point by code
// point into a new array, which is then zero-terminated.
private P toUTFzImpl(P, S)(S str) @safe pure
if (!is(immutable typeof(*P.init) == immutable typeof(str[0])))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    import std.array : appender;

    alias OutChar = typeof(*P.init);
    auto sink = appender!(OutChar[])();

    // Iterating as dchar decodes `str`; the appender re-encodes each code
    // point into OutChar code units.
    foreach (dchar codePoint; str)
        sink.put(codePoint);
    sink.put('\0');

    // Taking the pointer and casting it to P is wrapped in a @trusted helper.
    auto terminated() @trusted { return cast(P) sink.data.ptr; }
    return terminated();
}
3251 @safe pure unittest
3253 import core.exception : AssertError;
3254 import std.algorithm;
3255 import std.conv : to;
3256 import std.exception;
3257 import std.string : format;
3259 assertCTFEable!(
3261 foreach (S; AliasSeq!(string, wstring, dstring))
3263 alias C = Unqual!(ElementEncodingType!S);
3265 auto s1 = to!S("hello\U00010143\u0100\U00010143");
3266 auto temp = new C[](s1.length + 1);
3267 temp[0 .. $ - 1] = s1[0 .. $];
3268 temp[$ - 1] = '\n';
3269 --temp.length;
3270 auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
3271 auto s2 = trustedAssumeUnique(temp);
3272 assert(s1 == s2);
3274 void trustedCStringAssert(P, S)(S s) @trusted
3276 auto p = toUTFz!P(s);
3277 assert(p[0 .. s.length] == s);
3278 assert(p[s.length] == '\0');
3281 foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
3283 trustedCStringAssert!P(s1);
3284 trustedCStringAssert!P(s2);
3289 static void test(P, S)(S s, size_t line = __LINE__) @trusted
3291 static size_t zeroLen(C)(const(C)* ptr) @trusted
3293 size_t len = 0;
3294 while (*ptr != '\0') { ++ptr; ++len; }
3295 return len;
3298 auto p = toUTFz!P(s);
3299 immutable len = zeroLen(p);
3300 enforce(cmp(s, p[0 .. len]) == 0,
3301 new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
3302 __FILE__, line));
3305 assertCTFEable!(
3307 foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
3308 dchar*, const(dchar)*, immutable(dchar)*))
3310 test!P("hello\U00010143\u0100\U00010143");
3312 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
3313 dchar*, const(dchar)*, immutable(dchar)*))
3315 test!P("hello\U00010143\u0100\U00010143"w);
3317 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
3318 wchar*, const(wchar)*, immutable(wchar)*))
3320 test!P("hello\U00010143\u0100\U00010143"d);
3322 foreach (S; AliasSeq!( char[], const( char)[],
3323 wchar[], const(wchar)[],
3324 dchar[], const(dchar)[]))
3326 auto s = to!S("hello\U00010143\u0100\U00010143");
3328 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
3329 wchar*, const(wchar)*, immutable(wchar)*,
3330 dchar*, const(dchar)*, immutable(dchar)*))
3332 test!P(s);
/++
    Convenience shorthand for `toUTFz!(const(wchar)*)`.

    Encodes `str` as UTF-16 and returns a pointer to the zero-terminated
    result. The returned pointer is suitable for the 'W' functions of the
    Win32 API that take an `LPCWSTR` argument.

    Params:
        str = the string to encode
    Returns:
        A `const(wchar)*` to the zero-terminated UTF-16 encoding of `str`.
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    alias WideCString = const(wchar)*;
    return str.toUTFz!WideCString();
}
3353 @system unittest
3355 string str = "Hello, World!";
3356 const(wchar)* p = str.toUTF16z;
3357 assert(p[str.length] == '\0');
3360 @safe pure unittest
3362 import std.conv : to;
3363 //toUTFz is already thoroughly tested, so this will just verify that
3364 //toUTF16z compiles properly for the various string types.
3365 foreach (S; AliasSeq!(string, wstring, dstring))
3366 assert(toUTF16z(to!S("hello world")) !is null);
3370 /* ================================ tests ================================== */
3372 @safe pure unittest
3374 import std.exception;
3376 assertCTFEable!(
3378 assert(toUTF16("hello"c) == "hello");
3379 assert(toUTF32("hello"c) == "hello");
3380 assert(toUTF8 ("hello"w) == "hello");
3381 assert(toUTF32("hello"w) == "hello");
3382 assert(toUTF8 ("hello"d) == "hello");
3383 assert(toUTF16("hello"d) == "hello");
3385 assert(toUTF16("hel\u1234o"c) == "hel\u1234o");
3386 assert(toUTF32("hel\u1234o"c) == "hel\u1234o");
3387 assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o");
3388 assert(toUTF32("hel\u1234o"w) == "hel\u1234o");
3389 assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o");
3390 assert(toUTF16("hel\u1234o"d) == "hel\u1234o");
3392 assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
3393 assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
3394 assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
3395 assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
3396 assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
3397 assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
/++
    Returns the total number of code points encoded in `str`.

    Supercedes: This function supercedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Params:
        str = the string whose code points are counted

    Returns:
        The number of elements produced when `str` is iterated with
        $(LREF byDchar).

    Note:
        This function is `nothrow` and `@nogc`; it does $(I not) throw
        `UTFException`. `str` is walked with $(LREF byDchar) using its
        default replacement policy, so ill-formed input yields
        $(LREF replacementDchar) elements instead of raising an error. Use
        $(LREF validate) first if malformed input must be rejected.
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    return walkLength(str.byDchar);
}
3419 @safe pure nothrow @nogc unittest
3421 assert(count("") == 0);
3422 assert(count("a") == 1);
3423 assert(count("abc") == 3);
3424 assert(count("\u20AC100") == 4);
3427 @safe pure nothrow @nogc unittest
3429 import std.exception;
3430 assertCTFEable!(
3432 assert(count("") == 0);
3433 assert(count("a") == 1);
3434 assert(count("abc") == 3);
3435 assert(count("\u20AC100") == 4);
3440 // Ranges of code units for testing.
// Everything in this block is compiled only under version (StdUnittest) and
// is private to the module. Each helper stores its code units in `_str`,
// obtained from the constructor argument via `std.conv.to!(C[])`.
3441 version (StdUnittest)
3443 private:
// Struct exposing only empty/front/popFront over code units of type `C`.
3444 struct InputCU(C)
3446 import std.conv : to;
3447 @property bool empty() { return _str.empty; }
3448 @property C front() { return _str[0]; }
3449 void popFront() { _str = _str[1 .. $]; }
3451 this(inout(C)[] str)
3453 _str = to!(C[])(str);
3456 C[] _str;
// Struct adding back/popBack, save and length to the InputCU primitives.
3459 struct BidirCU(C)
3461 import std.conv : to;
3462 @property bool empty() { return _str.empty; }
3463 @property C front() { return _str[0]; }
3464 void popFront() { _str = _str[1 .. $]; }
3465 @property C back() { return _str[$ - 1]; }
3466 void popBack() { _str = _str[0 .. $ - 1]; }
3467 @property auto save() { return BidirCU(_str); }
3468 @property size_t length() { return _str.length; }
3470 this(inout(C)[] str)
3472 _str = to!(C[])(str);
3475 C[] _str;
// Struct adding opIndex and opSlice to the BidirCU primitives.
3478 struct RandomCU(C)
3480 import std.conv : to;
3481 @property bool empty() { return _str.empty; }
3482 @property C front() { return _str[0]; }
3483 void popFront() { _str = _str[1 .. $]; }
3484 @property C back() { return _str[$ - 1]; }
3485 void popBack() { _str = _str[0 .. $ - 1]; }
3486 @property auto save() { return RandomCU(_str); }
3487 @property size_t length() { return _str.length; }
3488 C opIndex(size_t i) { return _str[i]; }
3489 auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }
3491 this(inout(C)[] str)
3493 _str = to!(C[])(str);
3496 C[] _str;
// Class (reference type) with the same members as BidirCU; save() allocates
// a new RefBidirCU over the current `_str`.
3499 class RefBidirCU(C)
3501 import std.conv : to;
3502 @property bool empty() { return _str.empty; }
3503 @property C front() { return _str[0]; }
3504 void popFront() { _str = _str[1 .. $]; }
3505 @property C back() { return _str[$ - 1]; }
3506 void popBack() { _str = _str[0 .. $ - 1]; }
3507 @property auto save() { return new RefBidirCU(_str); }
3508 @property size_t length() { return _str.length; }
3510 this(inout(C)[] str)
3512 _str = to!(C[])(str);
3515 C[] _str;
// Class (reference type) with the same members as RandomCU; save() and
// opSlice() each allocate a new RefRandomCU.
3518 class RefRandomCU(C)
3520 import std.conv : to;
3521 @property bool empty() { return _str.empty; }
3522 @property C front() { return _str[0]; }
3523 void popFront() { _str = _str[1 .. $]; }
3524 @property C back() { return _str[$ - 1]; }
3525 void popBack() { _str = _str[0 .. $ - 1]; }
3526 @property auto save() { return new RefRandomCU(_str); }
3527 @property size_t length() { return _str.length; }
3528 C opIndex(size_t i) { return _str[i]; }
3529 auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }
3531 this(inout(C)[] str)
3533 _str = to!(C[])(str);
3536 C[] _str;
3542 * Inserted in place of invalid UTF sequences.
3544 * References:
3545 * $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
3547 enum dchar replacementDchar = '\uFFFD';
/********************************************
 * Iterate a range of char, wchar, or dchars by code unit.
 *
 * The purpose is to bypass the special case decoding that
 * $(REF front, std,range,primitives) does to character arrays. As a result,
 * using ranges with `byCodeUnit` can be `nothrow` while
 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
 * sequences.
 *
 * A code unit is a building block of the UTF encodings. Generally, an
 * individual code unit does not represent what's perceived as a full
 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
 * are encoded with multiple code units. For example, the UTF-8 code units for
 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
 * often does not form a character on its own. Attempting to treat it as
 * one while iterating over the resulting range will give nonsensical results.
 *
 * Params:
 *      r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 *      of characters (including strings) or a type that implicitly converts to a string type.
 * Returns:
 *      If `r` is not an auto-decodable string (i.e. a narrow string or a
 *      user-defined type that implicitly converts to a string type), then `r`
 *      is returned.
 *
 *      Otherwise, `r` is converted to its corresponding string type (if it's
 *      not already a string) and wrapped in a random-access range where the
 *      element encoding type of the string (its code unit) is the element type
 *      of the range, and that range returned. The range has slicing.
 *
 *      If `r` is quirky enough to be a struct or class which is an input range
 *      of characters on its own (i.e. it has the input range API as member
 *      functions), $(I and) it's implicitly convertible to a string type, then
 *      `r` is returned, and no implicit conversion takes place.
 *
 *      If `r` is wrapped in a new range, then that range has a `source`
 *      property for returning the string that's currently contained within that
 *      range.
 *
 * See_Also:
 *      Refer to the $(MREF std, uni) docs for a reference on Unicode
 *      terminology.
 *
 *      For a range that iterates by grapheme cluster (written character) see
 *      $(REF byGrapheme, std,uni).
 */
auto byCodeUnit(R)(R r)
if ((isConvertibleToString!R && !isStaticArray!R) ||
    (isInputRange!R && isSomeChar!(ElementEncodingType!R)))
{
    import std.traits : StringTypeOf;

    // This would be cleaner if we had a way to check whether a type
    // was a range without any implicit conversions. Until then, a type is
    // treated as "a range on its own" when it declares any of the three
    // input range primitives as a member.
    static if (isAutodecodableString!R &&
               !(__traits(hasMember, R, "empty") ||
                 __traits(hasMember, R, "front") ||
                 __traits(hasMember, R, "popFront")))
    {
        // Narrow string (or something converting to one): wrap it so that
        // the range primitives work on code units instead of auto-decoding.
        static struct ByCodeUnitImpl
        {
        @safe pure nothrow @nogc:

            // The not-yet-consumed part of the wrapped string.
            StringTypeOf!R source;

            // Input range primitives.
            @property bool empty() const { return !source.length; }
            @property auto ref front() inout { return source[0]; }
            void popFront() { source = source[1 .. $]; }

            // Forward range primitive.
            @property auto save() { return typeof(this)(source.save); }

            // Bidirectional range primitives.
            @property auto ref back() inout { return source[$ - 1]; }
            void popBack() { source = source[0 .. $ - 1]; }

            // Random access, slicing and length.
            @property size_t length() const { return source.length; }
            alias opDollar = length;

            auto ref opIndex(size_t index) inout { return source[index]; }

            auto opSlice(size_t lower, size_t upper)
            {
                return typeof(this)(source[lower .. upper]);
            }
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    else static if (!isInputRange!R ||
                    (is(R : const dchar[]) &&
                     !(__traits(hasMember, R, "empty") ||
                       __traits(hasMember, R, "front") ||
                       __traits(hasMember, R, "popFront"))))
    {
        // Not a range by itself, or something that merely converts to a
        // dchar array: hand back the underlying string type.
        return cast(StringTypeOf!R) r;
    }
    else
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
}
3645 @safe unittest
3647 import std.range.primitives;
3648 import std.traits : isAutodecodableString;
3650 auto r = "Hello, World!".byCodeUnit();
3651 static assert(hasLength!(typeof(r)));
3652 static assert(hasSlicing!(typeof(r)));
3653 static assert(isRandomAccessRange!(typeof(r)));
3654 static assert(is(ElementType!(typeof(r)) == immutable char));
3656 // contrast with the range capabilities of standard strings (with or
3657 // without autodecoding enabled).
3658 auto s = "Hello, World!";
3659 static assert(isBidirectionalRange!(typeof(r)));
3660 static if (isAutodecodableString!(typeof(s)))
3662 // with autodecoding enabled, strings are non-random-access ranges of
3663 // dchar.
3664 static assert(is(ElementType!(typeof(s)) == dchar));
3665 static assert(!isRandomAccessRange!(typeof(s)));
3666 static assert(!hasSlicing!(typeof(s)));
3667 static assert(!hasLength!(typeof(s)));
3669 else
3671 // without autodecoding, strings are normal arrays.
3672 static assert(is(ElementType!(typeof(s)) == immutable char));
3673 static assert(isRandomAccessRange!(typeof(s)));
3674 static assert(hasSlicing!(typeof(s)));
3675 static assert(hasLength!(typeof(s)));
3679 /// `byCodeUnit` does no Unicode decoding
3680 @safe unittest
3682 string noel1 = "noe\u0308l"; // noël using e + combining diaeresis
3683 assert(noel1.byCodeUnit[2] != 'ë');
3684 assert(noel1.byCodeUnit[2] == 'e');
3686 string noel2 = "no\u00EBl"; // noël using a precomposed ë character
3687 // Because string is UTF-8, the code unit at index 2 is just
3688 // the first of a sequence that encodes 'ë'
3689 assert(noel2.byCodeUnit[2] != 'ë');
3692 /// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
3693 @safe unittest
3695 import std.algorithm.comparison : equal;
3696 import std.range : popFrontN;
3697 import std.traits : isAutodecodableString;
3699 auto range = byCodeUnit("hello world");
3700 range.popFrontN(3);
3701 assert(equal(range.save, "lo world"));
3702 static if (isAutodecodableString!string) // only enabled with autodecoding
3704 string str = range.source;
3705 assert(str == "lo world");
3708 // source only exists if the range was wrapped
3710 auto range = byCodeUnit("hello world"d);
3711 static assert(!__traits(compiles, range.source));
3715 @safe pure nothrow @nogc unittest
3717 import std.range;
3719 enum testStr = "𐁄𐂌𐃯 hello ディラン";
3720 char[testStr.length] s;
3721 int i;
3722 foreach (c; testStr.byCodeUnit().byCodeUnit())
3724 s[i++] = c;
3726 assert(s == testStr);
3729 enum testStr = "𐁄𐂌𐃯 hello ディラン"w;
3730 wchar[testStr.length] s;
3731 int i;
3732 foreach (c; testStr.byCodeUnit().byCodeUnit())
3734 s[i++] = c;
3736 assert(s == testStr);
3739 enum testStr = "𐁄𐂌𐃯 hello ディラン"d;
3740 dchar[testStr.length] s;
3741 int i;
3742 foreach (c; testStr.byCodeUnit().byCodeUnit())
3744 s[i++] = c;
3746 assert(s == testStr);
3749 auto bcu = "hello".byCodeUnit();
3750 assert(bcu.length == 5);
3751 assert(bcu[3] == 'l');
3752 assert(bcu[2 .. 4][1] == 'l');
3755 char[5] orig = "hello";
3756 auto bcu = orig[].byCodeUnit();
3757 bcu.front = 'H';
3758 assert(bcu.front == 'H');
3759 bcu[1] = 'E';
3760 assert(bcu[1] == 'E');
3763 auto bcu = "hello".byCodeUnit().byCodeUnit();
3764 static assert(isForwardRange!(typeof(bcu)));
3765 static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
3766 auto s = bcu.save;
3767 bcu.popFront();
3768 assert(s.front == 'h');
3771 auto bcu = "hello".byCodeUnit();
3772 static assert(hasSlicing!(typeof(bcu)));
3773 static assert(isBidirectionalRange!(typeof(bcu)));
3774 static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
3775 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3776 auto ret = bcu.retro;
3777 assert(ret.front == 'o');
3778 ret.popFront();
3779 assert(ret.front == 'l');
3782 auto bcu = "κόσμε"w.byCodeUnit();
3783 static assert(hasSlicing!(typeof(bcu)));
3784 static assert(isBidirectionalRange!(typeof(bcu)));
3785 static assert(is(typeof(bcu) == struct) == isAutodecodableString!wstring);
3786 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3787 auto ret = bcu.retro;
3788 assert(ret.front == 'ε');
3789 ret.popFront();
3790 assert(ret.front == 'μ');
3793 static struct Stringish
3795 string s;
3796 alias s this;
3799 auto orig = Stringish("\U0010fff8 𐁊 foo 𐂓");
3800 auto bcu = orig.byCodeUnit();
3801 static assert(is(typeof(bcu) == struct));
3802 static assert(!is(typeof(bcu) == Stringish) == isAutodecodableString!Stringish);
3803 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3804 static assert(is(ElementType!(typeof(bcu)) == immutable char));
3805 assert(bcu.front == cast(char) 244);
3808 static struct WStringish
3810 wstring s;
3811 alias s this;
3814 auto orig = WStringish("\U0010fff8 𐁊 foo 𐂓"w);
3815 auto bcu = orig.byCodeUnit();
3816 static assert(is(typeof(bcu) == struct));
3817 static assert(!is(typeof(bcu) == WStringish) == isAutodecodableString!WStringish);
3818 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3819 static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
3820 assert(bcu.front == cast(wchar) 56319);
3823 static struct DStringish
3825 dstring s;
3826 alias s this;
3829 auto orig = DStringish("\U0010fff8 𐁊 foo 𐂓"d);
3830 auto bcu = orig.byCodeUnit();
3831 static assert(is(typeof(bcu) == dstring));
3832 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3833 static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
3834 assert(bcu.front == cast(dchar) 1114104);
3837 static struct FuncStringish
3839 string str;
3840 string s() pure nothrow @nogc { return str; }
3841 alias s this;
3844 auto orig = FuncStringish("\U0010fff8 𐁊 foo 𐂓");
3845 auto bcu = orig.byCodeUnit();
3846 static if (isAutodecodableString!FuncStringish)
3847 static assert(is(typeof(bcu) == struct));
3848 else
3849 static assert(is(typeof(bcu) == string));
3850 static assert(!is(typeof(bcu) == FuncStringish));
3851 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3852 static assert(is(ElementType!(typeof(bcu)) == immutable char));
3853 assert(bcu.front == cast(char) 244);
3856 static struct Range
3858 string data;
3859 bool empty() pure nothrow @nogc { return data.empty; }
3860 char front() pure nothrow @nogc { return data[0]; }
3861 void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3864 auto orig = Range("\U0010fff8 𐁊 foo 𐂓");
3865 auto bcu = orig.byCodeUnit();
3866 static assert(is(typeof(bcu) == Range));
3867 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3868 static assert(is(ElementType!(typeof(bcu)) == char));
3869 assert(bcu.front == cast(char) 244);
3872 static struct WRange
3874 wstring data;
3875 bool empty() pure nothrow @nogc { return data.empty; }
3876 wchar front() pure nothrow @nogc { return data[0]; }
3877 void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3880 auto orig = WRange("\U0010fff8 𐁊 foo 𐂓"w);
3881 auto bcu = orig.byCodeUnit();
3882 static assert(is(typeof(bcu) == WRange));
3883 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3884 static assert(is(ElementType!(typeof(bcu)) == wchar));
3885 assert(bcu.front == 56319);
3888 static struct DRange
3890 dstring data;
3891 bool empty() pure nothrow @nogc { return data.empty; }
3892 dchar front() pure nothrow @nogc { return data[0]; }
3893 void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3896 auto orig = DRange("\U0010fff8 𐁊 foo 𐂓"d);
3897 auto bcu = orig.byCodeUnit();
3898 static assert(is(typeof(bcu) == DRange));
3899 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3900 static assert(is(ElementType!(typeof(bcu)) == dchar));
3901 assert(bcu.front == 1114104);
3904 static struct RangeAndStringish
3906 bool empty() pure nothrow @nogc { return data.empty; }
3907 char front() pure nothrow @nogc { return data[0]; }
3908 void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3910 string data;
3911 string s;
3912 alias s this;
3915 auto orig = RangeAndStringish("test.d", "other");
3916 auto bcu = orig.byCodeUnit();
3917 static assert(is(typeof(bcu) == RangeAndStringish));
3918 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3919 static assert(is(ElementType!(typeof(bcu)) == char));
3920 assert(bcu.front == 't');
3923 static struct WRangeAndStringish
3925 bool empty() pure nothrow @nogc { return data.empty; }
3926 wchar front() pure nothrow @nogc { return data[0]; }
3927 void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3929 wstring data;
3930 wstring s;
3931 alias s this;
3934 auto orig = WRangeAndStringish("test.d"w, "other"w);
3935 auto bcu = orig.byCodeUnit();
3936 static assert(is(typeof(bcu) == WRangeAndStringish));
3937 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3938 static assert(is(ElementType!(typeof(bcu)) == wchar));
3939 assert(bcu.front == 't');
3942 static struct DRangeAndStringish
3944 bool empty() pure nothrow @nogc { return data.empty; }
3945 dchar front() pure nothrow @nogc { return data[0]; }
3946 void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3948 dstring data;
3949 dstring s;
3950 alias s this;
3953 auto orig = DRangeAndStringish("test.d"d, "other"d);
3954 auto bcu = orig.byCodeUnit();
3955 static assert(is(typeof(bcu) == DRangeAndStringish));
3956 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3957 static assert(is(ElementType!(typeof(bcu)) == dchar));
3958 assert(bcu.front == 't');
3961 enum Enum : string { a = "test.d" }
3963 auto orig = Enum.a;
3964 auto bcu = orig.byCodeUnit();
3965 static assert(!is(typeof(bcu) == Enum));
3966 static if (isAutodecodableString!Enum)
3967 static assert(is(typeof(bcu) == struct));
3968 else
3969 static assert(is(typeof(bcu) == string));
3970 static assert(is(ElementType!(typeof(bcu)) == immutable char));
3971 assert(bcu.front == 't');
3974 enum WEnum : wstring { a = "test.d"w }
3976 auto orig = WEnum.a;
3977 auto bcu = orig.byCodeUnit();
3978 static assert(!is(typeof(bcu) == WEnum));
3979 static if (isAutodecodableString!WEnum)
3980 static assert(is(typeof(bcu) == struct));
3981 else
3982 static assert(is(typeof(bcu) == wstring));
3983 static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
3984 assert(bcu.front == 't');
3987 enum DEnum : dstring { a = "test.d"d }
3989 auto orig = DEnum.a;
3990 auto bcu = orig.byCodeUnit();
3991 static assert(is(typeof(bcu) == dstring));
3992 static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
3993 assert(bcu.front == 't');
3996 static if (autodecodeStrings)
3998 static assert(!is(typeof(byCodeUnit("hello")) == string));
3999 static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
4001 else
4003 static assert(is(typeof(byCodeUnit("hello")) == string));
4004 static assert(is(typeof(byCodeUnit("hello"w)) == wstring));
4006 static assert(is(typeof(byCodeUnit("hello"d)) == dstring));
4008 static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
4009 static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
4010 static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));
4012 enum SEnum : char[5] { a = "hello" }
4013 enum WSEnum : wchar[5] { a = "hello"w }
4014 enum DSEnum : dchar[5] { a = "hello"d }
4016 static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
4017 static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
4018 static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
4021 /****************************
4022 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4023 * of characters by char, wchar, or dchar.
4024 * These aliases simply forward to $(LREF byUTF) with the
4025 * corresponding C argument.
4027 * Params:
4028 * r = input range of characters, or array of characters
4030 alias byChar = byUTF!char;
4032 /// Ditto
4033 alias byWchar = byUTF!wchar;
4035 /// Ditto
4036 alias byDchar = byUTF!dchar;
4038 @safe pure nothrow @nogc unittest
4041 char[5] s;
4042 int i;
4043 foreach (c; "hello".byChar.byChar())
4045 //writefln("[%d] '%c'", i, c);
4046 s[i++] = c;
4048 assert(s == "hello");
4051 char[5+2+3+4+3+3] s;
4052 int i;
4053 dchar[10] a;
4054 a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
4055 a[8] = 0xD800; // invalid
4056 a[9] = cast(dchar) 0x110000; // invalid
4057 foreach (c; a[].byChar())
4059 //writefln("[%d] '%c'", i, c);
4060 s[i++] = c;
4062 assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
4065 auto r = "hello"w.byChar();
4066 r.popFront();
4067 r.popFront();
4068 assert(r.front == 'l');
4071 auto r = "hello"d.byChar();
4072 r.popFront();
4073 r.popFront();
4074 assert(r.front == 'l');
4077 auto r = "hello"d.byChar();
4078 assert(isForwardRange!(typeof(r)));
4079 auto s = r.save;
4080 r.popFront();
4081 assert(s.front == 'h');
4085 @safe pure nothrow @nogc unittest
4088 wchar[11] s;
4089 int i;
4090 dchar[10] a;
4091 a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
4092 a[8] = 0xD800; // invalid
4093 a[9] = cast(dchar) 0x110000; // invalid
4094 foreach (c; a[].byWchar())
4096 //writefln("[%d] '%c' x%x", i, c, c);
4097 s[i++] = c;
4099 foreach (j, wchar c; "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w)
4101 //writefln("[%d] '%c' x%x", j, c, c);
4103 assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
4107 auto r = "hello".byWchar();
4108 r.popFront();
4109 r.popFront();
4110 assert(r.front == 'l');
4113 auto r = "hello"d.byWchar();
4114 r.popFront();
4115 r.popFront();
4116 assert(r.front == 'l');
4119 auto r = "hello"d.byWchar();
4120 assert(isForwardRange!(typeof(r)));
4121 auto s = r.save;
4122 r.popFront();
4123 assert(s.front == 'h');
4127 @safe pure nothrow @nogc unittest
4130 dchar[9] s;
4131 int i;
4132 string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
4133 foreach (c; a.byDchar())
4135 s[i++] = c;
4137 assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
4140 foreach (s; invalidUTFstrings!char())
4142 auto r = s.byDchar();
4143 assert(!r.empty);
4144 assert(r.front == r.front);
4145 dchar c = r.front;
4146 assert(c == replacementDchar);
4150 auto r = "hello".byDchar();
4151 r.popFront();
4152 r.popFront();
4153 assert(r.front == 'l');
4157 dchar[8] s;
4158 int i;
4159 wstring a = "hello\u07FF\uD7FF\U0010FFFF"w;
4160 foreach (c; a.byDchar())
4162 //writefln("[%d] '%c' x%x", i, c, c);
4163 s[i++] = c;
4165 assert(s == "hello\u07FF\uD7FF\U0010FFFF"d);
4168 foreach (s; invalidUTFstrings!wchar())
4170 auto r = s.byDchar();
4171 assert(!r.empty);
4172 assert(r.front == r.front);
4173 dchar c = r.front;
4174 assert(c == replacementDchar);
4178 wchar[2] ws;
4179 ws[0] = 0xD800;
4180 ws[1] = 0xDD00; // correct surrogate pair
4181 auto r = ws[].byDchar();
4182 assert(!r.empty);
4183 assert(r.front == r.front);
4184 dchar c = r.front;
4185 assert(c == '\U00010100');
4188 auto r = "hello"w.byDchar();
4189 r.popFront();
4190 r.popFront();
4191 assert(r.front == 'l');
4195 dchar[5] s;
4196 int i;
4197 dstring a = "hello"d;
4198 foreach (c; a.byDchar.byDchar())
4200 //writefln("[%d] '%c' x%x", i, c, c);
4201 s[i++] = c;
4203 assert(s == "hello"d);
4206 auto r = "hello".byDchar();
4207 assert(isForwardRange!(typeof(r)));
4208 auto s = r.save;
4209 r.popFront();
4210 assert(s.front == 'h');
4213 auto r = "hello"w.byDchar();
4214 assert(isForwardRange!(typeof(r)));
4215 auto s = r.save;
4216 r.popFront();
4217 assert(s.front == 'h');
4221 // test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar,
4222 // which needs to support ranges with and without those attributes
4224 pure @safe nothrow @nogc unittest
4226 dchar[5] s = "hello"d;
4227 foreach (c; s[].byChar()) { }
4228 foreach (c; s[].byWchar()) { }
4229 foreach (c; s[].byDchar()) { }
4232 version (StdUnittest)
4233 private int impureVariable;
4235 @system unittest
4237 static struct ImpureThrowingSystemRange(Char)
4239 @property bool empty() const { return true; }
4240 @property Char front() const { return Char.init; }
4241 void popFront()
4243 impureVariable++;
4244 throw new Exception("only for testing nothrow");
4248 foreach (Char; AliasSeq!(char, wchar, dchar))
4250 ImpureThrowingSystemRange!Char range;
4251 foreach (c; range.byChar()) { }
4252 foreach (c; range.byWchar()) { }
4253 foreach (c; range.byDchar()) { }
/****************************
 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * of characters by char type `C` by encoding the elements of the range.
 *
 * UTF sequences that cannot be converted to the specified encoding are either
 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
 * of the Unicode Standard 6.2 or result in a thrown UTFException.
 * Hence byUTF is not symmetric.
 * This algorithm is lazy, and does not allocate memory.
 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
 * `r` parameter.
 *
 * Params:
 *      C = `char`, `wchar`, or `dchar`
 *      useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
 *                            UseReplacementDchar.no means throw `UTFException` for invalid UTF
 *
 * Throws:
 *      `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
 *
 * GC:
 *      Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
 *
 * Returns:
 *      A bidirectional range if `R` is a bidirectional range and not auto-decodable,
 *      as defined by $(REF isAutodecodableString, std, traits).
 *
 *      A forward range if `R` is a forward range and not auto-decodable.
 *
 *      Or, if `R` is a range and it is auto-decodable and
 *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
 *      to $(LREF byCodeUnit).
 *
 *      Otherwise, an input range of characters.
 */
template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar)
if (isSomeChar!C)
{
    static if (is(immutable C == immutable UC, UC) && !is(C == UC))
        // Qualified C: forward to the unqualified instantiation. The
        // replacement policy has to be forwarded as well, otherwise
        // byUTF!(const C, No.useReplacementDchar) would silently substitute
        // U+FFFD instead of throwing.
        alias byUTF = byUTF!(UC, useReplacementDchar);
    else:

    // Auto-decodable strings are first wrapped by byCodeUnit so that the
    // overload below sees plain code units.
    auto ref byUTF(R)(R r)
    if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        return byUTF(r.byCodeUnit());
    }

    auto ref byUTF(R)(R r)
    if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        static if (is(immutable ElementEncodingType!R == immutable RC, RC) && is(RC == C))
        {
            // Source already has the requested code unit type.
            return r.byCodeUnit();
        }
        else static if (is(C == dchar))
        {
            // Decoding to dchar: one decoded code point is cached per end.
            static struct Result
            {
                enum Empty = uint.max;  // range is empty or just constructed

                this(return scope R r)
                {
                    this.r = r;
                }

                this(return scope R r, uint buff)
                {
                    this.r = r;
                    this.buff = buff;
                }

                static if (isBidirectionalRange!R)
                {
                    this(return scope R r, uint frontBuff, uint backBuff)
                    {
                        this.r = r;
                        this.buff = frontBuff;
                        this.backBuff = backBuff;
                    }
                }

                @property bool empty()
                {
                    static if (isBidirectionalRange!R)
                        return buff == Empty && backBuff == Empty && r.empty;
                    else
                        return buff == Empty && r.empty;
                }

                @property dchar front() scope // 'scope' required by call to decodeFront() below
                {
                    if (buff == Empty)
                    {
                        auto c = r.front;

                        static if (is(RC == wchar))
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80;   // First non-ASCII.
                        if (c < firstMulti)
                        {
                            // Single code unit: no decoding needed.
                            r.popFront;
                            buff = cast(dchar) c;
                        }
                        else
                        {
                            buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                        }
                    }
                    return cast(dchar) buff;
                }

                void popFront()
                {
                    // Make sure the source has been advanced past the
                    // current code point before dropping the cached value.
                    if (buff == Empty)
                        front();
                    buff = Empty;
                }

                static if (isForwardRange!R)
                {
                    @property auto save()
                    {
                        static if (isBidirectionalRange!R)
                        {
                            return Result(r.save, buff, backBuff);
                        }
                        else
                        {
                            return Result(r.save, buff);
                        }
                    }
                }

                static if (isBidirectionalRange!R)
                {
                    @property dchar back() scope // 'scope' required by call to decodeBack() below
                    {
                        if (backBuff != Empty)
                            return cast(dchar) backBuff;

                        auto c = r.back;
                        static if (is(RC == wchar))
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80;   // First non-ASCII.
                        if (c < firstMulti)
                        {
                            r.popBack;
                            backBuff = cast(dchar) c;
                        }
                        else
                        {
                            backBuff = () @trusted { return decodeBack!useReplacementDchar(r); }();
                        }
                        return cast(dchar) backBuff;
                    }

                    void popBack()
                    {
                        if (backBuff == Empty)
                            back();
                        backBuff = Empty;
                    }
                }

            private:

                R r;
                uint buff = Empty;      // one character lookahead buffer
                static if (isBidirectionalRange!R)
                    uint backBuff = Empty;
            }

            return Result(r);
        }
        else
        {
            // Transcoding to char or wchar: one code point is decoded from
            // the source and its encoding in C is buffered, separately for
            // the front and the back end of the range.
            static struct Result
            {
                this(return scope R r)
                {
                    this.r = r;
                }

                this(return scope R r, ushort pos, ushort fill, C[4 / C.sizeof] buf)
                {
                    this.r = r;
                    this.pos = pos;
                    this.fill = fill;
                    this.buf = buf;
                }

                static if (isBidirectionalRange!R)
                {
                    this(return scope R r, ushort frontPos, ushort frontFill,
                         ushort backPos, ushort backFill,
                         C[4 / C.sizeof] buf, C[4 / C.sizeof] backBuf)
                    {
                        this.r = r;
                        this.pos = frontPos;
                        this.fill = frontFill;
                        this.backPos = backPos;
                        this.backFill = backFill;
                        this.buf = buf;
                        this.backBuf = backBuf;
                    }
                }

                @property bool empty()
                {
                    static if (isBidirectionalRange!R)
                        return pos == fill && backPos == backFill && r.empty;
                    else
                        return pos == fill && r.empty;
                }

                @property auto front() scope // 'scope' required by call to decodeFront() below
                {
                    if (pos == fill)
                    {
                        pos = 0;
                        auto c = r.front;

                        static if (C.sizeof >= 2 && RC.sizeof >= 2)
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80;   // First non-ASCII.
                        if (c < firstMulti)
                        {
                            fill = 1;
                            r.popFront;
                            buf[pos] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                r.popFront;
                                dchar dc = c;
                            }
                            else
                                dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                            fill = cast(ushort) encode!(useReplacementDchar)(buf, dc);
                        }
                    }
                    return buf[pos];
                }

                void popFront()
                {
                    if (pos == fill)
                        front;
                    ++pos;
                }

                static if (isForwardRange!R)
                {
                    @property auto save()
                    {
                        static if (isBidirectionalRange!R)
                        {
                            return Result(r.save, pos, fill, backPos, backFill, buf, backBuf);
                        }
                        else
                        {
                            return Result(r.save, pos, fill, buf);
                        }
                    }
                }

                static if (isBidirectionalRange!R)
                {
                    @property auto back() scope // 'scope' required by call to decodeBack() below
                    {
                        // The back end uses its own buffer. Sharing `buf`
                        // with front() would let an interleaved
                        // front/back/front sequence overwrite the pending
                        // front code units.
                        if (backPos != backFill)
                            return backBuf[cast(ushort) (backFill - backPos - 1)];

                        backPos = 0;
                        auto c = r.back;
                        static if (C.sizeof >= 2 && RC.sizeof >= 2)
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80;   // First non-ASCII.
                        if (c < firstMulti)
                        {
                            backFill = 1;
                            r.popBack;
                            backBuf[cast(ushort) (backFill - backPos - 1)] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                r.popBack;
                                dchar dc = c;
                            }
                            else
                                dchar dc = () @trusted { return decodeBack!(useReplacementDchar)(r); }();
                            backFill = cast(ushort) encode!(useReplacementDchar)(backBuf, dc);
                        }
                        // Code units are handed out last-to-first.
                        return backBuf[cast(ushort) (backFill - backPos - 1)];
                    }

                    void popBack()
                    {
                        if (backPos == backFill)
                            back;
                        ++backPos;
                    }
                }

            private:

                R r;
                ushort pos, fill;               // read cursor / valid length of buf
                static if (isBidirectionalRange!R)
                    ushort backPos, backFill;   // same, for backBuf
                C[4 / C.sizeof] buf = void;     // encoding of the current front code point
                static if (isBidirectionalRange!R)
                    C[4 / C.sizeof] backBuf = void; // encoding of the current back code point
            }

            return Result(r);
        }
    }
}
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    // "hellö": in UTF-8 the ö occupies two code units,
    // whereas a single UTF-16 code unit is enough to hold it.
    string hello = "hell\u00F6";
    assert(equal(hello.byUTF!char, ['h', 'e', 'l', 'l', 0xC3, 0xB6]));
    assert(equal(hello.byUTF!wchar, ['h', 'e', 'l', 'l', 'ö']));

    // 𐐷 needs four UTF-8 code units, a UTF-16 surrogate pair,
    // or one UTF-32 code unit.
    string deseret = "𐐷";
    assert(equal(deseret.byUTF!char, [0xF0, 0x90, 0x90, 0xB7]));
    assert(equal(deseret.byUTF!wchar, [0xD801, 0xDC37]));
    assert(equal(deseret.byUTF!dchar, [0x00010437]));
}
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.exception : assertThrown;

    // Input containing an invalid UTF-8 sequence.
    auto malformed = "hello\xF0betty".byChar;

    // With replacement enabled the bad sequence comes out as U+FFFD ...
    assert(equal(malformed.byUTF!(dchar, UseReplacementDchar.yes), "hello\uFFFDetty"));

    // ... and with replacement disabled iterating the range throws.
    assertThrown!UTFException(equal(malformed.byUTF!(dchar, UseReplacementDchar.no), "hello betty"));
}
// Exercises `back` / `popBack` / `save` on ranges produced by byUTF for
// several source and target encodings. Each scope is independent.
@safe unittest
{
    // UTF-16 -> UTF-8: U+0219 becomes the two code units 0xC8 0x99.
    {
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!char;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x99);
        r.popBack;
        assert(r.back == 0xc8);
        r.popBack;
        assert(r.back == 'b');
    }

    // UTF-16 -> UTF-16: code units pass through unchanged.
    {
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!wchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    // UTF-16 -> UTF-32.
    {
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!dchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    // UTF-32 -> UTF-16: each code point becomes a surrogate pair, read
    // low surrogate first when walking from the back.
    {
        dchar[] s = ['𐐷', '😁'];
        auto r = s.byUTF!wchar;
        assert(r.back == 0xde01);
        r.popBack;
        assert(r.back == 0xd83d);
        r.popBack;
        assert(r.back == 0xdc37);
        r.popBack;
        assert(r.back == 0xd801);
    }

    // UTF-32 -> UTF-8: draining from the back yields the bytes reversed.
    {
        dchar[] s = ['𐐷', '😁'];
        auto r = s.byUTF!char;
        char[] res;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        import std.algorithm.comparison : equal;
        assert(res.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0]));
    }

    // ASCII only: draining from the back reverses the sequence.
    {
        dchar[] res;
        auto r = ['a', 'b', 'c', 'd', 'e'].byUTF!dchar;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        import std.algorithm.comparison : equal;
        assert(res.equal(['e', 'd', 'c', 'b', 'a']));
    }

    // testing the save() function: a saved copy must report the same back
    // element as the range it was taken from.
    {
        wchar[] s = ['Ă','ț'];

        auto rc = s.byUTF!char;
        rc.popBack;
        auto rcCopy = rc.save;
        assert(rc.back == rcCopy.back);
        assert(rcCopy.back == 0xc8);

        auto rd = s.byUTF!dchar;
        rd.popBack;
        auto rdCopy = rd.save;
        assert(rd.back == rdCopy.back);
        assert(rdCopy.back == 'Ă');
    }
}
@safe pure nothrow unittest
{
    import std.range.primitives;

    wchar[] source = ['ă', 'î'];

    // Expected elements when walking each range from its back.
    immutable ubyte[4] utf8FromBack = [0xae, 0xc3, 0x83, 0xc4];
    immutable wchar[2] charsFromBack = ['î', 'ă'];

    // UTF-8 view: every character expands to two code units.
    auto asUtf8 = source.byUTF!char;
    static assert(isBidirectionalRange!(typeof(asUtf8)));
    foreach (idx, unit; utf8FromBack)
    {
        // Pop between checks only, so the final element stays in place.
        if (idx != 0)
            asUtf8.popBack;
        assert(asUtf8.back == unit);
    }

    // UTF-16 view: one code unit per character.
    auto asUtf16 = source.byUTF!wchar;
    static assert(isBidirectionalRange!(typeof(asUtf16)));
    foreach (idx, ch; charsFromBack)
    {
        if (idx != 0)
            asUtf16.popBack;
        assert(asUtf16.back == ch);
    }

    // UTF-32 view: one code point per character.
    auto asUtf32 = source.byUTF!dchar;
    static assert(isBidirectionalRange!(typeof(asUtf32)));
    foreach (idx, ch; charsFromBack)
    {
        if (idx != 0)
            asUtf32.popBack;
        assert(asUtf32.back == ch);
    }
}