libphobos/src/std/utf.d

   1 // Written in the D programming language.
   2
   3 /++
   4     Encode and decode UTF-8, UTF-16 and UTF-32 strings.
   5
   6     UTF character support is restricted to
   7     $(D '\u0000' &lt;= character &lt;= '\U0010FFFF').
   8
   9 $(SCRIPT inhibitQuickIndex = 1;)
  10 $(DIVC quickindex,
  11 $(BOOKTABLE,
  12 $(TR $(TH Category) $(TH Functions))
  13 $(TR $(TD Decode) $(TD
  14     $(LREF decode)
  15     $(LREF decodeFront)
  16 ))
  17 $(TR $(TD Lazy decode) $(TD
  18     $(LREF byCodeUnit)
  19     $(LREF byChar)
  20     $(LREF byWchar)
  21     $(LREF byDchar)
  22     $(LREF byUTF)
  23 ))
  24 $(TR $(TD Encode) $(TD
  25     $(LREF encode)
  26     $(LREF toUTF8)
  27     $(LREF toUTF16)
  28     $(LREF toUTF32)
  29     $(LREF toUTFz)
  30     $(LREF toUTF16z)
  31 ))
  32 $(TR $(TD Length) $(TD
  33     $(LREF codeLength)
  34     $(LREF count)
  35     $(LREF stride)
  36     $(LREF strideBack)
  37 ))
  38 $(TR $(TD Index) $(TD
  39     $(LREF toUCSindex)
  40     $(LREF toUTFindex)
  41 ))
  42 $(TR $(TD Validation) $(TD
  43     $(LREF isValidDchar)
  44     $(LREF isValidCodepoint)
  45     $(LREF validate)
  46 ))
  47 $(TR $(TD Miscellaneous) $(TD
  48     $(LREF replacementDchar)
  49     $(LREF UseReplacementDchar)
  50     $(LREF UTFException)
  51 ))
  52 ))
  53     See_Also:
  54         $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
  55         $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
  56         $(LINK https://web.archive.org/web/20100113043530/https://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
  57     Copyright: Copyright The D Language Foundation 2000 - 2012.
  58     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
  59     Authors:   $(HTTP digitalmars.com, Walter Bright) and
  60                $(HTTP jmdavisprog.com, Jonathan M Davis)
  61     Source:    $(PHOBOSSRC std/utf.d)
  62    +/
  63 module std.utf;
  64
  65 import std.exception : basicExceptionCtors;
  66 import core.exception : UnicodeException;
  67 import std.meta : AliasSeq;
  68 import std.range;
  69 import std.traits : isAutodecodableString, isConvertibleToString,
  70     isSomeChar, isSomeString, isStaticArray, Unqual;
  71 import std.typecons : Flag, Yes, No;
  72
  73
  74 /++
  75     Exception thrown on errors in std.utf functions.
  76   +/
  77 class UTFException : UnicodeException // also carries the offending code units for toString()
  78 {
  79     import core.internal.string : unsignedToTempString, UnsignedStringBuf;
  80
  81     uint[4] sequence; // up to four offending code units, filled by setSequence
  82     size_t  len; // number of valid entries in `sequence`; 0 means none recorded
  83
  84     @safe pure nothrow @nogc
  85     UTFException setSequence(scope uint[] data...) return // records `data` and returns `this`
  86     {
  87         assert(data.length <= 4);
  88
  89         len = data.length < 4 ? data.length : 4; // clamp so the copy below cannot overrun `sequence`
  90         sequence[0 .. len] = data[0 .. len];
  91
  92         return this;
  93     }
  94
  95     // FIXME: Use std.exception.basicExceptionCtors here once
  96     // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed
  97
  98     /**
  99     Standard exception constructors.
 100      */
 101     this(string msg, string file = __FILE__, size_t line = __LINE__,
 102          Throwable next = null) @nogc @safe pure nothrow
 103     {
 104         super(msg, 0, file, line, next); // no index supplied: 0 is forwarded to UnicodeException
 105     }
 106     /// ditto
 107     this(string msg, size_t index, string file = __FILE__,
 108          size_t line = __LINE__, Throwable next = null) @safe pure nothrow
 109     {
 110         UnsignedStringBuf buf = void; // uninitialized scratch buffer handed to unsignedToTempString
 111         msg ~= " (at index " ~ unsignedToTempString(index, buf) ~ ")"; // message gains the failing index
 112         super(msg, index, file, line, next);
 113     }
 114
 115     /**
 116     Returns:
 117         A `string` detailing the invalid UTF sequence.
 118      */
 119     override string toString() const
 120     {
 121         if (len == 0) // no sequence recorded: defer to the base class description
 122         {
 123             /* Exception.toString() is not marked as const, although
 124              * it is const-compatible.
 125              */
 126             //return super.toString();
 127             auto e = () @trusted { return cast(Exception) super; } ();
 128             return e.toString();
 129         }
 130
 131         string result = "Invalid UTF sequence:";
 132
 133         foreach (i; sequence[0 .. len])
 134         {
 135             UnsignedStringBuf buf = void;
 136             result ~= ' ';
 137             auto h = unsignedToTempString!16(i, buf); // base-16 text of the code unit
 138             if (h.length == 1) // pad single digits to two characters
 139                 result ~= '0';
 140             result ~= h;
 141             result ~= 'x'; // each unit is written as "<hex>x" (suffix, not a "0x" prefix)
 142         }
 143
 144         if (super.msg.length > 0) // append the original message when there is one
 145         {
 146             result ~= " - ";
 147             result ~= super.msg;
 148         }
 149
 150         return result;
 151     }
 152 }
 153
 154 ///
 155 @safe unittest // encode must throw UTFException for surrogates and values above 0x10FFFF
 156 {
 157     import std.exception : assertThrown;
 158
 159     char[4] buf;
 160     assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); // first high surrogate
 161     assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); // last high surrogate
 162     assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); // first low surrogate
 163     assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); // last low surrogate
 164     assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); // one past the largest code point
 165 }
 166
 167 /*
 168    Provide array of invalidly encoded UTF strings. Useful for testing.
 169
 170    Params:
 171         Char = char, wchar, or dchar
 172
 173    Returns:
 174         an array of invalidly encoded UTF strings
 175  */
 176
 177 package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow // table is selected by `Char` at compile time
 178 if (isSomeChar!Char)
 179 {
 180     static if (is(Char == char))
 181     {
 182         enum x = 0xDC00;         // invalid surrogate value
 183         enum y = 0x110000;       // out of range
 184
 185         static immutable string[8] result =
 186         [
 187             "\x80",             // not a start byte
 188             "\xC0",             // truncated
 189             "\xC0\xC0",         // invalid continuation
 190             "\xF0\x82\x82\xAC", // overlong
 191             [ // three-unit encoding of the surrogate `x`
 192               0xE0 | (x >> 12),
 193               0x80 | ((x >> 6) & 0x3F),
 194               0x80 | (x & 0x3F)
 195             ],
 196             [ // four-unit encoding of the out-of-range value `y`
 197               cast(char)(0xF0 | (y >> 18)),
 198               cast(char)(0x80 | ((y >> 12) & 0x3F)),
 199               cast(char)(0x80 | ((y >> 6) & 0x3F)),
 200               cast(char)(0x80 | (y & 0x3F))
 201             ],
 202             [
 203               cast(char)(0xF8 | 3),     // 5 byte encoding
 204               cast(char)(0x80 | 3),
 205               cast(char)(0x80 | 3),
 206               cast(char)(0x80 | 3),
 207               cast(char)(0x80 | 3),
 208             ],
 209             [
 210               cast(char)(0xFC | 3),     // 6 byte encoding
 211               cast(char)(0x80 | 3),
 212               cast(char)(0x80 | 3),
 213               cast(char)(0x80 | 3),
 214               cast(char)(0x80 | 3),
 215               cast(char)(0x80 | 3),
 216             ],
 217         ];
 218
 219         return result[]; // slice of the static table
 220     }
 221     else static if (is(Char == wchar))
 222     {
 223         static immutable wstring[5] result =
 224         [
 225             [
 226               cast(wchar) 0xDC00, // lone low surrogate
 227             ],
 228             [
 229               cast(wchar) 0xDFFF, // lone low surrogate (upper bound of the range)
 230             ],
 231             [
 232               cast(wchar) 0xDBFF, // high surrogate ...
 233               cast(wchar) 0xDBFF, // ... followed by another high surrogate
 234             ],
 235             [
 236               cast(wchar) 0xDBFF, // high surrogate ...
 237               cast(wchar) 0xE000, // ... followed by a non-surrogate unit
 238             ],
 239             [
 240               cast(wchar) 0xD800, // lone high surrogate
 241             ],
 242         ];
 243
 244         return result[]; // slice of the static table
 245     }
 246     else static if (is(Char == dchar))
 247     {
 248         static immutable dstring[3] result =
 249         [
 250             [ cast(dchar) 0x110000 ], // above 0x10FFFF
 251             [ cast(dchar) 0x00D800 ], // surrogate code point
 252             [ cast(dchar) 0x00DFFF ], // surrogate code point
 253         ];
 254
 255         return result; // NOTE(review): returns the static array itself, unlike the `result[]` slices above - confirm intended
 256     }
 257     else
 258         static assert(0);
 259 }
 260
 261 /++
 262     Check whether the given Unicode code point is valid.
 263
 264     Params:
 265         c = code point to check
 266
 267     Returns:
 268         `true` if and only if `c` is a valid Unicode code point
 269
 270     Note:
 271     `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
 272     as they are permitted for internal use by an application, but they are
 273     not allowed for interchange by the Unicode standard.
 274   +/
 275 bool isValidDchar(dchar c) pure nothrow @safe @nogc // range check only; no table lookup
 276 {
 277     return c < 0xD800 || (c > 0xDFFF && c <= 0x10FFFF); // up to 0x10FFFF, excluding 0xD800 .. 0xDFFF
 278 }
 279
 280 ///
 281 @safe @nogc pure nothrow unittest // documented example for isValidDchar
 282 {
 283     assert( isValidDchar(cast(dchar) 0x41));
 284     assert( isValidDchar(cast(dchar) 0x00));
 285     assert(!isValidDchar(cast(dchar) 0xD800)); // inside the excluded 0xD800 .. 0xDFFF range
 286     assert(!isValidDchar(cast(dchar) 0x11FFFF)); // above 0x10FFFF
 287 }
 288
 289 pure nothrow @safe @nogc unittest // boundary checks for isValidDchar, wrapped in assertCTFEable
 290 {
 291     import std.exception;
 292
 293     assertCTFEable!(
 294     {
 295     assert( isValidDchar(cast(dchar)'a') == true);
 296     assert( isValidDchar(cast(dchar) 0x1FFFFF) == false);
 297
 298     assert(!isValidDchar(cast(dchar) 0x00D800)); // lower bound of the excluded range
 299     assert(!isValidDchar(cast(dchar) 0x00DBFF));
 300     assert(!isValidDchar(cast(dchar) 0x00DC00));
 301     assert(!isValidDchar(cast(dchar) 0x00DFFF)); // upper bound of the excluded range
 302     assert( isValidDchar(cast(dchar) 0x00FFFE));
 303     assert( isValidDchar(cast(dchar) 0x00FFFF));
 304     assert( isValidDchar(cast(dchar) 0x01FFFF));
 305     assert( isValidDchar(cast(dchar) 0x10FFFF)); // largest accepted value
 306     assert(!isValidDchar(cast(dchar) 0x110000)); // first rejected value above it
 307     });
 308 }
 309
 310 /**
 311 Checks if a single character forms a valid code point.
 312
 313 When standing alone, some characters are invalid code points. For
 314 example the `wchar` `0xD800` is a so called high surrogate, which can
 315 only be interpreted together with a low surrogate following it. As a
 316 standalone character it is considered invalid.
 317
 318 See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
 319 Unicode Standard, D90, D91 and D92) for more details.
 320
 321 Params:
 322     c = character to test
 323     Char = character type of `c`
 324
 325 Returns:
 326     `true`, if `c` forms a valid code point.
 327  */
 328 bool isValidCodepoint(Char)(Char c) // dispatches on the unqualified character type of `c`
 329 if (isSomeChar!Char)
 330 {
 331     alias UChar = typeof(cast() c); // type of `c` with its qualifiers removed
 332     static if (is(UChar == char))
 333     {
 334         return c <= 0x7F; // only values up to 0x7F are accepted for a single `char`
 335     }
 336     else static if (is(UChar == wchar))
 337     {
 338         return c <= 0xD7FF || c >= 0xE000; // anything outside 0xD800 .. 0xDFFF
 339     }
 340     else static if (is(UChar == dchar))
 341     {
 342         return isValidDchar(c);
 343     }
 344     else
 345         static assert(false, "unknown character type: `" ~ Char.stringof ~ "`");
 346 }
 347
 348 ///
 349 @safe pure nothrow unittest // one accepted and one rejected value per character type
 350 {
 351     assert( isValidCodepoint(cast(char) 0x40));
 352     assert(!isValidCodepoint(cast(char) 0x80)); // above 0x7F
 353     assert( isValidCodepoint(cast(wchar) 0x1234));
 354     assert(!isValidCodepoint(cast(wchar) 0xD800)); // inside 0xD800 .. 0xDFFF
 355     assert( isValidCodepoint(cast(dchar) 0x0010FFFF));
 356     assert(!isValidCodepoint(cast(dchar) 0x12345678)); // above 0x10FFFF
 357 }
 358
 359 /++
 360     Calculate the length of the UTF sequence starting at `index`
 361     in `str`.
 362
 363     Params:
 364         str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 365         of UTF code units. Must be random access if `index` is passed
 366         index = starting index of UTF sequence (default: `0`)
 367
 368     Returns:
 369         The number of code units in the UTF sequence. For UTF-8, this is a
 370         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
 371         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
 372
 373     Throws:
 374         May throw a `UTFException` if `str[index]` is not the start of a
 375         valid UTF sequence.
 376
 377     Note:
 378         `stride` will only analyze the first `str[index]` element. It
 379         will not fully verify the validity of the UTF sequence, nor even verify
 380         the presence of the sequence: it will not actually guarantee that
 381         $(D index + stride(str, index) <= str.length).
 382   +/
 383 uint stride(S)(auto ref S str, size_t index) // UTF-8 overload: length of the sequence starting at str[index]
 384 if (is(S : const char[]) ||
 385     (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
 386 {
 387     static if (is(typeof(str.length) : ulong)) // bounds check only when `str` exposes a length
 388         assert(index < str.length, "Past the end of the UTF-8 sequence");
 389     immutable c = str[index];
 390
 391     if (c < 0x80) // values below 0x80 occupy one code unit
 392         return 1;
 393     else
 394         return strideImpl(c, index); // length derived from this byte alone; strideImpl throws UTFException on an invalid one
 395 }
 396
 397 /// Ditto
 398 uint stride(S)(auto ref S str) // UTF-8 overload without index: inspects only the first code unit
 399 if (is(S : const char[]) ||
 400     (isInputRange!S && is(immutable ElementType!S == immutable char)))
 401 {
 402     static if (is(S : const char[])) // arrays are indexed directly; other ranges go through `front`
 403         immutable c = str[0];
 404     else
 405         immutable c = str.front;
 406
 407     if (c < 0x80) // values below 0x80 occupy one code unit
 408         return 1;
 409     else
 410         return strideImpl(c, 0); // index 0 is what a thrown UTFException will report
 411 }
 412
 413 @system unittest // stride on UTF-8 must agree with codeLength!char for strings and the test ranges
 414 {
 415     import core.exception : AssertError;
 416     import std.conv : to;
 417     import std.exception;
 418     import std.string : format;
 419     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 420     static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__) // `c` is the character expected at code unit `i`
 421     {
 422         enforce(stride(s, i) == codeLength!char(c),
 423                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 424
 425         enforce(stride(RandomCU!char(s), i) == codeLength!char(c), // RandomCU and the other *CU test ranges are declared outside this excerpt
 426                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 427
 428         auto refRandom = new RefRandomCU!char(s);
 429         immutable randLen = refRandom.length;
 430         enforce(stride(refRandom, i) == codeLength!char(c),
 431                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 432         enforce(refRandom.length == randLen, // stride must leave the range's length unchanged
 433                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 434
 435         if (i == 0) // the index-less overloads can only be compared at position 0
 436         {
 437             enforce(stride(s) == codeLength!char(c),
 438                     new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
 439
 440             enforce(stride(InputCU!char(s)) == codeLength!char(c),
 441                     new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
 442
 443             auto refBidir = new RefBidirCU!char(s);
 444             immutable bidirLen = refBidir.length;
 445             enforce(stride(refBidir) == codeLength!char(c),
 446                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 447             enforce(refBidir.length == bidirLen, // stride must leave the range's length unchanged
 448                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
 449         }
 450     }
 451
 452     assertCTFEable!(
 453     {
 454     test("a", 'a');
 455     test(" ", ' ');
 456     test("\u2029", '\u2029'); //paraSep
 457     test("\u0100", '\u0100');
 458     test("\u0430", '\u0430');
 459     test("\U00010143", '\U00010143');
 460     test("abcdefcdef", 'a');
 461     test("hello\U00010143\u0100\U00010143", 'h', 0);
 462     test("hello\U00010143\u0100\U00010143", 'e', 1);
 463     test("hello\U00010143\u0100\U00010143", 'l', 2);
 464     test("hello\U00010143\u0100\U00010143", 'l', 3);
 465     test("hello\U00010143\u0100\U00010143", 'o', 4);
 466     test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
 467     test("hello\U00010143\u0100\U00010143", '\u0100', 9); // 5 ASCII units + 4 units for U+10143
 468     test("hello\U00010143\u0100\U00010143", '\U00010143', 11); // previous offset + 2 units for U+0100
 469
 470     foreach (S; AliasSeq!(char[], const char[], string)) // attribute inference check for each array flavour
 471     {
 472         enum str = to!S("hello world");
 473         static assert(isSafe!({ stride(str, 0); }));
 474         static assert(isSafe!({ stride(str);    }));
 475         static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
 476         static assert((functionAttributes!({ stride(str);    }) & FunctionAttribute.pure_) != 0);
 477     }
 478     });
 479 }
 480
 481 @safe unittest // invalid start bytes
 482 {
 483     import std.exception : assertThrown;
 484     immutable char[] invalidStartBytes = [
 485         0b1111_1000, // indicating a sequence length of 5
 486         0b1111_1100, // 6
 487         0b1111_1110, // 7
 488         0b1111_1111, // 8
 489         0b1000_0000, // continuation byte
 490     ];
 491     foreach (c; invalidStartBytes) // each byte is wrapped in a one-element array and must make stride throw
 492         assertThrown!UTFException(stride([c]));
 493 }
 494
 495 /// Ditto
 496 uint stride(S)(auto ref S str, size_t index) // UTF-16 overload: 1 or 2 code units starting at str[index]
 497 if (is(S : const wchar[]) ||
 498     (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
 499 {
 500     static if (is(typeof(str.length) : ulong)) // bounds check only when `str` exposes a length
 501         assert(index < str.length, "Past the end of the UTF-16 sequence");
 502     immutable uint u = str[index];
 503     return 1 + (u >= 0xD800 && u <= 0xDBFF); // 2 when `u` lies in 0xD800 .. 0xDBFF (high surrogate), else 1
 504 }
 505
 506 /// Ditto
 507 uint stride(S)(auto ref S str) @safe pure // UTF-16 array overload without index
 508 if (is(S : const wchar[]))
 509 {
 510     return stride(str, 0); // forwards to the indexed overload at position 0
 511 }
 512
 513 /// Ditto
 514 uint stride(S)(auto ref S str) // UTF-16 overload for non-array input ranges; reads only `front`
 515 if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
 516     !is(S : const wchar[]))
 517 {
 518     assert(!str.empty, "UTF-16 sequence is empty");
 519     immutable uint u = str.front;
 520     return 1 + (u >= 0xD800 && u <= 0xDBFF); // 2 when `u` lies in 0xD800 .. 0xDBFF (high surrogate), else 1
 521 }
 522
 523 @system unittest // stride on UTF-16 must agree with codeLength!wchar for strings and the test ranges
 524 {
 525     import core.exception : AssertError;
 526     import std.conv : to;
 527     import std.exception;
 528     import std.string : format;
 529     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 530     static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__) // `c` is the character expected at code unit `i`
 531     {
 532         enforce(stride(s, i) == codeLength!wchar(c),
 533                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 534
 535         enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c), // RandomCU and the other *CU test ranges are declared outside this excerpt
 536                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 537
 538         auto refRandom = new RefRandomCU!wchar(s);
 539         immutable randLen = refRandom.length;
 540         enforce(stride(refRandom, i) == codeLength!wchar(c),
 541                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 542         enforce(refRandom.length == randLen, // stride must leave the range's length unchanged
 543                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 544
 545         if (i == 0) // the index-less overloads can only be compared at position 0
 546         {
 547             enforce(stride(s) == codeLength!wchar(c),
 548                     new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
 549
 550             enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c),
 551                     new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
 552
 553             auto refBidir = new RefBidirCU!wchar(s);
 554             immutable bidirLen = refBidir.length;
 555             enforce(stride(refBidir) == codeLength!wchar(c),
 556                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 557             enforce(refBidir.length == bidirLen, // stride must leave the range's length unchanged
 558                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
 559         }
 560     }
 561
 562     assertCTFEable!(
 563     {
 564     test("a", 'a');
 565     test(" ", ' ');
 566     test("\u2029", '\u2029'); //paraSep
 567     test("\u0100", '\u0100');
 568     test("\u0430", '\u0430');
 569     test("\U00010143", '\U00010143');
 570     test("abcdefcdef", 'a');
 571     test("hello\U00010143\u0100\U00010143", 'h', 0);
 572     test("hello\U00010143\u0100\U00010143", 'e', 1);
 573     test("hello\U00010143\u0100\U00010143", 'l', 2);
 574     test("hello\U00010143\u0100\U00010143", 'l', 3);
 575     test("hello\U00010143\u0100\U00010143", 'o', 4);
 576     test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
 577     test("hello\U00010143\u0100\U00010143", '\u0100', 7); // 5 units + 2 units (surrogate pair) for U+10143
 578     test("hello\U00010143\u0100\U00010143", '\U00010143', 8); // previous offset + 1 unit for U+0100
 579
 580     foreach (S; AliasSeq!(wchar[], const wchar[], wstring)) // attribute inference check for each array flavour
 581     {
 582         enum str = to!S("hello world");
 583         static assert(isSafe!(() => stride(str, 0)));
 584         static assert(isSafe!(() => stride(str)   ));
 585         static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
 586         static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
 587     }
 588     });
 589 }
 590
 591 /// Ditto
 592 uint stride(S)(auto ref S str, size_t index = 0) // UTF-32 overload: always 1; arguments are only sanity-checked
 593 if (is(S : const dchar[]) ||
 594     (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
 595 {
 596     static if (is(typeof(str.length) : ulong)) // with a length: check the index; without: check non-emptiness
 597         assert(index < str.length, "Past the end of the UTF-32 sequence");
 598     else
 599         assert(!str.empty, "UTF-32 sequence is empty.");
 600     return 1;
 601 }
 602
 603 ///
 604 @safe unittest // documented example: stride on UTF-8 string literals
 605 {
 606     assert("a".stride == 1);
 607     assert("λ".stride == 2);
 608     assert("aλ".stride == 1); // only the first sequence is measured
 609     assert("aλ".stride(1) == 2); // sequence starting at code unit 1
 610     assert("𐐷".stride == 4);
 611 }
 612
 613 @system unittest // stride on UTF-32 must agree with codeLength!dchar for strings and the test ranges
 614 {
 615     import core.exception : AssertError;
 616     import std.conv : to;
 617     import std.exception;
 618     import std.string : format;
 619     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 620     static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__) // `c` is the character expected at code unit `i`
 621     {
 622         enforce(stride(s, i) == codeLength!dchar(c),
 623                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 624
 625         enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c), // RandomCU and the other *CU test ranges are declared outside this excerpt
 626                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 627
 628         auto refRandom = new RefRandomCU!dchar(s);
 629         immutable randLen = refRandom.length;
 630         enforce(stride(refRandom, i) == codeLength!dchar(c),
 631                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 632         enforce(refRandom.length == randLen, // stride must leave the range's length unchanged
 633                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 634
 635         if (i == 0) // the index-less calls can only be compared at position 0
 636         {
 637             enforce(stride(s) == codeLength!dchar(c),
 638                     new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
 639
 640             enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c),
 641                     new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
 642
 643             auto refBidir = new RefBidirCU!dchar(s);
 644             immutable bidirLen = refBidir.length;
 645             enforce(stride(refBidir) == codeLength!dchar(c),
 646                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 647             enforce(refBidir.length == bidirLen, // stride must leave the range's length unchanged
 648                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
 649         }
 650     }
 651
 652     assertCTFEable!(
 653     {
 654     test("a", 'a');
 655     test(" ", ' ');
 656     test("\u2029", '\u2029'); //paraSep
 657     test("\u0100", '\u0100');
 658     test("\u0430", '\u0430');
 659     test("\U00010143", '\U00010143');
 660     test("abcdefcdef", 'a');
 661     test("hello\U00010143\u0100\U00010143", 'h', 0);
 662     test("hello\U00010143\u0100\U00010143", 'e', 1);
 663     test("hello\U00010143\u0100\U00010143", 'l', 2);
 664     test("hello\U00010143\u0100\U00010143", 'l', 3);
 665     test("hello\U00010143\u0100\U00010143", 'o', 4);
 666     test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
 667     test("hello\U00010143\u0100\U00010143", '\u0100', 6); // every character is one unit in UTF-32
 668     test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
 669
 670     foreach (S; AliasSeq!(dchar[], const dchar[], dstring)) // attribute inference check for each array flavour
 671     {
 672         enum str = to!S("hello world");
 673         static assert(isSafe!(() => stride(str, 0)));
 674         static assert(isSafe!(() => stride(str)   ));
 675         static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
 676         static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
 677     }
 678     });
 679 }
 680
 681 private uint strideImpl(char c, size_t index) @trusted pure // UTF-8 sequence length from a non-ASCII lead byte; `index` is only used in the error
 682 in { assert(c & 0x80); } // callers handle values below 0x80 themselves
 683 do
 684 {
 685     import core.bitop : bsr;
 686     immutable msbs = 7 - bsr((~uint(c)) & 0xFF); // count of leading 1 bits: 7 minus the position of the highest 0 bit in `c`
 687     if (c == 0xFF || msbs < 2 || msbs > 4) // 0xFF inverts to 0 (no bit for bsr to find); 1 leading bit is a continuation byte; more than 4 is too long
 688         throw new UTFException("Invalid UTF-8 sequence", index);
 689     return msbs;
 690 }
 691
 692 /++
 693     Calculate the length of the UTF sequence ending one code unit before
 694     `index` in `str`.
 695
 696     Params:
 697         str = bidirectional range of UTF code units. Must be random access if
 698         `index` is passed
 699         index = index one past end of UTF sequence (default: `str.length`)
 700
 701     Returns:
 702         The number of code units in the UTF sequence. For UTF-8, this is a
 703         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
 704         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
 705
 706     Throws:
 707         May throw a `UTFException` if `str[index]` is not one past the
 708         end of a valid UTF sequence.
 709
 710     Note:
 711         `strideBack` will only analyze the $(D str[index - 1])
 712         element. It will not fully verify the validity of the UTF sequence, nor
 713         even verify the presence of the sequence: it will not actually
 714         guarantee that $(D strideBack(str, index) <= index).
 715   +/
 716 uint strideBack(S)(auto ref S str, size_t index) // UTF-8 overload: length of the sequence ending just before `index`
 717 if (is(S : const char[]) ||
 718     (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
 719 {
 720     static if (is(typeof(str.length) : ulong)) // upper bound check only when `str` exposes a length
 721         assert(index <= str.length, "Past the end of the UTF-8 sequence");
 722     assert(index > 0, "Not the end of the UTF-8 sequence");
 723
 724     if ((str[index-1] & 0b1100_0000) != 0b1000_0000) // last unit is not of the form 10xx_xxxx (continuation): length 1
 725         return 1;
 726
 727     if (index >= 4) //single verification for most common case
 728     {
 729         static foreach (i; 2 .. 5) // look back 2, 3, 4 units for the first non-continuation unit
 730         {
 731             if ((str[index-i] & 0b1100_0000) != 0b1000_0000)
 732                 return i;
 733         }
 734     }
 735     else
 736     {
 737         static foreach (i; 2 .. 4) // index is below 4 here, so at most 3 units exist before it
 738         {
 739             if (index >= i && (str[index-i] & 0b1100_0000) != 0b1000_0000) // `index >= i` keeps the read inside the range
 740                 return i;
 741         }
 742     }
 743     throw new UTFException("Not the end of the UTF sequence", index); // every inspected unit was a continuation unit
 744 }
 745
 746 /// Ditto
 747 uint strideBack(S)(auto ref S str) // UTF-8 overload without index for arrays and random-access ranges with length
 748 if (is(S : const char[]) ||
 749     (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char)))
 750 {
 751     return strideBack(str, str.length); // measure the sequence that ends at the end of `str`
 752 }
 753
 754 /// Ditto
 755 uint strideBack(S)(auto ref S str) // UTF-8 overload for bidirectional ranges that are not random access
 756 if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S)
 757 {
 758     assert(!str.empty, "Past the end of the UTF-8 sequence");
 759     auto temp = str.save; // walk a saved copy instead of popping from `str` itself
 760     foreach (i; AliasSeq!(1, 2, 3, 4)) // inspect at most the last four code units
 761     {
 762         if ((temp.back & 0b1100_0000) != 0b1000_0000) // first non-continuation unit from the back ends the search
 763             return i;
 764         temp.popBack();
 765         if (temp.empty) // ran out of input while still on continuation units
 766             break;
 767     }
 768     throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
 769 }
 770
 771 @system unittest // strideBack on UTF-8 must agree with codeLength!char for strings and the test ranges
 772 {
 773     import core.exception : AssertError;
 774     import std.conv : to;
 775     import std.exception;
 776     import std.string : format;
 777     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 778     static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__) // i == size_t.max stands for "use s.length"
 779     {
 780         enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!char(c),
 781                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 782
 783         enforce(strideBack(RandomCU!char(s), i == size_t.max ? s.length : i) == codeLength!char(c), // RandomCU and the other *CU test ranges are declared outside this excerpt
 784                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 785
 786         auto refRandom = new RefRandomCU!char(s);
 787         immutable randLen = refRandom.length;
 788         enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!char(c),
 789                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 790         enforce(refRandom.length == randLen, // strideBack must leave the range's length unchanged
 791                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 792
 793         if (i == size_t.max) // the index-less overloads can only be compared at the end of `s`
 794         {
 795             enforce(strideBack(s) == codeLength!char(c),
 796                     new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));
 797
 798             enforce(strideBack(BidirCU!char(s)) == codeLength!char(c),
 799                     new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));
 800
 801             auto refBidir = new RefBidirCU!char(s);
 802             immutable bidirLen = refBidir.length;
 803             enforce(strideBack(refBidir) == codeLength!char(c),
 804                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 805             enforce(refBidir.length == bidirLen, // strideBack must leave the range's length unchanged
 806                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
 807         }
 808     }
 809
 810     assertCTFEable!(
 811     {
 812     test("a", 'a');
 813     test(" ", ' ');
 814     test("\u2029", '\u2029'); //paraSep
 815     test("\u0100", '\u0100');
 816     test("\u0430", '\u0430');
 817     test("\U00010143", '\U00010143');
 818     test("abcdefcdef", 'f');
 819     test("\U00010143\u0100\U00010143hello", 'o', 15); // 4 + 2 + 4 + 5 = 15 code units in total
 820     test("\U00010143\u0100\U00010143hello", 'l', 14);
 821     test("\U00010143\u0100\U00010143hello", 'l', 13);
 822     test("\U00010143\u0100\U00010143hello", 'e', 12);
 823     test("\U00010143\u0100\U00010143hello", 'h', 11);
 824     test("\U00010143\u0100\U00010143hello", '\U00010143', 10); // second U+10143 ends at unit 10
 825     test("\U00010143\u0100\U00010143hello", '\u0100', 6); // U+0100 ends at unit 6
 826     test("\U00010143\u0100\U00010143hello", '\U00010143', 4); // first U+10143 ends at unit 4
 827
 828     foreach (S; AliasSeq!(char[], const char[], string)) // attribute inference check for each array flavour
 829     {
 830         enum str = to!S("hello world");
 831         static assert(isSafe!({ strideBack(str, 0); }));
 832         static assert(isSafe!({ strideBack(str);    }));
 833         static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
 834         static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
 835     }
 836     });
 837 }
 838
 839 //UTF-16 is self synchronizing: The length of strideBack can be found from
 840 //the value of a single wchar
 841 /// Ditto
 842 uint strideBack(S)(auto ref S str, size_t index)
 843 if (is(S : const wchar[]) ||
 844     (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
 845 {
 846     static if (is(typeof(str.length) : ulong))
 847         assert(index <= str.length, "Past the end of the UTF-16 sequence");
 848     assert(index > 0, "Not the end of a UTF-16 sequence");
 849
 850     immutable c2 = str[index-1];
 851     return 1 + (0xDC00 <= c2 && c2 < 0xE000);
 852 }
 853
 854 /// Ditto
 855 uint strideBack(S)(auto ref S str)
 856 if (is(S : const wchar[]) ||
 857     (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
 858 {
 859     assert(!str.empty, "UTF-16 sequence is empty");
 860
 861     static if (is(S : const(wchar)[]))
 862         immutable c2 = str[$ - 1];
 863     else
 864         immutable c2 = str.back;
 865
 866     return 1 + (0xDC00 <= c2 && c2 <= 0xE000);
 867 }
 868
 869 @system unittest
 870 {
 871     import core.exception : AssertError;
 872     import std.conv : to;
 873     import std.exception;
 874     import std.string : format;
 875     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 876     static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
 877     {
 878         enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!wchar(c),
 879                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 880
 881         enforce(strideBack(RandomCU!wchar(s), i == size_t.max ? s.length : i) == codeLength!wchar(c),
 882                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 883
 884         auto refRandom = new RefRandomCU!wchar(s);
 885         immutable randLen = refRandom.length;
 886         enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!wchar(c),
 887                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 888         enforce(refRandom.length == randLen,
 889                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 890
 891         if (i == size_t.max)
 892         {
 893             enforce(strideBack(s) == codeLength!wchar(c),
 894                     new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));
 895
 896             enforce(strideBack(BidirCU!wchar(s)) == codeLength!wchar(c),
 897                     new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));
 898
 899             auto refBidir = new RefBidirCU!wchar(s);
 900             immutable bidirLen = refBidir.length;
 901             enforce(strideBack(refBidir) == codeLength!wchar(c),
 902                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 903             enforce(refBidir.length == bidirLen,
 904                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
 905         }
 906     }
 907
 908     assertCTFEable!(
 909     {
 910     test("a", 'a');
 911     test(" ", ' ');
 912     test("\u2029", '\u2029'); //paraSep
 913     test("\u0100", '\u0100');
 914     test("\u0430", '\u0430');
 915     test("\U00010143", '\U00010143');
 916     test("abcdefcdef", 'f');
 917     test("\U00010143\u0100\U00010143hello", 'o', 10);
 918     test("\U00010143\u0100\U00010143hello", 'l', 9);
 919     test("\U00010143\u0100\U00010143hello", 'l', 8);
 920     test("\U00010143\u0100\U00010143hello", 'e', 7);
 921     test("\U00010143\u0100\U00010143hello", 'h', 6);
 922     test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
 923     test("\U00010143\u0100\U00010143hello", '\u0100', 3);
 924     test("\U00010143\u0100\U00010143hello", '\U00010143', 2);
 925
 926     foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
 927     {
 928         enum str = to!S("hello world");
 929         static assert(isSafe!(() => strideBack(str, 0)));
 930         static assert(isSafe!(() => strideBack(str)   ));
 931         static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
 932         static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
 933     }
 934     });
 935 }
 936
 937 /// Ditto
 938 uint strideBack(S)(auto ref S str, size_t index)
 939 if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar))
 940 {
 941     static if (is(typeof(str.length) : ulong))
 942         assert(index <= str.length, "Past the end of the UTF-32 sequence");
 943     assert(index > 0, "Not the end of the UTF-32 sequence");
 944     return 1;
 945 }
 946
 947 /// Ditto
 948 uint strideBack(S)(auto ref S str)
 949 if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar))
 950 {
 951     assert(!str.empty, "Empty UTF-32 sequence");
 952     return 1;
 953 }
 954
 955 ///
 956 @safe unittest
 957 {
 958     assert("a".strideBack == 1);
 959     assert("λ".strideBack == 2);
 960     assert("aλ".strideBack == 2);
 961     assert("aλ".strideBack(1) == 1);
 962     assert("𐐷".strideBack == 4);
 963 }
 964
 965 @system unittest
 966 {
 967     import core.exception : AssertError;
 968     import std.conv : to;
 969     import std.exception;
 970     import std.string : format;
 971     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 972     static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
 973     {
 974         enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!dchar(c),
 975                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 976
 977         enforce(strideBack(RandomCU!dchar(s), i == size_t.max ? s.length : i) == codeLength!dchar(c),
 978                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 979
 980         auto refRandom = new RefRandomCU!dchar(s);
 981         immutable randLen = refRandom.length;
 982         enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!dchar(c),
 983                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 984         enforce(refRandom.length == randLen,
 985                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 986
 987         if (i == size_t.max)
 988         {
 989             enforce(strideBack(s) == codeLength!dchar(c),
 990                     new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));
 991
 992             enforce(strideBack(BidirCU!dchar(s)) == codeLength!dchar(c),
 993                     new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));
 994
 995             auto refBidir = new RefBidirCU!dchar(s);
 996             immutable bidirLen = refBidir.length;
 997             enforce(strideBack(refBidir) == codeLength!dchar(c),
 998                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 999             enforce(refBidir.length == bidirLen,
1000                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
1001         }
1002     }
1003
1004     assertCTFEable!(
1005     {
1006     test("a", 'a');
1007     test(" ", ' ');
1008     test("\u2029", '\u2029'); //paraSep
1009     test("\u0100", '\u0100');
1010     test("\u0430", '\u0430');
1011     test("\U00010143", '\U00010143');
1012     test("abcdefcdef", 'f');
1013     test("\U00010143\u0100\U00010143hello", 'o', 8);
1014     test("\U00010143\u0100\U00010143hello", 'l', 7);
1015     test("\U00010143\u0100\U00010143hello", 'l', 6);
1016     test("\U00010143\u0100\U00010143hello", 'e', 5);
1017     test("\U00010143\u0100\U00010143hello", 'h', 4);
1018     test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
1019     test("\U00010143\u0100\U00010143hello", '\u0100', 2);
1020     test("\U00010143\u0100\U00010143hello", '\U00010143', 1);
1021
1022     foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
1023     {
1024         enum str = to!S("hello world");
1025         static assert(isSafe!(() => strideBack(str, 0)));
1026         static assert(isSafe!(() => strideBack(str)   ));
1027         static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
1028         static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
1029     }
1030     });
1031 }
1032
1033
/++
    Converts a code unit index into a code point index.

    `index` must refer to the first code unit of a code point in `str`
    (or be one past the end of a code point). The result is the number of
    code points that precede that position.

    Throws:
        $(LREF UTFException) if walking `str` from the start steps over
        `index`, i.e. `index` is not on a code point boundary.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // One code unit per code point: the two indices coincide.
        return index;
    }
    else
    {
        size_t codePoints = 0;
        size_t unit = 0;

        // Hop from code point to code point until `index` is reached or passed.
        while (unit < index)
        {
            unit += stride(str, unit);
            ++codePoints;
        }

        // Overshooting means `index` pointed into the middle of a sequence.
        if (unit > index)
        {
            enum msg = is(immutable C == immutable char)
                ? "Invalid UTF-8 sequence"
                : "Invalid UTF-16 sequence";
            throw new UTFException(msg, index);
        }

        return codePoints;
    }
}
1065
///
@safe unittest
{
    // ASCII only: the indices agree in every encoding.
    assert("hello world".toUCSindex(7) == 7);
    assert("hello world"w.toUCSindex(7) == 7);
    assert("hello world"d.toUCSindex(7) == 7);

    // 'é' takes two UTF-8 code units, so the UTF-8 result is one less.
    assert("Ma Chérie".toUCSindex(7) == 6);
    assert("Ma Chérie"w.toUCSindex(7) == 7);
    assert("Ma Chérie"d.toUCSindex(7) == 7);

    // Three UTF-8 code units per character here.
    assert("さいごの果実 / ミツバチと科学者".toUCSindex(9) == 3);
    assert("さいごの果実 / ミツバチと科学者"w.toUCSindex(9) == 9);
    assert("さいごの果実 / ミツバチと科学者"d.toUCSindex(9) == 9);
}
1081
1082
/++
    Converts a code point index into a code unit index.

    `n` is the number of code points to skip from the start of `str`; the
    result is the array index of the code unit at which the next code point
    begins.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // One code unit per code point: the two indices coincide.
        return n;
    }
    else
    {
        size_t unit = 0;

        // Skip `n` code points, advancing by each one's width.
        for (; n != 0; --n)
            unit += stride(str, unit);

        return unit;
    }
}
1105
///
@safe unittest
{
    // ASCII only: the indices agree in every encoding.
    assert("hello world".toUTFindex(7) == 7);
    assert("hello world"w.toUTFindex(7) == 7);
    assert("hello world"d.toUTFindex(7) == 7);

    // 'é' takes two UTF-8 code units, so six code points span seven units.
    assert("Ma Chérie".toUTFindex(6) == 7);
    assert("Ma Chérie"w.toUTFindex(7) == 7);
    assert("Ma Chérie"d.toUTFindex(7) == 7);

    // Three UTF-8 code units per character here.
    assert("さいごの果実 / ミツバチと科学者".toUTFindex(3) == 9);
    assert("さいごの果実 / ミツバチと科学者"w.toUTFindex(9) == 9);
    assert("さいごの果実 / ミツバチと科学者"d.toUTFindex(9) == 9);
}
1121
1122
1123 /* =================== Decode ======================= */
1124
/// Whether invalid UTF is replaced with $(LREF replacementDchar) (`Yes`) or reported by throwing (`No`)
alias UseReplacementDchar = Flag!"useReplacementDchar";
1127
/++
    Decodes the code point that begins at `str[index]` and returns it.
    On success `index` is moved to one past the decoded code point. If the
    code point is not well-formed, a `UTFException` is thrown and `index`
    is left unchanged.

    `decode` requires a string or a random access range of code units with
    length and slicing; $(LREF decodeFront) accepts any input range of code
    units.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        index = starting index into `str`; incremented by number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str[index]` is not the start of a valid UTF
        sequence and useReplacementDchar is `No.useReplacementDchar`
  +/
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index)
if (!isSomeString!S &&
    isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S))
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Anything at or above the limit goes through the full decoder.
    if (!(str[index] < codeUnitLimit!S))
        return decodeImpl!(true, useReplacementDchar)(str, index);

    // Fast path: the code unit is the code point.
    return str[index++];
}
1168
/// ditto
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
auto ref scope S str, ref size_t index) @trusted pure
if (isSomeString!S)
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable bool singleUnit = str[index] < codeUnitLimit!S;

    if (singleUnit)
    {
        // Fast path: the code unit is the code point.
        return str[index++];
    }
    else static if (is(immutable S == immutable C[], C))
    {
        // Hand the full decoder a const view of the string.
        return decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index);
    }
}
1188
///
@safe pure unittest
{
    size_t i;

    // Single code unit.
    assert(decode("a", i) == 'a');
    assert(i == 1);

    // Two UTF-8 code units, from the start and from an offset.
    i = 0;
    assert(decode("å", i) == 'å');
    assert(i == 2);
    i = 1;
    assert(decode("aå", i) == 'å');
    assert(i == 3);

    // The same character is a single UTF-16 code unit.
    i = 0;
    assert(decode("å"w, i) == 'å');
    assert(i == 1);

    // ë as a multi-code point grapheme
    i = 0;
    assert(decode("e\u0308", i) == 'e');
    assert(i == 1);

    // ë as a single code point grapheme
    i = 0;
    assert(decode("ë", i) == 'ë');
    assert(i == 2);
    i = 0;
    assert(decode("ë"w, i) == 'ë');
    assert(i == 1);
}
1211
@safe pure unittest // https://issues.dlang.org/show_bug.cgi?id=22867
{
    import std.conv : hexString;

    // If decoding throws, the index must not have been advanced.
    string data = hexString!"f787a598";
    size_t offset = 0;
    try
    {
        data.decode(offset);
    }
    catch (UTFException ex)
    {
        assert(offset == 0);
    }
}
1220
/++
    `decodeFront` is the variant of $(LREF decode) that decodes the first
    code point of a range. Unlike $(LREF decode), it accepts any
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
    of code units (rather than just a string or random access range). The
    range is taken by `ref` and the decoded elements are popped off. If
    `numCodeUnits` is passed in, it is set to the number of code units that
    made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        numCodeUnits = set to number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str.front` is not the start of a valid UTF
        sequence. If an exception is thrown, then there is no guarantee as to
        the number of code units which were popped off, as it depends on the
        type of range being used and how many code units had to be popped off
        before the code point was determined to be invalid.
  +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable first = str.front;

    if (!(first < codeUnitLimit!S))
    {
        // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
        // computed here rather than inside decodeImpl, which is undesirable
        // because not every decodeImpl overload needs it. Move it back once
        // https://issues.dlang.org/show_bug.cgi?id=8521 has been fixed.
        enum sliceable = isRandomAccessRange!S && hasSlicing!S && hasLength!S;
        enum canIndex = is(S : const char[]) || sliceable;

        immutable decoded = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits);

        // Sliceable ranges are advanced here; the other range types were
        // already popped by decodeImpl.
        static if (sliceable)
            str = str[numCodeUnits .. str.length];

        return decoded;
    }

    // Fast path: the code unit is the code point.
    str.popFront();
    numCodeUnits = 1;
    return first;
}
1283
/// ditto
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref scope S str, out size_t numCodeUnits) @trusted pure
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[0] < codeUnitLimit!S)
    {
        // Fast path: the code unit is the code point.
        immutable unit = str[0];
        str = str[1 .. $];
        numCodeUnits = 1;
        return unit;
    }
    else static if (is(immutable S == immutable C[], C))
    {
        // Full decode on a const view, then drop what was consumed.
        immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, numCodeUnits);
        str = str[numCodeUnits .. $];
        return decoded;
    }
}
1312
/++ Ditto +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // The code unit count is computed by the two-argument overload and discarded.
    size_t unused;
    return decodeFront!useReplacementDchar(str, unused);
}
1320
///
@safe pure unittest
{
    import std.range.primitives;
    string str = "Hello, World!";

    // The decoded code point is removed from the front of the string.
    assert(decodeFront(str) == 'H');
    assert(str == "ello, World!");

    str = "å";
    assert(decodeFront(str) == 'å');
    assert(str.empty);

    // The second argument receives the number of code units consumed.
    str = "å";
    size_t i;
    assert(decodeFront(str, i) == 'å');
    assert(i == 2);
    assert(str.empty);
}
1334
/++
    `decodeBack` is the variant of $(LREF decode) that decodes the last
    code point of a range. Unlike $(LREF decode), it accepts any
    bidirectional range of code units (rather than just a string or random
    access range). The range is taken by `ref` and the decoded elements are
    popped off its back. If `numCodeUnits` is passed in, it is set to the
    number of code units that made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
        str = input string or bidirectional Range
        numCodeUnits = gives the number of code units processed

    Returns:
        A decoded UTF character.

    Throws:
        $(LREF UTFException) if `str.back` is not the end of a valid UTF
        sequence. If an exception is thrown, the `str` itself remains unchanged,
        but there is no guarantee as to the value of `numCodeUnits` (when passed).
  +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[$ - 1] < codeUnitLimit!S)
    {
        // Fast path: the last code unit is the code point.
        immutable last = str[$ - 1];
        numCodeUnits = 1;
        str = str[0 .. $ - 1];
        return last;
    }
    else static if (is(immutable S == immutable C[], C))
    {
        // Find where the last code point starts, decode forwards from there,
        // and shrink the string only after decoding has succeeded.
        numCodeUnits = strideBack(str);
        immutable keep = str.length - numCodeUnits;
        size_t cursor = keep;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, cursor);
        str = str[0 .. keep];
        return decoded;
    }
}
1386
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S
    && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str.back < codeUnitLimit!S)
    {
        // Fast path: the last code unit is the code point.
        immutable last = str.back;
        numCodeUnits = 1;
        str.popBack();
        return last;
    }
    else
    {
        numCodeUnits = strideBack(str);

        static if (isRandomAccessRange!S)
        {
            // Decode in place from the start of the last code point, then
            // drop it from the back once decoding has succeeded.
            size_t cursor = str.length - numCodeUnits;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(str, cursor);
            str.popBackExactly(numCodeUnits);
            return decoded;
        }
        else
        {
            // No indexing available: copy the trailing code units off a
            // saved copy of the range into a small buffer, in forward order.
            alias Char = typeof(cast() ElementType!S.init);
            Char[4] buffer = void;
            S rest = str.save;
            foreach_reverse (slot; 0 .. numCodeUnits)
            {
                buffer[slot] = rest.back;
                rest.popBack();
            }

            const Char[] codePoint = buffer[0 .. numCodeUnits];
            size_t cursor = 0;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(codePoint, cursor);

            // Commit the shortened range only after decoding has succeeded.
            str = rest;
            return decoded;
        }
    }
}
1437
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isSomeString!S
    || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S))
    || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S)))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // The code unit count is computed by the two-argument overload and discarded.
    size_t unused;
    return decodeBack!useReplacementDchar(str, unused);
}
1456
///
@system pure unittest
{
    import std.range.primitives;
    string str = "Hello, World!";

    // The decoded code point is removed from the back of the string.
    assert(decodeBack(str) == '!');
    assert(str == "Hello, World");

    str = "å";
    assert(decodeBack(str) == 'å');
    assert(str.empty);

    // The second argument receives the number of code units consumed.
    str = "å";
    size_t i;
    assert(decodeBack(str, i) == 'å');
    assert(i == 2);
    assert(str.empty);
}
1470
// Exclusive upper bound for the fast path: any code unit of the given range
// type that compares below this value is a complete, valid code point by
// itself and needs no further decoding.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    static if (is(immutable ElementEncodingType!S == immutable dchar))
        enum dchar codeUnitLimit = 0xD800;
    else static if (is(immutable ElementEncodingType!S == immutable wchar))
        enum wchar codeUnitLimit = 0xD800;
    else
        enum char codeUnitLimit = 0x80;
}
1483
/*
 * UTF-8 overload of the decoder behind decode, decodeFront and decodeBack.
 *
 * For strings, this function does its own bounds checking to give a
 * more useful error message when attempting to decode past the end of a string.
 * Subsequently it uses a pointer instead of an array to avoid
 * redundant bounds checking.
 *
 * The three overloads of this operate on chars, wchars, and dchars.
 *
 * The callers visible in this file only reach this function after their
 * single-code-unit fast path has been ruled out (see the callers' checks
 * against codeUnitLimit).
 *
 * Params:
 *      canIndex = if S is indexable
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into str; incremented by number of code units processed.
 *              When canIndex is false the range itself is popped and index is
 *              only incremented, so it acts as a running count of code units.
 *
 * Returns:
 *      decoded character, or replacementDchar for invalid input when
 *      useReplacementDchar is set
 *
 * Throws:
 *      UTFException for invalid or truncated input when useReplacementDchar
 *      is not set; index is not modified on that path.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (
    is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char)))
{
    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /* Dchar bitmask for different numbers of UTF-8 code units.
     * bitMask[k] keeps the payload bits of a (k + 1)-unit sequence:
     * 7, 11, 16 and 21 bits respectively.
     */
    alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    // pstr is the view the code units are read from: a raw pointer for
    // arrays, a slice starting at index for sliceable ranges, and the range
    // itself (consumed with popFront) for everything else.
    static if (is(S : const char[]))
        auto pstr = str.ptr + index;    // this is what makes decodeImpl() @system code
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        // Number of code units available from index to the end of str.
        immutable length = str.length - index;
        ubyte fst = pstr[0];
    }
    else
    {
        ubyte fst = pstr.front;
        pstr.popFront();
    }

    // Exception builders; only needed when errors are reported by throwing.
    static if (!useReplacementDchar)
    {
        static if (canIndex)
        {
            // Builds a UTFException carrying the offending code units: the
            // first unit plus any following units of the form 10xxxxxx, up to
            // four units in total and never past the end of str.
            static UTFException exception(S)(S str, string msg)
            {
                uint[4] sequence = void;
                size_t i;

                do
                {
                    sequence[i] = str[i];
                } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);

                return new UTFException(msg, i).setSequence(sequence[0 .. i]);
            }
        }

        UTFException invalidUTF()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Invalid UTF-8 sequence");
            else
            {
                //We can't include the invalid sequence with input strings without
                //saving each of the code units along the way, and we can't do it with
                //forward ranges without saving the entire range. Both would incur a
                //cost for the decoding of every character just to provide a better
                //error message for the (hopefully) rare case when an invalid UTF-8
                //sequence is encountered, so we don't bother trying to include the
                //invalid sequence here, unlike with strings and sliceable ranges.
               return new UTFException("Invalid UTF-8 sequence");
            }
        }

        UTFException outOfBounds()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Attempted to decode past the end of a string");
            else
               return new UTFException("Attempted to decode past the end of a string");
        }
    }

    // A lead byte of a multi-unit sequence must look like 11xxxxxx; a lone
    // continuation byte (10xxxxxx) cannot start a code point.
    if ((fst & 0b1100_0000) != 0b1100_0000)
    {
        static if (useReplacementDchar)
        {
            ++index;            // always consume bad input to avoid infinite loops
            return replacementDchar;
        }
        else
            throw invalidUTF(); // starter must have at least 2 first bits set
    }
    ubyte tmp = void;
    dchar d = fst; // upper control bits are masked out later
    // fst is shifted left once per expected code unit, so that its top bit
    // says whether yet another continuation byte is announced by the lead byte.
    fst <<= 1;

    // Unrolled at compile time: i is the offset of the continuation byte
    // being read (1, 2 or 3).
    foreach (i; AliasSeq!(1, 2, 3))
    {

        // The sequence is truncated if the input ends before byte i.
        static if (canIndex)
        {
            if (i == length)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }
        else
        {
            if (pstr.empty)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }

        static if (canIndex)
            tmp = pstr[i];
        else
        {
            tmp = pstr.front;
            pstr.popFront();
        }

        // Every continuation byte must have the form 10xxxxxx.
        if ((tmp & 0xC0) != 0x80)
        {
            static if (useReplacementDchar)
            {
                index += i + 1;
                return replacementDchar;
            }
            else
                throw invalidUTF();
        }

        // Append the six payload bits of the continuation byte.
        d = (d << 6) | (tmp & 0x3F);
        fst <<= 1;

        if (!(fst & 0x80)) // no more bytes
        {
            d &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            if ((d & ~bitMask[i - 1]) == 0)
            {
                static if (useReplacementDchar)
                {
                    index += i + 1;
                    return replacementDchar;
                }
                else
                    throw invalidUTF();
            }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                if (!isValidDchar(d))
                {
                    static if (useReplacementDchar)
                    {
                        index += i + 1;
                        return replacementDchar;
                    }
                    else
                        throw invalidUTF();
                }
            }

            // A four-unit sequence can encode values above dchar.max.
            static if (i == 3)
            {
                if (d > dchar.max)
                {
                    static if (useReplacementDchar)
                        d = replacementDchar;
                    else
                        throw invalidUTF();
                }
            }

            index += i + 1;
            return d;
        }
    }

    // Reached only when the lead byte announces more than four code units
    // (the 5 and 6 byte forms listed above), which are never accepted.
    static if (useReplacementDchar)
    {
        index += 4;             // read 4 chars by now
        return replacementDchar;
    }
    else
        throw invalidUTF();
}
1707
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the non-indexable decoder.

    // Minimal input range over a string (no indexing, slicing or length).
    static struct R
    {
      @safe pure @nogc nothrow:
        string s;
        size_t idx;

        this(string s) { this.s = s; }
        @property bool empty() { return idx == s.length; }
        @property char front() { return s[idx]; }
        void popFront() { ++idx; }
    }

    foreach (bad; invalidUTFstrings!char())
    {
        auto input = R(bad);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Invalid input yields the replacement character, and the reported
        // count is at least one unit and at most the whole string.
        assert(decoded == replacementDchar);
        assert(1 <= consumed && consumed <= bad.length);
    }
}
1733
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar)))
{
    // `cursor` views the input starting at `index`: a raw pointer for arrays,
    // a slice for sliceable random-access ranges, otherwise the range itself.
    static if (is(S : const wchar[]))
        auto cursor = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto cursor = str[index .. str.length];
    else
        alias cursor = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        immutable remaining = str.length - index;
        uint unit = cursor[0];
    }
    else
    {
        // Non-indexable input: the code unit is popped as soon as it is read.
        uint unit = cursor.front;
        cursor.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Builds the exception for the throwing path; the first code unit is
        // attached only when the input can be indexed.
        UTFException utfError(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(cursor[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(unit >= 0xD800);

    if (unit <= 0xDBFF)
    {
        // High surrogate: a second code unit has to follow it.
        static if (canIndex)
            immutable atEnd = remaining == 1;
        else
            immutable atEnd = cursor.empty;

        if (atEnd)
        {
            static if (useReplacementDchar)
            {
                ++index;
                return replacementDchar;
            }
            else
                throw utfError("surrogate UTF-16 high value past end of string");
        }

        static if (canIndex)
            immutable uint low = cursor[1];
        else
        {
            immutable uint low = cursor.front;
            cursor.popFront();
        }

        if (0xDC00 <= low && low <= 0xDFFF)
            unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
        else
        {
            static if (useReplacementDchar)
                unit = replacementDchar;
            else
                throw utfError("surrogate UTF-16 low value out of range");
        }

        // Both code units count as consumed, including the case where the
        // second one was rejected and replaced.
        index += 2;
        return cast(dchar) unit;
    }

    if (0xDC00 <= unit && unit <= 0xDFFF)
    {
        // A low surrogate with no preceding high surrogate.
        static if (useReplacementDchar)
            unit = replacementDchar;
        else
            throw utfError("unpaired surrogate UTF-16 value");
    }
    ++index;

    // Note: u+FFFE and u+FFFF are specifically permitted by the
    // Unicode standard for application internal use (see isValidDchar)

    return cast(dchar) unit;
}
1825
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of decodeImpl on a minimal,
    // non-indexable input range of `wchar` code units.

    static struct CodeUnits
    {
      @safe pure @nogc nothrow:
        wstring data;
        size_t pos;

        this(wstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property wchar front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (invalid; invalidUTFstrings!wchar())
    {
        size_t consumed;
        auto input = CodeUnits(invalid);
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Every invalid input yields the replacement character and consumes
        // at least one, but never more than all, of the code units.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= invalid.length);
    }
}
1851
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    // Arrays and random-access ranges are read in place at `index`;
    // every other range is read from its front and then popped.
    enum readsByIndex = is(S : const dchar[]) || isRandomAccessRange!S;

    static if (is(S : const dchar[]))
        auto source = str.ptr;
    else
        alias source = str;

    static if (readsByIndex)
        dchar decoded = source[index];
    else
        dchar decoded = source.front;

    if (!isValidDchar(decoded))
    {
        static if (useReplacementDchar)
            decoded = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(decoded);
    }

    // Only reached when nothing was thrown, so a failed decode leaves both
    // `index` and the range untouched.
    ++index;
    static if (!readsByIndex)
        source.popFront();

    return decoded;
}
1889
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of decodeImpl on a minimal,
    // non-indexable input range of `dchar` code units.

    static struct CodeUnits
    {
      @safe pure @nogc nothrow:
        dstring data;
        size_t pos;

        this(dstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property dchar front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (invalid; invalidUTFstrings!dchar())
    {
        size_t consumed;
        auto input = CodeUnits(invalid);
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Every invalid input yields the replacement character and consumes
        // at least one, but never more than all, of the code units.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= invalid.length);
    }
}
1915
1916
// Test helper: checks that `decode(range, index)` returns `expectedChar`,
// advances `index` to `expectedIndex` and leaves the length of `range` alone.
// Only random-access ranges that are not narrow strings are checked; for
// anything else this is a no-op. Failures are raised as `AssertError`s that
// point at the caller's `line`.
version (StdUnittest) private void testDecode(R)(R range,
                                             size_t index,
                                             dchar expectedChar,
                                             size_t expectedIndex,
                                             size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;
    import std.traits : isNarrowString;

    static if (hasLength!R)
        immutable originalLength = range.length;

    static if (isRandomAccessRange!R && !isNarrowString!R)
    {
        immutable decoded = decode(range, index);

        if (decoded != expectedChar)
            throw new AssertError(format("decode: Wrong character: %s", decoded), __FILE__, line);

        if (index != expectedIndex)
            throw new AssertError(format("decode: Wrong index: %s", index), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != originalLength)
                throw new AssertError(format("decode: length changed: %s", range.length), __FILE__, line);
        }
    }
}
1947
// Test helper: checks that `decodeFront(range, n)` returns `expectedChar`,
// reports `expectedNumCodeUnits` in `n` and, for ranges with a length,
// shortens `range` by exactly that many code units. Failures are raised as
// `AssertError`s that point at the caller's `line`.
version (StdUnittest) private void testDecodeFront(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    static if (hasLength!R)
        immutable originalLength = range.length;

    size_t consumed;
    immutable decoded = decodeFront(range, consumed);

    if (decoded != expectedChar)
        throw new AssertError(format("decodeFront: Wrong character: %s", decoded), __FILE__, line);

    if (consumed != expectedNumCodeUnits)
        throw new AssertError(format("decodeFront: Wrong numCodeUnits: %s", consumed), __FILE__, line);

    static if (hasLength!R)
    {
        if (range.length != originalLength - consumed)
            throw new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line);
    }
}
1973
// Test helper: checks that `decodeBack(range, n)` returns `expectedChar`,
// reports `expectedNumCodeUnits` in `n` and, for ranges with a length,
// shortens `range` by exactly that many code units. Failures are raised as
// `AssertError`s that point at the caller's `line`.
version (StdUnittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    // Ranges that are not bidirectional are silently accepted so that all
    // `decode` functions can be unit tested together.
    static if (isBidirectionalRange!R)
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable originalLength = range.length;

        size_t consumed;
        immutable decoded = decodeBack(range, consumed);

        if (decoded != expectedChar)
            throw new AssertError(format("decodeBack: Wrong character: %s", decoded), __FILE__, line);

        if (consumed != expectedNumCodeUnits)
            throw new AssertError(format("decodeBack: Wrong numCodeUnits: %s", consumed), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != originalLength - consumed)
                throw new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line);
        }
    }
}
2005
// Test helper: runs the indexed, backward and forward decode checks on a
// `range` that holds exactly one encoded character, so `expectedIndex` is
// both the index after decoding and the number of code units consumed.
version (StdUnittest) private void testAllDecode(R)(R range,
                                                dchar expectedChar,
                                                size_t expectedIndex,
                                                size_t line = __LINE__)
{
    // Indexed decoding starts at offset 0.
    testDecode(range, 0, expectedChar, expectedIndex, line);

    // Decoding from the back works on a saved copy ...
    static if (isBidirectionalRange!R)
    {
        auto backward = range.save;
        testDecodeBack(backward, expectedChar, expectedIndex, line);
    }

    // ... while decoding from the front is applied to `range` itself, last.
    testDecodeFront(range, expectedChar, expectedIndex, line);
}
2019
// Test helper: checks that decoding `range` at `index` is rejected.
//
// For random-access ranges `decode(range, index)` must throw a `UTFException`
// while leaving `index` and the length of `range` unchanged. When `index` is 0,
// `decodeFront` must throw a `UTFException` as well. Failures are raised as
// `AssertError`s that point at the caller's `line`.
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a `%s` for the length argument;
            // without it `format` itself throws instead of producing the
            // message for the AssertError.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    // decodeFront always starts at the front, so it is only comparable when
    // the requested index is 0.
    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
2046
// Test helper: checks that `decodeBack(range)` is rejected with a
// `UTFException` and leaves the length of `range` unchanged. The check is only
// performed for random-access ranges. Failures are raised as `AssertError`s
// that point at the caller's `line`.
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The format string needs a `%s` for the length argument;
                // without it `format` itself throws instead of producing the
                // message for the AssertError.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
2072
// decode, decodeFront and decodeBack on UTF-8 input, for strings and for the
// test range wrappers, at run time and in CTFE.
@system unittest
{
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                          (string s) => new RefBidirCU!char(s),
                          (string s) => new RefRandomCU!char(s)))
    {
        enum sHasLength = hasLength!(typeof(S("abcd")));

        // Single code unit characters, taken from the front.
        {
            auto ascii = S("abcd");
            testDecode(ascii, 0, 'a', 1);
            testDecode(ascii, 1, 'b', 2);
            testDecodeFront(ascii, 'a', 1);
            testDecodeFront(ascii, 'b', 1);
            assert(decodeFront(ascii) == 'c');
            assert(decodeFront(ascii) == 'd');
        }

        // Three code unit characters, taken from the front.
        {
            auto kana = S("ウェブサイト");
            testDecode(kana, 0, 'ウ', 3);
            testDecode(kana, 3, 'ェ', 6);
            testDecodeFront(kana, 'ウ', 3);
            testDecodeFront(kana, 'ェ', 3);
            assert(decodeFront(kana) == 'ブ');
            assert(decodeFront(kana) == 'サ');
        }

        // Single code unit characters, taken from the back.
        {
            auto ascii = S("abcd");
            testDecodeBack(ascii, 'd', 1);
            testDecodeBack(ascii, 'c', 1);
            testDecodeBack(ascii, 'b', 1);
            testDecodeBack(ascii, 'a', 1);
        }

        // Three code unit characters, taken from the back.
        {
            auto kana = S("ウェブサイト");
            testDecodeBack(kana, 'ト', 3);
            testDecodeBack(kana, 'イ', 3);
            testDecodeBack(kana, 'サ', 3);
            testDecodeBack(kana, 'ブ', 3);
        }

        // Well-formed two and three code unit sequences.
        testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
        testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);

        // Sequences that must be rejected at index 0, at index 1 and from
        // the back.
        foreach (malformed; ["\xE2\x89", // too short
                             "\xC0\x8A",
                             "\xE0\x80\x8A",
                             "\xF0\x80\x80\x8A",
                             "\xF8\x80\x80\x80\x8A",
                             "\xFC\x80\x80\x80\x80\x8A"])
        {
            testBadDecode(S(malformed), 0);
            testBadDecode(S(malformed), 1);
            testBadDecodeBack(S(malformed));
        }

        // U+FFFE and U+FFFF are expected to decode successfully.
        testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
        testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

        // Three code unit encodings of surrogate code points
        // (U+D800 .. U+DFFF) must be rejected.
        foreach (surrogate; ["\xED\xA0\x80",
                             "\xED\xAD\xBF",
                             "\xED\xAE\x80",
                             "\xED\xAF\xBF",
                             "\xED\xB0\x80",
                             "\xED\xBE\x80",
                             "\xED\xBF\xBF"])
        {
            testBadDecode(S(surrogate), 0);
            testBadDecodeBack(S(surrogate));
        }
    }
    });
}
2156
// decode, decodeFront and decodeBack on UTF-16 input, for strings and for the
// test range wrappers, at run time and in CTFE.
@system unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar,
                          (wstring s) => new RefBidirCU!wchar(s),
                          (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Single code units and surrogate pairs at both ends of their ranges;
        // U+FFFE and U+FFFF are expected to decode successfully.
        testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // A lone high surrogate, and a high surrogate followed by a
        // non-surrogate, must be rejected from the front ...
        testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        // ... and a trailing high surrogate must be rejected from the back.
        testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        // Single code unit characters, taken from the front.
        {
            auto kana = S("ウェブサイト");
            testDecode(kana, 0, 'ウ', 1);
            testDecode(kana, 1, 'ェ', 2);
            testDecodeFront(kana, 'ウ', 1);
            testDecodeFront(kana, 'ェ', 1);
            assert(decodeFront(kana) == 'ブ');
            assert(decodeFront(kana) == 'サ');
        }

        // Single code unit characters, taken from the back.
        {
            auto kana = S("ウェブサイト");
            testDecodeBack(kana, 'ト', 1);
            testDecodeBack(kana, 'イ', 1);
            testDecodeBack(kana, 'サ', 1);
            testDecodeBack(kana, 'ブ', 1);
        }
    }

    foreach (S; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Surrogate pair, single code unit, surrogate pair.
        auto mixed = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                        cast(wchar) 0x1400,
                        cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);

        testDecode(mixed, 0, cast(dchar) 0x10000, 2);
        testDecode(mixed, 2, cast(dchar) 0x1400, 3);
        testDecode(mixed, 3, cast(dchar) 0xB9DDE, 5);

        testDecodeBack(mixed, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(mixed, cast(dchar) 0x1400, 1);
        testDecodeBack(mixed, cast(dchar) 0x10000, 2);
    }
    });
}
2211
// decode, decodeFront and decodeBack on UTF-32 input, for strings and for the
// test range wrappers, at run time and in CTFE.
@system unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar,
                          (dstring s) => new RefBidirCU!dchar(s),
                          (dstring s) => new RefRandomCU!dchar(s)))
    {
        // Code points that are expected to decode successfully, including
        // U+FFFE and U+FFFF.
        testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // Surrogates and values above 0x10FFFF must be rejected from the
        // front ...
        testBadDecode(S([cast(dchar) 0xD800]), 0);
        testBadDecode(S([cast(dchar) 0xDFFE]), 0);
        testBadDecode(S([cast(dchar) 0x110000]), 0);

        // ... and from the back.
        testBadDecodeBack(S([cast(dchar) 0xD800]));
        testBadDecodeBack(S([cast(dchar) 0xDFFE]));
        testBadDecodeBack(S([cast(dchar) 0x110000]));

        // One code unit per character, taken from the front.
        {
            auto kana = S("ウェブサイト");
            testDecode(kana, 0, 'ウ', 1);
            testDecode(kana, 1, 'ェ', 2);
            testDecodeFront(kana, 'ウ', 1);
            testDecodeFront(kana, 'ェ', 1);
            assert(decodeFront(kana) == 'ブ');
            assert(decodeFront(kana) == 'サ');
        }

        // One code unit per character, taken from the back.
        {
            auto kana = S("ウェブサイト");
            testDecodeBack(kana, 'ト', 1);
            testDecodeBack(kana, 'イ', 1);
            testDecodeBack(kana, 'サ', 1);
            testDecodeBack(kana, 'ブ', 1);
        }
    }

    foreach (S; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto mixed = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);

        testDecode(mixed, 0, cast(dchar) 0x10000, 1);
        testDecode(mixed, 1, cast(dchar) 0x1400, 2);
        testDecode(mixed, 2, cast(dchar) 0xB9DDE, 3);

        testDecodeBack(mixed, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(mixed, cast(dchar) 0x1400, 1);
        testDecodeBack(mixed, cast(dchar) 0x10000, 1);
    }
    });
}
2266
// Compile-time checks that the decode functions are inferred @safe and pure
// for every mutable, const and immutable string type.
@safe unittest
{
    import std.exception;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const( char)[],  string,
                          wchar[], const(wchar)[], wstring,
                          dchar[], const(dchar)[], dstring))
    {
        // @safe
        static assert(isSafe!({ S text; size_t pos = 0; decode(text, pos);      }));
        static assert(isSafe!({ S text; size_t pos = 0; decodeFront(text, pos); }));
        static assert(isSafe!({ S text; decodeFront(text); }));

        // pure: decode
        static assert((functionAttributes!({
            S text; size_t pos = 0; decode(text, pos);
        }) & FunctionAttribute.pure_) != 0);

        // pure: decodeFront, with and without the code unit count
        static assert((functionAttributes!({
            S text; size_t pos = 0; decodeFront(text, pos);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S text; decodeFront(text);
        }) & FunctionAttribute.pure_) != 0);

        // pure: decodeBack, with and without the code unit count
        static assert((functionAttributes!({
            S text; size_t pos = 0; decodeBack(text, pos);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            S text; decodeBack(text);
        }) & FunctionAttribute.pure_) != 0);
    }
    });
}
2292
// A four code unit sequence whose payload bits are all set (0x1FFFFF) lies
// above 0x10FFFF and must be rejected by decode.
@safe unittest
{
    import std.exception;

    char[4] units = [cast(char) 0b1111_0111,
                     cast(char) 0b1011_1111,
                     cast(char) 0b1011_1111,
                     cast(char) 0b1011_1111];
    size_t pos = 0;
    assertThrown!UTFException((){ dchar decoded = decode(units[], pos); }());
}
2304 /* =================== Encode ======================= */
2305
// Shared failure path of the encode functions: throws a UTFException carrying
// `msg` and the offending code point `c`, unless `useReplacementDchar` is set,
// in which case `replacementDchar` is returned for the caller to encode.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
        throw new UTFException(msg).setSequence(c);
    else
        return replacementDchar;
}
2313
/++
    Encodes the code point `c` into the static array `buf`.

    Params:
        buf = static array that receives the code units of `c`
        c = the code point to encode

    Returns:
        The number of code units written to `buf`: a number between `1` and
        `4` for `char[4]` buffers, a number between `1` and `2` for
        `wchar[2]` buffers and always `1` for `dchar[1]` buffers.

    Throws:
        `UTFException` if `c` is not a valid UTF code point, unless
        `useReplacementDchar` is `Yes.useReplacementDchar`, in which case
        `replacementDchar` is encoded instead.
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // One code unit.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }

    // Two code units.
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    // Code points that cannot be encoded either throw here or are swapped
    // for the replacement character, which is then encoded below.
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }
    else if (0xD800 <= c && c <= 0xDFFF)
        c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

    assert(isValidDchar(c));

    // Four code units.
    if (c > 0xFFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        return 4;
    }

    // Three code units.
    buf[0] = cast(char)(0xE0 | (c >> 12));
    buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[2] = cast(char)(0x80 | (c & 0x3F));
    return 3;
}
2365
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    char[4] buf;

    // the return value is the number of code units written to `buf`
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1);
    assert(buf[0 .. 1] == "\u007F");
    assert(encode(buf, '\u0080') == 2);
    assert(buf[0 .. 2] == "\u0080");
    assert(encode(buf, '\uE000') == 3);
    assert(buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3);
    assert(buf[0 .. 3] == "\xEF\xBF\xBE");

    // invalid code points throw by default ...
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // ... or are encoded as the replacement character on request
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto encoded = buf[];
    assert(encoded.decodeFront == replacementDchar);
}
2385
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    wchar[2] buf;

    // the return value is the number of code units written to `buf`
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0 .. 1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1);
    assert(buf[0 .. 1] == "\uE000");
    assert(encode(buf, '\U00010000') == 2);
    assert(buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2);
    assert(buf[0 .. 2] == "\U0010FFFF");

    // surrogate code points throw by default ...
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    // ... and invalid code points are encoded as the replacement character
    // on request
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto encoded = buf[];
    assert(encoded.decodeFront == replacementDchar);
}
2405
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    dchar[1] buf;

    // a `dchar[1]` buffer always receives exactly one code unit
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0] == '\u0000');
    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0] == '\uD7FF');
    assert(encode(buf, '\uE000') == 1);
    assert(buf[0] == '\uE000');
    assert(encode(buf, '\U0010FFFF') == 1);
    assert(buf[0] == '\U0010FFFF');

    // surrogate code points throw by default ...
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    // ... and invalid code points are stored as the replacement character
    // on request
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[0] == replacementDchar);
}
2423
// encode into char[4]: the boundaries of every encoded length, the rejected
// code points and the replacement path, at run time and in CTFE.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[4] buf;

    // one code unit
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1);
    assert(buf[0 .. 1] == "\u007F");

    // two code units
    assert(encode(buf, '\u0080') == 2);
    assert(buf[0 .. 2] == "\u0080");
    assert(encode(buf, '\u07FF') == 2);
    assert(buf[0 .. 2] == "\u07FF");

    // three code units
    assert(encode(buf, '\u0800') == 3);
    assert(buf[0 .. 3] == "\u0800");
    assert(encode(buf, '\uD7FF') == 3);
    assert(buf[0 .. 3] == "\uD7FF");
    assert(encode(buf, '\uE000') == 3);
    assert(buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3);
    assert(buf[0 .. 3] == "\xEF\xBF\xBE");
    assert(encode(buf, 0xFFFF) == 3);
    assert(buf[0 .. 3] == "\xEF\xBF\xBF");

    // four code units
    assert(encode(buf, '\U00010000') == 4);
    assert(buf[0 .. 4] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 4);
    assert(buf[0 .. 4] == "\U0010FFFF");

    // surrogates and values above 0x10FFFF are rejected
    foreach (rejected; [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                        cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                        cast(dchar) 0x110000])
    {
        assertThrown!UTFException(encode(buf, rejected));
    }

    // ... unless the replacement character is requested instead
    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    enum replacementDcharString = "\uFFFD";
    assert(buf[0 .. replacementDcharString.length] == replacementDcharString);
    });
}
2454
2455
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    // Above 0xFFFF and up to 0x10FFFF: encode as a surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        immutable uint offset = c - 0x10000;
        buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return 2;
    }

    // Code points that cannot be encoded either throw here or are swapped
    // for the replacement character, which is then stored below.
    if (c > 0x10FFFF)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    else if (0xD800 <= c && c <= 0xDFFF)
        c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

    // Everything else is stored as a single code unit.
    assert(isValidDchar(c));
    buf[0] = cast(wchar) c;
    return 1;
}
2481
// encode into wchar[2]: single code units, surrogate pairs, the rejected code
// points and the replacement path, at run time and in CTFE.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[2] buf;

    // one code unit
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0 .. 1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1);
    assert(buf[0 .. 1] == "\uE000");
    assert(encode(buf, 0xFFFE) == 1);
    assert(buf[0] == 0xFFFE);
    assert(encode(buf, 0xFFFF) == 1);
    assert(buf[0] == 0xFFFF);

    // two code units
    assert(encode(buf, '\U00010000') == 2);
    assert(buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2);
    assert(buf[0 .. 2] == "\U0010FFFF");

    // surrogates and values above 0x10FFFF are rejected
    foreach (rejected; [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                        cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                        cast(dchar) 0x110000])
    {
        assertThrown!UTFException(encode(buf, rejected));
    }

    // ... unless the replacement character is requested instead
    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2507
2508
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    // Encodable means: not a surrogate and not above 0x10FFFF.
    immutable bool encodable = (c < 0xD800 || 0xDFFF < c) && c <= 0x10FFFF;

    if (encodable)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    buf[0] = c;
    return 1;
}
2520
// encode into dchar[1]: accepted code points are stored unchanged, rejected
// ones throw or are replaced, at run time and in CTFE.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[1] buf;

    // accepted code points are stored as they are
    encode(buf, '\u0000');
    assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF');
    assert(buf[0] == '\uD7FF');
    encode(buf, '\uE000');
    assert(buf[0] == '\uE000');
    encode(buf, 0xFFFE);
    assert(buf[0] == 0xFFFE);
    encode(buf, 0xFFFF);
    assert(buf[0] == 0xFFFF);
    encode(buf, '\U0010FFFF');
    assert(buf[0] == '\U0010FFFF');

    // surrogates and values above 0x10FFFF are rejected
    foreach (rejected; [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                        cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                        cast(dchar) 0x110000])
    {
        assertThrown!UTFException(encode(buf, rejected));
    }

    // ... unless the replacement character is requested instead
    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2545
2546
/++
    Encodes `c` in `str`'s encoding and appends it to `str`.

    Params:
        str = the array the encoded code units are appended to
        c = the code point to encode

    Throws:
        `UTFException` if `c` is not a valid UTF code point, unless
        `useReplacementDchar` is `Yes.useReplacementDchar`, in which case
        `replacementDchar` is appended instead.
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope char[] str, dchar c) @safe pure
{
    // One code unit: appended directly, no staging buffer needed.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    char[4] staged;
    uint count;

    if (c <= 0x7FF)
    {
        // Two code units.
        assert(isValidDchar(c));
        staged[0] = cast(char)(0xC0 | (c >> 6));
        staged[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else
    {
        // Code points that cannot be encoded either throw here or are
        // swapped for the replacement character, which is then encoded below.
        if (c > 0x10FFFF)
        {
            assert(!isValidDchar(c));
            c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
        }
        else if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));

        if (c <= 0xFFFF)
        {
            // Three code units.
            staged[0] = cast(char)(0xE0 | (c >> 12));
            staged[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            staged[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else
        {
            // Four code units.
            staged[0] = cast(char)(0xF0 | (c >> 18));
            staged[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            staged[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            staged[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
    }

    str ~= staged[0 .. count];
}
2603
///
@safe unittest
{
    char[] text = "abcd".dup;
    dchar ascii = 'a';
    dchar nonAscii = 'ø';

    // An ASCII code point adds exactly one code unit.
    encode(text, ascii);
    assert(text == "abcda");
    assert(text.length == 5);

    // 'ø' adds two code units.
    encode(text, nonAscii);
    assert(text == "abcdaø");
    assert(text.length == 7);
}
2618
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    char[] text = "abcd".dup;

    // One code unit.
    encode(text, cast(dchar)'a');
    assert(text == "abcda");
    assert(text.length == 5);

    // Two code units (U+00A9).
    encode(text, cast(dchar)'\u00A9');
    assert(text == "abcda\xC2\xA9");
    assert(text.length == 7);
    //assert(text == "abcda\u00A9");   // BUG: fix compiler

    // Three code units (U+2260).
    encode(text, cast(dchar)'\u2260');
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
    assert(text.length == 10);
    });
}
2640
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    char[] acc;

    // Code points at the edges of each UTF-8 length class, paired with the
    // code units each one is expected to append.
    immutable dchar[] codePoints = [
        cast(dchar) 0x0000, cast(dchar) 0x007F,
        cast(dchar) 0x0080, cast(dchar) 0x07FF,
        cast(dchar) 0x0800, cast(dchar) 0xD7FF, cast(dchar) 0xE000,
        cast(dchar) 0xFFFE, cast(dchar) 0xFFFF,
        cast(dchar) 0x10000, cast(dchar) 0x10FFFF,
    ];
    immutable string[] encodings = [
        "\u0000", "\u007F",
        "\u0080", "\u07FF",
        "\u0800", "\uD7FF", "\uE000",
        "\xEF\xBF\xBE", "\xEF\xBF\xBF",
        "\U00010000", "\U0010FFFF",
    ];

    foreach (i, cp; codePoints)
    {
        // Everything from `start` onwards is what this call appended.
        immutable start = acc.length;
        encode(acc, cp);
        assert(acc[start .. $] == encodings[i]);
    }

    // Surrogates and values above U+10FFFF are rejected by default.
    foreach (dchar invalid; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                             cast(dchar) 0xDFFF, cast(dchar) 0x110000])
    {
        assertThrown!UTFException(encode(acc, invalid));
    }

    // With replacement enabled, U+FFFD is appended instead.
    enum replacementDcharString = "\uFFFD";
    enum rdcslen = replacementDcharString.length;
    assert(acc[$ - rdcslen .. $] != replacementDcharString);
    encode!(Yes.useReplacementDchar)(acc, cast(dchar) 0x110000);
    assert(acc[$ - rdcslen .. $] == replacementDcharString);
    });
}
2673
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope wchar[] str, dchar c) @safe pure
{
    // Supplementary planes: emit a high/low surrogate pair.
    if (c > 0xFFFF && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));

        immutable offset = c - 0x10000;
        wchar[2] pair;
        pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        str ~= pair;
        return;
    }

    // Everything else is written as one code unit. Invalid input is first
    // routed through the error helper; whatever it returns (when it returns
    // at all) is what gets appended.
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    str ~= cast(wchar) c;
}
2703
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    wchar[] units;

    // BMP code points each append exactly one code unit.
    foreach (i, dchar bmp; [cast(dchar) 0x0000, cast(dchar) 0xD7FF, cast(dchar) 0xE000,
                            cast(dchar) 0xFFFE, cast(dchar) 0xFFFF])
    {
        encode(units, bmp);
        assert(units[i] == bmp);
    }

    // Supplementary code points append a surrogate pair.
    encode(units, '\U00010000');
    assert(units[5 .. $] == "\U00010000");
    encode(units, '\U0010FFFF');
    assert(units[7 .. $] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF are rejected by default.
    foreach (dchar invalid; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                             cast(dchar) 0xDFFF, cast(dchar) 0x110000])
    {
        assertThrown!UTFException(encode(units, invalid));
    }

    // With replacement enabled, the replacement character is appended instead.
    assert(units.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(units, cast(dchar) 0x110000);
    assert(units.back == replacementDchar);
    });
}
2730
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope dchar[] str, dchar c) @safe pure
{
    // A UTF-32 code unit is the code point itself; only validity matters.
    immutable isSurrogate = c >= 0xD800 && c <= 0xDFFF;
    immutable isTooLarge = c > 0x10FFFF;

    if (isSurrogate || isTooLarge)
    {
        // Whatever the helper hands back is what gets appended below.
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    }
    else
    {
        assert(isValidDchar(c));
    }

    str ~= c;
}
2741
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    dchar[] units;

    // Every valid code point is appended unchanged as a single element.
    foreach (i, dchar valid; [cast(dchar) 0x0000, cast(dchar) 0xD7FF, cast(dchar) 0xE000,
                              cast(dchar) 0xFFFE, cast(dchar) 0xFFFF, cast(dchar) 0x10FFFF])
    {
        encode(units, valid);
        assert(units[i] == valid);
    }

    // Surrogates and values above U+10FFFF are rejected by default.
    foreach (dchar invalid; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                             cast(dchar) 0xDFFF, cast(dchar) 0x110000])
    {
        assertThrown!UTFException(encode(units, invalid));
    }

    // With replacement enabled, the replacement character is appended instead.
    assert(units.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(units, cast(dchar) 0x110000);
    assert(units.back == replacementDchar);
    });
}
2767
2768
/++
    Computes how many code units of type `C` are needed to encode the single
    code point `c`.

    Params:
        C = the character type of the target encoding
        c = the code point to measure

    Returns:
        1 to 4 when `C` is one byte wide, 1 or 2 when it is two bytes wide,
        and always 1 when it is four bytes wide.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // Anything beyond the BMP takes a surrogate pair.
        return c > 0xFFFF ? 2 : 1;
    }
    else
    {
        static assert(C.sizeof == 1);

        // Values above U+10FFFF have no UTF-8 length; treated as unreachable.
        if (c > 0x10FFFF)
            assert(false);

        return c <= 0x7F ? 1
             : c <= 0x7FF ? 2
             : c <= 0xFFFF ? 3
             : 4;
    }
}
2794
///
@safe pure nothrow @nogc unittest
{
    enum dchar ascii = 'a';
    enum dchar highest = '\U0010FFFF';

    // ASCII is a single code unit in every encoding.
    assert(codeLength!char(ascii) == 1);
    assert(codeLength!wchar(ascii) == 1);
    assert(codeLength!dchar(ascii) == 1);

    // The highest code point needs 4, 2 and 1 code units respectively.
    assert(codeLength!char(highest) == 4);
    assert(codeLength!wchar(highest) == 2);
    assert(codeLength!dchar(highest) == 1);
}
2806
2807
/++
    Returns the number of code units that are required to encode `input`
    in a string whose character type is `C`. This is particularly useful
    when slicing one string with the length of another and the two string
    types use different character types.

    Params:
        C = the character type to get the encoding length for
        input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isSomeFiniteCharInputRange!InputRange)
{
    alias SourceUnit = typeof(cast() ElementEncodingType!InputRange.init);

    // A string that already uses `C` as its code unit needs no iteration.
    enum alreadyEncoded = isSomeString!InputRange
        && is(SourceUnit == C)
        && is(typeof(input.length));

    static if (alreadyEncoded)
    {
        return input.length;
    }
    else
    {
        // Otherwise walk the input one code point at a time and add up the
        // per-code-point lengths in the target encoding.
        size_t units;

        foreach (dchar codePoint; input.byDchar)
        {
            units += codeLength!C(codePoint);
        }

        return units;
    }
}
2837
///
@safe unittest
{
    // Pure ASCII text
    enum ascii = "hello world";
    assert("hello world".length == codeLength!char(ascii));
    assert("hello world"w.length == codeLength!wchar(ascii));
    assert("hello world"d.length == codeLength!dchar(ascii));

    // Non-ASCII text
    enum katakana = `プログラミング`;
    assert(`プログラミング`.length == codeLength!char(katakana));
    assert(`プログラミング`w.length == codeLength!wchar(katakana));
    assert(`プログラミング`d.length == codeLength!dchar(katakana));

    // Slicing a UTF-8 string by the length of a UTF-16 prefix
    string haystack = `Être sans la verité, ça, ce ne serait pas bien.`;
    wstring needle = `Être sans la verité`;
    immutable prefixUnits = codeLength!char(needle);
    assert(haystack[prefixUnits .. $] == `, ça, ce ne serait pas bien.`);
}
2860
@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    enum latin = "Walter Bright";
    enum kanji = `言語`;
    enum mixed = `ウェブサイト@La_Verité.com`;

    // Every source string type is checked against every target code unit type.
    foreach (S; AliasSeq!( char[], const  char[],  string,
                          wchar[], const wchar[], wstring,
                          dchar[], const dchar[], dstring))
    {
        foreach (C; AliasSeq!(char, wchar, dchar))
        {
            assert(to!(C[])(latin).length == codeLength!C(to!S(latin)));
            assert(to!(C[])(kanji).length == codeLength!C(to!S(kanji)));
            assert(to!(C[])(mixed).length == codeLength!C(to!S(mixed)));

            // Same text, but passed through `filter` so it is no longer a plain string.
            assert(to!(C[])(mixed).length ==
                   codeLength!C(to!S(mixed).filter!(x => true)()));
        }
    }
    });
}
2885
/+
Internal helper function:

Returns true if it is safe to search for the code point `c` directly among
code units of type `C`, without decoding.

This is a runtime check that is used as an optimization in various functions,
particularly in `std.string`.
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 1)
    {
        // Only ASCII qualifies.
        return c < 0x80;
    }
    else static if (C.sizeof == 2)
    {
        // Anything up to 0xFFFF except the surrogate range qualifies.
        return c < 0xD800 || (c >= 0xE000 && c < 0x1_0000);
    }
    else static if (C.sizeof == 4)
    {
        return true;
    }
    else
    {
        static assert(0);
    }
}
@safe unittest
{
    // ASCII can be searched for in every encoding.
    foreach (C; AliasSeq!(char, wchar, dchar))
        assert(canSearchInCodeUnits!C('a'));

    // Important test: ö <= 0xFF, yet it is not a single UTF-8 code unit.
    assert(!canSearchInCodeUnits!char('ö'));
    assert(!canSearchInCodeUnits!char(cast(char)'ö'));
    foreach (C; AliasSeq!(wchar, dchar))
        assert(canSearchInCodeUnits!C('ö'));

    // A BMP character outside Latin-1.
    assert(!canSearchInCodeUnits!char('日'));
    foreach (C; AliasSeq!(wchar, dchar))
        assert(canSearchInCodeUnits!C('日'));

    // A value in the surrogate range.
    assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
    assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));

    // A supplementary-plane code point.
    foreach (C; AliasSeq!(char, wchar))
        assert(!canSearchInCodeUnits!C('\U00010001'));
    assert( canSearchInCodeUnits!dchar('\U00010001'));
}
2925
2926 /* =================== Validation ======================= */
2927
/++
    Checks whether `str` is well-formed Unicode by decoding it from the first
    code unit to the last.

    Params:
        str = the string to check

    Throws:
        `UTFException` if `str` is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    size_t pos = 0;
    immutable end = str.length;

    // `decode` advances `pos` past each sequence it reads; the decoded
    // values themselves are not needed.
    while (pos < end)
        decode(str, pos);
}
2943
///
@safe unittest
{
    import std.exception : assertThrown;

    // These three bytes do not form well-formed UTF-8.
    char[] illFormed = [167, 133, 175];
    assertThrown!UTFException(validate(illFormed));
}
2951
// https://issues.dlang.org/show_bug.cgi?id=12923
@safe unittest
{
    import std.exception;

    // Same ill-formed bytes as above, held in a static array and passed as a slice.
    assertThrown(() {
        char[3] illFormed = [167, 133, 175];
        validate(illFormed[]);
    }());
}
2961
/**
 * Converts the characters of `s` to UTF-8 and returns them as a `string`.
 *
 * Params:
 *     s = the string or character range to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
string toUTF8(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    string encoded = toUTFImpl!string(s);
    return encoded;
}
2978
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø is represented by two UTF-8 code units
    immutable fromWide = "Hellø"w.toUTF8;
    assert(fromWide.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // 𐐷 is four code units in UTF-8
    immutable fromDwide = "𐐷"d.toUTF8;
    assert(fromDwide.equal([0xF0, 0x90, 0x90, 0xB7]));
}
2990
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // Same expectations as the string-based example, fed from a
    // ReferenceInputRange instead of a string.
    alias RefRange = ReferenceInputRange!(ElementType!(string));

    auto twoUnitCase = new RefRange("Hellø");
    assert(twoUnitCase.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    auto fourUnitCase = new RefRange("𐐷");
    assert(fourUnitCase.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
3003
/**
 * Converts the characters of `s` to UTF-16 and returns them as a newly GC
 * allocated `wstring`.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    wstring encoded = toUTFImpl!wstring(s);
    return encoded;
}
3020
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // each of these is a single code unit in UTF-32 ...
    assert("𤭢"d.length == 1);
    assert("𐐷"d.length == 1);

    // ... and a pair of code units in UTF-16
    immutable first = "𤭢"d.toUTF16;
    immutable second = "𐐷"d.toUTF16;
    assert(first.equal([0xD852, 0xDF62]));
    assert(second.equal([0xD801, 0xDC37]));
}
3033
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // Same expectations as the string-based example, fed from a
    // ReferenceInputRange instead of a string.
    alias RefRange = ReferenceInputRange!(ElementType!(string));

    auto first = new RefRange("𤭢");
    assert(first.toUTF16.equal([0xD852, 0xDF62]));

    auto second = new RefRange("𐐷");
    assert(second.toUTF16.equal([0xD801, 0xDC37]));
}
3046
3047
/**
 * Converts the characters of `s` to UTF-32 and returns them as a newly GC
 * allocated `dstring`.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(scope S s)
if (isSomeFiniteCharInputRange!S)
{
    dstring encoded = toUTFImpl!dstring(s);
    return encoded;
}
3064
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // each of these is a pair of code units in UTF-16 ...
    assert("𤭢"w.length == 2);
    assert("𐐷"w.length == 2);

    // ... and a single code unit in UTF-32
    immutable first = "𤭢"w.toUTF32;
    immutable second = "𐐷"w.toUTF32;
    assert(first.equal([0x00024B62]));
    assert(second.equal([0x00010437]));
}
3077
// Shared implementation behind toUTF8, toUTF16 and toUTF32.
//
// T is the string type to produce and `s` is the string or character range to
// convert. When `s` already converts implicitly to T it is simply duplicated
// with `idup`; otherwise its contents are re-encoded through `byUTF` into an
// appender.
private T toUTFImpl(T, S)(scope S s)
{
    static if (is(S : T))
    {
        return s.idup;
    }
    else
    {
        import std.array : appender;
        auto app = appender!T();

        // `s.length` counts elements of the source, which need not equal the
        // number of code units written; it only serves as an up-front
        // reservation when a length is available.
        static if (is(S == C[], C) || hasLength!S)
            app.reserve(s.length);

        // Re-encode into T's (unqualified) code unit type.
        foreach (c; s.byUTF!(typeof(cast() ElementEncodingType!T.init)))
            app.put(c);

        return app.data;
    }
}
3099
3100 /* =================== toUTFz ======================= */
3101
/++
    Produces a C-style zero-terminated string equivalent to `str`. `str`
    must not contain embedded `'\0'`'s, because any C function treats the
    first `'\0'` it encounters as the end of the string. If `str.empty` is
    `true`, a string consisting of just `'\0'` is returned.

    `toUTFz` accepts any type of string and is templated on the character
    pointer type to convert to. It avoids allocating a new string when it
    can, but there is a decent chance that an allocation will be needed -
    particularly when dealing with character types other than `char`.

    $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then the
    terminating `'\0'` is the character one past the end of `str`, and the
    string stops being zero-terminated as soon as anything alters that
    character. The most likely ways for that to happen are appending to
    `str` without a reallocation taking place, or `str` being a slice of a
    larger array whose element just past the end of `str` gets modified.
    It could also happen if a mutable character array sits immediately after
    `str` in memory (for example, two member variables of a user-defined type
    declared one after the other) and that array happened to start with
    `'\0'`. None of this can occur if the zero-terminated string is used
    immediately after calling `toUTFz` and the C function using it does not
    keep a reference to it. Even when the zero-terminated string is saved,
    such problems are unlikely (the cases above are among the few examples of
    where they could occur). However, if you save the zero-terminated string
    and want to be absolutely certain that it stays zero-terminated, append a
    `'\0'` to the string yourself and use its `ptr` property rather than
    calling `toUTFz`.

    $(RED Warning 2:) When a character pointer is passed to a C function that
    keeps it around for any reason, make sure that your D code keeps a
    reference to it as well. Otherwise, the memory may be reclaimed during a
    garbage collection cycle and cause a nasty bug when the C code tries to
    use it.
  +/
template toUTFz(P)
if (is(P == C*, C) && isSomeChar!C)
{
    P toUTFz(S)(S str) @safe pure
    if (isSomeString!S)
    {
        // The matching toUTFzImpl overload is chosen from P and S.
        P terminated = toUTFzImpl!(P, S)(str);
        return terminated;
    }
}
3148
///
@safe pure unittest
{
    // Same character width, every constness of the target pointer
    auto mutablePtr = toUTFz!(char*)("hello world");
    auto constPtr = toUTFz!(const(char)*)("hello world");
    auto immutablePtr = toUTFz!(immutable(char)*)("hello world");

    // Source and target character types differ
    auto narrowed = toUTFz!(char*)("hello world"d);
    auto widened = toUTFz!(const(wchar)*)("hello world");
    auto widest = toUTFz!(immutable(dchar)*)("hello world"w);
}
3159
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(immutable typeof(*P.init) == typeof(str[0])))
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias OutChar = typeof(*P.init);

    // An empty input is answered with a fresh array holding only the terminator.
    if (str.empty)
    {
        OutChar[] terminator = ['\0'];

        auto terminatorPtr() @trusted { return terminator.ptr; }
        return terminatorPtr();
    }

    alias C = typeof(cast() ElementEncodingType!S.init);

    // Handing back a pointer into the immutable input is only an option when
    // P does not point to mutable characters.
    static if (!is(typeof(cast() *P.init) == OutChar))
    {
        if (!__ctfe)
        {
            auto onePastEnd(S s) @trusted { return s.ptr + s.length; }
            immutable next = onePastEnd(str);

            // Peek past the end of str: if a 0 is already there, no
            // conversion is necessary. The compiler puts a 0 past the end of
            // static strings, and the storage allocator puts a 0 past the
            // end of newly allocated char[]'s.
            // Whether `next` may be dereferenced is decided by a simple
            // test: an address that is a multiple of 4 is conservatively
            // assumed to possibly start a new block of memory, which might
            // be unreadable. Any other address definitely points to valid
            // memory.
            if ((cast(size_t) next & 3) && *next == '\0')
                return &str[0];
        }
    }

    // Otherwise go through the overload that takes const(C)[] input.
    return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
}
3202
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(typeof(str[0]) C) && is(immutable typeof(*P.init) == immutable C) && !is(C == immutable))
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias InChar   = typeof(str[0]);
    alias UInChar  = typeof(cast() str[0]); // InChar without qualifiers
    alias OutChar  = typeof(*P.init);
    alias UOutChar = typeof(cast() *P.init); // OutChar without qualifiers

    enum inputIsConst = is(const(UInChar) == InChar);
    enum wantsConst = is(const(UOutChar) == OutChar);
    enum wantsImmutable = is(immutable(UOutChar) == OutChar);

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if ((inputIsConst && wantsConst) || (!inputIsConst && !wantsImmutable))
    {
        if (!__ctfe)
        {
            auto onePastEnd(S s) @trusted { return s.ptr + s.length; }
            auto next = onePastEnd(str);

            // Reuse the input when a 0 already follows it. Addresses that
            // are a multiple of 4 are never peeked at.
            if ((cast(size_t) next & 3) && *next == '\0')
                return &str[0];
        }

        // No usable terminator: append one and point at the result.
        str ~= '\0';
        return &str[0];
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        import std.array : uninitializedArray;

        // Build a separate array with room for the terminator.
        immutable count = str.length;
        auto copy = uninitializedArray!(UOutChar[])(count + 1);
        copy[0 .. count] = str[];
        copy[count] = '\0';

        auto asResult(typeof(copy) c) @trusted { return cast(P) c.ptr; }
        return asResult(copy);
    }
}
3242
private P toUTFzImpl(P, S)(S str) @safe pure
if (!is(immutable typeof(*P.init) == immutable typeof(str[0])))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    import std.array : appender;

    alias OutChar = typeof(*P.init);
    auto sink = appender!(OutChar[])();

    // Iterate by code point and let the appender write each one in the
    // target character type, then add the terminator.
    foreach (dchar codePoint; str)
    {
        sink.put(codePoint);
    }
    sink.put('\0');

    return (() @trusted => cast(P) sink.data.ptr)();
}
3256
@safe pure unittest
{
    import core.exception : AssertError;
    import std.algorithm;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Source and target share the character type.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        alias C = Unqual!(ElementEncodingType!S);

        auto literalBacked = to!S("hello\U00010143\u0100\U00010143");

        // Second string with the same contents, built in an array whose
        // element just past the end is '\n' rather than '\0'.
        auto scratch = new C[](literalBacked.length + 1);
        scratch[0 .. literalBacked.length] = literalBacked[];
        scratch[literalBacked.length] = '\n';
        scratch.length = scratch.length - 1;
        auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
        auto heapBacked = trustedAssumeUnique(scratch);
        assert(literalBacked == heapBacked);

        // The result must hold the same code units followed by a terminator.
        void trustedCStringAssert(Ptr, Str)(Str s) @trusted
        {
            auto p = toUTFz!Ptr(s);
            assert(p[0 .. s.length] == s);
            assert(p[s.length] == '\0');
        }

        foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
        {
            trustedCStringAssert!P(literalBacked);
            trustedCStringAssert!P(heapBacked);
        }
    }
    });

    // Converts `s` with toUTFz!P and compares the zero-terminated result
    // with `s` using `cmp`.
    static void test(P, S)(S s, size_t line = __LINE__) @trusted
    {
        // Number of code units before the first '\0'.
        static size_t zeroLen(C)(const(C)* ptr) @trusted
        {
            size_t len = 0;
            for (; *ptr != '\0'; ++ptr)
                ++len;
            return len;
        }

        auto p = toUTFz!P(s);
        immutable len = zeroLen(p);
        enforce(cmp(s, p[0 .. len]) == 0,
                new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
                                __FILE__, line));
    }

    // Source and target character types differ, plus mutable/const sources.
    assertCTFEable!(
    {
    enum string  narrow = "hello\U00010143\u0100\U00010143";
    enum wstring wide   = "hello\U00010143\u0100\U00010143"w;
    enum dstring full   = "hello\U00010143\u0100\U00010143"d;

    foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P(narrow);
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P(wide);
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          wchar*, const(wchar)*, immutable(wchar)*))
    {
        test!P(full);
    }
    foreach (S; AliasSeq!( char[], const( char)[],
                          wchar[], const(wchar)[],
                          dchar[], const(dchar)[]))
    {
        auto source = to!S(narrow);

        foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                              wchar*, const(wchar)*, immutable(wchar)*,
                              dchar*, const(dchar)*, immutable(dchar)*))
        {
            test!P(source);
        }
    }
    });
}
3343
3344
/++
    `toUTF16z` is a convenience function for `toUTFz!(const(wchar)*)`.

    It encodes the string `str` into UTF-16 and returns a pointer to the
    zero-terminated result. `toUTF16z` is suitable for calling the 'W'
    functions in the Win32 API that take an `LPCWSTR` argument.
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    const(wchar)* terminated = toUTFz!(const(wchar)*)(str);
    return terminated;
}
3357
///
@system unittest
{
    string greeting = "Hello, World!";
    const(wchar)* wide = toUTF16z(greeting);

    // The text is pure ASCII, so the terminator sits at index greeting.length.
    assert(wide[greeting.length] == '\0');
}
3365
@safe pure unittest
{
    import std.conv : to;

    // toUTFz is already thoroughly tested, so this only verifies that
    // toUTF16z compiles properly for the various string types.
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        const(wchar)* result = toUTF16z(to!S("hello world"));
        assert(result !is null);
    }
}
3374
3375
3376 /* ================================ tests ================================== */
3377
@safe pure unittest
{
    import std.exception;

    assertCTFEable!(
    {
    // ASCII only
    assert(toUTF16("hello"c) == "hello"w);
    assert(toUTF32("hello"c) == "hello"d);
    assert(toUTF8 ("hello"w) == "hello"c);
    assert(toUTF32("hello"w) == "hello"d);
    assert(toUTF8 ("hello"d) == "hello"c);
    assert(toUTF16("hello"d) == "hello"w);

    // With a BMP character outside ASCII
    assert(toUTF16("hel\u1234o"c) == "hel\u1234o"w);
    assert(toUTF32("hel\u1234o"c) == "hel\u1234o"d);
    assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o"c);
    assert(toUTF32("hel\u1234o"w) == "hel\u1234o"d);
    assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o"c);
    assert(toUTF16("hel\u1234o"d) == "hel\u1234o"w);

    // With a supplementary-plane character
    assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo"w);
    assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo"d);
    assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo"c);
    assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo"d);
    assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo"c);
    assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo"w);
    });
}
3406
3407
/++
    Returns the total number of code points encoded in `str`.

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Params:
        str = the string whose code points are counted

    Returns:
        The number of elements produced when `str` is iterated with
        $(LREF byDchar).

    Note:
        This function is `nothrow`, so it does not throw a `UTFException`,
        not even when `str` is not well-formed. How ill-formed sequences are
        counted is determined by $(LREF byDchar).
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    // Decode lazily and count the elements; nothing is allocated.
    return walkLength(str.byDchar);
}
3423
///
@safe pure nothrow @nogc unittest
{
    // One code point per ASCII character
    assert(0 == count(""));
    assert(1 == count("a"));
    assert(3 == count("abc"));

    // The euro sign is one code point
    assert(4 == count("\u20AC100"));
}
3432
@safe pure nothrow @nogc unittest
{
    import std.exception;

    // Same expectations as the documented example, wrapped in assertCTFEable.
    assertCTFEable!(
    {
    assert(0 == count(""));
    assert(1 == count("a"));
    assert(3 == count("abc"));
    assert(4 == count("\u20AC100"));
    });
}
3444
3445
3446 // Private ranges of code units with increasing capabilities, for testing.
3447 version (StdUnittest)
3448 {
3449 private:
3450     struct InputCU(C) // input range only: empty/front/popFront
3451     {
3452         import std.conv : to;
3453         @property bool empty() { return _str.length == 0; }
3454         @property C front() { return _str[0]; }
3455         void popFront() { _str = _str[1 .. $]; }
3456
3457         this(inout(C)[] str)
3458         {
3459             _str = to!(C[])(str);
3460         }
3461
3462         C[] _str;
3463     }
3464
3465     struct BidirCU(C) // adds back/popBack, save and length
3466     {
3467         import std.conv : to;
3468         @property bool empty() { return _str.length == 0; }
3469         @property C front() { return _str[0]; }
3470         void popFront() { _str = _str[1 .. $]; }
3471         @property C back() { return _str[$ - 1]; }
3472         void popBack() { _str = _str[0 .. $ - 1]; }
3473         @property auto save() { return BidirCU(_str); }
3474         @property size_t length() { return _str.length; }
3475
3476         this(inout(C)[] str)
3477         {
3478             _str = to!(C[])(str);
3479         }
3480
3481         C[] _str;
3482     }
3483
3484     struct RandomCU(C) // adds indexing and slicing
3485     {
3486         import std.conv : to;
3487         @property bool empty() { return _str.length == 0; }
3488         @property C front() { return _str[0]; }
3489         void popFront() { _str = _str[1 .. $]; }
3490         @property C back() { return _str[$ - 1]; }
3491         void popBack() { _str = _str[0 .. $ - 1]; }
3492         @property auto save() { return RandomCU(_str); }
3493         @property size_t length() { return _str.length; }
3494         C opIndex(size_t i) { return _str[i]; }
3495         auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }
3496
3497         this(inout(C)[] str)
3498         {
3499             _str = to!(C[])(str);
3500         }
3501
3502         C[] _str;
3503     }
3504
3505     class RefBidirCU(C) // class (reference type) counterpart of BidirCU
3506     {
3507         import std.conv : to;
3508         @property bool empty() { return _str.length == 0; }
3509         @property C front() { return _str[0]; }
3510         void popFront() { _str = _str[1 .. $]; }
3511         @property C back() { return _str[$ - 1]; }
3512         void popBack() { _str = _str[0 .. $ - 1]; }
3513         @property auto save() { return new RefBidirCU(_str); }
3514         @property size_t length() { return _str.length; }
3515
3516         this(inout(C)[] str)
3517         {
3518             _str = to!(C[])(str);
3519         }
3520
3521         C[] _str;
3522     }
3523
3524     class RefRandomCU(C) // class (reference type) counterpart of RandomCU
3525     {
3526         import std.conv : to;
3527         @property bool empty() { return _str.length == 0; }
3528         @property C front() { return _str[0]; }
3529         void popFront() { _str = _str[1 .. $]; }
3530         @property C back() { return _str[$ - 1]; }
3531         void popBack() { _str = _str[0 .. $ - 1]; }
3532         @property auto save() { return new RefRandomCU(_str); }
3533         @property size_t length() { return _str.length; }
3534         C opIndex(size_t i) { return _str[i]; }
3535         auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }
3536
3537         this(inout(C)[] str)
3538         {
3539             _str = to!(C[])(str);
3540         }
3541
3542         C[] _str;
3543     }
3544 }
3545
3546
3547 /**
3548  * The Unicode replacement character (U+FFFD), inserted in place of invalid UTF sequences.
3549  *
3550  * References:
3551  *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
3552  */
3553 enum dchar replacementDchar = '\uFFFD';
3554
3555 /********************************************
3556  * Iterate a range of `char`, `wchar`, or `dchar` elements by code unit.
3557  *
3558  * This sidesteps the special-case decoding that
3559  * $(REF front, std,range,primitives) applies to character arrays. Because
3560  * nothing is decoded, code using `byCodeUnit` can be `nothrow`, whereas
3561  * $(REF front, std,range,primitives) throws when it meets an invalid Unicode
3562  * sequence.
3563  *
3564  * A code unit is the basic building block of a UTF encoding. A single code
3565  * unit generally does not correspond to what is perceived as a full
3566  * character (a.k.a. a grapheme cluster in Unicode terminology), since many
3567  * characters need several code units: in UTF-8, for example, `ø` is encoded
3568  * as `0xC3 0xB8`. An individual element of `byCodeUnit` therefore often is
3569  * not a character on its own, and treating it as one while iterating over
3570  * the resulting range gives nonsensical results.
3571  *
3572  * Params:
3573  *      r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
3574  *      of characters (including strings) or a type that implicitly converts to a string type.
3575  * Returns:
3576  *      If `r` is not an auto-decodable string (an auto-decodable string being
3577  *      a narrow string or a user-defined type that implicitly converts to
3578  *      one), then `r` is returned, converted to its string type if need be.
3579  *
3580  *      Otherwise, `r` is converted to its corresponding string type (unless
3581  *      it already is a string) and wrapped in a random-access range whose
3582  *      element type is the element encoding type of the string (its code
3583  *      unit); that range is returned. The range has slicing.
3584  *
3585  *      If `r` is quirky enough to be a struct or class that is an input range
3586  *      of characters in its own right (i.e. it has the input range API as
3587  *      member functions) $(I and) implicitly converts to a string type, then
3588  *      `r` is returned and no implicit conversion takes place.
3589  *
3590  *      Whenever `r` is wrapped in a new range, that range exposes a `source`
3591  *      property holding the string currently contained within the
3592  *      range.
3593  *
3594  * See_Also:
3595  *      The $(MREF std, uni) docs for a reference on Unicode
3596  *      terminology.
3597  *
3598  *      $(REF byGrapheme, std,uni) for a range that iterates by grapheme
3599  *      cluster (written character).
3600  */
3601 auto byCodeUnit(R)(R r)
3602 if ((isConvertibleToString!R && !isStaticArray!R) ||
3603     (isInputRange!R && isSomeChar!(ElementEncodingType!R)))
3604 {
3605     import std.traits : StringTypeOf;
3606     // True when R itself declares any range primitive as a member, i.e. when
3607     // it may be a range without going through an implicit string conversion.
3608     enum bool definesRangeAPI = __traits(hasMember, R, "empty") ||
3609         __traits(hasMember, R, "front") || __traits(hasMember, R, "popFront");
3610     static if (isAutodecodableString!R && !definesRangeAPI)
3611     {
3612         static struct ByCodeUnitImpl
3613         {
3614         @safe pure nothrow @nogc:
3615             StringTypeOf!R source;
3616
3617             @property bool empty() const     { return !source.length; }
3618             @property auto ref front() inout { return source[0]; }
3619             void popFront()                  { source = source[1 .. $]; }
3620             @property auto save()            { return ByCodeUnitImpl(source.save); }
3621
3622             @property auto ref back() inout { return source[$ - 1]; }
3623             void popBack()                  { source = source[0 .. $-1]; }
3624
3625             auto ref opIndex(size_t index) inout     { return source[index]; }
3626             auto opSlice(size_t lower, size_t upper) { return ByCodeUnitImpl(source[lower .. upper]); }
3627
3628             @property size_t length() const { return source.length; }
3629             alias opDollar = length;
3630         }
3631
3632         static assert(isRandomAccessRange!ByCodeUnitImpl);
3633         return ByCodeUnitImpl(r);
3634     }
3635     else static if (!isInputRange!R || (is(R : const dchar[]) && !definesRangeAPI))
3636     {
3637         // Not a range in its own right (or a dchar[]-convertible type without
3638         // range members): hand back the underlying string, unwrapped.
3639         return cast(StringTypeOf!R) r;
3640     }
3641     else
3642     {
3643         // Anything else is already iterated by code unit (ranges of
3644         // characters, dchar[]), so byCodeUnit is
3645         // a no-op
3646         return r;
3647     }
3648 }
3649
3650 ///
3651 @safe unittest
3652 {
3653     import std.range.primitives;
3654     import std.traits : isAutodecodableString;
3655
3656     auto r = "Hello, World!".byCodeUnit();
3657     static assert(hasLength!(typeof(r)));
3658     static assert(hasSlicing!(typeof(r)));
3659     static assert(isRandomAccessRange!(typeof(r)));
3660     static assert(is(ElementType!(typeof(r)) == immutable char));
3661
3662     // contrast with the range capabilities of standard strings (with or
3663     // without autodecoding enabled).
3664     auto s = "Hello, World!";
3665     static assert(isBidirectionalRange!(typeof(s))); // holds in both modes
3666     static if (isAutodecodableString!(typeof(s)))
3667     {
3668         // with autodecoding enabled, strings are non-random-access ranges of
3669         // dchar.
3670         static assert(is(ElementType!(typeof(s)) == dchar));
3671         static assert(!isRandomAccessRange!(typeof(s)));
3672         static assert(!hasSlicing!(typeof(s)));
3673         static assert(!hasLength!(typeof(s)));
3674     }
3675     else
3676     {
3677         // without autodecoding, strings are normal arrays.
3678         static assert(is(ElementType!(typeof(s)) == immutable char));
3679         static assert(isRandomAccessRange!(typeof(s)));
3680         static assert(hasSlicing!(typeof(s)));
3681         static assert(hasLength!(typeof(s)));
3682     }
3683 }
3684
3685 /// `byCodeUnit` does no Unicode decoding
3686 @safe unittest
3687 {
3688     string decomposed = "noe\u0308l"; // noël spelled as e + combining diaeresis
3689     assert(byCodeUnit(decomposed)[2] == 'e');
3690     assert(byCodeUnit(decomposed)[2] != 'ë');
3691
3692     string precomposed = "no\u00EBl"; // noël spelled with a precomposed ë
3693     // A string is UTF-8, so the code unit at index 2 is merely the first
3694     // unit of the sequence that encodes 'ë', not 'ë' itself
3695     assert(byCodeUnit(precomposed)[2] != 'ë');
3696 }
3697
3698 /// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
3699 @safe unittest
3700 {
3701     import std.algorithm.comparison : equal;
3702     import std.range : popFrontN;
3703     import std.traits : isAutodecodableString;
3704     {
3705         auto wrapped = "hello world".byCodeUnit;
3706         wrapped.popFrontN(3);
3707         assert(wrapped.save.equal("lo world"));
3708         static if (isAutodecodableString!string) // only enabled with autodecoding
3709         {
3710             string remainder = wrapped.source;
3711             assert(remainder == "lo world");
3712         }
3713     }
3714     // a dstring is returned as-is, so there is no wrapper and no source
3715     {
3716         auto unwrapped = "hello world"d.byCodeUnit;
3717         static assert(!__traits(compiles, unwrapped.source));
3718     }
3719 }
3720
3721 @safe pure nothrow @nogc unittest
3722 {
3723     import std.range;
3724     { // char: applying byCodeUnit twice must still yield every code unit unchanged
3725         enum testStr = "𐁄𐂌𐃯 hello ディラン";
3726         char[testStr.length] s;
3727         int i;
3728         foreach (c; testStr.byCodeUnit().byCodeUnit())
3729         {
3730             s[i++] = c;
3731         }
3732         assert(s == testStr);
3733     }
3734     { // same round trip for wchar
3735         enum testStr = "𐁄𐂌𐃯 hello ディラン"w;
3736         wchar[testStr.length] s;
3737         int i;
3738         foreach (c; testStr.byCodeUnit().byCodeUnit())
3739         {
3740             s[i++] = c;
3741         }
3742         assert(s == testStr);
3743     }
3744     { // same round trip for dchar
3745         enum testStr = "𐁄𐂌𐃯 hello ディラン"d;
3746         dchar[testStr.length] s;
3747         int i;
3748         foreach (c; testStr.byCodeUnit().byCodeUnit())
3749         {
3750             s[i++] = c;
3751         }
3752         assert(s == testStr);
3753     }
3754     { // length, indexing and slicing
3755         auto bcu = "hello".byCodeUnit();
3756         assert(bcu.length == 5);
3757         assert(bcu[3] == 'l');
3758         assert(bcu[2 .. 4][1] == 'l');
3759     }
3760     { // elements of a mutable source are writable through front and opIndex
3761         char[5] orig = "hello";
3762         auto bcu = orig[].byCodeUnit();
3763         bcu.front = 'H';
3764         assert(bcu.front == 'H');
3765         bcu[1] = 'E';
3766         assert(bcu[1] == 'E');
3767     }
3768     { // save: advancing the original leaves the saved copy untouched
3769         auto bcu = "hello".byCodeUnit().byCodeUnit();
3770         static assert(isForwardRange!(typeof(bcu)));
3771         static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
3772         auto s = bcu.save;
3773         bcu.popFront();
3774         assert(s.front == 'h');
3775     }
3776     { // bidirectional: retro walks the code units backwards
3777         auto bcu = "hello".byCodeUnit();
3778         static assert(hasSlicing!(typeof(bcu)));
3779         static assert(isBidirectionalRange!(typeof(bcu)));
3780         static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
3781         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3782         auto ret = bcu.retro;
3783         assert(ret.front == 'o');
3784         ret.popFront();
3785         assert(ret.front == 'l');
3786     }
3787     { // the same checks on a wstring
3788         auto bcu = "κόσμε"w.byCodeUnit();
3789         static assert(hasSlicing!(typeof(bcu)));
3790         static assert(isBidirectionalRange!(typeof(bcu)));
3791         static assert(is(typeof(bcu) == struct) == isAutodecodableString!wstring);
3792         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3793         auto ret = bcu.retro;
3794         assert(ret.front == 'ε');
3795         ret.popFront();
3796         assert(ret.front == 'μ');
3797     }
3798     { // struct that converts to string through alias this
3799         static struct Stringish
3800         {
3801             string s;
3802             alias s this;
3803         }
3804
3805         auto orig = Stringish("\U0010fff8 𐁊 foo 𐂓");
3806         auto bcu = orig.byCodeUnit();
3807         static assert(is(typeof(bcu) == struct));
3808         static assert(!is(typeof(bcu) == Stringish) == isAutodecodableString!Stringish);
3809         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3810         static assert(is(ElementType!(typeof(bcu)) == immutable char));
3811         assert(bcu.front == cast(char) 244); // 0xF4, the UTF-8 lead byte of U+10FFF8
3812     }
3813     { // struct that converts to wstring through alias this
3814         static struct WStringish
3815         {
3816             wstring s;
3817             alias s this;
3818         }
3819
3820         auto orig = WStringish("\U0010fff8 𐁊 foo 𐂓"w);
3821         auto bcu = orig.byCodeUnit();
3822         static assert(is(typeof(bcu) == struct));
3823         static assert(!is(typeof(bcu) == WStringish) == isAutodecodableString!WStringish);
3824         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3825         static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
3826         assert(bcu.front == cast(wchar) 56319); // 0xDBFF, the high surrogate of U+10FFF8
3827     }
3828     { // struct that converts to dstring: the plain dstring comes back
3829         static struct DStringish
3830         {
3831             dstring s;
3832             alias s this;
3833         }
3834
3835         auto orig = DStringish("\U0010fff8 𐁊 foo 𐂓"d);
3836         auto bcu = orig.byCodeUnit();
3837         static assert(is(typeof(bcu) == dstring));
3838         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3839         static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
3840         assert(bcu.front == cast(dchar) 1114104); // 0x10FFF8
3841     }
3842     { // alias this to a member function returning string
3843         static struct FuncStringish
3844         {
3845             string str;
3846             string s() pure nothrow @nogc { return str; }
3847             alias s this;
3848         }
3849
3850         auto orig = FuncStringish("\U0010fff8 𐁊 foo 𐂓");
3851         auto bcu = orig.byCodeUnit();
3852         static if (isAutodecodableString!FuncStringish)
3853             static assert(is(typeof(bcu) == struct));
3854         else
3855             static assert(is(typeof(bcu) == string));
3856         static assert(!is(typeof(bcu) == FuncStringish));
3857         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3858         static assert(is(ElementType!(typeof(bcu)) == immutable char));
3859         assert(bcu.front == cast(char) 244); // 0xF4, the UTF-8 lead byte of U+10FFF8
3860     }
3861     { // a user-defined input range of char is returned as its own type
3862         static struct Range
3863         {
3864             string data;
3865             bool empty() pure nothrow @nogc { return data.empty; }
3866             char front() pure nothrow @nogc { return data[0]; }
3867             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3868         }
3869
3870         auto orig = Range("\U0010fff8 𐁊 foo 𐂓");
3871         auto bcu = orig.byCodeUnit();
3872         static assert(is(typeof(bcu) == Range));
3873         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3874         static assert(is(ElementType!(typeof(bcu)) == char));
3875         assert(bcu.front == cast(char) 244); // 0xF4, the UTF-8 lead byte of U+10FFF8
3876     }
3877     { // likewise for a user-defined input range of wchar
3878         static struct WRange
3879         {
3880             wstring data;
3881             bool empty() pure nothrow @nogc { return data.empty; }
3882             wchar front() pure nothrow @nogc { return data[0]; }
3883             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3884         }
3885
3886         auto orig = WRange("\U0010fff8 𐁊 foo 𐂓"w);
3887         auto bcu = orig.byCodeUnit();
3888         static assert(is(typeof(bcu) == WRange));
3889         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3890         static assert(is(ElementType!(typeof(bcu)) == wchar));
3891         assert(bcu.front == 56319); // 0xDBFF, the high surrogate of U+10FFF8
3892     }
3893     { // likewise for a user-defined input range of dchar
3894         static struct DRange
3895         {
3896             dstring data;
3897             bool empty() pure nothrow @nogc { return data.empty; }
3898             dchar front() pure nothrow @nogc { return data[0]; }
3899             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3900         }
3901
3902         auto orig = DRange("\U0010fff8 𐁊 foo 𐂓"d);
3903         auto bcu = orig.byCodeUnit();
3904         static assert(is(typeof(bcu) == DRange));
3905         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3906         static assert(is(ElementType!(typeof(bcu)) == dchar));
3907         assert(bcu.front == 1114104); // 0x10FFF8
3908     }
3909     { // range members win over alias this: front comes from `data`, not from `s`
3910         static struct RangeAndStringish
3911         {
3912             bool empty() pure nothrow @nogc { return data.empty; }
3913             char front() pure nothrow @nogc { return data[0]; }
3914             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3915
3916             string data;
3917             string s;
3918             alias s this;
3919         }
3920
3921         auto orig = RangeAndStringish("test.d", "other");
3922         auto bcu = orig.byCodeUnit();
3923         static assert(is(typeof(bcu) == RangeAndStringish));
3924         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3925         static assert(is(ElementType!(typeof(bcu)) == char));
3926         assert(bcu.front == 't');
3927     }
3928     { // the same for a wchar range with a wstring alias this
3929         static struct WRangeAndStringish
3930         {
3931             bool empty() pure nothrow @nogc { return data.empty; }
3932             wchar front() pure nothrow @nogc { return data[0]; }
3933             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3934
3935             wstring data;
3936             wstring s;
3937             alias s this;
3938         }
3939
3940         auto orig = WRangeAndStringish("test.d"w, "other"w);
3941         auto bcu = orig.byCodeUnit();
3942         static assert(is(typeof(bcu) == WRangeAndStringish));
3943         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3944         static assert(is(ElementType!(typeof(bcu)) == wchar));
3945         assert(bcu.front == 't');
3946     }
3947     { // the same for a dchar range with a dstring alias this
3948         static struct DRangeAndStringish
3949         {
3950             bool empty() pure nothrow @nogc { return data.empty; }
3951             dchar front() pure nothrow @nogc { return data[0]; }
3952             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3953
3954             dstring data;
3955             dstring s;
3956             alias s this;
3957         }
3958
3959         auto orig = DRangeAndStringish("test.d"d, "other"d);
3960         auto bcu = orig.byCodeUnit();
3961         static assert(is(typeof(bcu) == DRangeAndStringish));
3962         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3963         static assert(is(ElementType!(typeof(bcu)) == dchar));
3964         assert(bcu.front == 't');
3965     }
3966     { // enum with a string base type: never returned as the enum type itself
3967         enum Enum : string { a = "test.d" }
3968
3969         auto orig = Enum.a;
3970         auto bcu = orig.byCodeUnit();
3971         static assert(!is(typeof(bcu) == Enum));
3972         static if (isAutodecodableString!Enum)
3973             static assert(is(typeof(bcu) == struct));
3974         else
3975             static assert(is(typeof(bcu) == string));
3976         static assert(is(ElementType!(typeof(bcu)) == immutable char));
3977         assert(bcu.front == 't');
3978     }
3979     { // enum with a wstring base type
3980         enum WEnum : wstring { a = "test.d"w }
3981
3982         auto orig = WEnum.a;
3983         auto bcu = orig.byCodeUnit();
3984         static assert(!is(typeof(bcu) == WEnum));
3985         static if (isAutodecodableString!WEnum)
3986             static assert(is(typeof(bcu) == struct));
3987         else
3988             static assert(is(typeof(bcu) == wstring));
3989         static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
3990         assert(bcu.front == 't');
3991     }
3992     { // enum with a dstring base type: comes back as a plain dstring
3993         enum DEnum : dstring { a = "test.d"d }
3994
3995         auto orig = DEnum.a;
3996         auto bcu = orig.byCodeUnit();
3997         static assert(is(typeof(bcu) == dstring));
3998         static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
3999         assert(bcu.front == 't');
4000     }
4001     // narrow strings are wrapped only when autodecoding is enabled; dstrings never are
4002     static if (autodecodeStrings)
4003     {
4004         static assert(!is(typeof(byCodeUnit("hello")) == string));
4005         static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
4006     }
4007     else
4008     {
4009         static assert(is(typeof(byCodeUnit("hello")) == string));
4010         static assert(is(typeof(byCodeUnit("hello"w)) == wstring));
4011     }
4012     static assert(is(typeof(byCodeUnit("hello"d)) == dstring));
4013     // static arrays, and enums based on them, are not accepted
4014     static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
4015     static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
4016     static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));
4017
4018     enum SEnum : char[5] { a = "hello" }
4019     enum WSEnum : wchar[5] { a = "hello"w }
4020     enum DSEnum : dchar[5] { a = "hello"d }
4021
4022     static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
4023     static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
4024     static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
4025 }
4026
4027 /****************************
4028  * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4029  * of characters by char, wchar, or dchar.
4030  * These aliases simply forward to $(LREF byUTF) with the corresponding `C` argument and
4031  * its default `useReplacementDchar`, so invalid UTF is replaced by $(LREF replacementDchar).
4032  *
4033  * Params:
4034  *      r = input range of characters, or array of characters
4035  */
4036 alias byChar = byUTF!char;
4037
4038 /// Ditto
4039 alias byWchar = byUTF!wchar;
4040
4041 /// Ditto
4042 alias byDchar = byUTF!dchar;
4043
4044 @safe pure nothrow @nogc unittest
4045 {
4046   { // char -> char: byChar applied twice must pass the code units through
4047     char[5] s;
4048     int i;
4049     foreach (c; "hello".byChar.byChar())
4050     {
4051         //writefln("[%d] '%c'", i, c);
4052         s[i++] = c;
4053     }
4054     assert(s == "hello");
4055   }
4056   { // dchar -> char, including two invalid code points
4057     char[5+2+3+4+3+3] s; // UTF-8 lengths: "hello", U+07FF, U+D7FF, U+10FFFF, U+FFFD twice
4058     int i;
4059     dchar[10] a;
4060     a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
4061     a[8] = 0xD800;   // invalid
4062     a[9] = cast(dchar) 0x110000; // invalid
4063     foreach (c; a[].byChar())
4064     {
4065         //writefln("[%d] '%c'", i, c);
4066         s[i++] = c;
4067     }
4068     assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"); // each invalid one became U+FFFD
4069   }
4070   { // wchar -> char
4071     auto r = "hello"w.byChar();
4072     r.popFront();
4073     r.popFront();
4074     assert(r.front == 'l');
4075   }
4076   { // dchar -> char
4077     auto r = "hello"d.byChar();
4078     r.popFront();
4079     r.popFront();
4080     assert(r.front == 'l');
4081   }
4082   { // save: advancing the original leaves the saved copy untouched
4083     auto r = "hello"d.byChar();
4084     static assert(isForwardRange!(typeof(r))); // a compile-time property, so check it statically
4085     auto s = r.save;
4086     r.popFront();
4087     assert(s.front == 'h');
4088   }
4089 }
4090
4091 @safe pure nothrow @nogc unittest
4092 {
4093   { // dchar -> wchar, including two invalid code points
4094     wchar[11] s;
4095     int i;
4096     dchar[10] a;
4097     a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
4098     a[8] = 0xD800;   // invalid
4099     a[9] = cast(dchar) 0x110000; // invalid
4100     foreach (c; a[].byWchar())
4101     {
4102         //writefln("[%d] '%c' x%x", i, c, c);
4103         s[i++] = c;
4104     }
4105     // The output must fill `s` exactly: "hello" (5) + U+07FF (1) + U+D7FF (1)
4106     // + U+10FFFF (surrogate pair, 2) + one U+FFFD for each of the two
4107     // invalid code points (2) = 11 code units.
4108     assert(i == s.length);
4109     assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
4110   }
4111
4112   { // char -> wchar
4113     auto r = "hello".byWchar();
4114     r.popFront();
4115     r.popFront();
4116     assert(r.front == 'l');
4117   }
4118   { // dchar -> wchar
4119     auto r = "hello"d.byWchar();
4120     r.popFront();
4121     r.popFront();
4122     assert(r.front == 'l');
4123   }
4124   { // save: advancing the original leaves the saved copy untouched
4125     auto r = "hello"d.byWchar();
4126     static assert(isForwardRange!(typeof(r))); // a compile-time property, so check it statically
4127     auto s = r.save;
4128     r.popFront();
4129     assert(s.front == 'h');
4130   }
4131 }
4132
4133 @safe pure nothrow @nogc unittest
4134 {
4135   { // char -> dchar
4136     dchar[9] s;
4137     int i;
4138     string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
4139     foreach (c; a.byDchar())
4140     {
4141         s[i++] = c;
4142     }
4143     assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
4144   }
4145   { // invalid UTF-8 decodes to the replacement character
4146     foreach (s; invalidUTFstrings!char())
4147     {
4148         auto r = s.byDchar();
4149         assert(!r.empty);
4150         assert(r.front == r.front); // reading front twice must give the same value
4151         dchar c = r.front;
4152         assert(c == replacementDchar);
4153     }
4154   }
4155   {
4156     auto r = "hello".byDchar();
4157     r.popFront();
4158     r.popFront();
4159     assert(r.front == 'l');
4160   }
4161
4162   { // wchar -> dchar
4163     dchar[8] s;
4164     int i;
4165     wstring a = "hello\u07FF\uD7FF\U0010FFFF"w;
4166     foreach (c; a.byDchar())
4167     {
4168         //writefln("[%d] '%c' x%x", i, c, c);
4169         s[i++] = c;
4170     }
4171     assert(s == "hello\u07FF\uD7FF\U0010FFFF"d);
4172   }
4173   { // invalid UTF-16 decodes to the replacement character
4174     foreach (s; invalidUTFstrings!wchar())
4175     {
4176         auto r = s.byDchar();
4177         assert(!r.empty);
4178         assert(r.front == r.front); // reading front twice must give the same value
4179         dchar c = r.front;
4180         assert(c == replacementDchar);
4181     }
4182   }
4183   {
4184     wchar[2] ws;
4185     ws[0] = 0xD800;
4186     ws[1] = 0xDD00;             // correct surrogate pair
4187     auto r = ws[].byDchar();
4188     assert(!r.empty);
4189     assert(r.front == r.front);
4190     dchar c = r.front;
4191     assert(c == '\U00010100');
4192   }
4193   {
4194     auto r = "hello"w.byDchar();
4195     r.popFront();
4196     r.popFront();
4197     assert(r.front == 'l');
4198   }
4199
4200   { // dchar -> dchar: byDchar applied twice must pass the code units through
4201     dchar[5] s;
4202     int i;
4203     dstring a = "hello"d;
4204     foreach (c; a.byDchar.byDchar())
4205     {
4206         //writefln("[%d] '%c' x%x", i, c, c);
4207         s[i++] = c;
4208     }
4209     assert(s == "hello"d);
4210   }
4211   { // save on a char source
4212     auto r = "hello".byDchar();
4213     static assert(isForwardRange!(typeof(r))); // a compile-time property, so check it statically
4214     auto s = r.save;
4215     r.popFront();
4216     assert(s.front == 'h');
4217   }
4218   { // save on a wchar source
4219     auto r = "hello"w.byDchar();
4220     static assert(isForwardRange!(typeof(r))); // a compile-time property, so check it statically
4221     auto s = r.save;
4222     r.popFront();
4223     assert(s.front == 'h');
4224   }
4225 }
4226
4227 // Check that byChar/byWchar/byDchar are usable from pure, @safe, nothrow, @nogc
4228 // code; they also have to support ranges with and without those attributes.
4229
4230 pure @safe nothrow @nogc unittest
4231 {
4232     dchar[5] s = "hello"d;
4233     foreach (c; byChar(s[]))  { }
4234     foreach (c; byWchar(s[])) { }
4235     foreach (c; byDchar(s[])) { }
4236 }
4237
4238 version (StdUnittest)
4239 private int impureVariable; // mutable module-level state; a test range's popFront increments it to be impure
4240
4241 @system unittest
4242 {
4243     static struct ImpureThrowingSystemRange(Char) // input range whose popFront is impure and throws
4244     {
4245         @property bool empty() const { return true; } // always empty, so popFront is never actually called below
4246         @property Char front() const { return Char.init; }
4247         void popFront()
4248         {
4249             impureVariable++; // touches module-level state
4250             throw new Exception("only for testing nothrow");
4251         }
4252     }
4253
4254     foreach (Char; AliasSeq!(char, wchar, dchar))
4255     {
4256         ImpureThrowingSystemRange!Char range;
4257         foreach (c; range.byChar())  { } // NOTE(review): presumably a compile-only check, the loop body never runs
4258         foreach (c; range.byWchar()) { }
4259         foreach (c; range.byDchar()) { }
4260     }
4261 }
4262
4263 /****************************
4264  * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4265  * of characters by char type `C` by encoding the elements of the range.
4266  *
4267  * UTF sequences that cannot be converted to the specified encoding are either
4268  * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
4269  * of the Unicode Standard 6.2 or result in a thrown UTFException.
4270  *  Hence byUTF is not symmetric.
4271  * This algorithm is lazy, and does not allocate memory.
4272  * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
4273  * `r` parameter.
4274  *
4275  * Params:
4276  *      C = `char`, `wchar`, or `dchar`
4277  *      useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
4278  *                            UseReplacementDchar.no means throw `UTFException` for invalid UTF
4279  *
4280  * Throws:
4281  *      `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
4282  *
4283  * GC:
4284  *      Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
4285  *
4286  * Returns:
4287  *      A bidirectional range if `R` is a bidirectional range and not auto-decodable,
4288  *      as defined by $(REF isAutodecodableString, std, traits).
4289  *
4290  *      A forward range if `R` is a forward range and not auto-decodable.
4291  *
4292  *      Or, if `R` is a range and it is auto-decodable and
4293  *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
4294  *      to $(LREF byCodeUnit).
4295  *
4296  *      Otherwise, an input range of characters.
4297  */
4298 template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar)
4299 if (isSomeChar!C)
4300 {
4301     static if (is(immutable C == immutable UC, UC) && !is(C == UC))
4302         alias byUTF = byUTF!UC;
4303     else:
4304
4305     auto ref byUTF(R)(R r)
4306     if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
4307     {
4308         return byUTF(r.byCodeUnit());
4309     }
4310
4311     // Overload for input ranges of characters that are not auto-decodable strings.
4312     // Returns `r.byCodeUnit()` when `r` already holds `C`; otherwise returns a lazy range
4313     // that decodes `r` one code point at a time and yields it as `C` code units.
4314     // NOTE(review): once `r` is drained, a code point still cached at one end is not handed
4315     // over to the other end; `front`/`back` then read the empty `r` - TODO handle.
4316     auto ref byUTF(R)(R r)
4317     if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
4318     {
4319         static if (is(immutable ElementEncodingType!R == immutable RC, RC) && is(RC == C))
4320         {
4321             return r.byCodeUnit();
4322         }
4323         else static if (is(C == dchar))
4324         {
4325             static struct Result
4326             {
4327                 enum Empty = uint.max;  // range is empty or just constructed
4328
4329                 this(return scope R r)
4330                 {
4331                     this.r = r;
4332                 }
4333
4334                 this(return scope R r, uint buff)
4335                 {
4336                     this.r = r;
4337                     this.buff = buff;
4338                 }
4339
4340                 static if (isBidirectionalRange!R)
4341                 {
4342                     this(return scope R r, uint frontBuff, uint backBuff)
4343                     {
4344                         this.r = r;
4345                         this.buff = frontBuff;
4346                         this.backBuff = backBuff;
4347                     }
4348                 }
4349
4350                 @property bool empty()
4351                 {
4352                     static if (isBidirectionalRange!R)
4353                         return buff == Empty && backBuff == Empty && r.empty;
4354                     else
4355                         return buff == Empty && r.empty;
4356                 }
4357
4358                 @property dchar front() scope // 'scope' required by call to decodeFront() below
4359                 {
4360                     if (buff == Empty)
4361                     {
4362                         auto c = r.front;
4363                         static if (is(RC == wchar))
4364                             enum firstMulti = 0xD800; // First high surrogate.
4365                         else
4366                             enum firstMulti = 0x80; // First non-ASCII.
4367                         if (c < firstMulti)
4368                         {
4369                             r.popFront;
4370                             buff = cast(dchar) c;
4371                         }
4372                         else
4373                         {
4374                             buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
4375                         }
4376                     }
4377                     return cast(dchar) buff;
4378                 }
4379
4380                 void popFront()
4381                 {
4382                     if (buff == Empty)
4383                         front();
4384                     buff = Empty;
4385                 }
4386
4387                 static if (isForwardRange!R)
4388                 {
4389                     @property auto save()
4390                     {
4391                         static if (isBidirectionalRange!R)
4392                             return Result(r.save, buff, backBuff);
4393                         else
4394                             return Result(r.save, buff);
4395                     }
4396                 }
4397
4398                 static if (isBidirectionalRange!R)
4399                 {
4400                     @property dchar back() scope // 'scope' required by call to decodeBack() below
4401                     {
4402                         if (backBuff != Empty)
4403                             return cast(dchar) backBuff;
4404
4405                         // Nothing cached at the back: decode the last code point of `r`.
4406                         auto c = r.back;
4407                         static if (is(RC == wchar))
4408                             enum firstMulti = 0xD800; // First high surrogate.
4409                         else
4410                             enum firstMulti = 0x80; // First non-ASCII.
4411                         if (c < firstMulti)
4412                         {
4413                             r.popBack;
4414                             backBuff = cast(dchar) c;
4415                         }
4416                         else
4417                         {
4418                             backBuff = () @trusted { return decodeBack!useReplacementDchar(r); }();
4419                         }
4420                         return cast(dchar) backBuff;
4421                     }
4422
4423                     void popBack()
4424                     {
4425                         if (backBuff == Empty)
4426                             back();
4427                         backBuff = Empty;
4428                     }
4429                 }
4430
4431             private:
4432
4433                 R r;
4434                 uint buff = Empty;      // one character lookahead buffer
4435                 static if (isBidirectionalRange!R)
4436                     uint backBuff = Empty;
4437             }
4438
4439             return Result(r);
4440         }
4441         else
4442         {
4443             static struct Result
4444             {
4445                 this(return scope R r)
4446                 {
4447                     this.r = r;
4448                 }
4449
4450                 this(return scope R r, ushort pos, ushort fill, C[4 / C.sizeof] buf)
4451                 {
4452                     this.r = r;
4453                     this.pos = pos;
4454                     this.fill = fill;
4455                     this.buf = buf;
4456                 }
4457
4458                 static if (isBidirectionalRange!R)
4459                 {
4460                     this(return scope R r, ushort frontPos, ushort frontFill,
4461                          ushort backPos, ushort backFill, C[4 / C.sizeof] buf, C[4 / C.sizeof] backBuf)
4462                     {
4463                         this.r = r;
4464                         this.pos = frontPos;
4465                         this.fill = frontFill;
4466                         this.backPos = backPos;
4467                         this.backFill = backFill;
4468                         this.buf = buf;
4469                         this.backBuf = backBuf;
4470                     }
4471                 }
4472
4473                 @property bool empty()
4474                 {
4475                     static if (isBidirectionalRange!R)
4476                         return pos == fill && backPos == backFill && r.empty;
4477                     else
4478                         return pos == fill && r.empty;
4479                 }
4480
4481                 @property auto front() scope // 'scope' required by call to decodeFront() below
4482                 {
4483                     if (pos == fill)
4484                     {
4485                         pos = 0;
4486                         auto c = r.front;
4487                         static if (C.sizeof >= 2 && RC.sizeof >= 2)
4488                             enum firstMulti = 0xD800; // First high surrogate.
4489                         else
4490                             enum firstMulti = 0x80; // First non-ASCII.
4491                         if (c < firstMulti)
4492                         {
4493                             fill = 1;
4494                             r.popFront;
4495                             buf[pos] = cast(C) c;
4496                         }
4497                         else
4498                         {
4499                             static if (is(RC == dchar))
4500                             {
4501                                 r.popFront;
4502                                 dchar dc = c;
4503                             }
4504                             else
4505                                 dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
4506                             fill = cast(ushort) encode!(useReplacementDchar)(buf, dc);
4507                         }
4508                     }
4509                     return buf[pos];
4510                 }
4511
4512                 void popFront()
4513                 {
4514                     if (pos == fill)
4515                         front;
4516                     ++pos;
4517                 }
4518
4519                 static if (isForwardRange!R)
4520                 {
4521                     @property auto save()
4522                     {
4523                         static if (isBidirectionalRange!R)
4524                             return Result(r.save, pos, fill, backPos, backFill, buf, backBuf);
4525                         else
4526                             return Result(r.save, pos, fill, buf);
4527                     }
4528                 }
4529
4530                 static if (isBidirectionalRange!R)
4531                 {
4532                     // Encodes into `backBuf`, never `buf`, so code units cached by `front` survive.
4533                     @property auto back() scope // 'scope' required by call to decodeBack() below
4534                     {
4535                         if (backPos != backFill)
4536                             return backBuf[cast(ushort) (backFill - backPos - 1)];
4537
4538                         backPos = 0;
4539                         auto c = r.back;
4540                         static if (C.sizeof >= 2 && RC.sizeof >= 2)
4541                             enum firstMulti = 0xD800; // First high surrogate.
4542                         else
4543                             enum firstMulti = 0x80; // First non-ASCII.
4544                         if (c < firstMulti)
4545                         {
4546                             backFill = 1;
4547                             r.popBack;
4548                             backBuf[cast(ushort) (backFill - backPos - 1)] = cast(C) c;
4549                         }
4550                         else
4551                         {
4552                             static if (is(RC == dchar))
4553                             {
4554                                 r.popBack;
4555                                 dchar dc = c;
4556                             }
4557                             else
4558                                 dchar dc = () @trusted { return decodeBack!(useReplacementDchar)(r); }();
4559                             backFill = cast(ushort) encode!(useReplacementDchar)(backBuf, dc);
4560                         }
4561                         return backBuf[cast(ushort) (backFill - backPos - 1)];
4562                     }
4563
4564                     void popBack()
4565                     {
4566                         if (backPos == backFill)
4567                             back;
4568                         ++backPos;
4569                     }
4570                 }
4571
4572             private:
4573
4574                 R r;
4575                 ushort pos, fill;
4576                 static if (isBidirectionalRange!R)
4577                 {
4578                     ushort backPos, backFill;
4579                     C[4 / C.sizeof] backBuf = void; // code units of the code point cached by `back`
4580                 }
4581                 C[4 / C.sizeof] buf = void;
4582             }
4583
4584             return Result(r);
4585         }
4586     }
4587 }
4588
4589 ///
4590 @safe pure nothrow unittest
4591 {
4592     import std.algorithm.comparison : equal;
4593
4594     // As UTF-8, the ö of "hellö" takes two `char` code units
4595     assert(equal("hell\u00F6".byUTF!char(), ['h', 'e', 'l', 'l', 0xC3, 0xB6]));
4596
4597     // As UTF-16, the same ö fits in a single `wchar` code unit
4598     assert(equal("hell\u00F6".byUTF!wchar(), ['h', 'e', 'l', 'l', 'ö']));
4599
4600     // 𐐷 needs four UTF-8 code units, two UTF-16 code units, or one UTF-32 code unit
4601     assert(equal("𐐷".byUTF!char(), [0xF0, 0x90, 0x90, 0xB7]));
4602     assert(equal("𐐷".byUTF!wchar(), [0xD801, 0xDC37]));
4603     assert(equal("𐐷".byUTF!dchar(), [0x00010437]));
4604 }
4605
4606 ///
4607 @safe unittest
4608 {
4609     import std.algorithm.comparison : equal;
4610     import std.exception : assertThrown;
4611     // Invalid UTF-8 either decodes to U+FFFD or throws, depending on UseReplacementDchar
4612     assert(equal("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.yes), "hello\uFFFDetty"));
4613     assertThrown!UTFException(equal("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.no), "hello betty"));
4614 }
4615
4616 @safe unittest
4617 {
4618     {
4619         // UTF-16 -> UTF-8 read from the back: U+0219 is the two code units 0xC8 0x99
4620         wchar[] s = ['a', 'b', 0x219];
4621         auto r = s.byUTF!char;
4622         static assert(isBidirectionalRange!(typeof(r)));
4623         assert(r.back == 0x99);
4624         r.popBack;
4625         assert(r.back == 0xc8);
4626         r.popBack;
4627         assert(r.back == 'b');
4628     }
4629
4630     {
4631         // UTF-16 -> UTF-16: the code units come back unchanged
4632         wchar[] s = ['a', 'b', 0x219];
4633         auto r = s.byUTF!wchar;
4634         static assert(isBidirectionalRange!(typeof(r)));
4635         assert(r.back == 0x219);
4636         r.popBack;
4637         assert(r.back == 'b');
4638     }
4639
4640     {
4641         wchar[] s = ['a', 'b', 0x219];
4642         auto r = s.byUTF!dchar;
4643         static assert(isBidirectionalRange!(typeof(r)));
4644         assert(r.back == 0x219);
4645         r.popBack;
4646         assert(r.back == 'b');
4647     }
4648
4649     {
4650         dchar[] s = ['𐐷', '😁'];
4651         auto r = s.byUTF!wchar;
4652         assert(r.back == 0xde01);
4653         r.popBack;
4654         assert(r.back == 0xd83d);
4655         r.popBack;
4656         assert(r.back == 0xdc37);
4657         r.popBack;
4658         assert(r.back == 0xd801);
4659     }
4660
4661     {
4662         dchar[] s = ['𐐷', '😁'];
4663         auto r = s.byUTF!char;
4664         char[] res;
4665         while (!r.empty)
4666         {
4667             res ~= r.back;
4668             r.popBack;
4669         }
4670         import std.algorithm.comparison : equal;
4671         assert(res.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0])); // all UTF-8 units, last first
4672     }
4673
4674     {
4675         dchar[] res;
4676         auto r = ['a', 'b', 'c', 'd', 'e'].byUTF!dchar;
4677         while (!r.empty)
4678         {
4679             res ~= r.back;
4680             r.popBack;
4681         }
4682         import std.algorithm.comparison : equal;
4683         assert(res.equal(['e', 'd', 'c', 'b', 'a']));
4684     }
4685
4686     {
4687         // save() taken after a popBack must report the same back element as the original
4688         wchar[] s = ['Ă','ț'];
4689
4690         auto rc = s.byUTF!char;
4691         rc.popBack;
4692         auto rcCopy = rc.save;
4693         assert(rc.back == rcCopy.back);
4694         assert(rcCopy.back == 0xc8);
4695
4696         auto rd = s.byUTF!dchar;
4697         rd.popBack;
4698         auto rdCopy = rd.save;
4699         assert(rd.back == rdCopy.back);
4700         assert(rdCopy.back == 'Ă');
4701     }
4702 }
4703
4704 ///
4705 @safe pure nothrow unittest
4706 {
4707     import std.range.primitives;
4708     wchar[] text = ['ă', 'î'];
4709
4710     auto utf8 = text.byUTF!char();      // read back to front, one UTF-8 code unit at a time
4711     static assert(isBidirectionalRange!(typeof(utf8)));
4712     assert(utf8.back == 0xae);
4713     utf8.popBack();
4714     assert(utf8.back == 0xc3);
4715     utf8.popBack();
4716     assert(utf8.back == 0x83);
4717     utf8.popBack();
4718     assert(utf8.back == 0xc4);
4719
4720     auto utf16 = text.byUTF!wchar();    // each letter is a single UTF-16 code unit
4721     static assert(isBidirectionalRange!(typeof(utf16)));
4722     assert(utf16.back == 'î');
4723     utf16.popBack();
4724     assert(utf16.back == 'ă');
4725
4726     auto utf32 = text.byUTF!dchar();    // one `dchar` per letter
4727     static assert(isBidirectionalRange!(typeof(utf32)));
4728     assert(utf32.back == 'î');
4729     utf32.popBack();
4730     assert(utf32.back == 'ă');
4731 }