libphobos/src/std/utf.d

   1 // Written in the D programming language.
   2
   3 /++
   4     Encode and decode UTF-8, UTF-16 and UTF-32 strings.
   5
   6     UTF character support is restricted to
   7     $(D '\u0000' &lt;= character &lt;= '\U0010FFFF').
   8
   9 $(SCRIPT inhibitQuickIndex = 1;)
  10 $(DIVC quickindex,
  11 $(BOOKTABLE,
  12 $(TR $(TH Category) $(TH Functions))
  13 $(TR $(TD Decode) $(TD
  14     $(LREF decode)
  15     $(LREF decodeFront)
  16 ))
  17 $(TR $(TD Lazy decode) $(TD
  18     $(LREF byCodeUnit)
  19     $(LREF byChar)
  20     $(LREF byWchar)
  21     $(LREF byDchar)
  22     $(LREF byUTF)
  23 ))
  24 $(TR $(TD Encode) $(TD
  25     $(LREF encode)
  26     $(LREF toUTF8)
  27     $(LREF toUTF16)
  28     $(LREF toUTF32)
  29     $(LREF toUTFz)
  30     $(LREF toUTF16z)
  31 ))
  32 $(TR $(TD Length) $(TD
  33     $(LREF codeLength)
  34     $(LREF count)
  35     $(LREF stride)
  36     $(LREF strideBack)
  37 ))
  38 $(TR $(TD Index) $(TD
  39     $(LREF toUCSindex)
  40     $(LREF toUTFindex)
  41 ))
  42 $(TR $(TD Validation) $(TD
  43     $(LREF isValidDchar)
  44     $(LREF isValidCodepoint)
  45     $(LREF validate)
  46 ))
  47 $(TR $(TD Miscellaneous) $(TD
  48     $(LREF replacementDchar)
  49     $(LREF UseReplacementDchar)
  50     $(LREF UTFException)
  51 ))
  52 ))
  53     See_Also:
  54         $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
  55         $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
  56         $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
  57     Copyright: Copyright The D Language Foundation 2000 - 2012.
  58     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
  59     Authors:   $(HTTP digitalmars.com, Walter Bright) and
  60                $(HTTP jmdavisprog.com, Jonathan M Davis)
  61     Source:    $(PHOBOSSRC std/utf.d)
  62    +/
  63 module std.utf;
  64
  65 import std.exception : basicExceptionCtors;
  66 import core.exception : UnicodeException;
  67 import std.meta : AliasSeq;
  68 import std.range.primitives;
  69 import std.traits : isAutodecodableString, isPointer, isSomeChar,
  70     isSomeString, isStaticArray, Unqual, isConvertibleToString;
  71 import std.typecons : Flag, Yes, No;
  72
  73
  74 /++
  75     Exception thrown on errors in std.utf functions.
  76   +/
  77 class UTFException : UnicodeException
  78 {
  79     import core.internal.string : unsignedToTempString, UnsignedStringBuf;
  80
  81     uint[4] sequence; // code units recorded by setSequence
  82     size_t  len;      // how many entries of `sequence` are in use
  83
  84     @safe pure nothrow @nogc
  85     UTFException setSequence(scope uint[] data...) return
  86     {
  87         assert(data.length <= 4);
  88
  89         len = data.length < 4 ? data.length : 4; // clamp to the capacity of `sequence`
  90         sequence[0 .. len] = data[0 .. len];
  91
  92         return this; // hand back this exception so the call can be chained
  93     }
  94
  95     // FIXME: Use std.exception.basicExceptionCtors here once
  96     // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed
  97
  98     /**
  99     Standard exception constructors.
  100      */
  101     this(string msg, string file = __FILE__, size_t line = __LINE__,
  102          Throwable next = null) @nogc @safe pure nothrow
  103     {
  104         super(msg, 0, file, line, next); // no index supplied: 0 is passed to the base class
  105     }
  106     /// ditto
  107     this(string msg, size_t index, string file = __FILE__,
  108          size_t line = __LINE__, Throwable next = null) @safe pure nothrow
  109     {
  110         UnsignedStringBuf buf = void; // scratch space for the digits, deliberately left uninitialized
  111         msg ~= " (at index " ~ unsignedToTempString(index, buf) ~ ")"; // the message itself carries the index
  112         super(msg, index, file, line, next);
  113     }
  114
  115     /**
  116     Returns:
  117         A `string` detailing the invalid UTF sequence.
  118      */
  119     override string toString() const
  120     {
  121         if (len == 0) // no sequence was recorded: use the inherited formatting
  122         {
  123             /* Exception.toString() is not marked as const, although
  124              * it is const-compatible.
  125              */
  126             //return super.toString();
  127             auto e = () @trusted { return cast(Exception) super; } (); // cast drops `const` so toString can be called
  128             return e.toString();
  129         }
  130
  131         string result = "Invalid UTF sequence:";
  132
  133         foreach (i; sequence[0 .. len]) // each unit is appended as: space, base-16 digits, then 'x'
  134         {
  135             UnsignedStringBuf buf = void;
  136             result ~= ' ';
  137             auto h = unsignedToTempString!16(i, buf);
  138             if (h.length == 1) // pad single-digit values to two digits
  139                 result ~= '0';
  140             result ~= h;
  141             result ~= 'x'; // NOTE(review): the 'x' follows the digits rather than forming a "0x" prefix - confirm intent
  142         }
  143
  144         if (super.msg.length > 0) // append the message only when there is one
  145         {
  146             result ~= " - ";
  147             result ~= super.msg;
  148         }
  149
  150         return result;
  151     }
  152 }
 153
 154 ///
 155 @safe unittest
 156 {
 157     import std.exception : assertThrown;
 158
 159     char[4] buf;
 160     assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
 161     assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
 162     assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
 163     assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
 164     assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
 165 }
 166
 167 /*
 168    Provide array of invalidly encoded UTF strings. Useful for testing.
 169
 170    Params:
 171         Char = char, wchar, or dchar
 172
 173    Returns:
 174         an array of invalidly encoded UTF strings
 175  */
 176
  177 package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
  178 if (isSomeChar!Char)
  179 {
  180     static if (is(Char == char))
  181     {
  182         enum x = 0xDC00;         // invalid surrogate value
  183         enum y = 0x110000;       // out of range
  184
  185         static immutable string[8] result =
  186         [
  187             "\x80",             // not a start byte
  188             "\xC0",             // truncated
  189             "\xC0\xC0",         // invalid continuation
  190             "\xF0\x82\x82\xAC", // overlong
  191             [                   // `x` packed into the three-byte UTF-8 bit layout
  192               0xE0 | (x >> 12),
  193               0x80 | ((x >> 6) & 0x3F),
  194               0x80 | (x & 0x3F)
  195             ],
  196             [                   // `y` packed into the four-byte UTF-8 bit layout
  197               cast(char)(0xF0 | (y >> 18)),
  198               cast(char)(0x80 | ((y >> 12) & 0x3F)),
  199               cast(char)(0x80 | ((y >> 6) & 0x3F)),
  200               cast(char)(0x80 | (y & 0x3F))
  201             ],
  202             [
  203               cast(char)(0xF8 | 3),     // 5 byte encoding
  204               cast(char)(0x80 | 3),
  205               cast(char)(0x80 | 3),
  206               cast(char)(0x80 | 3),
  207               cast(char)(0x80 | 3),
  208             ],
  209             [
  210               cast(char)(0xFC | 3),     // 6 byte encoding
  211               cast(char)(0x80 | 3),
  212               cast(char)(0x80 | 3),
  213               cast(char)(0x80 | 3),
  214               cast(char)(0x80 | 3),
  215               cast(char)(0x80 | 3),
  216             ],
  217         ];
  218
  219         return result[]; // slice of the static table
  220     }
  221     else static if (is(Char == wchar))
  222     {
  223         static immutable wstring[5] result =
  224         [
  225             [
  226               cast(wchar) 0xDC00, // low surrogate with no high surrogate before it
  227             ],
  228             [
  229               cast(wchar) 0xDFFF, // low surrogate with no high surrogate before it
  230             ],
  231             [
  232               cast(wchar) 0xDBFF, // high surrogate ...
  233               cast(wchar) 0xDBFF, // ... followed by another high surrogate
  234             ],
  235             [
  236               cast(wchar) 0xDBFF, // high surrogate ...
  237               cast(wchar) 0xE000, // ... followed by a non-surrogate unit
  238             ],
  239             [
  240               cast(wchar) 0xD800, // high surrogate with nothing after it
  241             ],
  242         ];
  243
  244         return result[]; // slice of the static table
  245     }
  246     else static if (is(Char == dchar))
  247     {
  248         static immutable dstring[3] result =
  249         [
  250             [ cast(dchar) 0x110000 ], // above 0x10FFFF
  251             [ cast(dchar) 0x00D800 ], // first value of the surrogate range
  252             [ cast(dchar) 0x00DFFF ], // last value of the surrogate range
  253         ];
  254
  255         return result; // NOTE(review): returns the static array itself, unlike the `result[]` slices above
  256     }
  257     else
  258         static assert(0);
  259 }
 260
 261 /++
 262     Check whether the given Unicode code point is valid.
 263
 264     Params:
 265         c = code point to check
 266
 267     Returns:
 268         `true` if and only if `c` is a valid Unicode code point
 269
 270     Note:
 271     `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
 272     as they are permitted for internal use by an application, but they are
 273     not allowed for interchange by the Unicode standard.
 274   +/
 275 bool isValidDchar(dchar c) pure nothrow @safe @nogc
 276 {
 277     return c < 0xD800 || (c > 0xDFFF && c <= 0x10FFFF);
 278 }
 279
 280 ///
 281 @safe @nogc pure nothrow unittest
 282 {
 283     assert( isValidDchar(cast(dchar) 0x41));
 284     assert( isValidDchar(cast(dchar) 0x00));
 285     assert(!isValidDchar(cast(dchar) 0xD800));
 286     assert(!isValidDchar(cast(dchar) 0x11FFFF));
 287 }
 288
 289 pure nothrow @safe @nogc unittest
 290 {
 291     import std.exception;
 292
 293     assertCTFEable!(
 294     {
 295     assert( isValidDchar(cast(dchar)'a') == true);
 296     assert( isValidDchar(cast(dchar) 0x1FFFFF) == false);
 297
 298     assert(!isValidDchar(cast(dchar) 0x00D800));
 299     assert(!isValidDchar(cast(dchar) 0x00DBFF));
 300     assert(!isValidDchar(cast(dchar) 0x00DC00));
 301     assert(!isValidDchar(cast(dchar) 0x00DFFF));
 302     assert( isValidDchar(cast(dchar) 0x00FFFE));
 303     assert( isValidDchar(cast(dchar) 0x00FFFF));
 304     assert( isValidDchar(cast(dchar) 0x01FFFF));
 305     assert( isValidDchar(cast(dchar) 0x10FFFF));
 306     assert(!isValidDchar(cast(dchar) 0x110000));
 307     });
 308 }
 309
 310 /**
 311 Checks if a single character forms a valid code point.
 312
 313 When standing alone, some characters are invalid code points. For
 314 example the `wchar` `0xD800` is a so called high surrogate, which can
 315 only be interpreted together with a low surrogate following it. As a
 316 standalone character it is considered invalid.
 317
 318 See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
 319 Unicode Standard, D90, D91 and D92) for more details.
 320
 321 Params:
 322     c = character to test
 323     Char = character type of `c`
 324
 325 Returns:
 326     `true`, if `c` forms a valid code point.
 327  */
 328 bool isValidCodepoint(Char)(Char c)
 329 if (isSomeChar!Char)
 330 {
 331     alias UChar = Unqual!Char;
 332     static if (is(UChar == char))
 333     {
 334         return c <= 0x7F;
 335     }
 336     else static if (is(UChar == wchar))
 337     {
 338         return c <= 0xD7FF || c >= 0xE000;
 339     }
 340     else static if (is(UChar == dchar))
 341     {
 342         return isValidDchar(c);
 343     }
 344     else
 345         static assert(false, "unknown character type: `" ~ Char.stringof ~ "`");
 346 }
 347
 348 ///
 349 @safe pure nothrow unittest
 350 {
 351     assert( isValidCodepoint(cast(char) 0x40));
 352     assert(!isValidCodepoint(cast(char) 0x80));
 353     assert( isValidCodepoint(cast(wchar) 0x1234));
 354     assert(!isValidCodepoint(cast(wchar) 0xD800));
 355     assert( isValidCodepoint(cast(dchar) 0x0010FFFF));
 356     assert(!isValidCodepoint(cast(dchar) 0x12345678));
 357 }
 358
 359 /++
 360     Calculate the length of the UTF sequence starting at `index`
 361     in `str`.
 362
 363     Params:
 364         str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 365         of UTF code units. Must be random access if `index` is passed
 366         index = starting index of UTF sequence (default: `0`)
 367
 368     Returns:
 369         The number of code units in the UTF sequence. For UTF-8, this is a
 370         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
 371         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
 372
 373     Throws:
 374         May throw a `UTFException` if `str[index]` is not the start of a
 375         valid UTF sequence.
 376
 377     Note:
 378         `stride` will only analyze the first `str[index]` element. It
 379         will not fully verify the validity of the UTF sequence, nor even verify
 380         the presence of the sequence: it will not actually guarantee that
 381         $(D index + stride(str, index) <= str.length).
 382   +/
 383 uint stride(S)(auto ref S str, size_t index)
 384 if (is(S : const char[]) ||
 385     (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
 386 {
 387     static if (is(typeof(str.length) : ulong))
 388         assert(index < str.length, "Past the end of the UTF-8 sequence");
 389     immutable c = str[index];
 390
 391     if (c < 0x80)
 392         return 1;
 393     else
 394         return strideImpl(c, index);
 395 }
 396
 397 /// Ditto
 398 uint stride(S)(auto ref S str)
 399 if (is(S : const char[]) ||
 400     (isInputRange!S && is(immutable ElementType!S == immutable char)))
 401 {
 402     static if (is(S : const char[]))
 403         immutable c = str[0];
 404     else
 405         immutable c = str.front;
 406
 407     if (c < 0x80)
 408         return 1;
 409     else
 410         return strideImpl(c, 0);
 411 }
 412
 413 @system unittest
 414 {
 415     import core.exception : AssertError;
 416     import std.conv : to;
 417     import std.exception;
 418     import std.string : format;
 419     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 420     static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
 421     {
 422         enforce(stride(s, i) == codeLength!char(c),
 423                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 424
 425         enforce(stride(RandomCU!char(s), i) == codeLength!char(c),
 426                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 427
 428         auto refRandom = new RefRandomCU!char(s);
 429         immutable randLen = refRandom.length;
 430         enforce(stride(refRandom, i) == codeLength!char(c),
 431                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 432         enforce(refRandom.length == randLen,
 433                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 434
 435         if (i == 0)
 436         {
 437             enforce(stride(s) == codeLength!char(c),
 438                     new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
 439
 440             enforce(stride(InputCU!char(s)) == codeLength!char(c),
 441                     new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
 442
 443             auto refBidir = new RefBidirCU!char(s);
 444             immutable bidirLen = refBidir.length;
 445             enforce(stride(refBidir) == codeLength!char(c),
 446                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 447             enforce(refBidir.length == bidirLen,
 448                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
 449         }
 450     }
 451
 452     assertCTFEable!(
 453     {
 454     test("a", 'a');
 455     test(" ", ' ');
 456     test("\u2029", '\u2029'); //paraSep
 457     test("\u0100", '\u0100');
 458     test("\u0430", '\u0430');
 459     test("\U00010143", '\U00010143');
 460     test("abcdefcdef", 'a');
 461     test("hello\U00010143\u0100\U00010143", 'h', 0);
 462     test("hello\U00010143\u0100\U00010143", 'e', 1);
 463     test("hello\U00010143\u0100\U00010143", 'l', 2);
 464     test("hello\U00010143\u0100\U00010143", 'l', 3);
 465     test("hello\U00010143\u0100\U00010143", 'o', 4);
 466     test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
 467     test("hello\U00010143\u0100\U00010143", '\u0100', 9);
 468     test("hello\U00010143\u0100\U00010143", '\U00010143', 11);
 469
 470     foreach (S; AliasSeq!(char[], const char[], string))
 471     {
 472         enum str = to!S("hello world");
 473         static assert(isSafe!({ stride(str, 0); }));
 474         static assert(isSafe!({ stride(str);    }));
 475         static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
 476         static assert((functionAttributes!({ stride(str);    }) & FunctionAttribute.pure_) != 0);
 477     }
 478     });
 479 }
 480
 481 @safe unittest // invalid start bytes
 482 {
 483     import std.exception : assertThrown;
 484     immutable char[] invalidStartBytes = [
 485         0b1111_1000, // indicating a sequence length of 5
 486         0b1111_1100, // 6
 487         0b1111_1110, // 7
 488         0b1111_1111, // 8
 489         0b1000_0000, // continuation byte
 490     ];
 491     foreach (c; invalidStartBytes)
 492         assertThrown!UTFException(stride([c]));
 493 }
 494
 495 /// Ditto
 496 uint stride(S)(auto ref S str, size_t index)
 497 if (is(S : const wchar[]) ||
 498     (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
 499 {
 500     static if (is(typeof(str.length) : ulong))
 501         assert(index < str.length, "Past the end of the UTF-16 sequence");
 502     immutable uint u = str[index];
 503     return 1 + (u >= 0xD800 && u <= 0xDBFF);
 504 }
 505
 506 /// Ditto
  507 uint stride(S)(auto ref S str) @safe pure
  508 if (is(S : const wchar[]))
  509 {
  510     return stride(str, 0); // forwards to the indexed overload at index 0
  511 }
 512
 513 /// Ditto
 514 uint stride(S)(auto ref S str)
 515 if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
 516     !is(S : const wchar[]))
 517 {
 518     assert(!str.empty, "UTF-16 sequence is empty");
 519     immutable uint u = str.front;
 520     return 1 + (u >= 0xD800 && u <= 0xDBFF);
 521 }
 522
 523 @system unittest
 524 {
 525     import core.exception : AssertError;
 526     import std.conv : to;
 527     import std.exception;
 528     import std.string : format;
 529     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 530     static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
 531     {
 532         enforce(stride(s, i) == codeLength!wchar(c),
 533                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 534
 535         enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c),
 536                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 537
 538         auto refRandom = new RefRandomCU!wchar(s);
 539         immutable randLen = refRandom.length;
 540         enforce(stride(refRandom, i) == codeLength!wchar(c),
 541                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 542         enforce(refRandom.length == randLen,
 543                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 544
 545         if (i == 0)
 546         {
 547             enforce(stride(s) == codeLength!wchar(c),
 548                     new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
 549
 550             enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c),
 551                     new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
 552
 553             auto refBidir = new RefBidirCU!wchar(s);
 554             immutable bidirLen = refBidir.length;
 555             enforce(stride(refBidir) == codeLength!wchar(c),
 556                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 557             enforce(refBidir.length == bidirLen,
 558                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
 559         }
 560     }
 561
 562     assertCTFEable!(
 563     {
 564     test("a", 'a');
 565     test(" ", ' ');
 566     test("\u2029", '\u2029'); //paraSep
 567     test("\u0100", '\u0100');
 568     test("\u0430", '\u0430');
 569     test("\U00010143", '\U00010143');
 570     test("abcdefcdef", 'a');
 571     test("hello\U00010143\u0100\U00010143", 'h', 0);
 572     test("hello\U00010143\u0100\U00010143", 'e', 1);
 573     test("hello\U00010143\u0100\U00010143", 'l', 2);
 574     test("hello\U00010143\u0100\U00010143", 'l', 3);
 575     test("hello\U00010143\u0100\U00010143", 'o', 4);
 576     test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
 577     test("hello\U00010143\u0100\U00010143", '\u0100', 7);
 578     test("hello\U00010143\u0100\U00010143", '\U00010143', 8);
 579
 580     foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
 581     {
 582         enum str = to!S("hello world");
 583         static assert(isSafe!(() => stride(str, 0)));
 584         static assert(isSafe!(() => stride(str)   ));
 585         static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
 586         static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
 587     }
 588     });
 589 }
 590
 591 /// Ditto
  592 uint stride(S)(auto ref S str, size_t index = 0)
  593 if (is(S : const dchar[]) ||
  594     (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
  595 {
  596     static if (is(typeof(str.length) : ulong)) // sources with a length: bounds-check `index`
  597         assert(index < str.length, "Past the end of the UTF-32 sequence");
  598     else // no length to compare against: only require a non-empty range
  599         assert(!str.empty, "UTF-32 sequence is empty.");
  600     return 1; // always 1; the code unit's value is never read
  601 }
 602
 603 ///
 604 @safe unittest
 605 {
 606     assert("a".stride == 1);
 607     assert("λ".stride == 2);
 608     assert("aλ".stride == 1);
 609     assert("aλ".stride(1) == 2);
 610     assert("𐐷".stride == 4);
 611 }
 612
 613 @system unittest
 614 {
 615     import core.exception : AssertError;
 616     import std.conv : to;
 617     import std.exception;
 618     import std.string : format;
 619     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 620     static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
 621     {
 622         enforce(stride(s, i) == codeLength!dchar(c),
 623                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 624
 625         enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c),
 626                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 627
 628         auto refRandom = new RefRandomCU!dchar(s);
 629         immutable randLen = refRandom.length;
 630         enforce(stride(refRandom, i) == codeLength!dchar(c),
 631                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 632         enforce(refRandom.length == randLen,
 633                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 634
 635         if (i == 0)
 636         {
 637             enforce(stride(s) == codeLength!dchar(c),
 638                     new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line));
 639
 640             enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c),
 641                     new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line));
 642
 643             auto refBidir = new RefBidirCU!dchar(s);
 644             immutable bidirLen = refBidir.length;
 645             enforce(stride(refBidir) == codeLength!dchar(c),
 646                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 647             enforce(refBidir.length == bidirLen,
 648                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
 649         }
 650     }
 651
 652     assertCTFEable!(
 653     {
 654     test("a", 'a');
 655     test(" ", ' ');
 656     test("\u2029", '\u2029'); //paraSep
 657     test("\u0100", '\u0100');
 658     test("\u0430", '\u0430');
 659     test("\U00010143", '\U00010143');
 660     test("abcdefcdef", 'a');
 661     test("hello\U00010143\u0100\U00010143", 'h', 0);
 662     test("hello\U00010143\u0100\U00010143", 'e', 1);
 663     test("hello\U00010143\u0100\U00010143", 'l', 2);
 664     test("hello\U00010143\u0100\U00010143", 'l', 3);
 665     test("hello\U00010143\u0100\U00010143", 'o', 4);
 666     test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
 667     test("hello\U00010143\u0100\U00010143", '\u0100', 6);
 668     test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
 669
 670     foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
 671     {
 672         enum str = to!S("hello world");
 673         static assert(isSafe!(() => stride(str, 0)));
 674         static assert(isSafe!(() => stride(str)   ));
 675         static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
 676         static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
 677     }
 678     });
 679 }
 680
  681 private uint strideImpl(char c, size_t index) @trusted pure
  682 in { assert(c & 0x80); } // contract: only bytes with the top bit set are passed in
  683 do
  684 {
  685     import core.bitop : bsr;
  686     immutable msbs = 7 - bsr((~uint(c)) & 0xFF); // number of leading 1 bits in `c`
  687     if (c == 0xFF || msbs < 2 || msbs > 4) // 0xFF inverts to 0 (presumably no meaningful bsr - confirm); 1 or 5+ leading ones cannot start a sequence
  688         throw new UTFException("Invalid UTF-8 sequence", index);
  689     return msbs; // the leading-ones count (2 to 4) is the sequence length
  690 }
 691
 692 /++
 693     Calculate the length of the UTF sequence ending one code unit before
 694     `index` in `str`.
 695
 696     Params:
 697         str = bidirectional range of UTF code units. Must be random access if
 698         `index` is passed
 699         index = index one past end of UTF sequence (default: `str.length`)
 700
 701     Returns:
 702         The number of code units in the UTF sequence. For UTF-8, this is a
 703         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
 704         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
 705
 706     Throws:
 707         May throw a `UTFException` if `str[index]` is not one past the
 708         end of a valid UTF sequence.
 709
 710     Note:
  711         `strideBack` will only analyze the element at $(D str[index - 1]).
  712         It will not fully verify the validity of the UTF sequence, nor
 713         even verify the presence of the sequence: it will not actually
 714         guarantee that $(D strideBack(str, index) <= index).
 715   +/
  716 uint strideBack(S)(auto ref S str, size_t index)
  717 if (is(S : const char[]) ||
  718     (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
  719 {
  720     static if (is(typeof(str.length) : ulong))
  721         assert(index <= str.length, "Past the end of the UTF-8 sequence");
  722     assert(index > 0, "Not the end of the UTF-8 sequence");
  723
  724     if ((str[index-1] & 0b1100_0000) != 0b1000_0000) // last unit is not a 10xxxxxx continuation byte: stride is 1
  725         return 1;
  726
  727     if (index >= 4) //single verification for most common case
  728     {
  729         static foreach (i; 2 .. 5) // unrolled for i = 2, 3, 4; index >= 4 keeps every str[index-i] in bounds
  730         {
  731             if ((str[index-i] & 0b1100_0000) != 0b1000_0000)
  732                 return i;
  733         }
  734     }
  735     else
  736     {
  737         static foreach (i; 2 .. 4) // fewer than 4 units before `index`: try i = 2, 3 and guard each access
  738         {
  739             if (index >= i && (str[index-i] & 0b1100_0000) != 0b1000_0000)
  740                 return i;
  741         }
  742     }
  743     throw new UTFException("Not the end of the UTF sequence", index); // every inspected unit was a continuation byte
  744 }
 745
 746 /// Ditto
  747 uint strideBack(S)(auto ref S str)
  748 if (is(S : const char[]) ||
  749     (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char)))
  750 {
  751     return strideBack(str, str.length); // measure the sequence that ends at the end of `str`
  752 }
 753
 754 /// Ditto
  755 uint strideBack(S)(auto ref S str)
  756 if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S)
  757 {
  758     assert(!str.empty, "Past the end of the UTF-8 sequence");
  759     auto temp = str.save; // the scan pops from this saved copy, not from `str`
  760     foreach (i; AliasSeq!(1, 2, 3, 4)) // at most four code units are examined, counting from the back
  761     {
  762         if ((temp.back & 0b1100_0000) != 0b1000_0000) // first non-continuation byte from the back ends the scan
  763             return i;
  764         temp.popBack();
  765         if (temp.empty) // input exhausted while still seeing continuation bytes
  766             break;
  767     }
  768     throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
  769 }
 770
 771 @system unittest
 772 {
 773     import core.exception : AssertError;
 774     import std.conv : to;
 775     import std.exception;
 776     import std.string : format;
 777     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 778     static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
 779     {
 780         enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!char(c),
 781                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 782
 783         enforce(strideBack(RandomCU!char(s), i == size_t.max ? s.length : i) == codeLength!char(c),
 784                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 785
 786         auto refRandom = new RefRandomCU!char(s);
 787         immutable randLen = refRandom.length;
 788         enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!char(c),
 789                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 790         enforce(refRandom.length == randLen,
 791                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 792
 793         if (i == size_t.max)
 794         {
 795             enforce(strideBack(s) == codeLength!char(c),
 796                     new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));
 797
 798             enforce(strideBack(BidirCU!char(s)) == codeLength!char(c),
 799                     new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));
 800
 801             auto refBidir = new RefBidirCU!char(s);
 802             immutable bidirLen = refBidir.length;
 803             enforce(strideBack(refBidir) == codeLength!char(c),
 804                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 805             enforce(refBidir.length == bidirLen,
 806                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
 807         }
 808     }
 809
 810     assertCTFEable!(
 811     {
 812     test("a", 'a');
 813     test(" ", ' ');
 814     test("\u2029", '\u2029'); //paraSep
 815     test("\u0100", '\u0100');
 816     test("\u0430", '\u0430');
 817     test("\U00010143", '\U00010143');
 818     test("abcdefcdef", 'f');
 819     test("\U00010143\u0100\U00010143hello", 'o', 15);
 820     test("\U00010143\u0100\U00010143hello", 'l', 14);
 821     test("\U00010143\u0100\U00010143hello", 'l', 13);
 822     test("\U00010143\u0100\U00010143hello", 'e', 12);
 823     test("\U00010143\u0100\U00010143hello", 'h', 11);
 824     test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
 825     test("\U00010143\u0100\U00010143hello", '\u0100', 6);
 826     test("\U00010143\u0100\U00010143hello", '\U00010143', 4);
 827
 828     foreach (S; AliasSeq!(char[], const char[], string))
 829     {
 830         enum str = to!S("hello world");
 831         static assert(isSafe!({ strideBack(str, 0); }));
 832         static assert(isSafe!({ strideBack(str);    }));
 833         static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
 834         static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
 835     }
 836     });
 837 }
 838
 839 //UTF-16 is self synchronizing: The length of strideBack can be found from
 840 //the value of a single wchar
 841 /// Ditto
 842 uint strideBack(S)(auto ref S str, size_t index)
 843 if (is(S : const wchar[]) ||
 844     (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
 845 {
 846     static if (is(typeof(str.length) : ulong))
 847         assert(index <= str.length, "Past the end of the UTF-16 sequence");
 848     assert(index > 0, "Not the end of a UTF-16 sequence");
 849
 850     immutable c2 = str[index-1];
 851     return 1 + (0xDC00 <= c2 && c2 < 0xE000);
 852 }
 853
 854 /// Ditto
 855 uint strideBack(S)(auto ref S str)
 856 if (is(S : const wchar[]) ||
 857     (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
 858 {
 859     assert(!str.empty, "UTF-16 sequence is empty");
 860
 861     static if (is(S : const(wchar)[]))
 862         immutable c2 = str[$ - 1];
 863     else
 864         immutable c2 = str.back;
 865
 866     return 1 + (0xDC00 <= c2 && c2 <= 0xE000);
 867 }
 868
 869 @system unittest
 870 {
 871     import core.exception : AssertError;
 872     import std.conv : to;
 873     import std.exception;
 874     import std.string : format;
 875     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 876     static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
 877     {
 878         enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!wchar(c),
 879                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 880
 881         enforce(strideBack(RandomCU!wchar(s), i == size_t.max ? s.length : i) == codeLength!wchar(c),
 882                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 883
 884         auto refRandom = new RefRandomCU!wchar(s);
 885         immutable randLen = refRandom.length;
 886         enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!wchar(c),
 887                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 888         enforce(refRandom.length == randLen,
 889                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 890
 891         if (i == size_t.max)
 892         {
 893             enforce(strideBack(s) == codeLength!wchar(c),
 894                     new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));
 895
 896             enforce(strideBack(BidirCU!wchar(s)) == codeLength!wchar(c),
 897                     new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));
 898
 899             auto refBidir = new RefBidirCU!wchar(s);
 900             immutable bidirLen = refBidir.length;
 901             enforce(strideBack(refBidir) == codeLength!wchar(c),
 902                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 903             enforce(refBidir.length == bidirLen,
 904                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
 905         }
 906     }
 907
 908     assertCTFEable!(
 909     {
 910     test("a", 'a');
 911     test(" ", ' ');
 912     test("\u2029", '\u2029'); //paraSep
 913     test("\u0100", '\u0100');
 914     test("\u0430", '\u0430');
 915     test("\U00010143", '\U00010143');
 916     test("abcdefcdef", 'f');
 917     test("\U00010143\u0100\U00010143hello", 'o', 10);
 918     test("\U00010143\u0100\U00010143hello", 'l', 9);
 919     test("\U00010143\u0100\U00010143hello", 'l', 8);
 920     test("\U00010143\u0100\U00010143hello", 'e', 7);
 921     test("\U00010143\u0100\U00010143hello", 'h', 6);
 922     test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
 923     test("\U00010143\u0100\U00010143hello", '\u0100', 3);
 924     test("\U00010143\u0100\U00010143hello", '\U00010143', 2);
 925
 926     foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
 927     {
 928         enum str = to!S("hello world");
 929         static assert(isSafe!(() => strideBack(str, 0)));
 930         static assert(isSafe!(() => strideBack(str)   ));
 931         static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
 932         static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
 933     }
 934     });
 935 }
 936
 937 /// Ditto
 938 uint strideBack(S)(auto ref S str, size_t index)
 939 if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar))
 940 {
 941     static if (is(typeof(str.length) : ulong))
 942         assert(index <= str.length, "Past the end of the UTF-32 sequence");
 943     assert(index > 0, "Not the end of the UTF-32 sequence");
 944     return 1;
 945 }
 946
 947 /// Ditto
 948 uint strideBack(S)(auto ref S str)
 949 if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar))
 950 {
 951     assert(!str.empty, "Empty UTF-32 sequence");
 952     return 1;
 953 }
 954
 955 ///
 956 @safe unittest
 957 {
 958     assert("a".strideBack == 1);
 959     assert("λ".strideBack == 2);
 960     assert("aλ".strideBack == 2);
 961     assert("aλ".strideBack(1) == 1);
 962     assert("𐐷".strideBack == 4);
 963 }
 964
 965 @system unittest
 966 {
 967     import core.exception : AssertError;
 968     import std.conv : to;
 969     import std.exception;
 970     import std.string : format;
 971     import std.traits : FunctionAttribute, functionAttributes, isSafe;
 972     static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
 973     {
 974         enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!dchar(c),
 975                 new AssertError(format("Unit test failure string: %s", s), __FILE__, line));
 976
 977         enforce(strideBack(RandomCU!dchar(s), i == size_t.max ? s.length : i) == codeLength!dchar(c),
 978                 new AssertError(format("Unit test failure range: %s", s), __FILE__, line));
 979
 980         auto refRandom = new RefRandomCU!dchar(s);
 981         immutable randLen = refRandom.length;
 982         enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!dchar(c),
 983                 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
 984         enforce(refRandom.length == randLen,
 985                 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));
 986
 987         if (i == size_t.max)
 988         {
 989             enforce(strideBack(s) == codeLength!dchar(c),
 990                     new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));
 991
 992             enforce(strideBack(BidirCU!dchar(s)) == codeLength!dchar(c),
 993                     new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));
 994
 995             auto refBidir = new RefBidirCU!dchar(s);
 996             immutable bidirLen = refBidir.length;
 997             enforce(strideBack(refBidir) == codeLength!dchar(c),
 998                     new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
 999             enforce(refBidir.length == bidirLen,
1000                     new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
1001         }
1002     }
1003
1004     assertCTFEable!(
1005     {
1006     test("a", 'a');
1007     test(" ", ' ');
1008     test("\u2029", '\u2029'); //paraSep
1009     test("\u0100", '\u0100');
1010     test("\u0430", '\u0430');
1011     test("\U00010143", '\U00010143');
1012     test("abcdefcdef", 'f');
1013     test("\U00010143\u0100\U00010143hello", 'o', 8);
1014     test("\U00010143\u0100\U00010143hello", 'l', 7);
1015     test("\U00010143\u0100\U00010143hello", 'l', 6);
1016     test("\U00010143\u0100\U00010143hello", 'e', 5);
1017     test("\U00010143\u0100\U00010143hello", 'h', 4);
1018     test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
1019     test("\U00010143\u0100\U00010143hello", '\u0100', 2);
1020     test("\U00010143\u0100\U00010143hello", '\U00010143', 1);
1021
1022     foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
1023     {
1024         enum str = to!S("hello world");
1025         static assert(isSafe!(() => strideBack(str, 0)));
1026         static assert(isSafe!(() => strideBack(str)   ));
1027         static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
1028         static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
1029     }
1030     });
1031 }
1032
1033
/++
    Converts a code unit index into a code point index.

    `index` must lie on a code point boundary of `str`. The result is the
    number of code points that precede it, i.e. how many code points into
    the string the code point starting at `index` is.

    Params:
        str = the string to walk
        index = code unit index at the start of a code point

    Returns:
        The number of code points in `str[0 .. index]`. For UTF-32 this is
        `index` itself.

    Throws:
        $(LREF UTFException) if walking `str` code point by code point steps
        over `index` instead of landing on it.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // One code unit per code point: both indices coincide.
        return index;
    }
    else
    {
        size_t codePoints;
        size_t pos;

        // Hop from code point to code point until `index` is reached or passed.
        while (pos < index)
        {
            pos += stride(str, pos);
            ++codePoints;
        }

        if (pos == index)
            return codePoints;

        // Overshot: `index` pointed into the middle of a sequence.
        static if (is(immutable C == immutable char))
            throw new UTFException("Invalid UTF-8 sequence", index);
        else
            throw new UTFException("Invalid UTF-16 sequence", index);
    }
}
1065
1066 ///
@safe unittest
{
    // UTF-8: multi-byte characters make the code point index lag behind.
    assert(toUCSindex(`hello world`, 7) == 7);
    assert(toUCSindex(`Ma Chérie`, 7) == 6);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);

    // UTF-16: each of these characters is a single code unit.
    assert(toUCSindex(`hello world`w, 7) == 7);
    assert(toUCSindex(`Ma Chérie`w, 7) == 7);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);

    // UTF-32: the index is returned as is.
    assert(toUCSindex(`hello world`d, 7) == 7);
    assert(toUCSindex(`Ma Chérie`d, 7) == 7);
    assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}
1081
1082
/++
    Converts a code point index into a code unit index.

    Params:
        str = the string to walk
        n = how many code points into `str` the wanted code point is

    Returns:
        The array index of the first code unit of that code point. For
        UTF-32 this is `n` itself.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // One code unit per code point: both indices coincide.
        return n;
    }
    else
    {
        // Skip `n` code points, accumulating their widths in code units.
        size_t unitIndex;
        foreach (_; 0 .. n)
            unitIndex += stride(str, unitIndex);
        return unitIndex;
    }
}
1105
1106 ///
@safe unittest
{
    // UTF-8: multi-byte characters push the code unit index ahead.
    assert(toUTFindex(`hello world`, 7) == 7);
    assert(toUTFindex(`Ma Chérie`, 6) == 7);
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9);

    // UTF-16: each of these characters is a single code unit.
    assert(toUTFindex(`hello world`w, 7) == 7);
    assert(toUTFindex(`Ma Chérie`w, 7) == 7);
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);

    // UTF-32: the index is returned as is.
    assert(toUTFindex(`hello world`d, 7) == 7);
    assert(toUTFindex(`Ma Chérie`d, 7) == 7);
    assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}
1121
1122
1123 /* =================== Decode ======================= */
1124
/// Whether or not to replace invalid UTF with $(LREF replacementDchar)
/// (the decoding functions in this module throw instead when given `No.useReplacementDchar`).
alias UseReplacementDchar = Flag!"useReplacementDchar";
1127
/++
    Decodes the code point that starts at `str[index]` and returns it.
    On success `index` is moved to one past the decoded code point. If the
    code point is not well-formed, then a `UTFException` is thrown and
    `index` remains unchanged.

    `decode` accepts strings as well as random access ranges of code units
    that provide length and slicing. $(LREF decodeFront) is the alternative
    for arbitrary input ranges of code units.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        index = starting index into `str`; incremented by number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str[index]` is not the start of a valid UTF
        sequence and useReplacementDchar is `No.useReplacementDchar`
  +/
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index)
if (!isSomeString!S &&
    isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S))
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Anything at or above the limit needs the full decoder.
    if (str[index] >= codeUnitLimit!S)
        return decodeImpl!(true, useReplacementDchar)(str, index);

    // Fast path: the code unit is a complete code point by itself.
    return str[index++];
}
1168
1169 /// ditto
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
auto ref S str, ref size_t index) @trusted pure
if (isSomeString!S)
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[index] >= codeUnitLimit!S)
    {
        // Multi-unit or invalid sequence: hand a const view of the array to
        // the full decoder, whatever the qualifiers of S are.
        static if (is(immutable S == immutable C[], C))
            return decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index);
    }
    else
    {
        // Fast path: the code unit is a complete code point by itself.
        return str[index++];
    }
}
1188
1189 ///
@safe pure unittest
{
    size_t pos;

    // ASCII occupies one UTF-8 code unit.
    assert(decode("a", pos) == 'a');
    assert(pos == 1);

    // å takes two code units in UTF-8 ...
    pos = 0;
    assert(decode("å", pos) == 'å');
    assert(pos == 2);

    // ... also when decoding starts in the middle of the string ...
    pos = 1;
    assert(decode("aå", pos) == 'å');
    assert(pos == 3);

    // ... but only one in UTF-16.
    pos = 0;
    assert(decode("å"w, pos) == 'å');
    assert(pos == 1);

    // ë as a multi-code point grapheme
    pos = 0;
    assert(decode("e\u0308", pos) == 'e');
    assert(pos == 1);

    // ë as a single code point grapheme
    pos = 0;
    assert(decode("ë", pos) == 'ë');
    assert(pos == 2);

    pos = 0;
    assert(decode("ë"w, pos) == 'ë');
    assert(pos == 1);
}
1211
/++
    `decodeFront` is the variant of $(LREF decode) that decodes the first
    code point of `str`. In contrast to $(LREF decode) it works with any
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
    of code units, not only with strings and random access ranges. The range
    is taken by `ref` and the decoded code units are popped off its front.
    When `numCodeUnits` is passed in, it receives the number of code units
    that made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or input range of code units
        numCodeUnits = set to number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str.front` is not the start of a valid UTF
        sequence. If an exception is thrown, then there is no guarantee as to
        the number of code units which were popped off, as it depends on the
        type of range being used and how many code units had to be popped off
        before the code point was determined to be invalid.
  +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable first = str.front;

    if (first >= codeUnitLimit!S)
    {
        // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
        // computed here rather than inside decodeImpl, which is undesirable,
        // since not all overloads of decodeImpl need it. It should move back
        // into decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521
        // has been fixed.
        enum canIndex = is(S : const char[]) || isRandomAccessRange!S && hasSlicing!S && hasLength!S;
        immutable decoded = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits);

        // Sliceable ranges were only indexed by decodeImpl, so drop the
        // consumed units here; the other range types were already popped.
        static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
            str = str[numCodeUnits .. str.length];

        return decoded;
    }

    // Fast path: the front code unit is a complete code point by itself.
    str.popFront();
    numCodeUnits = 1;
    return first;
}
1274
1275 /// ditto
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits) @trusted pure
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[0] >= codeUnitLimit!S)
    {
        // Multi-unit or invalid sequence: decode through a const view of the
        // array, then slice off however many code units were consumed.
        static if (is(immutable S == immutable C[], C))
        {
            immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, numCodeUnits);
            str = str[numCodeUnits .. $];
            return decoded;
        }
    }
    else
    {
        // Fast path: the first code unit is a complete code point by itself.
        numCodeUnits = 1;
        immutable first = str[0];
        str = str[1 .. $];
        return first;
    }
}
1303
1304 /++ Ditto +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // Convenience form for callers that do not need the code unit count.
    size_t discarded;
    return decodeFront!useReplacementDchar(str, discarded);
}
1311
1312 ///
@safe pure unittest
{
    import std.range.primitives;

    // The decoded code point is removed from the front of the string.
    string text = "Hello, World!";
    assert(text.decodeFront == 'H');
    assert(text == "ello, World!");

    // A two-unit code point is consumed as a whole.
    text = "å";
    assert(text.decodeFront == 'å');
    assert(text.empty);

    // The second argument reports how many code units were consumed.
    text = "å";
    size_t consumed;
    assert(text.decodeFront(consumed) == 'å');
    assert(consumed == 2);
    assert(text.empty);
}
1325
/++
    `decodeBack` is the variant of $(LREF decode) that decodes the last
    code point of `str`. In contrast to $(LREF decode) it works with any
    bidirectional range of code units, not only with strings and random
    access ranges. The range is taken by `ref` and the decoded code units are
    popped off its back. When `numCodeUnits` is passed in, it receives the
    number of code units that made up the decoded code point.

    Params:
        useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
        str = input string or bidirectional Range
        numCodeUnits = gives the number of code units processed

    Returns:
        A decoded UTF character.

    Throws:
        $(LREF UTFException) if `str.back` is not the end of a valid UTF
        sequence. If an exception is thrown, the `str` itself remains unchanged,
        but there is no guarantee as to the value of `numCodeUnits` (when passed).
  +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[$ - 1] >= codeUnitLimit!S)
    {
        static if (is(immutable S == immutable C[], C))
        {
            // Find where the last code point starts, decode forwards from
            // there, and only then shorten the string.
            numCodeUnits = strideBack(str);
            immutable keptLength = str.length - numCodeUnits;
            size_t start = keptLength;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, start);
            str = str[0 .. keptLength];
            return decoded;
        }
    }
    else
    {
        // Fast path: the last code unit is a complete code point by itself.
        numCodeUnits = 1;
        immutable last = str[$ - 1];
        str = str[0 .. $ - 1];
        return last;
    }
}
1377
1378 /++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S
    && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Fast path: the last code unit is a complete code point by itself.
    if (str.back < codeUnitLimit!S)
    {
        numCodeUnits = 1;
        immutable last = str.back;
        str.popBack();
        return last;
    }

    numCodeUnits = strideBack(str);

    static if (isRandomAccessRange!S)
    {
        // Decode in place from the start of the last code point, then pop
        // its code units off the back.
        size_t start = str.length - numCodeUnits;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(str, start);
        str.popBackExactly(numCodeUnits);
        return decoded;
    }
    else
    {
        // No indexing available: copy the trailing code units, in order,
        // out of a saved copy of the range and decode that small buffer.
        alias Char = Unqual!(ElementType!S);
        Char[4] buffer;
        S remainder = str.save;
        foreach_reverse (slot; 0 .. numCodeUnits)
        {
            buffer[slot] = remainder.back;
            remainder.popBack();
        }
        const Char[] codePoint = buffer[0 .. numCodeUnits];
        size_t start = 0;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(codePoint, start);

        // Commit the shortened range only after decoding went through.
        str = remainder;
        return decoded;
    }
}
1428
1429 /++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isSomeString!S
    || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S))
    || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S)))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Convenience form for callers that do not need the code unit count.
    size_t discarded;
    return decodeBack!useReplacementDchar(str, discarded);
}
1447
1448 ///
@system pure unittest
{
    import std.range.primitives;

    // The decoded code point is removed from the back of the string.
    string text = "Hello, World!";
    assert(text.decodeBack == '!');
    assert(text == "Hello, World");

    // A two-unit code point is consumed as a whole.
    text = "å";
    assert(text.decodeBack == 'å');
    assert(text.empty);

    // The second argument reports how many code units were consumed.
    text = "å";
    size_t consumed;
    assert(text.decodeBack(consumed) == 'å');
    assert(consumed == 2);
    assert(text.empty);
}
1461
// Exclusive upper bound of the "fast path" code units for the range type S:
// any code unit below this value is guaranteed to be a valid code point
// encoded in a single code unit.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    private alias Unit = immutable(ElementEncodingType!S);

    static if (is(Unit == immutable char))
    {
        // UTF-8: values from 0x80 upwards belong to multi-byte sequences.
        enum char codeUnitLimit = 0x80;
    }
    else static if (is(Unit == immutable wchar))
    {
        // UTF-16: the surrogate range starts at 0xD800.
        enum wchar codeUnitLimit = 0xD800;
    }
    else
    {
        // UTF-32: surrogate values starting at 0xD800 are not valid code points.
        enum dchar codeUnitLimit = 0xD800;
    }
}
1474
/*
 * UTF-8 decoder behind decode, decodeFront and decodeBack. Callers handle
 * single-unit code points themselves and only call this for code units at
 * or above codeUnitLimit.
 *
 * For strings, this function does its own bounds checking to give a
 * more useful error message when attempting to decode past the end of a string.
 * Subsequently it uses a pointer instead of an array to avoid
 * redundant bounds checking.
 *
 * The three overloads of this operate on chars, wchars, and dchars.
 *
 * Params:
 *      canIndex = if S is indexable
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into str; incremented by number of code units processed
 *
 * Returns:
 *      decoded character, or replacementDchar for an invalid sequence when
 *      useReplacementDchar is set
 *
 * Throws:
 *      UTFException for an invalid or truncated sequence when
 *      useReplacementDchar is not set. index is never modified on a path
 *      that throws (non-indexable ranges have still been popped).
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (
    is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char)))
{
    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /* Dchar bitmask for different numbers of UTF-8 code units:
     * bitMask[k] keeps the payload bits of a (k + 1)-unit sequence.
     */
    alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    // pstr is a view of the input that begins at the code point to decode:
    // a raw pointer for arrays, a slice for sliceable ranges, and the range
    // itself otherwise.
    static if (is(S : const char[]))
        auto pstr = str.ptr + index;    // this is what makes decodeImpl() @system code
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    // Fetch the lead byte. Indexable input also records how many code units
    // remain from index on; other ranges are consumed as they are read.
    static if (canIndex)
    {
        immutable length = str.length - index;
        ubyte fst = pstr[0];
    }
    else
    {
        ubyte fst = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        static if (canIndex)
        {
            // Builds a UTFException carrying the offending code units: the
            // first unit plus the continuation bytes (10xxxxxx) that follow
            // it, at most four units in total.
            static UTFException exception(S)(S str, string msg)
            {
                uint[4] sequence = void;
                size_t i;

                do
                {
                    sequence[i] = str[i];
                } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);

                return new UTFException(msg, i).setSequence(sequence[0 .. i]);
            }
        }

        UTFException invalidUTF()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Invalid UTF-8 sequence");
            else
            {
                //We can't include the invalid sequence with input strings without
                //saving each of the code units along the way, and we can't do it with
                //forward ranges without saving the entire range. Both would incur a
                //cost for the decoding of every character just to provide a better
                //error message for the (hopefully) rare case when an invalid UTF-8
                //sequence is encountered, so we don't bother trying to include the
                //invalid sequence here, unlike with strings and sliceable ranges.
               return new UTFException("Invalid UTF-8 sequence");
            }
        }

        UTFException outOfBounds()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Attempted to decode past the end of a string");
            else
               return new UTFException("Attempted to decode past the end of a string");
        }
    }

    if ((fst & 0b1100_0000) != 0b1100_0000)
    {
        static if (useReplacementDchar)
        {
            ++index;            // always consume bad input to avoid infinite loops
            return replacementDchar;
        }
        else
            throw invalidUTF(); // starter must have at least 2 first bits set
    }
    ubyte tmp = void;
    dchar d = fst; // upper control bits are masked out later
    // fst is shifted left once per code unit read, so bit 7 always tells
    // whether the lead byte announces yet another continuation byte.
    fst <<= 1;

    // Unrolled at compile time; i is the position of the continuation byte
    // being read, so a sequence that ends at i is i + 1 code units long.
    foreach (i; AliasSeq!(1, 2, 3))
    {

        // The input must not end in the middle of the sequence.
        static if (canIndex)
        {
            if (i == length)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }
        else
        {
            if (pstr.empty)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }

        static if (canIndex)
            tmp = pstr[i];
        else
        {
            tmp = pstr.front;
            pstr.popFront();
        }

        // Continuation bytes must look like 10xxxxxx.
        if ((tmp & 0xC0) != 0x80)
        {
            static if (useReplacementDchar)
            {
                index += i + 1;
                return replacementDchar;
            }
            else
                throw invalidUTF();
        }

        d = (d << 6) | (tmp & 0x3F);
        fst <<= 1;

        if (!(fst & 0x80)) // no more bytes
        {
            d &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            if ((d & ~bitMask[i - 1]) == 0)
            {
                static if (useReplacementDchar)
                {
                    index += i + 1;
                    return replacementDchar;
                }
                else
                    throw invalidUTF();
            }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                if (!isValidDchar(d))
                {
                    static if (useReplacementDchar)
                    {
                        index += i + 1;
                        return replacementDchar;
                    }
                    else
                        throw invalidUTF();
                }
            }

            // Four-unit sequences can encode values above dchar.max. The
            // check has to come before index is advanced so that index is
            // still unchanged when the exception is thrown, as on every
            // other error path.
            static if (i == 3)
            {
                if (d > dchar.max)
                {
                    static if (useReplacementDchar)
                    {
                        index += i + 1;
                        return replacementDchar;
                    }
                    else
                        throw invalidUTF();
                }
            }

            index += i + 1;
            return d;
        }
    }

    // Still expecting continuation bytes after four code units: the lead
    // byte announced one of the 5 or 6 byte forms, which are not valid.
    static if (useReplacementDchar)
    {
        index += 4;             // read 4 chars by now
        return replacementDchar;
    }
    else
        throw invalidUTF();
}
1697
@safe pure @nogc nothrow
unittest
{
    // Add tests for useReplacementDchar == yes path

    // Minimal non-indexable input range of chars over a string.
    static struct R
    {
      @safe pure @nogc nothrow:
        string text;
        size_t pos;

        this(string s) { text = s; }

        @property bool empty() { return pos == text.length; }
        @property char front() { return text[pos]; }
        void popFront() { ++pos; }
    }

    // Every invalid string must decode to replacementDchar while consuming
    // at least one code unit and never more than the string holds.
    foreach (s; invalidUTFstrings!char())
    {
        auto input = R(s);
        size_t consumed;
        dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= s.length);
    }
}
1723
// UTF-16 flavour of decodeImpl: decodes the code point starting at `index`
// and advances `index` past the code units that were used. Callers must have
// handled every code unit below 0xD800 themselves (see the assert below).
// With Yes.useReplacementDchar malformed input yields replacementDchar,
// otherwise a UTFException is thrown and `index` is left unchanged.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar)))
{
    // `pstr` is a view of the input whose first element sits at `index`
    // (for plain input ranges it is the range itself).
    static if (is(S : const wchar[]))
        auto pstr = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        immutable unitsLeft = str.length - index;
        uint lead = pstr[0];
    }
    else
    {
        uint lead = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Builds the exception for malformed input; the offending code unit
        // is attached only when the input can be indexed.
        UTFException malformed(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(pstr[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(lead >= 0xD800);

    if (lead > 0xDBFF)
    {
        // Not a high surrogate: either a stray low surrogate or an ordinary
        // code unit that stands for itself.
        if (lead <= 0xDFFF)
        {
            static if (useReplacementDchar)
                lead = replacementDchar;
            else
                throw malformed("unpaired surrogate UTF-16 value");
        }
        ++index;

        // Note: u+FFFE and u+FFFF are specifically permitted by the
        // Unicode standard for application internal use (see isValidDchar)
        return cast(dchar) lead;
    }

    // From here on `lead` is treated as a high surrogate and needs a partner.
    static if (canIndex)
        immutable noTrailUnit = unitsLeft == 1;
    else
        immutable noTrailUnit = pstr.empty;

    if (noTrailUnit)
    {
        static if (useReplacementDchar)
        {
            ++index;
            return replacementDchar;
        }
        else
            throw malformed("surrogate UTF-16 high value past end of string");
    }

    static if (canIndex)
        immutable uint trail = pstr[1];
    else
    {
        immutable uint trail = pstr.front;
        pstr.popFront();
    }

    if (0xDC00 <= trail && trail <= 0xDFFF)
        lead = ((lead - 0xD7C0) << 10) + (trail - 0xDC00);
    else
    {
        static if (useReplacementDchar)
            lead = replacementDchar;
        else
            throw malformed("surrogate UTF-16 low value out of range");
    }

    // Both code units count as consumed, even when the second one was bad.
    index += 2;
    return cast(dchar) lead;
}
1815
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-16 decodeImpl
    // through a bare input range (canIndex == false).

    static struct CodeUnits
    {
      @safe pure @nogc nothrow:
        wstring data;
        size_t pos;

        this(wstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property wchar front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (malformed; invalidUTFstrings!wchar())
    {
        auto input = CodeUnits(malformed);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);
        assert(decoded == replacementDchar);
        // at least one code unit is reported, never more than the input holds
        assert(consumed >= 1 && consumed <= malformed.length);
    }
}
1841
// UTF-32 flavour of decodeImpl: reads one code unit, validates it with
// isValidDchar and advances `index` by one. Ranges that cannot be indexed
// are additionally popped. With Yes.useReplacementDchar an invalid value is
// replaced by replacementDchar, otherwise a UTFException is thrown before
// `index` or the range are touched.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    static if (is(S : const dchar[]))
        auto pstr = str.ptr;
    else
        alias pstr = str;

    enum indexable = is(S : const dchar[]) || isRandomAccessRange!S;

    static if (indexable)
        dchar dc = pstr[index];
    else
        dchar dc = pstr.front;

    if (!isValidDchar(dc))
    {
        static if (useReplacementDchar)
            dc = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(dc);
    }

    ++index;
    static if (!indexable)
        pstr.popFront();
    return dc;
}
1879
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-32 decodeImpl
    // through a bare input range.

    static struct CodeUnits
    {
      @safe pure @nogc nothrow:
        dstring data;
        size_t pos;

        this(dstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property dchar front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (malformed; invalidUTFstrings!dchar())
    {
        auto input = CodeUnits(malformed);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);
        assert(decoded == replacementDchar);
        // at least one code unit is reported, never more than the input holds
        assert(consumed >= 1 && consumed <= malformed.length);
    }
}
1905
1906
// Test helper: checks that `decode(range, index)` yields `expectedChar`,
// moves `index` to `expectedIndex` and leaves the range length alone.
// Only random-access ranges that are not narrow strings are checked; for
// every other range type the helper does nothing.
// Failures are reported as AssertError at the caller's `line`.
version (StdUnittest) private void testDecode(R)(R range,
                                             size_t index,
                                             dchar expectedChar,
                                             size_t expectedIndex,
                                             size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;
    import std.traits : isNarrowString;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R && !isNarrowString!R)
    {
        immutable decoded = decode(range, index);

        if (decoded != expectedChar)
            throw new AssertError(format("decode: Wrong character: %s", decoded), __FILE__, line);

        if (index != expectedIndex)
            throw new AssertError(format("decode: Wrong index: %s", index), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != lenBefore)
                throw new AssertError(format("decode: length changed: %s", range.length), __FILE__, line);
        }
    }
}
1937
// Test helper: checks that `decodeFront(range, n)` yields `expectedChar`,
// reports `expectedNumCodeUnits` code units and, for ranges with a length,
// shortens the range by exactly that amount.
// Failures are reported as AssertError at the caller's `line`.
version (StdUnittest) private void testDecodeFront(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    size_t numCodeUnits;
    immutable decoded = decodeFront(range, numCodeUnits);

    if (decoded != expectedChar)
        throw new AssertError(format("decodeFront: Wrong character: %s", decoded), __FILE__, line);

    if (numCodeUnits != expectedNumCodeUnits)
        throw new AssertError(format("decodeFront: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line);

    static if (hasLength!R)
    {
        if (range.length != lenBefore - numCodeUnits)
            throw new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line);
    }
}
1963
// Test helper: checks that `decodeBack(range, n)` yields `expectedChar`,
// reports `expectedNumCodeUnits` code units and, for ranges with a length,
// shortens the range by exactly that amount.
// Failures are reported as AssertError at the caller's `line`.
version (StdUnittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    // Non-bidirectional ranges are accepted and ignored so that all `decode`
    // functions can be unit tested together.
    static if (isBidirectionalRange!R)
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        size_t numCodeUnits;
        immutable decoded = decodeBack(range, numCodeUnits);

        if (decoded != expectedChar)
            throw new AssertError(format("decodeBack: Wrong character: %s", decoded), __FILE__, line);

        if (numCodeUnits != expectedNumCodeUnits)
            throw new AssertError(format("decodeBack: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != lenBefore - numCodeUnits)
                throw new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line);
        }
    }
}
1995
// Test helper: runs testDecode (from index 0), testDecodeBack (bidirectional
// ranges only) and testDecodeFront against the same input, expecting
// `expectedChar` and `expectedIndex` code units from each of them.
version (StdUnittest) private void testAllDecode(R)(R range,
                                                dchar expectedChar,
                                                size_t expectedIndex,
                                                size_t line = __LINE__)
{
    testDecode(range, 0, expectedChar, expectedIndex, line);

    static if (isBidirectionalRange!R)
    {
        // testDecodeBack takes its range by ref, so hand it a saved copy
        auto backward = range.save;
        testDecodeBack(backward, expectedChar, expectedIndex, line);
    }

    testDecodeFront(range, expectedChar, expectedIndex, line);
}
2009
// Test helper for malformed input: `decode(range, index)` must throw a
// UTFException without moving `index` or changing the range length (checked
// for random-access ranges only). When `index` is 0, `decodeFront` must
// throw as well.
// Failures are reported as AssertError at the caller's `line`.
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a "%s" for range.length: without it
            // format() rejects the unused argument with a FormatException
            // and the intended AssertError is never produced.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
2036
// Test helper for malformed input: `decodeBack(range)` must throw a
// UTFException and leave the range length unchanged (checked for
// random-access ranges only).
// Failures are reported as AssertError at the caller's `line`.
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The format string needs a "%s" for range.length: without
                // it format() rejects the unused argument with a
                // FormatException and the intended AssertError is never
                // produced.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
2062
@system unittest
{
    import std.conv : to;
    import std.exception;

    // UTF-8 decoding, checked both at run time and in CTFE for every kind of
    // code-unit range listed below.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                          (string s) => new RefBidirCU!char(s),
                          (string s) => new RefRandomCU!char(s)))
    {
        enum sHasLength = hasLength!(typeof(S("abcd")));

        // forward decoding, one code unit per code point
        {
            auto ascii = S("abcd");
            testDecode(ascii, 0, 'a', 1);
            testDecode(ascii, 1, 'b', 2);
            testDecodeFront(ascii, 'a', 1);
            testDecodeFront(ascii, 'b', 1);
            foreach (expected; "cd"d)
                assert(decodeFront(ascii) == expected);
        }

        // forward decoding, three code units per code point
        {
            auto kana = S("ウェブサイト");
            testDecode(kana, 0, 'ウ', 3);
            testDecode(kana, 3, 'ェ', 6);
            testDecodeFront(kana, 'ウ', 3);
            testDecodeFront(kana, 'ェ', 3);
            foreach (expected; "ブサ"d)
                assert(decodeFront(kana) == expected);
        }

        // backward decoding, one code unit per code point
        {
            auto ascii = S("abcd");
            foreach (expected; "dcba"d)
                testDecodeBack(ascii, expected, 1);
        }

        // backward decoding, three code units per code point
        {
            auto kana = S("ウェブサイト");
            foreach (expected; "トイサブ"d)
                testDecodeBack(kana, expected, 3);
        }

        testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
        testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);

        foreach (malformed; ["\xE2\x89", // too short
                             "\xC0\x8A",
                             "\xE0\x80\x8A",
                             "\xF0\x80\x80\x8A",
                             "\xF8\x80\x80\x80\x8A",
                             "\xFC\x80\x80\x80\x80\x8A"])
        {
            testBadDecode(S(malformed), 0);
            testBadDecode(S(malformed), 1);
            testBadDecodeBack(S(malformed));
        }

        // U+FFFE and U+FFFF are accepted: their three-byte encodings must
        // decode successfully.
        testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
        testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

        // Three-byte sequences that encode surrogate code points
        // (U+D800 .. U+DFFF) must be rejected.
        foreach (surrogate; ["\xED\xA0\x80",
                             "\xED\xAD\xBF",
                             "\xED\xAE\x80",
                             "\xED\xAF\xBF",
                             "\xED\xB0\x80",
                             "\xED\xBE\x80",
                             "\xED\xBF\xBF"])
        {
            testBadDecode(S(surrogate), 0);
            testBadDecodeBack(S(surrogate));
        }
    }
    });
}
2146
@system unittest
{
    import std.exception;

    // UTF-16 decoding, checked both at run time and in CTFE.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar,
                          (wstring s) => new RefBidirCU!wchar(s),
                          (wstring s) => new RefRandomCU!wchar(s)))
    {
        // a single code unit
        testAllDecode(S([cast(wchar) 0x1111]), '\u1111', 1);
        // surrogate pairs at both ends of the supplementary range
        testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), '\U00010000', 2);
        testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), '\U0010FFFF', 2);
        // U+FFFE and U+FFFF are accepted
        testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // high surrogate without a valid low surrogate behind it
        testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        // same, seen from the back
        testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        // forward decoding of BMP text
        {
            auto kana = S("ウェブサイト");
            testDecode(kana, 0, 'ウ', 1);
            testDecode(kana, 1, 'ェ', 2);
            testDecodeFront(kana, 'ウ', 1);
            testDecodeFront(kana, 'ェ', 1);
            foreach (expected; "ブサ"d)
                assert(decodeFront(kana) == expected);
        }

        // backward decoding of BMP text
        {
            auto kana = S("ウェブサイト");
            foreach (expected; "トイサブ"d)
                testDecodeBack(kana, expected, 1);
        }
    }

    // mixed single units and surrogate pairs, addressed by index and from the back
    foreach (S; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        auto mixed = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                        cast(wchar) 0x1400,
                        cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
        testDecode(mixed, 0, '\U00010000', 2);
        testDecode(mixed, 2, '\u1400', 3);
        testDecode(mixed, 3, '\U000B9DDE', 5);
        testDecodeBack(mixed, '\U000B9DDE', 2);
        testDecodeBack(mixed, '\u1400', 1);
        testDecodeBack(mixed, '\U00010000', 2);
    }
    });
}
2201
@system unittest
{
    import std.exception;

    // UTF-32 decoding, checked both at run time and in CTFE.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar,
                          (dstring s) => new RefBidirCU!dchar(s),
                          (dstring s) => new RefRandomCU!dchar(s)))
    {
        // every valid code point is exactly one code unit
        testAllDecode(S([cast(dchar) 0x1111]), '\u1111', 1);
        testAllDecode(S([cast(dchar) 0x10000]), '\U00010000', 1);
        testAllDecode(S([cast(dchar) 0x10FFFF]), '\U0010FFFF', 1);
        // U+FFFE and U+FFFF are accepted
        testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // surrogates and values above U+10FFFF are rejected
        testBadDecode(S([cast(dchar) 0xD800]), 0);
        testBadDecode(S([cast(dchar) 0xDFFE]), 0);
        testBadDecode(S([cast(dchar) 0x110000]), 0);

        // same, seen from the back
        testBadDecodeBack(S([cast(dchar) 0xD800]));
        testBadDecodeBack(S([cast(dchar) 0xDFFE]));
        testBadDecodeBack(S([cast(dchar) 0x110000]));

        // forward decoding
        {
            auto kana = S("ウェブサイト");
            testDecode(kana, 0, 'ウ', 1);
            testDecode(kana, 1, 'ェ', 2);
            testDecodeFront(kana, 'ウ', 1);
            testDecodeFront(kana, 'ェ', 1);
            foreach (expected; "ブサ"d)
                assert(decodeFront(kana) == expected);
        }

        // backward decoding
        {
            auto kana = S("ウェブサイト");
            foreach (expected; "トイサブ"d)
                testDecodeBack(kana, expected, 1);
        }
    }

    // addressed by index and from the back
    foreach (S; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto mixed = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
        testDecode(mixed, 0, '\U00010000', 1);
        testDecode(mixed, 1, '\u1400', 2);
        testDecode(mixed, 2, '\U000B9DDE', 3);
        testDecodeBack(mixed, '\U000B9DDE', 1);
        testDecodeBack(mixed, '\u1400', 1);
        testDecodeBack(mixed, '\U00010000', 1);
    }
    });
}
2256
@safe unittest
{
    import std.exception;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Attribute checks for the decoding functions on every built-in string type.
    assertCTFEable!(
    {
    foreach (Str; AliasSeq!( char[], const( char)[],  string,
                            wchar[], const(wchar)[], wstring,
                            dchar[], const(dchar)[], dstring))
    {
        enum pureBit = FunctionAttribute.pure_;

        // @safe: decode and both decodeFront overloads
        static assert(isSafe!({ Str s; size_t idx = 0; decode(s, idx); }));
        static assert(isSafe!({ Str s; size_t idx = 0; decodeFront(s, idx); }));
        static assert(isSafe!({ Str s; decodeFront(s); }));

        // pure: decode, decodeFront and decodeBack
        static assert((functionAttributes!({
            Str s; size_t idx = 0; decode(s, idx);
        }) & pureBit) == pureBit);
        static assert((functionAttributes!({
            Str s; size_t idx = 0; decodeFront(s, idx);
        }) & pureBit) == pureBit);
        static assert((functionAttributes!({
            Str s; decodeFront(s);
        }) & pureBit) == pureBit);
        static assert((functionAttributes!({
            Str s; size_t idx = 0; decodeBack(s, idx);
        }) & pureBit) == pureBit);
        static assert((functionAttributes!({
            Str s; decodeBack(s);
        }) & pureBit) == pureBit);
    }
    });
}
2282
@safe unittest
{
    import std.exception;

    // Lead byte 0b1111_0111 followed by three 0b1011_1111 continuation
    // bytes: decode must reject the sequence.
    char[4] val = "\xF7\xBF\xBF\xBF";
    size_t i = 0;
    assertThrown!UTFException(decode(val[], i));
}
2294 /* =================== Encode ======================= */
2295
// Shared error path of the encode overloads: with No.useReplacementDchar it
// throws a UTFException carrying `msg` and the offending `c`; with
// Yes.useReplacementDchar it returns replacementDchar for the caller to
// encode instead.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
        throw new UTFException(msg).setSequence(c);
    else
        return replacementDchar;
}
2303
/++
    Encodes the code point `c` into the fixed-size buffer `buf` and returns
    how many code units were written: between `1` and `4` for a `char[4]`
    buffer, `1` or `2` for a `wchar[2]` buffer and always `1` for a
    `dchar[1]` buffer.

    Params:
        useReplacementDchar = if `Yes.useReplacementDchar`, an invalid `c` is
            encoded as `replacementDchar` instead of raising an exception
        buf = the buffer that receives the encoded code units
        c = the code point to encode

    Returns:
        The number of code units of `buf` that were written.

    Throws:
        `UTFException` if `c` is not a valid UTF code point and
        `useReplacementDchar` is `No.useReplacementDchar`.
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // U+0000 .. U+007F: one code unit
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }

    // U+0080 .. U+07FF: two code units
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    // U+10000 .. U+10FFFF: four code units
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        return 4;
    }

    // What remains is either U+0800 .. U+FFFF or a value above U+10FFFF.
    // Invalid values throw or turn into the replacement character, which is
    // then written with the three-unit form below.
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
    }

    buf[0] = cast(char)(0xE0 | (c >> 12));
    buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[2] = cast(char)(0x80 | (c & 0x3F));
    return 3;
}
2355
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    char[4] buf;

    // one code unit
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1);
    assert(buf[0 .. 1] == "\u007F");

    // two code units
    assert(encode(buf, '\u0080') == 2);
    assert(buf[0 .. 2] == "\u0080");

    // three code units
    assert(encode(buf, '\uE000') == 3);
    assert(buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3);
    assert(buf[0 .. 3] == "\xEF\xBF\xBE");

    // values above U+10FFFF are rejected ...
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // ... unless the replacement character is requested instead
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto encoded = buf[];
    assert(encoded.decodeFront == replacementDchar);
}
2375
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    wchar[2] buf;

    // one code unit
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0 .. 1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1);
    assert(buf[0 .. 1] == "\uE000");

    // two code units (surrogate pair)
    assert(encode(buf, '\U00010000') == 2);
    assert(buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2);
    assert(buf[0 .. 2] == "\U0010FFFF");

    // surrogate code points are rejected
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    // invalid input encoded as the replacement character on request
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto encoded = buf[];
    assert(encoded.decodeFront == replacementDchar);
}
2395
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    dchar[1] buf;

    // every valid code point is stored as a single code unit
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0] == '\u0000');
    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0] == '\uD7FF');
    assert(encode(buf, '\uE000') == 1);
    assert(buf[0] == '\uE000');
    assert(encode(buf, '\U0010FFFF') == 1);
    assert(buf[0] == '\U0010FFFF');

    // surrogate code points are rejected
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    // invalid input stored as the replacement character on request
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[0] == replacementDchar);
}
2413
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[4] buf;

    // Code points at the edges of each encoded length, paired by position
    // with the UTF-8 they must produce.
    immutable dchar[] codePoints =
        [cast(dchar) '\u0000', '\u007F', '\u0080', '\u07FF', '\u0800', '\uD7FF', '\uE000',
         cast(dchar) 0xFFFE, cast(dchar) 0xFFFF, '\U00010000', '\U0010FFFF'];
    immutable string[] encodings =
        ["\u0000", "\u007F", "\u0080", "\u07FF", "\u0800", "\uD7FF", "\uE000",
         "\xEF\xBF\xBE", "\xEF\xBF\xBF", "\U00010000", "\U0010FFFF"];

    foreach (i, c; codePoints)
    {
        immutable written = encode(buf, c);
        assert(written == encodings[i].length);
        assert(buf[0 .. written] == encodings[i]);
    }

    // surrogates and values above U+10FFFF are rejected
    foreach (bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                   cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(buf, bad));

    // ... or replaced by U+FFFD on request
    immutable written = encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(written == buf.stride);
    enum replacementDcharString = "\uFFFD";
    assert(buf[0 .. replacementDcharString.length] == replacementDcharString);
    });
}
2444
2445
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    // U+10000 .. U+10FFFF: surrogate pair
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        immutable uint offset = c - 0x10000;
        buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return 2;
    }

    // What remains is either in the BMP or above U+10FFFF. Invalid values
    // throw or turn into the replacement character, which is then stored as
    // a single code unit.
    if (c > 0x10FFFF)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    buf[0] = cast(wchar) c;
    return 1;
}
2471
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[2] buf;

    // BMP code points are stored unchanged in a single code unit
    immutable dchar[] singleUnit =
        [cast(dchar) '\u0000', '\uD7FF', '\uE000', cast(dchar) 0xFFFE, cast(dchar) 0xFFFF];
    foreach (c; singleUnit)
    {
        assert(encode(buf, c) == 1);
        assert(buf[0] == c);
    }

    // supplementary code points become a surrogate pair
    assert(encode(buf, '\U00010000') == 2);
    assert(buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2);
    assert(buf[0 .. 2] == "\U0010FFFF");

    // surrogates and values above U+10FFFF are rejected
    foreach (bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                   cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(buf, bad));

    // ... or replaced by the replacement character on request
    immutable written = encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(written == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2497
2498
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    immutable isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable tooLarge = 0x10FFFF < c;

    // Invalid values throw or are swapped for the replacement character.
    if (isSurrogate || tooLarge)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    else
        assert(isValidDchar(c));

    buf[0] = c;
    return 1;
}
2510
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[1] buf;

    // valid code points are stored unchanged
    immutable dchar[] valid =
        [cast(dchar) '\u0000', '\uD7FF', '\uE000', cast(dchar) 0xFFFE, cast(dchar) 0xFFFF, '\U0010FFFF'];
    foreach (c; valid)
    {
        encode(buf, c);
        assert(buf[0] == c);
    }

    // surrogates and values above U+10FFFF are rejected
    foreach (bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                   cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(buf, bad));

    // ... or replaced by the replacement character on request
    immutable written = encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(written == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2535
2536
/++
    Encodes the code point `c` in the encoding of `str` and appends the
    resulting code units to `str`.

    Params:
        useReplacementDchar = if `Yes.useReplacementDchar`, an invalid `c` is
            appended as `replacementDchar` instead of raising an exception
        str = the array to append to
        c = the code point to encode

    Throws:
        `UTFException` if `c` is not a valid UTF code point and
        `useReplacementDchar` is `No.useReplacementDchar`.
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref char[] str, dchar c) @safe pure
{
    // U+0000 .. U+007F: appended as is
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    char[4] units;
    size_t count;

    if (c <= 0x7FF)
    {
        // U+0080 .. U+07FF: two code units
        assert(isValidDchar(c));
        units[0] = cast(char)(0xC0 | (c >> 6));
        units[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (0xFFFF < c && c <= 0x10FFFF)
    {
        // U+10000 .. U+10FFFF: four code units
        assert(isValidDchar(c));
        units[0] = cast(char)(0xF0 | (c >> 18));
        units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }
    else
    {
        // Either U+0800 .. U+FFFF or a value above U+10FFFF. Invalid values
        // throw (leaving `str` untouched) or turn into the replacement
        // character, which is written with the three-unit form.
        if (c > 0x10FFFF)
        {
            assert(!isValidDchar(c));
            c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
        }
        else
        {
            if (0xD800 <= c && c <= 0xDFFF)
                c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

            assert(isValidDchar(c));
        }

        units[0] = cast(char)(0xE0 | (c >> 12));
        units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }

    str ~= units[0 .. count];
}
2596
///
@safe unittest
{
    char[] s = "abcd".dup;
    dchar d1 = 'a';
    dchar d2 = 'ø';

    // An ASCII code point appends a single code unit.
    encode(s, d1);
    assert(s.length == 5);
    assert(s == "abcda");
    // 'ø' appends two code units: the length grows from 5 to 7.
    encode(s, d2);
    assert(s.length == 7);
    assert(s == "abcdaø");
}
2611
// Appends one-, two- and three-unit code points to a char[] and checks the
// resulting bytes, both at run time and in CTFE.
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    // U+00A9 is encoded as the two code units C2 A9.
    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");   // BUG: fix compiler

    // U+2260 is encoded as the three code units E2 89 A0.
    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
    });
}
2633
// Boundary code points for every UTF-8 sequence length. Each slice start
// below is the buffer length accumulated by the preceding appends, so the
// statement order matters.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[] buf;

    encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
    encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
    encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
    encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
    encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
    encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
    encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
    encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
    encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
    encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF are rejected.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With replacement enabled, the encoded form of U+FFFD is appended instead.
    enum replacementDcharString = "\uFFFD";
    enum rdcslen = replacementDcharString.length;
    assert(buf[$ - rdcslen .. $] != replacementDcharString);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[$ - rdcslen .. $] == replacementDcharString);
    });
}
2666
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref wchar[] str, dchar c) @safe pure
{
    // Supplementary planes: append a high/low surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));

        immutable uint offset = c - 0x10000;
        wchar[2] pair;
        pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        str ~= pair;
        return;
    }

    // Everything else ends up as a single code unit, possibly after
    // _utfException has substituted (or thrown for) an invalid value.
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    str ~= cast(wchar) c;
}
2700
// Appending to a wchar[]: BMP code points take one code unit, supplementary
// ones take two (hence the slice starts 5 and 7 below).
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
    encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF are rejected.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With replacement enabled, replacementDchar is appended instead.
    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}
2727
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref dchar[] str, dchar c) @safe pure
{
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable bool outOfRange = c > 0x10FFFF;

    // UTF-32 stores the code point as is, so validation is the only work here.
    if (isSurrogate || outOfRange)
    {
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    }
    else
    {
        assert(isValidDchar(c));
    }

    str ~= c;
}
2738
// Appending to a dchar[]: every valid code point occupies exactly one element.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
    encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');

    // Surrogates and values above U+10FFFF are rejected.
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
    assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // With replacement enabled, replacementDchar is appended instead.
    assert(buf.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf.back == replacementDchar);
    });
}
2764
2765
/++
    Computes how many code units of type `C` are needed to encode the code
    point `c`.

    Params:
        C = the character type used for the encoding
        c = the code point to measure

    Returns:
        The number of `C` code units that encode `c`. For `char`, a value
        above `0x10FFFF` trips an `assert(false)`.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        // UTF-32: always one code unit.
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: anything beyond the BMP needs a surrogate pair.
        if (c > 0xFFFF)
            return 2;
        return 1;
    }
    else
    {
        static assert(C.sizeof == 1);

        // UTF-8: test from the largest range downwards.
        if (c > 0x10FFFF)
            assert(false);
        if (c > 0xFFFF)
            return 4;
        if (c > 0x7FF)
            return 3;
        if (c > 0x7F)
            return 2;
        return 1;
    }
}
2791
///
@safe pure nothrow @nogc unittest
{
    // An ASCII character is one code unit in every encoding.
    assert(codeLength!char('a') == 1);
    assert(codeLength!wchar('a') == 1);
    assert(codeLength!dchar('a') == 1);

    // The highest code point needs 4, 2 and 1 code units respectively.
    assert(codeLength!char('\U0010FFFF') == 4);
    assert(codeLength!wchar('\U0010FFFF') == 2);
    assert(codeLength!dchar('\U0010FFFF') == 1);
}
2803
2804
/++
    Returns the number of code units that are required to encode `input`
    in a string whose character type is `C`. This is particularly useful
    when slicing one string with the length of another and the two string
    types use different character types.

    Params:
        C = the character type to get the encoding length for
        input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isInputRange!InputRange && !isInfinite!InputRange && isSomeChar!(ElementType!InputRange))
{
    alias EncType = Unqual!(ElementEncodingType!InputRange);

    // A string that already uses C as its code unit type needs no decoding:
    // its length is the answer.
    enum bool sameEncoding =
        isSomeString!InputRange && is(EncType == C) && is(typeof(input.length));

    static if (!sameEncoding)
    {
        size_t units = 0;

        // Walk the input code point by code point and add up each one's length in C.
        foreach (codePoint; input.byDchar)
            units += codeLength!C(codePoint);

        return units;
    }
    else
    {
        return input.length;
    }
}
2834
///
@safe unittest
{
    // The result matches the length of the same text as a literal of the target type.
    assert(codeLength!char("hello world") ==
           "hello world".length);
    assert(codeLength!wchar("hello world") ==
           "hello world"w.length);
    assert(codeLength!dchar("hello world") ==
           "hello world"d.length);

    assert(codeLength!char(`プログラミング`) ==
           `プログラミング`.length);
    assert(codeLength!wchar(`プログラミング`) ==
           `プログラミング`w.length);
    assert(codeLength!dchar(`プログラミング`) ==
           `プログラミング`d.length);

    // Slice a UTF-8 string by the length that a UTF-16 string has once encoded as UTF-8.
    string haystack = `Être sans la verité, ça, ce ne serait pas bien.`;
    wstring needle = `Être sans la verité`;
    assert(haystack[codeLength!char(needle) .. $] ==
           `, ça, ce ne serait pas bien.`);
}
2857
// Checks codeLength for every combination of source string type and target
// character type, including a non-string range produced by `filter`.
@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!( char[], const  char[],  string,
                          wchar[], const wchar[], wstring,
                          dchar[], const dchar[], dstring))
    {
        foreach (C; AliasSeq!(char, wchar, dchar))
        {
            assert(codeLength!C(to!S("Walter Bright")) == to!(C[])("Walter Bright").length);
            assert(codeLength!C(to!S(`言語`)) == to!(C[])(`言語`).length);
            assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`)) ==
                   to!(C[])(`ウェブサイト@La_Verité.com`).length);
            // The always-true filter keeps every element but hides the string type.
            assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)()) ==
                   to!(C[])(`ウェブサイト@La_Verité.com`).length);
        }
    }
    });
}
2882
/+
Internal helper function:

Returns true if it is safe to search for the code point `c` inside
code units of type `C`, without decoding.

This is a runtime check that is used as an optimization in various functions,
particularly in `std.string`.
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        // UTF-32: always searchable.
        return true;
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: BMP code points outside the surrogate range.
        return c < 0xD800 || (c >= 0xE000 && c < 0x1_0000);
    }
    else static if (C.sizeof == 1)
    {
        // UTF-8: ASCII only.
        return c < 0x80;
    }
    else
    {
        static assert(0);
    }
}
// Covers ASCII, Latin-1, BMP, surrogate and supplementary code points for
// each code unit type.
@safe unittest
{
    assert( canSearchInCodeUnits! char('a'));
    assert( canSearchInCodeUnits!wchar('a'));
    assert( canSearchInCodeUnits!dchar('a'));
    assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF
    assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
    assert( canSearchInCodeUnits!wchar('ö'));
    assert( canSearchInCodeUnits!dchar('ö'));
    assert(!canSearchInCodeUnits! char('日'));
    assert( canSearchInCodeUnits!wchar('日'));
    assert( canSearchInCodeUnits!dchar('日'));
    // 0xDA00 is a surrogate value: not searchable as a wchar, fine as a dchar.
    assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
    assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
    assert(!canSearchInCodeUnits! char('\U00010001'));
    assert(!canSearchInCodeUnits!wchar('\U00010001'));
    assert( canSearchInCodeUnits!dchar('\U00010001'));
}
2922
2923 /* =================== Validation ======================= */
2924
/++
    Verifies that `str` is well-formed Unicode by decoding it from start to
    end.

    Params:
        str = the string to check

    Throws:
        `UTFException` if `str` is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    size_t pos = 0;

    // decode receives the index and moves it past the sequence it consumed;
    // the decoded value itself is not needed.
    while (pos < str.length)
        decode(str, pos);
}
2940
///
@safe unittest
{
    import std.exception : assertThrown;
    // These three code units are not well-formed UTF-8.
    char[] a = [167, 133, 175];
    assertThrown!UTFException(validate(a));
}
2948
// https://issues.dlang.org/show_bug.cgi?id=12923
// Same malformed input as above, but passed as a slice of a static array.
@safe unittest
{
    import std.exception;
    assertThrown((){
        char[3]a=[167, 133, 175];
        validate(a[]);
    }());
}
2958
/**
 * Encodes the elements of `s` to UTF-8 and returns a newly GC allocated
 * `string` of the elements.
 *
 * The result is always a fresh allocation: when `s` already is a `string`,
 * it is duplicated rather than returned as is.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
string toUTF8(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    return toUTFImpl!string(s);
}
2975
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø is represented by two UTF-8 code units
    assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // 𐐷 is four code units in UTF-8
    assert("𐐷"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
2987
// Same expectations as the documented example, but fed through a class-based
// input range instead of a string.
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    alias RT = ReferenceInputRange!(ElementType!(string));
    auto r1 = new RT("Hellø");
    auto r2 = new RT("𐐷");

    assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
    assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
3000
/**
 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
 * `wstring` of the elements.
 *
 * The result is always a fresh allocation: when `s` already is a `wstring`,
 * it is duplicated rather than returned as is.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    return toUTFImpl!wstring(s);
}
3017
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // these code points are two code units in UTF-16 and one in UTF-32
    assert("𤭢"d.length == 1);
    assert("𐐷"d.length == 1);

    // each one becomes a high/low surrogate pair
    assert("𤭢"d.toUTF16.equal([0xD852, 0xDF62]));
    assert("𐐷"d.toUTF16.equal([0xD801, 0xDC37]));
}
3030
// Same expectations as the documented example, but fed through a class-based
// input range of UTF-8 text instead of a dstring.
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    alias RT = ReferenceInputRange!(ElementType!(string));
    auto r1 = new RT("𤭢");
    auto r2 = new RT("𐐷");

    assert(r1.toUTF16.equal([0xD852, 0xDF62]));
    assert(r2.toUTF16.equal([0xD801, 0xDC37]));
}
3043
3044
/**
 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
 * `dstring` of the elements.
 *
 * The result is always a fresh allocation: when `s` already is a `dstring`,
 * it is duplicated rather than returned as is.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(scope S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    return toUTFImpl!dstring(s);
}
3061
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // these code points are two code units in UTF-16 and one in UTF-32
    assert("𤭢"w.length == 2);
    assert("𐐷"w.length == 2);

    // each surrogate pair collapses into a single UTF-32 code unit
    assert("𤭢"w.toUTF32.equal([0x00024B62]));
    assert("𐐷"w.toUTF32.equal([0x00010437]));
}
3074
/+
Shared implementation of toUTF8, toUTF16 and toUTF32.

`T` is the string type to produce and `s` the range to transcode. The
result is always newly allocated, even when `s` already converts to `T`.
  +/
private T toUTFImpl(T, S)(scope S s)
{
    static if (!is(S : T))
    {
        import std.array : appender;

        alias TargetChar = Unqual!(ElementEncodingType!T);
        auto sink = appender!T();

        // Pre-size the buffer when the number of source elements is known.
        static if (is(S == C[], C) || hasLength!S)
            sink.reserve(s.length);

        // byUTF yields the input as code units of the target type.
        foreach (unit; s.byUTF!TargetChar)
            sink.put(unit);

        return sink.data;
    }
    else
    {
        // Already the requested encoding: hand back an immutable copy.
        return s.idup;
    }
}
3095
3096 /* =================== toUTFz ======================= */
3097
/++
    Returns a C-style zero-terminated string equivalent to `str`. `str`
    must not contain embedded `'\0'`'s as any C function will treat the first
    `'\0'` that it sees as the end of the string. If `str.empty` is
    `true`, then a string containing only `'\0'` is returned.

    `toUTFz` accepts any type of string and is templated on the type of
    character pointer that you wish to convert to. It will avoid allocating a
    new string if it can, but there's a decent chance that it will end up having
    to allocate a new string - particularly when dealing with character types
    other than `char`.

    $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then if
    anything alters the character one past the end of `str` (which is the
    `'\0'` character terminating the string), then the string won't be
    zero-terminated anymore. The most likely scenarios for that are if you
    append to `str` and no reallocation takes place or when `str` is a
    slice of a larger array, and you alter the character in the larger array
    which is one character past the end of `str`. Another case where it could
    occur would be if you had a mutable character array immediately after
    `str` in memory (for example, if they're member variables in a
    user-defined type with one declared right after the other) and that
    character array happened to start with `'\0'`. Such scenarios will never
    occur if you immediately use the zero-terminated string after calling
    `toUTFz` and the C function using it doesn't keep a reference to it.
    Also, they are unlikely to occur even if you save the zero-terminated string
    (the cases above would be among the few examples of where it could happen).
    However, if you save the zero-terminated string and want to be absolutely
    certain that the string stays zero-terminated, then simply append a
    `'\0'` to the string and use its `ptr` property rather than calling
    `toUTFz`.

    $(RED Warning 2:) When passing a character pointer to a C function, and the
    C function keeps it around for any reason, make sure that you keep a
    reference to it in your D code. Otherwise, it may go away during a garbage
    collection cycle and cause a nasty bug when the C code tries to use it.

    Params:
        P = the character pointer type to convert to
        str = the string to convert

    Returns:
        A pointer of type `P` to zero-terminated character data equivalent
        to `str`.
  +/
template toUTFz(P)
if (isPointer!P && isSomeChar!(typeof(*P.init)))
{
    P toUTFz(S)(S str) @safe pure
    if (isSomeString!S)
    {
        return toUTFzImpl!(P, S)(str);
    }
}
3144
///
@safe pure unittest
{
    // same character type, different pointer qualifiers
    auto p1 = toUTFz!(char*)("hello world");
    auto p2 = toUTFz!(const(char)*)("hello world");
    auto p3 = toUTFz!(immutable(char)*)("hello world");
    // source and target character types differ
    auto p4 = toUTFz!(char*)("hello world"d);
    auto p5 = toUTFz!(const(wchar)*)("hello world");
    auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
}
3155
/+
toUTFz implementation for an immutable source whose character type matches
the pointee of `P`.

An empty `str` yields a fresh one-element array holding `'\0'`. If `P`
points to mutable characters the work is forwarded to the `const(C)[]`
overload. Otherwise, outside CTFE, `str` itself is returned when the
element just past its end is readable and already `'\0'`; if not, the
`const(C)[]` overload is used as well.
  +/
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(immutable typeof(*P.init) == typeof(str[0])))
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
{
    if (str.empty)
    {
        typeof(*P.init)[] retval = ['\0'];

        auto trustedPtr() @trusted { return retval.ptr; }
        return trustedPtr();
    }

    alias C = Unqual!(ElementEncodingType!S);

    //If the P is mutable, then we have to make a copy.
    static if (is(Unqual!(typeof(*P.init)) == typeof(*P.init)))
    {
        return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
    }
    else
    {
        // The peek below reads memory outside of `str`, so it is skipped in CTFE.
        if (!__ctfe)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            immutable p = trustedPtrAdd(str);

            // Peek past end of str, if it's 0, no conversion necessary.
            // Note that the compiler will put a 0 past the end of static
            // strings, and the storage allocator will put a 0 past the end
            // of newly allocated char[]'s.
            // Is p dereferenceable? A simple test: if the p points to an
            // address multiple of 4, then conservatively assume the pointer
            // might be pointing to a new block of memory, which might be
            // unreadable. Otherwise, it's definitely pointing to valid
            // memory.
            if ((cast(size_t) p & 3) && *p == '\0')
                return &str[0];
        }

        return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
    }
}
3198
/+
toUTFz implementation for a mutable or const source whose character type
matches the pointee of `P`.

When the qualifiers allow the result to alias `str`, the function first
tries to reuse `str` in place (if the element just past its end is readable
and already `'\0'`) and otherwise appends a terminator. When they do not, a
terminated copy is allocated.

An empty `str` never takes the in-place shortcut: that shortcut returns
`&str[0]`, which is out of bounds for an empty slice. Empty input therefore
always gets a freshly terminated buffer.
  +/
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(typeof(str[0]) C) && is(immutable typeof(*P.init) == immutable C) && !is(C == immutable))
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias InChar  = typeof(str[0]);
    alias OutChar = typeof(*P.init);

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if (( is(const(Unqual!InChar) == InChar) &&  is(const(Unqual!OutChar) == OutChar)) ||
               (!is(const(Unqual!InChar) == InChar) && !is(immutable(Unqual!OutChar) == OutChar)))
    {
        // The peek reads memory outside of `str`, so it is skipped in CTFE.
        // It is also skipped for empty input, where `&str[0]` would fail the
        // bounds check even though a terminator sits at `str.ptr`.
        if (!__ctfe && str.length != 0)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            auto p = trustedPtrAdd(str);

            // Only dereference p when it is not 4-byte aligned: an aligned
            // address might be the start of a new, possibly unreadable block.
            if ((cast(size_t) p & 3) && *p == '\0')
                return &str[0];
        }

        str ~= '\0';
        return &str[0];
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        import std.array : uninitializedArray;
        auto copy = uninitializedArray!(Unqual!OutChar[])(str.length + 1);
        copy[0 .. $ - 1] = str[];
        copy[$ - 1] = '\0';

        auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; }
        return trustedCast(copy);
    }
}
3236
/+
toUTFz implementation for the case where the character type of `str`
differs from the pointee of `P`: the text is rebuilt code point by code
point in a new buffer that ends with `'\0'`.
  +/
private P toUTFzImpl(P, S)(S str) @safe pure
if (!is(immutable typeof(*P.init) == immutable typeof(str[0])))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    import std.array : appender;

    alias OutChar = typeof(*P.init);
    auto sink = appender!(OutChar[])();

    // Asking foreach for dchar makes it decode `str` into code points.
    foreach (dchar codePoint; str)
        sink.put(codePoint);
    sink.put('\0');

    auto rawPtr() @trusted { return cast(P) sink.data.ptr; }
    return rawPtr();
}
3250
// Thorough toUTFz test: same-type conversions (with and without a terminator
// already in place after the data) followed by every cross-encoding
// combination of source string type and target pointer type.
@safe pure unittest
{
    import core.exception : AssertError;
    import std.algorithm;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        alias C = Unqual!(ElementEncodingType!S);

        auto s1 = to!S("hello\U00010143\u0100\U00010143");
        // s2 has the same contents as s1, but the element right after its
        // end is '\n' rather than '\0'.
        auto temp = new C[](s1.length + 1);
        temp[0 .. $ - 1] = s1[0 .. $];
        temp[$ - 1] = '\n';
        --temp.length;
        auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
        auto s2 = trustedAssumeUnique(temp);
        assert(s1 == s2);

        // The result must match s and be followed by a terminator.
        void trustedCStringAssert(P, S)(S s) @trusted
        {
            auto p = toUTFz!P(s);
            assert(p[0 .. s.length] == s);
            assert(p[s.length] == '\0');
        }

        foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
        {
            trustedCStringAssert!P(s1);
            trustedCStringAssert!P(s2);
        }
    }
    });

    // Converts s to P, measures the result up to its terminator and compares
    // it with s; a mismatch is reported at the caller's line.
    static void test(P, S)(S s, size_t line = __LINE__) @trusted
    {
        static size_t zeroLen(C)(const(C)* ptr) @trusted
        {
            size_t len = 0;
            while (*ptr != '\0') { ++ptr; ++len; }
            return len;
        }

        auto p = toUTFz!P(s);
        immutable len = zeroLen(p);
        enforce(cmp(s, p[0 .. len]) == 0,
                new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
                                __FILE__, line));
    }

    assertCTFEable!(
    {
    // UTF-8 source to UTF-16/UTF-32 pointers.
    foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143");
    }
    // UTF-16 source to UTF-8/UTF-32 pointers.
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"w);
    }
    // UTF-32 source to UTF-8/UTF-16 pointers.
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          wchar*, const(wchar)*, immutable(wchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"d);
    }
    // Mutable and const sources to every pointer type.
    foreach (S; AliasSeq!( char[], const( char)[],
                          wchar[], const(wchar)[],
                          dchar[], const(dchar)[]))
    {
        auto s = to!S("hello\U00010143\u0100\U00010143");

        foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                              wchar*, const(wchar)*, immutable(wchar)*,
                              dchar*, const(dchar)*, immutable(dchar)*))
        {
            test!P(s);
        }
    }
    });
}
3337
3338
/++
    `toUTF16z` is a convenience function for `toUTFz!(const(wchar)*)`.

    Encodes string `str` into UTF-16 and returns the encoded string.
    `toUTF16z` is suitable for calling the 'W' functions in the Win32 API
    that take an `LPCWSTR` argument.

    Params:
        str = the string to convert

    Returns:
        A pointer to the zero-terminated UTF-16 encoding of `str`.
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    return toUTFz!(const(wchar)*)(str);
}
3351
///
@system unittest
{
    string str = "Hello, World!";
    const(wchar)* p = str.toUTF16z;
    // the text is ASCII, so the terminator sits at index str.length
    assert(p[str.length] == '\0');
}
3359
// Instantiation check of toUTF16z for each built-in string type.
@safe pure unittest
{
    import std.conv : to;
    //toUTFz is already thoroughly tested, so this will just verify that
    //toUTF16z compiles properly for the various string types.
    foreach (S; AliasSeq!(string, wstring, dstring))
        assert(toUTF16z(to!S("hello world")) !is null);
}
3368
3369
3370 /* ================================ tests ================================== */
3371
// Cross-encoding conversions between all three UTF forms, for text needing
// one, several, and the maximum number of code units per code point.
@safe pure unittest
{
    import std.exception;

    assertCTFEable!(
    {
    // ASCII only
    assert(toUTF16("hello"c) == "hello");
    assert(toUTF32("hello"c) == "hello");
    assert(toUTF8 ("hello"w) == "hello");
    assert(toUTF32("hello"w) == "hello");
    assert(toUTF8 ("hello"d) == "hello");
    assert(toUTF16("hello"d) == "hello");

    // a BMP code point (three code units in UTF-8)
    assert(toUTF16("hel\u1234o"c) == "hel\u1234o");
    assert(toUTF32("hel\u1234o"c) == "hel\u1234o");
    assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o");
    assert(toUTF32("hel\u1234o"w) == "hel\u1234o");
    assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o");
    assert(toUTF16("hel\u1234o"d) == "hel\u1234o");

    // a supplementary code point (surrogate pair in UTF-16)
    assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
    assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
    assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
    assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
    assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
    assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
    });
}
3400
3401
/++
    Returns the total number of code points encoded in `str`.

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Note:
        This function is `nothrow`, so malformed input is not reported with a
        `UTFException`. The counting is delegated to $(LREF byDchar); how an
        invalid sequence contributes to the result is decided there
        (presumably as a $(LREF replacementDchar) - confirm against
        `byDchar`).

    Params:
        str = the string to count the code points of
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    return walkLength(str.byDchar);
}
3417
///
@safe pure nothrow @nogc unittest
{
    assert(count("") == 0);
    assert(count("a") == 1);
    assert(count("abc") == 3);
    // the euro sign is a single code point even though it is several code units
    assert(count("\u20AC100") == 4);
}
3426
// Same checks as the documented example, additionally run in CTFE.
@safe pure nothrow @nogc unittest
{
    import std.exception;
    assertCTFEable!(
    {
    assert(count("") == 0);
    assert(count("a") == 1);
    assert(count("abc") == 3);
    assert(count("\u20AC100") == 4);
    });
}
3438
3439
// Ranges of code units for testing.
// Each type wraps a C[] and exposes a different set of range primitives;
// the constructors convert their argument with to!(C[]).
version (StdUnittest)
{
private:
    // Struct offering only empty, front and popFront.
    struct InputCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Struct that adds back, popBack, save and length.
    struct BidirCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return BidirCU(_str); }
        @property size_t length() { return _str.length; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Struct that additionally supports indexing and slicing.
    struct RandomCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return RandomCU(_str); }
        @property size_t length() { return _str.length; }
        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Class counterpart of BidirCU; save allocates a new object.
    class RefBidirCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return new RefBidirCU(_str); }
        @property size_t length() { return _str.length; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Class counterpart of RandomCU; save and opSlice allocate new objects.
    class RefRandomCU(C)
    {
        import std.conv : to;
        @property bool empty() { return _str.empty; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
        @property auto save() { return new RefRandomCU(_str); }
        @property size_t length() { return _str.length; }
        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }
}
3539
3540
3541 /**
3542  * The replacement character U+FFFD, inserted in place of invalid UTF sequences.
3543  *
3544  * References:
3545  *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
3546  */
3547 enum dchar replacementDchar = '\uFFFD';
3548
3549 /********************************************
3550  * Iterate a range of char, wchar, or dchars by code unit.
3551  *
3552  * The purpose is to bypass the special case decoding that
3553  * $(REF front, std,range,primitives) does to character arrays. As a result,
3554  * using ranges with `byCodeUnit` can be `nothrow` while
3555  * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
3556  * sequences.
3557  *
3558  * A code unit is a building block of the UTF encodings. Generally, an
3559  * individual code unit does not represent what's perceived as a full
3560  * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
3561  * are encoded with multiple code units. For example, the UTF-8 code units for
3562  * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
3563  * often does not form a character on its own. Attempting to treat it as
3564  * one while iterating over the resulting range will give nonsensical results.
3565  *
3566  * Params:
3567  *      r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
3568  *      of characters (including strings) or a type that implicitly converts to a string type.
3569  * Returns:
3570  *      If `r` is not an auto-decodable string (i.e. a narrow string or a
3571  *      user-defined type that implicitly converts to a string type), then `r`
3572  *      is returned.
3573  *
3574  *      Otherwise, `r` is converted to its corresponding string type (if it's
3575  *      not already a string) and wrapped in a random-access range where the
3576  *      element encoding type of the string (its code unit) is the element type
3577  *      of the range, and that range returned. The range has slicing.
3578  *
3579  *      If `r` is quirky enough to be a struct or class which is an input range
3580  *      of characters on its own (i.e. it has the input range API as member
3581  *      functions), $(I and) it's implicitly convertible to a string type, then
3582  *      `r` is returned, and no implicit conversion takes place.
3583  *
3584  *      If `r` is wrapped in a new range, then that range has a `source`
3585  *      property for returning the string that's currently contained within that
3586  *      range.
3587  *
3588  * See_Also:
3589  *      Refer to the $(MREF std, uni) docs for a reference on Unicode
3590  *      terminology.
3591  *
3592  *      For a range that iterates by grapheme cluster (written character) see
3593  *      $(REF byGrapheme, std,uni).
3594  */
3595 auto byCodeUnit(R)(R r)
3596 if ((isConvertibleToString!R && !isStaticArray!R) ||
3597     (isInputRange!R && isSomeChar!(ElementEncodingType!R)))
3598 {
3599     import std.traits : StringTypeOf;
3600     static if (// This would be cleaner if we had a way to check whether a type
3601                // was a range without any implicit conversions.
3602                (isAutodecodableString!R && !__traits(hasMember, R, "empty") &&
3603                 !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront")))
3604     {
3605         static struct ByCodeUnitImpl // random-access view over the code units of `source`
3606         {
3607         @safe pure nothrow @nogc: // applies to every member that follows
3608
3609             @property bool empty() const     { return source.length == 0; }
3610             @property auto ref front() inout { return source[0]; }
3611             void popFront()                  { source = source[1 .. $]; }
3612
3613             @property auto save() { return ByCodeUnitImpl(source.save); }
3614
3615             @property auto ref back() inout { return source[$ - 1]; }
3616             void popBack()                  { source = source[0 .. $-1]; }
3617
3618             auto ref opIndex(size_t index) inout     { return source[index]; }
3619             auto opSlice(size_t lower, size_t upper) { return ByCodeUnitImpl(source[lower .. upper]); }
3620
3621             @property size_t length() const { return source.length; }
3622             alias opDollar = length;
3623
3624             StringTypeOf!R source; // the wrapped string; this is the documented `source` property
3625         }
3626
3627         static assert(isRandomAccessRange!ByCodeUnitImpl);
3628
3629         return ByCodeUnitImpl(r); // r is stored as StringTypeOf!R
3630     }
3631     else static if (!isInputRange!R ||
3632                     (is(R : const dchar[]) && !__traits(hasMember, R, "empty") &&
3633                     !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront")))
3634     {
3635         return cast(StringTypeOf!R) r; // not a range by its own members: return the string it converts to
3636     }
3637     else
3638     {
3639         // byCodeUnit for ranges and dchar[] is a no-op
3640         return r;
3641     }
3642 }
3643
3644 /// The wrapper is a random-access range of code units, unlike an auto-decoded string.
3645 @safe unittest
3646 {
3647     import std.range.primitives;
3648     import std.traits : isAutodecodableString;
3649
3650     auto r = "Hello, World!".byCodeUnit();
3651     static assert(hasLength!(typeof(r)));
3652     static assert(hasSlicing!(typeof(r)));
3653     static assert(isRandomAccessRange!(typeof(r)));
3654     static assert(is(ElementType!(typeof(r)) == immutable char));
3655
3656     // contrast with the range capabilities of standard strings (with or
3657     // without autodecoding enabled).
3658     auto s = "Hello, World!";
3659     static assert(isBidirectionalRange!(typeof(s))); // true in both modes; the traits below are what differ
3660     static if (isAutodecodableString!(typeof(s)))
3661     {
3662         // with autodecoding enabled, strings are non-random-access ranges of
3663         // dchar.
3664         static assert(is(ElementType!(typeof(s)) == dchar));
3665         static assert(!isRandomAccessRange!(typeof(s)));
3666         static assert(!hasSlicing!(typeof(s)));
3667         static assert(!hasLength!(typeof(s)));
3668     }
3669     else
3670     {
3671         // without autodecoding, strings are normal arrays.
3672         static assert(is(ElementType!(typeof(s)) == immutable char));
3673         static assert(isRandomAccessRange!(typeof(s)));
3674         static assert(hasSlicing!(typeof(s)));
3675         static assert(hasLength!(typeof(s)));
3676     }
3677 }
3678
3679 /// `byCodeUnit` does no Unicode decoding
3680 @safe unittest
3681 {
3682     string decomposed = "noe\u0308l"; // "noël" spelled as e + combining diaeresis
3683     auto decomposedUnits = byCodeUnit(decomposed);
3684     assert(decomposedUnits[2] != 'ë' && decomposedUnits[2] == 'e');
3685
3686     string precomposed = "no\u00EBl"; // "noël" spelled with the single code point ë
3687     // The string is UTF-8, so index 2 holds only the first code unit of
3688     // the sequence that encodes 'ë' and cannot compare equal to 'ë'.
3689     assert(byCodeUnit(precomposed)[2] != 'ë');
3690 }
3691
3692 /// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
3693 @safe unittest
3694 {
3695     import std.algorithm.comparison : equal;
3696     import std.range : popFrontN;
3697     import std.traits : isAutodecodableString;
3698     {
3699         auto narrow = "hello world".byCodeUnit;
3700         popFrontN(narrow, 3);
3701         assert(narrow.save.equal("lo world"));
3702         static if (isAutodecodableString!string) // the wrapper exists only with autodecoding
3703         {
3704             string remaining = narrow.source;
3705             assert(remaining == "lo world");
3706         }
3707     }
3708     // a dstring is not wrapped, so there is no `source` to read
3709     {
3710         auto wide = "hello world"d.byCodeUnit;
3711         static assert(!__traits(compiles, wide.source));
3712     }
3713 }
3714
3715 @safe pure nothrow @nogc unittest
3716 {
3717     import std.range;
3718     { // UTF-8: applying byCodeUnit twice still yields every code unit unchanged
3719         enum testStr = "𐁄𐂌𐃯 hello ディラン";
3720         char[testStr.length] s;
3721         int i;
3722         foreach (c; testStr.byCodeUnit().byCodeUnit())
3723         {
3724             s[i++] = c;
3725         }
3726         assert(s == testStr);
3727     }
3728     { // UTF-16: same round trip with wchar code units
3729         enum testStr = "𐁄𐂌𐃯 hello ディラン"w;
3730         wchar[testStr.length] s;
3731         int i;
3732         foreach (c; testStr.byCodeUnit().byCodeUnit())
3733         {
3734             s[i++] = c;
3735         }
3736         assert(s == testStr);
3737     }
3738     { // UTF-32: same round trip with dchar code units
3739         enum testStr = "𐁄𐂌𐃯 hello ディラン"d;
3740         dchar[testStr.length] s;
3741         int i;
3742         foreach (c; testStr.byCodeUnit().byCodeUnit())
3743         {
3744             s[i++] = c;
3745         }
3746         assert(s == testStr);
3747     }
3748     { // length, indexing and slicing
3749         auto bcu = "hello".byCodeUnit();
3750         assert(bcu.length == 5);
3751         assert(bcu[3] == 'l');
3752         assert(bcu[2 .. 4][1] == 'l');
3753     }
3754     { // elements of a mutable char[] can be assigned through front and opIndex
3755         char[5] orig = "hello";
3756         auto bcu = orig[].byCodeUnit();
3757         bcu.front = 'H';
3758         assert(bcu.front == 'H');
3759         bcu[1] = 'E';
3760         assert(bcu[1] == 'E');
3761     }
3762     { // save: the saved copy is unaffected by popFront on the original
3763         auto bcu = "hello".byCodeUnit().byCodeUnit();
3764         static assert(isForwardRange!(typeof(bcu)));
3765         static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
3766         auto s = bcu.save;
3767         bcu.popFront();
3768         assert(s.front == 'h');
3769     }
3770     { // backwards traversal (retro) over UTF-8 code units
3771         auto bcu = "hello".byCodeUnit();
3772         static assert(hasSlicing!(typeof(bcu)));
3773         static assert(isBidirectionalRange!(typeof(bcu)));
3774         static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
3775         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3776         auto ret = bcu.retro;
3777         assert(ret.front == 'o');
3778         ret.popFront();
3779         assert(ret.front == 'l');
3780     }
3781     { // backwards traversal (retro) over UTF-16 code units
3782         auto bcu = "κόσμε"w.byCodeUnit();
3783         static assert(hasSlicing!(typeof(bcu)));
3784         static assert(isBidirectionalRange!(typeof(bcu)));
3785         static assert(is(typeof(bcu) == struct) == isAutodecodableString!wstring);
3786         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3787         auto ret = bcu.retro;
3788         assert(ret.front == 'ε');
3789         ret.popFront();
3790         assert(ret.front == 'μ');
3791     }
3792     { // struct that converts to string through a field `alias this`
3793         static struct Stringish
3794         {
3795             string s;
3796             alias s this;
3797         }
3798
3799         auto orig = Stringish("\U0010fff8 𐁊 foo 𐂓");
3800         auto bcu = orig.byCodeUnit();
3801         static assert(is(typeof(bcu) == struct));
3802         static assert(!is(typeof(bcu) == Stringish) == isAutodecodableString!Stringish);
3803         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3804         static assert(is(ElementType!(typeof(bcu)) == immutable char));
3805         assert(bcu.front == cast(char) 244); // 244 == 0xF4, the first UTF-8 code unit of U+10FFF8
3806     }
3807     { // struct that converts to wstring through a field `alias this`
3808         static struct WStringish
3809         {
3810             wstring s;
3811             alias s this;
3812         }
3813
3814         auto orig = WStringish("\U0010fff8 𐁊 foo 𐂓"w);
3815         auto bcu = orig.byCodeUnit();
3816         static assert(is(typeof(bcu) == struct));
3817         static assert(!is(typeof(bcu) == WStringish) == isAutodecodableString!WStringish);
3818         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3819         static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
3820         assert(bcu.front == cast(wchar) 56319); // 56319 == 0xDBFF, the high surrogate of U+10FFF8
3821     }
3822     { // struct that converts to dstring: the dstring itself is returned
3823         static struct DStringish
3824         {
3825             dstring s;
3826             alias s this;
3827         }
3828
3829         auto orig = DStringish("\U0010fff8 𐁊 foo 𐂓"d);
3830         auto bcu = orig.byCodeUnit();
3831         static assert(is(typeof(bcu) == dstring));
3832         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3833         static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
3834         assert(bcu.front == cast(dchar) 1114104); // 1114104 == 0x10FFF8
3835     }
3836     { // struct whose `alias this` is a function returning string
3837         static struct FuncStringish
3838         {
3839             string str;
3840             string s() pure nothrow @nogc { return str; }
3841             alias s this;
3842         }
3843
3844         auto orig = FuncStringish("\U0010fff8 𐁊 foo 𐂓");
3845         auto bcu = orig.byCodeUnit();
3846         static if (isAutodecodableString!FuncStringish)
3847             static assert(is(typeof(bcu) == struct));
3848         else
3849             static assert(is(typeof(bcu) == string));
3850         static assert(!is(typeof(bcu) == FuncStringish));
3851         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3852         static assert(is(ElementType!(typeof(bcu)) == immutable char));
3853         assert(bcu.front == cast(char) 244);
3854     }
3855     { // user-defined input range of char is returned with its type unchanged
3856         static struct Range
3857         {
3858             string data;
3859             bool empty() pure nothrow @nogc { return data.empty; }
3860             char front() pure nothrow @nogc { return data[0]; }
3861             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3862         }
3863
3864         auto orig = Range("\U0010fff8 𐁊 foo 𐂓");
3865         auto bcu = orig.byCodeUnit();
3866         static assert(is(typeof(bcu) == Range));
3867         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3868         static assert(is(ElementType!(typeof(bcu)) == char));
3869         assert(bcu.front == cast(char) 244);
3870     }
3871     { // user-defined input range of wchar is returned with its type unchanged
3872         static struct WRange
3873         {
3874             wstring data;
3875             bool empty() pure nothrow @nogc { return data.empty; }
3876             wchar front() pure nothrow @nogc { return data[0]; }
3877             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3878         }
3879
3880         auto orig = WRange("\U0010fff8 𐁊 foo 𐂓"w);
3881         auto bcu = orig.byCodeUnit();
3882         static assert(is(typeof(bcu) == WRange));
3883         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3884         static assert(is(ElementType!(typeof(bcu)) == wchar));
3885         assert(bcu.front == 56319);
3886     }
3887     { // user-defined input range of dchar is returned with its type unchanged
3888         static struct DRange
3889         {
3890             dstring data;
3891             bool empty() pure nothrow @nogc { return data.empty; }
3892             dchar front() pure nothrow @nogc { return data[0]; }
3893             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3894         }
3895
3896         auto orig = DRange("\U0010fff8 𐁊 foo 𐂓"d);
3897         auto bcu = orig.byCodeUnit();
3898         static assert(is(typeof(bcu) == DRange));
3899         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3900         static assert(is(ElementType!(typeof(bcu)) == dchar));
3901         assert(bcu.front == 1114104);
3902     }
3903     { // both a range and convertible to string: returned as-is, so `data` is iterated rather than `s`
3904         static struct RangeAndStringish
3905         {
3906             bool empty() pure nothrow @nogc { return data.empty; }
3907             char front() pure nothrow @nogc { return data[0]; }
3908             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3909
3910             string data;
3911             string s;
3912             alias s this;
3913         }
3914
3915         auto orig = RangeAndStringish("test.d", "other");
3916         auto bcu = orig.byCodeUnit();
3917         static assert(is(typeof(bcu) == RangeAndStringish));
3918         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3919         static assert(is(ElementType!(typeof(bcu)) == char));
3920         assert(bcu.front == 't');
3921     }
3922     { // same as the previous case, with wchar/wstring
3923         static struct WRangeAndStringish
3924         {
3925             bool empty() pure nothrow @nogc { return data.empty; }
3926             wchar front() pure nothrow @nogc { return data[0]; }
3927             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3928
3929             wstring data;
3930             wstring s;
3931             alias s this;
3932         }
3933
3934         auto orig = WRangeAndStringish("test.d"w, "other"w);
3935         auto bcu = orig.byCodeUnit();
3936         static assert(is(typeof(bcu) == WRangeAndStringish));
3937         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3938         static assert(is(ElementType!(typeof(bcu)) == wchar));
3939         assert(bcu.front == 't');
3940     }
3941     { // same as the previous case, with dchar/dstring
3942         static struct DRangeAndStringish
3943         {
3944             bool empty() pure nothrow @nogc { return data.empty; }
3945             dchar front() pure nothrow @nogc { return data[0]; }
3946             void popFront() pure nothrow @nogc { data = data[1 .. $]; }
3947
3948             dstring data;
3949             dstring s;
3950             alias s this;
3951         }
3952
3953         auto orig = DRangeAndStringish("test.d"d, "other"d);
3954         auto bcu = orig.byCodeUnit();
3955         static assert(is(typeof(bcu) == DRangeAndStringish));
3956         static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
3957         static assert(is(ElementType!(typeof(bcu)) == dchar));
3958         assert(bcu.front == 't');
3959     }
3960     { // enum with a string base type: the result is never the enum type itself
3961         enum Enum : string { a = "test.d" }
3962
3963         auto orig = Enum.a;
3964         auto bcu = orig.byCodeUnit();
3965         static assert(!is(typeof(bcu) == Enum));
3966         static if (isAutodecodableString!Enum)
3967             static assert(is(typeof(bcu) == struct));
3968         else
3969             static assert(is(typeof(bcu) == string));
3970         static assert(is(ElementType!(typeof(bcu)) == immutable char));
3971         assert(bcu.front == 't');
3972     }
3973     { // enum with a wstring base type
3974         enum WEnum : wstring { a = "test.d"w }
3975
3976         auto orig = WEnum.a;
3977         auto bcu = orig.byCodeUnit();
3978         static assert(!is(typeof(bcu) == WEnum));
3979         static if (isAutodecodableString!WEnum)
3980             static assert(is(typeof(bcu) == struct));
3981         else
3982             static assert(is(typeof(bcu) == wstring));
3983         static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
3984         assert(bcu.front == 't');
3985     }
3986     { // enum with a dstring base type: a plain dstring is returned
3987         enum DEnum : dstring { a = "test.d"d }
3988
3989         auto orig = DEnum.a;
3990         auto bcu = orig.byCodeUnit();
3991         static assert(is(typeof(bcu) == dstring));
3992         static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
3993         assert(bcu.front == 't');
3994     }
3995
3996     static if (autodecodeStrings) // narrow strings are wrapped only when autodecoding is on
3997     {
3998         static assert(!is(typeof(byCodeUnit("hello")) == string));
3999         static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
4000     }
4001     else
4002     {
4003         static assert(is(typeof(byCodeUnit("hello")) == string));
4004         static assert(is(typeof(byCodeUnit("hello"w)) == wstring));
4005     }
4006     static assert(is(typeof(byCodeUnit("hello"d)) == dstring));
4007
4008     static assert(!__traits(compiles, byCodeUnit((char[5]).init))); // static arrays are rejected by the constraint
4009     static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
4010     static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));
4011
4012     enum SEnum : char[5] { a = "hello" }
4013     enum WSEnum : wchar[5] { a = "hello"w }
4014     enum DSEnum : dchar[5] { a = "hello"d }
4015
4016     static assert(!__traits(compiles, byCodeUnit(SEnum.a))); // so are enums whose base type is a static array
4017     static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
4018     static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
4019 }
4020
4021 /****************************
4022  * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4023  * of characters by char, wchar, or dchar.
4024  * These aliases simply forward to $(LREF byUTF) with the corresponding C
4025  * argument and its default `useReplacementDchar` (`Yes.useReplacementDchar`).
4026  *
4027  * Params:
4028  *      r = input range of characters, or array of characters
4029  */
4030 alias byChar = byUTF!char; // C = char
4031
4032 /// Ditto
4033 alias byWchar = byUTF!wchar; // C = wchar
4034
4035 /// Ditto
4036 alias byDchar = byUTF!dchar; // C = dchar
4037
4038 @safe pure nothrow @nogc unittest
4039 {
4040   { // byChar applied twice to a string still yields its char code units
4041     char[5] s;
4042     int i;
4043     foreach (c; "hello".byChar.byChar())
4044     {
4045         //writefln("[%d] '%c'", i, c);
4046         s[i++] = c;
4047     }
4048     assert(s == "hello");
4049   }
4050   { // dchar -> char, with two invalid code points at the end
4051     char[5+2+3+4+3+3] s; // 5 ASCII, then 2-, 3- and 4-unit sequences, then two 3-unit U+FFFD
4052     int i;
4053     dchar[10] a;
4054     a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
4055     a[8] = 0xD800;   // invalid
4056     a[9] = cast(dchar) 0x110000; // invalid
4057     foreach (c; a[].byChar())
4058     {
4059         //writefln("[%d] '%c'", i, c);
4060         s[i++] = c;
4061     }
4062     assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"); // each invalid input became U+FFFD
4063   }
4064   { // wstring source
4065     auto r = "hello"w.byChar();
4066     r.popFront();
4067     r.popFront();
4068     assert(r.front == 'l');
4069   }
4070   { // dstring source
4071     auto r = "hello"d.byChar();
4072     r.popFront();
4073     r.popFront();
4074     assert(r.front == 'l');
4075   }
4076   { // save: the saved copy is unaffected by popFront on the original
4077     auto r = "hello"d.byChar();
4078     static assert(isForwardRange!(typeof(r))); // a type trait, so it is checked at compile time
4079     auto s = r.save;
4080     r.popFront();
4081     assert(s.front == 'h');
4082   }
4083 }
4084
4085 @safe pure nothrow @nogc unittest
4086 {
4087   { // dchar -> wchar, with two invalid code points at the end
4088     wchar[11] s; // 11 == length of the expected UTF-16 string asserted below
4089     int i;
4090     dchar[10] a;
4091     a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
4092     a[8] = 0xD800;   // invalid
4093     a[9] = cast(dchar) 0x110000; // invalid
4094     foreach (c; a[].byWchar())
4095     {
4096         //writefln("[%d] '%c' x%x", i, c, c);
4097         s[i++] = c;
4098     }
4099     foreach (j, wchar c; "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w) // debugging aid: empty unless the writefln is re-enabled
4100     {
4101         //writefln("[%d] '%c' x%x", j, c, c);
4102     }
4103     assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w); // each invalid input became U+FFFD
4104   }
4105
4106   { // string source
4107     auto r = "hello".byWchar();
4108     r.popFront();
4109     r.popFront();
4110     assert(r.front == 'l');
4111   }
4112   { // dstring source
4113     auto r = "hello"d.byWchar();
4114     r.popFront();
4115     r.popFront();
4116     assert(r.front == 'l');
4117   }
4118   { // save: the saved copy is unaffected by popFront on the original
4119     auto r = "hello"d.byWchar();
4120     static assert(isForwardRange!(typeof(r))); // a type trait, so it is checked at compile time
4121     auto s = r.save;
4122     r.popFront();
4123     assert(s.front == 'h');
4124   }
4125 }
4126
4127 @safe pure nothrow @nogc unittest
4128 {
4129   { // char -> dchar
4130     dchar[9] s;
4131     int i;
4132     string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
4133     foreach (c; a.byDchar())
4134     {
4135         s[i++] = c;
4136     }
4137     assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
4138   }
4139   { // invalid char input decodes to the replacement character
4140     foreach (s; invalidUTFstrings!char())
4141     {
4142         auto r = s.byDchar();
4143         assert(!r.empty);
4144         assert(r.front == r.front); // front must give the same value when read twice
4145         dchar c = r.front;
4146         assert(c == replacementDchar);
4147     }
4148   }
4149   { // popFront over a string source
4150     auto r = "hello".byDchar();
4151     r.popFront();
4152     r.popFront();
4153     assert(r.front == 'l');
4154   }
4155
4156   { // wchar -> dchar
4157     dchar[8] s;
4158     int i;
4159     wstring a = "hello\u07FF\uD7FF\U0010FFFF"w;
4160     foreach (c; a.byDchar())
4161     {
4162         //writefln("[%d] '%c' x%x", i, c, c);
4163         s[i++] = c;
4164     }
4165     assert(s == "hello\u07FF\uD7FF\U0010FFFF"d);
4166   }
4167   { // invalid wchar input decodes to the replacement character
4168     foreach (s; invalidUTFstrings!wchar())
4169     {
4170         auto r = s.byDchar();
4171         assert(!r.empty);
4172         assert(r.front == r.front); // front must give the same value when read twice
4173         dchar c = r.front;
4174         assert(c == replacementDchar);
4175     }
4176   }
4177   { // a valid surrogate pair decodes to one code point
4178     wchar[2] ws;
4179     ws[0] = 0xD800;
4180     ws[1] = 0xDD00;             // correct surrogate pair
4181     auto r = ws[].byDchar();
4182     assert(!r.empty);
4183     assert(r.front == r.front);
4184     dchar c = r.front;
4185     assert(c == '\U00010100');
4186   }
4187   { // popFront over a wstring source
4188     auto r = "hello"w.byDchar();
4189     r.popFront();
4190     r.popFront();
4191     assert(r.front == 'l');
4192   }
4193
4194   { // byDchar applied twice to a dstring still yields its code points
4195     dchar[5] s;
4196     int i;
4197     dstring a = "hello"d;
4198     foreach (c; a.byDchar.byDchar())
4199     {
4200         //writefln("[%d] '%c' x%x", i, c, c);
4201         s[i++] = c;
4202     }
4203     assert(s == "hello"d);
4204   }
4205   { // save over a string source
4206     auto r = "hello".byDchar();
4207     static assert(isForwardRange!(typeof(r))); // a type trait, so it is checked at compile time
4208     auto s = r.save;
4209     r.popFront();
4210     assert(s.front == 'h');
4211   }
4212   { // save over a wstring source
4213     auto r = "hello"w.byDchar();
4214     static assert(isForwardRange!(typeof(r))); // a type trait, so it is checked at compile time
4215     auto s = r.save;
4216     r.popFront();
4217     assert(s.front == 'h');
4218   }
4219 }
4220
4221 // test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar,
4222 // which needs to support ranges with and without those attributes
4223
4224 @nogc nothrow pure @safe unittest
4225 {
4226     dchar[5] text = "hello"d;
4227     foreach (unit; byChar(text[]))  { }
4228     foreach (unit; byWchar(text[])) { }
4229     foreach (unit; byDchar(text[])) { }
4230 }
4231
4232 version (StdUnittest)
4233 private int impureVariable; // module-level counter incremented by a test range's popFront (test builds only)
4234
4235 @system unittest
4236 {
4237     static struct AttributeFreeRange(CodeUnit) // always empty; popFront touches a global and throws
4238     {
4239         @property CodeUnit front() const { return CodeUnit.init; }
4240         @property bool empty() const { return true; }
4241         void popFront()
4242         {
4243             ++impureVariable;
4244             throw new Exception("only for testing nothrow");
4245         }
4246     }
4247
4248     foreach (CodeUnit; AliasSeq!(char, wchar, dchar))
4249     {
4250         AttributeFreeRange!CodeUnit source;
4251         foreach (unit; byChar(source))  { }
4252         foreach (unit; byWchar(source)) { }
4253         foreach (unit; byDchar(source)) { }
4254     }
4255 }
4256
4257 /****************************
4258  * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4259  * of characters by char type `C` by encoding the elements of the range.
4260  *
4261  * UTF sequences that cannot be converted to the specified encoding are either
4262  * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
4263  * of the Unicode Standard 6.2 or result in a thrown UTFException.
4264  *  Hence byUTF is not symmetric.
4265  * This algorithm is lazy, and does not allocate memory.
4266  * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
4267  * `r` parameter.
4268  *
4269  * Params:
4270  *      C = `char`, `wchar`, or `dchar`
4271  *      useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
4272  *                            UseReplacementDchar.no means throw `UTFException` for invalid UTF
4273  *
4274  * Throws:
4275  *      `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
4276  *
4277  * GC:
4278  *      Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
4279  *
4280  * Returns:
4281  *      A bidirectional range if `R` is a bidirectional range and not auto-decodable,
4282  *      as defined by $(REF isAutodecodableString, std, traits).
4283  *
4284  *      A forward range if `R` is a forward range and not auto-decodable.
4285  *
4286  *      Or, if `R` is a range and it is auto-decodable and
4287  *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
4288  *      to $(LREF byCodeUnit).
4289  *
4290  *      Otherwise, an input range of characters.
4291  */
4292 template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar)
4293 if (isSomeChar!C)
4294 {
4295     static if (is(immutable C == immutable UC, UC) && !is(C == UC))
4296         alias byUTF = byUTF!UC;
4297     else:
4298
4299     auto ref byUTF(R)(R r)
4300         if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
4301     {
4302         return byUTF(r.byCodeUnit());
4303     }
4304
4305     auto ref byUTF(R)(R r)
4306         if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
4307     {
4308         static if (is(immutable ElementEncodingType!R == immutable RC, RC) && is(RC == C))
4309         {
4310             return r.byCodeUnit();
4311         }
4312         else static if (is(C == dchar))
4313         {
4314             static struct Result
4315             {
4316                 enum Empty = uint.max;  // range is empty or just constructed
4317
4318                 this(return scope R r)
4319                 {
4320                     this.r = r;
4321                 }
4322
4323                 this(return scope R r, uint buff)
4324                 {
4325                     this.r = r;
4326                     this.buff = buff;
4327                 }
4328
4329                 static if (isBidirectionalRange!R)
4330                 {
4331                     this(return scope R r, uint frontBuff, uint backBuff)
4332                     {
4333                         this.r = r;
4334                         this.buff = frontBuff;
4335                         this.backBuff = backBuff;
4336                     }
4337                 }
4338
4339                 @property bool empty()
4340                 {
4341                     static if (isBidirectionalRange!R)
4342                         return buff == Empty && backBuff == Empty && r.empty;
4343                     else
4344                         return buff == Empty && r.empty;
4345                 }
4346
4347                 @property dchar front() scope // 'scope' required by call to decodeFront() below
4348                 {
4349                     if (buff == Empty)
4350                     {
4351                         auto c = r.front;
4352
4353                         static if (is(RC == wchar))
4354                             enum firstMulti = 0xD800; // First high surrogate.
4355                         else
4356                             enum firstMulti = 0x80; // First non-ASCII.
4357                         if (c < firstMulti)
4358                         {
4359                             r.popFront;
4360                             buff = cast(dchar) c;
4361                         }
4362                         else
4363                         {
4364                             buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
4365                         }
4366                     }
4367                     return cast(dchar) buff;
4368                 }
4369
4370                 void popFront()
4371                 {
4372                     if (buff == Empty)
4373                         front();
4374                     buff = Empty;
4375                 }
4376
4377                 static if (isForwardRange!R)
4378                 {
4379                     @property auto save()
4380                     {
4381                         static if (isBidirectionalRange!R)
4382                         {
4383                             return Result(r.save, buff, backBuff);
4384                         }
4385                         else
4386                         {
4387                             return Result(r.save, buff);
4388                         }
4389                     }
4390                 }
4391
4392                 static if (isBidirectionalRange!R)
4393                 {
4394                     @property dchar back() scope // 'scope' required by call to decodeBack() below
4395                     {
4396                         if (backBuff != Empty)
4397                             return cast(dchar) backBuff;
4398
4399                         auto c = r.back;
4400                         static if (is(RC == wchar))
4401                             enum firstMulti = 0xD800; // First high surrogate.
4402                         else
4403                             enum firstMulti = 0x80; // First non-ASCII.
4404                         if (c < firstMulti)
4405                         {
4406                             r.popBack;
4407                             backBuff = cast(dchar) c;
4408                         }
4409                         else
4410                         {
4411                             backBuff = () @trusted { return decodeBack!useReplacementDchar(r); }();
4412                         }
4413                         return cast(dchar) backBuff;
4414
4415                     }
4416
4417                     void popBack()
4418                     {
4419                         if (backBuff == Empty)
4420                             back();
4421                         backBuff = Empty;
4422                     }
4423                 }
4424
4425             private:
4426
4427                 R r;
4428                 uint buff = Empty;      // one character lookahead buffer
4429                 static if (isBidirectionalRange!R)
4430                     uint backBuff = Empty;
4431             }
4432
4433             return Result(r);
4434         }
4435         else
4436         {
4437             static struct Result
4438             {
4439                 this(return scope R r)
4440                 {
4441                     this.r = r;
4442                 }
4443
4444                 this(return scope R r, ushort pos, ushort fill, C[4 / C.sizeof] buf)
4445                 {
4446                     this.r = r;
4447                     this.pos = pos;
4448                     this.fill = fill;
4449                     this.buf = buf;
4450                 }
4451
4452                 static if (isBidirectionalRange!R)
4453                 {
4454                     this(return scope R r, ushort frontPos, ushort frontFill,
4455                          ushort backPos, ushort backFill, C[4 / C.sizeof] buf)
4456                     {
4457                         this.r = r;
4458                         this.pos = frontPos;
4459                         this.fill = frontFill;
4460                         this.backPos = backPos;
4461                         this.backFill = backFill;
4462                         this.buf = buf;
4463                     }
4464                 }
4465
4466                 @property bool empty()
4467                 {
4468                     static if (isBidirectionalRange!R)
4469                         return pos == fill && backPos == backFill && r.empty;
4470                     else
4471                         return pos == fill && r.empty;
4472                 }
4473
4474                 @property auto front() scope // 'scope' required by call to decodeFront() below
4475                 {
4476                     if (pos == fill)
4477                     {
4478                         pos = 0;
4479                         auto c = r.front;
4480
4481                         static if (C.sizeof >= 2 && RC.sizeof >= 2)
4482                             enum firstMulti = 0xD800; // First high surrogate.
4483                         else
4484                             enum firstMulti = 0x80; // First non-ASCII.
4485                         if (c < firstMulti)
4486                         {
4487                             fill = 1;
4488                             r.popFront;
4489                             buf[pos] = cast(C) c;
4490                         }
4491                         else
4492                         {
4493                             static if (is(RC == dchar))
4494                             {
4495                                 r.popFront;
4496                                 dchar dc = c;
4497                             }
4498                             else
4499                                 dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
4500                             fill = cast(ushort) encode!(useReplacementDchar)(buf, dc);
4501                         }
4502                     }
4503                     return buf[pos];
4504                 }
4505
4506                 void popFront()
4507                 {
4508                     if (pos == fill)
4509                         front;
4510                     ++pos;
4511                 }
4512
4513                 static if (isForwardRange!R)
4514                 {
4515                     @property auto save()
4516                     {
4517                         static if (isBidirectionalRange!R)
4518                         {
4519                             return Result(r.save, pos, fill, backPos, backFill, buf);
4520                         }
4521                         else
4522                         {
4523                             return Result(r.save, pos, fill, buf);
4524                         }
4525                     }
4526                 }
4527
4528                 static if (isBidirectionalRange!R)
4529                 {
4530                     @property auto back() scope // 'scope' required by call to decodeBack() below
4531                     {
4532                         if (backPos != backFill)
4533                             return buf[cast(ushort) (backFill - backPos - 1)];
4534
4535                         backPos = 0;
4536                         auto c = r.back;
4537                         static if (C.sizeof >= 2 && RC.sizeof >= 2)
4538                             enum firstMulti = 0xD800; // First high surrogate.
4539                         else
4540                             enum firstMulti = 0x80; // First non-ASCII.
4541                         if (c < firstMulti)
4542                         {
4543                             backFill = 1;
4544                             r.popBack;
4545                             buf[cast(ushort) (backFill - backPos - 1)] = cast(C) c;
4546                         }
4547                         else
4548                         {
4549                             static if (is(RC == dchar))
4550                             {
4551                                 r.popBack;
4552                                 dchar dc = c;
4553                             }
4554                             else
4555                                 dchar dc = () @trusted { return decodeBack!(useReplacementDchar)(r); }();
4556                             backFill = cast(ushort) encode!(useReplacementDchar)(buf, dc);
4557                         }
4558                         return buf[cast(ushort) (backFill - backPos - 1)];
4559                     }
4560
4561                     void popBack()
4562                     {
4563                         if (backPos == backFill)
4564                             back;
4565                         ++backPos;
4566                     }
4567                 }
4568
4569             private:
4570
4571                 R r;
4572                 ushort pos, fill;
4573                 static if (isBidirectionalRange!R)
4574                     ushort backPos, backFill;
4575                 C[4 / C.sizeof] buf = void;
4576             }
4577
4578             return Result(r);
4579         }
4580     }
4581 }
4582
4583 ///
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    string hello = "hell\u00F6";    // "hellö"
    string deseret = "𐐷";           // U+10437, outside the Basic Multilingual Plane

    // As UTF-8 the ö needs two `char` code units ...
    assert(equal(hello.byUTF!char(), ['h', 'e', 'l', 'l', 0xC3, 0xB6]));

    // ... while a single `wchar` (UTF-16 code unit) is enough to hold it.
    assert(equal(hello.byUTF!wchar(), ['h', 'e', 'l', 'l', 'ö']));

    // 𐐷 takes four code units in UTF-8, two (a surrogate pair) in UTF-16
    // and exactly one in UTF-32.
    assert(equal(deseret.byUTF!char(), [0xF0, 0x90, 0x90, 0xB7]));
    assert(equal(deseret.byUTF!wchar(), [0xD801, 0xDC37]));
    assert(equal(deseret.byUTF!dchar(), [0x00010437]));
}
4599
4600 ///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.exception : assertThrown;

    // 0xF0 announces a four-byte UTF-8 sequence, but no valid continuation follows.
    string malformed = "hello\xF0betty";

    // With UseReplacementDchar.yes the invalid sequence decodes to U+FFFD ...
    auto lenient = malformed.byChar.byUTF!(dchar, UseReplacementDchar.yes);
    assert(equal(lenient, "hello\uFFFDetty"));

    // ... with UseReplacementDchar.no iterating over it throws instead.
    assertThrown!UTFException(
        equal(malformed.byChar.byUTF!(dchar, UseReplacementDchar.no), "hello betty"));
}
4609
// Exercises the bidirectional interface (back / popBack / save) of byUTF
// for every target encoding.
@safe unittest
{
    import std.algorithm.comparison : equal;

    {
        // UTF-16 -> UTF-8: U+0219 is encoded as 0xC8 0x99
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!char;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x99);
        r.popBack;
        assert(r.back == 0xc8);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // UTF-16 -> UTF-16
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!wchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // UTF-16 -> UTF-32
        wchar[] s = ['a', 'b', 0x219];
        auto r = s.byUTF!dchar;
        static assert(isBidirectionalRange!(typeof(r)));
        assert(r.back == 0x219);
        r.popBack;
        assert(r.back == 'b');
    }

    {
        // UTF-32 -> UTF-16: surrogate pairs come out low surrogate first
        dchar[] s = ['𐐷', '😁'];
        auto r = s.byUTF!wchar;
        assert(r.back == 0xde01);
        r.popBack;
        assert(r.back == 0xd83d);
        r.popBack;
        assert(r.back == 0xdc37);
        r.popBack;
        assert(r.back == 0xd801);
    }

    {
        // UTF-32 -> UTF-8, drained completely from the back
        dchar[] s = ['𐐷', '😁'];
        auto r = s.byUTF!char;
        char[] res;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        assert(res.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0]));
    }

    {
        // pure ASCII, drained completely from the back
        dchar[] res;
        auto r = ['a', 'b', 'c', 'd', 'e'].byUTF!dchar;
        while (!r.empty)
        {
            res ~= r.back;
            r.popBack;
        }
        assert(res.equal(['e', 'd', 'c', 'b', 'a']));
    }

    {
        //testing the save() function
        wchar[] s = ['Ă','ț'];

        auto rc = s.byUTF!char;
        rc.popBack;
        auto rcCopy = rc.save;
        assert(rc.back == rcCopy.back);
        assert(rcCopy.back == 0xc8);

        auto rd = s.byUTF!dchar;
        rd.popBack;
        auto rdCopy = rd.save;
        assert(rd.back == rdCopy.back);
        assert(rdCopy.back == 'Ă');
    }
}
4697
4698 ///
@safe pure nothrow unittest
{
    import std.range.primitives;
    wchar[] s = ['ă', 'î'];

    // UTF-8: both letters take two code units each, and walking from the
    // back yields them in reverse order ('î' is C3 AE, 'ă' is C4 83).
    auto utf8 = s.byUTF!char;
    static assert(isBidirectionalRange!(typeof(utf8)));
    assert(utf8.back == 0xae);
    utf8.popBack;
    assert(utf8.back == 0xc3);
    utf8.popBack;
    assert(utf8.back == 0x83);
    utf8.popBack;
    assert(utf8.back == 0xc4);

    // UTF-16: one code unit per letter.
    auto utf16 = s.byUTF!wchar;
    static assert(isBidirectionalRange!(typeof(utf16)));
    assert(utf16.back == 'î');
    utf16.popBack;
    assert(utf16.back == 'ă');

    // UTF-32: one code unit per letter.
    auto utf32 = s.byUTF!dchar;
    static assert(isBidirectionalRange!(typeof(utf32)));
    assert(utf32.back == 'î');
    utf32.popBack;
    assert(utf32.back == 'ă');
}