libphobos/src/std/regex/package.d

   1 /++
   2   $(LINK2 https://en.wikipedia.org/wiki/Regular_expression, Regular expressions)
   3   are a commonly used method of pattern matching
   4   on strings, with $(I regex) being a catchy word for a pattern in this domain
   5   specific language. Typical problems usually solved by regular expressions
   6   include validation of user input and the ubiquitous find $(AMP) replace
   7   in text processing utilities.
   8
   9 $(SCRIPT inhibitQuickIndex = 1;)
  10 $(DIVC quickindex,
  11 $(BOOKTABLE,
  12 $(TR $(TH Category) $(TH Functions))
  13 $(TR $(TD Matching) $(TD
  14         $(LREF bmatch)
  15         $(LREF match)
  16         $(LREF matchAll)
  17         $(LREF matchFirst)
  18 ))
  19 $(TR $(TD Building) $(TD
  20         $(LREF ctRegex)
  21         $(LREF escaper)
  22         $(LREF regex)
  23 ))
  24 $(TR $(TD Replace) $(TD
  25         $(LREF replace)
  26         $(LREF replaceAll)
  27         $(LREF replaceAllInto)
  28         $(LREF replaceFirst)
  29         $(LREF replaceFirstInto)
  30 ))
  31 $(TR $(TD Split) $(TD
  32         $(LREF split)
  33         $(LREF splitter)
  34 ))
  35 $(TR $(TD Objects) $(TD
  36         $(LREF Captures)
  37         $(LREF Regex)
  38         $(LREF RegexException)
  39         $(LREF RegexMatch)
  40         $(LREF Splitter)
  41         $(LREF StaticRegex)
  42 ))
  43 ))
  44
  45   $(SECTION Synopsis)
  46
  47   Create a regex at runtime:
  48   $(RUNNABLE_EXAMPLE
  49   $(RUNNABLE_EXAMPLE_STDIN
  50 They met on 24/01/1970.
  51 7/8/99 wasn't as hot as 7/8/2022.
  52 )
  53       ---
  54       import std.regex;
  55       import std.stdio;
  56       // Print out all possible dd/mm/yy(yy) dates found in user input.
  57       auto r = regex(r"\b[0-9][0-9]?/[0-9][0-9]?/[0-9][0-9](?:[0-9][0-9])?\b");
  58       foreach (line; stdin.byLine)
  59       {
  60         // matchAll() returns a range that can be iterated
  61         // to get all subsequent matches.
  62         foreach (c; matchAll(line, r))
  63             writeln(c.hit);
  64       }
  65       ---
  66   )
  67   Create a static regex at compile-time, which contains fast native code:
  68   $(RUNNABLE_EXAMPLE
  69   ---
  70   import std.regex;
  71   auto ctr = ctRegex!(`^.*/([^/]+)/?$`);
  72
  73   // It works just like a normal regex:
  74   auto c2 = matchFirst("foo/bar", ctr);   // First match found here, if any
  75   assert(!c2.empty);   // Be sure to check if there is a match before examining contents!
  76   assert(c2[1] == "bar");   // Captures is a range of submatches: 0 = full match.
  77   ---
  78   )
  79   Multi-pattern regex:
  80   $(RUNNABLE_EXAMPLE
  81   ---
  82   import std.regex;
  83   auto multi = regex([`\d+,\d+`, `([a-z]+):(\d+)`]);
  84   auto m = "abc:43 12,34".matchAll(multi);
  85   assert(m.front.whichPattern == 2);
  86   assert(m.front[1] == "abc");
  87   assert(m.front[2] == "43");
  88   m.popFront();
  89   assert(m.front.whichPattern == 1);
  90   assert(m.front[0] == "12,34");
  91   ---
  92   )
  93   $(LREF Captures) and `opCast!bool`:
  94   $(RUNNABLE_EXAMPLE
  95   ---
  96   import std.regex;
  97   // The result of `matchAll/matchFirst` is directly testable with `if/assert/while`,
  98   // e.g. test if a string consists of letters only:
  99   assert(matchFirst("LettersOnly", `^\p{L}+$`));
 100
 101   // And we can take advantage of the ability to define a variable in the IfCondition:
 102   if (const captures = matchFirst("At l34st one digit, but maybe more...", `((\d)(\d*))`))
 103   {
 104       assert(captures[2] == "3");
 105       assert(captures[3] == "4");
 106       assert(captures[1] == "34");
 107   }
 108   ---
 109   )
 110   See_Also: $(LINK2 https://dlang.org/spec/statement.html#IfCondition, `IfCondition`).
 111
 112   $(SECTION Syntax and general information)
 113   The general usage guideline is to keep regex complexity on the side of simplicity,
 114   as its capabilities reside in purely character-level manipulation.
 115   As such it's ill-suited for tasks involving higher level invariants
 116   like matching an integer number $(U bounded) in an [a,b] interval.
  117   Checks of this sort are better addressed by additional post-processing.
 118
 119   The basic syntax shouldn't surprise experienced users of regular expressions.
 120   For an introduction to `std.regex` see a
 121   $(HTTP dlang.org/regular-expression.html, short tour) of the module API
 122   and its abilities.
 123
 124   There are other web resources on regular expressions to help newcomers,
 125   and a good $(HTTP www.regular-expressions.info, reference with tutorial)
 126   can easily be found.
 127
 128   This library uses a remarkably common ECMAScript syntax flavor
 129   with the following extensions:
 130   $(UL
 131     $(LI Named subexpressions, with Python syntax. )
  132     $(LI Unicode properties such as Scripts, Blocks and common binary properties e.g. Alphabetic, White_Space, Hex_Digit etc.)
  133     $(LI Arbitrary length and complexity lookbehind, including lookahead in lookbehind and vice versa.)
 134   )
 135
 136   $(REG_START Pattern syntax )
 137   $(I std.regex operates on codepoint level,
 138     'character' in this table denotes a single Unicode codepoint.)
 139   $(REG_TABLE
 140     $(REG_TITLE Pattern element, Semantics )
 141     $(REG_TITLE Atoms, Match single characters )
 142     $(REG_ROW any character except [{|*+?()^$, Matches the character itself. )
 143     $(REG_ROW ., In single line mode matches any character.
 144       Otherwise it matches any character except '\n' and '\r'. )
 145     $(REG_ROW [class], Matches a single character
 146       that belongs to this character class. )
 147     $(REG_ROW [^class], Matches a single character that
 148       does $(U not) belong to this character class.)
 149     $(REG_ROW \cC, Matches the control character corresponding to letter C)
 150     $(REG_ROW \xXX, Matches a character with hexadecimal value of XX. )
 151     $(REG_ROW \uXXXX, Matches a character  with hexadecimal value of XXXX. )
 152     $(REG_ROW \U00YYYYYY, Matches a character with hexadecimal value of YYYYYY. )
 153     $(REG_ROW \f, Matches a formfeed character. )
 154     $(REG_ROW \n, Matches a linefeed character. )
 155     $(REG_ROW \r, Matches a carriage return character. )
 156     $(REG_ROW \t, Matches a tab character. )
 157     $(REG_ROW \v, Matches a vertical tab character. )
 158     $(REG_ROW \d, Matches any Unicode digit. )
 159     $(REG_ROW \D, Matches any character except Unicode digits. )
 160     $(REG_ROW \w, Matches any word character (note: this includes numbers).)
 161     $(REG_ROW \W, Matches any non-word character.)
 162     $(REG_ROW \s, Matches whitespace, same as \p{White_Space}.)
 163     $(REG_ROW \S, Matches any character except those recognized as $(I \s ). )
 164     $(REG_ROW \\\\, Matches \ character. )
 165     $(REG_ROW \c where c is one of [|*+?(), Matches the character c itself. )
 166     $(REG_ROW \p{PropertyName}, Matches a character that belongs
 167         to the Unicode PropertyName set.
 168       Single letter abbreviations can be used without surrounding {,}. )
 169     $(REG_ROW  \P{PropertyName}, Matches a character that does not belong
 170         to the Unicode PropertyName set.
 171       Single letter abbreviations can be used without surrounding {,}. )
 172     $(REG_ROW \p{InBasicLatin}, Matches any character that is part of
 173           the BasicLatin Unicode $(U block).)
 174     $(REG_ROW \P{InBasicLatin}, Matches any character except ones in
 175           the BasicLatin Unicode $(U block).)
 176     $(REG_ROW \p{Cyrillic}, Matches any character that is part of
 177         Cyrillic $(U script).)
 178     $(REG_ROW \P{Cyrillic}, Matches any character except ones in
 179         Cyrillic $(U script).)
 180     $(REG_TITLE Quantifiers, Specify repetition of other elements)
 181     $(REG_ROW *, Matches previous character/subexpression 0 or more times.
 182       Greedy version - tries as many times as possible.)
 183     $(REG_ROW *?, Matches previous character/subexpression 0 or more times.
 184       Lazy version  - stops as early as possible.)
 185     $(REG_ROW +, Matches previous character/subexpression 1 or more times.
 186       Greedy version - tries as many times as possible.)
 187     $(REG_ROW +?, Matches previous character/subexpression 1 or more times.
 188       Lazy version  - stops as early as possible.)
 189     $(REG_ROW ?, Matches previous character/subexpression 0 or 1 time.
 190       Greedy version - tries as many times as possible.)
 191     $(REG_ROW ??, Matches previous character/subexpression 0 or 1 time.
 192       Lazy version  - stops as early as possible.)
 193     $(REG_ROW {n}, Matches previous character/subexpression exactly n times. )
 194     $(REG_ROW {n$(COMMA)}, Matches previous character/subexpression n times or more.
 195       Greedy version - tries as many times as possible. )
 196     $(REG_ROW {n$(COMMA)}?, Matches previous character/subexpression n times or more.
 197       Lazy version - stops as early as possible.)
 198     $(REG_ROW {n$(COMMA)m}, Matches previous character/subexpression n to m times.
 199       Greedy version - tries as many times as possible, but no more than m times. )
 200     $(REG_ROW {n$(COMMA)m}?, Matches previous character/subexpression n to m times.
  201       Lazy version - stops as early as possible, but no less than n times.)
 202     $(REG_TITLE Other, Subexpressions $(AMP) alternations )
 203     $(REG_ROW (regex),  Matches subexpression regex,
 204       saving matched portion of text for later retrieval. )
 205     $(REG_ROW (?#comment), An inline comment that is ignored while matching.)
 206     $(REG_ROW (?:regex), Matches subexpression regex,
 207       $(U not) saving matched portion of text. Useful to speed up matching. )
 208     $(REG_ROW A|B, Matches subexpression A, or failing that, matches B. )
 209     $(REG_ROW (?P$(LT)name$(GT)regex), Matches named subexpression
 210         regex labeling it with name 'name'.
 211         When referring to a matched portion of text,
 212         names work like aliases in addition to direct numbers.
 213      )
 214     $(REG_TITLE Assertions, Match position rather than character )
 215     $(REG_ROW ^, Matches at the beginning of input or line (in multiline mode).)
 216     $(REG_ROW $, Matches at the end of input or line (in multiline mode). )
 217     $(REG_ROW \b, Matches at word boundary. )
 218     $(REG_ROW \B, Matches when $(U not) at word boundary. )
 219     $(REG_ROW (?=regex), Zero-width lookahead assertion.
 220         Matches at a point where the subexpression
 221         regex could be matched starting from the current position.
 222       )
 223     $(REG_ROW (?!regex), Zero-width negative lookahead assertion.
 224         Matches at a point where the subexpression
 225         regex could $(U not) be matched starting from the current position.
 226       )
 227     $(REG_ROW (?<=regex), Zero-width lookbehind assertion. Matches at a point
 228         where the subexpression regex could be matched ending
 229         at the current position (matching goes backwards).
 230       )
 231     $(REG_ROW  (?<!regex), Zero-width negative lookbehind assertion.
 232       Matches at a point where the subexpression regex could $(U not)
 233       be matched ending at the current position (matching goes backwards).
 234      )
 235   )
 236
 237   $(REG_START Character classes )
 238   $(REG_TABLE
 239     $(REG_TITLE Pattern element, Semantics )
 240     $(REG_ROW Any atom, Has the same meaning as outside of a character class,
 241       except for ] which must be written as \\])
 242     $(REG_ROW a-z, Includes characters a, b, c, ..., z. )
 243     $(REG_ROW [a||b]$(COMMA) [a--b]$(COMMA) [a~~b]$(COMMA) [a$(AMP)$(AMP)b],
 244      Where a, b are arbitrary classes, means union, set difference,
 245      symmetric set difference, and intersection respectively.
 246      $(I Any sequence of character class elements implicitly forms a union.) )
 247   )
 248
 249   $(REG_START Regex flags )
 250   $(REG_TABLE
 251     $(REG_TITLE Flag, Semantics )
 252     $(REG_ROW g, Global regex, repeat over the whole input. )
 253     $(REG_ROW i, Case insensitive matching. )
 254     $(REG_ROW m, Multi-line mode, match ^, $ on start and end line separators
 255        as well as start and end of input.)
 256     $(REG_ROW s, Single-line mode, makes . match '\n' and '\r' as well. )
 257     $(REG_ROW x, Free-form syntax, ignores whitespace in pattern,
 258       useful for formatting complex regular expressions. )
 259   )
 260
 261   $(SECTION Unicode support)
 262
 263   This library provides full Level 1 support* according to
 264     $(HTTP unicode.org/reports/tr18/, UTS 18). Specifically:
 265   $(UL
 266     $(LI 1.1 Hex notation via any of \uxxxx, \U00YYYYYY, \xZZ.)
 267     $(LI 1.2 Unicode properties.)
 268     $(LI 1.3 Character classes with set operations.)
 269     $(LI 1.4 Word boundaries use the full set of "word" characters.)
 270     $(LI 1.5 Using simple casefolding to match case
 271         insensitively across the full range of codepoints.)
 272     $(LI 1.6 Respecting line breaks as any of
 273         \u000A | \u000B | \u000C | \u000D | \u0085 | \u2028 | \u2029 | \u000D\u000A.)
 274     $(LI 1.7 Operating on codepoint level.)
 275   )
 276   *With exception of point 1.1.1, as of yet, normalization of input
 277     is expected to be enforced by user.
 278
 279     $(SECTION Replace format string)
 280
 281     A set of functions in this module that do the substitution rely
 282     on a simple format to guide the process. In particular the table below
 283     applies to the `format` argument of
 284     $(LREF replaceFirst) and $(LREF replaceAll).
 285
 286     The format string can reference parts of match using the following notation.
 287     $(REG_TABLE
 288         $(REG_TITLE Format specifier, Replaced by )
 289         $(REG_ROW $(DOLLAR)$(AMP), the whole match. )
 290         $(REG_ROW $(DOLLAR)$(BACKTICK), part of input $(I preceding) the match. )
 291         $(REG_ROW $', part of input $(I following) the match. )
 292         $(REG_ROW $$, '$' character. )
 293         $(REG_ROW \c $(COMMA) where c is any character, the character c itself. )
 294         $(REG_ROW \\\\, '\\' character. )
 295         $(REG_ROW $(DOLLAR)1 .. $(DOLLAR)99, submatch number 1 to 99 respectively. )
 296     )
 297
 298   $(SECTION Slicing and zero memory allocations orientation)
 299
 300   All matches returned by pattern matching functionality in this library
 301     are slices of the original input. The notable exception is the `replace`
 302     family of functions  that generate a new string from the input.
 303
 304     In cases where producing the replacement is the ultimate goal
 305     $(LREF replaceFirstInto) and $(LREF replaceAllInto) could come in handy
 306     as functions that  avoid allocations even for replacement.
 307
 308     Copyright: Copyright Dmitry Olshansky, 2011-
 309
 310   License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).
 311
 312   Authors: Dmitry Olshansky,
 313
 314     API and utility constructs are modeled after the original `std.regex`
 315   by Walter Bright and Andrei Alexandrescu.
 316
 317   Source: $(PHOBOSSRC std/regex/package.d)
 318
 319 Macros:
 320     REG_ROW = $(TR $(TD $(I $1 )) $(TD $+) )
 321     REG_TITLE = $(TR $(TD $(B $1)) $(TD $(B $2)) )
 322     REG_TABLE = <table border="1" cellspacing="0" cellpadding="5" > $0 </table>
 323     REG_START = <h3><div align="center"> $0 </div></h3>
 324     SECTION = <h3><a id="$1" href="#$1" class="anchor">$0</a></h3>
 325     S_LINK = <a href="#$1">$+</a>
 326  +/
 327 module std.regex;
 328
 329 import std.range.primitives, std.traits;
 330 import std.regex.internal.ir;
 331 import std.typecons : Flag, Yes, No;
 332
  333 /++
  334     `Regex` object holds regular expression pattern in compiled form.
  335
  336     Instances of this object are constructed via calls to `regex`.
  337     This is an intended form for caching and storage of frequently
  338     used regular expressions.
  339
  340     Example:
  341
  342     Test if this object doesn't contain any compiled pattern.
  343     ---
  344     Regex!char r;
  345     assert(r.empty);
  346     r = regex(""); // Note: "" is a valid regex pattern.
  347     assert(!r.empty);
  348     ---
  349
  350     Getting a range of all the named captures in the regex.
  351     ----
  352     import std.range;
  353     import std.algorithm;
  354
  355     auto re = regex(`(?P<name>\w+) = (?P<var>\d+)`);
  356     auto nc = re.namedCaptures;
  357     static assert(isRandomAccessRange!(typeof(nc)));
  358     assert(!nc.empty);
  359     assert(nc.length == 2);
  360     assert(nc.equal(["name", "var"]));
  361     assert(nc[0] == "name");
  362     assert(nc[1..$].equal(["var"]));
  363     ----
  364 +/
      // Public name for the implementation template declared in std.regex.internal.ir.
  365 public alias Regex(Char) = std.regex.internal.ir.Regex!(Char);
 366
  367 /++
  368     A `StaticRegex` is `Regex` object that contains D code specially
  369     generated at compile-time to speed up matching.
  370
  371     No longer used, kept as alias to Regex for backwards compatibility.
  372 +/
      // Plain alias: `StaticRegex!Char` and `Regex!Char` name the same type.
  373 public alias StaticRegex = Regex;
 374
  375 /++
  376     Compile regular expression pattern for the later execution.
  377     Returns: `Regex` object that works on inputs having
  378     the same character width as `pattern`.
  379
  380     Params:
  381     pattern = A single regular expression to match.
  382     patterns = An array of regular expression strings.
  383         The resulting `Regex` object will match any expression;
  384         use $(LREF whichPattern) to know which.
  385     flags = The _attributes (g, i, m, s and x accepted)
  386
  387     Throws: `RegexException` if there were any errors during compilation.
  388 +/
  389 @trusted public auto regex(S : C[], C)(const S[] patterns, const(char)[] flags="")
  390 if (isSomeString!(S))
  391 {
  392     import std.array : appender;
  393     import std.functional : memoize;
  394     enum cacheSize = 8; //TODO: invent nice interface to control regex caching
  395     const(C)[] pat;
  396     if (patterns.length > 1)
  397     {
              // Several patterns: join them into a single alternation of the form
              //   (?:p0\T0)\T0|(?:p1\T1)\T1|...
              // where Ti is the character privateUseStart+i. privateUseStart is not
              // defined in the visible part of this file (presumably it comes from
              // std.regex.internal.ir).
  398         auto app = appender!S();
  399         foreach (i, p; patterns)
  400         {
  401             if (i != 0)
  402                 app.put("|");
  403             app.put("(?:");
  404             app.put(patterns[i]);
  405             // terminator for the pattern
  406             // to detect if the pattern unexpectedly ends
  407             app.put("\\");
  408             app.put(cast(dchar)(privateUseStart+i));
  409             app.put(")");
  410             // another one to return correct whichPattern
  411             // for all of potential alternatives in the patterns[i]
  412             app.put("\\");
  413             app.put(cast(dchar)(privateUseStart+i));
  414         }
  415         pat = app.data;
  416     }
  417     else
          // A single pattern is used verbatim, without any terminator.
          // NOTE(review): an empty `patterns` array also takes this branch and indexes
          // element 0 - confirm whether callers can ever pass an empty array.
  418         pat = patterns[0];
  419
          // During CTFE the implementation is called directly; at run time the call
          // goes through std.functional.memoize with cacheSize as its size argument.
  420     if (__ctfe)
  421         return regexImpl(pat, flags);
  422     return memoize!(regexImpl!S, cacheSize)(pat, flags);
  423 }
 424
  425 ///ditto
  426 @trusted public auto regex(S)(S pattern, const(char)[] flags="")
  427 if (isSomeString!(S))
  428 {
          // Wrap the single pattern in a one-element array and defer to the array overload.
  429     return regex([pattern], flags);
  430 }
 431
  432 ///
  433 @system unittest
  434 {
  435     void test(S)()
  436     {
  437         // multi-pattern regex example
  438         S[] arr = [`([a-z]+):(\d+)`, `(\d+),\d+`];
  439         auto multi = regex(arr); // multi regex
  440         S str = "abc:43 12,34";
  441         auto m = str.matchAll(multi);
              // whichPattern is 1-based: "abc:43" is matched by arr[0]
  442         assert(m.front.whichPattern == 1);
  443         assert(m.front[1] == "abc");
  444         assert(m.front[2] == "43");
  445         m.popFront();
              // the next match, "12,34", comes from arr[1]
  446         assert(m.front.whichPattern == 2);
  447         assert(m.front[1] == "12");
  448     }
  449
          // Run the example for every string width and every qualifier of the element type.
  450     import std.meta : AliasSeq;
  451     static foreach (C; AliasSeq!(string, wstring, dstring))
  452         // Test with const array of patterns - see https://issues.dlang.org/show_bug.cgi?id=20301
  453         static foreach (S; AliasSeq!(C, const C, immutable C))
  454             test!S();
  455 }
 456
  457 @system unittest
  458 {
          // Checks the string conversion of a compiled regex: its length is bounded
          // and it contains both the pattern text and the flag character.
  459     import std.conv : to;
  460     import std.string : indexOf;
  461
  462     immutable pattern = "s+";
  463     auto regexString = to!string(regex(pattern, "U"));
  464     assert(regexString.length <= pattern.length + 100, "String representation shouldn't be unreasonably bloated.");
  465     assert(indexOf(regexString, "s+") >= 0, "String representation should include pattern.");
  466     assert(indexOf(regexString, 'U') >= 0, "String representation should include flags.");
  467 }
 468
  469 public auto regexImpl(S)(const S pattern, const(char)[] flags="")
  470 if (isSomeString!(typeof(pattern)))
  471 {
          // Uncached compilation step: construct a Parser over `pattern` and `flags`
          // and return the program it produced. The `regex` overloads call this
          // either directly (CTFE) or through memoize.
  472     import std.regex.internal.parser : Parser, CodeGen;
          // Unqual strips the top-level `const` that the parameter declaration adds.
  473     auto parser = Parser!(Unqual!(typeof(pattern)), CodeGen)(pattern, flags);
  474     auto r = parser.program;
  475     return r;
  476 }
 477
 478
  479 private struct CTRegexWrapper(Char)
  480 {
          // Handle holding a pointer to an immutable Regex; `alias getRe this` lets the
          // wrapper be used wherever a Regex!Char is expected.
  481     private immutable(Regex!Char)* re;
  482
  483     // allow code that expects mutable Regex to still work
  484     // we stay "logically const"
          // The cast removes `immutable` from the pointee before it is dereferenced,
          // which is why the accessor is marked @trusted.
  485     @property @trusted ref getRe() const { return *cast(Regex!Char*) re; }
  486     alias getRe this;
  487 }
 488
  489 template ctRegexImpl(alias pattern, string flags="")
  490 {
  491     import std.regex.internal.backtracking, std.regex.internal.parser;
          // The pattern is compiled in a static initializer, i.e. at compile time.
  492     static immutable r = cast(immutable) regex(pattern, flags);
  493     alias Char = BasicElementOf!(typeof(pattern));
          // D source text generated from the compiled regex; it becomes the body of `func`.
  494     enum source = ctGenRegExCode(r);
  495     @trusted pure bool func(BacktrackingMatcher!Char matcher)
  496     {
              // Build with -debug=std_regex_ctr to print the generated code during compilation.
  497         debug(std_regex_ctr) pragma(msg, source);
              // presumably here so `matcher` counts as used even if `source` never names it
  498         cast(void) matcher;
  499         mixin(source);
  500     }
          // Same regex, with a factory attached that is parameterised on `func`.
  501     static immutable staticRe =
  502         cast(immutable) r.withFactory(new CtfeFactory!(BacktrackingMatcher, Char, func));
          // Value handed out by ctRegex: a wrapper pointing at the static regex above.
  503     enum wrapper = CTRegexWrapper!Char(&staticRe);
  504 }
 505
  506 @safe pure unittest
  507 {
  508     // test compat for logical const workaround
  509     static void test(StaticRegex!char)
  510     {
  511     }
  512     enum re = ctRegex!``;
          // Passing a ctRegex value where a StaticRegex!char is expected must compile.
  513     test(re);
  514 }
 515
  516 @safe pure unittest
  517 {
          // A variable initialised from one ctRegex can be matched against and then
          // reassigned to a different ctRegex.
  518     auto re = ctRegex!`foo`;
  519     assert(matchFirst("foo", re));
  520
  521     // test reassignment
  522     re = ctRegex!`bar`;
  523     assert(matchFirst("bar", re));
  524     assert(!matchFirst("bar", ctRegex!`foo`));
  525 }
 526
  527 /++
  528     Compile regular expression using CTFE
  529     and generate optimized native machine code for matching it.
  530
  531     Returns: StaticRegex object for faster matching.
  532
  533     Params:
  534     pattern = Regular expression
  535     flags = The _attributes (g, i, m, s and x accepted)
  536 +/
      // Evaluates to the CTRegexWrapper value that ctRegexImpl builds for this pattern/flags pair.
  537 public enum ctRegex(alias pattern, string flags="") = ctRegexImpl!(pattern, flags).wrapper;
 538
  539 enum isRegexFor(RegEx, R) = is(immutable RegEx == immutable Regex!(BasicElementOf!R))
      // true when RegEx is Regex!Char for R's element type (qualifiers ignored), ...
  540      || is(RegEx : const(Regex!(BasicElementOf!R)))
      // ... or converts implicitly to const(Regex!Char), ...
  541      || is(immutable RegEx == immutable StaticRegex!(BasicElementOf!R));
      // ... or is StaticRegex!Char (StaticRegex is an alias of Regex in this file).
 542
 543
  544 /++
  545     `Captures` object contains submatches captured during a call
  546     to `match` or iteration over `RegexMatch` range.
  547
  548     First element of range is the whole match.
  549 +/
  550 @trusted public struct Captures(R)
  551 if (isSomeString!R)
  552 {//@trusted because of union inside
  553     alias DataIndex = size_t;
  554     alias String = R;
  555     alias Store = SmallFixedArray!(Group!DataIndex, 3);
  556 private:
  557     import std.conv : text;
          // One Group (begin/end pair) per capture group; index 0 is the whole match.
  558     Store matches;
          // Name table consulted by the string-indexed opIndex.
  559     const(NamedGroup)[] _names;
          // The searched input; every returned submatch is a slice of it.
  560     R _input;
          // 0 when nothing matched; otherwise the value reported by whichPattern.
  561     int _nMatch;
          // Range window into `matches`: _f is the current front index, _b is one past the back.
  562     uint _f, _b;
  563
          // Sets up storage for n groups; the range window starts as [0, n).
  564     this(R input, uint n, const(NamedGroup)[] named)
  565     {
  566         _input = input;
  567         _names = named;
  568         matches = Store(n);
  569         _b = n;
  570         _f = 0;
  571     }
  572
          // Same set-up, taking input, name table and group count from a RegexMatch.
  573     this(ref RegexMatch!R rmatch)
  574     {
  575         _input = rmatch._input;
  576         _names = rmatch._engine.pattern.dict;
  577         immutable n = rmatch._engine.pattern.ngroup;
  578         matches = Store(n);
  579         _b = n;
  580         _f = 0;
  581     }
  582
          // Slice of the input for group `index`, or null when that Group tests false.
  583     inout(R) getMatch(size_t index) inout
  584     {
  585         auto m = &matches[index];
  586         return *m ? _input[m.begin .. m.end] : null;
  587     }
  588
  589 public:
  590     ///Slice of input prior to the match.
  591     @property R pre()
  592     {
              // Without a match the whole input is returned.
  593         return _nMatch == 0 ? _input[] : _input[0 .. matches[0].begin];
  594     }
  595
  596     ///Slice of input immediately after the match.
  597     @property R post()
  598     {
              // Without a match the whole input is returned.
  599         return _nMatch == 0 ? _input[] : _input[matches[0].end .. $];
  600     }
  601
  602     ///Slice of matched portion of input.
  603     @property R hit()
  604     {
  605         assert(_nMatch, "attempted to get hit of an empty match");
  606         return _input[matches[0].begin .. matches[0].end];
  607     }
  608
  609     ///Range interface.
  610     @property R front()
  611     {
  612         assert(_nMatch, "attempted to get front of an empty match");
  613         return getMatch(_f);
  614     }
  615
  616     ///ditto
  617     @property R back()
  618     {
  619         assert(_nMatch, "attempted to get back of an empty match");
  620         return getMatch(_b - 1);
  621     }
  622
  623     ///ditto
  624     void popFront()
  625     {
  626         assert(!empty);
  627         ++_f;
  628     }
  629
  630     ///ditto
  631     void popBack()
  632     {
  633         assert(!empty);
  634         --_b;
  635     }
  636
  637     ///ditto
  638     @property bool empty() const { return _nMatch == 0 || _f >= _b; }
  639
  640     ///ditto
  641     inout(R) opIndex()(size_t i) inout
  642     {
              // The index is relative to the current front, so it shifts after popFront.
  643         assert(_f + i < _b,text("requested submatch number ", i," is out of range"));
  644         return getMatch(_f + i);
  645     }
  646
  647     /++
  648         Explicit cast to bool.
  649         Useful as a shorthand for !(x.empty) in if and assert statements.
  650
  651         ---
  652         import std.regex;
  653
  654         assert(!matchFirst("nothing", "something"));
  655         ---
  656     +/
  657
  658     @safe bool opCast(T:bool)() const nothrow { return _nMatch != 0; }
  659
  660     /++
  661         Number of the pattern that matched, counting from 1 for the first pattern.
  662         Returns 0 on no match.
  663     +/
  664
  665     @safe @property int whichPattern() const nothrow { return _nMatch; }
  666
  667     ///
  668     @system unittest
  669     {
  670         import std.regex;
  671         assert(matchFirst("abc", "[0-9]+", "[a-z]+").whichPattern == 2);
  672     }
  673
  674     /++
  675         Lookup named submatch.
  676
  677         ---
  678         import std.regex;
  679         import std.range;
  680
  681         auto c = matchFirst("a = 42;", regex(`(?P<var>\w+)\s*=\s*(?P<value>\d+);`));
  682         assert(c["var"] == "a");
  683         assert(c["value"] == "42");
  684         popFrontN(c, 2);
  685         //named groups are unaffected by range primitives
  686         assert(c["var"] =="a");
  687         assert(c.front == "42");
  688         ----
  689     +/
  690     R opIndex(String)(String i) /*const*/ //@@@BUG@@@
  691     if (isSomeString!String)
  692     {
              // The looked-up index goes to getMatch unshifted (no _f offset), unlike
              // the numeric opIndex.
  693         size_t index = lookupNamedGroup(_names, i);
  694         return getMatch(index);
  695     }
  696
  697     ///Number of matches in this object.
  698     @property size_t length() const { return _nMatch == 0 ? 0 : _b - _f;  }
  699
  700     ///A hook for compatibility with original std.regex.
  701     @property ref captures(){ return this; }
  702 }
 703
  704 ///
  705 @system unittest
  706 {
  707     import std.range.primitives : popFrontN;
  708
  709     auto c = matchFirst("@abc#", regex(`(\w)(\w)(\w)`));
  710     assert(c.pre == "@"); // Part of input preceding match
  711     assert(c.post == "#"); // Immediately after match
  712     assert(c.hit == c[0] && c.hit == "abc"); // The whole match
  713     assert(c[2] == "b");
  714     assert(c.front == "abc");
          // Dropping the whole-match element leaves the capture groups in the range.
  715     c.popFront();
  716     assert(c.front == "a");
  717     assert(c.back == "c");
  718     c.popBack();
  719     assert(c.back == "b");
  720     popFrontN(c, 2);
  721     assert(c.empty);
  722
  723     assert(!matchFirst("nothing", "something"));
  724
  725     // Captures that are not matched will be null.
  726     c = matchFirst("ac", regex(`a(b)?c`));
  727     assert(c);
  728     assert(!c[1]);
  729 }
 730
  731 @system unittest
  732 {
          // A Captures variable can be assigned inside an expression and the result
          // of the assignment cast to bool.
  733     Captures!string c;
  734     string s = "abc";
  735     assert(cast(bool)(c = matchFirst(s, regex("d")))
  736         || cast(bool)(c = matchFirst(s, regex("a"))));
  737 }
 738
  739 // https://issues.dlang.org/show_bug.cgi?id=19979
  740 @system unittest
  741 {
          // Groups that matched the empty string are non-null with zero length;
          // the optional group that did not take part in the match is null.
  742     auto c = matchFirst("bad", regex(`(^)(not )?bad($)`));
  743     assert(c[0] && c[0].length == "bad".length);
  744     assert(c[1] && !c[1].length);
  745     assert(!c[2]);
  746     assert(c[3] && !c[3].length);
  747 }
 748
  749 /++
  750     A regex engine state, as returned by `match` family of functions.
  751
  752     Effectively it's a forward range of Captures!R, produced
  753     by lazily searching for matches in a given input.
  754 +/
  755 @trusted public struct RegexMatch(R)
  756 if (isSomeString!R)
  757 {
  758     import std.typecons : Rebindable;
  759 private:
  760     alias Char = BasicElementOf!R;
          // Matcher doing the search; reference-counted through _factory (see postblit/destructor).
  761     Matcher!Char _engine;
          // Factory that created _engine; also used to incRef/decRef/dup it.
  762     Rebindable!(const MatcherFactory!Char) _factory;
  763     R _input;
          // Result of the most recent search; this is what `front` returns.
  764     Captures!R _captures;
  765
          // Picks the regex's factory (or the default one), creates the engine and
          // immediately runs the first search so that `front` is ready.
  766     this(RegEx)(R input, RegEx prog)
  767     {
              // NOTE(review): `enforce` is imported but not used in this constructor.
  768         import std.exception : enforce;
  769         _input = input;
  770         if (prog.factory is null) _factory = defaultFactory!Char(prog);
  771         else _factory = prog.factory;
  772         _engine = _factory.create(prog, input);
  773         assert(_engine.refCount == 1);
  774         _captures = Captures!R(this);
              // The engine fills the group storage; its return value is stored as _nMatch.
  775         _captures.matches.mutate((slice) pure { _captures._nMatch = _engine.match(slice); });
  776     }
  777
  778 public:
          // Postblit: a copy shares the engine, so its reference count is incremented.
  779     this(this)
  780     {
  781         if (_engine) _factory.incRef(_engine);
  782     }
  783
          // Destructor: releases this object's reference to the engine.
  784     ~this()
  785     {
  786         if (_engine) _factory.decRef(_engine);
  787     }
  788
  789     ///Shorthands for front.pre, front.post, front.hit.
  790     @property R pre()
  791     {
  792         return _captures.pre;
  793     }
  794
  795     ///ditto
  796     @property R post()
  797     {
  798         return _captures.post;
  799     }
  800
  801     ///ditto
  802     @property R hit()
  803     {
  804         return _captures.hit;
  805     }
  806
  807     /++
  808         Functionality for processing subsequent matches of global regexes via range interface:
  809         ---
  810         import std.regex;
  811         auto m = matchAll("Hello, world!", regex(`\w+`));
  812         assert(m.front.hit == "Hello");
  813         m.popFront();
  814         assert(m.front.hit == "world");
  815         m.popFront();
  816         assert(m.empty);
  817         ---
  818     +/
  819     @property inout(Captures!R) front() inout
  820     {
  821         return _captures;
  822     }
  823
  824     ///ditto
  825     void popFront()
  826     {
              // NOTE(review): `enforce` is imported but not used in this function.
  827         import std.exception : enforce;
  828         // CoW - if refCount is not 1, we are aliased by somebody else
  829         if (_engine.refCount != 1)
  830         {
  831             // we create a new engine & abandon this reference
  832             auto old = _engine;
  833             _engine = _factory.dup(old, _input);
  834             _factory.decRef(old);
  835         }
              // Search for the next match; a stored result of 0 makes the range empty.
  836         _captures.matches.mutate((slice) { _captures._nMatch = _engine.match(slice); });
  837     }
  838
  839     ///ditto
  840     auto save(){ return this; }
  841
  842     ///Test if this match object is empty.
  843     @property bool empty() const { return _captures._nMatch == 0; }
  844
  845     ///Same as !(x.empty), provided for its convenience  in conditional statements.
  846     T opCast(T:bool)(){ return !empty; }
  847
  848     /// Same as .front, provided for compatibility with original std.regex.
  849     @property inout(Captures!R) captures() inout { return _captures; }
  850 }
 851
// Runs a single match of `prog` against `input` and returns the resulting
// Captures!R. The engine used for the previous call is kept in function-level
// static variables together with a key built from the regex pattern and flags;
// when the next call carries the same key the cached engine is re-armed with
// the new input instead of a fresh engine being created.
private auto matchOnceImpl(RegEx, R)(R input, const auto ref RegEx prog) @trusted
{
    alias Char = BasicElementOf!R;
    // identifies the regex the cached engine was created for
    static struct Key
    {
        immutable(Char)[] pattern;
        uint flags;
    }
    // NOTE(review): the initial key uses flags == -1, presumably a value no
    // real regex carries so the first call always misses - confirm
    static Key cacheKey = Key("", -1);
    static Matcher!Char cache;
    // use the factory attached to the regex, or the default one if it has none
    auto factory = prog.factory is null ? defaultFactory!Char(prog) : prog.factory;
    auto key = Key(prog.pattern, prog.flags);
    Matcher!Char engine;
    if (cacheKey == key)
    {
        // cache hit: reuse the engine, pointing it at the new input
        engine = cache;
        engine.rearm(input);
    }
    else
    {
        // cache miss: build a new engine and make it the cached one
        engine = factory.create(prog, input);
        if (cache) factory.decRef(cache); // destroy cached engine *after* building a new one
        cache = engine;
        cacheKey = key;
    }
    auto captures = Captures!R(input, prog.ngroup, prog.dict);
    // run the engine once and store the match result in the captures
    captures.matches.mutate((slice) pure { captures._nMatch = engine.match(slice); });
    return captures;
}
 881
 882 // matchOnce is constructed as a safe, pure wrapper over matchOnceImpl. It can be
 883 // faked as pure because the static mutable variables are used to cache the key and
 884 // character matcher. The technique used avoids delegates and GC.
 885 private @safe auto matchOnce(RegEx, R)(R input, const auto ref RegEx prog) pure
 886 {
 887     static auto impl(R input, const ref RegEx prog)
 888     {
 889         return matchOnceImpl(input, prog);
 890     }
 891
 892     static @trusted auto pureImpl(R input, const ref RegEx prog)
 893     {
 894         auto p = assumePureFunction(&impl);
 895         return p(input, prog);
 896     }
 897
 898     return pureImpl(input, prog);
 899 }
 900
 901 private auto matchMany(RegEx, R)(R input, auto ref RegEx re) @safe
 902 {
 903     return RegexMatch!R(input, re.withFlags(re.flags | RegexOption.global));
 904 }
 905
 906 @system unittest
 907 {
 908     //sanity checks for new API
 909     auto re = regex("abc");
 910     assert(!"abc".matchOnce(re).empty);
 911     assert("abc".matchOnce(re)[0] == "abc");
 912 }
 913
// https://issues.dlang.org/show_bug.cgi?id=18135
// Regression test: a struct that wraps a default-initialized RegexMatch must
// be assignable and comparable.
@system unittest
{
    static struct MapResult { RegexMatch!string m; }
    MapResult m;
    // assignment goes through the generated opAssign of the wrapper
    m = MapResult();
    // comparison of the wrapper with itself must compile and hold
    assert(m == m);
}
 922
// True when `fun` can be called with a single Captures!R argument. The
// replace helpers use it to tell functors that take only the captures apart
// from those that are called with the captures and a sink.
private enum isReplaceFunctor(alias fun, R) =
    __traits(compiles, (Captures!R c) { fun(c); });
 925
 926 // the lowest level - just stuff replacements into the sink
 927 private @trusted void replaceCapturesInto(alias output, Sink, R, T)
 928         (ref Sink sink, R input, T captures)
 929 if (isOutputRange!(Sink, dchar) && isSomeString!R)
 930 {
 931     if (captures.empty)
 932     {
 933         sink.put(input);
 934         return;
 935     }
 936     sink.put(captures.pre);
 937     // a hack to get around bogus errors, should be simply output(captures, sink)
 938     // "is a nested function and cannot be accessed from"
 939     static if (isReplaceFunctor!(output, R))
 940         sink.put(output(captures)); //"mutator" type of function
 941     else
 942         output(captures, sink); //"output" type of function
 943     sink.put(captures.post);
 944 }
 945
 946 // ditto for a range of captures
 947 private void replaceMatchesInto(alias output, Sink, R, T)
 948         (ref Sink sink, R input, T matches)
 949 if (isOutputRange!(Sink, dchar) && isSomeString!R)
 950 {
 951     size_t offset = 0;
 952     foreach (cap; matches)
 953     {
 954         sink.put(cap.pre[offset .. $]);
 955         // same hack, see replaceCapturesInto
 956         static if (isReplaceFunctor!(output, R))
 957             sink.put(output(cap)); //"mutator" type of function
 958         else
 959             output(cap, sink); //"output" type of function
 960         offset = cap.pre.length + cap.hit.length;
 961     }
 962     sink.put(input[offset .. $]);
 963 }
 964
 965 //  a general skeleton of replaceFirst
 966 private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re)
 967 if (isSomeString!R && isRegexFor!(RegEx, R))
 968 {
 969     import std.array : appender;
 970     auto data = matchFirst(input, re);
 971     if (data.empty)
 972         return input;
 973     auto app = appender!(R)();
 974     replaceCapturesInto!output(app, input, data);
 975     return app.data;
 976 }
 977
 978 // ditto for replaceAll
 979 // the method parameter allows old API to ride on the back of the new one
 980 private R replaceAllWith(alias output,
 981         alias method=matchAll, R, RegEx)(R input, RegEx re)
 982 if (isSomeString!R && isRegexFor!(RegEx, R))
 983 {
 984     import std.array : appender;
 985     auto matches = method(input, re); //inout(C)[] fails
 986     if (matches.empty)
 987         return input;
 988     auto app = appender!(R)();
 989     replaceMatchesInto!output(app, input, matches);
 990     return app.data;
 991 }
 992
 993
 994 /++
 995     Start matching `input` to regex pattern `re`,
 996     using Thompson NFA matching scheme.
 997
 998     The use of this function is $(RED discouraged) - use either of
 999     $(LREF matchAll) or $(LREF matchFirst).
1000
1001     Delegating  the kind of operation
1002     to "g" flag is soon to be phased out along with the
1003     ability to choose the exact matching scheme. The choice of
1004     matching scheme to use depends highly on the pattern kind and
1005     can done automatically on case by case basis.
1006
1007     Returns: a `RegexMatch` object holding engine state after first match.
1008 +/
1009
1010 public auto match(R, RegEx)(R input, RegEx re)
1011 if (isSomeString!R && isRegexFor!(RegEx,R))
1012 {
1013     return RegexMatch!(Unqual!(typeof(input)))(input, re);
1014 }
1015
///ditto
public auto match(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    // the pattern string is compiled with regex() first
    alias Input = Unqual!(typeof(input));
    return RegexMatch!Input(input, regex(re));
}
1022
/++
    Find the first (leftmost) slice of the `input` that
    matches the pattern `re`. This function picks the most suitable
    regular expression engine depending on the pattern properties.

    The `re` parameter can be one of three types:
    $(UL
      $(LI Plain string(s), in which case it's compiled to bytecode before matching. )
      $(LI Regex!char (wchar/dchar) that contains a pattern in the form of
        compiled bytecode. )
      $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of
        compiled native machine code. )
    )

    Returns:
    $(LREF Captures) containing the extent of a match together with all submatches
    if there was a match, otherwise an empty $(LREF Captures) object.
+/
public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    auto firstMatch = matchOnce(input, re);
    return firstMatch;
}
1046
///ditto
public auto matchFirst(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    // compile the pattern string, then match once
    auto compiled = regex(re);
    return matchOnce(input, compiled);
}
1053
///ditto
public auto matchFirst(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
    // compile the list of pattern strings into one regex, then match once
    auto compiled = regex(re);
    return matchOnce(input, compiled);
}
1060
/++
    Initiate a search for all non-overlapping matches to the pattern `re`
    in the given `input`. The result is a lazy range of matches generated
    as they are encountered in the input going left to right.

    This function picks the most suitable regular expression engine
    depending on the pattern properties.

    The `re` parameter can be one of three types:
    $(UL
      $(LI Plain string(s), in which case it's compiled to bytecode before matching. )
      $(LI Regex!char (wchar/dchar) that contains a pattern in the form of
        compiled bytecode. )
      $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of
        compiled native machine code. )
    )

    Returns:
    $(LREF RegexMatch) object that represents matcher state
    after the first match was found or an empty one if not present.
+/
public auto matchAll(R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // matchMany turns on the global option and builds the match range
    return matchMany(input, re);
}
1087
///ditto
public auto matchAll(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    // compile the pattern string first
    auto compiled = regex(re);
    return matchMany(input, compiled);
}
1094
///ditto
public auto matchAll(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
    // compile the list of pattern strings into one regex first
    auto compiled = regex(re);
    return matchMany(input, compiled);
}
1101
// another set of tests just to cover the new API
// (matchFirst / matchAll with string patterns, multi-pattern regex and ctRegex,
// repeated for three string types)
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : to;

    static foreach (String; AliasSeq!(string, wstring, const(dchar)[]))
    {{
        // pattern given as a plain string
        auto str1 = "blah-bleh".to!String();
        auto pat1 = "bl[ae]h".to!String();
        auto mf = matchFirst(str1, pat1);
        assert(mf.equal(["blah".to!String()]));
        auto mAll = matchAll(str1, pat1);
        assert(mAll.equal!((a,b) => a.equal(b))
            ([["blah".to!String()], ["bleh".to!String()]]));

        // regex built from several patterns, with capture groups
        auto str2 = "1/03/12 - 3/03/12".to!String();
        auto pat2 = regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]);
        auto mf2 = matchFirst(str2, pat2);
        assert(mf2.equal(["1/03/12", "1", "03", "12"].map!(to!String)()));
        auto mAll2 = matchAll(str2, pat2);
        assert(mAll2.front.equal(mf2));
        mAll2.popFront();
        assert(mAll2.front.equal(["3/03/12", "3", "03", "12"].map!(to!String)()));
        // Captures is itself a range: dropping three entries leaves the last group
        mf2.popFrontN(3);
        assert(mf2.equal(["12".to!String()]));

        // compile-time regex with named groups
        auto ctPat = ctRegex!(`(?P<Quot>\d+)/(?P<Denom>\d+)`.to!String());
        auto str = "2 + 34/56 - 6/1".to!String();
        auto cmf = matchFirst(str, ctPat);
        assert(cmf.equal(["34/56", "34", "56"].map!(to!String)()));
        assert(cmf["Quot"] == "34".to!String());
        assert(cmf["Denom"] == "56".to!String());

        auto cmAll = matchAll(str, ctPat);
        assert(cmAll.front.equal(cmf));
        cmAll.popFront();
        assert(cmAll.front.equal(["6/1", "6", "1"].map!(to!String)()));
    }}
}
1143
/++
    Start matching of `input` to regex pattern `re`,
    using traditional $(LINK2 https://en.wikipedia.org/wiki/Backtracking,
    backtracking) matching scheme.

    The use of this function is $(RED discouraged) - use either of
    $(LREF matchAll) or $(LREF matchFirst).

    Delegating the kind of operation
    to the "g" flag is soon to be phased out along with the
    ability to choose the exact matching scheme. The choice of
    matching scheme to use depends highly on the pattern kind and
    can be done automatically on a case by case basis.

    Returns: a `RegexMatch` object holding engine
    state after first match.

+/
public auto bmatch(R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // qualifiers are stripped from the input type before instantiating RegexMatch
    alias Input = Unqual!(typeof(input));
    return RegexMatch!Input(input, re);
}
1167
///ditto
public auto bmatch(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    // the pattern string is compiled with regex() first
    alias Input = Unqual!(typeof(input));
    return RegexMatch!Input(input, regex(re));
}
1174
// produces replacement string from format using captures for substitution
//
// `format` is consumed left to right by a two-state machine:
//   Normal - copy text to `sink` up to the next '$'
//   Dollar - interpret what follows a '$':
//            digits   -> submatch with that number
//            {name}   -> named submatch (name made of ASCII letters)
//            &        -> the whole match (captures[0])
//            `        -> captures.pre
//            '        -> captures.post
//            $        -> a literal '$'
// An out-of-range submatch number throws unless `ignoreBadSubs` is set, in
// which case nothing is written for it. A format that ends right after a '$'
// fails the final enforce.
package void replaceFmt(R, Capt, OutR)
    (R format, Capt captures, OutR sink, bool ignoreBadSubs = false)
if (isOutputRange!(OutR, ElementEncodingType!R[]) &&
    isOutputRange!(OutR, ElementEncodingType!(Capt.String)[]))
{
    import std.algorithm.searching : find;
    import std.ascii : isDigit, isAlpha;
    import std.conv : text, parse;
    import std.exception : enforce;
    enum State { Normal, Dollar }
    auto state = State.Normal;
    size_t offset;
L_Replace_Loop:
    while (!format.empty)
        final switch (state)
        {
        case State.Normal:
            for (offset = 0; offset < format.length; offset++)//no decoding
            {
                if (format[offset] == '$')
                {
                    // flush the literal text seen so far and skip the '$' itself
                    state = State.Dollar;
                    sink.put(format[0 .. offset]);
                    format = format[offset+1 .. $];//ditto
                    continue L_Replace_Loop;
                }
            }
            // no '$' left: the rest of the format is literal text
            sink.put(format[0 .. offset]);
            format = format[offset .. $];
            break;
        case State.Dollar:
            if (isDigit(format[0]))
            {
                // parse consumes the digits from the front of format
                uint digit = parse!uint(format);
                enforce(ignoreBadSubs || digit < captures.length, text("invalid submatch number ", digit));
                if (digit < captures.length)
                    sink.put(captures[digit]);
            }
            else if (format[0] == '{')
            {
                // x starts at the first non-letter after '{', which must be '}'
                auto x = find!(a => !isAlpha(a))(format[1..$]);
                enforce(!x.empty && x[0] == '}', "no matching '}' in replacement format");
                auto name = format[1 .. $ - x.length];
                format = x[1..$];
                enforce(!name.empty, "invalid name in ${...} replacement format");
                sink.put(captures[name]);
            }
            else if (format[0] == '&')
            {
                sink.put(captures[0]);
                format = format[1 .. $];
            }
            else if (format[0] == '`')
            {
                sink.put(captures.pre);
                format = format[1 .. $];
            }
            else if (format[0] == '\'')
            {
                sink.put(captures.post);
                format = format[1 .. $];
            }
            else if (format[0] == '$')
            {
                sink.put(format[0 .. 1]);
                format = format[1 .. $];
            }
            // NOTE(review): any other character after '$' falls through here, so
            // the '$' is dropped and the character is then copied as literal text
            state = State.Normal;
            break;
        }
    // still in Dollar state means the format ended with a dangling '$'
    enforce(state == State.Normal, "invalid format string in regex replace");
}
1248
/++
    Build a new string from `input` in which the first match is replaced by
    a string generated from it according to the `format` specifier.

    To replace all matches use $(LREF replaceAll).

    Params:
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from,
    see $(S_LINK Replace _format string, the _format string).

    Returns:
    A string of the same type with the first match (if any) replaced.
    If no match is found returns the input string itself.
+/
public R replaceFirst(R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R))
{
    // the replacement text is produced by expanding `format` into the sink
    return replaceFirstWith!((caps, dest) => replaceFmt(format, caps, dest))(input, re);
}
1270
///
@system unittest
{
    auto bracketed = replaceFirst("noon", regex("n"), "[$&]");
    assert(bracketed == "[n]oon");
}
1276
/++
    This is a general replacement tool that constructs a new string by replacing
    a match of pattern `re` in the `input`. Unlike the other overload
    there is no format string; instead captures are passed
    to a user-defined functor `fun` that returns a new string
    to use as replacement.

    This version replaces the first match in `input`,
    see $(LREF replaceAll) to replace all of the matches.

    Returns:
    A new string of the same type as `input` with the first match
    replaced by the return value of `fun`. If no match is found
    returns the `input` itself.
+/
public R replaceFirst(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // adapt the user functor to the (captures, sink) form used internally
    return replaceFirstWith!((caps, dest) => dest.put(fun(caps)))(input, re);
}
1297
///
@system unittest
{
    import std.conv : to;
    string list = "#21 out of 46";
    // increment the first number found in the string
    auto numbers = regex(`[0-9]+`);
    string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1))(list, numbers);
    assert(newList == "#22 out of 46");
}
1307
/++
    A variation on $(LREF replaceFirst) that, instead of allocating a new string
    on each call, outputs the result piece-wise to the `sink`. In particular
    this enables efficient construction of a final output incrementally.

    As with the $(LREF replaceFirst) family of functions there is an overload
    for the substitution guided by the `format` string
    and one with a user-defined callback.
+/
public @trusted void replaceFirstInto(Sink, R, C, RegEx)
        (ref Sink sink, R input, RegEx re, const(C)[] format)
if (isOutputRange!(Sink, dchar) && isSomeString!R
    && is(C : dchar) && isRegexFor!(RegEx, R))
{
    replaceCapturesInto!((caps, dest) => replaceFmt(format, caps, dest))
        (sink, input, matchFirst(input, re));
}
1325
///ditto
public @trusted void replaceFirstInto(alias fun, Sink, R, RegEx)
    (auto ref Sink sink, R input, RegEx re)
if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R))
{
    // `auto ref` binds lvalue sinks by reference, like the format-string
    // overload does, so output written to a value-type sink is visible to
    // the caller; rvalue sinks are still accepted as before.
    replaceCapturesInto!fun(sink, input, matchFirst(input, re));
}
1333
///
@system unittest
{
    import std.array;
    string m1 = "first message\n";
    string m2 = "second message\n";
    auto wordThenMessage = regex(`([a-z]+) message`);
    auto result = appender!string();
    // substitution driven by a format string
    replaceFirstInto(result, m1, wordThenMessage, "$1");
    //equivalent of the above with user-defined callback
    replaceFirstInto!(cap=>cap[1])(result, m2, wordThenMessage);
    assert(result.data == "first\nsecond\n");
}
1346
//examples for replaceFirst
// (runs the functor overload of replaceFirst and both overloads of replaceFirstInto)
@system unittest
{
    import std.conv;
    string list = "#21 out of 46";
    // increment the first number found in the string
    string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1))
        (list, regex(`[0-9]+`));
    assert(newList == "#22 out of 46");
    import std.array;
    string m1 = "first message\n";
    string m2 = "second message\n";
    auto result = appender!string();
    // both calls append to the same sink
    replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1");
    //equivalent of the above with user-defined callback
    replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`));
    assert(result.data == "first\nsecond\n");
}
1364
/++
    Build a new string from `input` by replacing all of the
    fragments that match a pattern `re` with a string generated
    from the match according to the `format` specifier.

    To replace only the first match use $(LREF replaceFirst).

    Params:
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from,
    see $(S_LINK Replace _format string, the _format string).

    Returns:
    A string of the same type as `input` with all
    of the matches (if any) replaced.
    If no match is found returns the input string itself.
+/
public @trusted R replaceAll(R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R))
{
    // each replacement is produced by expanding `format` into the sink
    return replaceAllWith!((caps, dest) => replaceFmt(format, caps, dest))(input, re);
}
1388
///
@system unittest
{
    // insert comma as thousands delimiter
    auto thousands = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g");
    auto formatted = replaceAll("12000 + 42100 = 54100", thousands, ",");
    assert(formatted == "12,000 + 42,100 = 54,100");
}
1396
/++
    This is a general replacement tool that constructs a new string by replacing
    matches of pattern `re` in the `input`. Unlike the other overload
    there is no format string; instead captures are passed
    to a user-defined functor `fun` that returns a new string
    to use as replacement.

    This version replaces all of the matches found in `input`,
    see $(LREF replaceFirst) to replace the first match only.

    Returns:
    A new string of the same type as `input` with all matches
    replaced by return values of `fun`. If no matches are found
    returns the `input` itself.

    Params:
    input = string to search
    re = compiled regular expression
    fun = delegate to use
+/
public @trusted R replaceAll(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // adapt the user functor to the (captures, sink) form used internally
    return replaceAllWith!((caps, dest) => dest.put(fun(caps)))(input, re);
}
1422
///
@system unittest
{
    string upperHit(Captures!(string) m)
    {
        import std.string : toUpper;
        return toUpper(m.hit);
    }
    // Capitalize the letters 'a' and 'r':
    auto letters = regex("[ar]");
    auto s = replaceAll!(upperHit)("Strap a rocket engine on a chicken.", letters);
    assert(s == "StRAp A Rocket engine on A chicken.");
}
1436
/++
    A variation on $(LREF replaceAll) that, instead of allocating a new string
    on each call, outputs the result piece-wise to the `sink`. In particular
    this enables efficient construction of a final output incrementally.

    As with $(LREF replaceAll) there are 2 overloads - one with a format string,
    the other one with a user-defined functor.
+/
public @trusted void replaceAllInto(Sink, R, C, RegEx)
        (Sink sink, R input, RegEx re, const(C)[] format)
if (isOutputRange!(Sink, dchar) && isSomeString!R
    && is(C : dchar) && isRegexFor!(RegEx, R))
{
    replaceMatchesInto!((caps, dest) => replaceFmt(format, caps, dest))
        (sink, input, matchAll(input, re));
}
1453
///ditto
public @trusted void replaceAllInto(alias fun, Sink, R, RegEx)
        (Sink sink, R input, RegEx re)
if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R))
{
    // `fun` is handed to the low-level helper unchanged
    replaceMatchesInto!fun
        (sink, input, matchAll(input, re));
}
1461
///
@system unittest
{
    // insert comma as thousands delimiter in fifty randomly produced big numbers
    import std.array, std.conv, std.random, std.range;
    static re = regex(`(?<=\d)(?=(\d\d\d)+\b)`, "g");
    auto sink = appender!(char [])();
    enum ulong min = 10UL ^^ 10, max = 10UL ^^ 19;
    foreach (i; 0 .. 50)
    {
        // reuse the same sink for every number
        sink.clear();
        replaceAllInto(sink, text(uniform(min, max)), re, ",");
        // walking back from the end, every fourth character must be a comma
        foreach (pos; iota(sink.data.length - 4, 0, -4))
            assert(sink.data[pos] == ',');
    }
}
1478
// exercise all of the replace APIs
// (format-string, functor and sink-based variants, for six string types)
@system unittest
{
    import std.array : appender;
    import std.conv;
    // try and check first/all simple substitution
    static foreach (S; AliasSeq!(string, wstring, dstring, char[], wchar[], dchar[]))
    {{
        // inputs, then the expected results of replaceFirst (..F) and replaceAll (..A)
        S s1 = "curt trial".to!S();
        S s2 = "round dome".to!S();
        S t1F = "court trial".to!S();
        S t2F = "hound dome".to!S();
        S t1A = "court trial".to!S();
        S t2A = "hound home".to!S();
        auto re1 = regex("curt".to!S());
        auto re2 = regex("[dr]o".to!S());

        // format-string overloads
        assert(replaceFirst(s1, re1, "court") == t1F);
        assert(replaceFirst(s2, re2, "ho") == t2F);
        assert(replaceAll(s1, re1, "court") == t1A);
        assert(replaceAll(s2, re2, "ho") == t2A);

        // functor overloads
        auto rep1 = replaceFirst!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1);
        assert(rep1 == t1F);
        assert(replaceFirst!(cap => "ho".to!S())(s2, re2) == t2F);
        auto rep1A = replaceAll!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1);
        assert(rep1A == t1A);
        assert(replaceAll!(cap => "ho".to!S())(s2, re2) == t2A);

        // sink-based overloads: every call appends to the same sink
        auto sink = appender!S();
        replaceFirstInto(sink, s1, re1, "court");
        assert(sink.data == t1F);
        replaceFirstInto(sink, s2, re2, "ho");
        assert(sink.data == t1F~t2F);
        replaceAllInto(sink, s1, re1, "court");
        assert(sink.data == t1F~t2F~t1A);
        replaceAllInto(sink, s2, re2, "ho");
        assert(sink.data == t1F~t2F~t1A~t2A);
    }}
}
1519
/++
    Old API for replacement, operation depends on flags of pattern `re`.
    With "g" flag it performs the equivalent of $(LREF replaceAll) otherwise it
    works the same as $(LREF replaceFirst).

    The use of this function is $(RED discouraged), please use $(LREF replaceAll)
    or $(LREF replaceFirst) explicitly.

    Params:
    scheme = matching function used to produce the matches, $(LREF match) by default
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from
+/
public R replace(alias scheme = match, R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // Forward the caller-selected scheme. It used to be ignored in favour of a
    // hard-coded `match`; the default is still `match`, so plain calls are unchanged.
    return replaceAllWith!((m, sink) => replaceFmt(format, m, sink), scheme)(input, re);
}
1533
///ditto
public R replace(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // old API: matches come from `match`, replacements from `fun`
    auto replaced = replaceAllWith!(fun, match)(input, re);
    return replaced;
}
1540
/**
Splits a string `r` using a regular expression `pat` as a separator.

Params:
    keepSeparators = flag to specify if the matches should be in the resulting range
    r = the string to split
    pat = the pattern to split on
Returns:
    A lazy range of strings
*/
public struct Splitter(Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, alias RegEx = Regex)
if (isSomeString!Range && isRegexFor!(RegEx, Range))
{
private:
    Range _input;
    // index in _input where the current piece starts; a value past
    // _input.length marks the range as exhausted (see empty)
    size_t _offset;
    alias Rx = typeof(match(Range.init,RegEx.init));
    // range over the separator matches
    Rx _match;

    // with keepSeparators: true while front is the separator itself
    static if (keepSeparators) bool onMatch = false;

    @trusted this(Range input, RegEx separator)
    {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted
        _input = input;
        // the global flag is forced on so that all separators are found
        const re = separator.withFlags(separator.flags | RegexOption.global);
        if (_input.empty)
        {
            //there is nothing to match at all, make _offset > 0
            _offset = 1;
        }
        else
        {
            _match = Rx(_input, re);

            // a separator at the very start: skip the empty leading piece
            static if (keepSeparators)
                if (_match.pre.empty)
                    popFront();
        }
    }

public:
    // slicing without arguments yields a copy of the range
    auto ref opSlice()
    {
        return this.save;
    }

    ///Forward range primitives.
    @property Range front()
    {
        import std.algorithm.comparison : min;

        assert(!empty && _offset <= _match.pre.length
                && _match.pre.length <= _input.length);

        static if (keepSeparators)
        {
            // alternate between the text before a separator and the separator itself
            if (!onMatch)
                return _input[_offset .. min($, _match.pre.length)];
            else
                return _match.hit();
        }
        else
        {
            // text from the current offset up to the start of the next separator
            return _input[_offset .. min($, _match.pre.length)];
        }
    }

    ///ditto
    @property bool empty()
    {
        static if (keepSeparators)
            return _offset >= _input.length;
        else
            return _offset > _input.length;
    }

    ///ditto
    void popFront()
    {
        assert(!empty);
        if (_match.empty)
        {
            //No more separators, work is done here
            _offset = _input.length + 1;
        }
        else
        {
            static if (keepSeparators)
            {
                if (!onMatch)
                {
                    //skip past the separator
                    _offset = _match.pre.length;
                }
                else
                {
                    // the separator has been yielded: step over it and look for the next one
                    _offset += _match.hit.length;
                    _match.popFront();
                }

                onMatch = !onMatch;
            }
            else
            {
                //skip past the separator
                _offset = _match.pre.length + _match.hit.length;
                _match.popFront();
            }
        }
    }

    ///ditto
    @property auto save()
    {
        return this;
    }
}
1658
/// ditto
public Splitter!(keepSeparators, Range, RegEx) splitter(
    Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx)(Range r, RegEx pat)
if (
    is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range))
{
    // construct the lazy range; all the work happens in Splitter
    alias Result = Splitter!(keepSeparators, Range, RegEx);
    return Result(r, pat);
}
1667
///
@system unittest
{
    import std.algorithm.comparison : equal;
    auto s1 = ", abc, de,  fg, hi, ";
    // a comma followed by any number of spaces acts as the separator
    auto pieces = splitter(s1, regex(", *"));
    assert(equal(pieces, ["", "abc", "de", "fg", "hi", ""]));
}
1676
/// Split on a pattern, but keep the matches in the resulting range
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : Yes;

    auto pattern = regex(`([\.,])`);

    // separators show up between the pieces
    auto dateParts = "2003.04.05".splitter!(Yes.keepSeparators)(pattern);
    assert(dateParts.equal(["2003", ".", "04", ".", "05"]));

    // a leading separator is yielded first
    auto listParts = ",1,2,3".splitter!(Yes.keepSeparators)(pattern);
    assert(listParts.equal([",", "1", ",", "2", ",", "3"]));
}
1693
///An eager version of `splitter` that creates an array with the split slices of `input`.
public @trusted String[] split(String, RegEx)(String input, RegEx rx)
if (isSomeString!String  && isRegexFor!(RegEx, String))
{
    import std.array : appender;
    auto pieces = appender!(String[])();
    // drain the lazy splitter into the array builder
    auto lazyPieces = splitter(input, rx);
    for (; !lazyPieces.empty; lazyPieces.popFront())
        pieces.put(lazyPieces.front);
    return pieces.data;
}
1704
///Exception object thrown in case of errors during regex compilation.
// Public alias of the type declared in std.regex.internal.ir.
public alias RegexException = std.regex.internal.ir.RegexException;
1707
/++
  A range that lazily produces a string output escaped
  to be used inside of a regular expression.
+/
auto escaper(Range)(Range r)
{
    import std.algorithm.searching : find;
    static immutable escapables = [Escapables];
    static struct Escaper // template to deduce attributes
    {
        Range r;
        // true when a backslash still has to be emitted before r.front
        bool escaped;

        @property ElementType!Range front()
        {
            if (!escaped)
                return r.front;
            return '\\';
        }

        @property bool empty()
        {
            return r.empty;
        }

        void popFront()
        {
            if (escaped)
            {
                // the backslash was consumed; the escaped character itself comes next
                escaped = false;
                return;
            }
            r.popFront();
            if (!r.empty && !escapables.find(r.front).empty)
                escaped = true;
        }

        @property auto save()
        {
            return Escaper(r.save, escaped);
        }
    }

    // decide up front whether the very first character needs a backslash
    bool startsEscaped = !r.empty && !escapables.find(r.front).empty;
    return Escaper(r, startsEscaped);
}
1746
///
@system unittest
{
    import std.algorithm.comparison;
    import std.regex;
    string s = `This is {unfriendly} to *regex*`;
    // braces and asterisks come out preceded by a backslash
    auto escapedText = s.escaper;
    assert(escapedText.equal(`This is \{unfriendly\} to \*regex\*`));
}
1755
// escaper on a leading escapable character and on empty input,
// for every string width
@system unittest
{
    import std.algorithm.comparison;
    import std.conv;
    static foreach (S; AliasSeq!(string, wstring, dstring))
    {{
      auto s = "^".to!S;
      assert(s.escaper.equal(`\^`));
      // convert the empty input as well; it used to be a plain `string`
      // in every iteration, leaving wstring/dstring untested
      auto s2 = "".to!S;
      assert(s2.escaper.equal(""));
    }}
}
1768
// greedy `a?` takes the 'a', lazy `a??` prefers to match nothing
@system unittest
{
    auto greedy = "ab".matchFirst(regex(`a?b?`));
    assert(greedy.hit == "ab");
    auto lazyHit = "ab".matchFirst(regex(`a??b?`));
    assert(lazyHit.hit == "");
}