gcc/d/dmd/lexer.d

   1 /**
   2  * Implements the lexical analyzer, which converts source code into lexical tokens.
   3  *
   4  * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
   5  *
   6  * Copyright:   Copyright (C) 1999-2022 by The D Language Foundation, All Rights Reserved
   7  * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
   8  * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
   9  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
  10  * Documentation:  https://dlang.org/phobos/dmd_lexer.html
  11  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
  12  */
  13
  14 module dmd.lexer;
  15
  16 import core.stdc.ctype;
  17 import core.stdc.errno;
  18 import core.stdc.stdarg;
  19 import core.stdc.stdio;
  20 import core.stdc.stdlib : getenv;
  21 import core.stdc.string;
  22 import core.stdc.time;
  23
  24 import dmd.entity;
  25 import dmd.errors;
  26 import dmd.globals;
  27 import dmd.id;
  28 import dmd.identifier;
  29 import dmd.root.array;
  30 import dmd.root.ctfloat;
  31 import dmd.common.outbuffer;
  32 import dmd.root.port;
  33 import dmd.root.rmem;
  34 import dmd.root.string;
  35 import dmd.root.utf;
  36 import dmd.tokens;
  37 import dmd.utils;
  38
  39 nothrow:
  40
  41 version (DMDLIB)
  42 {
  43     version = LocOffset;
  44 }
  45
/***********************************************************
 * The lexical analyzer: converts source code into lexical tokens.
 */
  48 class Lexer
  49 {
  50     private __gshared OutBuffer stringbuffer;
  51
  52     Loc scanloc;            // for error messages
  53     Loc prevloc;            // location of token before current
  54
  55     const(char)* p;         // current character
  56
  57     Token token;
  58
  59     // For ImportC
  60     bool Ccompile;              /// true if compiling ImportC
  61
  62     // The following are valid only if (Ccompile == true)
  63     ubyte boolsize;             /// size of a C _Bool, default 1
  64     ubyte shortsize;            /// size of a C short, default 2
  65     ubyte intsize;              /// size of a C int, default 4
  66     ubyte longsize;             /// size of C long, 4 or 8
  67     ubyte long_longsize;        /// size of a C long long, default 8
  68     ubyte long_doublesize;      /// size of C long double, 8 or D real.sizeof
  69     ubyte wchar_tsize;          /// size of C wchar_t, 2 or 4
  70
  71     private
  72     {
  73         const(char)* base;      // pointer to start of buffer
  74         const(char)* end;       // pointer to last element of buffer
  75         const(char)* line;      // start of current line
  76
  77         bool doDocComment;      // collect doc comment information
  78         bool anyToken;          // seen at least one token
  79         bool commentToken;      // comments are TOK.comment's
  80         bool tokenizeNewlines;  // newlines are turned into TOK.endOfLine's
  81
  82         version (DMDLIB)
  83         {
  84             bool whitespaceToken;   // tokenize whitespaces
  85         }
  86
  87         int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
  88         int lastDocLine;        // last line of previous doc comment
  89
  90         Token* tokenFreelist;
  91     }
  92
  93   nothrow:
  94
  95     /*********************
  96      * Creates a Lexer for the source code base[begoffset..endoffset+1].
  97      * The last character, base[endoffset], must be null (0) or EOF (0x1A).
  98      *
  99      * Params:
 100      *  filename = used for error messages
 101      *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
 102      *  begoffset = starting offset into base[]
 103      *  endoffset = the last offset to read into base[]
 104      *  doDocComment = handle documentation comments
 105      *  commentToken = comments become TOK.comment's
 106      */
 107     this(const(char)* filename, const(char)* base, size_t begoffset,
 108         size_t endoffset, bool doDocComment, bool commentToken) pure
 109     {
 110         scanloc = Loc(filename, 1, 1);
 111         // debug printf("Lexer::Lexer(%p)\n", base);
 112         // debug printf("lexer.filename = %s\n", filename);
 113         token = Token.init;
 114         this.base = base;
 115         this.end = base + endoffset;
 116         p = base + begoffset;
 117         line = p;
 118         this.doDocComment = doDocComment;
 119         this.commentToken = commentToken;
 120         this.tokenizeNewlines = false;
 121         this.inTokenStringConstant = 0;
 122         this.lastDocLine = 0;
 123         //initKeywords();
 124         /* If first line starts with '#!', ignore the line
 125          */
 126         if (p && p[0] == '#' && p[1] == '!')
 127         {
 128             p += 2;
 129             while (1)
 130             {
 131                 char c = *p++;
 132                 switch (c)
 133                 {
 134                 case 0:
 135                 case 0x1A:
 136                     p--;
 137                     goto case;
 138                 case '\n':
 139                     break;
 140                 default:
 141                     continue;
 142                 }
 143                 break;
 144             }
 145             endOfLine();
 146         }
 147     }
 148
 149     version (DMDLIB)
 150     {
 151         this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
 152             bool doDocComment, bool commentToken, bool whitespaceToken)
 153         {
 154             this(filename, base, begoffset, endoffset, doDocComment, commentToken);
 155             this.whitespaceToken = whitespaceToken;
 156         }
 157
 158         bool empty() const pure @property @nogc @safe
 159         {
 160             return front() == TOK.endOfFile;
 161         }
 162
 163         TOK front() const pure @property @nogc @safe
 164         {
 165             return token.value;
 166         }
 167
 168         void popFront()
 169         {
 170             nextToken();
 171         }
 172     }
 173
 174     /// Returns: a newly allocated `Token`.
 175     Token* allocateToken() pure nothrow @safe
 176     {
 177         if (tokenFreelist)
 178         {
 179             Token* t = tokenFreelist;
 180             tokenFreelist = t.next;
 181             t.next = null;
 182             return t;
 183         }
 184         return new Token();
 185     }
 186
 187     /// Frees the given token by returning it to the freelist.
 188     private void releaseToken(Token* token) pure nothrow @nogc @safe
 189     {
 190         if (mem.isGCEnabled)
 191             *token = Token.init;
 192         token.next = tokenFreelist;
 193         tokenFreelist = token;
 194     }
 195
 196     final TOK nextToken()
 197     {
 198         prevloc = token.loc;
 199         if (token.next)
 200         {
 201             Token* t = token.next;
 202             memcpy(&token, t, Token.sizeof);
 203             releaseToken(t);
 204         }
 205         else
 206         {
 207             scan(&token);
 208         }
 209         //printf(token.toChars());
 210         return token.value;
 211     }
 212
 213     /***********************
 214      * Look ahead at next token's value.
 215      */
 216     final TOK peekNext()
 217     {
 218         return peek(&token).value;
 219     }
 220
 221     /***********************
 222      * Look 2 tokens ahead at value.
 223      */
 224     final TOK peekNext2()
 225     {
 226         Token* t = peek(&token);
 227         return peek(t).value;
 228     }
 229
 230     /****************************
 231      * Turn next token in buffer into a token.
 232      * Params:
 233      *  t = the token to set the resulting Token to
 234      */
 235     final void scan(Token* t)
 236     {
 237         const lastLine = scanloc.linnum;
 238         Loc startLoc;
 239         t.blockComment = null;
 240         t.lineComment = null;
 241
 242         while (1)
 243         {
 244             t.ptr = p;
 245             //printf("p = %p, *p = '%c'\n",p,*p);
 246             t.loc = loc();
 247             switch (*p)
 248             {
 249             case 0:
 250             case 0x1A:
 251                 t.value = TOK.endOfFile; // end of file
 252                 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
 253                 return;
 254             case ' ':
 255                 // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
 256                 while ((cast(size_t)p) % uint.sizeof)
 257                 {
 258                     if (*p != ' ')
 259                         goto LendSkipFourSpaces;
 260                     p++;
 261                 }
 262                 while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
 263                     p += 4;
 264                 // Skip over any remaining space on the line.
 265                 while (*p == ' ')
 266                     p++;
 267             LendSkipFourSpaces:
 268                 version (DMDLIB)
 269                 {
 270                     if (whitespaceToken)
 271                     {
 272                         t.value = TOK.whitespace;
 273                         return;
 274                     }
 275                 }
 276                 continue; // skip white space
 277             case '\t':
 278             case '\v':
 279             case '\f':
 280                 p++;
 281                 version (DMDLIB)
 282                 {
 283                     if (whitespaceToken)
 284                     {
 285                         t.value = TOK.whitespace;
 286                         return;
 287                     }
 288                 }
 289                 continue; // skip white space
 290             case '\r':
 291                 p++;
 292                 if (*p != '\n') // if CR stands by itself
 293                 {
 294                     endOfLine();
 295                     if (tokenizeNewlines)
 296                     {
 297                         t.value = TOK.endOfLine;
 298                         tokenizeNewlines = false;
 299                         return;
 300                     }
 301                 }
 302                 version (DMDLIB)
 303                 {
 304                     if (whitespaceToken)
 305                     {
 306                         t.value = TOK.whitespace;
 307                         return;
 308                     }
 309                 }
 310                 continue; // skip white space
 311             case '\n':
 312                 p++;
 313                 endOfLine();
 314                 if (tokenizeNewlines)
 315                 {
 316                     t.value = TOK.endOfLine;
 317                     tokenizeNewlines = false;
 318                     return;
 319                 }
 320                 version (DMDLIB)
 321                 {
 322                     if (whitespaceToken)
 323                     {
 324                         t.value = TOK.whitespace;
 325                         return;
 326                     }
 327                 }
 328                 continue; // skip white space
 329             case '0':
 330                 if (!isZeroSecond(p[1]))        // if numeric literal does not continue
 331                 {
 332                     ++p;
 333                     t.unsvalue = 0;
 334                     t.value = TOK.int32Literal;
 335                     return;
 336                 }
 337                 goto Lnumber;
 338
 339             case '1': .. case '9':
 340                 if (!isDigitSecond(p[1]))       // if numeric literal does not continue
 341                 {
 342                     t.unsvalue = *p - '0';
 343                     ++p;
 344                     t.value = TOK.int32Literal;
 345                     return;
 346                 }
 347             Lnumber:
 348                 t.value = number(t);
 349                 return;
 350
 351             case '\'':
 352                 if (issinglechar(p[1]) && p[2] == '\'')
 353                 {
 354                     t.unsvalue = p[1];        // simple one character literal
 355                     t.value = TOK.charLiteral;
 356                     p += 3;
 357                 }
 358                 else if (Ccompile)
 359                 {
 360                     clexerCharConstant(*t, 0);
 361                 }
 362                 else
 363                 {
 364                     t.value = charConstant(t);
 365                 }
 366                 return;
 367
 368             case 'u':
 369             case 'U':
 370             case 'L':
 371                 if (!Ccompile)
 372                     goto case_ident;
 373                 if (p[1] == '\'')       // C wide character constant
 374                 {
 375                     char c = *p;
 376                     if (c == 'L')       // convert L to u or U
 377                         c = (wchar_tsize == 4) ? 'u' : 'U';
 378                     ++p;
 379                     clexerCharConstant(*t, c);
 380                     return;
 381                 }
 382                 else if (p[1] == '\"')  // C wide string literal
 383                 {
 384                     const c = *p;
 385                     ++p;
 386                     escapeStringConstant(t);
 387                     t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
 388                                 c == 'u' ? 'w' :
 389                                 'd';
 390                     return;
 391                 }
 392                 else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
 393                 {
 394                     p += 2;
 395                     escapeStringConstant(t);
 396                     return;
 397                 }
 398                 goto case_ident;
 399
 400             case 'r':
 401                 if (Ccompile || p[1] != '"')
 402                     goto case_ident;
 403                 p++;
 404                 goto case '`';
 405             case '`':
 406                 if (Ccompile)
 407                     goto default;
 408                 wysiwygStringConstant(t);
 409                 return;
 410             case 'q':
 411                 if (Ccompile)
 412                     goto case_ident;
 413                 if (p[1] == '"')
 414                 {
 415                     p++;
 416                     delimitedStringConstant(t);
 417                     return;
 418                 }
 419                 else if (p[1] == '{')
 420                 {
 421                     p++;
 422                     tokenStringConstant(t);
 423                     return;
 424                 }
 425                 else
 426                     goto case_ident;
 427             case '"':
 428                 escapeStringConstant(t);
 429                 return;
 430             case 'a':
 431             case 'b':
 432             case 'c':
 433             case 'd':
 434             case 'e':
 435             case 'f':
 436             case 'g':
 437             case 'h':
 438             case 'i':
 439             case 'j':
 440             case 'k':
 441             case 'l':
 442             case 'm':
 443             case 'n':
 444             case 'o':
 445             case 'p':
 446                 /*case 'q': case 'r':*/
 447             case 's':
 448             case 't':
 449             //case 'u':
 450             case 'v':
 451             case 'w':
 452             case 'x':
 453             case 'y':
 454             case 'z':
 455             case 'A':
 456             case 'B':
 457             case 'C':
 458             case 'D':
 459             case 'E':
 460             case 'F':
 461             case 'G':
 462             case 'H':
 463             case 'I':
 464             case 'J':
 465             case 'K':
 466             //case 'L':
 467             case 'M':
 468             case 'N':
 469             case 'O':
 470             case 'P':
 471             case 'Q':
 472             case 'R':
 473             case 'S':
 474             case 'T':
 475             //case 'U':
 476             case 'V':
 477             case 'W':
 478             case 'X':
 479             case 'Y':
 480             case 'Z':
 481             case '_':
 482             case_ident:
 483                 {
 484                     while (1)
 485                     {
 486                         const c = *++p;
 487                         if (isidchar(c))
 488                             continue;
 489                         else if (c & 0x80)
 490                         {
 491                             const s = p;
 492                             const u = decodeUTF();
 493                             if (isUniAlpha(u))
 494                                 continue;
 495                             error("char 0x%04x not allowed in identifier", u);
 496                             p = s;
 497                         }
 498                         break;
 499                     }
 500                     Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
 501                     t.ident = id;
 502                     t.value = cast(TOK)id.getValue();
 503
 504                     anyToken = 1;
 505
 506                     /* Different keywords for C and D
 507                      */
 508                     if (Ccompile)
 509                     {
 510                         if (t.value != TOK.identifier)
 511                         {
 512                             t.value = Ckeywords[t.value];  // filter out D keywords
 513                         }
 514                     }
 515                     else if (t.value >= FirstCKeyword)
 516                         t.value = TOK.identifier;       // filter out C keywords
 517
 518                     else if (*t.ptr == '_') // if special identifier token
 519                     {
 520                         // Lazy initialization
 521                         TimeStampInfo.initialize(t.loc);
 522
 523                         if (id == Id.DATE)
 524                         {
 525                             t.ustring = TimeStampInfo.date.ptr;
 526                             goto Lstr;
 527                         }
 528                         else if (id == Id.TIME)
 529                         {
 530                             t.ustring = TimeStampInfo.time.ptr;
 531                             goto Lstr;
 532                         }
 533                         else if (id == Id.VENDOR)
 534                         {
 535                             t.ustring = global.vendor.xarraydup.ptr;
 536                             goto Lstr;
 537                         }
 538                         else if (id == Id.TIMESTAMP)
 539                         {
 540                             t.ustring = TimeStampInfo.timestamp.ptr;
 541                         Lstr:
 542                             t.value = TOK.string_;
 543                             t.postfix = 0;
 544                             t.len = cast(uint)strlen(t.ustring);
 545                         }
 546                         else if (id == Id.VERSIONX)
 547                         {
 548                             t.value = TOK.int64Literal;
 549                             t.unsvalue = global.versionNumber();
 550                         }
 551                         else if (id == Id.EOFX)
 552                         {
 553                             t.value = TOK.endOfFile;
 554                             // Advance scanner to end of file
 555                             while (!(*p == 0 || *p == 0x1A))
 556                                 p++;
 557                         }
 558                     }
 559                     //printf("t.value = %d\n",t.value);
 560                     return;
 561                 }
 562             case '/':
 563                 p++;
 564                 switch (*p)
 565                 {
 566                 case '=':
 567                     p++;
 568                     t.value = TOK.divAssign;
 569                     return;
 570                 case '*':
 571                     p++;
 572                     startLoc = loc();
 573                     while (1)
 574                     {
 575                         while (1)
 576                         {
 577                             const c = *p;
 578                             switch (c)
 579                             {
 580                             case '/':
 581                                 break;
 582                             case '\n':
 583                                 endOfLine();
 584                                 p++;
 585                                 continue;
 586                             case '\r':
 587                                 p++;
 588                                 if (*p != '\n')
 589                                     endOfLine();
 590                                 continue;
 591                             case 0:
 592                             case 0x1A:
 593                                 error("unterminated /* */ comment");
 594                                 p = end;
 595                                 t.loc = loc();
 596                                 t.value = TOK.endOfFile;
 597                                 return;
 598                             default:
 599                                 if (c & 0x80)
 600                                 {
 601                                     const u = decodeUTF();
 602                                     if (u == PS || u == LS)
 603                                         endOfLine();
 604                                 }
 605                                 p++;
 606                                 continue;
 607                             }
 608                             break;
 609                         }
 610                         p++;
 611                         if (p[-2] == '*' && p - 3 != t.ptr)
 612                             break;
 613                     }
 614                     if (commentToken)
 615                     {
 616                         t.loc = startLoc;
 617                         t.value = TOK.comment;
 618                         return;
 619                     }
 620                     else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
 621                     {
 622                         // if /** but not /**/
 623                         getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
 624                         lastDocLine = scanloc.linnum;
 625                     }
 626                     continue;
 627                 case '/': // do // style comments
 628                     startLoc = loc();
 629                     while (1)
 630                     {
 631                         const c = *++p;
 632                         switch (c)
 633                         {
 634                         case '\n':
 635                             break;
 636                         case '\r':
 637                             if (p[1] == '\n')
 638                                 p++;
 639                             break;
 640                         case 0:
 641                         case 0x1A:
 642                             if (commentToken)
 643                             {
 644                                 p = end;
 645                                 t.loc = startLoc;
 646                                 t.value = TOK.comment;
 647                                 return;
 648                             }
 649                             if (doDocComment && t.ptr[2] == '/')
 650                             {
 651                                 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
 652                                 lastDocLine = scanloc.linnum;
 653                             }
 654                             p = end;
 655                             t.loc = loc();
 656                             t.value = TOK.endOfFile;
 657                             return;
 658                         default:
 659                             if (c & 0x80)
 660                             {
 661                                 const u = decodeUTF();
 662                                 if (u == PS || u == LS)
 663                                     break;
 664                             }
 665                             continue;
 666                         }
 667                         break;
 668                     }
 669                     if (commentToken)
 670                     {
 671                         version (DMDLIB) {}
 672                         else
 673                         {
 674                             p++;
 675                             endOfLine();
 676                         }
 677                         t.loc = startLoc;
 678                         t.value = TOK.comment;
 679                         return;
 680                     }
 681                     if (doDocComment && t.ptr[2] == '/')
 682                     {
 683                         getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
 684                         lastDocLine = scanloc.linnum;
 685                     }
 686                     p++;
 687                     endOfLine();
 688                     continue;
 689                 case '+':
 690                     if (!Ccompile)
 691                     {
 692                         int nest;
 693                         startLoc = loc();
 694                         p++;
 695                         nest = 1;
 696                         while (1)
 697                         {
 698                             char c = *p;
 699                             switch (c)
 700                             {
 701                             case '/':
 702                                 p++;
 703                                 if (*p == '+')
 704                                 {
 705                                     p++;
 706                                     nest++;
 707                                 }
 708                                 continue;
 709                             case '+':
 710                                 p++;
 711                                 if (*p == '/')
 712                                 {
 713                                     p++;
 714                                     if (--nest == 0)
 715                                         break;
 716                                 }
 717                                 continue;
 718                             case '\r':
 719                                 p++;
 720                                 if (*p != '\n')
 721                                     endOfLine();
 722                                 continue;
 723                             case '\n':
 724                                 endOfLine();
 725                                 p++;
 726                                 continue;
 727                             case 0:
 728                             case 0x1A:
 729                                 error("unterminated /+ +/ comment");
 730                                 p = end;
 731                                 t.loc = loc();
 732                                 t.value = TOK.endOfFile;
 733                                 return;
 734                             default:
 735                                 if (c & 0x80)
 736                                 {
 737                                     uint u = decodeUTF();
 738                                     if (u == PS || u == LS)
 739                                         endOfLine();
 740                                 }
 741                                 p++;
 742                                 continue;
 743                             }
 744                             break;
 745                         }
 746                         if (commentToken)
 747                         {
 748                             t.loc = startLoc;
 749                             t.value = TOK.comment;
 750                             return;
 751                         }
 752                         if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
 753                         {
 754                             // if /++ but not /++/
 755                             getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
 756                             lastDocLine = scanloc.linnum;
 757                         }
 758                         continue;
 759                     }
 760                     break;
 761                 default:
 762                     break;
 763                 }
 764                 t.value = TOK.div;
 765                 return;
 766             case '.':
 767                 p++;
 768                 if (isdigit(*p))
 769                 {
 770                     /* Note that we don't allow ._1 and ._ as being
 771                      * valid floating point numbers.
 772                      */
 773                     p--;
 774                     t.value = inreal(t);
 775                 }
 776                 else if (p[0] == '.')
 777                 {
 778                     if (p[1] == '.')
 779                     {
 780                         p += 2;
 781                         t.value = TOK.dotDotDot;
 782                     }
 783                     else
 784                     {
 785                         p++;
 786                         t.value = TOK.slice;
 787                     }
 788                 }
 789                 else
 790                     t.value = TOK.dot;
 791                 return;
 792             case '&':
 793                 p++;
 794                 if (*p == '=')
 795                 {
 796                     p++;
 797                     t.value = TOK.andAssign;
 798                 }
 799                 else if (*p == '&')
 800                 {
 801                     p++;
 802                     t.value = TOK.andAnd;
 803                 }
 804                 else
 805                     t.value = TOK.and;
 806                 return;
 807             case '|':
 808                 p++;
 809                 if (*p == '=')
 810                 {
 811                     p++;
 812                     t.value = TOK.orAssign;
 813                 }
 814                 else if (*p == '|')
 815                 {
 816                     p++;
 817                     t.value = TOK.orOr;
 818                 }
 819                 else
 820                     t.value = TOK.or;
 821                 return;
 822             case '-':
 823                 p++;
 824                 if (*p == '=')
 825                 {
 826                     p++;
 827                     t.value = TOK.minAssign;
 828                 }
 829                 else if (*p == '-')
 830                 {
 831                     p++;
 832                     t.value = TOK.minusMinus;
 833                 }
 834                 else if (*p == '>')
 835                 {
 836                     ++p;
 837                     t.value = TOK.arrow;
 838                 }
 839                 else
 840                     t.value = TOK.min;
 841                 return;
 842             case '+':
 843                 p++;
 844                 if (*p == '=')
 845                 {
 846                     p++;
 847                     t.value = TOK.addAssign;
 848                 }
 849                 else if (*p == '+')
 850                 {
 851                     p++;
 852                     t.value = TOK.plusPlus;
 853                 }
 854                 else
 855                     t.value = TOK.add;
 856                 return;
 857             case '<':
 858                 p++;
 859                 if (*p == '=')
 860                 {
 861                     p++;
 862                     t.value = TOK.lessOrEqual; // <=
 863                 }
 864                 else if (*p == '<')
 865                 {
 866                     p++;
 867                     if (*p == '=')
 868                     {
 869                         p++;
 870                         t.value = TOK.leftShiftAssign; // <<=
 871                     }
 872                     else
 873                         t.value = TOK.leftShift; // <<
 874                 }
 875                 else if (*p == ':' && Ccompile)
 876                 {
 877                     ++p;
 878                     t.value = TOK.leftBracket;  // <:
 879                 }
 880                 else if (*p == '%' && Ccompile)
 881                 {
 882                     ++p;
 883                     t.value = TOK.leftCurly;    // <%
 884                 }
 885                 else
 886                     t.value = TOK.lessThan; // <
 887                 return;
 888             case '>':
 889                 p++;
 890                 if (*p == '=')
 891                 {
 892                     p++;
 893                     t.value = TOK.greaterOrEqual; // >=
 894                 }
 895                 else if (*p == '>')
 896                 {
 897                     p++;
 898                     if (*p == '=')
 899                     {
 900                         p++;
 901                         t.value = TOK.rightShiftAssign; // >>=
 902                     }
 903                     else if (*p == '>')
 904                     {
 905                         p++;
 906                         if (*p == '=')
 907                         {
 908                             p++;
 909                             t.value = TOK.unsignedRightShiftAssign; // >>>=
 910                         }
 911                         else
 912                             t.value = TOK.unsignedRightShift; // >>>
 913                     }
 914                     else
 915                         t.value = TOK.rightShift; // >>
 916                 }
 917                 else
 918                     t.value = TOK.greaterThan; // >
 919                 return;
 920             case '!':
 921                 p++;
 922                 if (*p == '=')
 923                 {
 924                     p++;
 925                     t.value = TOK.notEqual; // !=
 926                 }
 927                 else
 928                     t.value = TOK.not; // !
 929                 return;
 930             case '=':
 931                 p++;
 932                 if (*p == '=')
 933                 {
 934                     p++;
 935                     t.value = TOK.equal; // ==
 936                 }
 937                 else if (*p == '>')
 938                 {
 939                     p++;
 940                     t.value = TOK.goesTo; // =>
 941                 }
 942                 else
 943                     t.value = TOK.assign; // =
 944                 return;
 945             case '~':
 946                 p++;
 947                 if (*p == '=')
 948                 {
 949                     p++;
 950                     t.value = TOK.concatenateAssign; // ~=
 951                 }
 952                 else
 953                     t.value = TOK.tilde; // ~
 954                 return;
 955             case '^':
 956                 p++;
 957                 if (*p == '^')
 958                 {
 959                     p++;
 960                     if (*p == '=')
 961                     {
 962                         p++;
 963                         t.value = TOK.powAssign; // ^^=
 964                     }
 965                     else
 966                         t.value = TOK.pow; // ^^
 967                 }
 968                 else if (*p == '=')
 969                 {
 970                     p++;
 971                     t.value = TOK.xorAssign; // ^=
 972                 }
 973                 else
 974                     t.value = TOK.xor; // ^
 975                 return;
 976             case '(':
 977                 p++;
 978                 t.value = TOK.leftParenthesis;
 979                 return;
 980             case ')':
 981                 p++;
 982                 t.value = TOK.rightParenthesis;
 983                 return;
 984             case '[':
 985                 p++;
 986                 t.value = TOK.leftBracket;
 987                 return;
 988             case ']':
 989                 p++;
 990                 t.value = TOK.rightBracket;
 991                 return;
 992             case '{':
 993                 p++;
 994                 t.value = TOK.leftCurly;
 995                 return;
 996             case '}':
 997                 p++;
 998                 t.value = TOK.rightCurly;
 999                 return;
1000             case '?':
1001                 p++;
1002                 t.value = TOK.question;
1003                 return;
1004             case ',':
1005                 p++;
1006                 t.value = TOK.comma;
1007                 return;
1008             case ';':
1009                 p++;
1010                 t.value = TOK.semicolon;
1011                 return;
1012             case ':':
1013                 p++;
1014                 if (*p == ':')
1015                 {
1016                     ++p;
1017                     t.value = TOK.colonColon;
1018                 }
1019                 else if (*p == '>' && Ccompile)
1020                 {
1021                     ++p;
1022                     t.value = TOK.rightBracket;
1023                 }
1024                 else
1025                     t.value = TOK.colon;
1026                 return;
1027             case '$':
1028                 p++;
1029                 t.value = TOK.dollar;
1030                 return;
1031             case '@':
1032                 p++;
1033                 t.value = TOK.at;
1034                 return;
1035             case '*':
1036                 p++;
1037                 if (*p == '=')
1038                 {
1039                     p++;
1040                     t.value = TOK.mulAssign;
1041                 }
1042                 else
1043                     t.value = TOK.mul;
1044                 return;
1045             case '%':
1046                 p++;
1047                 if (*p == '=')
1048                 {
1049                     p++;
1050                     t.value = TOK.modAssign;
1051                 }
1052                 else if (*p == '>' && Ccompile)
1053                 {
1054                     ++p;
1055                     t.value = TOK.rightCurly;
1056                 }
1057                 else if (*p == ':' && Ccompile)
1058                 {
1059                     goto case '#';      // %: means #
1060                 }
1061                 else
1062                     t.value = TOK.mod;
1063                 return;
1064             case '#':
1065                 {
1066                     // https://issues.dlang.org/show_bug.cgi?id=22825
1067                     // Special token sequences are terminated by newlines,
1068                     // and should not be skipped over.
1069                     this.tokenizeNewlines = true;
1070                     p++;
1071                     if (parseSpecialTokenSequence())
1072                         continue;
1073                     t.value = TOK.pound;
1074                     return;
1075                 }
1076             default:
1077                 {
1078                     dchar c = *p;
1079                     if (c & 0x80)
1080                     {
1081                         c = decodeUTF();
1082                         // Check for start of unicode identifier
1083                         if (isUniAlpha(c))
1084                             goto case_ident;
1085                         if (c == PS || c == LS)
1086                         {
1087                             endOfLine();
1088                             p++;
1089                             if (tokenizeNewlines)
1090                             {
1091                                 t.value = TOK.endOfLine;
1092                                 tokenizeNewlines = false;
1093                                 return;
1094                             }
1095                             continue;
1096                         }
1097                     }
1098                     if (c < 0x80 && isprint(c))
1099                         error("character '%c' is not a valid token", c);
1100                     else
1101                         error("character 0x%02x is not a valid token", c);
1102                     p++;
1103                     continue;
1104                 }
1105             }
1106         }
1107     }
1108
1109     final Token* peek(Token* ct)
1110     {
1111         Token* t;
1112         if (ct.next)
1113             t = ct.next;
1114         else
1115         {
1116             t = allocateToken();
1117             scan(t);
1118             ct.next = t;
1119         }
1120         return t;
1121     }
1122
1123     /*********************************
1124      * tk is on the opening (.
1125      * Look ahead and return token that is past the closing ).
1126      */
1127     final Token* peekPastParen(Token* tk)
1128     {
1129         //printf("peekPastParen()\n");
1130         int parens = 1;
1131         int curlynest = 0;
1132         while (1)
1133         {
1134             tk = peek(tk);
1135             //tk.print();
1136             switch (tk.value)
1137             {
1138             case TOK.leftParenthesis:
1139                 parens++;
1140                 continue;
1141             case TOK.rightParenthesis:
1142                 --parens;
1143                 if (parens)
1144                     continue;
1145                 tk = peek(tk);
1146                 break;
1147             case TOK.leftCurly:
1148                 curlynest++;
1149                 continue;
1150             case TOK.rightCurly:
1151                 if (--curlynest >= 0)
1152                     continue;
1153                 break;
1154             case TOK.semicolon:
1155                 if (curlynest)
1156                     continue;
1157                 break;
1158             case TOK.endOfFile:
1159                 break;
1160             default:
1161                 continue;
1162             }
1163             return tk;
1164         }
1165     }
1166
1167     /*******************************************
1168      * Parse escape sequence.
1169      */
1170     private uint escapeSequence()
1171     {
1172         return Lexer.escapeSequence(token.loc, p, Ccompile);
1173     }
1174
1175     /********
1176      * Parse the given string literal escape sequence into a single character.
1177      * D https://dlang.org/spec/lex.html#escape_sequences
1178      * C11 6.4.4.4
1179      * Params:
1180      *  loc = location to use for error messages
1181      *  sequence = pointer to string with escape sequence to parse. Updated to
1182      *             point past the end of the escape sequence
1183      *  Ccompile = true for compile C11 escape sequences
1184      * Returns:
1185      *  the escape sequence as a single character
1186      */
1187     private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile)
1188     {
1189         const(char)* p = sequence; // cache sequence reference on stack
1190         scope(exit) sequence = p;
1191
1192         uint c = *p;
1193         int ndigits;
1194         switch (c)
1195         {
1196         case '\'':
1197         case '"':
1198         case '?':
1199         case '\\':
1200         Lconsume:
1201             p++;
1202             break;
1203         case 'a':
1204             c = 7;
1205             goto Lconsume;
1206         case 'b':
1207             c = 8;
1208             goto Lconsume;
1209         case 'f':
1210             c = 12;
1211             goto Lconsume;
1212         case 'n':
1213             c = 10;
1214             goto Lconsume;
1215         case 'r':
1216             c = 13;
1217             goto Lconsume;
1218         case 't':
1219             c = 9;
1220             goto Lconsume;
1221         case 'v':
1222             c = 11;
1223             goto Lconsume;
1224         case 'u':
1225             ndigits = 4;
1226             goto Lhex;
1227         case 'U':
1228             ndigits = 8;
1229             goto Lhex;
1230         case 'x':
1231             ndigits = 2;
1232         Lhex:
1233             p++;
1234             c = *p;
1235             if (ishex(cast(char)c))
1236             {
1237                 uint v = 0;
1238                 int n = 0;
1239                 while (1)
1240                 {
1241                     if (isdigit(cast(char)c))
1242                         c -= '0';
1243                     else if (islower(c))
1244                         c -= 'a' - 10;
1245                     else
1246                         c -= 'A' - 10;
1247                     v = v * 16 + c;
1248                     c = *++p;
1249                     if (++n == ndigits)
1250                         break;
1251                     if (!ishex(cast(char)c))
1252                     {
1253                         .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
1254                         break;
1255                     }
1256                 }
1257                 if (ndigits != 2 && !utf_isValidDchar(v))
1258                 {
1259                     .error(loc, "invalid UTF character \\U%08x", v);
1260                     v = '?'; // recover with valid UTF character
1261                 }
1262                 c = v;
1263             }
1264             else
1265             {
1266                 .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
1267                 p++;
1268             }
1269             break;
1270         case '&':
1271             if (Ccompile)
1272                 goto default;
1273
1274             // named character entity
1275             for (const idstart = ++p; 1; p++)
1276             {
1277                 switch (*p)
1278                 {
1279                 case ';':
1280                     c = HtmlNamedEntity(idstart, p - idstart);
1281                     if (c == ~0)
1282                     {
1283                         .error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
1284                         c = '?';
1285                     }
1286                     p++;
1287                     break;
1288                 default:
1289                     if (isalpha(*p) || (p != idstart && isdigit(*p)))
1290                         continue;
1291                     .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
1292                     c = '?';
1293                     break;
1294                 }
1295                 break;
1296             }
1297             break;
1298         case 0:
1299         case 0x1A:
1300             // end of file
1301             c = '\\';
1302             break;
1303         default:
1304             if (isoctal(cast(char)c))
1305             {
1306                 uint v = 0;
1307                 int n = 0;
1308                 do
1309                 {
1310                     v = v * 8 + (c - '0');
1311                     c = *++p;
1312                 }
1313                 while (++n < 3 && isoctal(cast(char)c));
1314                 c = v;
1315                 if (c > 0xFF)
1316                     .error(loc, "escape octal sequence \\%03o is larger than \\377", c);
1317             }
1318             else
1319             {
1320                 .error(loc, "undefined escape sequence \\%c", c);
1321                 p++;
1322             }
1323             break;
1324         }
1325         return c;
1326     }
1327
1328     /**
1329     Lex a wysiwyg string. `p` must be pointing to the first character before the
1330     contents of the string literal. The character pointed to by `p` will be used as
1331     the terminating character (i.e. backtick or double-quote).
1332     Params:
1333         result = pointer to the token that accepts the result
1334     */
1335     private void wysiwygStringConstant(Token* result)
1336     {
1337         result.value = TOK.string_;
1338         Loc start = loc();
1339         auto terminator = p[0];
1340         p++;
1341         stringbuffer.setsize(0);
1342         while (1)
1343         {
1344             dchar c = p[0];
1345             p++;
1346             switch (c)
1347             {
1348             case '\n':
1349                 endOfLine();
1350                 break;
1351             case '\r':
1352                 if (p[0] == '\n')
1353                     continue; // ignore
1354                 c = '\n'; // treat EndOfLine as \n character
1355                 endOfLine();
1356                 break;
1357             case 0:
1358             case 0x1A:
1359                 error("unterminated string constant starting at %s", start.toChars());
1360                 result.setString();
1361                 // rewind `p` so it points to the EOF character
1362                 p--;
1363                 return;
1364             default:
1365                 if (c == terminator)
1366                 {
1367                     result.setString(stringbuffer);
1368                     stringPostfix(result);
1369                     return;
1370                 }
1371                 else if (c & 0x80)
1372                 {
1373                     p--;
1374                     const u = decodeUTF();
1375                     p++;
1376                     if (u == PS || u == LS)
1377                         endOfLine();
1378                     stringbuffer.writeUTF8(u);
1379                     continue;
1380                 }
1381                 break;
1382             }
1383             stringbuffer.writeByte(c);
1384         }
1385     }
1386
1387     /**
1388     Lex a delimited string. Some examples of delimited strings are:
1389     ---
1390     q"(foo(xxx))"      // "foo(xxx)"
1391     q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
1392     q"/foo]/"          // "foo]"
1393     q"HERE
1394     foo
1395     HERE"              // "foo\n"
1396     ---
1397     It is assumed that `p` points to the opening double-quote '"'.
1398     Params:
1399         result = pointer to the token that accepts the result
1400     */
1401     private void delimitedStringConstant(Token* result)
1402     {
1403         result.value = TOK.string_;
1404         Loc start = loc();
1405         dchar delimleft = 0;
1406         dchar delimright = 0;
1407         uint nest = 1;
1408         uint nestcount = ~0; // dead assignment, needed to suppress warning
1409         Identifier hereid = null;
1410         uint blankrol = 0;
1411         uint startline = 0;
1412         p++;
1413         stringbuffer.setsize(0);
1414         while (1)
1415         {
1416             dchar c = *p++;
1417             //printf("c = '%c'\n", c);
1418             switch (c)
1419             {
1420             case '\n':
1421             Lnextline:
1422                 endOfLine();
1423                 startline = 1;
1424                 if (blankrol)
1425                 {
1426                     blankrol = 0;
1427                     continue;
1428                 }
1429                 if (hereid)
1430                 {
1431                     stringbuffer.writeUTF8(c);
1432                     continue;
1433                 }
1434                 break;
1435             case '\r':
1436                 if (*p == '\n')
1437                     continue; // ignore
1438                 c = '\n'; // treat EndOfLine as \n character
1439                 goto Lnextline;
1440             case 0:
1441             case 0x1A:
1442                 error("unterminated delimited string constant starting at %s", start.toChars());
1443                 result.setString();
1444                 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1445                 p--;
1446                 return;
1447             default:
1448                 if (c & 0x80)
1449                 {
1450                     p--;
1451                     c = decodeUTF();
1452                     p++;
1453                     if (c == PS || c == LS)
1454                         goto Lnextline;
1455                 }
1456                 break;
1457             }
1458             if (delimleft == 0)
1459             {
1460                 delimleft = c;
1461                 nest = 1;
1462                 nestcount = 1;
1463                 if (c == '(')
1464                     delimright = ')';
1465                 else if (c == '{')
1466                     delimright = '}';
1467                 else if (c == '[')
1468                     delimright = ']';
1469                 else if (c == '<')
1470                     delimright = '>';
1471                 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1472                 {
1473                     // Start of identifier; must be a heredoc
1474                     Token tok;
1475                     p--;
1476                     scan(&tok); // read in heredoc identifier
1477                     if (tok.value != TOK.identifier)
1478                     {
1479                         error("identifier expected for heredoc, not %s", tok.toChars());
1480                         delimright = c;
1481                     }
1482                     else
1483                     {
1484                         hereid = tok.ident;
1485                         //printf("hereid = '%s'\n", hereid.toChars());
1486                         blankrol = 1;
1487                     }
1488                     nest = 0;
1489                 }
1490                 else
1491                 {
1492                     delimright = c;
1493                     nest = 0;
1494                     if (isspace(c))
1495                         error("delimiter cannot be whitespace");
1496                 }
1497             }
1498             else
1499             {
1500                 if (blankrol)
1501                 {
1502                     error("heredoc rest of line should be blank");
1503                     blankrol = 0;
1504                     continue;
1505                 }
1506                 if (nest == 1)
1507                 {
1508                     if (c == delimleft)
1509                         nestcount++;
1510                     else if (c == delimright)
1511                     {
1512                         nestcount--;
1513                         if (nestcount == 0)
1514                             goto Ldone;
1515                     }
1516                 }
1517                 else if (c == delimright)
1518                     goto Ldone;
1519                 if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
1520                 {
1521                     Token tok;
1522                     auto psave = p;
1523                     p--;
1524                     scan(&tok); // read in possible heredoc identifier
1525                     //printf("endid = '%s'\n", tok.ident.toChars());
1526                     if (tok.value == TOK.identifier && tok.ident is hereid)
1527                     {
1528                         /* should check that rest of line is blank
1529                          */
1530                         goto Ldone;
1531                     }
1532                     p = psave;
1533                 }
1534                 stringbuffer.writeUTF8(c);
1535                 startline = 0;
1536             }
1537         }
1538     Ldone:
1539         if (*p == '"')
1540             p++;
1541         else if (hereid)
1542             error("delimited string must end in `%s\"`", hereid.toChars());
1543         else if (isspace(delimright))
1544             error("delimited string must end in `\"`");
1545         else
1546             error("delimited string must end in `%c\"`", delimright);
1547         result.setString(stringbuffer);
1548         stringPostfix(result);
1549     }
1550
1551     /**
1552     Lex a token string. Some examples of token strings are:
1553     ---
1554     q{ foo(xxx) }    // " foo(xxx) "
1555     q{foo$(LPAREN)}  // "foo$(LPAREN)"
1556     q{{foo}"}"}      // "{foo}"}""
1557     ---
1558     It is assumed that `p` points to the opening curly-brace.
1559     Params:
1560         result = pointer to the token that accepts the result
1561     */
1562     private void tokenStringConstant(Token* result)
1563     {
1564         result.value = TOK.string_;
1565
1566         uint nest = 1;
1567         const start = loc();
1568         const pstart = ++p;
1569         inTokenStringConstant++;
1570         scope(exit) inTokenStringConstant--;
1571         while (1)
1572         {
1573             Token tok;
1574             scan(&tok);
1575             switch (tok.value)
1576             {
1577             case TOK.leftCurly:
1578                 nest++;
1579                 continue;
1580             case TOK.rightCurly:
1581                 if (--nest == 0)
1582                 {
1583                     result.setString(pstart, p - 1 - pstart);
1584                     stringPostfix(result);
1585                     return;
1586                 }
1587                 continue;
1588             case TOK.endOfFile:
1589                 error("unterminated token string constant starting at %s", start.toChars());
1590                 result.setString();
1591                 return;
1592             default:
1593                 continue;
1594             }
1595         }
1596     }
1597
1598     /**
1599     Scan a quoted string while building the processed string value by
1600     handling escape sequences. The result is returned in the given `t` token.
1601     This function assumes that `p` currently points to the opening quote
1602     of the string.
1603     Params:
1604         t = the token to set the resulting string to
1605     * References:
1606     *   D https://dlang.org/spec/lex.html#double_quoted_strings
1607     *   ImportC C11 6.4.5
1608     */
1609     private void escapeStringConstant(Token* t)
1610     {
1611         t.value = TOK.string_;
1612
1613         const start = loc();
1614         const tc = *p++;        // opening quote
1615         stringbuffer.setsize(0);
1616         while (1)
1617         {
1618             dchar c = *p++;
1619             switch (c)
1620             {
1621             case '\\':
1622                 switch (*p)
1623                 {
1624                 case '&':
1625                     if (Ccompile)
1626                         goto default;
1627                     goto case;
1628
1629                 case 'u':
1630                 case 'U':
1631                     c = escapeSequence();
1632                     stringbuffer.writeUTF8(c);
1633                     continue;
1634                 default:
1635                     c = escapeSequence();
1636                     break;
1637                 }
1638                 break;
1639             case '\n':
1640                 endOfLine();
1641                 if (Ccompile)
1642                     goto Lunterminated;
1643                 break;
1644             case '\r':
1645                 if (*p == '\n')
1646                     continue; // ignore
1647                 c = '\n'; // treat EndOfLine as \n character
1648                 endOfLine();
1649                 if (Ccompile)
1650                     goto Lunterminated;
1651                 break;
1652             case '\'':
1653             case '"':
1654                 if (c != tc)
1655                     goto default;
1656                 t.setString(stringbuffer);
1657                 if (!Ccompile)
1658                     stringPostfix(t);
1659                 return;
1660             case 0:
1661             case 0x1A:
1662                 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1663                 p--;
1664             Lunterminated:
1665                 error("unterminated string constant starting at %s", start.toChars());
1666                 t.setString();
1667                 return;
1668             default:
1669                 if (c & 0x80)
1670                 {
1671                     p--;
1672                     c = decodeUTF();
1673                     if (c == LS || c == PS)
1674                     {
1675                         c = '\n';
1676                         endOfLine();
1677                         if (Ccompile)
1678                             goto Lunterminated;
1679                     }
1680                     p++;
1681                     stringbuffer.writeUTF8(c);
1682                     continue;
1683                 }
1684                 break;
1685             }
1686             stringbuffer.writeByte(c);
1687         }
1688     }
1689
1690     /**************************************
1691      * Reference:
1692      *    https://dlang.org/spec/lex.html#characterliteral
1693      */
1694     private TOK charConstant(Token* t)
1695     {
1696         TOK tk = TOK.charLiteral;
1697         //printf("Lexer::charConstant\n");
1698         p++;
1699         dchar c = *p++;
1700         switch (c)
1701         {
1702         case '\\':
1703             switch (*p)
1704             {
1705             case 'u':
1706                 t.unsvalue = escapeSequence();
1707                 tk = TOK.wcharLiteral;
1708                 break;
1709             case 'U':
1710             case '&':
1711                 t.unsvalue = escapeSequence();
1712                 tk = TOK.dcharLiteral;
1713                 break;
1714             default:
1715                 t.unsvalue = escapeSequence();
1716                 break;
1717             }
1718             break;
1719         case '\n':
1720         L1:
1721             endOfLine();
1722             goto case;
1723         case '\r':
1724             goto case '\'';
1725         case 0:
1726         case 0x1A:
1727             // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
1728             p--;
1729             goto case;
1730         case '\'':
1731             error("unterminated character constant");
1732             t.unsvalue = '?';
1733             return tk;
1734         default:
1735             if (c & 0x80)
1736             {
1737                 p--;
1738                 c = decodeUTF();
1739                 p++;
1740                 if (c == LS || c == PS)
1741                     goto L1;
1742                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1743                     tk = TOK.wcharLiteral;
1744                 else
1745                     tk = TOK.dcharLiteral;
1746             }
1747             t.unsvalue = c;
1748             break;
1749         }
1750         if (*p != '\'')
1751         {
1752             while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
1753                     *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
1754             {
1755                 if (*p & 0x80)
1756                 {
1757                     const s = p;
1758                     c = decodeUTF();
1759                     if (c == LS || c == PS)
1760                     {
1761                         p = s;
1762                         break;
1763                     }
1764                 }
1765                 p++;
1766             }
1767
1768             if (*p == '\'')
1769             {
1770                 error("character constant has multiple characters");
1771                 p++;
1772             }
1773             else
1774                 error("unterminated character constant");
1775             t.unsvalue = '?';
1776             return tk;
1777         }
1778         p++;
1779         return tk;
1780     }
1781
1782     /***************************************
1783      * Lex C character constant.
1784      * Parser is on the opening quote.
1785      * Params:
1786      *  t = token to fill in
1787      *  prefix = one of `u`, `U` or 0.
1788      * Reference:
1789      *  C11 6.4.4.4
1790      */
1791     private void clexerCharConstant(ref Token t, char prefix)
1792     {
1793         escapeStringConstant(&t);
1794         const(char)[] str = t.ustring[0 .. t.len];
1795         const n = str.length;
1796         const loc = t.loc;
1797         if (n == 0)
1798         {
1799             error(loc, "empty character constant");
1800             t.value = TOK.semicolon;
1801             return;
1802         }
1803
1804         uint u;
1805         switch (prefix)
1806         {
1807             case 0:
1808                 if (n == 1) // fast case
1809                 {
1810                     u = str[0];
1811                 }
1812                 else if (n > 4)
1813                     error(loc, "max number of chars in character literal is 4, had %d",
1814                         cast(int)n);
1815                 else
1816                 {
1817                     foreach (i, c; str)
1818                         (cast(char*)&u)[n - 1 - i] = c;
1819                 }
1820                 break;
1821
1822             case 'u':
1823                 dchar d1;
1824                 size_t idx;
1825                 auto msg = utf_decodeChar(str, idx, d1);
1826                 dchar d2 = 0;
1827                 if (idx < n && !msg)
1828                     msg = utf_decodeChar(str, idx, d2);
1829                 if (msg)
1830                     error(loc, "%s", msg);
1831                 else if (idx < n)
1832                     error(loc, "max number of chars in 16 bit character literal is 2, had %d",
1833                         (n + 1) >> 1);
1834                 else if (d1 > 0x1_0000)
1835                     error(loc, "%d does not fit in 16 bits", d1);
1836                 else if (d2 > 0x1_0000)
1837                     error(loc, "%d does not fit in 16 bits", d2);
1838                 u = d1;
1839                 if (d2)
1840                     u = (d1 << 16) | d2;
1841                 break;
1842
1843             case 'U':
1844                 dchar d;
1845                 size_t idx;
1846                 auto msg = utf_decodeChar(str, idx, d);
1847                 if (msg)
1848                     error(loc, "%s", msg);
1849                 else if (idx < n)
1850                     error(loc, "max number of chars in 32 bit character literal is 1, had %d",
1851                         (n + 3) >> 2);
1852                 u = d;
1853                 break;
1854
1855             default:
1856                 assert(0);
1857         }
1858         t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
1859         t.unsvalue = u;
1860     }
1861
1862     /***************************************
1863      * Get postfix of string literal.
1864      */
1865     private void stringPostfix(Token* t) pure @nogc
1866     {
1867         switch (*p)
1868         {
1869         case 'c':
1870         case 'w':
1871         case 'd':
1872             t.postfix = *p;
1873             p++;
1874             break;
1875         default:
1876             t.postfix = 0;
1877             break;
1878         }
1879     }
1880
1881     /**************************************
1882      * Read in a number.
1883      * If it's an integer, store it in tok.TKutok.Vlong.
1884      *      integers can be decimal, octal or hex
1885      *      Handle the suffixes U, UL, LU, L, etc.
1886      * If it's double, store it in tok.TKutok.Vdouble.
1887      * Returns:
1888      *      TKnum
1889      *      TKdouble,...
1890      */
1891     private TOK number(Token* t)
1892     {
1893         int base = 10;
1894         const start = p;
1895         uinteger_t n = 0; // unsigned >=64 bit integer type
1896         int d;
1897         bool err = false;
1898         bool overflow = false;
1899         bool anyBinaryDigitsNoSingleUS = false;
1900         bool anyHexDigitsNoSingleUS = false;
1901         char errorDigit = 0;
1902         dchar c = *p;
1903         if (c == '0')
1904         {
1905             ++p;
1906             c = *p;
1907             switch (c)
1908             {
1909             case '0':
1910             case '1':
1911             case '2':
1912             case '3':
1913             case '4':
1914             case '5':
1915             case '6':
1916             case '7':
1917                 base = 8;
1918                 break;
1919
1920             case '8':
1921             case '9':
1922                 errorDigit = cast(char) c;
1923                 base = 8;
1924                 break;
1925             case 'x':
1926             case 'X':
1927                 ++p;
1928                 base = 16;
1929                 break;
1930             case 'b':
1931             case 'B':
1932                 if (Ccompile)
1933                     error("binary constants not allowed");
1934                 ++p;
1935                 base = 2;
1936                 break;
1937             case '.':
1938                 if (p[1] == '.')
1939                     goto Ldone; // if ".."
1940                 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
1941                 {
1942                     if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
1943                         goto Lreal;  // if `0.f` or `0.L`
1944                     goto Ldone; // if ".identifier" or ".unicode"
1945                 }
1946                 goto Lreal; // '.' is part of current token
1947             case 'i':
1948             case 'f':
1949             case 'F':
1950                 goto Lreal;
1951             case '_':
1952                 if (Ccompile)
1953                     error("embedded `_` not allowed");
1954                 ++p;
1955                 base = 8;
1956                 break;
1957             case 'L':
1958                 if (p[1] == 'i')
1959                     goto Lreal;
1960                 break;
1961             default:
1962                 break;
1963             }
1964         }
1965         while (1)
1966         {
1967             c = *p;
1968             switch (c)
1969             {
1970             case '0':
1971             case '1':
1972             case '2':
1973             case '3':
1974             case '4':
1975             case '5':
1976             case '6':
1977             case '7':
1978             case '8':
1979             case '9':
1980                 ++p;
1981                 d = c - '0';
1982                 break;
1983             case 'a':
1984             case 'b':
1985             case 'c':
1986             case 'd':
1987             case 'e':
1988             case 'f':
1989             case 'A':
1990             case 'B':
1991             case 'C':
1992             case 'D':
1993             case 'E':
1994             case 'F':
1995                 ++p;
1996                 if (base != 16)
1997                 {
1998                     if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
1999                         goto Lreal;
2000                 }
2001                 if (c >= 'a')
2002                     d = c + 10 - 'a';
2003                 else
2004                     d = c + 10 - 'A';
2005                 break;
2006             case 'L':
2007                 if (p[1] == 'i')
2008                     goto Lreal;
2009                 goto Ldone;
2010             case '.':
2011                 if (p[1] == '.')
2012                     goto Ldone; // if ".."
2013                 if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
2014                 {
2015                     if (Ccompile && base == 10 &&
2016                         (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
2017                         goto Lreal;  // if `1.e6` or `1.f` or `1.L`
2018                     goto Ldone; // if ".identifier" or ".unicode"
2019                 }
2020                 if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
2021                     goto Ldone; // if ".identifier" or ".unicode"
2022                 if (base == 2)
2023                     goto Ldone; // if ".identifier" or ".unicode"
2024                 goto Lreal; // otherwise as part of a floating point literal
2025             case 'p':
2026             case 'P':
2027             case 'i':
2028             Lreal:
2029                 p = start;
2030                 return inreal(t);
2031             case '_':
2032                 if (Ccompile)
2033                     goto default;
2034                 ++p;
2035                 continue;
2036             default:
2037                 goto Ldone;
2038             }
2039             // got a digit here, set any necessary flags, check for errors
2040             anyHexDigitsNoSingleUS = true;
2041             anyBinaryDigitsNoSingleUS = true;
2042             if (!errorDigit && d >= base)
2043             {
2044                 errorDigit = cast(char) c;
2045             }
2046             // Avoid expensive overflow check if we aren't at risk of overflow
2047             if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
2048                 n = n * base + d;
2049             else
2050             {
2051                 import core.checkedint : mulu, addu;
2052
2053                 n = mulu(n, base, overflow);
2054                 n = addu(n, d, overflow);
2055             }
2056         }
2057     Ldone:
2058         if (errorDigit)
2059         {
2060             error("%s digit expected, not `%c`", base == 2 ? "binary".ptr :
2061                                                  base == 8 ? "octal".ptr :
2062                                                  "decimal".ptr, errorDigit);
2063             err = true;
2064         }
2065         if (overflow && !err)
2066         {
2067             error("integer overflow");
2068             err = true;
2069         }
2070         if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
2071             (base == 16 && !anyHexDigitsNoSingleUS))
2072             error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);
2073
2074         t.unsvalue = n;
2075
2076         if (Ccompile)
2077             return cnumber(base, n);
2078
2079         enum FLAGS : int
2080         {
2081             none = 0,
2082             decimal = 1, // decimal
2083             unsigned = 2, // u or U suffix
2084             long_ = 4, // L suffix
2085         }
2086
2087         FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
2088         // Parse trailing 'u', 'U', 'l' or 'L' in any combination
2089         const psuffix = p;
2090         while (1)
2091         {
2092             FLAGS f;
2093             switch (*p)
2094             {
2095             case 'U':
2096             case 'u':
2097                 f = FLAGS.unsigned;
2098                 goto L1;
2099             case 'l':
2100                 f = FLAGS.long_;
2101                 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
2102                 goto L1;
2103             case 'L':
2104                 f = FLAGS.long_;
2105             L1:
2106                 p++;
2107                 if ((flags & f) && !err)
2108                 {
2109                     error("unrecognized token");
2110                     err = true;
2111                 }
2112                 flags = cast(FLAGS)(flags | f);
2113                 continue;
2114             default:
2115                 break;
2116             }
2117             break;
2118         }
2119         if (base == 8 && n >= 8)
2120         {
2121             if (err)
2122                 // can't translate invalid octal value, just show a generic message
2123                 error("octal literals larger than 7 are no longer supported");
2124             else
2125                 error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
2126                     n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
2127         }
2128         TOK result;
2129         switch (flags)
2130         {
2131         case FLAGS.none:
2132             /* Octal or Hexadecimal constant.
2133              * First that fits: int, uint, long, ulong
2134              */
2135             if (n & 0x8000000000000000L)
2136                 result = TOK.uns64Literal;
2137             else if (n & 0xFFFFFFFF00000000L)
2138                 result = TOK.int64Literal;
2139             else if (n & 0x80000000)
2140                 result = TOK.uns32Literal;
2141             else
2142                 result = TOK.int32Literal;
2143             break;
2144         case FLAGS.decimal:
2145             /* First that fits: int, long, long long
2146              */
2147             if (n & 0x8000000000000000L)
2148             {
2149                 result = TOK.uns64Literal;
2150             }
2151             else if (n & 0xFFFFFFFF80000000L)
2152                 result = TOK.int64Literal;
2153             else
2154                 result = TOK.int32Literal;
2155             break;
2156         case FLAGS.unsigned:
2157         case FLAGS.decimal | FLAGS.unsigned:
2158             /* First that fits: uint, ulong
2159              */
2160             if (n & 0xFFFFFFFF00000000L)
2161                 result = TOK.uns64Literal;
2162             else
2163                 result = TOK.uns32Literal;
2164             break;
2165         case FLAGS.decimal | FLAGS.long_:
2166             if (n & 0x8000000000000000L)
2167             {
2168                 if (!err)
2169                 {
2170                     error("signed integer overflow");
2171                     err = true;
2172                 }
2173                 result = TOK.uns64Literal;
2174             }
2175             else
2176                 result = TOK.int64Literal;
2177             break;
2178         case FLAGS.long_:
2179             if (n & 0x8000000000000000L)
2180                 result = TOK.uns64Literal;
2181             else
2182                 result = TOK.int64Literal;
2183             break;
2184         case FLAGS.unsigned | FLAGS.long_:
2185         case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
2186             result = TOK.uns64Literal;
2187             break;
2188         default:
2189             debug
2190             {
2191                 printf("%x\n", flags);
2192             }
2193             assert(0);
2194         }
2195         return result;
2196     }
2197
2198     /**************************************
2199      * Lex C integer-suffix
2200      * Params:
2201      *  base = number base
2202      *  n = raw integer value
2203      * Returns:
2204      *  token value
2205      */
2206     private TOK cnumber(int base, uinteger_t n)
2207     {
2208         /* C11 6.4.4.1
2209          * Parse trailing suffixes:
2210          *   u or U
2211          *   l or L
2212          *   ll or LL
2213          */
2214         enum FLAGS : uint
2215         {
2216             octalhex = 1, // octal or hexadecimal
2217             decimal  = 2, // decimal
2218             unsigned = 4, // u or U suffix
2219             long_    = 8, // l or L suffix
2220             llong    = 0x10 // ll or LL
2221         }
2222         FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
2223         bool err;
2224     Lsuffixes:
2225         while (1)
2226         {
2227             FLAGS f;
2228             const cs = *p;
2229             switch (cs)
2230             {
2231                 case 'U':
2232                 case 'u':
2233                     f = FLAGS.unsigned;
2234                     break;
2235
2236                 case 'l':
2237                 case 'L':
2238                     f = FLAGS.long_;
2239                     if (cs == p[1])
2240                     {
2241                         f = FLAGS.long_ | FLAGS.llong;
2242                         ++p;
2243                     }
2244                     break;
2245
2246                 default:
2247                     break Lsuffixes;
2248             }
2249             ++p;
2250             if ((flags & f) && !err)
2251             {
2252                 error("duplicate integer suffixes");
2253                 err = true;
2254             }
2255             flags = cast(FLAGS)(flags | f);
2256         }
2257
2258         TOK result = TOK.int32Literal;     // default
2259         switch (flags)
2260         {
2261             /* Since D doesn't have a variable sized `long` or `unsigned long` type,
2262              * this code deviates from C by picking D int, uint, long, or ulong instead
2263              */
2264
2265             case FLAGS.octalhex:
2266                 /* Octal or Hexadecimal constant.
2267                  * First that fits: int, unsigned, long, unsigned long,
2268                  * long long, unsigned long long
2269                  */
2270                 if (n & 0x8000000000000000L)
2271                     result = TOK.uns64Literal;      // unsigned long
2272                 else if (n & 0xFFFFFFFF00000000L)
2273                     result = TOK.int64Literal;      // long
2274                 else if (n & 0x80000000)
2275                     result = TOK.uns32Literal;
2276                 else
2277                     result = TOK.int32Literal;
2278                 break;
2279
2280             case FLAGS.decimal:
2281                 /* First that fits: int, long, long long
2282                  */
2283                 if (n & 0x8000000000000000L)
2284                     result = TOK.uns64Literal;      // unsigned long
2285                 else if (n & 0xFFFFFFFF80000000L)
2286                     result = TOK.int64Literal;      // long
2287                 else
2288                     result = TOK.int32Literal;
2289                 break;
2290
2291             case FLAGS.octalhex | FLAGS.unsigned:
2292             case FLAGS.decimal | FLAGS.unsigned:
2293                 /* First that fits: unsigned, unsigned long, unsigned long long
2294                  */
2295                 if (n & 0xFFFFFFFF00000000L)
2296                     result = TOK.uns64Literal;      // unsigned long
2297                 else
2298                     result = TOK.uns32Literal;
2299                 break;
2300
2301             case FLAGS.decimal | FLAGS.long_:
2302                 /* First that fits: long, long long
2303                  */
2304                 if (longsize == 4 || long_longsize == 4)
2305                 {
2306                     if (n & 0xFFFFFFFF_80000000L)
2307                         result = TOK.int64Literal;
2308                     else
2309                         result = TOK.int32Literal;  // long
2310                 }
2311                 else
2312                 {
2313                     result = TOK.int64Literal;      // long
2314                 }
2315                 break;
2316
2317             case FLAGS.octalhex | FLAGS.long_:
2318                 /* First that fits: long, unsigned long, long long,
2319                  * unsigned long long
2320                  */
2321                 if (longsize == 4 || long_longsize == 4)
2322                 {
2323                     if (n & 0x8000000000000000L)
2324                         result = TOK.uns64Literal;
2325                     else if (n & 0xFFFFFFFF00000000L)
2326                         result = TOK.int64Literal;
2327                     else if (n & 0x80000000)
2328                         result = TOK.uns32Literal;      // unsigned long
2329                     else
2330                         result = TOK.int32Literal;      // long
2331                 }
2332                 else
2333                 {
2334                     if (n & 0x80000000_00000000L)
2335                         result = TOK.uns64Literal;      // unsigned long
2336                     else
2337                         result = TOK.int64Literal;      // long
2338                 }
2339                 break;
2340
2341             case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
2342             case FLAGS.decimal  | FLAGS.unsigned | FLAGS.long_:
2343                 /* First that fits: unsigned long, unsigned long long
2344                  */
2345                 if (longsize == 4 || long_longsize == 4)
2346                 {
2347                     if (n & 0xFFFFFFFF00000000L)
2348                         result = TOK.uns64Literal;
2349                     else
2350                         result = TOK.uns32Literal;      // unsigned long
2351                 }
2352                 else
2353                 {
2354                     result = TOK.uns64Literal;  // unsigned long
2355                 }
2356                 break;
2357
2358             case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
2359                 /* First that fits: long long, unsigned long long
2360                  */
2361                 if (n & 0x8000000000000000L)
2362                     result = TOK.uns64Literal;
2363                 else
2364                     result = TOK.int64Literal;
2365                 break;
2366
2367             case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
2368                 /* long long
2369                  */
2370                 result = TOK.int64Literal;
2371                 break;
2372
2373             case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
2374             case FLAGS.decimal  | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
2375                 result = TOK.uns64Literal;
2376                 break;
2377
2378             default:
2379                 debug printf("%x\n",flags);
2380                 assert(0);
2381         }
2382         return result;
2383     }
2384
2385     /**************************************
2386      * Read in characters, converting them to real.
2387      * Bugs:
2388      *      Exponent overflow not detected.
2389      *      Too much requested precision is not detected.
2390      */
2391     private TOK inreal(Token* t)
2392     {
2393         //printf("Lexer::inreal()\n");
2394         debug
2395         {
2396             assert(*p == '.' || isdigit(*p));
2397         }
2398         bool isWellformedString = true;
2399         stringbuffer.setsize(0);
2400         auto pstart = p;
2401         bool hex = false;
2402         dchar c = *p++;
2403         // Leading '0x'
2404         if (c == '0')
2405         {
2406             c = *p++;
2407             if (c == 'x' || c == 'X')
2408             {
2409                 hex = true;
2410                 c = *p++;
2411             }
2412         }
2413         // Digits to left of '.'
2414         while (1)
2415         {
2416             if (c == '.')
2417             {
2418                 c = *p++;
2419                 break;
2420             }
2421             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2422             {
2423                 c = *p++;
2424                 continue;
2425             }
2426             break;
2427         }
2428         // Digits to right of '.'
2429         while (1)
2430         {
2431             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2432             {
2433                 c = *p++;
2434                 continue;
2435             }
2436             break;
2437         }
2438         if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
2439         {
2440             c = *p++;
2441             if (c == '-' || c == '+')
2442             {
2443                 c = *p++;
2444             }
2445             bool anyexp = false;
2446             while (1)
2447             {
2448                 if (isdigit(c))
2449                 {
2450                     anyexp = true;
2451                     c = *p++;
2452                     continue;
2453                 }
2454                 if (c == '_')
2455                 {
2456                     if (Ccompile)
2457                         error("embedded `_` in numeric literals not allowed");
2458                     c = *p++;
2459                     continue;
2460                 }
2461                 if (!anyexp)
2462                 {
2463                     error("missing exponent");
2464                     isWellformedString = false;
2465                 }
2466                 break;
2467             }
2468         }
2469         else if (hex)
2470         {
2471             error("exponent required for hex float");
2472             isWellformedString = false;
2473         }
2474         --p;
2475         while (pstart < p)
2476         {
2477             if (*pstart != '_')
2478                 stringbuffer.writeByte(*pstart);
2479             ++pstart;
2480         }
2481         stringbuffer.writeByte(0);
2482         auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
2483         TOK result;
2484         bool isOutOfRange = false;
2485         t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero);
2486         switch (*p)
2487         {
2488         case 'F':
2489         case 'f':
2490             if (isWellformedString && !isOutOfRange)
2491                 isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
2492             result = TOK.float32Literal;
2493             p++;
2494             break;
2495         default:
2496             if (isWellformedString && !isOutOfRange)
2497                 isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
2498             result = TOK.float64Literal;
2499             break;
2500         case 'l':
2501             if (!Ccompile)
2502                 error("use 'L' suffix instead of 'l'");
2503             goto case 'L';
2504         case 'L':
2505             ++p;
2506             if (Ccompile && long_doublesize == 8)
2507                 goto default;
2508             result = TOK.float80Literal;
2509             break;
2510         }
2511         if ((*p == 'i' || *p == 'I') && !Ccompile)
2512         {
2513             if (*p == 'I')
2514                 error("use 'i' suffix instead of 'I'");
2515             p++;
2516             switch (result)
2517             {
2518             case TOK.float32Literal:
2519                 result = TOK.imaginary32Literal;
2520                 break;
2521             case TOK.float64Literal:
2522                 result = TOK.imaginary64Literal;
2523                 break;
2524             case TOK.float80Literal:
2525                 result = TOK.imaginary80Literal;
2526                 break;
2527             default:
2528                 break;
2529             }
2530         }
2531         const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
2532         if (isOutOfRange && !isLong && (!Ccompile || hex))
2533         {
2534             /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
2535              */
2536             const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : "";
2537             error(scanloc, "number `%s%s` is not representable", sbufptr, suffix);
2538         }
2539         debug
2540         {
2541             switch (result)
2542             {
2543             case TOK.float32Literal:
2544             case TOK.float64Literal:
2545             case TOK.float80Literal:
2546             case TOK.imaginary32Literal:
2547             case TOK.imaginary64Literal:
2548             case TOK.imaginary80Literal:
2549                 break;
2550             default:
2551                 assert(0);
2552             }
2553         }
2554         return result;
2555     }
2556
2557     final Loc loc() pure @nogc
2558     {
2559         scanloc.charnum = cast(uint)(1 + p - line);
2560         version (LocOffset)
2561             scanloc.fileOffset = cast(uint)(p - base);
2562         return scanloc;
2563     }
2564
2565     final void error(const(char)* format, ...)
2566     {
2567         va_list args;
2568         va_start(args, format);
2569         .verror(token.loc, format, args);
2570         va_end(args);
2571     }
2572
2573     final void error(const ref Loc loc, const(char)* format, ...)
2574     {
2575         va_list args;
2576         va_start(args, format);
2577         .verror(loc, format, args);
2578         va_end(args);
2579     }
2580
2581     final void deprecation(const(char)* format, ...)
2582     {
2583         va_list args;
2584         va_start(args, format);
2585         .vdeprecation(token.loc, format, args);
2586         va_end(args);
2587     }
2588
2589     /***************************************
2590      * Parse special token sequence:
2591      * Returns:
2592      *  true if the special token sequence was handled
2593      * References:
2594      *  https://dlang.org/spec/lex.html#special-token-sequence
2595      */
2596     bool parseSpecialTokenSequence()
2597     {
2598         Token n;
2599         scan(&n);
2600         if (n.value == TOK.identifier)
2601         {
2602             if (n.ident == Id.line)
2603             {
2604                 poundLine(n, false);
2605                 return true;
2606             }
2607             else
2608             {
2609                 const locx = loc();
2610                 warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars());
2611             }
2612         }
2613         else if (n.value == TOK.if_)
2614         {
2615             error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
2616         }
2617         return false;
2618     }
2619
2620     /*********************************************
2621      * Parse line/file preprocessor directive:
2622      *    #line linnum [filespec]
2623      * Allow __LINE__ for linnum, and __FILE__ for filespec.
2624      * Accept linemarker format:
2625      *    # linnum [filespec] {flags}
2626      * There can be zero or more flags, which are one of the digits 1..4, and
2627      * must be in ascending order. The flags are ignored.
2628      * Params:
2629      *  tok = token we're on, which is linnum of linemarker
2630      *  linemarker = true if line marker format and lexer is on linnum
2631      * References:
2632      *  linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
2633      */
2634     final void poundLine(ref Token tok, bool linemarker)
2635     {
2636         auto linnum = this.scanloc.linnum;  // stays the current line unless the directive supplies one
2637         const(char)* filespec = null;       // stays null unless the directive supplies a filename
2638         bool flags;                         // set once a linemarker flag has been accepted
2639
2640         if (!linemarker)
2641             scan(&tok);                     // #line form: the linnum token still has to be scanned
2642         if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
2643         {
2644             const lin = cast(int)(tok.unsvalue);
2645             if (lin != tok.unsvalue)        // the value changed when narrowed to int
2646             {
2647                 error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue);
2648                 skipToNextLine();
2649                 return;
2650             }
2651             else
2652                 linnum = lin;
2653         }
2654         else if (tok.value == TOK.line)  // #line __LINE__
2655         {   // linnum already holds the current line number
2656         }
2657         else
2658         {
2659             error(tok.loc, "positive integer argument expected following `#line`");
2660             if (tok.value != TOK.endOfLine)
2661                 skipToNextLine();
2662             return;
2663         }
2664         while (1)   // consume the rest of the directive, one token per iteration
2665         {
2666             scan(&tok);
2667             switch (tok.value)
2668             {
2669             case TOK.endOfFile:
2670             case TOK.endOfLine:
2671                 if (!inTokenStringConstant) // the location is left alone while this is set
2672                 {
2673                     this.scanloc.linnum = linnum;
2674                     if (filespec)
2675                         this.scanloc.filename = filespec;
2676                 }
2677                 return;
2678             case TOK.file:
2679                 if (filespec || flags)  // only one filespec, and never after a flag
2680                     goto Lerr;
2681                 filespec = mem.xstrdup(scanloc.filename);
2682                 continue;
2683             case TOK.string_:
2684                 if (filespec || flags)  // only one filespec, and never after a flag
2685                     goto Lerr;
2686                 if (tok.ptr[0] != '"' || tok.postfix != 0)  // must be double-quoted and carry no postfix
2687                     goto Lerr;
2688                 filespec = tok.ustring;
2689                 continue;
2690             case TOK.int32Literal:
2691                 if (!filespec)  // an integer is only accepted after a filespec
2692                     goto Lerr;
2693                 if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4) // range check only; flag ordering is not verified
2694                 {
2695                     flags = true;   // linemarker flags seen
2696                     continue;
2697                 }
2698                 goto Lerr;
2699             default:
2700                 goto Lerr;
2701             }
2702         }
2703     Lerr:
2704         if (filespec is null)
2705             error(tok.loc, "invalid filename for `#line` directive");
2706         else if (linemarker)
2707             error(tok.loc, "invalid flag for line marker directive");
2708         else if (!Ccompile)     // with Ccompile set, this case produces no message
2709             error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
2710         if (tok.value != TOK.endOfLine)
2711             skipToNextLine();
2712     }
2713
2714     /***************************************
2715      * Scan forward to start of next line.
2716      */
2717     final void skipToNextLine()
2718     {
2719         for (;;)
2720         {
2721             const c = *p;
2722             // End of input: stop here without counting another line
2723             if (c == 0 || c == 0x1A)
2724                 return; // do not advance p
2725
2726             if (c == '\n')
2727             {
2728                 ++p;
2729                 break;
2730             }
2731
2732             if (c == '\r')
2733             {
2734                 ++p;
2735                 if (*p == '\n')     // CR LF counts as a single line ending
2736                     ++p;
2737                 break;
2738             }
2739
2740             if (c & 0x80)
2741             {
2742                 // decodeUTF() leaves p on the last byte of the sequence
2743                 const u = decodeUTF();
2744                 ++p;
2745                 if (u == PS || u == LS)
2746                     break;          // Unicode separators end the line as well
2747                 continue;
2748             }
2749
2750             ++p;                    // any other character: keep scanning
2751         }
2752         endOfLine();
2753         tokenizeNewlines = false;
2754     }
2755
2756     /********************************************
2757      * Decode UTF character.
2758      * Issue error messages for invalid sequences.
2759      * Return decoded character, advance p to last character in UTF sequence.
2760      */
2761     private uint decodeUTF()
2762     {
2763         const start = p;
2764         assert(*start & 0x80);
2765         // Hand the decoder at most 4 bytes, fewer if a 0 byte comes first
2766         size_t avail = 1;
2767         while (avail < 4 && start[avail])
2768             ++avail;
2769
2770         dchar decoded;
2771         size_t consumed = 0;
2772         const problem = utf_decodeChar(start[0 .. avail], consumed, decoded);
2773         // Park p on the final byte of the sequence rather than one past it
2774         p += consumed - 1;
2775         if (problem)
2776             error("%.*s", cast(int)problem.length, problem.ptr);
2777
2778         return decoded;
2779     }
2780
2781     /***************************************************
2782      * Parse doc comment embedded between t.ptr and p.
2783      * Remove trailing blanks and tabs from lines.
2784      * Replace all newlines with \n.
2785      * Remove leading comment character from each line.
2786      * Decide if it's a lineComment or a blockComment.
2787      * Append to previous one for this token.
2788      *
2789      * If newParagraph is true, an extra newline will be
2790      * added between adjoining doc comments.
2791      */
2792     private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
2793     {
2794         /* ct tells us which kind of comment it is: '/', '*', or '+'
2795          */
2796         const ct = t.ptr[2];
2797         /* Start of comment text skips over / * *, / + +, or / / /
2798          */
2799         const(char)* q = t.ptr + 3; // start of comment text
2800         const(char)* qend = p;      // end of comment text (exclusive)
2801         if (ct == '*' || ct == '+')
2802             qend -= 2;              // block comments: leave out their last two characters
2803         /* Scan over initial row of ****'s or ++++'s or ////'s
2804          */
2805         for (; q < qend; q++)
2806         {
2807             if (*q != ct)
2808                 break;
2809         }
2810         /* Remove leading spaces until start of the comment
2811          */
2812         int linestart = 0;          // 1 after a newline, until a non-blank character is seen
2813         if (ct == '/')
2814         {
2815             while (q < qend && (*q == ' ' || *q == '\t'))
2816                 ++q;
2817         }
2818         else if (q < qend)
2819         {
2820             if (*q == '\r')         // block comment: skip one line ending right after the opening row
2821             {
2822                 ++q;
2823                 if (q < qend && *q == '\n')
2824                     ++q;
2825                 linestart = 1;
2826             }
2827             else if (*q == '\n')
2828             {
2829                 ++q;
2830                 linestart = 1;
2831             }
2832         }
2833         /* Remove trailing row of ****'s or ++++'s
2834          */
2835         if (ct != '/')
2836         {
2837             for (; q < qend; qend--)
2838             {
2839                 if (qend[-1] != ct)
2840                     break;
2841             }
2842         }
2843         /* Comment is now [q .. qend].
2844          * Canonicalize it into buf[].
2845          */
2846         OutBuffer buf;
2847
2848         void trimTrailingWhitespace()   // drops spaces and tabs from the end of buf
2849         {
2850             const s = buf[];
2851             auto len = s.length;
2852             while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
2853                 --len;
2854             buf.setsize(len);
2855         }
2856
2857         for (; q < qend; q++)
2858         {
2859             char c = *q;
2860             switch (c)
2861             {
2862             case '*':
2863             case '+':
2864                 if (linestart && c == ct)   // a ct character met while linestart is set is not copied
2865                 {
2866                     linestart = 0;
2867                     /* Trim preceding whitespace up to preceding \n
2868                      */
2869                     trimTrailingWhitespace();
2870                     continue;
2871                 }
2872                 break;
2873             case ' ':
2874             case '\t':
2875                 break;              // copied as is; linestart is left unchanged
2876             case '\r':
2877                 if (q[1] == '\n')
2878                     continue; // skip the \r
2879                 goto Lnewline;
2880             default:
2881                 if (c == 226)
2882                 {
2883                     // If LS or PS (UTF-8 bytes E2 80 A8 and E2 80 A9)
2884                     if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
2885                     {
2886                         q += 2;
2887                         goto Lnewline;
2888                     }
2889                 }
2890                 linestart = 0;
2891                 break;
2892             Lnewline:
2893                 c = '\n'; // replace all newlines with \n
2894                 goto case;
2895             case '\n':
2896                 linestart = 1;
2897                 /* Trim trailing whitespace
2898                  */
2899                 trimTrailingWhitespace();
2900                 break;
2901             }
2902             buf.writeByte(c);       // reached by break; the continue paths above skip it
2903         }
2904         /* Trim trailing whitespace (if the last line does not have newline)
2905          */
2906         trimTrailingWhitespace();
2907
2908         // Always end with a newline
2909         const s = buf[];
2910         if (s.length == 0 || s[$ - 1] != '\n')
2911             buf.writeByte('\n');
2912
2913         // It's a line comment if the start of the doc comment comes
2914         // after other non-whitespace on the same line.
2915         auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
2916         // Combine with previous doc comment, if any
2917         if (*dc)
2918             *dc = combineComments(*dc, buf[], newParagraph).toDString();
2919         else
2920             *dc = buf.extractSlice(true);
2921     }
2922
2923     /********************************************
2924      * Combine two document comments into one,
2925      * separated by an extra newline if newParagraph is true.
2926      */
2927     static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
2928     {
2929         // Nothing to merge when either side is missing
2930         if (!c1)
2931             return c2.ptr;
2932         if (!c2)
2933             return c1.ptr;
2934
2935         // c1 is given a closing '\n' unless it is empty or already has one
2936         const bool needsTerminator = c1.length != 0 && c1[$ - 1] != '\n';
2937         const size_t total = c1.length + (needsTerminator ? 1 : 0) + (newParagraph ? 1 : 0) + c2.length;
2938
2939         auto result = cast(char*)mem.xmalloc_noscan(total + 1);
2940         size_t pos = c1.length;         // write cursor into result
2941         result[0 .. pos] = c1[];
2942         if (needsTerminator)
2943             result[pos++] = '\n';
2944         if (newParagraph)
2945             result[pos++] = '\n';       // the extra newline that separates the two comments
2946         result[pos .. total] = c2[];
2947         result[total] = 0;              // 0 terminator after the combined text
2948         return result;
2949     }
2950
2951     /**************************
2952      * `p` should be at start of next line
2953      */
2954     private void endOfLine() pure @nogc @safe
2955     {
2956         line = p;               // the new line starts at p
2957         ++scanloc.linnum;
2958     }
2959 }
2960
2961
2962 /******************************* Private *****************************************/
2963
2964 private:
2965
2966 /// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
2967 private struct TimeStampInfo
2968 {
2969     private __gshared bool initdone = false;    // set by the first call to `initialize`
2970
2971     // Note: Those properties need to be guarded by a call to `initialize`
2972     // The API isn't safe, and quite brittle, but it was left this way
2973     // over performance concerns.
2974     // This is currently only called once, from the lexer.
2975     __gshared char[11 + 1] date;        // 6 + 1 + 4 characters written below, plus the 0 terminator
2976     __gshared char[8 + 1] time;         // 8 characters written below, plus the 0 terminator
2977     __gshared char[24 + 1] timestamp;   // 24 characters written below, plus the 0 terminator
2978
2979     public static void initialize(const ref Loc loc) nothrow
2980     {
2981         if (initdone)
2982             return;
2983
2984         initdone = true;
2985         time_t ct;
2986         // https://issues.dlang.org/show_bug.cgi?id=20444
2987         if (auto p = getenv("SOURCE_DATE_EPOCH"))
2988         {
2989             if (!ct.parseDigits(p.toDString()))
2990                 error(loc, "value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
2991         }
2992         else
2993             .time(&ct);     // leading dot: the module-level C time function, not the `time` buffer above
2994         const p = ctime(&ct);
2995         assert(p);          // NOTE(review): ctime may return null for a time_t it cannot represent - confirm SOURCE_DATE_EPOCH cannot reach that
2996         sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);  // assumes the standard ctime layout: month and day, then year
2997         sprintf(&time[0], "%.8s", p + 11);              // hh:mm:ss under the same layout
2998         sprintf(&timestamp[0], "%.24s", p);             // first 24 characters, i.e. without the trailing newline
2999     }
3000 }
3001
3002 private enum LS = 0x2028;       // UTF line separator (U+2028 LINE SEPARATOR)
3003 private enum PS = 0x2029;       // UTF paragraph separator (U+2029 PARAGRAPH SEPARATOR)
3004
3005 /********************************************
3006  * Do our own char maps
3007  */
3008 private static immutable cmtable = ()       // 256-entry flag table, produced by calling this lambda in the initializer
3009 {
3010     ubyte[256] table;
3011     foreach (const c; 0 .. table.length)
3012     {
3013         if ('0' <= c && c <= '7')
3014             table[c] |= CMoctal;
3015         if (c_isxdigit(c))
3016             table[c] |= CMhex;
3017         if (c_isalnum(c) || c == '_')
3018             table[c] |= CMidchar;
3019
3020         switch (c)      // presumably: what may follow the first character of a numeric literal - confirm at the use sites
3021         {
3022             case 'x': case 'X':
3023             case 'b': case 'B':
3024                 table[c] |= CMzerosecond;
3025                 break;
3026
3027             case '0': .. case '9':
3028             case 'e': case 'E':
3029             case 'f': case 'F':
3030             case 'l': case 'L':
3031             case 'p': case 'P':
3032             case 'u': case 'U':
3033             case 'i':
3034             case '.':
3035             case '_':
3036                 table[c] |= CMzerosecond | CMdigitsecond;
3037                 break;
3038
3039             default:
3040                 break;
3041         }
3042
3043         switch (c)      // CMsinglechar: every 7-bit character except the ones listed here
3044         {
3045             case '\\':
3046             case '\n':
3047             case '\r':
3048             case 0:
3049             case 0x1A:
3050             case '\'':
3051                 break;
3052             default:
3053                 if (!(c & 0x80))
3054                     table[c] |= CMsinglechar;
3055                 break;
3056         }
3057     }
3058     return table;
3059 }();
3060
3061 private
3062 {
3063     enum CMoctal  = 0x1;           // set for the digits 0 to 7
3064     enum CMhex    = 0x2;           // set for characters accepted by c_isxdigit
3065     enum CMidchar = 0x4;           // set for characters accepted by c_isalnum, and for underscore
3066     enum CMzerosecond = 0x8;       // set for x X b B and for everything flagged CMdigitsecond
3067     enum CMdigitsecond = 0x10;     // set for 0-9 e E f F l L p P u U i . and underscore
3068     enum CMsinglechar = 0x20;      // set for 7-bit characters other than backslash, LF, CR, 0, 0x1A and single quote
3069 }
3070
3071 private bool isoctal(const char c) pure @nogc @safe
3072 {
3073     return !!(cmtable[c] & CMoctal);        // is the CMoctal bit set for c?
3074 }
3075
3076 private bool ishex(const char c) pure @nogc @safe
3077 {
3078     return !!(cmtable[c] & CMhex);          // is the CMhex bit set for c?
3079 }
3080
3081 private bool isidchar(const char c) pure @nogc @safe
3082 {
3083     return !!(cmtable[c] & CMidchar);       // is the CMidchar bit set for c?
3084 }
3085
3086 private bool isZeroSecond(const char c) pure @nogc @safe
3087 {
3088     return !!(cmtable[c] & CMzerosecond);   // is the CMzerosecond bit set for c?
3089 }
3090
3091 private bool isDigitSecond(const char c) pure @nogc @safe
3092 {
3093     return !!(cmtable[c] & CMdigitsecond);  // is the CMdigitsecond bit set for c?
3094 }
3095
3096 private bool issinglechar(const char c) pure @nogc @safe
3097 {
3098     return !!(cmtable[c] & CMsinglechar);   // is the CMsinglechar bit set for c?
3099 }
3100
3101 private bool c_isxdigit(const int c) pure @nogc @safe
3102 {
3103     const bool decimal = '0' <= c && c <= '9';
3104     const bool letter = ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
3105     return decimal || letter;   // ASCII hex digits only
3106 }
3107
3108 private bool c_isalnum(const int c) pure @nogc @safe
3109 {
3110     const bool digit = '0' <= c && c <= '9';
3111     const bool letter = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
3112     return digit || letter;     // ASCII letters and digits only
3113 }
3114
3115 /******************************* Unittest *****************************************/
3116
3117 unittest
3118 {
3119     import dmd.console;
3120     nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
3121                                    const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
3122     {
3123         assert(0);  // every sequence below is valid, so any diagnostic fails the test
3124     }
3125     diagnosticHandler = &assertDiagnosticHandler;
3126
3127     static void test(T)(string sequence, T expected, bool Ccompile = false)
3128     {
3129         auto p = cast(const(char)*)sequence.ptr;
3130         assert(expected == Lexer.escapeSequence(Loc.initial, p, Ccompile));
3131         assert(p == sequence.ptr + sequence.length);    // the whole sequence must have been consumed
3132     }
3133     // Sequences are passed without their leading backslash. Single-character escapes:
3134     test(`'`, '\'');
3135     test(`"`, '"');
3136     test(`?`, '?');
3137     test(`\`, '\\');
3138     test(`0`, '\0');
3139     test(`a`, '\a');
3140     test(`b`, '\b');
3141     test(`f`, '\f');
3142     test(`n`, '\n');
3143     test(`r`, '\r');
3144     test(`t`, '\t');
3145     test(`v`, '\v');
3146     // x followed by two hex digits
3147     test(`x00`, 0x00);
3148     test(`xff`, 0xff);
3149     test(`xFF`, 0xff);
3150     test(`xa7`, 0xa7);
3151     test(`x3c`, 0x3c);
3152     test(`xe2`, 0xe2);
3153     // octal, one to three digits
3154     test(`1`, '\1');
3155     test(`42`, '\42');
3156     test(`357`, '\357');
3157     // u followed by four hex digits
3158     test(`u1234`, '\u1234');
3159     test(`uf0e4`, '\uf0e4');
3160     // U followed by eight hex digits
3161     test(`U0001f603`, '\U0001f603');
3162     // named character entities
3163     test(`&quot;`, '"');
3164     test(`&lt;`, '<');
3165     test(`&gt;`, '>');
3166
3167     diagnosticHandler = null;
3168 }
3169
3170 unittest
3171 {
3172     import dmd.console;
3173     string expected;    // message the next diagnostic must match exactly
3174     bool gotError;      // set by the handler, checked after each escapeSequence call
3175
3176     nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
3177                                          const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
3178     {
3179         assert(cast(Classification)headerColor == Classification.error);
3180
3181         gotError = true;
3182         char[100] buffer = void;
3183         auto actual = buffer[0 .. vsprintf(buffer.ptr, format, ap)]; // NOTE(review): vsprintf is unbounded; the messages expected below are all shorter than 100 characters
3184         assert(expected == actual);
3185         return true;
3186     }
3187
3188     diagnosticHandler = &expectDiagnosticHandler;
3189
3190     void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
3191     {
3192         uint errors = global.errors;    // restored at the end of this helper
3193         gotError = false;
3194         expected = expectedError;
3195         auto p = cast(const(char)*)sequence.ptr;
3196         auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p, Ccompile);
3197         assert(gotError);
3198         assert(expectedReturnValue == actualReturnValue);
3199
3200         auto actualScanLength = p - sequence.ptr;
3201         assert(expectedScanLength == actualScanLength);
3202         global.errors = errors;
3203     }
3204     // Unknown escape characters; the last case passes Ccompile = true
3205     test("c", `undefined escape sequence \c`, 'c', 1);
3206     test("!", `undefined escape sequence \!`, '!', 1);
3207     test("&quot;", `undefined escape sequence \&`, '&', 1, true);
3208     // Too few hex digits after x, u and U
3209     test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);
3210
3211     test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
3212     test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
3213     test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);
3214
3215     test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
3216     test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
3217     test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
3218     test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
3219     test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
3220     test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
3221     test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);
3222     // Well-formed escapes whose value is rejected as an invalid UTF character
3223     test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
3224     test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
3225     test("U00110000", `invalid UTF character \U00110000`, '?', 9);
3226     // A non-hex character directly after x, u or U
3227     test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
3228     test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
3229     test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);
3230     // Named entities: unknown name, then missing terminating semicolon
3231     test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
3232     test("&quot", `unterminated named entity &quot;`, '?', 5);
3233     test("&quot", `unterminated named entity &quot;`, '?', 5); // NOTE(review): identical to the previous case
3234     // Octal value above \377
3235     test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
3236
3237     diagnosticHandler = null;
3238 }
3239
3240 unittest
3241 {
3242     /* Smoke test: a lone keyword lexes to its token, and after that every
3243      * further request yields TOK.endOfFile.
3244      */
3245     string source = "int"; // We rely on the implicit null-terminator
3246     scope Lexer lexer = new Lexer(null, source.ptr, 0, source.length, 0, 0);
3247
3248     assert(lexer.nextToken() == TOK.int32);
3249
3250     // Ask for the end-of-file token three times in a row.
3251     foreach (_; 0 .. 3)
3252     {
3253         const TOK tok = lexer.nextToken();
3254         //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.endOfFile);
3255         assert(tok == TOK.endOfFile);
3256     }
3257 }
3258
3259 unittest
3260 {
3261     // We don't want to see Lexer error output during these tests.
3262     uint errors = global.startGagging();
3263     scope(exit) global.endGagging(errors);
3264
3265     // Test malformed input: even malformed input should end in a TOK.endOfFile.
3266     static immutable char[][] testcases =
3267     [   // Testcase must end with 0 or 0x1A.
3268         [0], // not malformed, but pathological
3269         ['\'', 0],
3270         ['\'', 0x1A],
3271         ['{', '{', 'q', '{', 0],
3272         [0xFF, 0],
3273         [0xFF, 0x80, 0],
3274         [0xFF, 0xFF, 0],
3275         [0xFF, 0xFF, 0], // NOTE(review): identical to the previous test vector
3276         ['x', '"', 0x1A],
3277     ];
3278
3279     foreach (testcase; testcases)
3280     {
3281         scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, 0, 0); // length passed without the final 0 / 0x1A
3282         TOK tok = lex2.nextToken();
3283         size_t iterations = 1;
3284         while ((tok != TOK.endOfFile) && (iterations++ < testcase.length)) // bounded, so a lexer stuck before end of file trips the assert below
3285         {
3286             tok = lex2.nextToken();
3287         }
3288         assert(tok == TOK.endOfFile);
3289         tok = lex2.nextToken();     // asking again must still give end of file
3290         assert(tok == TOK.endOfFile);
3291     }
3292 }