llvm/lib/MC/MCParser/AsmLexer.cpp

   1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This class implements the lexer for assembly files.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "llvm/MC/MCParser/AsmLexer.h"
  14 #include "llvm/ADT/APInt.h"
  15 #include "llvm/ADT/ArrayRef.h"
  16 #include "llvm/ADT/StringExtras.h"
  17 #include "llvm/ADT/StringRef.h"
  18 #include "llvm/ADT/StringSwitch.h"
  19 #include "llvm/MC/MCAsmInfo.h"
  20 #include "llvm/MC/MCParser/MCAsmLexer.h"
  21 #include "llvm/Support/Compiler.h"
  22 #include "llvm/Support/SMLoc.h"
  23 #include "llvm/Support/SaveAndRestore.h"
  24 #include <cassert>
  25 #include <cctype>
  26 #include <cstdio>
  27 #include <cstring>
  28 #include <string>
  29 #include <tuple>
  30 #include <utility>
  31
  32 using namespace llvm;
  33
  34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
  35   AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@");
  36   LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
  37 }
  38
/// Out-of-line defaulted destructor; no custom cleanup is performed here.
AsmLexer::~AsmLexer() = default;
  40
  41 void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
  42                          bool EndStatementAtEOF) {
  43   CurBuf = Buf;
  44
  45   if (ptr)
  46     CurPtr = ptr;
  47   else
  48     CurPtr = CurBuf.begin();
  49
  50   TokStart = nullptr;
  51   this->EndStatementAtEOF = EndStatementAtEOF;
  52 }
  53
  54 /// ReturnError - Set the error to the specified string at the specified
  55 /// location.  This is defined to always return AsmToken::Error.
  56 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
  57   SetError(SMLoc::getFromPointer(Loc), Msg);
  58
  59   return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
  60 }
  61
  62 int AsmLexer::getNextChar() {
  63   if (CurPtr == CurBuf.end())
  64     return EOF;
  65   return (unsigned char)*CurPtr++;
  66 }
  67
  68 int AsmLexer::peekNextChar() {
  69   if (CurPtr == CurBuf.end())
  70     return EOF;
  71   return (unsigned char)*CurPtr;
  72 }
  73
  74 /// The leading integral digit sequence and dot should have already been
  75 /// consumed, some or all of the fractional digit sequence *can* have been
  76 /// consumed.
  77 AsmToken AsmLexer::LexFloatLiteral() {
  78   // Skip the fractional digit sequence.
  79   while (isDigit(*CurPtr))
  80     ++CurPtr;
  81
  82   if (*CurPtr == '-' || *CurPtr == '+')
  83     return ReturnError(CurPtr, "invalid sign in float literal");
  84
  85   // Check for exponent
  86   if ((*CurPtr == 'e' || *CurPtr == 'E')) {
  87     ++CurPtr;
  88
  89     if (*CurPtr == '-' || *CurPtr == '+')
  90       ++CurPtr;
  91
  92     while (isDigit(*CurPtr))
  93       ++CurPtr;
  94   }
  95
  96   return AsmToken(AsmToken::Real,
  97                   StringRef(TokStart, CurPtr - TokStart));
  98 }
  99
 100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
 101 /// while making sure there are enough actual digits around for the constant to
 102 /// be valid.
 103 ///
 104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
 105 /// before we get here.
 106 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
 107   assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
 108          "unexpected parse state in floating hex");
 109   bool NoFracDigits = true;
 110
 111   // Skip the fractional part if there is one
 112   if (*CurPtr == '.') {
 113     ++CurPtr;
 114
 115     const char *FracStart = CurPtr;
 116     while (isHexDigit(*CurPtr))
 117       ++CurPtr;
 118
 119     NoFracDigits = CurPtr == FracStart;
 120   }
 121
 122   if (NoIntDigits && NoFracDigits)
 123     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
 124                                  "expected at least one significand digit");
 125
 126   // Make sure we do have some kind of proper exponent part
 127   if (*CurPtr != 'p' && *CurPtr != 'P')
 128     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
 129                                  "expected exponent part 'p'");
 130   ++CurPtr;
 131
 132   if (*CurPtr == '+' || *CurPtr == '-')
 133     ++CurPtr;
 134
 135   // N.b. exponent digits are *not* hex
 136   const char *ExpStart = CurPtr;
 137   while (isDigit(*CurPtr))
 138     ++CurPtr;
 139
 140   if (CurPtr == ExpStart)
 141     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
 142                                  "expected at least one exponent digit");
 143
 144   return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
 145 }
 146
 147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
 148 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
 149   return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
 150          (AllowAt && C == '@') || (AllowHash && C == '#');
 151 }
 152
 153 AsmToken AsmLexer::LexIdentifier() {
 154   // Check for floating point literals.
 155   if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
 156     // Disambiguate a .1243foo identifier from a floating literal.
 157     while (isDigit(*CurPtr))
 158       ++CurPtr;
 159
 160     if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
 161                           AllowHashInIdentifier) ||
 162         *CurPtr == 'e' || *CurPtr == 'E')
 163       return LexFloatLiteral();
 164   }
 165
 166   while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
 167     ++CurPtr;
 168
 169   // Handle . as a special case.
 170   if (CurPtr == TokStart+1 && TokStart[0] == '.')
 171     return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
 172
 173   return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
 174 }
 175
 176 /// LexSlash: Slash: /
 177 ///           C-Style Comment: /* ... */
 178 ///           C-style Comment: // ...
 179 AsmToken AsmLexer::LexSlash() {
 180   if (!MAI.shouldAllowAdditionalComments()) {
 181     IsAtStartOfStatement = false;
 182     return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
 183   }
 184
 185   switch (*CurPtr) {
 186   case '*':
 187     IsAtStartOfStatement = false;
 188     break; // C style comment.
 189   case '/':
 190     ++CurPtr;
 191     return LexLineComment();
 192   default:
 193     IsAtStartOfStatement = false;
 194     return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
 195   }
 196
 197   // C Style comment.
 198   ++CurPtr;  // skip the star.
 199   const char *CommentTextStart = CurPtr;
 200   while (CurPtr != CurBuf.end()) {
 201     switch (*CurPtr++) {
 202     case '*':
 203       // End of the comment?
 204       if (*CurPtr != '/')
 205         break;
 206       // If we have a CommentConsumer, notify it about the comment.
 207       if (CommentConsumer) {
 208         CommentConsumer->HandleComment(
 209             SMLoc::getFromPointer(CommentTextStart),
 210             StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
 211       }
 212       ++CurPtr;   // End the */.
 213       return AsmToken(AsmToken::Comment,
 214                       StringRef(TokStart, CurPtr - TokStart));
 215     }
 216   }
 217   return ReturnError(TokStart, "unterminated comment");
 218 }
 219
 220 /// LexLineComment: Comment: #[^\n]*
 221 ///                        : //[^\n]*
 222 AsmToken AsmLexer::LexLineComment() {
 223   // Mark This as an end of statement with a body of the
 224   // comment. While it would be nicer to leave this two tokens,
 225   // backwards compatability with TargetParsers makes keeping this in this form
 226   // better.
 227   const char *CommentTextStart = CurPtr;
 228   int CurChar = getNextChar();
 229   while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
 230     CurChar = getNextChar();
 231   const char *NewlinePtr = CurPtr;
 232   if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
 233     ++CurPtr;
 234
 235   // If we have a CommentConsumer, notify it about the comment.
 236   if (CommentConsumer) {
 237     CommentConsumer->HandleComment(
 238         SMLoc::getFromPointer(CommentTextStart),
 239         StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
 240   }
 241
 242   IsAtStartOfLine = true;
 243   // This is a whole line comment. leave newline
 244   if (IsAtStartOfStatement)
 245     return AsmToken(AsmToken::EndOfStatement,
 246                     StringRef(TokStart, CurPtr - TokStart));
 247   IsAtStartOfStatement = true;
 248
 249   return AsmToken(AsmToken::EndOfStatement,
 250                   StringRef(TokStart, CurPtr - 1 - TokStart));
 251 }
 252
 253 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
 254   // Skip case-insensitive ULL, UL, U, L and LL suffixes.
 255   if (CurPtr[0] == 'U' || CurPtr[0] == 'u')
 256     ++CurPtr;
 257   if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
 258     ++CurPtr;
 259   if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
 260     ++CurPtr;
 261 }
 262
 263 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
 264 // integer as a hexadecimal, possibly with leading zeroes.
 265 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
 266                                bool LexHex) {
 267   const char *FirstNonDec = nullptr;
 268   const char *LookAhead = CurPtr;
 269   while (true) {
 270     if (isDigit(*LookAhead)) {
 271       ++LookAhead;
 272     } else {
 273       if (!FirstNonDec)
 274         FirstNonDec = LookAhead;
 275
 276       // Keep going if we are looking for a 'h' suffix.
 277       if (LexHex && isHexDigit(*LookAhead))
 278         ++LookAhead;
 279       else
 280         break;
 281     }
 282   }
 283   bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
 284   CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
 285   if (isHex)
 286     return 16;
 287   return DefaultRadix;
 288 }
 289
 290 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
 291   while (hexDigitValue(*CurPtr) < DefaultRadix) {
 292     ++CurPtr;
 293   }
 294   return CurPtr;
 295 }
 296
 297 static AsmToken intToken(StringRef Ref, APInt &Value) {
 298   if (Value.isIntN(64))
 299     return AsmToken(AsmToken::Integer, Ref, Value);
 300   return AsmToken(AsmToken::BigNum, Ref, Value);
 301 }
 302
 303 static std::string radixName(unsigned Radix) {
 304   switch (Radix) {
 305   case 2:
 306     return "binary";
 307   case 8:
 308     return "octal";
 309   case 10:
 310     return "decimal";
 311   case 16:
 312     return "hexadecimal";
 313   default:
 314     return "base-" + std::to_string(Radix);
 315   }
 316 }
 317
/// LexDigit: First character is [0-9].
///   Local Label: [0-9][:]
///   Forward/Backward Label: [0-9][fb]
///   Binary integer: 0b[01]+
///   Octal integer: 0[0-7]+
///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
///   Decimal integer: [1-9][0-9]*
///
/// Called with the first character of the token already consumed (CurPtr[-1]
/// is that character). Besides a digit, that character may be '$' or '%' when
/// Motorola integers are enabled. The dialect-specific forms are tried in
/// order: MASM suffixed literals, MASM default-radix literals, Motorola
/// '$'/'%' literals, decimal/float, "0b" binary, "0x" hex (including hex
/// floats), and finally octal (or MASM 'h'-suffixed hex starting with '0').
/// Returns an Integer/BigNum/Real token, or an Error token on malformed input.
AsmToken AsmLexer::LexDigit() {
  // MASM-flavor binary integer: [01]+[yY] (also [bB] if DefaultRadix < 12)
  // MASM-flavor octal integer: [0-7]+[oOqQ]
  // MASM-flavor decimal integer: [0-9]+[tT] (also [dD] if DefaultRadix < 14)
  // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
  if (LexMasmIntegers && isdigit(CurPtr[-1])) {
    // Track the first character that is not a binary digit and the first that
    // is not a decimal digit; these identify a trailing 'b'/'d' radix suffix
    // that was swallowed by the hex-digit scan below.
    const char *FirstNonBinary =
        (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
    const char *FirstNonDecimal =
        (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
    // Remembered so the scan can be undone if no MASM radix suffix is found.
    const char *OldCurPtr = CurPtr;
    while (isHexDigit(*CurPtr)) {
      switch (*CurPtr) {
      default:
        // A hex letter [a-fA-F]: neither decimal nor binary.
        if (!FirstNonDecimal) {
          FirstNonDecimal = CurPtr;
        }
        [[fallthrough]];
      case '9':
      case '8':
      case '7':
      case '6':
      case '5':
      case '4':
      case '3':
      case '2':
        if (!FirstNonBinary) {
          FirstNonBinary = CurPtr;
        }
        break;
      case '1':
      case '0':
        break;
      }
      ++CurPtr;
    }
    if (*CurPtr == '.') {
      // MASM float literals (other than hex floats) always contain a ".", and
      // are always written in decimal.
      ++CurPtr;
      return LexFloatLiteral();
    }

    // MASM hex float: hex digits followed by an 'r'/'R' suffix.
    if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
      ++CurPtr;
      return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
    }

    // Radix stays 0 when no radix suffix is recognised.
    unsigned Radix = 0;
    if (*CurPtr == 'h' || *CurPtr == 'H') {
      // hexadecimal number
      ++CurPtr;
      Radix = 16;
    } else if (*CurPtr == 't' || *CurPtr == 'T') {
      // decimal number
      ++CurPtr;
      Radix = 10;
    } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
               *CurPtr == 'Q') {
      // octal number
      ++CurPtr;
      Radix = 8;
    } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
      // binary number
      ++CurPtr;
      Radix = 2;
    } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
               DefaultRadix < 14 &&
               (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
      // Decimal digits followed by a final 'd'/'D' that the scan consumed as a
      // hex digit: treat it as a decimal suffix when 'd' cannot be a digit in
      // the default radix.
      Radix = 10;
    } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
               DefaultRadix < 12 &&
               (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
      // Likewise for binary digits followed by a final 'b'/'B'.
      Radix = 2;
    }

    if (Radix) {
      StringRef Result(TokStart, CurPtr - TokStart);
      APInt Value(128, 0, true);

      // The last character of Result is the radix suffix; parse without it.
      if (Result.drop_back().getAsInteger(Radix, Value))
        return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");

      // MSVC accepts and ignores type suffixes on integer literals.
      SkipIgnoredIntegerSuffix(CurPtr);

      return intToken(Result, Value);
    }

    // default-radix integers, or floating point numbers, fall through
    CurPtr = OldCurPtr;
  }

  // MASM default-radix integers: [0-9a-fA-F]+
  // (All other integer literals have a radix specifier.)
  if (LexMasmIntegers && UseMasmDefaultRadix) {
    CurPtr = findLastDigit(CurPtr, 16);
    StringRef Result(TokStart, CurPtr - TokStart);

    APInt Value(128, 0, true);
    if (Result.getAsInteger(DefaultRadix, Value)) {
      return ReturnError(TokStart,
                         "invalid " + radixName(DefaultRadix) + " number");
    }

    return intToken(Result, Value);
  }

  // Motorola hex integers: $[0-9a-fA-F]+
  if (LexMotorolaIntegers && CurPtr[-1] == '$') {
    const char *NumStart = CurPtr;
    while (isHexDigit(CurPtr[0]))
      ++CurPtr;

    // Parse only the digits; the token text still includes the '$' prefix.
    APInt Result(128, 0);
    if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
      return ReturnError(TokStart, "invalid hexadecimal number");

    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  }

  // Motorola binary integers: %[01]+
  if (LexMotorolaIntegers && CurPtr[-1] == '%') {
    const char *NumStart = CurPtr;
    while (*CurPtr == '0' || *CurPtr == '1')
      ++CurPtr;

    // Parse only the digits; the token text still includes the '%' prefix.
    APInt Result(128, 0);
    if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
      return ReturnError(TokStart, "invalid binary number");

    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  }

  // Decimal integer: [1-9][0-9]*
  // HLASM-flavour decimal integer: [0-9][0-9]*
  // FIXME: Later on, support for fb for HLASM has to be added in
  // as they probably would be needed for asm goto
  if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
    unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);

    if (!LexHLASMIntegers) {
      bool IsHex = Radix == 16;
      // Check for floating point literals.
      if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
        if (*CurPtr == '.')
          ++CurPtr;
        return LexFloatLiteral();
      }
    }

    StringRef Result(TokStart, CurPtr - TokStart);

    APInt Value(128, 0, true);
    if (Result.getAsInteger(Radix, Value))
      return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");

    if (!LexHLASMIntegers)
      // The darwin/x86 (and x86-64) assembler accepts and ignores type
      // suffixes on integer literals.
      SkipIgnoredIntegerSuffix(CurPtr);

    return intToken(Result, Value);
  }

  // From here on the first character is '0'.
  if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
    ++CurPtr;
    // See if we actually have "0b" as part of something like "jmp 0b\n"
    if (!isDigit(CurPtr[0])) {
      // Back up so the 'b' is left for the next token; return the bare "0".
      --CurPtr;
      StringRef Result(TokStart, CurPtr - TokStart);
      return AsmToken(AsmToken::Integer, Result, 0);
    }
    const char *NumStart = CurPtr;
    while (CurPtr[0] == '0' || CurPtr[0] == '1')
      ++CurPtr;

    // Requires at least one binary digit.
    if (CurPtr == NumStart)
      return ReturnError(TokStart, "invalid binary number");

    StringRef Result(TokStart, CurPtr - TokStart);

    // Skip the two-character "0b" prefix when parsing the value.
    APInt Value(128, 0, true);
    if (Result.substr(2).getAsInteger(2, Value))
      return ReturnError(TokStart, "invalid binary number");

    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
    // suffixes on integer literals.
    SkipIgnoredIntegerSuffix(CurPtr);

    return intToken(Result, Value);
  }

  if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
    ++CurPtr;
    const char *NumStart = CurPtr;
    while (isHexDigit(CurPtr[0]))
      ++CurPtr;

    // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
    // diagnosed by LexHexFloatLiteral).
    if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
      return LexHexFloatLiteral(NumStart == CurPtr);

    // Otherwise requires at least one hex digit.
    if (CurPtr == NumStart)
      return ReturnError(CurPtr-2, "invalid hexadecimal number");

    // Radix 0 asks getAsInteger to autodetect the radix from the "0x" prefix,
    // which is part of the string being parsed here.
    APInt Result(128, 0);
    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
      return ReturnError(TokStart, "invalid hexadecimal number");

    // Consume the optional [hH].
    if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
      ++CurPtr;

    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
    // suffixes on integer literals.
    SkipIgnoredIntegerSuffix(CurPtr);

    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  }

  // Either octal or hexadecimal.
  APInt Value(128, 0, true);
  unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
  StringRef Result(TokStart, CurPtr - TokStart);
  if (Result.getAsInteger(Radix, Value))
    return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");

  // Consume the [hH].
  if (Radix == 16)
    ++CurPtr;

  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
  // suffixes on integer literals.
  SkipIgnoredIntegerSuffix(CurPtr);

  return intToken(Result, Value);
}
 566
 567 /// LexSingleQuote: Integer: 'b'
 568 AsmToken AsmLexer::LexSingleQuote() {
 569   int CurChar = getNextChar();
 570
 571   if (LexHLASMStrings)
 572     return ReturnError(TokStart, "invalid usage of character literals");
 573
 574   if (LexMasmStrings) {
 575     while (CurChar != EOF) {
 576       if (CurChar != '\'') {
 577         CurChar = getNextChar();
 578       } else if (peekNextChar() == '\'') {
 579         // In MASM single-quote strings, doubled single-quotes mean an escaped
 580         // single quote, so should be lexed in.
 581         (void)getNextChar();
 582         CurChar = getNextChar();
 583       } else {
 584         break;
 585       }
 586     }
 587     if (CurChar == EOF)
 588       return ReturnError(TokStart, "unterminated string constant");
 589     return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
 590   }
 591
 592   if (CurChar == '\\')
 593     CurChar = getNextChar();
 594
 595   if (CurChar == EOF)
 596     return ReturnError(TokStart, "unterminated single quote");
 597
 598   CurChar = getNextChar();
 599
 600   if (CurChar != '\'')
 601     return ReturnError(TokStart, "single quote way too long");
 602
 603   // The idea here being that 'c' is basically just an integral
 604   // constant.
 605   StringRef Res = StringRef(TokStart,CurPtr - TokStart);
 606   long long Value;
 607
 608   if (Res.starts_with("\'\\")) {
 609     char theChar = Res[2];
 610     switch (theChar) {
 611       default: Value = theChar; break;
 612       case '\'': Value = '\''; break;
 613       case 't': Value = '\t'; break;
 614       case 'n': Value = '\n'; break;
 615       case 'b': Value = '\b'; break;
 616       case 'f': Value = '\f'; break;
 617       case 'r': Value = '\r'; break;
 618     }
 619   } else
 620     Value = TokStart[1];
 621
 622   return AsmToken(AsmToken::Integer, Res, Value);
 623 }
 624
 625 /// LexQuote: String: "..."
 626 AsmToken AsmLexer::LexQuote() {
 627   int CurChar = getNextChar();
 628   if (LexHLASMStrings)
 629     return ReturnError(TokStart, "invalid usage of string literals");
 630
 631   if (LexMasmStrings) {
 632     while (CurChar != EOF) {
 633       if (CurChar != '"') {
 634         CurChar = getNextChar();
 635       } else if (peekNextChar() == '"') {
 636         // In MASM double-quoted strings, doubled double-quotes mean an escaped
 637         // double quote, so should be lexed in.
 638         (void)getNextChar();
 639         CurChar = getNextChar();
 640       } else {
 641         break;
 642       }
 643     }
 644     if (CurChar == EOF)
 645       return ReturnError(TokStart, "unterminated string constant");
 646     return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
 647   }
 648
 649   // TODO: does gas allow multiline string constants?
 650   while (CurChar != '"') {
 651     if (CurChar == '\\') {
 652       // Allow \", etc.
 653       CurChar = getNextChar();
 654     }
 655
 656     if (CurChar == EOF)
 657       return ReturnError(TokStart, "unterminated string constant");
 658
 659     CurChar = getNextChar();
 660   }
 661
 662   return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
 663 }
 664
 665 StringRef AsmLexer::LexUntilEndOfStatement() {
 666   TokStart = CurPtr;
 667
 668   while (!isAtStartOfComment(CurPtr) &&     // Start of line comment.
 669          !isAtStatementSeparator(CurPtr) && // End of statement marker.
 670          *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
 671     ++CurPtr;
 672   }
 673   return StringRef(TokStart, CurPtr-TokStart);
 674 }
 675
 676 StringRef AsmLexer::LexUntilEndOfLine() {
 677   TokStart = CurPtr;
 678
 679   while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
 680     ++CurPtr;
 681   }
 682   return StringRef(TokStart, CurPtr-TokStart);
 683 }
 684
 685 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
 686                             bool ShouldSkipSpace) {
 687   SaveAndRestore SavedTokenStart(TokStart);
 688   SaveAndRestore SavedCurPtr(CurPtr);
 689   SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
 690   SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
 691   SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
 692   SaveAndRestore SavedIsPeeking(IsPeeking, true);
 693   std::string SavedErr = getErr();
 694   SMLoc SavedErrLoc = getErrLoc();
 695
 696   size_t ReadCount;
 697   for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
 698     AsmToken Token = LexToken();
 699
 700     Buf[ReadCount] = Token;
 701
 702     if (Token.is(AsmToken::Eof))
 703       break;
 704   }
 705
 706   SetError(SavedErrLoc, SavedErr);
 707   return ReadCount;
 708 }
 709
 710 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
 711   if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
 712     return false;
 713
 714   StringRef CommentString = MAI.getCommentString();
 715
 716   if (CommentString.size() == 1)
 717     return CommentString[0] == Ptr[0];
 718
 719   // Allow # preprocessor comments also be counted as comments for "##" cases
 720   if (CommentString[1] == '#')
 721     return CommentString[0] == Ptr[0];
 722
 723   return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
 724 }
 725
 726 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
 727   return strncmp(Ptr, MAI.getSeparatorString(),
 728                  strlen(MAI.getSeparatorString())) == 0;
 729 }
 730
/// Lex and return the next token from the buffer.
///
/// Sets TokStart to the current position, consumes one character, and
/// dispatches on it. Along the way it maintains IsAtStartOfLine and
/// IsAtStartOfStatement: both are set after an end-of-statement token and
/// cleared once a token other than whitespace or a comment begins.
/// Unrecognised characters produce an Error token.
AsmToken AsmLexer::LexToken() {
  TokStart = CurPtr;
  // This always consumes at least one character.
  int CurChar = getNextChar();

  // IsPeeking is set by peekTokens(); checking it keeps the peek below from
  // re-entering this branch.
  if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
    // If this starts with a '#', this may be a cpp
    // hash directive and otherwise a line comment.
    AsmToken TokenBuf[2];
    MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
    size_t num = peekTokens(Buf, true);
    // There cannot be a space preceding this
    // A '#' at the start of a line followed by an Integer and then a String
    // token is treated as a hash directive spanning the whole line.
    if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
        TokenBuf[1].is(AsmToken::String)) {
      CurPtr = TokStart; // reset curPtr;
      StringRef s = LexUntilEndOfLine();
      // NOTE(review): presumably UnLex queues the two peeked tokens so they
      // are delivered after the HashDirective token — confirm against UnLex.
      UnLex(TokenBuf[1]);
      UnLex(TokenBuf[0]);
      return AsmToken(AsmToken::HashDirective, s);
    }

    if (MAI.shouldAllowAdditionalComments())
      return LexLineComment();
  }

  if (isAtStartOfComment(TokStart))
    return LexLineComment();

  if (isAtStatementSeparator(TokStart)) {
    // One character of the separator was already consumed above; skip the
    // rest of it.
    CurPtr += strlen(MAI.getSeparatorString()) - 1;
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement,
                    StringRef(TokStart, strlen(MAI.getSeparatorString())));
  }

  // If we're missing a newline at EOF, make sure we still get an
  // EndOfStatement token before the Eof token.
  if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
  }
  IsAtStartOfLine = false;
  // Remember the flag so whitespace and '/' handling can restore it; every
  // other token kind leaves it cleared.
  bool OldIsAtStartOfStatement = IsAtStartOfStatement;
  IsAtStartOfStatement = false;
  switch (CurChar) {
  default:
    // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
    // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
    // an identifier is target-dependent. These characters are handled in the
    // respective switch cases.
    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(TokStart, "invalid character in input");
  case EOF:
    if (EndStatementAtEOF) {
      IsAtStartOfLine = true;
      IsAtStartOfStatement = true;
    }
    return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
  case 0:
  case ' ':
  case '\t':
    // Whitespace (and NUL bytes) do not end the "start of statement" state.
    IsAtStartOfStatement = OldIsAtStartOfStatement;
    while (*CurPtr == ' ' || *CurPtr == '\t')
      CurPtr++;
    if (SkipSpace)
      return LexToken(); // Ignore whitespace.
    else
      return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
  case '\r': {
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    // If this is a CR followed by LF, treat that as one token.
    if (CurPtr != CurBuf.end() && *CurPtr == '\n')
      ++CurPtr;
    return AsmToken(AsmToken::EndOfStatement,
                    StringRef(TokStart, CurPtr - TokStart));
  }
  case '\n':
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
  case '$': {
    // '$' may introduce a Motorola hex literal, start an identifier, or be a
    // Dollar token, in that order of preference.
    if (LexMotorolaIntegers && isHexDigit(*CurPtr))
      return LexDigit();
    if (MAI.doesAllowDollarAtStartOfIdentifier())
      return LexIdentifier();
    return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
  }
  case '@':
    if (MAI.doesAllowAtAtStartOfIdentifier())
      return LexIdentifier();
    return AsmToken(AsmToken::At, StringRef(TokStart, 1));
  case '#':
    if (MAI.doesAllowHashAtStartOfIdentifier())
      return LexIdentifier();
    return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
  case '?':
    if (MAI.doesAllowQuestionAtStartOfIdentifier())
      return LexIdentifier();
    return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
  case '=':
    if (*CurPtr == '=') {
      ++CurPtr;
      return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
  case '-':
    if (*CurPtr == '>') {
      ++CurPtr;
      return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
  case '|':
    if (*CurPtr == '|') {
      ++CurPtr;
      return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
  case '&':
    if (*CurPtr == '&') {
      ++CurPtr;
      return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
  case '!':
    if (*CurPtr == '=') {
      ++CurPtr;
      return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
  case '%':
    // '%' may introduce a Motorola binary literal.
    if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
      return LexDigit();
    }

    if (MAI.hasMipsExpressions()) {
      AsmToken::TokenKind Operator;
      unsigned OperatorLength;

      // Each length below is the operator name's length plus one for the
      // leading '%'. Longer names are listed before names that are their
      // prefix (e.g. "got_disp" before "got") so the longer match is tried
      // first.
      std::tie(Operator, OperatorLength) =
          StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
              StringRef(CurPtr))
              .StartsWith("call16", {AsmToken::PercentCall16, 7})
              .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8})
              .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8})
              .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10})
              .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10})
              .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9})
              .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7})
              .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7})
              .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9})
              .StartsWith("got_page", {AsmToken::PercentGot_Page, 9})
              .StartsWith("gottprel", {AsmToken::PercentGottprel, 9})
              .StartsWith("got", {AsmToken::PercentGot, 4})
              .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7})
              .StartsWith("higher", {AsmToken::PercentHigher, 7})
              .StartsWith("highest", {AsmToken::PercentHighest, 8})
              .StartsWith("hi", {AsmToken::PercentHi, 3})
              .StartsWith("lo", {AsmToken::PercentLo, 3})
              .StartsWith("neg", {AsmToken::PercentNeg, 4})
              .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9})
              .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9})
              .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6})
              .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7})
              .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9})
              .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9})
              .Default({AsmToken::Percent, 1});

      if (Operator != AsmToken::Percent) {
        // The '%' itself has already been consumed.
        CurPtr += OperatorLength - 1;
        return AsmToken(Operator, StringRef(TokStart, OperatorLength));
      }
    }
    return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
  case '/':
    // LexSlash decides whether this is a comment; give it the original
    // statement-start state.
    IsAtStartOfStatement = OldIsAtStartOfStatement;
    return LexSlash();
  case '\'': return LexSingleQuote();
  case '"': return LexQuote();
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    return LexDigit();
  case '<':
    switch (*CurPtr) {
    case '<':
      ++CurPtr;
      return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
    case '=':
      ++CurPtr;
      return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
    case '>':
      ++CurPtr;
      return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
    default:
      return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
    }
  case '>':
    switch (*CurPtr) {
    case '>':
      ++CurPtr;
      return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
    case '=':
      ++CurPtr;
      return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
    default:
      return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
    }

  // TODO: Quoted identifiers (objc methods etc)
  // local labels: [0-9][:]
  // Forward/backward labels: [0-9][fb]
  // Integers, fp constants, character constants.
  }
}