[RISCV] Fix mgather -> riscv.masked.strided.load combine not extending indices (...
[llvm-project.git] / llvm / lib / MC / MCParser / AsmLexer.cpp
blobe08404ae0ad92f1bcab34fa7510440c1ac25d103
//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This class implements the lexer for assembly files.
//
//===----------------------------------------------------------------------===//
13 #include "llvm/MC/MCParser/AsmLexer.h"
14 #include "llvm/ADT/APInt.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/ADT/StringSwitch.h"
19 #include "llvm/MC/MCAsmInfo.h"
20 #include "llvm/MC/MCParser/MCAsmLexer.h"
21 #include "llvm/Support/Compiler.h"
22 #include "llvm/Support/SMLoc.h"
23 #include "llvm/Support/SaveAndRestore.h"
24 #include <cassert>
25 #include <cctype>
26 #include <cstdio>
27 #include <cstring>
28 #include <string>
29 #include <tuple>
30 #include <utility>
32 using namespace llvm;
34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
35 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@");
36 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
39 AsmLexer::~AsmLexer() = default;
41 void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42 bool EndStatementAtEOF) {
43 CurBuf = Buf;
45 if (ptr)
46 CurPtr = ptr;
47 else
48 CurPtr = CurBuf.begin();
50 TokStart = nullptr;
51 this->EndStatementAtEOF = EndStatementAtEOF;
54 /// ReturnError - Set the error to the specified string at the specified
55 /// location. This is defined to always return AsmToken::Error.
56 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
57 SetError(SMLoc::getFromPointer(Loc), Msg);
59 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
62 int AsmLexer::getNextChar() {
63 if (CurPtr == CurBuf.end())
64 return EOF;
65 return (unsigned char)*CurPtr++;
68 int AsmLexer::peekNextChar() {
69 if (CurPtr == CurBuf.end())
70 return EOF;
71 return (unsigned char)*CurPtr;
74 /// The leading integral digit sequence and dot should have already been
75 /// consumed, some or all of the fractional digit sequence *can* have been
76 /// consumed.
77 AsmToken AsmLexer::LexFloatLiteral() {
78 // Skip the fractional digit sequence.
79 while (isDigit(*CurPtr))
80 ++CurPtr;
82 if (*CurPtr == '-' || *CurPtr == '+')
83 return ReturnError(CurPtr, "invalid sign in float literal");
85 // Check for exponent
86 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
87 ++CurPtr;
89 if (*CurPtr == '-' || *CurPtr == '+')
90 ++CurPtr;
92 while (isDigit(*CurPtr))
93 ++CurPtr;
96 return AsmToken(AsmToken::Real,
97 StringRef(TokStart, CurPtr - TokStart));
100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
101 /// while making sure there are enough actual digits around for the constant to
102 /// be valid.
104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105 /// before we get here.
106 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
108 "unexpected parse state in floating hex");
109 bool NoFracDigits = true;
111 // Skip the fractional part if there is one
112 if (*CurPtr == '.') {
113 ++CurPtr;
115 const char *FracStart = CurPtr;
116 while (isHexDigit(*CurPtr))
117 ++CurPtr;
119 NoFracDigits = CurPtr == FracStart;
122 if (NoIntDigits && NoFracDigits)
123 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
124 "expected at least one significand digit");
126 // Make sure we do have some kind of proper exponent part
127 if (*CurPtr != 'p' && *CurPtr != 'P')
128 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
129 "expected exponent part 'p'");
130 ++CurPtr;
132 if (*CurPtr == '+' || *CurPtr == '-')
133 ++CurPtr;
135 // N.b. exponent digits are *not* hex
136 const char *ExpStart = CurPtr;
137 while (isDigit(*CurPtr))
138 ++CurPtr;
140 if (CurPtr == ExpStart)
141 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
142 "expected at least one exponent digit");
144 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
148 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
150 (AllowAt && C == '@') || (AllowHash && C == '#');
153 AsmToken AsmLexer::LexIdentifier() {
154 // Check for floating point literals.
155 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
156 // Disambiguate a .1243foo identifier from a floating literal.
157 while (isDigit(*CurPtr))
158 ++CurPtr;
160 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
161 AllowHashInIdentifier) ||
162 *CurPtr == 'e' || *CurPtr == 'E')
163 return LexFloatLiteral();
166 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
167 ++CurPtr;
169 // Handle . as a special case.
170 if (CurPtr == TokStart+1 && TokStart[0] == '.')
171 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
173 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
176 /// LexSlash: Slash: /
177 /// C-Style Comment: /* ... */
178 /// C-style Comment: // ...
179 AsmToken AsmLexer::LexSlash() {
180 if (!MAI.shouldAllowAdditionalComments()) {
181 IsAtStartOfStatement = false;
182 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
185 switch (*CurPtr) {
186 case '*':
187 IsAtStartOfStatement = false;
188 break; // C style comment.
189 case '/':
190 ++CurPtr;
191 return LexLineComment();
192 default:
193 IsAtStartOfStatement = false;
194 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
197 // C Style comment.
198 ++CurPtr; // skip the star.
199 const char *CommentTextStart = CurPtr;
200 while (CurPtr != CurBuf.end()) {
201 switch (*CurPtr++) {
202 case '*':
203 // End of the comment?
204 if (*CurPtr != '/')
205 break;
206 // If we have a CommentConsumer, notify it about the comment.
207 if (CommentConsumer) {
208 CommentConsumer->HandleComment(
209 SMLoc::getFromPointer(CommentTextStart),
210 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
212 ++CurPtr; // End the */.
213 return AsmToken(AsmToken::Comment,
214 StringRef(TokStart, CurPtr - TokStart));
217 return ReturnError(TokStart, "unterminated comment");
220 /// LexLineComment: Comment: #[^\n]*
221 /// : //[^\n]*
222 AsmToken AsmLexer::LexLineComment() {
223 // Mark This as an end of statement with a body of the
224 // comment. While it would be nicer to leave this two tokens,
225 // backwards compatability with TargetParsers makes keeping this in this form
226 // better.
227 const char *CommentTextStart = CurPtr;
228 int CurChar = getNextChar();
229 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
230 CurChar = getNextChar();
231 const char *NewlinePtr = CurPtr;
232 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
233 ++CurPtr;
235 // If we have a CommentConsumer, notify it about the comment.
236 if (CommentConsumer) {
237 CommentConsumer->HandleComment(
238 SMLoc::getFromPointer(CommentTextStart),
239 StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
242 IsAtStartOfLine = true;
243 // This is a whole line comment. leave newline
244 if (IsAtStartOfStatement)
245 return AsmToken(AsmToken::EndOfStatement,
246 StringRef(TokStart, CurPtr - TokStart));
247 IsAtStartOfStatement = true;
249 return AsmToken(AsmToken::EndOfStatement,
250 StringRef(TokStart, CurPtr - 1 - TokStart));
/// Advance \p CurPtr past an optional integer type suffix.
///
/// Accepts, case-insensitively, at most one 'U' followed by at most two 'L's
/// (i.e. U, L, LL, UL, ULL). \p CurPtr is left unchanged if no suffix is
/// present.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // Skip case-insensitive ULL, UL, U, L and LL suffixes.
  if (CurPtr[0] == 'U' || CurPtr[0] == 'u')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
}
263 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
264 // integer as a hexadecimal, possibly with leading zeroes.
265 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
266 bool LexHex) {
267 const char *FirstNonDec = nullptr;
268 const char *LookAhead = CurPtr;
269 while (true) {
270 if (isDigit(*LookAhead)) {
271 ++LookAhead;
272 } else {
273 if (!FirstNonDec)
274 FirstNonDec = LookAhead;
276 // Keep going if we are looking for a 'h' suffix.
277 if (LexHex && isHexDigit(*LookAhead))
278 ++LookAhead;
279 else
280 break;
283 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
284 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
285 if (isHex)
286 return 16;
287 return DefaultRadix;
290 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
291 while (hexDigitValue(*CurPtr) < DefaultRadix) {
292 ++CurPtr;
294 return CurPtr;
297 static AsmToken intToken(StringRef Ref, APInt &Value) {
298 if (Value.isIntN(64))
299 return AsmToken(AsmToken::Integer, Ref, Value);
300 return AsmToken(AsmToken::BigNum, Ref, Value);
/// Return a human-readable name for \p Radix, for use in diagnostics:
/// "binary", "octal", "decimal" or "hexadecimal" for 2, 8, 10 and 16, and
/// "base-N" for any other value.
static std::string radixName(unsigned Radix) {
  switch (Radix) {
  case 2:
    return "binary";
  case 8:
    return "octal";
  case 10:
    return "decimal";
  case 16:
    return "hexadecimal";
  default:
    return "base-" + std::to_string(Radix);
  }
}
318 /// LexDigit: First character is [0-9].
319 /// Local Label: [0-9][:]
320 /// Forward/Backward Label: [0-9][fb]
321 /// Binary integer: 0b[01]+
322 /// Octal integer: 0[0-7]+
323 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
324 /// Decimal integer: [1-9][0-9]*
325 AsmToken AsmLexer::LexDigit() {
326 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
327 // MASM-flavor octal integer: [0-7]+[oOqQ]
328 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
329 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
330 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
331 const char *FirstNonBinary =
332 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
333 const char *FirstNonDecimal =
334 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
335 const char *OldCurPtr = CurPtr;
336 while (isHexDigit(*CurPtr)) {
337 switch (*CurPtr) {
338 default:
339 if (!FirstNonDecimal) {
340 FirstNonDecimal = CurPtr;
342 [[fallthrough]];
343 case '9':
344 case '8':
345 case '7':
346 case '6':
347 case '5':
348 case '4':
349 case '3':
350 case '2':
351 if (!FirstNonBinary) {
352 FirstNonBinary = CurPtr;
354 break;
355 case '1':
356 case '0':
357 break;
359 ++CurPtr;
361 if (*CurPtr == '.') {
362 // MASM float literals (other than hex floats) always contain a ".", and
363 // are always written in decimal.
364 ++CurPtr;
365 return LexFloatLiteral();
368 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
369 ++CurPtr;
370 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
373 unsigned Radix = 0;
374 if (*CurPtr == 'h' || *CurPtr == 'H') {
375 // hexadecimal number
376 ++CurPtr;
377 Radix = 16;
378 } else if (*CurPtr == 't' || *CurPtr == 'T') {
379 // decimal number
380 ++CurPtr;
381 Radix = 10;
382 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
383 *CurPtr == 'Q') {
384 // octal number
385 ++CurPtr;
386 Radix = 8;
387 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
388 // binary number
389 ++CurPtr;
390 Radix = 2;
391 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
392 DefaultRadix < 14 &&
393 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
394 Radix = 10;
395 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
396 DefaultRadix < 12 &&
397 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
398 Radix = 2;
401 if (Radix) {
402 StringRef Result(TokStart, CurPtr - TokStart);
403 APInt Value(128, 0, true);
405 if (Result.drop_back().getAsInteger(Radix, Value))
406 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
408 // MSVC accepts and ignores type suffices on integer literals.
409 SkipIgnoredIntegerSuffix(CurPtr);
411 return intToken(Result, Value);
414 // default-radix integers, or floating point numbers, fall through
415 CurPtr = OldCurPtr;
418 // MASM default-radix integers: [0-9a-fA-F]+
419 // (All other integer literals have a radix specifier.)
420 if (LexMasmIntegers && UseMasmDefaultRadix) {
421 CurPtr = findLastDigit(CurPtr, 16);
422 StringRef Result(TokStart, CurPtr - TokStart);
424 APInt Value(128, 0, true);
425 if (Result.getAsInteger(DefaultRadix, Value)) {
426 return ReturnError(TokStart,
427 "invalid " + radixName(DefaultRadix) + " number");
430 return intToken(Result, Value);
433 // Motorola hex integers: $[0-9a-fA-F]+
434 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
435 const char *NumStart = CurPtr;
436 while (isHexDigit(CurPtr[0]))
437 ++CurPtr;
439 APInt Result(128, 0);
440 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
441 return ReturnError(TokStart, "invalid hexadecimal number");
443 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
446 // Motorola binary integers: %[01]+
447 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
448 const char *NumStart = CurPtr;
449 while (*CurPtr == '0' || *CurPtr == '1')
450 ++CurPtr;
452 APInt Result(128, 0);
453 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
454 return ReturnError(TokStart, "invalid binary number");
456 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
459 // Decimal integer: [1-9][0-9]*
460 // HLASM-flavour decimal integer: [0-9][0-9]*
461 // FIXME: Later on, support for fb for HLASM has to be added in
462 // as they probably would be needed for asm goto
463 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
464 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
466 if (!LexHLASMIntegers) {
467 bool IsHex = Radix == 16;
468 // Check for floating point literals.
469 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
470 if (*CurPtr == '.')
471 ++CurPtr;
472 return LexFloatLiteral();
476 StringRef Result(TokStart, CurPtr - TokStart);
478 APInt Value(128, 0, true);
479 if (Result.getAsInteger(Radix, Value))
480 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
482 if (!LexHLASMIntegers)
483 // The darwin/x86 (and x86-64) assembler accepts and ignores type
484 // suffices on integer literals.
485 SkipIgnoredIntegerSuffix(CurPtr);
487 return intToken(Result, Value);
490 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
491 ++CurPtr;
492 // See if we actually have "0b" as part of something like "jmp 0b\n"
493 if (!isDigit(CurPtr[0])) {
494 --CurPtr;
495 StringRef Result(TokStart, CurPtr - TokStart);
496 return AsmToken(AsmToken::Integer, Result, 0);
498 const char *NumStart = CurPtr;
499 while (CurPtr[0] == '0' || CurPtr[0] == '1')
500 ++CurPtr;
502 // Requires at least one binary digit.
503 if (CurPtr == NumStart)
504 return ReturnError(TokStart, "invalid binary number");
506 StringRef Result(TokStart, CurPtr - TokStart);
508 APInt Value(128, 0, true);
509 if (Result.substr(2).getAsInteger(2, Value))
510 return ReturnError(TokStart, "invalid binary number");
512 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
513 // suffixes on integer literals.
514 SkipIgnoredIntegerSuffix(CurPtr);
516 return intToken(Result, Value);
519 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
520 ++CurPtr;
521 const char *NumStart = CurPtr;
522 while (isHexDigit(CurPtr[0]))
523 ++CurPtr;
525 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
526 // diagnosed by LexHexFloatLiteral).
527 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
528 return LexHexFloatLiteral(NumStart == CurPtr);
530 // Otherwise requires at least one hex digit.
531 if (CurPtr == NumStart)
532 return ReturnError(CurPtr-2, "invalid hexadecimal number");
534 APInt Result(128, 0);
535 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
536 return ReturnError(TokStart, "invalid hexadecimal number");
538 // Consume the optional [hH].
539 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
540 ++CurPtr;
542 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
543 // suffixes on integer literals.
544 SkipIgnoredIntegerSuffix(CurPtr);
546 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
549 // Either octal or hexadecimal.
550 APInt Value(128, 0, true);
551 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
552 StringRef Result(TokStart, CurPtr - TokStart);
553 if (Result.getAsInteger(Radix, Value))
554 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
556 // Consume the [hH].
557 if (Radix == 16)
558 ++CurPtr;
560 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
561 // suffixes on integer literals.
562 SkipIgnoredIntegerSuffix(CurPtr);
564 return intToken(Result, Value);
567 /// LexSingleQuote: Integer: 'b'
568 AsmToken AsmLexer::LexSingleQuote() {
569 int CurChar = getNextChar();
571 if (LexHLASMStrings)
572 return ReturnError(TokStart, "invalid usage of character literals");
574 if (LexMasmStrings) {
575 while (CurChar != EOF) {
576 if (CurChar != '\'') {
577 CurChar = getNextChar();
578 } else if (peekNextChar() == '\'') {
579 // In MASM single-quote strings, doubled single-quotes mean an escaped
580 // single quote, so should be lexed in.
581 (void)getNextChar();
582 CurChar = getNextChar();
583 } else {
584 break;
587 if (CurChar == EOF)
588 return ReturnError(TokStart, "unterminated string constant");
589 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
592 if (CurChar == '\\')
593 CurChar = getNextChar();
595 if (CurChar == EOF)
596 return ReturnError(TokStart, "unterminated single quote");
598 CurChar = getNextChar();
600 if (CurChar != '\'')
601 return ReturnError(TokStart, "single quote way too long");
603 // The idea here being that 'c' is basically just an integral
604 // constant.
605 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
606 long long Value;
608 if (Res.starts_with("\'\\")) {
609 char theChar = Res[2];
610 switch (theChar) {
611 default: Value = theChar; break;
612 case '\'': Value = '\''; break;
613 case 't': Value = '\t'; break;
614 case 'n': Value = '\n'; break;
615 case 'b': Value = '\b'; break;
616 case 'f': Value = '\f'; break;
617 case 'r': Value = '\r'; break;
619 } else
620 Value = TokStart[1];
622 return AsmToken(AsmToken::Integer, Res, Value);
625 /// LexQuote: String: "..."
626 AsmToken AsmLexer::LexQuote() {
627 int CurChar = getNextChar();
628 if (LexHLASMStrings)
629 return ReturnError(TokStart, "invalid usage of string literals");
631 if (LexMasmStrings) {
632 while (CurChar != EOF) {
633 if (CurChar != '"') {
634 CurChar = getNextChar();
635 } else if (peekNextChar() == '"') {
636 // In MASM double-quoted strings, doubled double-quotes mean an escaped
637 // double quote, so should be lexed in.
638 (void)getNextChar();
639 CurChar = getNextChar();
640 } else {
641 break;
644 if (CurChar == EOF)
645 return ReturnError(TokStart, "unterminated string constant");
646 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
649 // TODO: does gas allow multiline string constants?
650 while (CurChar != '"') {
651 if (CurChar == '\\') {
652 // Allow \", etc.
653 CurChar = getNextChar();
656 if (CurChar == EOF)
657 return ReturnError(TokStart, "unterminated string constant");
659 CurChar = getNextChar();
662 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
665 StringRef AsmLexer::LexUntilEndOfStatement() {
666 TokStart = CurPtr;
668 while (!isAtStartOfComment(CurPtr) && // Start of line comment.
669 !isAtStatementSeparator(CurPtr) && // End of statement marker.
670 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
671 ++CurPtr;
673 return StringRef(TokStart, CurPtr-TokStart);
676 StringRef AsmLexer::LexUntilEndOfLine() {
677 TokStart = CurPtr;
679 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
680 ++CurPtr;
682 return StringRef(TokStart, CurPtr-TokStart);
685 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
686 bool ShouldSkipSpace) {
687 SaveAndRestore SavedTokenStart(TokStart);
688 SaveAndRestore SavedCurPtr(CurPtr);
689 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
690 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
691 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
692 SaveAndRestore SavedIsPeeking(IsPeeking, true);
693 std::string SavedErr = getErr();
694 SMLoc SavedErrLoc = getErrLoc();
696 size_t ReadCount;
697 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
698 AsmToken Token = LexToken();
700 Buf[ReadCount] = Token;
702 if (Token.is(AsmToken::Eof))
703 break;
706 SetError(SavedErrLoc, SavedErr);
707 return ReadCount;
710 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
711 if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
712 return false;
714 StringRef CommentString = MAI.getCommentString();
716 if (CommentString.size() == 1)
717 return CommentString[0] == Ptr[0];
719 // Allow # preprocessor comments also be counted as comments for "##" cases
720 if (CommentString[1] == '#')
721 return CommentString[0] == Ptr[0];
723 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
726 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
727 return strncmp(Ptr, MAI.getSeparatorString(),
728 strlen(MAI.getSeparatorString())) == 0;
731 AsmToken AsmLexer::LexToken() {
732 TokStart = CurPtr;
733 // This always consumes at least one character.
734 int CurChar = getNextChar();
736 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
737 // If this starts with a '#', this may be a cpp
738 // hash directive and otherwise a line comment.
739 AsmToken TokenBuf[2];
740 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
741 size_t num = peekTokens(Buf, true);
742 // There cannot be a space preceding this
743 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
744 TokenBuf[1].is(AsmToken::String)) {
745 CurPtr = TokStart; // reset curPtr;
746 StringRef s = LexUntilEndOfLine();
747 UnLex(TokenBuf[1]);
748 UnLex(TokenBuf[0]);
749 return AsmToken(AsmToken::HashDirective, s);
752 if (MAI.shouldAllowAdditionalComments())
753 return LexLineComment();
756 if (isAtStartOfComment(TokStart))
757 return LexLineComment();
759 if (isAtStatementSeparator(TokStart)) {
760 CurPtr += strlen(MAI.getSeparatorString()) - 1;
761 IsAtStartOfLine = true;
762 IsAtStartOfStatement = true;
763 return AsmToken(AsmToken::EndOfStatement,
764 StringRef(TokStart, strlen(MAI.getSeparatorString())));
767 // If we're missing a newline at EOF, make sure we still get an
768 // EndOfStatement token before the Eof token.
769 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
770 IsAtStartOfLine = true;
771 IsAtStartOfStatement = true;
772 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
774 IsAtStartOfLine = false;
775 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
776 IsAtStartOfStatement = false;
777 switch (CurChar) {
778 default:
779 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
780 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
781 // an identifier is target-dependent. These characters are handled in the
782 // respective switch cases.
783 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
784 return LexIdentifier();
786 // Unknown character, emit an error.
787 return ReturnError(TokStart, "invalid character in input");
788 case EOF:
789 if (EndStatementAtEOF) {
790 IsAtStartOfLine = true;
791 IsAtStartOfStatement = true;
793 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
794 case 0:
795 case ' ':
796 case '\t':
797 IsAtStartOfStatement = OldIsAtStartOfStatement;
798 while (*CurPtr == ' ' || *CurPtr == '\t')
799 CurPtr++;
800 if (SkipSpace)
801 return LexToken(); // Ignore whitespace.
802 else
803 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
804 case '\r': {
805 IsAtStartOfLine = true;
806 IsAtStartOfStatement = true;
807 // If this is a CR followed by LF, treat that as one token.
808 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
809 ++CurPtr;
810 return AsmToken(AsmToken::EndOfStatement,
811 StringRef(TokStart, CurPtr - TokStart));
813 case '\n':
814 IsAtStartOfLine = true;
815 IsAtStartOfStatement = true;
816 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
817 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
818 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
819 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
820 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
821 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
822 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
823 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
824 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
825 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
826 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
827 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
828 case '$': {
829 if (LexMotorolaIntegers && isHexDigit(*CurPtr))
830 return LexDigit();
831 if (MAI.doesAllowDollarAtStartOfIdentifier())
832 return LexIdentifier();
833 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
835 case '@':
836 if (MAI.doesAllowAtAtStartOfIdentifier())
837 return LexIdentifier();
838 return AsmToken(AsmToken::At, StringRef(TokStart, 1));
839 case '#':
840 if (MAI.doesAllowHashAtStartOfIdentifier())
841 return LexIdentifier();
842 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
843 case '?':
844 if (MAI.doesAllowQuestionAtStartOfIdentifier())
845 return LexIdentifier();
846 return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
847 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
848 case '=':
849 if (*CurPtr == '=') {
850 ++CurPtr;
851 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
853 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
854 case '-':
855 if (*CurPtr == '>') {
856 ++CurPtr;
857 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
859 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
860 case '|':
861 if (*CurPtr == '|') {
862 ++CurPtr;
863 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
865 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
866 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
867 case '&':
868 if (*CurPtr == '&') {
869 ++CurPtr;
870 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
872 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
873 case '!':
874 if (*CurPtr == '=') {
875 ++CurPtr;
876 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
878 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
879 case '%':
880 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
881 return LexDigit();
884 if (MAI.hasMipsExpressions()) {
885 AsmToken::TokenKind Operator;
886 unsigned OperatorLength;
888 std::tie(Operator, OperatorLength) =
889 StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
890 StringRef(CurPtr))
891 .StartsWith("call16", {AsmToken::PercentCall16, 7})
892 .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8})
893 .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8})
894 .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10})
895 .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10})
896 .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9})
897 .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7})
898 .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7})
899 .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9})
900 .StartsWith("got_page", {AsmToken::PercentGot_Page, 9})
901 .StartsWith("gottprel", {AsmToken::PercentGottprel, 9})
902 .StartsWith("got", {AsmToken::PercentGot, 4})
903 .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7})
904 .StartsWith("higher", {AsmToken::PercentHigher, 7})
905 .StartsWith("highest", {AsmToken::PercentHighest, 8})
906 .StartsWith("hi", {AsmToken::PercentHi, 3})
907 .StartsWith("lo", {AsmToken::PercentLo, 3})
908 .StartsWith("neg", {AsmToken::PercentNeg, 4})
909 .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9})
910 .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9})
911 .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6})
912 .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7})
913 .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9})
914 .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9})
915 .Default({AsmToken::Percent, 1});
917 if (Operator != AsmToken::Percent) {
918 CurPtr += OperatorLength - 1;
919 return AsmToken(Operator, StringRef(TokStart, OperatorLength));
922 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
923 case '/':
924 IsAtStartOfStatement = OldIsAtStartOfStatement;
925 return LexSlash();
926 case '\'': return LexSingleQuote();
927 case '"': return LexQuote();
928 case '0': case '1': case '2': case '3': case '4':
929 case '5': case '6': case '7': case '8': case '9':
930 return LexDigit();
931 case '<':
932 switch (*CurPtr) {
933 case '<':
934 ++CurPtr;
935 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
936 case '=':
937 ++CurPtr;
938 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
939 case '>':
940 ++CurPtr;
941 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
942 default:
943 return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
945 case '>':
946 switch (*CurPtr) {
947 case '>':
948 ++CurPtr;
949 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
950 case '=':
951 ++CurPtr;
952 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
953 default:
954 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
957 // TODO: Quoted identifiers (objc methods etc)
958 // local labels: [0-9][:]
959 // Forward/backward labels: [0-9][fb]
960 // Integers, fp constants, character constants.