1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This class implements the lexer for assembly files.
11 //===----------------------------------------------------------------------===//
13 #include "llvm/MC/MCParser/AsmLexer.h"
14 #include "llvm/ADT/APInt.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/ADT/StringSwitch.h"
19 #include "llvm/MC/MCAsmInfo.h"
20 #include "llvm/MC/MCParser/MCAsmLexer.h"
21 #include "llvm/Support/Compiler.h"
22 #include "llvm/Support/SMLoc.h"
23 #include "llvm/Support/SaveAndRestore.h"
/// Build a lexer bound to the target's assembly-syntax description \p MAI.
/// '@' is accepted inside identifiers only when the target's comment string
/// does not itself start with '@'; Motorola-style integer lexing is enabled
/// according to MAI.shouldUseMotorolaIntegers().
34 AsmLexer::AsmLexer(const MCAsmInfo
&MAI
) : MAI(MAI
) {
35 AllowAtInIdentifier
= !StringRef(MAI
.getCommentString()).starts_with("@");
36 LexMotorolaIntegers
= MAI
.shouldUseMotorolaIntegers();
/// The compiler-generated destructor is used; nothing is released explicitly.
39 AsmLexer::~AsmLexer() = default;
/// Point the lexer at a new input buffer.
///
/// Visible effects: CurPtr is reset to the start of CurBuf and the
/// EndStatementAtEOF flag is recorded on the lexer.
/// NOTE(review): how \p Buf and \p ptr are consumed is not visible in this
/// listing (the embedded numbering jumps from 42 to 48) - confirm against the
/// complete source before relying on this description.
41 void AsmLexer::setBuffer(StringRef Buf
, const char *ptr
,
42 bool EndStatementAtEOF
) {
48 CurPtr
= CurBuf
.begin();
// "this->" is required because the parameter shadows the member.
51 this->EndStatementAtEOF
= EndStatementAtEOF
;
54 /// ReturnError - Set the error to the specified string at the specified
55 /// location. This is defined to always return AsmToken::Error.
///
/// \param Loc pointer into the buffer where the error is reported.
/// \param Msg diagnostic text stored via SetError.
/// \returns an Error token whose text spans from \p Loc to the current
///          lexing position (CurPtr).
56 AsmToken
AsmLexer::ReturnError(const char *Loc
, const std::string
&Msg
) {
57 SetError(SMLoc::getFromPointer(Loc
), Msg
);
59 return AsmToken(AsmToken::Error
, StringRef(Loc
, CurPtr
- Loc
));
/// Return the character at CurPtr as a non-negative int and advance CurPtr
/// past it. The cast through unsigned char keeps bytes >= 0x80 from being
/// sign-extended into negative values.
/// NOTE(review): the statement guarded by the end-of-buffer test is not
/// visible in this listing; callers compare the result against EOF, so it
/// presumably returns EOF - confirm against the complete source.
62 int AsmLexer::getNextChar() {
63 if (CurPtr
== CurBuf
.end())
65 return (unsigned char)*CurPtr
++;
/// Return the character at CurPtr as a non-negative int without advancing.
/// NOTE(review): the statement guarded by the end-of-buffer test is not
/// visible in this listing (presumably an EOF return, mirroring getNextChar)
/// - confirm against the complete source.
68 int AsmLexer::peekNextChar() {
69 if (CurPtr
== CurBuf
.end())
71 return (unsigned char)*CurPtr
;
/// Lex the remainder of a decimal floating-point literal and return it as an
/// AsmToken::Real spanning TokStart..CurPtr.
74 /// The leading integral digit sequence and dot should have already been
75 /// consumed, some or all of the fractional digit sequence *can* have been
/// NOTE(review): the loop bodies and pointer advances are not visible in this
/// listing; the inline notes below describe only the visible conditions.
77 AsmToken
AsmLexer::LexFloatLiteral() {
78 // Skip the fractional digit sequence.
79 while (isDigit(*CurPtr
))
// A sign directly after the digits, with no exponent marker, is an error.
82 if (*CurPtr
== '-' || *CurPtr
== '+')
83 return ReturnError(CurPtr
, "invalid sign in float literal");
// Optional exponent part introduced by 'e' or 'E'.
86 if ((*CurPtr
== 'e' || *CurPtr
== 'E')) {
// Optional sign of the exponent.
89 if (*CurPtr
== '-' || *CurPtr
== '+')
// Decimal exponent digits.
92 while (isDigit(*CurPtr
))
96 return AsmToken(AsmToken::Real
,
97 StringRef(TokStart
, CurPtr
- TokStart
));
100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9]+
101 /// while making sure there are enough actual digits around for the constant to
104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105 /// before we get here.
///
/// \param NoIntDigits true when no hex digits were seen between "0x" and the
///        current position; combined with an empty fraction this is an error.
/// \returns an AsmToken::Real spanning TokStart..CurPtr, or an Error token
///          carrying one of the diagnostics below.
106 AsmToken
AsmLexer::LexHexFloatLiteral(bool NoIntDigits
) {
107 assert((*CurPtr
== 'p' || *CurPtr
== 'P' || *CurPtr
== '.') &&
108 "unexpected parse state in floating hex");
109 bool NoFracDigits
= true;
111 // Skip the fractional part if there is one
112 if (*CurPtr
== '.') {
115 const char *FracStart
= CurPtr
;
116 while (isHexDigit(*CurPtr
))
// The fraction is empty if the scan above did not move CurPtr.
119 NoFracDigits
= CurPtr
== FracStart
;
// At least one significand digit is required, before or after the dot.
122 if (NoIntDigits
&& NoFracDigits
)
123 return ReturnError(TokStart
, "invalid hexadecimal floating-point constant: "
124 "expected at least one significand digit");
126 // Make sure we do have some kind of proper exponent part
127 if (*CurPtr
!= 'p' && *CurPtr
!= 'P')
128 return ReturnError(TokStart
, "invalid hexadecimal floating-point constant: "
129 "expected exponent part 'p'");
// Optional sign of the exponent.
132 if (*CurPtr
== '+' || *CurPtr
== '-')
135 // N.b. exponent digits are *not* hex
136 const char *ExpStart
= CurPtr
;
137 while (isDigit(*CurPtr
))
// An exponent marker with no digits after it is rejected.
140 if (CurPtr
== ExpStart
)
141 return ReturnError(TokStart
, "invalid hexadecimal floating-point constant: "
142 "expected at least one exponent digit");
144 return AsmToken(AsmToken::Real
, StringRef(TokStart
, CurPtr
- TokStart
));
147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
///
/// isIdentifierChar - Return true if \p C may appear inside an identifier:
/// any alphanumeric character, '_', '$', '.', '?', plus '@' when \p AllowAt
/// is set and '#' when \p AllowHash is set.
148 static bool isIdentifierChar(char C
, bool AllowAt
, bool AllowHash
) {
149 return isAlnum(C
) || C
== '_' || C
== '$' || C
== '.' || C
== '?' ||
150 (AllowAt
&& C
== '@') || (AllowHash
&& C
== '#');
/// Lex an identifier whose first character has already been consumed.
///
/// A leading '.' followed by digits is handed to LexFloatLiteral unless the
/// digits are followed by an identifier character other than an 'e'/'E'
/// exponent marker. A lone '.' yields an AsmToken::Dot; everything else
/// yields an AsmToken::Identifier spanning TokStart..CurPtr.
153 AsmToken
AsmLexer::LexIdentifier() {
154 // Check for floating point literals.
155 if (CurPtr
[-1] == '.' && isDigit(*CurPtr
)) {
156 // Disambiguate a .1243foo identifier from a floating literal.
157 while (isDigit(*CurPtr
))
// Not an identifier continuation (or an exponent marker): it is a float.
160 if (!isIdentifierChar(*CurPtr
, AllowAtInIdentifier
,
161 AllowHashInIdentifier
) ||
162 *CurPtr
== 'e' || *CurPtr
== 'E')
163 return LexFloatLiteral();
// Consume the rest of the identifier.
166 while (isIdentifierChar(*CurPtr
, AllowAtInIdentifier
, AllowHashInIdentifier
))
169 // Handle . as a special case.
170 if (CurPtr
== TokStart
+1 && TokStart
[0] == '.')
171 return AsmToken(AsmToken::Dot
, StringRef(TokStart
, 1));
173 return AsmToken(AsmToken::Identifier
, StringRef(TokStart
, CurPtr
- TokStart
));
176 /// LexSlash: Slash: /
177 /// C-Style Comment: /* ... */
178 /// C-style Comment: // ...
///
/// Returns a Slash token, a Comment token covering a whole /* ... */ block,
/// the result of LexLineComment, or an Error token for an unterminated
/// block comment.
/// NOTE(review): the dispatch on the character after '/' and the body of the
/// scanning loop are not visible in this listing; the inline notes describe
/// only what the visible statements do.
179 AsmToken
AsmLexer::LexSlash() {
// Targets that do not allow additional comment styles only ever see a
// plain slash here.
180 if (!MAI
.shouldAllowAdditionalComments()) {
181 IsAtStartOfStatement
= false;
182 return AsmToken(AsmToken::Slash
, StringRef(TokStart
, 1));
187 IsAtStartOfStatement
= false;
188 break; // C style comment.
191 return LexLineComment();
193 IsAtStartOfStatement
= false;
194 return AsmToken(AsmToken::Slash
, StringRef(TokStart
, 1));
198 ++CurPtr
; // skip the star.
// Comment text reported to the consumer starts right after "/*".
199 const char *CommentTextStart
= CurPtr
;
200 while (CurPtr
!= CurBuf
.end()) {
203 // End of the comment?
206 // If we have a CommentConsumer, notify it about the comment.
207 if (CommentConsumer
) {
208 CommentConsumer
->HandleComment(
209 SMLoc::getFromPointer(CommentTextStart
),
210 StringRef(CommentTextStart
, CurPtr
- 1 - CommentTextStart
));
212 ++CurPtr
; // End the */.
// The Comment token covers the whole construct, delimiters included.
213 return AsmToken(AsmToken::Comment
,
214 StringRef(TokStart
, CurPtr
- TokStart
));
// The buffer ended before the comment was closed.
217 return ReturnError(TokStart
, "unterminated comment");
220 /// LexLineComment: Comment: #[^\n]*
///
/// Consumes up to and including the line terminator ('\n', '\r' or EOF stops
/// the scan) and returns the comment as an EndOfStatement token.
222 AsmToken
AsmLexer::LexLineComment() {
223 // Mark this as an end of statement with a body of the
224 // comment. While it would be nicer to leave this two tokens,
225 // backwards compatibility with TargetParsers makes keeping this in this form
227 const char *CommentTextStart
= CurPtr
;
228 int CurChar
= getNextChar();
229 while (CurChar
!= '\n' && CurChar
!= '\r' && CurChar
!= EOF
)
230 CurChar
= getNextChar();
// Position just past the character that stopped the scan.
231 const char *NewlinePtr
= CurPtr
;
// A CR immediately followed by LF is treated as one line ending.
232 if (CurChar
== '\r' && CurPtr
!= CurBuf
.end() && *CurPtr
== '\n')
235 // If we have a CommentConsumer, notify it about the comment.
// NOTE(review): the "- 1" drops the terminating character; whether that is
// also correct when the scan stopped on EOF depends on getNextChar's
// end-of-buffer behaviour, which is not visible here - confirm.
236 if (CommentConsumer
) {
237 CommentConsumer
->HandleComment(
238 SMLoc::getFromPointer(CommentTextStart
),
239 StringRef(CommentTextStart
, NewlinePtr
- 1 - CommentTextStart
));
242 IsAtStartOfLine
= true;
243 // This is a whole line comment. leave newline
244 if (IsAtStartOfStatement
)
245 return AsmToken(AsmToken::EndOfStatement
,
246 StringRef(TokStart
, CurPtr
- TokStart
));
247 IsAtStartOfStatement
= true;
// Trailing comment after a statement: the token text is one character
// shorter than the consumed range.
249 return AsmToken(AsmToken::EndOfStatement
,
250 StringRef(TokStart
, CurPtr
- 1 - TokStart
));
/// Advance \p CurPtr (passed by reference) past an optional integer type
/// suffix. The visible tests look for one 'U'/'u' followed by up to two
/// 'L'/'l' characters.
/// NOTE(review): the statements guarded by each test are not visible in this
/// listing (presumably pointer increments) - confirm.
253 static void SkipIgnoredIntegerSuffix(const char *&CurPtr
) {
254 // Skip case-insensitive ULL, UL, U, L and LL suffixes.
255 if (CurPtr
[0] == 'U' || CurPtr
[0] == 'u')
257 if (CurPtr
[0] == 'L' || CurPtr
[0] == 'l')
259 if (CurPtr
[0] == 'L' || CurPtr
[0] == 'l')
263 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
264 // integer as a hexadecimal, possibly with leading zeroes.
//
// CurPtr is passed by reference and is repositioned at the end of the digits
// that belong to the literal (see the final assignment below).
// NOTE(review): part of the signature (the declaration of LexHex), the
// scanning loop and the return statement are not visible in this listing;
// callers use the result as a radix - confirm against the complete source.
265 static unsigned doHexLookAhead(const char *&CurPtr
, unsigned DefaultRadix
,
267 const char *FirstNonDec
= nullptr;
268 const char *LookAhead
= CurPtr
;
270 if (isDigit(*LookAhead
)) {
// Remember the first position that holds a non-decimal character.
274 FirstNonDec
= LookAhead
;
276 // Keep going if we are looking for a 'h' suffix.
277 if (LexHex
&& isHexDigit(*LookAhead
))
// The literal is hexadecimal only if hex lexing is enabled and the scan
// stopped on an 'h'/'H' suffix.
283 bool isHex
= LexHex
&& (*LookAhead
== 'h' || *LookAhead
== 'H');
// Hex literal, or nothing but decimal digits seen: accept everything
// scanned. Otherwise stop at the first non-decimal character.
284 CurPtr
= isHex
|| !FirstNonDec
? LookAhead
: FirstNonDec
;
/// Scan forward from \p CurPtr while the current character's hexDigitValue
/// is below \p DefaultRadix, i.e. while it is a valid digit in that radix.
/// NOTE(review): the loop body and the return statement are not visible in
/// this listing; the caller assigns the result back to its own CurPtr, so
/// this presumably returns the first position past the digits - confirm.
290 static const char *findLastDigit(const char *CurPtr
, unsigned DefaultRadix
) {
291 while (hexDigitValue(*CurPtr
) < DefaultRadix
) {
/// Wrap a parsed integer in a token: AsmToken::Integer when \p Value passes
/// isIntN(64), AsmToken::BigNum otherwise. \p Ref is the token's source text.
297 static AsmToken
intToken(StringRef Ref
, APInt
&Value
) {
298 if (Value
.isIntN(64))
299 return AsmToken(AsmToken::Integer
, Ref
, Value
);
300 return AsmToken(AsmToken::BigNum
, Ref
, Value
);
/// Return a human-readable name for \p Radix, used when building the
/// "invalid <name> number" diagnostics. The visible code maps one radix to
/// "hexadecimal" and falls back to "base-<Radix>".
/// NOTE(review): the selection logic and any other named radices are not
/// visible in this listing - confirm against the complete source.
303 static std::string
radixName(unsigned Radix
) {
312 return "hexadecimal";
314 return "base-" + std::to_string(Radix
);
318 /// LexDigit: First character is [0-9].
319 /// Local Label: [0-9][:]
320 /// Forward/Backward Label: [0-9][fb]
321 /// Binary integer: 0b[01]+
322 /// Octal integer: 0[0-7]+
323 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
324 /// Decimal integer: [1-9][0-9]*
///
/// The first character of the literal has already been consumed (the code
/// inspects CurPtr[-1]). Integer results are parsed into a 128-bit APInt and
/// handed to intToken, which picks Integer or BigNum; malformed literals
/// produce an Error token via ReturnError.
/// NOTE(review): many statements (pointer advances, Radix assignments,
/// closing braces) are not visible in this listing; the added notes describe
/// only what the visible statements establish.
325 AsmToken
AsmLexer::LexDigit() {
326 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
327 // MASM-flavor octal integer: [0-7]+[oOqQ]
328 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
329 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
// NOTE(review): this test uses the C library isdigit while the rest of the
// function uses isDigit - confirm the difference is intentional.
330 if (LexMasmIntegers
&& isdigit(CurPtr
[-1])) {
// Track the first character that is not a binary / decimal digit, starting
// with the already-consumed first character.
331 const char *FirstNonBinary
=
332 (CurPtr
[-1] != '0' && CurPtr
[-1] != '1') ? CurPtr
- 1 : nullptr;
333 const char *FirstNonDecimal
=
334 (CurPtr
[-1] < '0' || CurPtr
[-1] > '9') ? CurPtr
- 1 : nullptr;
335 const char *OldCurPtr
= CurPtr
;
336 while (isHexDigit(*CurPtr
)) {
339 if (!FirstNonDecimal
) {
340 FirstNonDecimal
= CurPtr
;
351 if (!FirstNonBinary
) {
352 FirstNonBinary
= CurPtr
;
361 if (*CurPtr
== '.') {
362 // MASM float literals (other than hex floats) always contain a ".", and
363 // are always written in decimal.
365 return LexFloatLiteral();
// With MASM hex floats enabled, an 'r'/'R' suffix yields a Real token.
368 if (LexMasmHexFloats
&& (*CurPtr
== 'r' || *CurPtr
== 'R')) {
370 return AsmToken(AsmToken::Real
, StringRef(TokStart
, CurPtr
- TokStart
));
// Radix selection by suffix character.
374 if (*CurPtr
== 'h' || *CurPtr
== 'H') {
375 // hexadecimal number
378 } else if (*CurPtr
== 't' || *CurPtr
== 'T') {
382 } else if (*CurPtr
== 'o' || *CurPtr
== 'O' || *CurPtr
== 'q' ||
387 } else if (*CurPtr
== 'y' || *CurPtr
== 'Y') {
// 'd'/'D' and 'b'/'B' are themselves hex digits, so they were swallowed by
// the scan above; they count as a suffix only when they are the last
// character scanned.
391 } else if (FirstNonDecimal
&& FirstNonDecimal
+ 1 == CurPtr
&&
393 (*FirstNonDecimal
== 'd' || *FirstNonDecimal
== 'D')) {
395 } else if (FirstNonBinary
&& FirstNonBinary
+ 1 == CurPtr
&&
397 (*FirstNonBinary
== 'b' || *FirstNonBinary
== 'B')) {
402 StringRef
Result(TokStart
, CurPtr
- TokStart
);
403 APInt
Value(128, 0, true);
// drop_back() removes the last character of the token text (presumably the
// radix suffix) before the digits are parsed.
405 if (Result
.drop_back().getAsInteger(Radix
, Value
))
406 return ReturnError(TokStart
, "invalid " + radixName(Radix
) + " number");
408 // MSVC accepts and ignores type suffices on integer literals.
409 SkipIgnoredIntegerSuffix(CurPtr
);
411 return intToken(Result
, Value
);
414 // default-radix integers, or floating point numbers, fall through
418 // MASM default-radix integers: [0-9a-fA-F]+
419 // (All other integer literals have a radix specifier.)
420 if (LexMasmIntegers
&& UseMasmDefaultRadix
) {
// Scan with radix 16 so every possible digit is consumed, then let
// getAsInteger validate the text against the configured DefaultRadix.
421 CurPtr
= findLastDigit(CurPtr
, 16);
422 StringRef
Result(TokStart
, CurPtr
- TokStart
);
424 APInt
Value(128, 0, true);
425 if (Result
.getAsInteger(DefaultRadix
, Value
)) {
426 return ReturnError(TokStart
,
427 "invalid " + radixName(DefaultRadix
) + " number");
430 return intToken(Result
, Value
);
433 // Motorola hex integers: $[0-9a-fA-F]+
434 if (LexMotorolaIntegers
&& CurPtr
[-1] == '$') {
// The '$' prefix is already consumed; the digits start at CurPtr.
435 const char *NumStart
= CurPtr
;
436 while (isHexDigit(CurPtr
[0]))
439 APInt
Result(128, 0);
440 if (StringRef(NumStart
, CurPtr
- NumStart
).getAsInteger(16, Result
))
441 return ReturnError(TokStart
, "invalid hexadecimal number");
// The token text includes the '$' prefix.
443 return intToken(StringRef(TokStart
, CurPtr
- TokStart
), Result
);
446 // Motorola binary integers: %[01]+
447 if (LexMotorolaIntegers
&& CurPtr
[-1] == '%') {
// The '%' prefix is already consumed; the digits start at CurPtr.
448 const char *NumStart
= CurPtr
;
449 while (*CurPtr
== '0' || *CurPtr
== '1')
452 APInt
Result(128, 0);
453 if (StringRef(NumStart
, CurPtr
- NumStart
).getAsInteger(2, Result
))
454 return ReturnError(TokStart
, "invalid binary number");
// The token text includes the '%' prefix.
456 return intToken(StringRef(TokStart
, CurPtr
- TokStart
), Result
);
459 // Decimal integer: [1-9][0-9]*
460 // HLASM-flavour decimal integer: [0-9][0-9]*
461 // FIXME: Later on, support for fb for HLASM has to be added in
462 // as they probably would be needed for asm goto
463 if (LexHLASMIntegers
|| CurPtr
[-1] != '0' || CurPtr
[0] == '.') {
464 unsigned Radix
= doHexLookAhead(CurPtr
, 10, LexMasmIntegers
);
466 if (!LexHLASMIntegers
) {
467 bool IsHex
= Radix
== 16;
468 // Check for floating point literals.
469 if (!IsHex
&& (*CurPtr
== '.' || *CurPtr
== 'e' || *CurPtr
== 'E')) {
472 return LexFloatLiteral();
476 StringRef
Result(TokStart
, CurPtr
- TokStart
);
478 APInt
Value(128, 0, true);
479 if (Result
.getAsInteger(Radix
, Value
))
480 return ReturnError(TokStart
, "invalid " + radixName(Radix
) + " number");
482 if (!LexHLASMIntegers
)
483 // The darwin/x86 (and x86-64) assembler accepts and ignores type
484 // suffices on integer literals.
485 SkipIgnoredIntegerSuffix(CurPtr
);
487 return intToken(Result
, Value
);
// From here on the first character was '0': 0b..., 0x..., or octal.
490 if (!LexMasmIntegers
&& ((*CurPtr
== 'b') || (*CurPtr
== 'B'))) {
492 // See if we actually have "0b" as part of something like "jmp 0b\n"
493 if (!isDigit(CurPtr
[0])) {
495 StringRef
Result(TokStart
, CurPtr
- TokStart
);
496 return AsmToken(AsmToken::Integer
, Result
, 0);
498 const char *NumStart
= CurPtr
;
499 while (CurPtr
[0] == '0' || CurPtr
[0] == '1')
502 // Requires at least one binary digit.
503 if (CurPtr
== NumStart
)
504 return ReturnError(TokStart
, "invalid binary number");
506 StringRef
Result(TokStart
, CurPtr
- TokStart
);
508 APInt
Value(128, 0, true);
// substr(2) skips the first two characters of the token text before parsing.
509 if (Result
.substr(2).getAsInteger(2, Value
))
510 return ReturnError(TokStart
, "invalid binary number");
512 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
513 // suffixes on integer literals.
514 SkipIgnoredIntegerSuffix(CurPtr
);
516 return intToken(Result
, Value
);
519 if ((*CurPtr
== 'x') || (*CurPtr
== 'X')) {
521 const char *NumStart
= CurPtr
;
522 while (isHexDigit(CurPtr
[0]))
525 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
526 // diagnosed by LexHexFloatLiteral).
527 if (CurPtr
[0] == '.' || CurPtr
[0] == 'p' || CurPtr
[0] == 'P')
528 return LexHexFloatLiteral(NumStart
== CurPtr
);
530 // Otherwise requires at least one hex digit.
531 if (CurPtr
== NumStart
)
532 return ReturnError(CurPtr
-2, "invalid hexadecimal number");
534 APInt
Result(128, 0);
// Radix 0 asks getAsInteger to infer the base from the literal's prefix.
535 if (StringRef(TokStart
, CurPtr
- TokStart
).getAsInteger(0, Result
))
536 return ReturnError(TokStart
, "invalid hexadecimal number");
538 // Consume the optional [hH].
539 if (LexMasmIntegers
&& (*CurPtr
== 'h' || *CurPtr
== 'H'))
542 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
543 // suffixes on integer literals.
544 SkipIgnoredIntegerSuffix(CurPtr
);
546 return intToken(StringRef(TokStart
, CurPtr
- TokStart
), Result
);
549 // Either octal or hexadecimal.
550 APInt
Value(128, 0, true);
551 unsigned Radix
= doHexLookAhead(CurPtr
, 8, LexMasmIntegers
);
552 StringRef
Result(TokStart
, CurPtr
- TokStart
);
553 if (Result
.getAsInteger(Radix
, Value
))
554 return ReturnError(TokStart
, "invalid " + radixName(Radix
) + " number");
560 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
561 // suffixes on integer literals.
562 SkipIgnoredIntegerSuffix(CurPtr
);
564 return intToken(Result
, Value
);
567 /// LexSingleQuote: Integer: 'b'
///
/// With LexMasmStrings set, a single-quoted run is lexed as a String token in
/// which a doubled quote stands for a literal quote. Otherwise a character
/// literal (optionally a backslash escape) is lexed and returned as an
/// Integer token carrying the character's value.
/// NOTE(review): several guards and the declaration of Value are not visible
/// in this listing; the notes below describe only the visible statements.
568 AsmToken
AsmLexer::LexSingleQuote() {
569 int CurChar
= getNextChar();
572 return ReturnError(TokStart
, "invalid usage of character literals");
574 if (LexMasmStrings
) {
575 while (CurChar
!= EOF
) {
576 if (CurChar
!= '\'') {
577 CurChar
= getNextChar();
578 } else if (peekNextChar() == '\'') {
579 // In MASM single-quote strings, doubled single-quotes mean an escaped
580 // single quote, so should be lexed in.
582 CurChar
= getNextChar();
588 return ReturnError(TokStart
, "unterminated string constant");
// The String token text spans TokStart..CurPtr.
589 return AsmToken(AsmToken::String
, StringRef(TokStart
, CurPtr
- TokStart
));
593 CurChar
= getNextChar();
596 return ReturnError(TokStart
, "unterminated single quote");
598 CurChar
= getNextChar();
601 return ReturnError(TokStart
, "single quote way too long");
603 // The idea here being that 'c' is basically just an integral
605 StringRef Res
= StringRef(TokStart
,CurPtr
- TokStart
);
// Escaped form: Res begins with a quote and a backslash, so the escaped
// character is at index 2.
608 if (Res
.starts_with("\'\\")) {
609 char theChar
= Res
[2];
// Unrecognised escapes evaluate to the escaped character itself.
611 default: Value
= theChar
; break;
612 case '\'': Value
= '\''; break;
613 case 't': Value
= '\t'; break;
614 case 'n': Value
= '\n'; break;
615 case 'b': Value
= '\b'; break;
616 case 'f': Value
= '\f'; break;
617 case 'r': Value
= '\r'; break;
622 return AsmToken(AsmToken::Integer
, Res
, Value
);
625 /// LexQuote: String: "..."
///
/// Returns a String token whose text spans TokStart..CurPtr, or an Error
/// token for an unterminated or disallowed string. With LexMasmStrings set,
/// a doubled double-quote stands for a literal quote; otherwise a backslash
/// causes the following character to be consumed along with it.
/// NOTE(review): several guards are not visible in this listing; the notes
/// below describe only the visible statements.
626 AsmToken
AsmLexer::LexQuote() {
627 int CurChar
= getNextChar();
629 return ReturnError(TokStart
, "invalid usage of string literals");
631 if (LexMasmStrings
) {
632 while (CurChar
!= EOF
) {
633 if (CurChar
!= '"') {
634 CurChar
= getNextChar();
635 } else if (peekNextChar() == '"') {
636 // In MASM double-quoted strings, doubled double-quotes mean an escaped
637 // double quote, so should be lexed in.
639 CurChar
= getNextChar();
645 return ReturnError(TokStart
, "unterminated string constant");
646 return AsmToken(AsmToken::String
, StringRef(TokStart
, CurPtr
- TokStart
));
649 // TODO: does gas allow multiline string constants?
650 while (CurChar
!= '"') {
651 if (CurChar
== '\\') {
// Consume the character following the backslash.
653 CurChar
= getNextChar();
657 return ReturnError(TokStart
, "unterminated string constant");
659 CurChar
= getNextChar();
662 return AsmToken(AsmToken::String
, StringRef(TokStart
, CurPtr
- TokStart
));
/// Scan forward until a comment start, a statement separator, a line ending
/// or the end of the buffer, and return the text from TokStart to the stop
/// position.
/// NOTE(review): the condition reads *CurPtr before comparing CurPtr with
/// CurBuf.end(); that is only safe if the buffer has a readable byte at
/// end() (e.g. a terminating NUL) - confirm, or test the bound first.
/// NOTE(review): the initialisation of TokStart and the loop body are not
/// visible in this listing.
665 StringRef
AsmLexer::LexUntilEndOfStatement() {
668 while (!isAtStartOfComment(CurPtr
) && // Start of line comment.
669 !isAtStatementSeparator(CurPtr
) && // End of statement marker.
670 *CurPtr
!= '\n' && *CurPtr
!= '\r' && CurPtr
!= CurBuf
.end()) {
673 return StringRef(TokStart
, CurPtr
-TokStart
);
/// Scan forward until a line ending or the end of the buffer and return the
/// text from TokStart to the stop position.
/// NOTE(review): the condition reads *CurPtr before comparing CurPtr with
/// CurBuf.end(); that is only safe if the buffer has a readable byte at
/// end() (e.g. a terminating NUL) - confirm, or test the bound first.
/// NOTE(review): the initialisation of TokStart and the loop body are not
/// visible in this listing.
676 StringRef
AsmLexer::LexUntilEndOfLine() {
679 while (*CurPtr
!= '\n' && *CurPtr
!= '\r' && CurPtr
!= CurBuf
.end()) {
682 return StringRef(TokStart
, CurPtr
-TokStart
);
/// Lex up to Buf.size() tokens ahead into \p Buf without disturbing the
/// lexer's visible state.
///
/// TokStart, CurPtr, IsAtStartOfLine and IsAtStartOfStatement are put under
/// SaveAndRestore guards; SkipSpace is temporarily set to \p ShouldSkipSpace
/// and IsPeeking to true. The pending error text and location are captured
/// up front and re-installed with SetError after the loop.
/// NOTE(review): the declaration of ReadCount, the statement taken on Eof
/// and the return statement are not visible in this listing; the caller
/// treats the result as the number of tokens read - confirm.
685 size_t AsmLexer::peekTokens(MutableArrayRef
<AsmToken
> Buf
,
686 bool ShouldSkipSpace
) {
687 SaveAndRestore
SavedTokenStart(TokStart
);
688 SaveAndRestore
SavedCurPtr(CurPtr
);
689 SaveAndRestore
SavedAtStartOfLine(IsAtStartOfLine
);
690 SaveAndRestore
SavedAtStartOfStatement(IsAtStartOfStatement
);
691 SaveAndRestore
SavedSkipSpace(SkipSpace
, ShouldSkipSpace
);
692 SaveAndRestore
SavedIsPeeking(IsPeeking
, true);
693 std::string SavedErr
= getErr();
694 SMLoc SavedErrLoc
= getErrLoc();
697 for (ReadCount
= 0; ReadCount
< Buf
.size(); ++ReadCount
) {
698 AsmToken Token
= LexToken();
700 Buf
[ReadCount
] = Token
;
// Eof is stored in Buf before this check is made.
702 if (Token
.is(AsmToken::Eof
))
// Discard any error produced while peeking by restoring the earlier one.
706 SetError(SavedErrLoc
, SavedErr
);
/// Return true if the text at \p Ptr begins with the target's comment string.
///
/// A one-character comment string is matched on its single character. A
/// comment string whose second character is '#' is matched on its first
/// character only. Anything else is compared in full with strncmp.
/// NOTE(review): the statement guarded by the first test is not visible in
/// this listing (presumably "return false" when comments are restricted to
/// the start of a statement) - confirm.
/// NOTE(review): CommentString[1] is read without a visible check that the
/// string has at least two characters - confirm that an empty comment string
/// cannot reach this point.
710 bool AsmLexer::isAtStartOfComment(const char *Ptr
) {
711 if (MAI
.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement
)
714 StringRef CommentString
= MAI
.getCommentString();
716 if (CommentString
.size() == 1)
717 return CommentString
[0] == Ptr
[0];
719 // Allow # preprocessor comments also be counted as comments for "##" cases
720 if (CommentString
[1] == '#')
721 return CommentString
[0] == Ptr
[0];
723 return strncmp(Ptr
, CommentString
.data(), CommentString
.size()) == 0;
/// Return true if the text at \p Ptr begins with the target's statement
/// separator string (MAI.getSeparatorString()), compared over the
/// separator's full length.
726 bool AsmLexer::isAtStatementSeparator(const char *Ptr
) {
727 return strncmp(Ptr
, MAI
.getSeparatorString(),
728 strlen(MAI
.getSeparatorString())) == 0;
731 AsmToken
AsmLexer::LexToken() {
733 // This always consumes at least one character.
734 int CurChar
= getNextChar();
736 if (!IsPeeking
&& CurChar
== '#' && IsAtStartOfStatement
) {
737 // If this starts with a '#', this may be a cpp
738 // hash directive and otherwise a line comment.
739 AsmToken TokenBuf
[2];
740 MutableArrayRef
<AsmToken
> Buf(TokenBuf
, 2);
741 size_t num
= peekTokens(Buf
, true);
742 // There cannot be a space preceding this
743 if (IsAtStartOfLine
&& num
== 2 && TokenBuf
[0].is(AsmToken::Integer
) &&
744 TokenBuf
[1].is(AsmToken::String
)) {
745 CurPtr
= TokStart
; // reset curPtr;
746 StringRef s
= LexUntilEndOfLine();
749 return AsmToken(AsmToken::HashDirective
, s
);
752 if (MAI
.shouldAllowAdditionalComments())
753 return LexLineComment();
756 if (isAtStartOfComment(TokStart
))
757 return LexLineComment();
759 if (isAtStatementSeparator(TokStart
)) {
760 CurPtr
+= strlen(MAI
.getSeparatorString()) - 1;
761 IsAtStartOfLine
= true;
762 IsAtStartOfStatement
= true;
763 return AsmToken(AsmToken::EndOfStatement
,
764 StringRef(TokStart
, strlen(MAI
.getSeparatorString())));
767 // If we're missing a newline at EOF, make sure we still get an
768 // EndOfStatement token before the Eof token.
769 if (CurChar
== EOF
&& !IsAtStartOfStatement
&& EndStatementAtEOF
) {
770 IsAtStartOfLine
= true;
771 IsAtStartOfStatement
= true;
772 return AsmToken(AsmToken::EndOfStatement
, StringRef(TokStart
, 0));
774 IsAtStartOfLine
= false;
775 bool OldIsAtStartOfStatement
= IsAtStartOfStatement
;
776 IsAtStartOfStatement
= false;
779 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
780 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
781 // an identifier is target-dependent. These characters are handled in the
782 // respective switch cases.
783 if (isalpha(CurChar
) || CurChar
== '_' || CurChar
== '.')
784 return LexIdentifier();
786 // Unknown character, emit an error.
787 return ReturnError(TokStart
, "invalid character in input");
789 if (EndStatementAtEOF
) {
790 IsAtStartOfLine
= true;
791 IsAtStartOfStatement
= true;
793 return AsmToken(AsmToken::Eof
, StringRef(TokStart
, 0));
797 IsAtStartOfStatement
= OldIsAtStartOfStatement
;
798 while (*CurPtr
== ' ' || *CurPtr
== '\t')
801 return LexToken(); // Ignore whitespace.
803 return AsmToken(AsmToken::Space
, StringRef(TokStart
, CurPtr
- TokStart
));
805 IsAtStartOfLine
= true;
806 IsAtStartOfStatement
= true;
807 // If this is a CR followed by LF, treat that as one token.
808 if (CurPtr
!= CurBuf
.end() && *CurPtr
== '\n')
810 return AsmToken(AsmToken::EndOfStatement
,
811 StringRef(TokStart
, CurPtr
- TokStart
));
814 IsAtStartOfLine
= true;
815 IsAtStartOfStatement
= true;
816 return AsmToken(AsmToken::EndOfStatement
, StringRef(TokStart
, 1));
817 case ':': return AsmToken(AsmToken::Colon
, StringRef(TokStart
, 1));
818 case '+': return AsmToken(AsmToken::Plus
, StringRef(TokStart
, 1));
819 case '~': return AsmToken(AsmToken::Tilde
, StringRef(TokStart
, 1));
820 case '(': return AsmToken(AsmToken::LParen
, StringRef(TokStart
, 1));
821 case ')': return AsmToken(AsmToken::RParen
, StringRef(TokStart
, 1));
822 case '[': return AsmToken(AsmToken::LBrac
, StringRef(TokStart
, 1));
823 case ']': return AsmToken(AsmToken::RBrac
, StringRef(TokStart
, 1));
824 case '{': return AsmToken(AsmToken::LCurly
, StringRef(TokStart
, 1));
825 case '}': return AsmToken(AsmToken::RCurly
, StringRef(TokStart
, 1));
826 case '*': return AsmToken(AsmToken::Star
, StringRef(TokStart
, 1));
827 case ',': return AsmToken(AsmToken::Comma
, StringRef(TokStart
, 1));
829 if (LexMotorolaIntegers
&& isHexDigit(*CurPtr
))
831 if (MAI
.doesAllowDollarAtStartOfIdentifier())
832 return LexIdentifier();
833 return AsmToken(AsmToken::Dollar
, StringRef(TokStart
, 1));
836 if (MAI
.doesAllowAtAtStartOfIdentifier())
837 return LexIdentifier();
838 return AsmToken(AsmToken::At
, StringRef(TokStart
, 1));
840 if (MAI
.doesAllowHashAtStartOfIdentifier())
841 return LexIdentifier();
842 return AsmToken(AsmToken::Hash
, StringRef(TokStart
, 1));
844 if (MAI
.doesAllowQuestionAtStartOfIdentifier())
845 return LexIdentifier();
846 return AsmToken(AsmToken::Question
, StringRef(TokStart
, 1));
847 case '\\': return AsmToken(AsmToken::BackSlash
, StringRef(TokStart
, 1));
849 if (*CurPtr
== '=') {
851 return AsmToken(AsmToken::EqualEqual
, StringRef(TokStart
, 2));
853 return AsmToken(AsmToken::Equal
, StringRef(TokStart
, 1));
855 if (*CurPtr
== '>') {
857 return AsmToken(AsmToken::MinusGreater
, StringRef(TokStart
, 2));
859 return AsmToken(AsmToken::Minus
, StringRef(TokStart
, 1));
861 if (*CurPtr
== '|') {
863 return AsmToken(AsmToken::PipePipe
, StringRef(TokStart
, 2));
865 return AsmToken(AsmToken::Pipe
, StringRef(TokStart
, 1));
866 case '^': return AsmToken(AsmToken::Caret
, StringRef(TokStart
, 1));
868 if (*CurPtr
== '&') {
870 return AsmToken(AsmToken::AmpAmp
, StringRef(TokStart
, 2));
872 return AsmToken(AsmToken::Amp
, StringRef(TokStart
, 1));
874 if (*CurPtr
== '=') {
876 return AsmToken(AsmToken::ExclaimEqual
, StringRef(TokStart
, 2));
878 return AsmToken(AsmToken::Exclaim
, StringRef(TokStart
, 1));
880 if (LexMotorolaIntegers
&& (*CurPtr
== '0' || *CurPtr
== '1')) {
884 if (MAI
.hasMipsExpressions()) {
885 AsmToken::TokenKind Operator
;
886 unsigned OperatorLength
;
888 std::tie(Operator
, OperatorLength
) =
889 StringSwitch
<std::pair
<AsmToken::TokenKind
, unsigned>>(
891 .StartsWith("call16", {AsmToken::PercentCall16
, 7})
892 .StartsWith("call_hi", {AsmToken::PercentCall_Hi
, 8})
893 .StartsWith("call_lo", {AsmToken::PercentCall_Lo
, 8})
894 .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi
, 10})
895 .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo
, 10})
896 .StartsWith("got_disp", {AsmToken::PercentGot_Disp
, 9})
897 .StartsWith("got_hi", {AsmToken::PercentGot_Hi
, 7})
898 .StartsWith("got_lo", {AsmToken::PercentGot_Lo
, 7})
899 .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst
, 9})
900 .StartsWith("got_page", {AsmToken::PercentGot_Page
, 9})
901 .StartsWith("gottprel", {AsmToken::PercentGottprel
, 9})
902 .StartsWith("got", {AsmToken::PercentGot
, 4})
903 .StartsWith("gp_rel", {AsmToken::PercentGp_Rel
, 7})
904 .StartsWith("higher", {AsmToken::PercentHigher
, 7})
905 .StartsWith("highest", {AsmToken::PercentHighest
, 8})
906 .StartsWith("hi", {AsmToken::PercentHi
, 3})
907 .StartsWith("lo", {AsmToken::PercentLo
, 3})
908 .StartsWith("neg", {AsmToken::PercentNeg
, 4})
909 .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi
, 9})
910 .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo
, 9})
911 .StartsWith("tlsgd", {AsmToken::PercentTlsgd
, 6})
912 .StartsWith("tlsldm", {AsmToken::PercentTlsldm
, 7})
913 .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi
, 9})
914 .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo
, 9})
915 .Default({AsmToken::Percent
, 1});
917 if (Operator
!= AsmToken::Percent
) {
918 CurPtr
+= OperatorLength
- 1;
919 return AsmToken(Operator
, StringRef(TokStart
, OperatorLength
));
922 return AsmToken(AsmToken::Percent
, StringRef(TokStart
, 1));
924 IsAtStartOfStatement
= OldIsAtStartOfStatement
;
926 case '\'': return LexSingleQuote();
927 case '"': return LexQuote();
928 case '0': case '1': case '2': case '3': case '4':
929 case '5': case '6': case '7': case '8': case '9':
935 return AsmToken(AsmToken::LessLess
, StringRef(TokStart
, 2));
938 return AsmToken(AsmToken::LessEqual
, StringRef(TokStart
, 2));
941 return AsmToken(AsmToken::LessGreater
, StringRef(TokStart
, 2));
943 return AsmToken(AsmToken::Less
, StringRef(TokStart
, 1));
949 return AsmToken(AsmToken::GreaterGreater
, StringRef(TokStart
, 2));
952 return AsmToken(AsmToken::GreaterEqual
, StringRef(TokStart
, 2));
954 return AsmToken(AsmToken::Greater
, StringRef(TokStart
, 1));
957 // TODO: Quoted identifiers (objc methods etc)
958 // local labels: [0-9][:]
959 // Forward/backward labels: [0-9][fb]
960 // Integers, fp constants, character constants.