lib/MC/MCParser/AsmLexer.cpp

   1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This class implements the lexer for assembly files.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "llvm/MC/MCParser/AsmLexer.h"
  15 #include "llvm/Support/SMLoc.h"
  16 #include "llvm/Support/MemoryBuffer.h"
  17 #include "llvm/MC/MCAsmInfo.h"
  18 #include <cctype>
  19 #include <cerrno>
  20 #include <cstdio>
  21 #include <cstdlib>
  22 using namespace llvm;
  23
  24 AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI)  {
  25   CurBuf = NULL;
  26   CurPtr = NULL;
  27 }
  28
  29 AsmLexer::~AsmLexer() {
  30 }
  31
  32 void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) {
  33   CurBuf = buf;
  34
  35   if (ptr)
  36     CurPtr = ptr;
  37   else
  38     CurPtr = CurBuf->getBufferStart();
  39
  40   TokStart = 0;
  41 }
  42
  43 /// ReturnError - Set the error to the specified string at the specified
  44 /// location.  This is defined to always return AsmToken::Error.
  45 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
  46   SetError(SMLoc::getFromPointer(Loc), Msg);
  47
  48   return AsmToken(AsmToken::Error, StringRef(Loc, 0));
  49 }
  50
  51 int AsmLexer::getNextChar() {
  52   char CurChar = *CurPtr++;
  53   switch (CurChar) {
  54   default:
  55     return (unsigned char)CurChar;
  56   case 0:
  57     // A nul character in the stream is either the end of the current buffer or
  58     // a random nul in the file.  Disambiguate that here.
  59     if (CurPtr-1 != CurBuf->getBufferEnd())
  60       return 0;  // Just whitespace.
  61
  62     // Otherwise, return end of file.
  63     --CurPtr;  // Another call to lex will return EOF again.
  64     return EOF;
  65   }
  66 }
  67
  68 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
  69 ///
  70 /// The leading integral digit sequence and dot should have already been
  71 /// consumed, some or all of the fractional digit sequence *can* have been
  72 /// consumed.
  73 AsmToken AsmLexer::LexFloatLiteral() {
  74   // Skip the fractional digit sequence.
  75   while (isdigit(*CurPtr))
  76     ++CurPtr;
  77
  78   // Check for exponent; we intentionally accept a slighlty wider set of
  79   // literals here and rely on the upstream client to reject invalid ones (e.g.,
  80   // "1e+").
  81   if (*CurPtr == 'e' || *CurPtr == 'E') {
  82     ++CurPtr;
  83     if (*CurPtr == '-' || *CurPtr == '+')
  84       ++CurPtr;
  85     while (isdigit(*CurPtr))
  86       ++CurPtr;
  87   }
  88
  89   return AsmToken(AsmToken::Real,
  90                   StringRef(TokStart, CurPtr - TokStart));
  91 }
  92
  93 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
  94 static bool IsIdentifierChar(char c) {
  95   return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@';
  96 }
  97 AsmToken AsmLexer::LexIdentifier() {
  98   // Check for floating point literals.
  99   if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
 100     // Disambiguate a .1243foo identifier from a floating literal.
 101     while (isdigit(*CurPtr))
 102       ++CurPtr;
 103     if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr))
 104       return LexFloatLiteral();
 105   }
 106
 107   while (IsIdentifierChar(*CurPtr))
 108     ++CurPtr;
 109
 110   // Handle . as a special case.
 111   if (CurPtr == TokStart+1 && TokStart[0] == '.')
 112     return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
 113
 114   return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
 115 }
 116
 117 /// LexSlash: Slash: /
 118 ///           C-Style Comment: /* ... */
 119 AsmToken AsmLexer::LexSlash() {
 120   switch (*CurPtr) {
 121   case '*': break; // C style comment.
 122   case '/': return ++CurPtr, LexLineComment();
 123   default:  return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
 124   }
 125
 126   // C Style comment.
 127   ++CurPtr;  // skip the star.
 128   while (1) {
 129     int CurChar = getNextChar();
 130     switch (CurChar) {
 131     case EOF:
 132       return ReturnError(TokStart, "unterminated comment");
 133     case '*':
 134       // End of the comment?
 135       if (CurPtr[0] != '/') break;
 136
 137       ++CurPtr;   // End the */.
 138       return LexToken();
 139     }
 140   }
 141 }
 142
 143 /// LexLineComment: Comment: #[^\n]*
 144 ///                        : //[^\n]*
 145 AsmToken AsmLexer::LexLineComment() {
 146   // FIXME: This is broken if we happen to a comment at the end of a file, which
 147   // was .included, and which doesn't end with a newline.
 148   int CurChar = getNextChar();
 149   while (CurChar != '\n' && CurChar != '\n' && CurChar != EOF)
 150     CurChar = getNextChar();
 151
 152   if (CurChar == EOF)
 153     return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0));
 154   return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0));
 155 }
 156
 157 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
 158   if (CurPtr[0] == 'L' && CurPtr[1] == 'L')
 159     CurPtr += 2;
 160   if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L')
 161     CurPtr += 3;
 162 }
 163
 164 /// LexDigit: First character is [0-9].
 165 ///   Local Label: [0-9][:]
 166 ///   Forward/Backward Label: [0-9][fb]
 167 ///   Binary integer: 0b[01]+
 168 ///   Octal integer: 0[0-7]+
 169 ///   Hex integer: 0x[0-9a-fA-F]+
 170 ///   Decimal integer: [1-9][0-9]*
 171 AsmToken AsmLexer::LexDigit() {
 172   // Decimal integer: [1-9][0-9]*
 173   if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
 174     while (isdigit(*CurPtr))
 175       ++CurPtr;
 176
 177     // Check for floating point literals.
 178     if (*CurPtr == '.' || *CurPtr == 'e') {
 179       ++CurPtr;
 180       return LexFloatLiteral();
 181     }
 182
 183     StringRef Result(TokStart, CurPtr - TokStart);
 184
 185     long long Value;
 186     if (Result.getAsInteger(10, Value)) {
 187       // Allow positive values that are too large to fit into a signed 64-bit
 188       // integer, but that do fit in an unsigned one, we just convert them over.
 189       unsigned long long UValue;
 190       if (Result.getAsInteger(10, UValue))
 191         return ReturnError(TokStart, "invalid decimal number");
 192       Value = (long long)UValue;
 193     }
 194
 195     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
 196     // suffixes on integer literals.
 197     SkipIgnoredIntegerSuffix(CurPtr);
 198
 199     return AsmToken(AsmToken::Integer, Result, Value);
 200   }
 201
 202   if (*CurPtr == 'b') {
 203     ++CurPtr;
 204     // See if we actually have "0b" as part of something like "jmp 0b\n"
 205     if (!isdigit(CurPtr[0])) {
 206       --CurPtr;
 207       StringRef Result(TokStart, CurPtr - TokStart);
 208       return AsmToken(AsmToken::Integer, Result, 0);
 209     }
 210     const char *NumStart = CurPtr;
 211     while (CurPtr[0] == '0' || CurPtr[0] == '1')
 212       ++CurPtr;
 213
 214     // Requires at least one binary digit.
 215     if (CurPtr == NumStart)
 216       return ReturnError(TokStart, "invalid binary number");
 217
 218     StringRef Result(TokStart, CurPtr - TokStart);
 219
 220     long long Value;
 221     if (Result.substr(2).getAsInteger(2, Value))
 222       return ReturnError(TokStart, "invalid binary number");
 223
 224     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
 225     // suffixes on integer literals.
 226     SkipIgnoredIntegerSuffix(CurPtr);
 227
 228     return AsmToken(AsmToken::Integer, Result, Value);
 229   }
 230
 231   if (*CurPtr == 'x') {
 232     ++CurPtr;
 233     const char *NumStart = CurPtr;
 234     while (isxdigit(CurPtr[0]))
 235       ++CurPtr;
 236
 237     // Requires at least one hex digit.
 238     if (CurPtr == NumStart)
 239       return ReturnError(CurPtr-2, "invalid hexadecimal number");
 240
 241     unsigned long long Result;
 242     if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
 243       return ReturnError(TokStart, "invalid hexadecimal number");
 244
 245     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
 246     // suffixes on integer literals.
 247     SkipIgnoredIntegerSuffix(CurPtr);
 248
 249     return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart),
 250                     (int64_t)Result);
 251   }
 252
 253   // Must be an octal number, it starts with 0.
 254   while (*CurPtr >= '0' && *CurPtr <= '9')
 255     ++CurPtr;
 256
 257   StringRef Result(TokStart, CurPtr - TokStart);
 258   long long Value;
 259   if (Result.getAsInteger(8, Value))
 260     return ReturnError(TokStart, "invalid octal number");
 261
 262   // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
 263   // suffixes on integer literals.
 264   SkipIgnoredIntegerSuffix(CurPtr);
 265
 266   return AsmToken(AsmToken::Integer, Result, Value);
 267 }
 268
 269 /// LexSingleQuote: Integer: 'b'
 270 AsmToken AsmLexer::LexSingleQuote() {
 271   int CurChar = getNextChar();
 272
 273   if (CurChar == '\\')
 274     CurChar = getNextChar();
 275
 276   if (CurChar == EOF)
 277     return ReturnError(TokStart, "unterminated single quote");
 278
 279   CurChar = getNextChar();
 280
 281   if (CurChar != '\'')
 282     return ReturnError(TokStart, "single quote way too long");
 283
 284   // The idea here being that 'c' is basically just an integral
 285   // constant.
 286   StringRef Res = StringRef(TokStart,CurPtr - TokStart);
 287   long long Value;
 288
 289   if (Res.startswith("\'\\")) {
 290     char theChar = Res[2];
 291     switch (theChar) {
 292       default: Value = theChar; break;
 293       case '\'': Value = '\''; break;
 294       case 't': Value = '\t'; break;
 295       case 'n': Value = '\n'; break;
 296       case 'b': Value = '\b'; break;
 297     }
 298   } else
 299     Value = TokStart[1];
 300
 301   return AsmToken(AsmToken::Integer, Res, Value);
 302 }
 303
 304
 305 /// LexQuote: String: "..."
 306 AsmToken AsmLexer::LexQuote() {
 307   int CurChar = getNextChar();
 308   // TODO: does gas allow multiline string constants?
 309   while (CurChar != '"') {
 310     if (CurChar == '\\') {
 311       // Allow \", etc.
 312       CurChar = getNextChar();
 313     }
 314
 315     if (CurChar == EOF)
 316       return ReturnError(TokStart, "unterminated string constant");
 317
 318     CurChar = getNextChar();
 319   }
 320
 321   return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
 322 }
 323
 324 StringRef AsmLexer::LexUntilEndOfStatement() {
 325   TokStart = CurPtr;
 326
 327   while (!isAtStartOfComment(*CurPtr) &&    // Start of line comment.
 328          !isAtStatementSeparator(CurPtr) && // End of statement marker.
 329          *CurPtr != '\n' &&
 330          *CurPtr != '\r' &&
 331          (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
 332     ++CurPtr;
 333   }
 334   return StringRef(TokStart, CurPtr-TokStart);
 335 }
 336
 337 bool AsmLexer::isAtStartOfComment(char Char) {
 338   // FIXME: This won't work for multi-character comment indicators like "//".
 339   return Char == *MAI.getCommentString();
 340 }
 341
 342 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
 343   return strncmp(Ptr, MAI.getSeparatorString(),
 344                  strlen(MAI.getSeparatorString())) == 0;
 345 }
 346
 347 AsmToken AsmLexer::LexToken() {
 348   TokStart = CurPtr;
 349   // This always consumes at least one character.
 350   int CurChar = getNextChar();
 351
 352   if (isAtStartOfComment(CurChar))
 353     return LexLineComment();
 354   if (isAtStatementSeparator(TokStart)) {
 355     CurPtr += strlen(MAI.getSeparatorString()) - 1;
 356     return AsmToken(AsmToken::EndOfStatement,
 357                     StringRef(TokStart, strlen(MAI.getSeparatorString())));
 358   }
 359
 360   switch (CurChar) {
 361   default:
 362     // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
 363     if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
 364       return LexIdentifier();
 365
 366     // Unknown character, emit an error.
 367     return ReturnError(TokStart, "invalid character in input");
 368   case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
 369   case 0:
 370   case ' ':
 371   case '\t':
 372     // Ignore whitespace.
 373     return LexToken();
 374   case '\n': // FALL THROUGH.
 375   case '\r':
 376     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
 377   case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
 378   case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
 379   case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
 380   case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
 381   case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
 382   case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
 383   case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
 384   case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
 385   case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
 386   case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
 387   case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
 388   case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
 389   case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
 390   case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
 391   case '=':
 392     if (*CurPtr == '=')
 393       return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
 394     return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
 395   case '|':
 396     if (*CurPtr == '|')
 397       return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
 398     return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
 399   case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
 400   case '&':
 401     if (*CurPtr == '&')
 402       return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
 403     return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
 404   case '!':
 405     if (*CurPtr == '=')
 406       return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
 407     return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
 408   case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
 409   case '/': return LexSlash();
 410   case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
 411   case '\'': return LexSingleQuote();
 412   case '"': return LexQuote();
 413   case '0': case '1': case '2': case '3': case '4':
 414   case '5': case '6': case '7': case '8': case '9':
 415     return LexDigit();
 416   case '<':
 417     switch (*CurPtr) {
 418     case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
 419                                         StringRef(TokStart, 2));
 420     case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
 421                                         StringRef(TokStart, 2));
 422     case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
 423                                         StringRef(TokStart, 2));
 424     default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
 425     }
 426   case '>':
 427     switch (*CurPtr) {
 428     case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
 429                                         StringRef(TokStart, 2));
 430     case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
 431                                         StringRef(TokStart, 2));
 432     default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
 433     }
 434
 435   // TODO: Quoted identifiers (objc methods etc)
 436   // local labels: [0-9][:]
 437   // Forward/backward labels: [0-9][fb]
 438   // Integers, fp constants, character constants.
 439   }
 440 }