Fix for PR34888.
[llvm-core.git] / tools/llvm-rc/ResourceScriptToken.cpp
//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks if Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
// character (that is the difference between our representation and
// StringRef's one). If Representation is correct, 'true' is returned and
// the parsed value is stored in Num.
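// For example, "100", "0144", "0x64", and "100L" all describe the value 100,
// while "0x123456789" does not fit in 32 bits and is rejected.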
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  size_t Length = Representation.size();
  if (Length == 0)
    return false;
  // Strip the trailing 'L', if present; getAsInteger doesn't accept it.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}

RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token already is a correct integer (checked by
  // rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
  return Result;
}

bool RCToken::isLongInt() const {
  return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

bool RCToken::isBinaryOp() const {
  switch (TokenKind) {
  case Kind::Plus:
  case Kind::Minus:
  case Kind::Pipe:
  case Kind::Amp:
    return true;
  default:
    return false;
  }
}

static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they're equal to false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
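  // For example, "0x2AL" is consumed as a single Int token; whether it fits
  // in 32 bits is checked later by rcGetAsInteger.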
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  // Check if tokenizer can start reading a single line comment (e.g. a comment
  // that begins with '//')
  bool canStartLineComment() const;

  // Check if tokenizer can start or finish reading a block comment (e.g. a
  // comment that begins with '/*' and ends with '*/')
  bool canStartBlockComment() const;

  // Throw away all remaining characters on the current line.
  void skipCurrentLine();

  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &Token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

void Tokenizer::skipCurrentLine() {
  Pos = Data.find_first_of("\r\n", Pos);
  Pos = Data.find_first_not_of("\r\n", Pos);

  if (Pos == StringRef::npos)
    Pos = DataLength;
}

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    // Comments are just deleted, don't bother saving them.
    if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
      continue;

    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has incorrect format or cannot be represented in
        // a 32-bit integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}

bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    advance();
    return Error::success();

  case Kind::LineComment:
    advance(2);
    skipCurrentLine();
    return Error::success();

  case Kind::StartComment: {
    advance(2);
    auto EndPos = Data.find("*/", Pos);
    if (EndPos == StringRef::npos)
      return getStringError(
          "Unclosed multi-line comment beginning at position " + Twine(Pos));
    advance(EndPos - Pos);
    advance(2);
    return Error::success();
  }

  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        // However, if another '"' follows this double-quote, the string didn't
        // end and we just included '"' into the string.
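        // For example, the input  "an ""escaped"" quote"  lexes as a single
        // String token whose value keeps the doubled quotes verbatim.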
        if (!willNowRead("\""))
          return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canStartBlockComment() const {
  assert(!streamEof());
  return Data.drop_front(Pos).startswith("/*");
}

bool Tokenizer::canStartLineComment() const {
  assert(!streamEof());
  return Data.drop_front(Pos).startswith("//");
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartBlockComment())
    return Kind::StartComment;
  if (canStartLineComment())
    return Kind::LineComment;

  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}

} // anonymous namespace

namespace llvm {

Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}

} // namespace llvm
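
// Illustrative usage sketch (assumed caller code, not part of this file's
// interface beyond tokenizeRC itself):
//
//   Expected<std::vector<RCToken>> Tokens = tokenizeRC("BEGIN 42L END");
//   if (!Tokens)
//     return Tokens.takeError(); // propagate the tokenizer's error
//   for (const RCToken &Token : *Tokens)
//     outs() << Token.value() << "\n";
//
// The input above produces three tokens: BlockBegin ("BEGIN"), Int ("42L",
// with isLongInt() == true and intValue() == 42), and BlockEnd ("END").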