tools/llvm-rc/ResourceScriptToken.cpp

   1 //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===---------------------------------------------------------------------===//
   8 //
   9 // This file implements an interface defined in ResourceScriptToken.h.
  10 // In particular, it defines an .rc script tokenizer.
  11 //
  12 //===---------------------------------------------------------------------===//
  13
  14 #include "ResourceScriptToken.h"
  15 #include "llvm/Support/raw_ostream.h"
  16
  17 #include <algorithm>
  18 #include <cassert>
  19 #include <cctype>
  20 #include <cstdlib>
  21 #include <utility>
  22
  23 using namespace llvm;
  24
  25 using Kind = RCToken::Kind;
  26
  27 // Checks if Representation is a correct description of an RC integer.
  28 // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
  29 // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
  30 // character (that is the difference between our representation and
  31 // StringRef's one). If Representation is correct, 'true' is returned and
  32 // the return value is put back in Num.
  33 static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  34   size_t Length = Representation.size();
  35   if (Length == 0)
  36     return false;
  37   // Strip the last 'L' if unnecessary.
  38   if (std::toupper(Representation.back()) == 'L')
  39     Representation = Representation.drop_back(1);
  40
  41   return !Representation.getAsInteger<uint32_t>(0, Num);
  42 }
  43
  44 RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
  45     : TokenKind(RCTokenKind), TokenValue(Value) {}
  46
  47 uint32_t RCToken::intValue() const {
  48   assert(TokenKind == Kind::Int);
  49   // We assume that the token already is a correct integer (checked by
  50   // rcGetAsInteger).
  51   uint32_t Result;
  52   bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  53   assert(IsSuccess);
  54   (void)IsSuccess;  // Silence the compiler warning when -DNDEBUG flag is on.
  55   return Result;
  56 }
  57
  58 bool RCToken::isLongInt() const {
  59   return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
  60 }
  61
  62 StringRef RCToken::value() const { return TokenValue; }
  63
  64 Kind RCToken::kind() const { return TokenKind; }
  65
  66 bool RCToken::isBinaryOp() const {
  67   switch (TokenKind) {
  68   case Kind::Plus:
  69   case Kind::Minus:
  70   case Kind::Pipe:
  71   case Kind::Amp:
  72     return true;
  73   default:
  74     return false;
  75   }
  76 }
  77
  78 static Error getStringError(const Twine &message) {
  79   return make_error<StringError>("Error parsing file: " + message,
  80                                  inconvertibleErrorCode());
  81 }
  82
  83 namespace {
  84
  85 class Tokenizer {
  86 public:
  87   Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}
  88
  89   Expected<std::vector<RCToken>> run();
  90
  91 private:
  92   // All 'advancing' methods return boolean values; if they're equal to false,
  93   // the stream has ended or failed.
  94   bool advance(size_t Amount = 1);
  95   bool skipWhitespaces();
  96
  97   // Consumes a token. If any problem occurred, a non-empty Error is returned.
  98   Error consumeToken(const Kind TokenKind);
  99
 100   // Check if tokenizer is about to read FollowingChars.
 101   bool willNowRead(StringRef FollowingChars) const;
 102
 103   // Check if tokenizer can start reading an identifier at current position.
 104   // The original tool did non specify the rules to determine what is a correct
 105   // identifier. We assume they should follow the C convention:
 106   // [a-zA-Z_][a-zA-Z0-9_]*.
 107   bool canStartIdentifier() const;
 108   // Check if tokenizer can continue reading an identifier.
 109   bool canContinueIdentifier() const;
 110
 111   // Check if tokenizer can start reading an integer.
 112   // A correct integer always starts with a 0-9 digit,
 113   // can contain characters 0-9A-Fa-f (digits),
 114   // Ll (marking the integer is 32-bit), Xx (marking the representation
 115   // is hexadecimal). As some kind of separator should come after the
 116   // integer, we can consume the integer until a non-alphanumeric
 117   // character.
 118   bool canStartInt() const;
 119   bool canContinueInt() const;
 120
 121   bool canStartString() const;
 122
 123   // Check if tokenizer can start reading a single line comment (e.g. a comment
 124   // that begins with '//')
 125   bool canStartLineComment() const;
 126
 127   // Check if tokenizer can start or finish reading a block comment (e.g. a
 128   // comment that begins with '/*' and ends with '*/')
 129   bool canStartBlockComment() const;
 130
 131   // Throw away all remaining characters on the current line.
 132   void skipCurrentLine();
 133
 134   bool streamEof() const;
 135
 136   // Classify the token that is about to be read from the current position.
 137   Kind classifyCurrentToken() const;
 138
 139   // Process the Kind::Identifier token - check if it is
 140   // an identifier describing a block start or end.
 141   void processIdentifier(RCToken &token) const;
 142
 143   StringRef Data;
 144   size_t DataLength, Pos;
 145 };
 146
 147 void Tokenizer::skipCurrentLine() {
 148   Pos = Data.find_first_of("\r\n", Pos);
 149   Pos = Data.find_first_not_of("\r\n", Pos);
 150
 151   if (Pos == StringRef::npos)
 152     Pos = DataLength;
 153 }
 154
 155 Expected<std::vector<RCToken>> Tokenizer::run() {
 156   Pos = 0;
 157   std::vector<RCToken> Result;
 158
 159   // Consume an optional UTF-8 Byte Order Mark.
 160   if (willNowRead("\xef\xbb\xbf"))
 161     advance(3);
 162
 163   while (!streamEof()) {
 164     if (!skipWhitespaces())
 165       break;
 166
 167     Kind TokenKind = classifyCurrentToken();
 168     if (TokenKind == Kind::Invalid)
 169       return getStringError("Invalid token found at position " + Twine(Pos));
 170
 171     const size_t TokenStart = Pos;
 172     if (Error TokenError = consumeToken(TokenKind))
 173       return std::move(TokenError);
 174
 175     // Comments are just deleted, don't bother saving them.
 176     if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
 177       continue;
 178
 179     RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
 180     if (TokenKind == Kind::Identifier) {
 181       processIdentifier(Token);
 182     } else if (TokenKind == Kind::Int) {
 183       uint32_t TokenInt;
 184       if (!rcGetAsInteger(Token.value(), TokenInt)) {
 185         // The integer has incorrect format or cannot be represented in
 186         // a 32-bit integer.
 187         return getStringError("Integer invalid or too large: " +
 188                               Token.value().str());
 189       }
 190     }
 191
 192     Result.push_back(Token);
 193   }
 194
 195   return Result;
 196 }
 197
 198 bool Tokenizer::advance(size_t Amount) {
 199   Pos += Amount;
 200   return !streamEof();
 201 }
 202
 203 bool Tokenizer::skipWhitespaces() {
 204   while (!streamEof() && std::isspace(Data[Pos]))
 205     advance();
 206   return !streamEof();
 207 }
 208
 209 Error Tokenizer::consumeToken(const Kind TokenKind) {
 210   switch (TokenKind) {
 211   // One-character token consumption.
 212 #define TOKEN(Name)
 213 #define SHORT_TOKEN(Name, Ch) case Kind::Name:
 214 #include "ResourceScriptTokenList.def"
 215     advance();
 216     return Error::success();
 217
 218   case Kind::LineComment:
 219     advance(2);
 220     skipCurrentLine();
 221     return Error::success();
 222
 223   case Kind::StartComment: {
 224     advance(2);
 225     auto EndPos = Data.find("*/", Pos);
 226     if (EndPos == StringRef::npos)
 227       return getStringError(
 228           "Unclosed multi-line comment beginning at position " + Twine(Pos));
 229     advance(EndPos - Pos);
 230     advance(2);
 231     return Error::success();
 232   }
 233   case Kind::Identifier:
 234     while (!streamEof() && canContinueIdentifier())
 235       advance();
 236     return Error::success();
 237
 238   case Kind::Int:
 239     while (!streamEof() && canContinueInt())
 240       advance();
 241     return Error::success();
 242
 243   case Kind::String:
 244     // Consume the preceding 'L', if there is any.
 245     if (std::toupper(Data[Pos]) == 'L')
 246       advance();
 247     // Consume the double-quote.
 248     advance();
 249
 250     // Consume the characters until the end of the file, line or string.
 251     while (true) {
 252       if (streamEof()) {
 253         return getStringError("Unterminated string literal.");
 254       } else if (Data[Pos] == '"') {
 255         // Consume the ending double-quote.
 256         advance();
 257         // However, if another '"' follows this double-quote, the string didn't
 258         // end and we just included '"' into the string.
 259         if (!willNowRead("\""))
 260           return Error::success();
 261       } else if (Data[Pos] == '\n') {
 262         return getStringError("String literal not terminated in the line.");
 263       }
 264
 265       advance();
 266     }
 267
 268   case Kind::Invalid:
 269     assert(false && "Cannot consume an invalid token.");
 270   }
 271
 272   llvm_unreachable("Unknown RCToken::Kind");
 273 }
 274
 275 bool Tokenizer::willNowRead(StringRef FollowingChars) const {
 276   return Data.drop_front(Pos).startswith(FollowingChars);
 277 }
 278
 279 bool Tokenizer::canStartIdentifier() const {
 280   assert(!streamEof());
 281
 282   const char CurChar = Data[Pos];
 283   return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
 284 }
 285
 286 bool Tokenizer::canContinueIdentifier() const {
 287   assert(!streamEof());
 288   const char CurChar = Data[Pos];
 289   return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
 290          CurChar == '/' || CurChar == '\\';
 291 }
 292
 293 bool Tokenizer::canStartInt() const {
 294   assert(!streamEof());
 295   return std::isdigit(Data[Pos]);
 296 }
 297
 298 bool Tokenizer::canStartBlockComment() const {
 299   assert(!streamEof());
 300   return Data.drop_front(Pos).startswith("/*");
 301 }
 302
 303 bool Tokenizer::canStartLineComment() const {
 304   assert(!streamEof());
 305   return Data.drop_front(Pos).startswith("//");
 306 }
 307
 308 bool Tokenizer::canContinueInt() const {
 309   assert(!streamEof());
 310   return std::isalnum(Data[Pos]);
 311 }
 312
 313 bool Tokenizer::canStartString() const {
 314   return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
 315 }
 316
 317 bool Tokenizer::streamEof() const { return Pos == DataLength; }
 318
 319 Kind Tokenizer::classifyCurrentToken() const {
 320   if (canStartBlockComment())
 321     return Kind::StartComment;
 322   if (canStartLineComment())
 323     return Kind::LineComment;
 324
 325   if (canStartInt())
 326     return Kind::Int;
 327   if (canStartString())
 328     return Kind::String;
 329   // BEGIN and END are at this point of lexing recognized as identifiers.
 330   if (canStartIdentifier())
 331     return Kind::Identifier;
 332
 333   const char CurChar = Data[Pos];
 334
 335   switch (CurChar) {
 336   // One-character token classification.
 337 #define TOKEN(Name)
 338 #define SHORT_TOKEN(Name, Ch)                                                  \
 339   case Ch:                                                                     \
 340     return Kind::Name;
 341 #include "ResourceScriptTokenList.def"
 342
 343   default:
 344     return Kind::Invalid;
 345   }
 346 }
 347
 348 void Tokenizer::processIdentifier(RCToken &Token) const {
 349   assert(Token.kind() == Kind::Identifier);
 350   StringRef Name = Token.value();
 351
 352   if (Name.equals_lower("begin"))
 353     Token = RCToken(Kind::BlockBegin, Name);
 354   else if (Name.equals_lower("end"))
 355     Token = RCToken(Kind::BlockEnd, Name);
 356 }
 357
 358 } // anonymous namespace
 359
 360 namespace llvm {
 361
 362 Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
 363   return Tokenizer(Input).run();
 364 }
 365
 366 } // namespace llvm