[llvm-readobj] - Refine the LLVM-style output to be consistent.
[llvm-complete.git] / tools / llvm-rc / ResourceScriptToken.cpp
blobbb0b3bdab03cff23de5cefe6c0af230a6fea1737
1 //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===---------------------------------------------------------------------===//
8 //
9 // This file implements an interface defined in ResourceScriptToken.h.
10 // In particular, it defines an .rc script tokenizer.
12 //===---------------------------------------------------------------------===//
14 #include "ResourceScriptToken.h"
15 #include "llvm/Support/raw_ostream.h"
17 #include <algorithm>
18 #include <cassert>
19 #include <cctype>
20 #include <cstdlib>
21 #include <utility>
23 using namespace llvm;
25 using Kind = RCToken::Kind;
27 // Checks if Representation is a correct description of an RC integer.
28 // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
29 // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
30 // character (that is the difference between our representation and
31 // StringRef's one). If Representation is correct, 'true' is returned and
32 // the return value is put back in Num.
33 static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
34 size_t Length = Representation.size();
35 if (Length == 0)
36 return false;
37 // Strip the last 'L' if unnecessary.
38 if (std::toupper(Representation.back()) == 'L')
39 Representation = Representation.drop_back(1);
41 return !Representation.getAsInteger<uint32_t>(0, Num);
44 RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
45 : TokenKind(RCTokenKind), TokenValue(Value) {}
47 uint32_t RCToken::intValue() const {
48 assert(TokenKind == Kind::Int);
49 // We assume that the token already is a correct integer (checked by
50 // rcGetAsInteger).
51 uint32_t Result;
52 bool IsSuccess = rcGetAsInteger(TokenValue, Result);
53 assert(IsSuccess);
54 (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
55 return Result;
58 bool RCToken::isLongInt() const {
59 return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
62 StringRef RCToken::value() const { return TokenValue; }
64 Kind RCToken::kind() const { return TokenKind; }
66 bool RCToken::isBinaryOp() const {
67 switch (TokenKind) {
68 case Kind::Plus:
69 case Kind::Minus:
70 case Kind::Pipe:
71 case Kind::Amp:
72 return true;
73 default:
74 return false;
78 static Error getStringError(const Twine &message) {
79 return make_error<StringError>("Error parsing file: " + message,
80 inconvertibleErrorCode());
83 namespace {
85 class Tokenizer {
86 public:
87 Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}
89 Expected<std::vector<RCToken>> run();
91 private:
92 // All 'advancing' methods return boolean values; if they're equal to false,
93 // the stream has ended or failed.
94 bool advance(size_t Amount = 1);
95 bool skipWhitespaces();
97 // Consumes a token. If any problem occurred, a non-empty Error is returned.
98 Error consumeToken(const Kind TokenKind);
100 // Check if tokenizer is about to read FollowingChars.
101 bool willNowRead(StringRef FollowingChars) const;
103 // Check if tokenizer can start reading an identifier at current position.
104 // The original tool did non specify the rules to determine what is a correct
105 // identifier. We assume they should follow the C convention:
106 // [a-zA-Z_][a-zA-Z0-9_]*.
107 bool canStartIdentifier() const;
108 // Check if tokenizer can continue reading an identifier.
109 bool canContinueIdentifier() const;
111 // Check if tokenizer can start reading an integer.
112 // A correct integer always starts with a 0-9 digit,
113 // can contain characters 0-9A-Fa-f (digits),
114 // Ll (marking the integer is 32-bit), Xx (marking the representation
115 // is hexadecimal). As some kind of separator should come after the
116 // integer, we can consume the integer until a non-alphanumeric
117 // character.
118 bool canStartInt() const;
119 bool canContinueInt() const;
121 bool canStartString() const;
123 // Check if tokenizer can start reading a single line comment (e.g. a comment
124 // that begins with '//')
125 bool canStartLineComment() const;
127 // Check if tokenizer can start or finish reading a block comment (e.g. a
128 // comment that begins with '/*' and ends with '*/')
129 bool canStartBlockComment() const;
131 // Throw away all remaining characters on the current line.
132 void skipCurrentLine();
134 bool streamEof() const;
136 // Classify the token that is about to be read from the current position.
137 Kind classifyCurrentToken() const;
139 // Process the Kind::Identifier token - check if it is
140 // an identifier describing a block start or end.
141 void processIdentifier(RCToken &token) const;
143 StringRef Data;
144 size_t DataLength, Pos;
147 void Tokenizer::skipCurrentLine() {
148 Pos = Data.find_first_of("\r\n", Pos);
149 Pos = Data.find_first_not_of("\r\n", Pos);
151 if (Pos == StringRef::npos)
152 Pos = DataLength;
155 Expected<std::vector<RCToken>> Tokenizer::run() {
156 Pos = 0;
157 std::vector<RCToken> Result;
159 // Consume an optional UTF-8 Byte Order Mark.
160 if (willNowRead("\xef\xbb\xbf"))
161 advance(3);
163 while (!streamEof()) {
164 if (!skipWhitespaces())
165 break;
167 Kind TokenKind = classifyCurrentToken();
168 if (TokenKind == Kind::Invalid)
169 return getStringError("Invalid token found at position " + Twine(Pos));
171 const size_t TokenStart = Pos;
172 if (Error TokenError = consumeToken(TokenKind))
173 return std::move(TokenError);
175 // Comments are just deleted, don't bother saving them.
176 if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
177 continue;
179 RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
180 if (TokenKind == Kind::Identifier) {
181 processIdentifier(Token);
182 } else if (TokenKind == Kind::Int) {
183 uint32_t TokenInt;
184 if (!rcGetAsInteger(Token.value(), TokenInt)) {
185 // The integer has incorrect format or cannot be represented in
186 // a 32-bit integer.
187 return getStringError("Integer invalid or too large: " +
188 Token.value().str());
192 Result.push_back(Token);
195 return Result;
198 bool Tokenizer::advance(size_t Amount) {
199 Pos += Amount;
200 return !streamEof();
203 bool Tokenizer::skipWhitespaces() {
204 while (!streamEof() && std::isspace(Data[Pos]))
205 advance();
206 return !streamEof();
209 Error Tokenizer::consumeToken(const Kind TokenKind) {
210 switch (TokenKind) {
211 // One-character token consumption.
212 #define TOKEN(Name)
213 #define SHORT_TOKEN(Name, Ch) case Kind::Name:
214 #include "ResourceScriptTokenList.def"
215 advance();
216 return Error::success();
218 case Kind::LineComment:
219 advance(2);
220 skipCurrentLine();
221 return Error::success();
223 case Kind::StartComment: {
224 advance(2);
225 auto EndPos = Data.find("*/", Pos);
226 if (EndPos == StringRef::npos)
227 return getStringError(
228 "Unclosed multi-line comment beginning at position " + Twine(Pos));
229 advance(EndPos - Pos);
230 advance(2);
231 return Error::success();
233 case Kind::Identifier:
234 while (!streamEof() && canContinueIdentifier())
235 advance();
236 return Error::success();
238 case Kind::Int:
239 while (!streamEof() && canContinueInt())
240 advance();
241 return Error::success();
243 case Kind::String:
244 // Consume the preceding 'L', if there is any.
245 if (std::toupper(Data[Pos]) == 'L')
246 advance();
247 // Consume the double-quote.
248 advance();
250 // Consume the characters until the end of the file, line or string.
251 while (true) {
252 if (streamEof()) {
253 return getStringError("Unterminated string literal.");
254 } else if (Data[Pos] == '"') {
255 // Consume the ending double-quote.
256 advance();
257 // However, if another '"' follows this double-quote, the string didn't
258 // end and we just included '"' into the string.
259 if (!willNowRead("\""))
260 return Error::success();
261 } else if (Data[Pos] == '\n') {
262 return getStringError("String literal not terminated in the line.");
265 advance();
268 case Kind::Invalid:
269 assert(false && "Cannot consume an invalid token.");
272 llvm_unreachable("Unknown RCToken::Kind");
275 bool Tokenizer::willNowRead(StringRef FollowingChars) const {
276 return Data.drop_front(Pos).startswith(FollowingChars);
279 bool Tokenizer::canStartIdentifier() const {
280 assert(!streamEof());
282 const char CurChar = Data[Pos];
283 return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
286 bool Tokenizer::canContinueIdentifier() const {
287 assert(!streamEof());
288 const char CurChar = Data[Pos];
289 return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
290 CurChar == '/' || CurChar == '\\';
293 bool Tokenizer::canStartInt() const {
294 assert(!streamEof());
295 return std::isdigit(Data[Pos]);
298 bool Tokenizer::canStartBlockComment() const {
299 assert(!streamEof());
300 return Data.drop_front(Pos).startswith("/*");
303 bool Tokenizer::canStartLineComment() const {
304 assert(!streamEof());
305 return Data.drop_front(Pos).startswith("//");
308 bool Tokenizer::canContinueInt() const {
309 assert(!streamEof());
310 return std::isalnum(Data[Pos]);
313 bool Tokenizer::canStartString() const {
314 return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
317 bool Tokenizer::streamEof() const { return Pos == DataLength; }
319 Kind Tokenizer::classifyCurrentToken() const {
320 if (canStartBlockComment())
321 return Kind::StartComment;
322 if (canStartLineComment())
323 return Kind::LineComment;
325 if (canStartInt())
326 return Kind::Int;
327 if (canStartString())
328 return Kind::String;
329 // BEGIN and END are at this point of lexing recognized as identifiers.
330 if (canStartIdentifier())
331 return Kind::Identifier;
333 const char CurChar = Data[Pos];
335 switch (CurChar) {
336 // One-character token classification.
337 #define TOKEN(Name)
338 #define SHORT_TOKEN(Name, Ch) \
339 case Ch: \
340 return Kind::Name;
341 #include "ResourceScriptTokenList.def"
343 default:
344 return Kind::Invalid;
348 void Tokenizer::processIdentifier(RCToken &Token) const {
349 assert(Token.kind() == Kind::Identifier);
350 StringRef Name = Token.value();
352 if (Name.equals_lower("begin"))
353 Token = RCToken(Kind::BlockBegin, Name);
354 else if (Name.equals_lower("end"))
355 Token = RCToken(Kind::BlockEnd, Name);
358 } // anonymous namespace
360 namespace llvm {
362 Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
363 return Tokenizer(Input).run();
366 } // namespace llvm