1 //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===---------------------------------------------------------------------===//
9 // This file implements an interface defined in ResourceScriptToken.h.
10 // In particular, it defines an .rc script tokenizer.
12 //===---------------------------------------------------------------------===//
14 #include "ResourceScriptToken.h"
15 #include "llvm/ADT/StringExtras.h"
16 #include "llvm/Support/raw_ostream.h"
26 using Kind
= RCToken::Kind
;
28 // Checks if Representation is a correct description of an RC integer.
29 // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
30 // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
31 // character (that is the difference between our representation and
32 // StringRef's one). If Representation is correct, 'true' is returned and
33 // the return value is put back in Num.
34 static bool rcGetAsInteger(StringRef Representation
, uint32_t &Num
) {
35 size_t Length
= Representation
.size();
38 // Strip the last 'L' if unnecessary.
39 if (std::toupper(Representation
.back()) == 'L')
40 Representation
= Representation
.drop_back(1);
42 return !Representation
.getAsInteger
<uint32_t>(0, Num
);
45 RCToken::RCToken(RCToken::Kind RCTokenKind
, StringRef Value
)
46 : TokenKind(RCTokenKind
), TokenValue(Value
) {}
48 uint32_t RCToken::intValue() const {
49 assert(TokenKind
== Kind::Int
);
50 // We assume that the token already is a correct integer (checked by
53 bool IsSuccess
= rcGetAsInteger(TokenValue
, Result
);
55 (void)IsSuccess
; // Silence the compiler warning when -DNDEBUG flag is on.
59 bool RCToken::isLongInt() const {
60 return TokenKind
== Kind::Int
&& std::toupper(TokenValue
.back()) == 'L';
63 StringRef
RCToken::value() const { return TokenValue
; }
65 Kind
RCToken::kind() const { return TokenKind
; }
67 bool RCToken::isBinaryOp() const {
79 static Error
getStringError(const Twine
&message
) {
80 return make_error
<StringError
>("Error parsing file: " + message
,
81 inconvertibleErrorCode());
88 Tokenizer(StringRef Input
) : Data(Input
), DataLength(Input
.size()), Pos(0) {}
90 Expected
<std::vector
<RCToken
>> run();
93 // All 'advancing' methods return boolean values; if they're equal to false,
94 // the stream has ended or failed.
95 bool advance(size_t Amount
= 1);
96 bool skipWhitespaces();
98 // Consumes a token. If any problem occurred, a non-empty Error is returned.
99 Error
consumeToken(const Kind TokenKind
);
101 // Check if tokenizer is about to read FollowingChars.
102 bool willNowRead(StringRef FollowingChars
) const;
// Check if tokenizer can start reading an identifier at current position.
// The original tool did not specify the rules to determine what is a correct
// identifier. We assume they should follow the C convention:
// [a-zA-Z_][a-zA-Z0-9_]*.
108 bool canStartIdentifier() const;
109 // Check if tokenizer can continue reading an identifier.
110 bool canContinueIdentifier() const;
// Check if tokenizer can start reading an integer.
// A correct integer always starts with a 0-9 digit,
// can contain characters 0-9A-Fa-f (digits),
// Ll (marking the integer is 32-bit), Xx (marking the representation
// is hexadecimal). As some kind of separator should come after the
// integer, we can consume the integer until a non-alphanumeric
// character.
119 bool canStartInt() const;
120 bool canContinueInt() const;
122 bool canStartString() const;
// Check if tokenizer can start reading a single line comment (i.e. a comment
// that begins with '//').
126 bool canStartLineComment() const;
// Check if tokenizer can start reading a block comment (i.e. a comment that
// begins with '/*' and ends with '*/').
130 bool canStartBlockComment() const;
132 // Throw away all remaining characters on the current line.
133 void skipCurrentLine();
135 bool streamEof() const;
137 // Classify the token that is about to be read from the current position.
138 Kind
classifyCurrentToken() const;
140 // Process the Kind::Identifier token - check if it is
141 // an identifier describing a block start or end.
142 void processIdentifier(RCToken
&token
) const;
145 size_t DataLength
, Pos
;
148 void Tokenizer::skipCurrentLine() {
149 Pos
= Data
.find_first_of("\r\n", Pos
);
150 Pos
= Data
.find_first_not_of("\r\n", Pos
);
152 if (Pos
== StringRef::npos
)
156 Expected
<std::vector
<RCToken
>> Tokenizer::run() {
158 std::vector
<RCToken
> Result
;
160 // Consume an optional UTF-8 Byte Order Mark.
161 if (willNowRead("\xef\xbb\xbf"))
164 while (!streamEof()) {
165 if (!skipWhitespaces())
168 Kind TokenKind
= classifyCurrentToken();
169 if (TokenKind
== Kind::Invalid
)
170 return getStringError("Invalid token found at position " + Twine(Pos
));
172 const size_t TokenStart
= Pos
;
173 if (Error TokenError
= consumeToken(TokenKind
))
174 return std::move(TokenError
);
176 // Comments are just deleted, don't bother saving them.
177 if (TokenKind
== Kind::LineComment
|| TokenKind
== Kind::StartComment
)
180 RCToken
Token(TokenKind
, Data
.take_front(Pos
).drop_front(TokenStart
));
181 if (TokenKind
== Kind::Identifier
) {
182 processIdentifier(Token
);
183 } else if (TokenKind
== Kind::Int
) {
185 if (!rcGetAsInteger(Token
.value(), TokenInt
)) {
186 // The integer has incorrect format or cannot be represented in
188 return getStringError("Integer invalid or too large: " +
189 Token
.value().str());
193 Result
.push_back(Token
);
199 bool Tokenizer::advance(size_t Amount
) {
204 bool Tokenizer::skipWhitespaces() {
205 while (!streamEof() && isSpace(Data
[Pos
]))
210 Error
Tokenizer::consumeToken(const Kind TokenKind
) {
212 // One-character token consumption.
214 #define SHORT_TOKEN(Name, Ch) case Kind::Name:
215 #include "ResourceScriptTokenList.def"
217 return Error::success();
219 case Kind::LineComment
:
222 return Error::success();
224 case Kind::StartComment
: {
226 auto EndPos
= Data
.find("*/", Pos
);
227 if (EndPos
== StringRef::npos
)
228 return getStringError(
229 "Unclosed multi-line comment beginning at position " + Twine(Pos
));
230 advance(EndPos
- Pos
);
232 return Error::success();
234 case Kind::Identifier
:
235 while (!streamEof() && canContinueIdentifier())
237 return Error::success();
240 while (!streamEof() && canContinueInt())
242 return Error::success();
245 // Consume the preceding 'L', if there is any.
246 if (std::toupper(Data
[Pos
]) == 'L')
248 // Consume the double-quote.
251 // Consume the characters until the end of the file, line or string.
254 return getStringError("Unterminated string literal.");
255 } else if (Data
[Pos
] == '"') {
256 // Consume the ending double-quote.
258 // However, if another '"' follows this double-quote, the string didn't
259 // end and we just included '"' into the string.
260 if (!willNowRead("\""))
261 return Error::success();
262 } else if (Data
[Pos
] == '\n') {
263 return getStringError("String literal not terminated in the line.");
270 assert(false && "Cannot consume an invalid token.");
273 llvm_unreachable("Unknown RCToken::Kind");
276 bool Tokenizer::willNowRead(StringRef FollowingChars
) const {
277 return Data
.drop_front(Pos
).startswith(FollowingChars
);
280 bool Tokenizer::canStartIdentifier() const {
281 assert(!streamEof());
283 const char CurChar
= Data
[Pos
];
284 return std::isalpha(CurChar
) || CurChar
== '_' || CurChar
== '.';
287 bool Tokenizer::canContinueIdentifier() const {
288 assert(!streamEof());
289 const char CurChar
= Data
[Pos
];
290 return std::isalnum(CurChar
) || CurChar
== '_' || CurChar
== '.' ||
291 CurChar
== '/' || CurChar
== '\\' || CurChar
== '-';
294 bool Tokenizer::canStartInt() const {
295 assert(!streamEof());
296 return std::isdigit(Data
[Pos
]);
299 bool Tokenizer::canStartBlockComment() const {
300 assert(!streamEof());
301 return Data
.drop_front(Pos
).startswith("/*");
304 bool Tokenizer::canStartLineComment() const {
305 assert(!streamEof());
306 return Data
.drop_front(Pos
).startswith("//");
309 bool Tokenizer::canContinueInt() const {
310 assert(!streamEof());
311 return std::isalnum(Data
[Pos
]);
314 bool Tokenizer::canStartString() const {
315 return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
318 bool Tokenizer::streamEof() const { return Pos
== DataLength
; }
320 Kind
Tokenizer::classifyCurrentToken() const {
321 if (canStartBlockComment())
322 return Kind::StartComment
;
323 if (canStartLineComment())
324 return Kind::LineComment
;
328 if (canStartString())
330 // BEGIN and END are at this point of lexing recognized as identifiers.
331 if (canStartIdentifier())
332 return Kind::Identifier
;
334 const char CurChar
= Data
[Pos
];
337 // One-character token classification.
339 #define SHORT_TOKEN(Name, Ch) \
342 #include "ResourceScriptTokenList.def"
345 return Kind::Invalid
;
349 void Tokenizer::processIdentifier(RCToken
&Token
) const {
350 assert(Token
.kind() == Kind::Identifier
);
351 StringRef Name
= Token
.value();
353 if (Name
.equals_insensitive("begin"))
354 Token
= RCToken(Kind::BlockBegin
, Name
);
355 else if (Name
.equals_insensitive("end"))
356 Token
= RCToken(Kind::BlockEnd
, Name
);
359 } // anonymous namespace
363 Expected
<std::vector
<RCToken
>> tokenizeRC(StringRef Input
) {
364 return Tokenizer(Input
).run();