Fix for PR34888.
[llvm-core.git] / tools/llvm-rc/ResourceScriptToken.cpp
//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks if Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
// character (that is the difference between our representation and
// StringRef's one). If Representation is correct, 'true' is returned and
// the parsed value is stored in Num.
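// For example, "100", "0144", "0x64", and "100L" all describe the value 100,
// while "0x123456789" does not fit in 32 bits and is rejected.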
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  size_t Length = Representation.size();
  if (Length == 0)
    return false;
  // Strip the trailing 'L', if present; getAsInteger doesn't accept it.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}

RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token already is a correct integer (checked by
  // rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
  return Result;
}

bool RCToken::isLongInt() const {
  return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

bool RCToken::isBinaryOp() const {
  switch (TokenKind) {
  case Kind::Plus:
  case Kind::Minus:
  case Kind::Pipe:
  case Kind::Amp:
    return true;
  default:
    return false;
  }
}

static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they're equal to false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a correct
  // identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit,
  // can contain characters 0-9A-Fa-f (digits),
  // Ll (marking the integer is 32-bit), Xx (marking the representation
  // is hexadecimal). As some kind of separator should come after the
  // integer, we can consume the integer until a non-alphanumeric
  // character.
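  // For example, "0x2AL" is consumed as a single Int token; whether it fits
  // in 32 bits is checked later by rcGetAsInteger.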
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  // Check if tokenizer can start reading a single line comment (e.g. a comment
  // that begins with '//')
  bool canStartLineComment() const;

  // Check if tokenizer can start or finish reading a block comment (e.g. a
  // comment that begins with '/*' and ends with '*/')
  bool canStartBlockComment() const;

  // Throw away all remaining characters on the current line.
  void skipCurrentLine();

  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &Token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

void Tokenizer::skipCurrentLine() {
  Pos = Data.find_first_of("\r\n", Pos);
  Pos = Data.find_first_not_of("\r\n", Pos);

  if (Pos == StringRef::npos)
    Pos = DataLength;
}

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    // Comments are just deleted, don't bother saving them.
    if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
      continue;

    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has incorrect format or cannot be represented in
        // a 32-bit integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}

bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    advance();
    return Error::success();

  case Kind::LineComment:
    advance(2);
    skipCurrentLine();
    return Error::success();

  case Kind::StartComment: {
    advance(2);
    auto EndPos = Data.find("*/", Pos);
    if (EndPos == StringRef::npos)
      return getStringError(
          "Unclosed multi-line comment beginning at position " + Twine(Pos));
    advance(EndPos - Pos);
    advance(2);
    return Error::success();
  }

  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        // However, if another '"' follows this double-quote, the string didn't
        // end and we just included '"' into the string.
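        // For example, the input  "an ""escaped"" quote"  lexes as a single
        // String token whose value keeps the doubled quotes verbatim.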
        if (!willNowRead("\""))
          return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canStartBlockComment() const {
  assert(!streamEof());
  return Data.drop_front(Pos).startswith("/*");
}

bool Tokenizer::canStartLineComment() const {
  assert(!streamEof());
  return Data.drop_front(Pos).startswith("//");
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartBlockComment())
    return Kind::StartComment;
  if (canStartLineComment())
    return Kind::LineComment;

  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}

} // anonymous namespace

namespace llvm {

Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}

} // namespace llvm
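
// Illustrative usage sketch (assumed caller code, not part of this file's
// interface beyond tokenizeRC itself):
//
//   Expected<std::vector<RCToken>> Tokens = tokenizeRC("BEGIN 42L END");
//   if (!Tokens)
//     return Tokens.takeError(); // propagate the tokenizer's error
//   for (const RCToken &Token : *Tokens)
//     outs() << Token.value() << "\n";
//
// The input above produces three tokens: BlockBegin ("BEGIN"), Int ("42L",
// with isLongInt() == true and intValue() == 42), and BlockEnd ("END").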