1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "tools/gn/tokenizer.h"
7 #include "base/logging.h"
8 #include "tools/gn/input_file.h"
// Returns true when |c| is one of the characters accepted as the first
// character of a two-character operator: < > ! = - + | &
bool CouldBeTwoCharOperatorBegin(char c) {
  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
         c == '+' || c == '|' || c == '&';
}
// Returns true when |c| is one of the characters accepted as the second
// character of a two-character operator: = | &
bool CouldBeTwoCharOperatorEnd(char c) {
  return c == '=' || c == '|' || c == '&';
}
// Returns true when |c| is one of the characters this predicate accepts as a
// one-character operator candidate: = < > + ! : | & -
bool CouldBeOneCharOperator(char c) {
  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
         c == ':' || c == '|' || c == '&' || c == '-';
}
26 bool CouldBeOperator(char c
) {
27 return CouldBeOneCharOperator(c
) || CouldBeTwoCharOperatorBegin(c
);
// Returns true when |c| is an opening or closing parenthesis, square bracket
// or curly brace.
bool IsScoperChar(char c) {
  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' ||
         c == '}';
}
// Maps the text of an operator token (|value|) to a specific Token::Type.
// The visible returns are PLUS_EQUALS, MINUS_EQUALS, EQUAL_EQUAL, NOT_EQUAL,
// LESS_EQUAL, GREATER_EQUAL, LESS_THAN, GREATER_THAN, BOOLEAN_AND and
// BOOLEAN_OR, followed by a final return of Token::INVALID.
// NOTE(review): the conditions guarding each return, and any returns before
// PLUS_EQUALS, are absent from this view. Presumably each one compares |value|
// against the spelling of the operator; confirm against the complete file.
34 Token::Type
GetSpecificOperatorType(base::StringPiece value
) {
42 return Token::PLUS_EQUALS
;
44 return Token::MINUS_EQUALS
;
46 return Token::EQUAL_EQUAL
;
48 return Token::NOT_EQUAL
;
50 return Token::LESS_EQUAL
;
52 return Token::GREATER_EQUAL
;
54 return Token::LESS_THAN
;
56 return Token::GREATER_THAN
;
58 return Token::BOOLEAN_AND
;
60 return Token::BOOLEAN_OR
;
65 return Token::INVALID
;
// Constructs a tokenizer for |input_file|. The visible initializers store the
// file pointer in input_file_ and initialize input_ from
// input_file->contents().
// NOTE(review): the remaining member initializers (including whatever is done
// with |err|) and the constructor body are absent from this view.
70 Tokenizer::Tokenizer(const InputFile
* input_file
, Err
* err
)
71 : input_file_(input_file
),
72 input_(input_file
->contents()),
// Destructor. NOTE(review): the body and closing brace are absent from this
// view.
79 Tokenizer::~Tokenizer() {
// Convenience entry point: builds a local Tokenizer |t| from |input_file| and
// |err|.
// NOTE(review): the statement that produces the returned std::vector<Token> is
// absent from this view. Presumably it returns the result of t.Run(); confirm.
83 std::vector
<Token
> Tokenizer::Tokenize(const InputFile
* input_file
, Err
* err
) {
84 Tokenizer
t(input_file
, err
);
// Produces the token list. Visible steps:
//  - DCHECKs that tokens_ is empty on entry.
//  - Records the current Location and classifies the token with
//    ClassifyCurrent(); an INVALID classification stores
//    GetErrorForInvalidToken(location) into *err_.
//  - Remembers cur_ before and after AdvanceToEndOfToken() and builds
//    token_value as a StringPiece over that byte range of input_.
//  - Resolves UNCLASSIFIED_OPERATOR through GetSpecificOperatorType().
//  - For IDENTIFIER tokens, compares the text against the keywords if, else,
//    true and false; the true and false branches assign TRUE_TOKEN and
//    FALSE_TOKEN.
//  - Appends every token whose type is not COMMENT to tokens_.
//  - Checks err_->has_error() afterwards.
// NOTE(review): the loop header, the assignments for the if and else keywords,
// the action taken when an error is present, and the return statement are
// absent from this view.
88 std::vector
<Token
> Tokenizer::Run() {
89 DCHECK(tokens_
.empty());
94 Location location
= GetCurrentLocation();
96 Token::Type type
= ClassifyCurrent();
97 if (type
== Token::INVALID
) {
98 *err_
= GetErrorForInvalidToken(location
);
101 size_t token_begin
= cur_
;
102 AdvanceToEndOfToken(location
, type
);
105 size_t token_end
= cur_
;
107 base::StringPiece
token_value(&input_
.data()[token_begin
],
108 token_end
- token_begin
);
110 if (type
== Token::UNCLASSIFIED_OPERATOR
)
111 type
= GetSpecificOperatorType(token_value
);
112 if (type
== Token::IDENTIFIER
) {
113 if (token_value
== "if")
115 else if (token_value
== "else")
117 else if (token_value
== "true")
118 type
= Token::TRUE_TOKEN
;
119 else if (token_value
== "false")
120 type
= Token::FALSE_TOKEN
;
123 // TODO(brettw) This just strips comments from the token stream. This
124 // is probably wrong, they should be removed at a later stage so we can
125 // do things like rewrite the file. But this makes the parser simpler and
127 if (type
!= Token::COMMENT
)
128 tokens_
.push_back(Token(location
, type
, token_value
));
130 if (err_
->has_error())
// Walks |buf| while cur_byte is below buf.size(), testing each position with
// IsNewline(). The last visible statement returns static_cast<size_t>(-1).
// NOTE(review): the line-counting logic and the successful return are absent
// from this view. Presumably the result is the byte offset at which line |n|
// begins, with size_t(-1) meaning the line was not found; confirm.
136 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece
& buf
, int n
) {
144 while (cur_byte
< buf
.size()) {
145 if (IsNewline(buf
, cur_byte
)) {
152 return static_cast<size_t>(-1);
// Returns true when the byte at |offset| in |buffer| is a line feed.
// |offset| must be less than buffer.size(); this is DCHECKed.
156 bool Tokenizer::IsNewline(const base::StringPiece
& buffer
, size_t offset
) {
157 DCHECK(offset
< buffer
.size());
158 // We may need more logic here to handle different line ending styles.
159 return buffer
[offset
] == '\n';
// Loops while the input is not exhausted and the current character is
// whitespace according to IsCurrentWhitespace().
// NOTE(review): the loop body is absent from this view. Presumably it calls
// Advance(); confirm.
163 void Tokenizer::AdvanceToNextToken() {
164 while (!at_end() && IsCurrentWhitespace())
// Classifies the token that begins at the current character (cur_char()).
// Visible checks, in order:
//  - ASCII digit                          -> INTEGER
//  - double quote                         -> STRING
//  - operator char other than minus       -> UNCLASSIFIED_OPERATOR
//  - identifier first char                -> IDENTIFIER
//  - square bracket, paren, curly brace   -> the matching LEFT_/RIGHT_ type
//  - hash sign                            -> COMMENT
//  - minus: INTEGER when the next input byte is an ASCII digit, otherwise
//    UNCLASSIFIED_OPERATOR
//  - anything else                        -> INVALID
// NOTE(review): the returns for the period and comma checks, and the condition
// guarding the first UNCLASSIFIED_OPERATOR return in the minus branch (which
// precedes the read of input_[cur_ + 1]), are absent from this view.
168 Token::Type
Tokenizer::ClassifyCurrent() const {
170 char next_char
= cur_char();
171 if (IsAsciiDigit(next_char
))
172 return Token::INTEGER
;
173 if (next_char
== '"')
174 return Token::STRING
;
176 // Note: '-' handled specially below.
177 if (next_char
!= '-' && CouldBeOperator(next_char
))
178 return Token::UNCLASSIFIED_OPERATOR
;
180 if (IsIdentifierFirstChar(next_char
))
181 return Token::IDENTIFIER
;
183 if (next_char
== '[')
184 return Token::LEFT_BRACKET
;
185 if (next_char
== ']')
186 return Token::RIGHT_BRACKET
;
187 if (next_char
== '(')
188 return Token::LEFT_PAREN
;
189 if (next_char
== ')')
190 return Token::RIGHT_PAREN
;
191 if (next_char
== '{')
192 return Token::LEFT_BRACE
;
193 if (next_char
== '}')
194 return Token::RIGHT_BRACE
;
196 if (next_char
== '.')
198 if (next_char
== ',')
201 if (next_char
== '#')
202 return Token::COMMENT
;
204 // For the case of '-' differentiate between a negative number and anything
206 if (next_char
== '-') {
208 return Token::UNCLASSIFIED_OPERATOR
; // Just the minus before end of
210 char following_char
= input_
[cur_
+ 1];
211 if (IsAsciiDigit(following_char
))
212 return Token::INTEGER
;
213 return Token::UNCLASSIFIED_OPERATOR
;
216 return Token::INVALID
;
// Moves the cursor past the token that starts at |location|.
// NOTE(review): the rest of the parameter list, the dispatch header and many
// statements are absent from this view. The visible case labels suggest a
// switch over the Token::Type; confirm against the complete file.
// Visible behavior:
//  - Number handling consumes characters in a do/while loop while they are
//    ASCII digits. If the following character is not whitespace, an operator
//    char, a scoper char or a comma, an Err is stored in *err_ and the range
//    from |location| to the current location is appended to it.
//  - STRING: remembers the opening quote char, advances past it, stores an Err
//    for an unterminated literal, stops after the terminator reported by
//    IsCurrentStringTerminator(), and stores an Err when a newline is found
//    inside the literal.
//  - UNCLASSIFIED_OPERATOR: checks whether the current char can begin a
//    two-char operator and whether the next input byte can end one.
//  - IDENTIFIER: loops while the current char is an identifier-continuing
//    char.
//  - Bracket, brace and paren tokens: a single Advance().
//  - A loop that runs until end of input or a newline (presumably comment
//    handling; confirm).
//  - A final branch stores an Err at |location|.
219 void Tokenizer::AdvanceToEndOfToken(const Location
& location
,
225 } while (!at_end() && IsAsciiDigit(cur_char()));
227 // Require the char after a number to be some kind of space, scope,
230 if (!IsCurrentWhitespace() && !CouldBeOperator(c
) &&
231 !IsScoperChar(c
) && c
!= ',') {
232 *err_
= Err(GetCurrentLocation(),
233 "This is not a valid number.",
235 // Highlight the number.
236 err_
->AppendRange(LocationRange(location
, GetCurrentLocation()));
241 case Token::STRING
: {
242 char initial
= cur_char();
243 Advance(); // Advance past initial "
246 *err_
= Err(LocationRange(location
, GetCurrentLocation()),
247 "Unterminated string literal.",
248 "Don't leave me hanging like this!");
251 if (IsCurrentStringTerminator(initial
)) {
252 Advance(); // Skip past last "
254 } else if (cur_char() == '\n') {
255 *err_
= Err(LocationRange(location
, GetCurrentLocation()),
256 "Newline in string constant.");
263 case Token::UNCLASSIFIED_OPERATOR
:
264 // Some operators are two characters, some are one.
265 if (CouldBeTwoCharOperatorBegin(cur_char())) {
266 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_
[cur_
+ 1]))
272 case Token::IDENTIFIER
:
273 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
277 case Token::LEFT_BRACKET
:
278 case Token::RIGHT_BRACKET
:
279 case Token::LEFT_BRACE
:
280 case Token::RIGHT_BRACE
:
281 case Token::LEFT_PAREN
:
282 case Token::RIGHT_PAREN
:
285 Advance(); // All are one char.
290 while (!at_end() && !IsCurrentNewline())
296 *err_
= Err(location
, "Everything is all messed up",
297 "Please insert system disk in drive A: and press any key.");
// Returns true when the byte at input_[cur_] is 0x0A (line feed), 0x0B
// (vertical tab), 0x0C (form feed), 0x0D (carriage return) or 0x20 (space).
// Tab (0x09) is deliberately not treated as whitespace.
// NOTE(review): one original line between the signature and the read of
// input_[cur_] is absent from this view.
303 bool Tokenizer::IsCurrentWhitespace() const {
305 char c
= input_
[cur_
];
306 // Note that tab (0x09) is illegal.
307 return c
== 0x0A || c
== 0x0B || c
== 0x0C || c
== 0x0D || c
== 0x20;
// Decides whether the current character terminates a string literal opened
// with |quote_char|. The current character must equal |quote_char|, and the
// final result is true only when num_backslashes is even. The visible loop
// scans backwards from the byte before cur_ while it sees backslashes.
// NOTE(review): the statement executed when the current char differs from
// |quote_char| and the body of the backwards loop are absent from this view.
// Presumably they return false and increment num_backslashes; confirm.
310 bool Tokenizer::IsCurrentStringTerminator(char quote_char
) const {
312 if (cur_char() != quote_char
)
315 // Check for escaping. \" is not a string terminator, but \\" is. Count
316 // the number of preceding backslashes.
317 int num_backslashes
= 0;
318 for (int i
= static_cast<int>(cur_
) - 1; i
>= 0 && input_
[i
] == '\\'; i
--)
321 // Even backslashes mean that they were escaping each other and don't count
322 // as escaping this quote.
323 return (num_backslashes
% 2) == 0;
// Returns IsNewline() evaluated on input_ at offset cur_.
326 bool Tokenizer::IsCurrentNewline() const {
327 return IsNewline(input_
, cur_
);
// Steps the cursor forward. DCHECKs that cur_ is below input_.size() and then
// branches on IsCurrentNewline().
// NOTE(review): both branches and the cursor update are absent from this view.
// Presumably they maintain line_number_ and char_in_line_ and increment cur_;
// confirm.
330 void Tokenizer::Advance() {
331 DCHECK(cur_
< input_
.size());
332 if (IsCurrentNewline()) {
// Returns a Location built from input_file_, line_number_ and char_in_line_.
341 Location
Tokenizer::GetCurrentLocation() const {
342 return Location(input_file_
, line_number_
, char_in_line_
);
// Builds the Err returned for an unrecognized token at |location|. The help
// text depends on the current character: a semicolon, a tab, or a slash
// followed by a slash or star (the bounds check on cur_ + 1 guards that read).
// A generic help text is also visible, and the returned Err always carries the
// same message string.
// NOTE(review): the declaration of |help|, the header of the branch that
// assigns the generic help text, and any other statements on elided lines are
// absent from this view.
345 Err
Tokenizer::GetErrorForInvalidToken(const Location
& location
) const {
347 if (cur_char() == ';') {
349 help
= "Semicolons are not needed, delete this one.";
350 } else if (cur_char() == '\t') {
352 help
= "You got a tab character in here. Tabs are evil. "
353 "Convert to spaces.";
354 } else if (cur_char() == '/' && cur_
+ 1 < input_
.size() &&
355 (input_
[cur_
+ 1] == '/' || input_
[cur_
+ 1] == '*')) {
356 // Different types of comments.
357 help
= "Comments should start with # instead";
359 help
= "I have no idea what this is.";
362 return Err(location
, "Invalid token.", help
);