1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "tools/gn/tokenizer.h"
7 #include "base/logging.h"
8 #include "tools/gn/input_file.h"
// Returns true when |c| is an ASCII decimal digit ('0' through '9').
bool IsNumberChar(char c) {
  return c >= '0' && c <= '9';
}
// Returns true when |c| is one of the characters that may appear as the
// first character of a two-character operator: < > ! = - + | &.
bool CouldBeTwoCharOperatorBegin(char c) {
  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
         c == '+' || c == '|' || c == '&';
}
// Returns true when |c| is one of the characters that may appear as the
// second character of a two-character operator: = | &.
bool CouldBeTwoCharOperatorEnd(char c) {
  return c == '=' || c == '|' || c == '&';
}
// Returns true when |c| is one of the characters that may stand alone as a
// single-character operator: = < > + ! : | & -.
bool CouldBeOneCharOperator(char c) {
  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
         c == ':' || c == '|' || c == '&' || c == '-';
}
30 bool CouldBeOperator(char c
) {
31 return CouldBeOneCharOperator(c
) || CouldBeTwoCharOperatorBegin(c
);
// Returns true when |c| is a bracketing character: one of ( ) [ ] { }.
bool IsScoperChar(char c) {
  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
}
38 Token::Type
GetSpecificOperatorType(base::StringPiece value
) {
46 return Token::PLUS_EQUALS
;
48 return Token::MINUS_EQUALS
;
50 return Token::EQUAL_EQUAL
;
52 return Token::NOT_EQUAL
;
54 return Token::LESS_EQUAL
;
56 return Token::GREATER_EQUAL
;
58 return Token::LESS_THAN
;
60 return Token::GREATER_THAN
;
62 return Token::BOOLEAN_AND
;
64 return Token::BOOLEAN_OR
;
67 return Token::INVALID
;
72 Tokenizer::Tokenizer(const InputFile
* input_file
, Err
* err
)
73 : input_file_(input_file
),
74 input_(input_file
->contents()),
81 Tokenizer::~Tokenizer() {
85 std::vector
<Token
> Tokenizer::Tokenize(const InputFile
* input_file
, Err
* err
) {
86 Tokenizer
t(input_file
, err
);
90 std::vector
<Token
> Tokenizer::Run() {
91 DCHECK(tokens_
.empty());
96 Location location
= GetCurrentLocation();
98 Token::Type type
= ClassifyCurrent();
99 if (type
== Token::INVALID
) {
100 *err_
= GetErrorForInvalidToken(location
);
103 size_t token_begin
= cur_
;
104 AdvanceToEndOfToken(location
, type
);
107 size_t token_end
= cur_
;
109 base::StringPiece
token_value(&input_
.data()[token_begin
],
110 token_end
- token_begin
);
112 if (type
== Token::UNCLASSIFIED_OPERATOR
)
113 type
= GetSpecificOperatorType(token_value
);
114 if (type
== Token::IDENTIFIER
) {
115 if (token_value
== "if")
117 else if (token_value
== "else")
119 else if (token_value
== "true")
120 type
= Token::TRUE_TOKEN
;
121 else if (token_value
== "false")
122 type
= Token::FALSE_TOKEN
;
125 // TODO(brettw) This just strips comments from the token stream. This
126 // is probably wrong, they should be removed at a later stage so we can
127 // do things like rewrite the file. But this makes the parser simpler and
129 if (type
!= Token::COMMENT
)
130 tokens_
.push_back(Token(location
, type
, token_value
));
132 if (err_
->has_error())
138 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece
& buf
, int n
) {
147 while (cur_byte
< buf
.size()) {
148 if (IsNewline(buf
, cur_byte
)) {
159 bool Tokenizer::IsNewline(const base::StringPiece
& buffer
, size_t offset
) {
160 DCHECK(offset
< buffer
.size());
161 // We may need more logic here to handle different line ending styles.
162 return buffer
[offset
] == '\n';
166 void Tokenizer::AdvanceToNextToken() {
167 while (!at_end() && IsCurrentWhitespace())
171 Token::Type
Tokenizer::ClassifyCurrent() const {
173 char next_char
= cur_char();
174 if (next_char
>= '0' && next_char
<= '9')
175 return Token::INTEGER
;
176 if (next_char
== '"')
177 return Token::STRING
;
179 // Note: '-' handled specially below.
180 if (next_char
!= '-' && CouldBeOperator(next_char
))
181 return Token::UNCLASSIFIED_OPERATOR
;
183 if (IsIdentifierFirstChar(next_char
))
184 return Token::IDENTIFIER
;
186 if (next_char
== '[')
187 return Token::LEFT_BRACKET
;
188 if (next_char
== ']')
189 return Token::RIGHT_BRACKET
;
190 if (next_char
== '(')
191 return Token::LEFT_PAREN
;
192 if (next_char
== ')')
193 return Token::RIGHT_PAREN
;
194 if (next_char
== '{')
195 return Token::LEFT_BRACE
;
196 if (next_char
== '}')
197 return Token::RIGHT_BRACE
;
199 if (next_char
== ',')
202 if (next_char
== '#')
203 return Token::COMMENT
;
205 // For the case of '-' differentiate between a negative number and anything
207 if (next_char
== '-') {
209 return Token::UNCLASSIFIED_OPERATOR
; // Just the minus before end of
211 char following_char
= input_
[cur_
+ 1];
212 if (following_char
>= '0' && following_char
<= '9')
213 return Token::INTEGER
;
214 return Token::UNCLASSIFIED_OPERATOR
;
217 return Token::INVALID
;
220 void Tokenizer::AdvanceToEndOfToken(const Location
& location
,
226 } while (!at_end() && IsNumberChar(cur_char()));
228 // Require the char after a number to be some kind of space, scope,
231 if (!IsCurrentWhitespace() && !CouldBeOperator(c
) &&
232 !IsScoperChar(c
) && c
!= ',') {
233 *err_
= Err(GetCurrentLocation(),
234 "This is not a valid number.",
236 // Highlight the number.
237 err_
->AppendRange(LocationRange(location
, GetCurrentLocation()));
242 case Token::STRING
: {
243 char initial
= cur_char();
244 Advance(); // Advance past initial "
247 *err_
= Err(LocationRange(location
,
248 Location(input_file_
, line_number_
, char_in_line_
)),
249 "Unterminated string literal.",
250 "Don't leave me hanging like this!");
253 if (IsCurrentStringTerminator(initial
)) {
254 Advance(); // Skip past last "
256 } else if (cur_char() == '\n') {
257 *err_
= Err(LocationRange(location
,
258 GetCurrentLocation()),
259 "Newline in string constant.");
266 case Token::UNCLASSIFIED_OPERATOR
:
267 // Some operators are two characters, some are one.
268 if (CouldBeTwoCharOperatorBegin(cur_char())) {
269 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_
[cur_
+ 1]))
275 case Token::IDENTIFIER
:
276 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
280 case Token::LEFT_BRACKET
:
281 case Token::RIGHT_BRACKET
:
282 case Token::LEFT_BRACE
:
283 case Token::RIGHT_BRACE
:
284 case Token::LEFT_PAREN
:
285 case Token::RIGHT_PAREN
:
287 Advance(); // All are one char.
292 while (!at_end() && !IsCurrentNewline())
298 *err_
= Err(location
, "Everything is all messed up",
299 "Please insert system disk in drive A: and press any key.");
305 bool Tokenizer::IsCurrentWhitespace() const {
307 char c
= input_
[cur_
];
308 // Note that tab (0x09) is illegal.
309 return c
== 0x0A || c
== 0x0B || c
== 0x0C || c
== 0x0D || c
== 0x20;
312 bool Tokenizer::IsCurrentStringTerminator(char quote_char
) const {
314 if (cur_char() != quote_char
)
317 // Check for escaping. \" is not a string terminator, but \\" is. Count
318 // the number of preceding backslashes.
319 int num_backslashes
= 0;
320 for (int i
= static_cast<int>(cur_
) - 1; i
>= 0 && input_
[i
] == '\\'; i
--)
323 // Even backslashes mean that they were escaping each other and don't count
324 // as escaping this quote.
325 return (num_backslashes
% 2) == 0;
328 bool Tokenizer::IsCurrentNewline() const {
329 return IsNewline(input_
, cur_
);
332 void Tokenizer::Advance() {
333 DCHECK(cur_
< input_
.size());
334 if (IsCurrentNewline()) {
343 Location
Tokenizer::GetCurrentLocation() const {
344 return Location(input_file_
, line_number_
, char_in_line_
);
347 Err
Tokenizer::GetErrorForInvalidToken(const Location
& location
) const {
349 if (cur_char() == ';') {
351 help
= "Semicolons are not needed, delete this one.";
352 } else if (cur_char() == '\t') {
354 help
= "You got a tab character in here. Tabs are evil. "
355 "Convert to spaces.";
356 } else if (cur_char() == '/' && cur_
+ 1 < input_
.size() &&
357 (input_
[cur_
+ 1] == '/' || input_
[cur_
+ 1] == '*')) {
358 // Different types of comments.
359 help
= "Comments should start with # instead";
361 help
= "I have no idea what this is.";
364 return Err(location
, "Invalid token.", help
);