1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "tools/gn/tokenizer.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "tools/gn/input_file.h"
// Returns true if |c| may be the first character of a two-character operator.
// The accepted set is: < > ! = - + | &
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
// Returns true if |c| may be the second character of a two-character
// operator. The accepted set is: = | &
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
// Returns true if |c| may by itself form a one-character operator.
// The accepted set is: = < > + ! : | & -
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
27 bool CouldBeOperator(char c
) {
28 return CouldBeOneCharOperator(c
) || CouldBeTwoCharOperatorBegin(c
);
// Returns true if |c| is one of the bracketing characters: ( ) [ ] { }
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
// Maps the text of an operator token to its specific Token::Type. Run() calls
// this to refine a token first classified as UNCLASSIFIED_OPERATOR.
// Falls through to Token::INVALID when no case matched.
// NOTE(review): the comparisons that select each return value are not visible
// in this excerpt; confirm the operator-string-to-type pairing against the
// full file.
35 Token::Type
GetSpecificOperatorType(base::StringPiece value
) {
43 return Token::PLUS_EQUALS
;
45 return Token::MINUS_EQUALS
;
47 return Token::EQUAL_EQUAL
;
49 return Token::NOT_EQUAL
;
51 return Token::LESS_EQUAL
;
53 return Token::GREATER_EQUAL
;
55 return Token::LESS_THAN
;
57 return Token::GREATER_THAN
;
59 return Token::BOOLEAN_AND
;
61 return Token::BOOLEAN_OR
;
// Nothing matched.
66 return Token::INVALID
;
// Constructs a tokenizer over |input_file|. Keeps the file pointer in
// input_file_ and takes the text to scan from input_file->contents().
// NOTE(review): the remainder of the initializer list is not visible here;
// presumably |err| is stored in err_ (other methods write through err_) --
// confirm against the full file.
71 Tokenizer::Tokenizer(const InputFile
* input_file
, Err
* err
)
72 : input_file_(input_file
),
73 input_(input_file
->contents()),
// Destructor. NOTE(review): the body is not visible in this excerpt.
80 Tokenizer::~Tokenizer() {
// Entry point taking an input file and an error sink: builds a local
// Tokenizer |t| from |input_file| and |err|.
// NOTE(review): the return statement is not visible here; presumably this
// returns the result of running |t| -- confirm against the full file.
84 std::vector
<Token
> Tokenizer::Tokenize(const InputFile
* input_file
, Err
* err
) {
85 Tokenizer
t(input_file
, err
);
// Produces the token list for the input. Requires tokens_ to be empty on
// entry (DCHECK). For each token it:
//   1. records the starting Location,
//   2. classifies the token from its first character (ClassifyCurrent) and
//      stores an error in *err_ if that yields Token::INVALID,
//   3. consumes the token (AdvanceToEndOfToken) and captures its text,
//   4. refines the coarse type (operators, keyword-like identifiers,
//      comment flavours),
//   5. appends the Token to tokens_.
// NOTE(review): the surrounding loop, the early exits after errors, and the
// final return are not visible in this excerpt.
89 std::vector
<Token
> Tokenizer::Run() {
90 DCHECK(tokens_
.empty());
// Remember where the token starts before classifying and consuming it.
95 Location location
= GetCurrentLocation();
97 Token::Type type
= ClassifyCurrent();
98 if (type
== Token::INVALID
) {
99 *err_
= GetErrorForInvalidToken(location
);
// token_begin / token_end are the values of cur_ before and after the token
// is consumed.
102 size_t token_begin
= cur_
;
103 AdvanceToEndOfToken(location
, type
);
106 size_t token_end
= cur_
;
// The token text, built from a pointer into input_ plus a length.
108 base::StringPiece
token_value(&input_
.data()[token_begin
],
109 token_end
- token_begin
);
// Refine the coarse classification now that the full token text is known.
111 if (type
== Token::UNCLASSIFIED_OPERATOR
) {
112 type
= GetSpecificOperatorType(token_value
);
113 } else if (type
== Token::IDENTIFIER
) {
// Identifiers spelled like keywords get dedicated token types.
// NOTE(review): the assignments for "if" and "else" are not visible here.
114 if (token_value
== "if")
116 else if (token_value
== "else")
118 else if (token_value
== "true")
119 type
= Token::TRUE_TOKEN
;
120 else if (token_value
== "false")
121 type
= Token::FALSE_TOKEN
;
122 } else if (type
== Token::UNCLASSIFIED_COMMENT
) {
// A comment at the start of a line is a LINE_COMMENT unless it directly
// continues a SUFFIX_COMMENT on the previous line at the same char offset.
123 if (AtStartOfLine(token_begin
) &&
124 // If it's a standalone comment, but is a continuation of a comment on
125 // a previous line, then instead make it a continued suffix comment.
(tokens_
.empty() || tokens_
.back().type() != Token::SUFFIX_COMMENT
||
127 tokens_
.back().location().line_number() + 1 !=
128 location
.line_number() ||
129 tokens_
.back().location().char_offset() != location
.char_offset())) {
130 type
= Token::LINE_COMMENT
;
131 if (!at_end()) // Could be EOF.
132 Advance(); // The current \n.
133 // If this comment is separated from the next syntax element, then we
134 // want to tag it as a block comment. This will become a standalone
135 // statement at the parser level to keep this comment separate, rather
136 // than attached to the subsequent statement.
137 while (!at_end() && IsCurrentWhitespace()) {
138 if (IsCurrentNewline()) {
139 type
= Token::BLOCK_COMMENT
;
// Otherwise the comment is tagged as a suffix comment.
// NOTE(review): the else branch introducing this assignment is not visible.
145 type
= Token::SUFFIX_COMMENT
;
149 tokens_
.push_back(Token(location
, type
, token_value
));
// NOTE(review): the statement guarded by this check is not visible here.
151 if (err_
->has_error())
// Scans |buf| byte by byte, testing each position with IsNewline(), to
// locate line |n|. Returns static_cast<size_t>(-1) when the scan reaches the
// end of |buf| without producing a result.
// NOTE(review): the line-counting logic, the successful return, and whether
// |n| is 0- or 1-based are not visible in this excerpt.
157 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece
& buf
, int n
) {
165 while (cur_byte
< buf
.size()) {
166 if (IsNewline(buf
, cur_byte
)) {
// Sentinel result: all bits set.
173 return static_cast<size_t>(-1);
177 bool Tokenizer::IsNewline(const base::StringPiece
& buffer
, size_t offset
) {
178 DCHECK(offset
< buffer
.size());
179 // We may need more logic here to handle different line ending styles.
180 return buffer
[offset
] == '\n';
// Loops while the input is not exhausted and the current character is
// whitespace (per IsCurrentWhitespace).
// NOTE(review): the loop body is not visible here; presumably it advances
// the cursor by one character -- confirm against the full file.
184 void Tokenizer::AdvanceToNextToken() {
185 while (!at_end() && IsCurrentWhitespace())
// Determines the coarse token type from the current character. This is a
// const method, so the cursor is not moved.
//   ASCII digit                   -> INTEGER
//   '"'                           -> STRING
//   operator char other than '-'  -> UNCLASSIFIED_OPERATOR
//   identifier first char         -> IDENTIFIER
//   [ ] ( ) { }                   -> the matching bracket/paren/brace type
//   '#'                           -> UNCLASSIFIED_COMMENT
//   '-' followed by a digit       -> INTEGER, otherwise UNCLASSIFIED_OPERATOR
//   anything else                 -> INVALID
// NOTE(review): the results returned for '.' and ',' and the guard in front
// of the end-of-input '-' case are not visible in this excerpt.
189 Token::Type
Tokenizer::ClassifyCurrent() const {
191 char next_char
= cur_char();
192 if (base::IsAsciiDigit(next_char
))
193 return Token::INTEGER
;
194 if (next_char
== '"')
195 return Token::STRING
;
197 // Note: '-' handled specially below.
198 if (next_char
!= '-' && CouldBeOperator(next_char
))
199 return Token::UNCLASSIFIED_OPERATOR
;
201 if (IsIdentifierFirstChar(next_char
))
202 return Token::IDENTIFIER
;
// Scoping characters each have a dedicated token type.
204 if (next_char
== '[')
205 return Token::LEFT_BRACKET
;
206 if (next_char
== ']')
207 return Token::RIGHT_BRACKET
;
208 if (next_char
== '(')
209 return Token::LEFT_PAREN
;
210 if (next_char
== ')')
211 return Token::RIGHT_PAREN
;
212 if (next_char
== '{')
213 return Token::LEFT_BRACE
;
214 if (next_char
== '}')
215 return Token::RIGHT_BRACE
;
217 if (next_char
== '.')
219 if (next_char
== ',')
222 if (next_char
== '#')
223 return Token::UNCLASSIFIED_COMMENT
;
225 // For the case of '-' differentiate between a negative number and anything
227 if (next_char
== '-') {
229 return Token::UNCLASSIFIED_OPERATOR
; // Just the minus before end of
// Peek one character past the '-' to decide between a number and an operator.
231 char following_char
= input_
[cur_
+ 1];
232 if (base::IsAsciiDigit(following_char
))
233 return Token::INTEGER
;
234 return Token::UNCLASSIFIED_OPERATOR
;
// No classification applied.
237 return Token::INVALID
;
// Consumes the token that starts at |location|, handling each token type
// separately. Malformed input is reported by assigning to *err_:
//   - a number followed by a character that is not whitespace, an operator
//     character, a scoping character, or ','
//   - a string that is never terminated
//   - a newline inside a string
//   - a token type with no handler
// NOTE(review): the rest of the signature, the switch skeleton, and several
// statements (loop bodies, breaks, returns) are not visible in this excerpt.
240 void Tokenizer::AdvanceToEndOfToken(const Location
& location
,
// Tail of the loop that consumes ASCII digits.
246 } while (!at_end() && base::IsAsciiDigit(cur_char()));
248 // Require the char after a number to be some kind of space, scope,
251 if (!IsCurrentWhitespace() && !CouldBeOperator(c
) &&
252 !IsScoperChar(c
) && c
!= ',') {
253 *err_
= Err(GetCurrentLocation(),
254 "This is not a valid number.",
256 // Highlight the number.
257 err_
->AppendRange(LocationRange(location
, GetCurrentLocation()));
262 case Token::STRING
: {
// Remember the opening quote so the matching terminator can be recognized.
263 char initial
= cur_char();
264 Advance(); // Advance past initial "
267 *err_
= Err(LocationRange(location
, GetCurrentLocation()),
268 "Unterminated string literal.",
269 "Don't leave me hanging like this!");
272 if (IsCurrentStringTerminator(initial
)) {
273 Advance(); // Skip past last "
275 } else if (IsCurrentNewline()) {
276 *err_
= Err(LocationRange(location
, GetCurrentLocation()),
277 "Newline in string constant.");
284 case Token::UNCLASSIFIED_OPERATOR
:
285 // Some operators are two characters, some are one.
286 if (CouldBeTwoCharOperatorBegin(cur_char())) {
// Only look at the following character when CanIncrement() allows it.
287 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_
[cur_
+ 1]))
293 case Token::IDENTIFIER
:
294 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
298 case Token::LEFT_BRACKET
:
299 case Token::RIGHT_BRACKET
:
300 case Token::LEFT_BRACE
:
301 case Token::RIGHT_BRACE
:
302 case Token::LEFT_PAREN
:
303 case Token::RIGHT_PAREN
:
306 Advance(); // All are one char.
309 case Token::UNCLASSIFIED_COMMENT
:
// A comment extends up to, but not including, the newline or end of input.
311 while (!at_end() && !IsCurrentNewline())
// Error reported for a token type that has no handler.
317 *err_
= Err(location
, "Everything is all messed up",
318 "Please insert system disk in drive A: and press any key.");
// Run() uses this on a comment's starting offset to decide whether the
// comment is standalone. The visible code loops while |location| is greater
// than zero and reads characters of input_ at |location|.
// NOTE(review): the loop body's tests, how |location| is stepped, and the
// return values are not visible here; presumably it walks backwards to the
// preceding newline -- confirm against the full file.
324 bool Tokenizer::AtStartOfLine(size_t location
) const {
325 while (location
> 0) {
327 char c
= input_
[location
];
// Returns true if the character at the cursor is a line feed (0x0A),
// carriage return (0x0D), or space (0x20). No other character counts as
// whitespace here.
336 bool Tokenizer::IsCurrentWhitespace() const {
338 char c
= input_
[cur_
];
339 // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
340 return c
== 0x0A || c
== 0x0D || c
== 0x20;
// Decides whether the character at the cursor ends a string opened with
// |quote_char|. The current character is compared with |quote_char|, then
// the run of backslashes immediately before the cursor is examined: the
// final result is true when that run has even length.
// NOTE(review): the statement taken when the current character differs from
// |quote_char| and the body of the counting loop are not visible here;
// presumably "return false" and "num_backslashes++" respectively.
343 bool Tokenizer::IsCurrentStringTerminator(char quote_char
) const {
345 if (cur_char() != quote_char
)
348 // Check for escaping. \" is not a string terminator, but \\" is. Count
349 // the number of preceding backslashes.
350 int num_backslashes
= 0;
// Walk backwards from the character just before the cursor while
// backslashes continue; stops at the start of input.
351 for (int i
= static_cast<int>(cur_
) - 1; i
>= 0 && input_
[i
] == '\\'; i
--)
354 // Even backslashes mean that they were escaping each other and don't count
355 // as escaping this quote.
356 return (num_backslashes
% 2) == 0;
359 bool Tokenizer::IsCurrentNewline() const {
360 return IsNewline(input_
, cur_
);
// Advances the cursor. Must not be called at the end of input: cur_ is
// DCHECKed to be less than input_.size(). A newline at the cursor takes a
// separate branch.
// NOTE(review): the bodies of both branches are not visible here; presumably
// they update line_number_ / char_in_line_ and increment cur_ -- confirm
// against the full file.
363 void Tokenizer::Advance() {
364 DCHECK(cur_
< input_
.size());
365 if (IsCurrentNewline()) {
// Describes the cursor position as a Location, using input_file_,
// line_number_, char_in_line_, and cur_ cast to int.
// NOTE(review): the head of the return expression is not visible here.
374 Location
Tokenizer::GetCurrentLocation() const {
376 input_file_
, line_number_
, char_in_line_
, static_cast<int>(cur_
));
// Builds the "Invalid token." error at |location|, choosing a help message
// based on the character at the cursor:
//   ';'              -> semicolons are not needed
//   tab              -> tabs must be converted to spaces
//   "//" or "/*"     -> comments start with '#'
//   apostrophe       -> strings use double quotes
//   anything else    -> generic message
// NOTE(review): the declaration of |help| and the final else keyword are not
// visible in this excerpt.
379 Err
Tokenizer::GetErrorForInvalidToken(const Location
& location
) const {
381 if (cur_char() == ';') {
383 help
= "Semicolons are not needed, delete this one.";
384 } else if (cur_char() == '\t') {
386 help
= "You got a tab character in here. Tabs are evil. "
387 "Convert to spaces.";
// The bounds check keeps the look-ahead at cur_ + 1 inside input_.
388 } else if (cur_char() == '/' && cur_
+ 1 < input_
.size() &&
389 (input_
[cur_
+ 1] == '/' || input_
[cur_
+ 1] == '*')) {
390 // Different types of comments.
391 help
= "Comments should start with # instead";
392 } else if (cur_char() == '\'') {
393 help
= "Strings are delimited by \" characters, not apostrophes.";
395 help
= "I have no idea what this is.";
398 return Err(location
, "Invalid token.", help
);