1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "tools/gn/tokenizer.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "tools/gn/input_file.h"
// Returns true if |c| is one of the characters that may start a
// two-character operator: '<', '>', '!', '=', '-', '+', '|' or '&'.
bool CouldBeTwoCharOperatorBegin(char c) {
  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
         c == '+' || c == '|' || c == '&';
}
// Returns true if |c| is one of the characters that may be the second
// character of a two-character operator: '=', '|' or '&'.
bool CouldBeTwoCharOperatorEnd(char c) {
  return c == '=' || c == '|' || c == '&';
}
// Returns true if |c| is one of the characters accepted as a
// single-character operator: '=', '<', '>', '+', '!', ':', '|', '&' or '-'.
bool CouldBeOneCharOperator(char c) {
  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
         c == ':' || c == '|' || c == '&' || c == '-';
}
27 bool CouldBeOperator(char c
) {
28 return CouldBeOneCharOperator(c
) || CouldBeTwoCharOperatorBegin(c
);
// Returns true if |c| is an opening or closing parenthesis, square bracket or
// curly brace.
bool IsScoperChar(char c) {
  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
}
// Maps the text of an operator token, |value|, to a specific Token::Type.
// The last visible return yields Token::INVALID.
// NOTE(review): the condition guarding each return below is absent from this
// listing; presumably each one compares |value| against the spelling of the
// corresponding operator - confirm against the upstream file.
35 Token::Type
GetSpecificOperatorType(base::StringPiece value
) {
43 return Token::PLUS_EQUALS
;
45 return Token::MINUS_EQUALS
;
47 return Token::EQUAL_EQUAL
;
49 return Token::NOT_EQUAL
;
51 return Token::LESS_EQUAL
;
53 return Token::GREATER_EQUAL
;
55 return Token::LESS_THAN
;
57 return Token::GREATER_THAN
;
59 return Token::BOOLEAN_AND
;
61 return Token::BOOLEAN_OR
;
// Final visible return: Token::INVALID.
66 return Token::INVALID
;
// Constructs a tokenizer for |input_file|: stores the pointer in input_file_
// and initializes input_ from input_file->contents().
// NOTE(review): the remainder of the initializer list and the constructor
// body are absent from this listing; |err| is presumably stored for later
// error reporting - confirm.
71 Tokenizer::Tokenizer(const InputFile
* input_file
, Err
* err
)
72 : input_file_(input_file
),
73 input_(input_file
->contents()),
// Destructor.
// NOTE(review): the body and closing brace are absent from this listing.
80 Tokenizer::~Tokenizer() {
// Constructs a temporary Tokenizer |t| from |input_file| and |err|.
// NOTE(review): the statement that produces the returned std::vector<Token>
// is absent from this listing; presumably it returns t.Run() - confirm.
84 std::vector
<Token
> Tokenizer::Tokenize(const InputFile
* input_file
, Err
* err
) {
85 Tokenizer
t(input_file
, err
);
// Tokenizes the input and returns a std::vector<Token>. Expects tokens_ to be
// empty on entry (DCHECK). For each token it records the current location,
// classifies it with ClassifyCurrent(), advances past it with
// AdvanceToEndOfToken(), refines the type, and appends a Token built from the
// location, the type and the input slice [token_begin, token_end) to tokens_.
// An INVALID classification stores GetErrorForInvalidToken() in *err_.
// NOTE(review): the loop header, several branch bodies (e.g. for "if" and
// "else"), the early exits and the return statement are absent from this
// listing.
89 std::vector
<Token
> Tokenizer::Run() {
90 DCHECK(tokens_
.empty());
95 Location location
= GetCurrentLocation();
97 Token::Type type
= ClassifyCurrent();
98 if (type
== Token::INVALID
) {
99 *err_
= GetErrorForInvalidToken(location
);
// Remember where the token starts so its text can be sliced out afterwards.
102 size_t token_begin
= cur_
;
103 AdvanceToEndOfToken(location
, type
);
106 size_t token_end
= cur_
;
// View of the token text inside input_ (no copy).
108 base::StringPiece
token_value(&input_
.data()[token_begin
],
109 token_end
- token_begin
);
// Refine the coarse classification into a specific token type.
111 if (type
== Token::UNCLASSIFIED_OPERATOR
) {
112 type
= GetSpecificOperatorType(token_value
);
113 } else if (type
== Token::IDENTIFIER
) {
114 if (token_value
== "if")
116 else if (token_value
== "else")
118 else if (token_value
== "true")
119 type
= Token::TRUE_TOKEN
;
120 else if (token_value
== "false")
121 type
= Token::FALSE_TOKEN
;
122 } else if (type
== Token::UNCLASSIFIED_COMMENT
) {
123 if (AtStartOfLine(token_begin
) &&
124 // If it's a standalone comment, but is a continuation of a comment on
125 // a previous line, then instead make it a continued suffix comment.
126 (tokens_
.empty() || tokens_
.back().type() != Token::SUFFIX_COMMENT
||
127 tokens_
.back().location().line_number() + 1 !=
128 location
.line_number() ||
129 tokens_
.back().location().char_offset() != location
.char_offset())) {
130 type
= Token::LINE_COMMENT
;
131 Advance(); // The current \n.
132 // If this comment is separated from the next syntax element, then we
133 // want to tag it as a block comment. This will become a standalone
134 // statement at the parser level to keep this comment separate, rather
135 // than attached to the subsequent statement.
136 while (!at_end() && IsCurrentWhitespace()) {
137 if (IsCurrentNewline()) {
138 type
= Token::BLOCK_COMMENT
;
// NOTE(review): presumably the else-branch of the start-of-line test above;
// the branch header is absent from this listing.
144 type
= Token::SUFFIX_COMMENT
;
148 tokens_
.push_back(Token(location
, type
, token_value
));
150 if (err_
->has_error())
// Walks |buf| byte by byte, testing each position with IsNewline(). The last
// visible return yields static_cast<size_t>(-1).
// NOTE(review): the line counting and the success return are absent from this
// listing; presumably the result is the byte offset at which line |n| begins
// and -1 signals that the line was not found - confirm.
156 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece
& buf
, int n
) {
164 while (cur_byte
< buf
.size()) {
165 if (IsNewline(buf
, cur_byte
)) {
172 return static_cast<size_t>(-1);
176 bool Tokenizer::IsNewline(const base::StringPiece
& buffer
, size_t offset
) {
177 DCHECK(offset
< buffer
.size());
178 // We may need more logic here to handle different line ending styles.
179 return buffer
[offset
] == '\n';
// Loops while the input is not exhausted and the current character is
// whitespace.
// NOTE(review): the loop body is absent from this listing; presumably it calls
// Advance() to skip the whitespace - confirm.
183 void Tokenizer::AdvanceToNextToken() {
184 while (!at_end() && IsCurrentWhitespace())
// Returns the coarse Token::Type suggested by the current character:
//   ASCII digit            -> INTEGER
//   '"'                    -> STRING
//   operator char, not '-' -> UNCLASSIFIED_OPERATOR
//   identifier first char  -> IDENTIFIER
//   [ ] ( ) { }            -> the matching LEFT_/RIGHT_ bracket/paren/brace
//   '#'                    -> UNCLASSIFIED_COMMENT
//   '-'                    -> INTEGER if the following character is a digit,
//                             otherwise UNCLASSIFIED_OPERATOR
// The last visible return yields INVALID.
// NOTE(review): the returns for '.' and ',' and the guard in front of the
// first '-' return are absent from this listing.
188 Token::Type
Tokenizer::ClassifyCurrent() const {
190 char next_char
= cur_char();
191 if (IsAsciiDigit(next_char
))
192 return Token::INTEGER
;
193 if (next_char
== '"')
194 return Token::STRING
;
196 // Note: '-' handled specially below.
197 if (next_char
!= '-' && CouldBeOperator(next_char
))
198 return Token::UNCLASSIFIED_OPERATOR
;
200 if (IsIdentifierFirstChar(next_char
))
201 return Token::IDENTIFIER
;
203 if (next_char
== '[')
204 return Token::LEFT_BRACKET
;
205 if (next_char
== ']')
206 return Token::RIGHT_BRACKET
;
207 if (next_char
== '(')
208 return Token::LEFT_PAREN
;
209 if (next_char
== ')')
210 return Token::RIGHT_PAREN
;
211 if (next_char
== '{')
212 return Token::LEFT_BRACE
;
213 if (next_char
== '}')
214 return Token::RIGHT_BRACE
;
216 if (next_char
== '.')
218 if (next_char
== ',')
221 if (next_char
== '#')
222 return Token::UNCLASSIFIED_COMMENT
;
224 // For the case of '-' differentiate between a negative number and anything
226 if (next_char
== '-') {
228 return Token::UNCLASSIFIED_OPERATOR
; // Just the minus before end of
// Peek at the character after the '-' to tell a negative number from an
// operator.
230 char following_char
= input_
[cur_
+ 1];
231 if (IsAsciiDigit(following_char
))
232 return Token::INTEGER
;
233 return Token::UNCLASSIFIED_OPERATOR
;
236 return Token::INVALID
;
// Moves the cursor past the token that starts at |location|, handling each
// token type separately (the calls in this file pass a Token::Type as the
// second argument). Visible behavior:
//   - numbers: consumes digits, then reports "This is not a valid number."
//     in *err_ unless the next character is whitespace, an operator
//     character, a scoper character or ','.
//   - strings: reports "Unterminated string literal." or
//     "Newline in string constant." in *err_, and skips the closing quote
//     when IsCurrentStringTerminator() matches.
//   - operators: checks whether the current and next characters can form a
//     two-character operator.
//   - identifiers: loops while IsIdentifierContinuingChar() holds.
//   - brackets, braces and parens: advances one character.
//   - comments: loops until a newline or the end of input.
// NOTE(review): the rest of the signature, the switch header, the integer
// case label, most loop bodies and the breaks are absent from this listing.
239 void Tokenizer::AdvanceToEndOfToken(const Location
& location
,
245 } while (!at_end() && IsAsciiDigit(cur_char()));
247 // Require the char after a number to be some kind of space, scope,
250 if (!IsCurrentWhitespace() && !CouldBeOperator(c
) &&
251 !IsScoperChar(c
) && c
!= ',') {
252 *err_
= Err(GetCurrentLocation(),
253 "This is not a valid number.",
255 // Highlight the number.
256 err_
->AppendRange(LocationRange(location
, GetCurrentLocation()));
261 case Token::STRING
: {
// Remember the opening quote so the matching terminator can be detected.
262 char initial
= cur_char();
263 Advance(); // Advance past initial "
266 *err_
= Err(LocationRange(location
, GetCurrentLocation()),
267 "Unterminated string literal.",
268 "Don't leave me hanging like this!");
271 if (IsCurrentStringTerminator(initial
)) {
272 Advance(); // Skip past last "
274 } else if (IsCurrentNewline()) {
275 *err_
= Err(LocationRange(location
, GetCurrentLocation()),
276 "Newline in string constant.");
283 case Token::UNCLASSIFIED_OPERATOR
:
284 // Some operators are two characters, some are one.
285 if (CouldBeTwoCharOperatorBegin(cur_char())) {
286 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_
[cur_
+ 1]))
292 case Token::IDENTIFIER
:
293 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
297 case Token::LEFT_BRACKET
:
298 case Token::RIGHT_BRACKET
:
299 case Token::LEFT_BRACE
:
300 case Token::RIGHT_BRACE
:
301 case Token::LEFT_PAREN
:
302 case Token::RIGHT_PAREN
:
305 Advance(); // All are one char.
308 case Token::UNCLASSIFIED_COMMENT
:
310 while (!at_end() && !IsCurrentNewline())
// NOTE(review): presumably the default case for unexpected token types; the
// case label is absent from this listing.
316 *err_
= Err(location
, "Everything is all messed up",
317 "Please insert system disk in drive A: and press any key.");
// Examines characters of input_ while |location| is greater than zero.
// NOTE(review): the update of |location|, the tests applied to |c| and the
// return statements are absent from this listing; presumably it scans backward
// and reports whether only whitespace separates |location| from the start of
// its line - confirm.
323 bool Tokenizer::AtStartOfLine(size_t location
) const {
324 while (location
> 0) {
326 char c
= input_
[location
];
// Returns true if the character at the cursor is a line feed (0x0A), a
// carriage return (0x0D) or a space (0x20).
// NOTE(review): one line between the signature and the read of input_[cur_]
// is absent from this listing.
335 bool Tokenizer::IsCurrentWhitespace() const {
337 char c
= input_
[cur_
];
338 // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
339 return c
== 0x0A || c
== 0x0D || c
== 0x20;
// Decides whether the character at the cursor terminates a string opened with
// |quote_char|. It walks backward over the backslashes immediately before the
// cursor and returns true when num_backslashes is even.
// NOTE(review): the result of the quote comparison and the body of the
// counting loop are absent from this listing; presumably a non-matching
// character returns false and the loop increments num_backslashes - confirm.
342 bool Tokenizer::IsCurrentStringTerminator(char quote_char
) const {
344 if (cur_char() != quote_char
)
347 // Check for escaping. \" is not a string terminator, but \\" is. Count
348 // the number of preceding backslashes.
349 int num_backslashes
= 0;
350 for (int i
= static_cast<int>(cur_
) - 1; i
>= 0 && input_
[i
] == '\\'; i
--)
353 // Even backslashes mean that they were escaping each other and don't count
354 // as escaping this quote.
355 return (num_backslashes
% 2) == 0;
358 bool Tokenizer::IsCurrentNewline() const {
359 return IsNewline(input_
, cur_
);
// Steps the cursor; requires cur_ < input_.size() (DCHECK). Newlines are
// handled in a separate branch.
// NOTE(review): both branch bodies and the cursor increment are absent from
// this listing; presumably the newline branch updates the line/column
// bookkeeping - confirm.
362 void Tokenizer::Advance() {
363 DCHECK(cur_
< input_
.size());
364 if (IsCurrentNewline()) {
// Produces a Location for the cursor from input_file_, line_number_,
// char_in_line_ and cur_ (cast to int).
// NOTE(review): the start of the return expression is absent from this
// listing; presumably these are the arguments of a Location constructor -
// confirm.
373 Location
Tokenizer::GetCurrentLocation() const {
375 input_file_
, line_number_
, char_in_line_
, static_cast<int>(cur_
));
// Builds the "Invalid token." Err for |location|, choosing the help text from
// the character at the cursor: ';', a tab, or the start of a "//" or "/*"
// comment each get a specific hint; the last assignment is a generic message.
// NOTE(review): the declaration of |help|, some lines inside the first two
// branches and the header of the final else branch are absent from this
// listing.
378 Err
Tokenizer::GetErrorForInvalidToken(const Location
& location
) const {
380 if (cur_char() == ';') {
382 help
= "Semicolons are not needed, delete this one.";
383 } else if (cur_char() == '\t') {
385 help
= "You got a tab character in here. Tabs are evil. "
386 "Convert to spaces.";
// Bounds-check cur_ + 1 before peeking at the character after '/'.
387 } else if (cur_char() == '/' && cur_
+ 1 < input_
.size() &&
388 (input_
[cur_
+ 1] == '/' || input_
[cur_
+ 1] == '*')) {
389 // Different types of comments.
390 help
= "Comments should start with # instead";
392 help
= "I have no idea what this is.";
395 return Err(location
, "Invalid token.", help
);