// Correct blacklist entry message
// [chromium-blink-merge.git] / tools / gn / tokenizer.cc
// blob 8acabdb46541b6b6a76d5de40fa97d9bdb6dd3cd
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
5 #include "tools/gn/tokenizer.h"
7 #include "base/logging.h"
8 #include "tools/gn/input_file.h"
10 namespace {
// Returns true if |c| is an ASCII decimal digit ('0' through '9').
bool IsNumberChar(char c) {
  return c >= '0' && c <= '9';
}
// Returns true if |c| may be the first character of a two-character operator
// (for example the '<' of "<=" or the '&' of "&&"). This only inspects the
// first character; whether a second operator character actually follows is
// for the caller to check.
bool CouldBeTwoCharOperatorBegin(char c) {
  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
         c == '+' || c == '|' || c == '&';
}
// Returns true if |c| may be the second character of a two-character
// operator: the '=' of "+=", "==", "<=" etc., the '|' of "||", or the '&' of
// "&&".
bool CouldBeTwoCharOperatorEnd(char c) {
  return c == '=' || c == '|' || c == '&';
}
// Returns true if |c| is a character that is treated as a candidate
// single-character operator. This is a coarse first-character filter; the
// exact operator (if any) is decided later from the full token text.
bool CouldBeOneCharOperator(char c) {
  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
         c == ':' || c == '|' || c == '&' || c == '-';
}
30 bool CouldBeOperator(char c) {
31 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
// Returns true if |c| is one of the six bracketing characters:
// parentheses, square brackets, or curly braces (opening or closing).
bool IsScoperChar(char c) {
  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
}
38 Token::Type GetSpecificOperatorType(base::StringPiece value) {
39 if (value == "=")
40 return Token::EQUAL;
41 if (value == "+")
42 return Token::PLUS;
43 if (value == "-")
44 return Token::MINUS;
45 if (value == "+=")
46 return Token::PLUS_EQUALS;
47 if (value == "-=")
48 return Token::MINUS_EQUALS;
49 if (value == "==")
50 return Token::EQUAL_EQUAL;
51 if (value == "!=")
52 return Token::NOT_EQUAL;
53 if (value == "<=")
54 return Token::LESS_EQUAL;
55 if (value == ">=")
56 return Token::GREATER_EQUAL;
57 if (value == "<")
58 return Token::LESS_THAN;
59 if (value == ">")
60 return Token::GREATER_THAN;
61 if (value == "&&")
62 return Token::BOOLEAN_AND;
63 if (value == "||")
64 return Token::BOOLEAN_OR;
65 if (value == "!")
66 return Token::BANG;
67 return Token::INVALID;
70 } // namespace
72 Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
73 : input_file_(input_file),
74 input_(input_file->contents()),
75 err_(err),
76 cur_(0),
77 line_number_(1),
78 char_in_line_(1) {
81 Tokenizer::~Tokenizer() {
84 // static
85 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
86 Tokenizer t(input_file, err);
87 return t.Run();
90 std::vector<Token> Tokenizer::Run() {
91 DCHECK(tokens_.empty());
92 while (!done()) {
93 AdvanceToNextToken();
94 if (done())
95 break;
96 Location location = GetCurrentLocation();
98 Token::Type type = ClassifyCurrent();
99 if (type == Token::INVALID) {
100 *err_ = GetErrorForInvalidToken(location);
101 break;
103 size_t token_begin = cur_;
104 AdvanceToEndOfToken(location, type);
105 if (has_error())
106 break;
107 size_t token_end = cur_;
109 base::StringPiece token_value(&input_.data()[token_begin],
110 token_end - token_begin);
112 if (type == Token::UNCLASSIFIED_OPERATOR)
113 type = GetSpecificOperatorType(token_value);
114 if (type == Token::IDENTIFIER) {
115 if (token_value == "if")
116 type = Token::IF;
117 else if (token_value == "else")
118 type = Token::ELSE;
119 else if (token_value == "true")
120 type = Token::TRUE_TOKEN;
121 else if (token_value == "false")
122 type = Token::FALSE_TOKEN;
125 // TODO(brettw) This just strips comments from the token stream. This
126 // is probably wrong, they should be removed at a later stage so we can
127 // do things like rewrite the file. But this makes the parser simpler and
128 // is OK for now.
129 if (type != Token::COMMENT)
130 tokens_.push_back(Token(location, type, token_value));
132 if (err_->has_error())
133 tokens_.clear();
134 return tokens_;
137 // static
138 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
139 int cur_line = 1;
140 size_t cur_byte = 0;
142 DCHECK(n > 0);
144 if (n == 1)
145 return 0;
147 while (cur_byte < buf.size()) {
148 if (IsNewline(buf, cur_byte)) {
149 cur_line++;
150 if (cur_line == n)
151 return cur_byte + 1;
153 cur_byte++;
155 return -1;
158 // static
159 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
160 DCHECK(offset < buffer.size());
161 // We may need more logic here to handle different line ending styles.
162 return buffer[offset] == '\n';
166 void Tokenizer::AdvanceToNextToken() {
167 while (!at_end() && IsCurrentWhitespace())
168 Advance();
171 Token::Type Tokenizer::ClassifyCurrent() const {
172 DCHECK(!at_end());
173 char next_char = cur_char();
174 if (next_char >= '0' && next_char <= '9')
175 return Token::INTEGER;
176 if (next_char == '"')
177 return Token::STRING;
179 // Note: '-' handled specially below.
180 if (next_char != '-' && CouldBeOperator(next_char))
181 return Token::UNCLASSIFIED_OPERATOR;
183 if (IsIdentifierFirstChar(next_char))
184 return Token::IDENTIFIER;
186 if (next_char == '[')
187 return Token::LEFT_BRACKET;
188 if (next_char == ']')
189 return Token::RIGHT_BRACKET;
190 if (next_char == '(')
191 return Token::LEFT_PAREN;
192 if (next_char == ')')
193 return Token::RIGHT_PAREN;
194 if (next_char == '{')
195 return Token::LEFT_BRACE;
196 if (next_char == '}')
197 return Token::RIGHT_BRACE;
199 if (next_char == ',')
200 return Token::COMMA;
202 if (next_char == '#')
203 return Token::COMMENT;
205 // For the case of '-' differentiate between a negative number and anything
206 // else.
207 if (next_char == '-') {
208 if (!CanIncrement())
209 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
210 // file.
211 char following_char = input_[cur_ + 1];
212 if (following_char >= '0' && following_char <= '9')
213 return Token::INTEGER;
214 return Token::UNCLASSIFIED_OPERATOR;
217 return Token::INVALID;
220 void Tokenizer::AdvanceToEndOfToken(const Location& location,
221 Token::Type type) {
222 switch (type) {
223 case Token::INTEGER:
224 do {
225 Advance();
226 } while (!at_end() && IsNumberChar(cur_char()));
227 if (!at_end()) {
228 // Require the char after a number to be some kind of space, scope,
229 // or operator.
230 char c = cur_char();
231 if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
232 !IsScoperChar(c) && c != ',') {
233 *err_ = Err(GetCurrentLocation(),
234 "This is not a valid number.",
235 "Learn to count.");
236 // Highlight the number.
237 err_->AppendRange(LocationRange(location, GetCurrentLocation()));
240 break;
242 case Token::STRING: {
243 char initial = cur_char();
244 Advance(); // Advance past initial "
245 for (;;) {
246 if (at_end()) {
247 *err_ = Err(LocationRange(location,
248 Location(input_file_, line_number_, char_in_line_)),
249 "Unterminated string literal.",
250 "Don't leave me hanging like this!");
251 break;
253 if (IsCurrentStringTerminator(initial)) {
254 Advance(); // Skip past last "
255 break;
256 } else if (cur_char() == '\n') {
257 *err_ = Err(LocationRange(location,
258 GetCurrentLocation()),
259 "Newline in string constant.");
261 Advance();
263 break;
266 case Token::UNCLASSIFIED_OPERATOR:
267 // Some operators are two characters, some are one.
268 if (CouldBeTwoCharOperatorBegin(cur_char())) {
269 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
270 Advance();
272 Advance();
273 break;
275 case Token::IDENTIFIER:
276 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
277 Advance();
278 break;
280 case Token::LEFT_BRACKET:
281 case Token::RIGHT_BRACKET:
282 case Token::LEFT_BRACE:
283 case Token::RIGHT_BRACE:
284 case Token::LEFT_PAREN:
285 case Token::RIGHT_PAREN:
286 case Token::COMMA:
287 Advance(); // All are one char.
288 break;
290 case Token::COMMENT:
291 // Eat to EOL.
292 while (!at_end() && !IsCurrentNewline())
293 Advance();
294 break;
296 case Token::INVALID:
297 default:
298 *err_ = Err(location, "Everything is all messed up",
299 "Please insert system disk in drive A: and press any key.");
300 NOTREACHED();
301 return;
305 bool Tokenizer::IsCurrentWhitespace() const {
306 DCHECK(!at_end());
307 char c = input_[cur_];
308 // Note that tab (0x09) is illegal.
309 return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
312 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
313 DCHECK(!at_end());
314 if (cur_char() != quote_char)
315 return false;
317 // Check for escaping. \" is not a string terminator, but \\" is. Count
318 // the number of preceeding backslashes.
319 int num_backslashes = 0;
320 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
321 num_backslashes++;
323 // Even backslashes mean that they were escaping each other and don't count
324 // as escaping this quote.
325 return (num_backslashes % 2) == 0;
328 bool Tokenizer::IsCurrentNewline() const {
329 return IsNewline(input_, cur_);
332 void Tokenizer::Advance() {
333 DCHECK(cur_ < input_.size());
334 if (IsCurrentNewline()) {
335 line_number_++;
336 char_in_line_ = 1;
337 } else {
338 char_in_line_++;
340 cur_++;
343 Location Tokenizer::GetCurrentLocation() const {
344 return Location(input_file_, line_number_, char_in_line_);
347 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
348 std::string help;
349 if (cur_char() == ';') {
350 // Semicolon.
351 help = "Semicolons are not needed, delete this one.";
352 } else if (cur_char() == '\t') {
353 // Tab.
354 help = "You got a tab character in here. Tabs are evil. "
355 "Convert to spaces.";
356 } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
357 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
358 // Different types of comments.
359 help = "Comments should start with # instead";
360 } else {
361 help = "I have no idea what this is.";
364 return Err(location, "Invalid token.", help);