Roll src/third_party/WebKit f298044:aa8346d (svn 202628:202629)
[chromium-blink-merge.git] / tools / gn / tokenizer.cc
blobdb10f6ae094946c11b659aca6dea43f207459f61
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "tools/gn/tokenizer.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "tools/gn/input_file.h"
11 namespace {
// Returns true if |c| is one of the characters that may start a
// two-character operator: '<', '>', '!', '=', '-', '+', '|' or '&'.
// Whether the pair really is a two-character operator also depends on the
// following character.
bool CouldBeTwoCharOperatorBegin(char c) {
  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
         c == '+' || c == '|' || c == '&';
}
// Returns true if |c| is one of the characters that may be the second
// character of a two-character operator: '=', '|' or '&'.
bool CouldBeTwoCharOperatorEnd(char c) {
  return c == '=' || c == '|' || c == '&';
}
// Returns true if |c| is one of the characters treated as a possible
// single-character operator: '=', '<', '>', '+', '!', ':', '|', '&' or '-'.
bool CouldBeOneCharOperator(char c) {
  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
         c == ':' || c == '|' || c == '&' || c == '-';
}
27 bool CouldBeOperator(char c) {
28 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
// Returns true if |c| is an opening or closing parenthesis, square bracket or
// curly brace.
bool IsScoperChar(char c) {
  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
}
35 Token::Type GetSpecificOperatorType(base::StringPiece value) {
36 if (value == "=")
37 return Token::EQUAL;
38 if (value == "+")
39 return Token::PLUS;
40 if (value == "-")
41 return Token::MINUS;
42 if (value == "+=")
43 return Token::PLUS_EQUALS;
44 if (value == "-=")
45 return Token::MINUS_EQUALS;
46 if (value == "==")
47 return Token::EQUAL_EQUAL;
48 if (value == "!=")
49 return Token::NOT_EQUAL;
50 if (value == "<=")
51 return Token::LESS_EQUAL;
52 if (value == ">=")
53 return Token::GREATER_EQUAL;
54 if (value == "<")
55 return Token::LESS_THAN;
56 if (value == ">")
57 return Token::GREATER_THAN;
58 if (value == "&&")
59 return Token::BOOLEAN_AND;
60 if (value == "||")
61 return Token::BOOLEAN_OR;
62 if (value == "!")
63 return Token::BANG;
64 if (value == ".")
65 return Token::DOT;
66 return Token::INVALID;
69 } // namespace
71 Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
72 : input_file_(input_file),
73 input_(input_file->contents()),
74 err_(err),
75 cur_(0),
76 line_number_(1),
77 char_in_line_(1) {
80 Tokenizer::~Tokenizer() {
83 // static
84 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
85 Tokenizer t(input_file, err);
86 return t.Run();
89 std::vector<Token> Tokenizer::Run() {
90 DCHECK(tokens_.empty());
91 while (!done()) {
92 AdvanceToNextToken();
93 if (done())
94 break;
95 Location location = GetCurrentLocation();
97 Token::Type type = ClassifyCurrent();
98 if (type == Token::INVALID) {
99 *err_ = GetErrorForInvalidToken(location);
100 break;
102 size_t token_begin = cur_;
103 AdvanceToEndOfToken(location, type);
104 if (has_error())
105 break;
106 size_t token_end = cur_;
108 base::StringPiece token_value(&input_.data()[token_begin],
109 token_end - token_begin);
111 if (type == Token::UNCLASSIFIED_OPERATOR) {
112 type = GetSpecificOperatorType(token_value);
113 } else if (type == Token::IDENTIFIER) {
114 if (token_value == "if")
115 type = Token::IF;
116 else if (token_value == "else")
117 type = Token::ELSE;
118 else if (token_value == "true")
119 type = Token::TRUE_TOKEN;
120 else if (token_value == "false")
121 type = Token::FALSE_TOKEN;
122 } else if (type == Token::UNCLASSIFIED_COMMENT) {
123 if (AtStartOfLine(token_begin) &&
124 // If it's a standalone comment, but is a continuation of a comment on
125 // a previous line, then instead make it a continued suffix comment.
126 (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT ||
127 tokens_.back().location().line_number() + 1 !=
128 location.line_number() ||
129 tokens_.back().location().char_offset() != location.char_offset())) {
130 type = Token::LINE_COMMENT;
131 if (!at_end()) // Could be EOF.
132 Advance(); // The current \n.
133 // If this comment is separated from the next syntax element, then we
134 // want to tag it as a block comment. This will become a standalone
135 // statement at the parser level to keep this comment separate, rather
136 // than attached to the subsequent statement.
137 while (!at_end() && IsCurrentWhitespace()) {
138 if (IsCurrentNewline()) {
139 type = Token::BLOCK_COMMENT;
140 break;
142 Advance();
144 } else {
145 type = Token::SUFFIX_COMMENT;
149 tokens_.push_back(Token(location, type, token_value));
151 if (err_->has_error())
152 tokens_.clear();
153 return tokens_;
156 // static
157 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
158 DCHECK_GT(n, 0);
160 if (n == 1)
161 return 0;
163 int cur_line = 1;
164 size_t cur_byte = 0;
165 while (cur_byte < buf.size()) {
166 if (IsNewline(buf, cur_byte)) {
167 cur_line++;
168 if (cur_line == n)
169 return cur_byte + 1;
171 cur_byte++;
173 return static_cast<size_t>(-1);
176 // static
177 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
178 DCHECK(offset < buffer.size());
179 // We may need more logic here to handle different line ending styles.
180 return buffer[offset] == '\n';
184 void Tokenizer::AdvanceToNextToken() {
185 while (!at_end() && IsCurrentWhitespace())
186 Advance();
189 Token::Type Tokenizer::ClassifyCurrent() const {
190 DCHECK(!at_end());
191 char next_char = cur_char();
192 if (base::IsAsciiDigit(next_char))
193 return Token::INTEGER;
194 if (next_char == '"')
195 return Token::STRING;
197 // Note: '-' handled specially below.
198 if (next_char != '-' && CouldBeOperator(next_char))
199 return Token::UNCLASSIFIED_OPERATOR;
201 if (IsIdentifierFirstChar(next_char))
202 return Token::IDENTIFIER;
204 if (next_char == '[')
205 return Token::LEFT_BRACKET;
206 if (next_char == ']')
207 return Token::RIGHT_BRACKET;
208 if (next_char == '(')
209 return Token::LEFT_PAREN;
210 if (next_char == ')')
211 return Token::RIGHT_PAREN;
212 if (next_char == '{')
213 return Token::LEFT_BRACE;
214 if (next_char == '}')
215 return Token::RIGHT_BRACE;
217 if (next_char == '.')
218 return Token::DOT;
219 if (next_char == ',')
220 return Token::COMMA;
222 if (next_char == '#')
223 return Token::UNCLASSIFIED_COMMENT;
225 // For the case of '-' differentiate between a negative number and anything
226 // else.
227 if (next_char == '-') {
228 if (!CanIncrement())
229 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
230 // file.
231 char following_char = input_[cur_ + 1];
232 if (base::IsAsciiDigit(following_char))
233 return Token::INTEGER;
234 return Token::UNCLASSIFIED_OPERATOR;
237 return Token::INVALID;
240 void Tokenizer::AdvanceToEndOfToken(const Location& location,
241 Token::Type type) {
242 switch (type) {
243 case Token::INTEGER:
244 do {
245 Advance();
246 } while (!at_end() && base::IsAsciiDigit(cur_char()));
247 if (!at_end()) {
248 // Require the char after a number to be some kind of space, scope,
249 // or operator.
250 char c = cur_char();
251 if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
252 !IsScoperChar(c) && c != ',') {
253 *err_ = Err(GetCurrentLocation(),
254 "This is not a valid number.",
255 "Learn to count.");
256 // Highlight the number.
257 err_->AppendRange(LocationRange(location, GetCurrentLocation()));
260 break;
262 case Token::STRING: {
263 char initial = cur_char();
264 Advance(); // Advance past initial "
265 for (;;) {
266 if (at_end()) {
267 *err_ = Err(LocationRange(location, GetCurrentLocation()),
268 "Unterminated string literal.",
269 "Don't leave me hanging like this!");
270 break;
272 if (IsCurrentStringTerminator(initial)) {
273 Advance(); // Skip past last "
274 break;
275 } else if (IsCurrentNewline()) {
276 *err_ = Err(LocationRange(location, GetCurrentLocation()),
277 "Newline in string constant.");
279 Advance();
281 break;
284 case Token::UNCLASSIFIED_OPERATOR:
285 // Some operators are two characters, some are one.
286 if (CouldBeTwoCharOperatorBegin(cur_char())) {
287 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
288 Advance();
290 Advance();
291 break;
293 case Token::IDENTIFIER:
294 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
295 Advance();
296 break;
298 case Token::LEFT_BRACKET:
299 case Token::RIGHT_BRACKET:
300 case Token::LEFT_BRACE:
301 case Token::RIGHT_BRACE:
302 case Token::LEFT_PAREN:
303 case Token::RIGHT_PAREN:
304 case Token::DOT:
305 case Token::COMMA:
306 Advance(); // All are one char.
307 break;
309 case Token::UNCLASSIFIED_COMMENT:
310 // Eat to EOL.
311 while (!at_end() && !IsCurrentNewline())
312 Advance();
313 break;
315 case Token::INVALID:
316 default:
317 *err_ = Err(location, "Everything is all messed up",
318 "Please insert system disk in drive A: and press any key.");
319 NOTREACHED();
320 return;
324 bool Tokenizer::AtStartOfLine(size_t location) const {
325 while (location > 0) {
326 --location;
327 char c = input_[location];
328 if (c == '\n')
329 return true;
330 if (c != ' ')
331 return false;
333 return true;
336 bool Tokenizer::IsCurrentWhitespace() const {
337 DCHECK(!at_end());
338 char c = input_[cur_];
339 // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
340 return c == 0x0A || c == 0x0D || c == 0x20;
343 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
344 DCHECK(!at_end());
345 if (cur_char() != quote_char)
346 return false;
348 // Check for escaping. \" is not a string terminator, but \\" is. Count
349 // the number of preceeding backslashes.
350 int num_backslashes = 0;
351 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
352 num_backslashes++;
354 // Even backslashes mean that they were escaping each other and don't count
355 // as escaping this quote.
356 return (num_backslashes % 2) == 0;
359 bool Tokenizer::IsCurrentNewline() const {
360 return IsNewline(input_, cur_);
363 void Tokenizer::Advance() {
364 DCHECK(cur_ < input_.size());
365 if (IsCurrentNewline()) {
366 line_number_++;
367 char_in_line_ = 1;
368 } else {
369 char_in_line_++;
371 cur_++;
374 Location Tokenizer::GetCurrentLocation() const {
375 return Location(
376 input_file_, line_number_, char_in_line_, static_cast<int>(cur_));
379 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
380 std::string help;
381 if (cur_char() == ';') {
382 // Semicolon.
383 help = "Semicolons are not needed, delete this one.";
384 } else if (cur_char() == '\t') {
385 // Tab.
386 help = "You got a tab character in here. Tabs are evil. "
387 "Convert to spaces.";
388 } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
389 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
390 // Different types of comments.
391 help = "Comments should start with # instead";
392 } else if (cur_char() == '\'') {
393 help = "Strings are delimited by \" characters, not apostrophes.";
394 } else {
395 help = "I have no idea what this is.";
398 return Err(location, "Invalid token.", help);