tools/gn/tokenizer.cc

   1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "tools/gn/tokenizer.h"
   6
   7 #include "base/logging.h"
   8 #include "base/strings/string_util.h"
   9 #include "tools/gn/input_file.h"
  10
  11 namespace {
  12
  13 bool CouldBeTwoCharOperatorBegin(char c) {
  14   return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
  15          c == '+' || c == '|' || c == '&';
  16 }
  17
  18 bool CouldBeTwoCharOperatorEnd(char c) {
  19   return c == '=' || c == '|' || c == '&';
  20 }
  21
  22 bool CouldBeOneCharOperator(char c) {
  23   return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
  24          c == ':' || c == '|' || c == '&' || c == '-';
  25 }
  26
  27 bool CouldBeOperator(char c) {
  28   return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
  29 }
  30
  31 bool IsScoperChar(char c) {
  32   return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
  33 }
  34
  35 Token::Type GetSpecificOperatorType(base::StringPiece value) {
  36   if (value == "=")
  37     return Token::EQUAL;
  38   if (value == "+")
  39     return Token::PLUS;
  40   if (value == "-")
  41     return Token::MINUS;
  42   if (value == "+=")
  43     return Token::PLUS_EQUALS;
  44   if (value == "-=")
  45     return Token::MINUS_EQUALS;
  46   if (value == "==")
  47     return Token::EQUAL_EQUAL;
  48   if (value == "!=")
  49     return Token::NOT_EQUAL;
  50   if (value == "<=")
  51     return Token::LESS_EQUAL;
  52   if (value == ">=")
  53     return Token::GREATER_EQUAL;
  54   if (value == "<")
  55     return Token::LESS_THAN;
  56   if (value == ">")
  57     return Token::GREATER_THAN;
  58   if (value == "&&")
  59     return Token::BOOLEAN_AND;
  60   if (value == "||")
  61     return Token::BOOLEAN_OR;
  62   if (value == "!")
  63     return Token::BANG;
  64   if (value == ".")
  65     return Token::DOT;
  66   return Token::INVALID;
  67 }
  68
  69 }  // namespace
  70
// Sets up a tokenizer over the contents of |input_file|, which is dereferenced
// here and so must not be null. Errors are reported by assigning to |*err|.
// Scanning starts at offset 0 of the input, which is counted as line 1,
// character 1.
Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
    : input_file_(input_file),
      input_(input_file->contents()),
      err_(err),
      cur_(0),
      line_number_(1),
      char_in_line_(1) {
}
  79
// The destructor body is intentionally empty.
Tokenizer::~Tokenizer() {
}
  82
  83 // static
  84 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
  85   Tokenizer t(input_file, err);
  86   return t.Run();
  87 }
  88
// Tokenizes the whole input and returns the resulting tokens. Scanning stops
// at the first error; if an error has been recorded in |*err_| by the time the
// loop exits, the token list is cleared so the caller gets an empty vector.
std::vector<Token> Tokenizer::Run() {
  DCHECK(tokens_.empty());
  while (!done()) {
    // Skip whitespace; there may be nothing but whitespace left.
    AdvanceToNextToken();
    if (done())
      break;
    // Remember where the token starts before the cursor is moved.
    Location location = GetCurrentLocation();

    // The first character alone decides the (possibly provisional) type.
    Token::Type type = ClassifyCurrent();
    if (type == Token::INVALID) {
      *err_ = GetErrorForInvalidToken(location);
      break;
    }
    size_t token_begin = cur_;
    AdvanceToEndOfToken(location, type);
    if (has_error())
      break;
    size_t token_end = cur_;

    // The token text is captured here, before any of the extra advancing done
    // for comments below, so that extra input is not part of the value.
    base::StringPiece token_value(&input_.data()[token_begin],
                                  token_end - token_begin);

    if (type == Token::UNCLASSIFIED_OPERATOR) {
      // NOTE(review): GetSpecificOperatorType() returns Token::INVALID for
      // operator text it does not list (e.g. a lone ":"), and no error is
      // recorded here in that case; the token is pushed with type INVALID.
      // Presumably a later stage rejects it -- confirm.
      type = GetSpecificOperatorType(token_value);
    } else if (type == Token::IDENTIFIER) {
      // Keywords are scanned as identifiers and reclassified by exact text.
      if (token_value == "if")
        type = Token::IF;
      else if (token_value == "else")
        type = Token::ELSE;
      else if (token_value == "true")
        type = Token::TRUE_TOKEN;
      else if (token_value == "false")
        type = Token::FALSE_TOKEN;
    } else if (type == Token::UNCLASSIFIED_COMMENT) {
      // A comment that is the first thing on its line is a standalone comment
      // unless it directly continues a suffix comment (previous token is a
      // suffix comment on the preceding line at the same character offset).
      if (AtStartOfLine(token_begin) &&
          // If it's a standalone comment, but is a continuation of a comment on
          // a previous line, then instead make it a continued suffix comment.
          (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT ||
           tokens_.back().location().line_number() + 1 !=
               location.line_number() ||
           tokens_.back().location().char_offset() != location.char_offset())) {
        type = Token::LINE_COMMENT;
        if (!at_end())  // Could be EOF.
          Advance();  // The current \n.
        // If this comment is separated from the next syntax element, then we
        // want to tag it as a block comment. This will become a standalone
        // statement at the parser level to keep this comment separate, rather
        // than attached to the subsequent statement.
        while (!at_end() && IsCurrentWhitespace()) {
          if (IsCurrentNewline()) {
            // Only whitespace up to another newline: the comment is followed
            // by a blank line.
            type = Token::BLOCK_COMMENT;
            break;
          }
          Advance();
        }
      } else {
        type = Token::SUFFIX_COMMENT;
      }
    }

    tokens_.push_back(Token(location, type, token_value));
  }
  // Never hand back a partial token list alongside an error.
  if (err_->has_error())
    tokens_.clear();
  return tokens_;
}
 155
 156 // static
 157 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
 158   DCHECK_GT(n, 0);
 159
 160   if (n == 1)
 161     return 0;
 162
 163   int cur_line = 1;
 164   size_t cur_byte = 0;
 165   while (cur_byte < buf.size()) {
 166     if (IsNewline(buf, cur_byte)) {
 167       cur_line++;
 168       if (cur_line == n)
 169         return cur_byte + 1;
 170     }
 171     cur_byte++;
 172   }
 173   return static_cast<size_t>(-1);
 174 }
 175
 176 // static
 177 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
 178   DCHECK(offset < buffer.size());
 179   // We may need more logic here to handle different line ending styles.
 180   return buffer[offset] == '\n';
 181 }
 182
 183
 184 void Tokenizer::AdvanceToNextToken() {
 185   while (!at_end() && IsCurrentWhitespace())
 186     Advance();
 187 }
 188
 189 Token::Type Tokenizer::ClassifyCurrent() const {
 190   DCHECK(!at_end());
 191   char next_char = cur_char();
 192   if (base::IsAsciiDigit(next_char))
 193     return Token::INTEGER;
 194   if (next_char == '"')
 195     return Token::STRING;
 196
 197   // Note: '-' handled specially below.
 198   if (next_char != '-' && CouldBeOperator(next_char))
 199     return Token::UNCLASSIFIED_OPERATOR;
 200
 201   if (IsIdentifierFirstChar(next_char))
 202     return Token::IDENTIFIER;
 203
 204   if (next_char == '[')
 205     return Token::LEFT_BRACKET;
 206   if (next_char == ']')
 207     return Token::RIGHT_BRACKET;
 208   if (next_char == '(')
 209     return Token::LEFT_PAREN;
 210   if (next_char == ')')
 211     return Token::RIGHT_PAREN;
 212   if (next_char == '{')
 213     return Token::LEFT_BRACE;
 214   if (next_char == '}')
 215     return Token::RIGHT_BRACE;
 216
 217   if (next_char == '.')
 218     return Token::DOT;
 219   if (next_char == ',')
 220     return Token::COMMA;
 221
 222   if (next_char == '#')
 223     return Token::UNCLASSIFIED_COMMENT;
 224
 225   // For the case of '-' differentiate between a negative number and anything
 226   // else.
 227   if (next_char == '-') {
 228     if (!CanIncrement())
 229       return Token::UNCLASSIFIED_OPERATOR;  // Just the minus before end of
 230                                             // file.
 231     char following_char = input_[cur_ + 1];
 232     if (base::IsAsciiDigit(following_char))
 233       return Token::INTEGER;
 234     return Token::UNCLASSIFIED_OPERATOR;
 235   }
 236
 237   return Token::INVALID;
 238 }
 239
 240 void Tokenizer::AdvanceToEndOfToken(const Location& location,
 241                                     Token::Type type) {
 242   switch (type) {
 243     case Token::INTEGER:
 244       do {
 245         Advance();
 246       } while (!at_end() && base::IsAsciiDigit(cur_char()));
 247       if (!at_end()) {
 248         // Require the char after a number to be some kind of space, scope,
 249         // or operator.
 250         char c = cur_char();
 251         if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
 252             !IsScoperChar(c) && c != ',') {
 253           *err_ = Err(GetCurrentLocation(),
 254                       "This is not a valid number.",
 255                       "Learn to count.");
 256           // Highlight the number.
 257           err_->AppendRange(LocationRange(location, GetCurrentLocation()));
 258         }
 259       }
 260       break;
 261
 262     case Token::STRING: {
 263       char initial = cur_char();
 264       Advance();  // Advance past initial "
 265       for (;;) {
 266         if (at_end()) {
 267           *err_ = Err(LocationRange(location, GetCurrentLocation()),
 268                       "Unterminated string literal.",
 269                       "Don't leave me hanging like this!");
 270           break;
 271         }
 272         if (IsCurrentStringTerminator(initial)) {
 273           Advance();  // Skip past last "
 274           break;
 275         } else if (IsCurrentNewline()) {
 276           *err_ = Err(LocationRange(location, GetCurrentLocation()),
 277                       "Newline in string constant.");
 278         }
 279         Advance();
 280       }
 281       break;
 282     }
 283
 284     case Token::UNCLASSIFIED_OPERATOR:
 285       // Some operators are two characters, some are one.
 286       if (CouldBeTwoCharOperatorBegin(cur_char())) {
 287         if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
 288           Advance();
 289       }
 290       Advance();
 291       break;
 292
 293     case Token::IDENTIFIER:
 294       while (!at_end() && IsIdentifierContinuingChar(cur_char()))
 295         Advance();
 296       break;
 297
 298     case Token::LEFT_BRACKET:
 299     case Token::RIGHT_BRACKET:
 300     case Token::LEFT_BRACE:
 301     case Token::RIGHT_BRACE:
 302     case Token::LEFT_PAREN:
 303     case Token::RIGHT_PAREN:
 304     case Token::DOT:
 305     case Token::COMMA:
 306       Advance();  // All are one char.
 307       break;
 308
 309     case Token::UNCLASSIFIED_COMMENT:
 310       // Eat to EOL.
 311       while (!at_end() && !IsCurrentNewline())
 312         Advance();
 313       break;
 314
 315     case Token::INVALID:
 316     default:
 317       *err_ = Err(location, "Everything is all messed up",
 318                   "Please insert system disk in drive A: and press any key.");
 319       NOTREACHED();
 320       return;
 321   }
 322 }
 323
 324 bool Tokenizer::AtStartOfLine(size_t location) const {
 325   while (location > 0) {
 326     --location;
 327     char c = input_[location];
 328     if (c == '\n')
 329       return true;
 330     if (c != ' ')
 331       return false;
 332   }
 333   return true;
 334 }
 335
 336 bool Tokenizer::IsCurrentWhitespace() const {
 337   DCHECK(!at_end());
 338   char c = input_[cur_];
 339   // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
 340   return c == 0x0A || c == 0x0D || c == 0x20;
 341 }
 342
 343 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
 344   DCHECK(!at_end());
 345   if (cur_char() != quote_char)
 346     return false;
 347
 348   // Check for escaping. \" is not a string terminator, but \\" is. Count
 349   // the number of preceeding backslashes.
 350   int num_backslashes = 0;
 351   for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
 352     num_backslashes++;
 353
 354   // Even backslashes mean that they were escaping each other and don't count
 355   // as escaping this quote.
 356   return (num_backslashes % 2) == 0;
 357 }
 358
// Returns true if the character at the cursor is a newline, as decided by
// IsNewline(). The cursor must be inside the input, since IsNewline() DCHECKs
// the offset.
bool Tokenizer::IsCurrentNewline() const {
  return IsNewline(input_, cur_);
}
 362
 363 void Tokenizer::Advance() {
 364   DCHECK(cur_ < input_.size());
 365   if (IsCurrentNewline()) {
 366     line_number_++;
 367     char_in_line_ = 1;
 368   } else {
 369     char_in_line_++;
 370   }
 371   cur_++;
 372 }
 373
 374 Location Tokenizer::GetCurrentLocation() const {
 375   return Location(
 376       input_file_, line_number_, char_in_line_, static_cast<int>(cur_));
 377 }
 378
 379 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
 380   std::string help;
 381   if (cur_char() == ';') {
 382     // Semicolon.
 383     help = "Semicolons are not needed, delete this one.";
 384   } else if (cur_char() == '\t') {
 385     // Tab.
 386     help = "You got a tab character in here. Tabs are evil. "
 387            "Convert to spaces.";
 388   } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
 389       (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
 390     // Different types of comments.
 391     help = "Comments should start with # instead";
 392   } else if (cur_char() == '\'') {
 393     help = "Strings are delimited by \" characters, not apostrophes.";
 394   } else {
 395     help = "I have no idea what this is.";
 396   }
 397
 398   return Err(location, "Invalid token.", help);
 399 }