tools/gn/tokenizer.cc

   1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "tools/gn/tokenizer.h"
   6
   7 #include "base/logging.h"
   8 #include "tools/gn/input_file.h"
   9
  10 namespace {
  11
  12 bool IsNumberChar(char c) {
  13   return c >= '0' && c <= '9';
  14 }
  15
  16 bool CouldBeTwoCharOperatorBegin(char c) {
  17   return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
  18          c == '+' || c == '|' || c == '&';
  19 }
  20
  21 bool CouldBeTwoCharOperatorEnd(char c) {
  22   return c == '=' || c == '|' || c == '&';
  23 }
  24
  25 bool CouldBeOneCharOperator(char c) {
  26   return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
  27          c == ':' || c == '|' || c == '&' || c == '-';
  28 }
  29
  30 bool CouldBeOperator(char c) {
  31   return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
  32 }
  33
  34 bool IsScoperChar(char c) {
  35   return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
  36 }
  37
  38 Token::Type GetSpecificOperatorType(base::StringPiece value) {
  39   if (value == "=")
  40     return Token::EQUAL;
  41   if (value == "+")
  42     return Token::PLUS;
  43   if (value == "-")
  44     return Token::MINUS;
  45   if (value == "+=")
  46     return Token::PLUS_EQUALS;
  47   if (value == "-=")
  48     return Token::MINUS_EQUALS;
  49   if (value == "==")
  50     return Token::EQUAL_EQUAL;
  51   if (value == "!=")
  52     return Token::NOT_EQUAL;
  53   if (value == "<=")
  54     return Token::LESS_EQUAL;
  55   if (value == ">=")
  56     return Token::GREATER_EQUAL;
  57   if (value == "<")
  58     return Token::LESS_THAN;
  59   if (value == ">")
  60     return Token::GREATER_THAN;
  61   if (value == "&&")
  62     return Token::BOOLEAN_AND;
  63   if (value == "||")
  64     return Token::BOOLEAN_OR;
  65   if (value == "!")
  66     return Token::BANG;
  67   return Token::INVALID;
  68 }
  69
  70 }  // namespace
  71
  72 Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
  73     : input_file_(input_file),
  74       input_(input_file->contents()),
  75       err_(err),
  76       cur_(0),
  77       line_number_(1),
  78       char_in_line_(1) {
  79 }
  80
  81 Tokenizer::~Tokenizer() {
  82 }
  83
  84 // static
  85 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
  86   Tokenizer t(input_file, err);
  87   return t.Run();
  88 }
  89
  90 std::vector<Token> Tokenizer::Run() {
  91   DCHECK(tokens_.empty());
  92   while (!done()) {
  93     AdvanceToNextToken();
  94     if (done())
  95       break;
  96     Location location = GetCurrentLocation();
  97
  98     Token::Type type = ClassifyCurrent();
  99     if (type == Token::INVALID) {
 100       *err_ = GetErrorForInvalidToken(location);
 101       break;
 102     }
 103     size_t token_begin = cur_;
 104     AdvanceToEndOfToken(location, type);
 105     if (has_error())
 106       break;
 107     size_t token_end = cur_;
 108
 109     base::StringPiece token_value(&input_.data()[token_begin],
 110                                   token_end - token_begin);
 111
 112     if (type == Token::UNCLASSIFIED_OPERATOR)
 113       type = GetSpecificOperatorType(token_value);
 114     if (type == Token::IDENTIFIER) {
 115       if (token_value == "if")
 116         type = Token::IF;
 117       else if (token_value == "else")
 118         type = Token::ELSE;
 119       else if (token_value == "true")
 120         type = Token::TRUE_TOKEN;
 121       else if (token_value == "false")
 122         type = Token::FALSE_TOKEN;
 123     }
 124
 125     // TODO(brettw) This just strips comments from the token stream. This
 126     // is probably wrong, they should be removed at a later stage so we can
 127     // do things like rewrite the file. But this makes the parser simpler and
 128     // is OK for now.
 129     if (type != Token::COMMENT)
 130       tokens_.push_back(Token(location, type, token_value));
 131   }
 132   if (err_->has_error())
 133     tokens_.clear();
 134   return tokens_;
 135 }
 136
 137 // static
 138 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
 139   int cur_line = 1;
 140   size_t cur_byte = 0;
 141
 142   DCHECK(n > 0);
 143
 144   if (n == 1)
 145     return 0;
 146
 147   while (cur_byte < buf.size()) {
 148     if (IsNewline(buf, cur_byte)) {
 149       cur_line++;
 150       if (cur_line == n)
 151         return cur_byte + 1;
 152     }
 153     cur_byte++;
 154   }
 155   return -1;
 156 }
 157
 158 // static
 159 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
 160   DCHECK(offset < buffer.size());
 161   // We may need more logic here to handle different line ending styles.
 162   return buffer[offset] == '\n';
 163 }
 164
 165
 166 void Tokenizer::AdvanceToNextToken() {
 167   while (!at_end() && IsCurrentWhitespace())
 168     Advance();
 169 }
 170
 171 Token::Type Tokenizer::ClassifyCurrent() const {
 172   DCHECK(!at_end());
 173   char next_char = cur_char();
 174   if (next_char >= '0' && next_char <= '9')
 175     return Token::INTEGER;
 176   if (next_char == '"')
 177     return Token::STRING;
 178
 179   // Note: '-' handled specially below.
 180   if (next_char != '-' && CouldBeOperator(next_char))
 181     return Token::UNCLASSIFIED_OPERATOR;
 182
 183   if (IsIdentifierFirstChar(next_char))
 184     return Token::IDENTIFIER;
 185
 186   if (next_char == '[')
 187     return Token::LEFT_BRACKET;
 188   if (next_char == ']')
 189     return Token::RIGHT_BRACKET;
 190   if (next_char == '(')
 191     return Token::LEFT_PAREN;
 192   if (next_char == ')')
 193     return Token::RIGHT_PAREN;
 194   if (next_char == '{')
 195     return Token::LEFT_BRACE;
 196   if (next_char == '}')
 197     return Token::RIGHT_BRACE;
 198
 199   if (next_char == ',')
 200     return Token::COMMA;
 201
 202   if (next_char == '#')
 203     return Token::COMMENT;
 204
 205   // For the case of '-' differentiate between a negative number and anything
 206   // else.
 207   if (next_char == '-') {
 208     if (!CanIncrement())
 209       return Token::UNCLASSIFIED_OPERATOR;  // Just the minus before end of
 210                                             // file.
 211     char following_char = input_[cur_ + 1];
 212     if (following_char >= '0' && following_char <= '9')
 213       return Token::INTEGER;
 214     return Token::UNCLASSIFIED_OPERATOR;
 215   }
 216
 217   return Token::INVALID;
 218 }
 219
 220 void Tokenizer::AdvanceToEndOfToken(const Location& location,
 221                                     Token::Type type) {
 222   switch (type) {
 223     case Token::INTEGER:
 224       do {
 225         Advance();
 226       } while (!at_end() && IsNumberChar(cur_char()));
 227       if (!at_end()) {
 228         // Require the char after a number to be some kind of space, scope,
 229         // or operator.
 230         char c = cur_char();
 231         if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
 232             !IsScoperChar(c) && c != ',') {
 233           *err_ = Err(GetCurrentLocation(),
 234               "This is not a valid number.",
 235               "Learn to count.");
 236           // Highlight the number.
 237           err_->AppendRange(LocationRange(location, GetCurrentLocation()));
 238         }
 239       }
 240       break;
 241
 242     case Token::STRING: {
 243       char initial = cur_char();
 244       Advance();  // Advance past initial "
 245       for (;;) {
 246         if (at_end()) {
 247           *err_ = Err(LocationRange(location,
 248                           Location(input_file_, line_number_, char_in_line_)),
 249                      "Unterminated string literal.",
 250                      "Don't leave me hanging like this!");
 251           break;
 252         }
 253         if (IsCurrentStringTerminator(initial)) {
 254           Advance();  // Skip past last "
 255           break;
 256         } else if (cur_char() == '\n') {
 257           *err_ = Err(LocationRange(location,
 258                                    GetCurrentLocation()),
 259                      "Newline in string constant.");
 260         }
 261         Advance();
 262       }
 263       break;
 264     }
 265
 266     case Token::UNCLASSIFIED_OPERATOR:
 267       // Some operators are two characters, some are one.
 268       if (CouldBeTwoCharOperatorBegin(cur_char())) {
 269         if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
 270           Advance();
 271       }
 272       Advance();
 273       break;
 274
 275     case Token::IDENTIFIER:
 276       while (!at_end() && IsIdentifierContinuingChar(cur_char()))
 277         Advance();
 278       break;
 279
 280     case Token::LEFT_BRACKET:
 281     case Token::RIGHT_BRACKET:
 282     case Token::LEFT_BRACE:
 283     case Token::RIGHT_BRACE:
 284     case Token::LEFT_PAREN:
 285     case Token::RIGHT_PAREN:
 286     case Token::COMMA:
 287       Advance();  // All are one char.
 288       break;
 289
 290     case Token::COMMENT:
 291       // Eat to EOL.
 292       while (!at_end() && !IsCurrentNewline())
 293         Advance();
 294       break;
 295
 296     case Token::INVALID:
 297     default:
 298       *err_ = Err(location, "Everything is all messed up",
 299                   "Please insert system disk in drive A: and press any key.");
 300       NOTREACHED();
 301       return;
 302   }
 303 }
 304
 305 bool Tokenizer::IsCurrentWhitespace() const {
 306   DCHECK(!at_end());
 307   char c = input_[cur_];
 308   // Note that tab (0x09) is illegal.
 309   return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
 310 }
 311
 312 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
 313   DCHECK(!at_end());
 314   if (cur_char() != quote_char)
 315     return false;
 316
 317   // Check for escaping. \" is not a string terminator, but \\" is. Count
 318   // the number of preceeding backslashes.
 319   int num_backslashes = 0;
 320   for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
 321     num_backslashes++;
 322
 323   // Even backslashes mean that they were escaping each other and don't count
 324   // as escaping this quote.
 325   return (num_backslashes % 2) == 0;
 326 }
 327
 328 bool Tokenizer::IsCurrentNewline() const {
 329   return IsNewline(input_, cur_);
 330 }
 331
 332 void Tokenizer::Advance() {
 333   DCHECK(cur_ < input_.size());
 334   if (IsCurrentNewline()) {
 335     line_number_++;
 336     char_in_line_ = 1;
 337   } else {
 338     char_in_line_++;
 339   }
 340   cur_++;
 341 }
 342
 343 Location Tokenizer::GetCurrentLocation() const {
 344   return Location(input_file_, line_number_, char_in_line_);
 345 }
 346
 347 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
 348   std::string help;
 349   if (cur_char() == ';') {
 350     // Semicolon.
 351     help = "Semicolons are not needed, delete this one.";
 352   } else if (cur_char() == '\t') {
 353     // Tab.
 354     help = "You got a tab character in here. Tabs are evil. "
 355            "Convert to spaces.";
 356   } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
 357       (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
 358     // Different types of comments.
 359     help = "Comments should start with # instead";
 360   } else {
 361     help = "I have no idea what this is.";
 362   }
 363
 364   return Err(location, "Invalid token.", help);
 365 }