clang/lib/AST/CommentLexer.cpp

   1 //===--- CommentLexer.cpp -------------------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "clang/AST/CommentLexer.h"
  10 #include "clang/AST/CommentCommandTraits.h"
  11 #include "clang/AST/CommentDiagnostic.h"
  12 #include "clang/Basic/CharInfo.h"
  13 #include "llvm/ADT/StringExtras.h"
  14 #include "llvm/ADT/StringSwitch.h"
  15 #include "llvm/Support/ConvertUTF.h"
  16 #include "llvm/Support/ErrorHandling.h"
  17
  18 namespace clang {
  19 namespace comments {
  20
  21 void Token::dump(const Lexer &L, const SourceManager &SM) const {
  22   llvm::errs() << "comments::Token Kind=" << Kind << " ";
  23   Loc.print(llvm::errs(), SM);
  24   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
  25 }
  26
  27 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  28   return isLetter(C);
  29 }
  30
  31 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  32   return isDigit(C);
  33 }
  34
  35 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  36   return isHexDigit(C);
  37 }
  38
  39 static inline StringRef convertCodePointToUTF8(
  40                                       llvm::BumpPtrAllocator &Allocator,
  41                                       unsigned CodePoint) {
  42   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  43   char *ResolvedPtr = Resolved;
  44   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  45     return StringRef(Resolved, ResolvedPtr - Resolved);
  46   else
  47     return StringRef();
  48 }
  49
  50 namespace {
  51
  52 #include "clang/AST/CommentHTMLTags.inc"
  53 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
  54
  55 } // end anonymous namespace
  56
  57 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  58   // Fast path, first check a few most widely used named character references.
  59   return llvm::StringSwitch<StringRef>(Name)
  60       .Case("amp", "&")
  61       .Case("lt", "<")
  62       .Case("gt", ">")
  63       .Case("quot", "\"")
  64       .Case("apos", "\'")
  65       // Slow path.
  66       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
  67 }
  68
  69 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  70   unsigned CodePoint = 0;
  71   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  72     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
  73     CodePoint *= 10;
  74     CodePoint += Name[i] - '0';
  75   }
  76   return convertCodePointToUTF8(Allocator, CodePoint);
  77 }
  78
  79 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  80   unsigned CodePoint = 0;
  81   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  82     CodePoint *= 16;
  83     const char C = Name[i];
  84     assert(isHTMLHexCharacterReferenceCharacter(C));
  85     CodePoint += llvm::hexDigitValue(C);
  86   }
  87   return convertCodePointToUTF8(Allocator, CodePoint);
  88 }
  89
  90 void Lexer::skipLineStartingDecorations() {
  91   // This function should be called only for C comments
  92   assert(CommentState == LCS_InsideCComment);
  93
  94   if (BufferPtr == CommentEnd)
  95     return;
  96
  97   const char *NewBufferPtr = BufferPtr;
  98   while (isHorizontalWhitespace(*NewBufferPtr))
  99     if (++NewBufferPtr == CommentEnd)
 100       return;
 101   if (*NewBufferPtr == '*')
 102     BufferPtr = NewBufferPtr + 1;
 103 }
 104
 105 namespace {
 106 /// Returns pointer to the first newline character in the string.
 107 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
 108   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 109     if (isVerticalWhitespace(*BufferPtr))
 110       return BufferPtr;
 111   }
 112   return BufferEnd;
 113 }
 114
 115 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
 116   if (BufferPtr == BufferEnd)
 117     return BufferPtr;
 118
 119   if (*BufferPtr == '\n')
 120     BufferPtr++;
 121   else {
 122     assert(*BufferPtr == '\r');
 123     BufferPtr++;
 124     if (BufferPtr != BufferEnd && *BufferPtr == '\n')
 125       BufferPtr++;
 126   }
 127   return BufferPtr;
 128 }
 129
 130 const char *skipNamedCharacterReference(const char *BufferPtr,
 131                                         const char *BufferEnd) {
 132   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 133     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
 134       return BufferPtr;
 135   }
 136   return BufferEnd;
 137 }
 138
 139 const char *skipDecimalCharacterReference(const char *BufferPtr,
 140                                           const char *BufferEnd) {
 141   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 142     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
 143       return BufferPtr;
 144   }
 145   return BufferEnd;
 146 }
 147
 148 const char *skipHexCharacterReference(const char *BufferPtr,
 149                                       const char *BufferEnd) {
 150   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 151     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
 152       return BufferPtr;
 153   }
 154   return BufferEnd;
 155 }
 156
 157 bool isHTMLIdentifierStartingCharacter(char C) {
 158   return isLetter(C);
 159 }
 160
 161 bool isHTMLIdentifierCharacter(char C) {
 162   return isAlphanumeric(C);
 163 }
 164
 165 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
 166   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 167     if (!isHTMLIdentifierCharacter(*BufferPtr))
 168       return BufferPtr;
 169   }
 170   return BufferEnd;
 171 }
 172
 173 /// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
 174 /// string allowed.
 175 ///
 176 /// Returns pointer to closing quote.
 177 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
 178 {
 179   const char Quote = *BufferPtr;
 180   assert(Quote == '\"' || Quote == '\'');
 181
 182   BufferPtr++;
 183   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 184     const char C = *BufferPtr;
 185     if (C == Quote && BufferPtr[-1] != '\\')
 186       return BufferPtr;
 187   }
 188   return BufferEnd;
 189 }
 190
 191 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
 192   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 193     if (!isWhitespace(*BufferPtr))
 194       return BufferPtr;
 195   }
 196   return BufferEnd;
 197 }
 198
 199 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
 200   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
 201 }
 202
 203 bool isCommandNameStartCharacter(char C) {
 204   return isLetter(C);
 205 }
 206
 207 bool isCommandNameCharacter(char C) {
 208   return isAlphanumeric(C);
 209 }
 210
 211 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
 212   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 213     if (!isCommandNameCharacter(*BufferPtr))
 214       return BufferPtr;
 215   }
 216   return BufferEnd;
 217 }
 218
 219 /// Return the one past end pointer for BCPL comments.
 220 /// Handles newlines escaped with backslash or trigraph for backslahs.
 221 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
 222   const char *CurPtr = BufferPtr;
 223   while (CurPtr != BufferEnd) {
 224     while (!isVerticalWhitespace(*CurPtr)) {
 225       CurPtr++;
 226       if (CurPtr == BufferEnd)
 227         return BufferEnd;
 228     }
 229     // We found a newline, check if it is escaped.
 230     const char *EscapePtr = CurPtr - 1;
 231     while(isHorizontalWhitespace(*EscapePtr))
 232       EscapePtr--;
 233
 234     if (*EscapePtr == '\\' ||
 235         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
 236          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
 237       // We found an escaped newline.
 238       CurPtr = skipNewline(CurPtr, BufferEnd);
 239     } else
 240       return CurPtr; // Not an escaped newline.
 241   }
 242   return BufferEnd;
 243 }
 244
 245 /// Return the one past end pointer for C comments.
 246 /// Very dumb, does not handle escaped newlines or trigraphs.
 247 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
 248   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 249     if (*BufferPtr == '*') {
 250       assert(BufferPtr + 1 != BufferEnd);
 251       if (*(BufferPtr + 1) == '/')
 252         return BufferPtr;
 253     }
 254   }
 255   llvm_unreachable("buffer end hit before '*/' was seen");
 256 }
 257
 258 } // end anonymous namespace
 259
 260 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
 261                                tok::TokenKind Kind) {
 262   const unsigned TokLen = TokEnd - BufferPtr;
 263   Result.setLocation(getSourceLocation(BufferPtr));
 264   Result.setKind(Kind);
 265   Result.setLength(TokLen);
 266 #ifndef NDEBUG
 267   Result.TextPtr = "<UNSET>";
 268   Result.IntVal = 7;
 269 #endif
 270   BufferPtr = TokEnd;
 271 }
 272
 273 const char *Lexer::skipTextToken() {
 274   const char *TokenPtr = BufferPtr;
 275   assert(TokenPtr < CommentEnd);
 276   StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
 277
 278 again:
 279   size_t End =
 280       StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
 281   if (End == StringRef::npos)
 282     return CommentEnd;
 283
 284   // Doxygen doesn't recognize any commands in a one-line double quotation.
 285   // If we don't find an ending quotation mark, we pretend it never began.
 286   if (*(TokenPtr + End) == '\"') {
 287     TokenPtr += End + 1;
 288     End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
 289     if (End != StringRef::npos && *(TokenPtr + End) == '\"')
 290       TokenPtr += End + 1;
 291     goto again;
 292   }
 293   return TokenPtr + End;
 294 }
 295
 296 void Lexer::lexCommentText(Token &T) {
 297   assert(CommentState == LCS_InsideBCPLComment ||
 298          CommentState == LCS_InsideCComment);
 299
 300   // Handles lexing non-command text, i.e. text and newline.
 301   auto HandleNonCommandToken = [&]() -> void {
 302     assert(State == LS_Normal);
 303
 304     const char *TokenPtr = BufferPtr;
 305     assert(TokenPtr < CommentEnd);
 306     switch (*TokenPtr) {
 307       case '\n':
 308       case '\r':
 309           TokenPtr = skipNewline(TokenPtr, CommentEnd);
 310           formTokenWithChars(T, TokenPtr, tok::newline);
 311
 312           if (CommentState == LCS_InsideCComment)
 313             skipLineStartingDecorations();
 314           return;
 315
 316       default:
 317         return formTextToken(T, skipTextToken());
 318     }
 319   };
 320
 321   if (!ParseCommands)
 322     return HandleNonCommandToken();
 323
 324   switch (State) {
 325   case LS_Normal:
 326     break;
 327   case LS_VerbatimBlockFirstLine:
 328     lexVerbatimBlockFirstLine(T);
 329     return;
 330   case LS_VerbatimBlockBody:
 331     lexVerbatimBlockBody(T);
 332     return;
 333   case LS_VerbatimLineText:
 334     lexVerbatimLineText(T);
 335     return;
 336   case LS_HTMLStartTag:
 337     lexHTMLStartTag(T);
 338     return;
 339   case LS_HTMLEndTag:
 340     lexHTMLEndTag(T);
 341     return;
 342   }
 343
 344   assert(State == LS_Normal);
 345   const char *TokenPtr = BufferPtr;
 346   assert(TokenPtr < CommentEnd);
 347   switch(*TokenPtr) {
 348     case '\\':
 349     case '@': {
 350       // Commands that start with a backslash and commands that start with
 351       // 'at' have equivalent semantics.  But we keep information about the
 352       // exact syntax in AST for comments.
 353       tok::TokenKind CommandKind =
 354           (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
 355       TokenPtr++;
 356       if (TokenPtr == CommentEnd) {
 357         formTextToken(T, TokenPtr);
 358         return;
 359       }
 360       char C = *TokenPtr;
 361       switch (C) {
 362       default:
 363         break;
 364
 365       case '\\': case '@': case '&': case '$':
 366       case '#':  case '<': case '>': case '%':
 367       case '\"': case '.': case ':':
 368         // This is one of \\ \@ \& \$ etc escape sequences.
 369         TokenPtr++;
 370         if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
 371           // This is the \:: escape sequence.
 372           TokenPtr++;
 373         }
 374         StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
 375         formTokenWithChars(T, TokenPtr, tok::text);
 376         T.setText(UnescapedText);
 377         return;
 378       }
 379
 380       // Don't make zero-length commands.
 381       if (!isCommandNameStartCharacter(*TokenPtr)) {
 382         formTextToken(T, TokenPtr);
 383         return;
 384       }
 385
 386       TokenPtr = skipCommandName(TokenPtr, CommentEnd);
 387       unsigned Length = TokenPtr - (BufferPtr + 1);
 388
 389       // Hardcoded support for lexing LaTeX formula commands
 390       // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
 391       if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
 392         C = *TokenPtr;
 393         if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
 394             C == '{' || C == '}') {
 395           TokenPtr++;
 396           Length++;
 397         }
 398       }
 399
 400       StringRef CommandName(BufferPtr + 1, Length);
 401
 402       const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
 403       if (!Info) {
 404         if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
 405           StringRef CorrectedName = Info->Name;
 406           SourceLocation Loc = getSourceLocation(BufferPtr);
 407           SourceLocation EndLoc = getSourceLocation(TokenPtr);
 408           SourceRange FullRange = SourceRange(Loc, EndLoc);
 409           SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
 410           Diag(Loc, diag::warn_correct_comment_command_name)
 411             << FullRange << CommandName << CorrectedName
 412             << FixItHint::CreateReplacement(CommandRange, CorrectedName);
 413         } else {
 414           formTokenWithChars(T, TokenPtr, tok::unknown_command);
 415           T.setUnknownCommandName(CommandName);
 416           Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
 417               << SourceRange(T.getLocation(), T.getEndLocation());
 418           return;
 419         }
 420       }
 421       if (Info->IsVerbatimBlockCommand) {
 422         setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
 423         return;
 424       }
 425       if (Info->IsVerbatimLineCommand) {
 426         setupAndLexVerbatimLine(T, TokenPtr, Info);
 427         return;
 428       }
 429       formTokenWithChars(T, TokenPtr, CommandKind);
 430       T.setCommandID(Info->getID());
 431       return;
 432     }
 433
 434     case '&':
 435       lexHTMLCharacterReference(T);
 436       return;
 437
 438     case '<': {
 439       TokenPtr++;
 440       if (TokenPtr == CommentEnd) {
 441         formTextToken(T, TokenPtr);
 442         return;
 443       }
 444       const char C = *TokenPtr;
 445       if (isHTMLIdentifierStartingCharacter(C))
 446         setupAndLexHTMLStartTag(T);
 447       else if (C == '/')
 448         setupAndLexHTMLEndTag(T);
 449       else
 450         formTextToken(T, TokenPtr);
 451       return;
 452     }
 453
 454     default:
 455       return HandleNonCommandToken();
 456   }
 457 }
 458
 459 void Lexer::setupAndLexVerbatimBlock(Token &T,
 460                                      const char *TextBegin,
 461                                      char Marker, const CommandInfo *Info) {
 462   assert(Info->IsVerbatimBlockCommand);
 463
 464   VerbatimBlockEndCommandName.clear();
 465   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
 466   VerbatimBlockEndCommandName.append(Info->EndCommandName);
 467
 468   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
 469   T.setVerbatimBlockID(Info->getID());
 470
 471   // If there is a newline following the verbatim opening command, skip the
 472   // newline so that we don't create an tok::verbatim_block_line with empty
 473   // text content.
 474   if (BufferPtr != CommentEnd &&
 475       isVerticalWhitespace(*BufferPtr)) {
 476     BufferPtr = skipNewline(BufferPtr, CommentEnd);
 477     State = LS_VerbatimBlockBody;
 478     return;
 479   }
 480
 481   State = LS_VerbatimBlockFirstLine;
 482 }
 483
 484 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
 485 again:
 486   assert(BufferPtr < CommentEnd);
 487
 488   // FIXME: It would be better to scan the text once, finding either the block
 489   // end command or newline.
 490   //
 491   // Extract current line.
 492   const char *Newline = findNewline(BufferPtr, CommentEnd);
 493   StringRef Line(BufferPtr, Newline - BufferPtr);
 494
 495   // Look for end command in current line.
 496   size_t Pos = Line.find(VerbatimBlockEndCommandName);
 497   const char *TextEnd;
 498   const char *NextLine;
 499   if (Pos == StringRef::npos) {
 500     // Current line is completely verbatim.
 501     TextEnd = Newline;
 502     NextLine = skipNewline(Newline, CommentEnd);
 503   } else if (Pos == 0) {
 504     // Current line contains just an end command.
 505     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
 506     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
 507     formTokenWithChars(T, End, tok::verbatim_block_end);
 508     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
 509     State = LS_Normal;
 510     return;
 511   } else {
 512     // There is some text, followed by end command.  Extract text first.
 513     TextEnd = BufferPtr + Pos;
 514     NextLine = TextEnd;
 515     // If there is only whitespace before end command, skip whitespace.
 516     if (isWhitespace(BufferPtr, TextEnd)) {
 517       BufferPtr = TextEnd;
 518       goto again;
 519     }
 520   }
 521
 522   StringRef Text(BufferPtr, TextEnd - BufferPtr);
 523   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
 524   T.setVerbatimBlockText(Text);
 525
 526   State = LS_VerbatimBlockBody;
 527 }
 528
 529 void Lexer::lexVerbatimBlockBody(Token &T) {
 530   assert(State == LS_VerbatimBlockBody);
 531
 532   if (CommentState == LCS_InsideCComment)
 533     skipLineStartingDecorations();
 534
 535   if (BufferPtr == CommentEnd) {
 536     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
 537     T.setVerbatimBlockText("");
 538     return;
 539   }
 540
 541   lexVerbatimBlockFirstLine(T);
 542 }
 543
 544 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
 545                                     const CommandInfo *Info) {
 546   assert(Info->IsVerbatimLineCommand);
 547   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
 548   T.setVerbatimLineID(Info->getID());
 549
 550   State = LS_VerbatimLineText;
 551 }
 552
 553 void Lexer::lexVerbatimLineText(Token &T) {
 554   assert(State == LS_VerbatimLineText);
 555
 556   // Extract current line.
 557   const char *Newline = findNewline(BufferPtr, CommentEnd);
 558   StringRef Text(BufferPtr, Newline - BufferPtr);
 559   formTokenWithChars(T, Newline, tok::verbatim_line_text);
 560   T.setVerbatimLineText(Text);
 561
 562   State = LS_Normal;
 563 }
 564
 565 void Lexer::lexHTMLCharacterReference(Token &T) {
 566   const char *TokenPtr = BufferPtr;
 567   assert(*TokenPtr == '&');
 568   TokenPtr++;
 569   if (TokenPtr == CommentEnd) {
 570     formTextToken(T, TokenPtr);
 571     return;
 572   }
 573   const char *NamePtr;
 574   bool isNamed = false;
 575   bool isDecimal = false;
 576   char C = *TokenPtr;
 577   if (isHTMLNamedCharacterReferenceCharacter(C)) {
 578     NamePtr = TokenPtr;
 579     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
 580     isNamed = true;
 581   } else if (C == '#') {
 582     TokenPtr++;
 583     if (TokenPtr == CommentEnd) {
 584       formTextToken(T, TokenPtr);
 585       return;
 586     }
 587     C = *TokenPtr;
 588     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
 589       NamePtr = TokenPtr;
 590       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
 591       isDecimal = true;
 592     } else if (C == 'x' || C == 'X') {
 593       TokenPtr++;
 594       NamePtr = TokenPtr;
 595       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
 596     } else {
 597       formTextToken(T, TokenPtr);
 598       return;
 599     }
 600   } else {
 601     formTextToken(T, TokenPtr);
 602     return;
 603   }
 604   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
 605       *TokenPtr != ';') {
 606     formTextToken(T, TokenPtr);
 607     return;
 608   }
 609   StringRef Name(NamePtr, TokenPtr - NamePtr);
 610   TokenPtr++; // Skip semicolon.
 611   StringRef Resolved;
 612   if (isNamed)
 613     Resolved = resolveHTMLNamedCharacterReference(Name);
 614   else if (isDecimal)
 615     Resolved = resolveHTMLDecimalCharacterReference(Name);
 616   else
 617     Resolved = resolveHTMLHexCharacterReference(Name);
 618
 619   if (Resolved.empty()) {
 620     formTextToken(T, TokenPtr);
 621     return;
 622   }
 623   formTokenWithChars(T, TokenPtr, tok::text);
 624   T.setText(Resolved);
 625 }
 626
 627 void Lexer::setupAndLexHTMLStartTag(Token &T) {
 628   assert(BufferPtr[0] == '<' &&
 629          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
 630   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
 631   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
 632   if (!isHTMLTagName(Name)) {
 633     formTextToken(T, TagNameEnd);
 634     return;
 635   }
 636
 637   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
 638   T.setHTMLTagStartName(Name);
 639
 640   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
 641
 642   const char C = *BufferPtr;
 643   if (BufferPtr != CommentEnd &&
 644       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
 645     State = LS_HTMLStartTag;
 646 }
 647
 648 void Lexer::lexHTMLStartTag(Token &T) {
 649   assert(State == LS_HTMLStartTag);
 650
 651   const char *TokenPtr = BufferPtr;
 652   char C = *TokenPtr;
 653   if (isHTMLIdentifierCharacter(C)) {
 654     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
 655     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
 656     formTokenWithChars(T, TokenPtr, tok::html_ident);
 657     T.setHTMLIdent(Ident);
 658   } else {
 659     switch (C) {
 660     case '=':
 661       TokenPtr++;
 662       formTokenWithChars(T, TokenPtr, tok::html_equals);
 663       break;
 664     case '\"':
 665     case '\'': {
 666       const char *OpenQuote = TokenPtr;
 667       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
 668       const char *ClosingQuote = TokenPtr;
 669       if (TokenPtr != CommentEnd) // Skip closing quote.
 670         TokenPtr++;
 671       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
 672       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
 673                                       ClosingQuote - (OpenQuote + 1)));
 674       break;
 675     }
 676     case '>':
 677       TokenPtr++;
 678       formTokenWithChars(T, TokenPtr, tok::html_greater);
 679       State = LS_Normal;
 680       return;
 681     case '/':
 682       TokenPtr++;
 683       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
 684         TokenPtr++;
 685         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
 686       } else
 687         formTextToken(T, TokenPtr);
 688
 689       State = LS_Normal;
 690       return;
 691     }
 692   }
 693
 694   // Now look ahead and return to normal state if we don't see any HTML tokens
 695   // ahead.
 696   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
 697   if (BufferPtr == CommentEnd) {
 698     State = LS_Normal;
 699     return;
 700   }
 701
 702   C = *BufferPtr;
 703   if (!isHTMLIdentifierStartingCharacter(C) &&
 704       C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
 705     State = LS_Normal;
 706     return;
 707   }
 708 }
 709
 710 void Lexer::setupAndLexHTMLEndTag(Token &T) {
 711   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
 712
 713   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
 714   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
 715   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
 716   if (!isHTMLTagName(Name)) {
 717     formTextToken(T, TagNameEnd);
 718     return;
 719   }
 720
 721   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
 722
 723   formTokenWithChars(T, End, tok::html_end_tag);
 724   T.setHTMLTagEndName(Name);
 725
 726   if (BufferPtr != CommentEnd && *BufferPtr == '>')
 727     State = LS_HTMLEndTag;
 728 }
 729
 730 void Lexer::lexHTMLEndTag(Token &T) {
 731   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
 732
 733   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
 734   State = LS_Normal;
 735 }
 736
 737 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
 738              const CommandTraits &Traits, SourceLocation FileLoc,
 739              const char *BufferStart, const char *BufferEnd, bool ParseCommands)
 740     : Allocator(Allocator), Diags(Diags), Traits(Traits),
 741       BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
 742       FileLoc(FileLoc), ParseCommands(ParseCommands),
 743       CommentState(LCS_BeforeComment), State(LS_Normal) {}
 744
 745 void Lexer::lex(Token &T) {
 746 again:
 747   switch (CommentState) {
 748   case LCS_BeforeComment:
 749     if (BufferPtr == BufferEnd) {
 750       formTokenWithChars(T, BufferPtr, tok::eof);
 751       return;
 752     }
 753
 754     assert(*BufferPtr == '/');
 755     BufferPtr++; // Skip first slash.
 756     switch(*BufferPtr) {
 757     case '/': { // BCPL comment.
 758       BufferPtr++; // Skip second slash.
 759
 760       if (BufferPtr != BufferEnd) {
 761         // Skip Doxygen magic marker, if it is present.
 762         // It might be missing because of a typo //< or /*<, or because we
 763         // merged this non-Doxygen comment into a bunch of Doxygen comments
 764         // around it: /** ... */ /* ... */ /** ... */
 765         const char C = *BufferPtr;
 766         if (C == '/' || C == '!')
 767           BufferPtr++;
 768       }
 769
 770       // Skip less-than symbol that marks trailing comments.
 771       // Skip it even if the comment is not a Doxygen one, because //< and /*<
 772       // are frequent typos.
 773       if (BufferPtr != BufferEnd && *BufferPtr == '<')
 774         BufferPtr++;
 775
 776       CommentState = LCS_InsideBCPLComment;
 777       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
 778         State = LS_Normal;
 779       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
 780       goto again;
 781     }
 782     case '*': { // C comment.
 783       BufferPtr++; // Skip star.
 784
 785       // Skip Doxygen magic marker.
 786       const char C = *BufferPtr;
 787       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
 788         BufferPtr++;
 789
 790       // Skip less-than symbol that marks trailing comments.
 791       if (BufferPtr != BufferEnd && *BufferPtr == '<')
 792         BufferPtr++;
 793
 794       CommentState = LCS_InsideCComment;
 795       State = LS_Normal;
 796       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
 797       goto again;
 798     }
 799     default:
 800       llvm_unreachable("second character of comment should be '/' or '*'");
 801     }
 802
 803   case LCS_BetweenComments: {
 804     // Consecutive comments are extracted only if there is only whitespace
 805     // between them.  So we can search for the start of the next comment.
 806     const char *EndWhitespace = BufferPtr;
 807     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
 808       EndWhitespace++;
 809
 810     // Turn any whitespace between comments (and there is only whitespace
 811     // between them -- guaranteed by comment extraction) into a newline.  We
 812     // have two newlines between C comments in total (first one was synthesized
 813     // after a comment).
 814     formTokenWithChars(T, EndWhitespace, tok::newline);
 815
 816     CommentState = LCS_BeforeComment;
 817     break;
 818   }
 819
 820   case LCS_InsideBCPLComment:
 821   case LCS_InsideCComment:
 822     if (BufferPtr != CommentEnd) {
 823       lexCommentText(T);
 824       break;
 825     } else {
 826       // Skip C comment closing sequence.
 827       if (CommentState == LCS_InsideCComment) {
 828         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
 829         BufferPtr += 2;
 830         assert(BufferPtr <= BufferEnd);
 831
 832         // Synthenize newline just after the C comment, regardless if there is
 833         // actually a newline.
 834         formTokenWithChars(T, BufferPtr, tok::newline);
 835
 836         CommentState = LCS_BetweenComments;
 837         break;
 838       } else {
 839         // Don't synthesized a newline after BCPL comment.
 840         CommentState = LCS_BetweenComments;
 841         goto again;
 842       }
 843     }
 844   }
 845 }
 846
 847 StringRef Lexer::getSpelling(const Token &Tok,
 848                              const SourceManager &SourceMgr) const {
 849   SourceLocation Loc = Tok.getLocation();
 850   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
 851
 852   bool InvalidTemp = false;
 853   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
 854   if (InvalidTemp)
 855     return StringRef();
 856
 857   const char *Begin = File.data() + LocInfo.second;
 858   return StringRef(Begin, Tok.getLength());
 859 }
 860
 861 } // end namespace comments
 862 } // end namespace clang