clang/lib/AST/RawCommentList.cpp

   1 //===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "clang/AST/RawCommentList.h"
  10 #include "clang/AST/ASTContext.h"
  11 #include "clang/AST/Comment.h"
  12 #include "clang/AST/CommentBriefParser.h"
  13 #include "clang/AST/CommentCommandTraits.h"
  14 #include "clang/AST/CommentLexer.h"
  15 #include "clang/AST/CommentParser.h"
  16 #include "clang/AST/CommentSema.h"
  17 #include "clang/Basic/CharInfo.h"
  18 #include "llvm/ADT/STLExtras.h"
  19 #include "llvm/ADT/StringExtras.h"
  20 #include "llvm/Support/Allocator.h"
  21
  22 using namespace clang;
  23
  24 namespace {
  25 /// Get comment kind and bool describing if it is a trailing comment.
  26 std::pair<RawComment::CommentKind, bool> getCommentKind(StringRef Comment,
  27                                                         bool ParseAllComments) {
  28   const size_t MinCommentLength = ParseAllComments ? 2 : 3;
  29   if ((Comment.size() < MinCommentLength) || Comment[0] != '/')
  30     return std::make_pair(RawComment::RCK_Invalid, false);
  31
  32   RawComment::CommentKind K;
  33   if (Comment[1] == '/') {
  34     if (Comment.size() < 3)
  35       return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
  36
  37     if (Comment[2] == '/')
  38       K = RawComment::RCK_BCPLSlash;
  39     else if (Comment[2] == '!')
  40       K = RawComment::RCK_BCPLExcl;
  41     else
  42       return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
  43   } else {
  44     assert(Comment.size() >= 4);
  45
  46     // Comment lexer does not understand escapes in comment markers, so pretend
  47     // that this is not a comment.
  48     if (Comment[1] != '*' ||
  49         Comment[Comment.size() - 2] != '*' ||
  50         Comment[Comment.size() - 1] != '/')
  51       return std::make_pair(RawComment::RCK_Invalid, false);
  52
  53     if (Comment[2] == '*')
  54       K = RawComment::RCK_JavaDoc;
  55     else if (Comment[2] == '!')
  56       K = RawComment::RCK_Qt;
  57     else
  58       return std::make_pair(RawComment::RCK_OrdinaryC, false);
  59   }
  60   const bool TrailingComment = (Comment.size() > 3) && (Comment[3] == '<');
  61   return std::make_pair(K, TrailingComment);
  62 }
  63
  64 bool mergedCommentIsTrailingComment(StringRef Comment) {
  65   return (Comment.size() > 3) && (Comment[3] == '<');
  66 }
  67
  68 /// Returns true if R1 and R2 both have valid locations that start on the same
  69 /// column.
  70 bool commentsStartOnSameColumn(const SourceManager &SM, const RawComment &R1,
  71                                const RawComment &R2) {
  72   SourceLocation L1 = R1.getBeginLoc();
  73   SourceLocation L2 = R2.getBeginLoc();
  74   bool Invalid = false;
  75   unsigned C1 = SM.getPresumedColumnNumber(L1, &Invalid);
  76   if (!Invalid) {
  77     unsigned C2 = SM.getPresumedColumnNumber(L2, &Invalid);
  78     return !Invalid && (C1 == C2);
  79   }
  80   return false;
  81 }
  82 } // unnamed namespace
  83
  84 /// Determines whether there is only whitespace in `Buffer` between `P`
  85 /// and the previous line.
  86 /// \param Buffer The buffer to search in.
  87 /// \param P The offset from the beginning of `Buffer` to start from.
  88 /// \return true if all of the characters in `Buffer` ranging from the closest
  89 /// line-ending character before `P` (or the beginning of `Buffer`) to `P - 1`
  90 /// are whitespace.
  91 static bool onlyWhitespaceOnLineBefore(const char *Buffer, unsigned P) {
  92   // Search backwards until we see linefeed or carriage return.
  93   for (unsigned I = P; I != 0; --I) {
  94     char C = Buffer[I - 1];
  95     if (isVerticalWhitespace(C))
  96       return true;
  97     if (!isHorizontalWhitespace(C))
  98       return false;
  99   }
 100   // We hit the beginning of the buffer.
 101   return true;
 102 }
 103
 104 /// Returns whether `K` is an ordinary comment kind.
 105 static bool isOrdinaryKind(RawComment::CommentKind K) {
 106   return (K == RawComment::RCK_OrdinaryBCPL) ||
 107          (K == RawComment::RCK_OrdinaryC);
 108 }
 109
 110 RawComment::RawComment(const SourceManager &SourceMgr, SourceRange SR,
 111                        const CommentOptions &CommentOpts, bool Merged) :
 112     Range(SR), RawTextValid(false), BriefTextValid(false),
 113     IsAttached(false), IsTrailingComment(false),
 114     IsAlmostTrailingComment(false) {
 115   // Extract raw comment text, if possible.
 116   if (SR.getBegin() == SR.getEnd() || getRawText(SourceMgr).empty()) {
 117     Kind = RCK_Invalid;
 118     return;
 119   }
 120
 121   // Guess comment kind.
 122   std::pair<CommentKind, bool> K =
 123       getCommentKind(RawText, CommentOpts.ParseAllComments);
 124
 125   // Guess whether an ordinary comment is trailing.
 126   if (CommentOpts.ParseAllComments && isOrdinaryKind(K.first)) {
 127     FileID BeginFileID;
 128     unsigned BeginOffset;
 129     std::tie(BeginFileID, BeginOffset) =
 130         SourceMgr.getDecomposedLoc(Range.getBegin());
 131     if (BeginOffset != 0) {
 132       bool Invalid = false;
 133       const char *Buffer =
 134           SourceMgr.getBufferData(BeginFileID, &Invalid).data();
 135       IsTrailingComment |=
 136           (!Invalid && !onlyWhitespaceOnLineBefore(Buffer, BeginOffset));
 137     }
 138   }
 139
 140   if (!Merged) {
 141     Kind = K.first;
 142     IsTrailingComment |= K.second;
 143
 144     IsAlmostTrailingComment = RawText.startswith("//<") ||
 145                                  RawText.startswith("/*<");
 146   } else {
 147     Kind = RCK_Merged;
 148     IsTrailingComment =
 149         IsTrailingComment || mergedCommentIsTrailingComment(RawText);
 150   }
 151 }
 152
 153 StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
 154   FileID BeginFileID;
 155   FileID EndFileID;
 156   unsigned BeginOffset;
 157   unsigned EndOffset;
 158
 159   std::tie(BeginFileID, BeginOffset) =
 160       SourceMgr.getDecomposedLoc(Range.getBegin());
 161   std::tie(EndFileID, EndOffset) = SourceMgr.getDecomposedLoc(Range.getEnd());
 162
 163   const unsigned Length = EndOffset - BeginOffset;
 164   if (Length < 2)
 165     return StringRef();
 166
 167   // The comment can't begin in one file and end in another.
 168   assert(BeginFileID == EndFileID);
 169
 170   bool Invalid = false;
 171   const char *BufferStart = SourceMgr.getBufferData(BeginFileID,
 172                                                     &Invalid).data();
 173   if (Invalid)
 174     return StringRef();
 175
 176   return StringRef(BufferStart + BeginOffset, Length);
 177 }
 178
 179 const char *RawComment::extractBriefText(const ASTContext &Context) const {
 180   // Lazily initialize RawText using the accessor before using it.
 181   (void)getRawText(Context.getSourceManager());
 182
 183   // Since we will be copying the resulting text, all allocations made during
 184   // parsing are garbage after resulting string is formed.  Thus we can use
 185   // a separate allocator for all temporary stuff.
 186   llvm::BumpPtrAllocator Allocator;
 187
 188   comments::Lexer L(Allocator, Context.getDiagnostics(),
 189                     Context.getCommentCommandTraits(),
 190                     Range.getBegin(),
 191                     RawText.begin(), RawText.end());
 192   comments::BriefParser P(L, Context.getCommentCommandTraits());
 193
 194   const std::string Result = P.Parse();
 195   const unsigned BriefTextLength = Result.size();
 196   char *BriefTextPtr = new (Context) char[BriefTextLength + 1];
 197   memcpy(BriefTextPtr, Result.c_str(), BriefTextLength + 1);
 198   BriefText = BriefTextPtr;
 199   BriefTextValid = true;
 200
 201   return BriefTextPtr;
 202 }
 203
 204 comments::FullComment *RawComment::parse(const ASTContext &Context,
 205                                          const Preprocessor *PP,
 206                                          const Decl *D) const {
 207   // Lazily initialize RawText using the accessor before using it.
 208   (void)getRawText(Context.getSourceManager());
 209
 210   comments::Lexer L(Context.getAllocator(), Context.getDiagnostics(),
 211                     Context.getCommentCommandTraits(),
 212                     getSourceRange().getBegin(),
 213                     RawText.begin(), RawText.end());
 214   comments::Sema S(Context.getAllocator(), Context.getSourceManager(),
 215                    Context.getDiagnostics(),
 216                    Context.getCommentCommandTraits(),
 217                    PP);
 218   S.setDecl(D);
 219   comments::Parser P(L, S, Context.getAllocator(), Context.getSourceManager(),
 220                      Context.getDiagnostics(),
 221                      Context.getCommentCommandTraits());
 222
 223   return P.parseFullComment();
 224 }
 225
 226 static bool onlyWhitespaceBetween(SourceManager &SM,
 227                                   SourceLocation Loc1, SourceLocation Loc2,
 228                                   unsigned MaxNewlinesAllowed) {
 229   std::pair<FileID, unsigned> Loc1Info = SM.getDecomposedLoc(Loc1);
 230   std::pair<FileID, unsigned> Loc2Info = SM.getDecomposedLoc(Loc2);
 231
 232   // Question does not make sense if locations are in different files.
 233   if (Loc1Info.first != Loc2Info.first)
 234     return false;
 235
 236   bool Invalid = false;
 237   const char *Buffer = SM.getBufferData(Loc1Info.first, &Invalid).data();
 238   if (Invalid)
 239     return false;
 240
 241   unsigned NumNewlines = 0;
 242   assert(Loc1Info.second <= Loc2Info.second && "Loc1 after Loc2!");
 243   // Look for non-whitespace characters and remember any newlines seen.
 244   for (unsigned I = Loc1Info.second; I != Loc2Info.second; ++I) {
 245     switch (Buffer[I]) {
 246     default:
 247       return false;
 248     case ' ':
 249     case '\t':
 250     case '\f':
 251     case '\v':
 252       break;
 253     case '\r':
 254     case '\n':
 255       ++NumNewlines;
 256
 257       // Check if we have found more than the maximum allowed number of
 258       // newlines.
 259       if (NumNewlines > MaxNewlinesAllowed)
 260         return false;
 261
 262       // Collapse \r\n and \n\r into a single newline.
 263       if (I + 1 != Loc2Info.second &&
 264           (Buffer[I + 1] == '\n' || Buffer[I + 1] == '\r') &&
 265           Buffer[I] != Buffer[I + 1])
 266         ++I;
 267       break;
 268     }
 269   }
 270
 271   return true;
 272 }
 273
 274 void RawCommentList::addComment(const RawComment &RC,
 275                                 const CommentOptions &CommentOpts,
 276                                 llvm::BumpPtrAllocator &Allocator) {
 277   if (RC.isInvalid())
 278     return;
 279
 280   // Ordinary comments are not interesting for us.
 281   if (RC.isOrdinary() && !CommentOpts.ParseAllComments)
 282     return;
 283
 284   std::pair<FileID, unsigned> Loc =
 285       SourceMgr.getDecomposedLoc(RC.getBeginLoc());
 286
 287   const FileID CommentFile = Loc.first;
 288   const unsigned CommentOffset = Loc.second;
 289
 290   // If this is the first Doxygen comment, save it (because there isn't
 291   // anything to merge it with).
 292   if (OrderedComments[CommentFile].empty()) {
 293     OrderedComments[CommentFile][CommentOffset] =
 294         new (Allocator) RawComment(RC);
 295     return;
 296   }
 297
 298   const RawComment &C1 = *OrderedComments[CommentFile].rbegin()->second;
 299   const RawComment &C2 = RC;
 300
 301   // Merge comments only if there is only whitespace between them.
 302   // Can't merge trailing and non-trailing comments unless the second is
 303   // non-trailing ordinary in the same column, as in the case:
 304   //   int x; // documents x
 305   //          // more text
 306   // versus:
 307   //   int x; // documents x
 308   //   int y; // documents y
 309   // or:
 310   //   int x; // documents x
 311   //   // documents y
 312   //   int y;
 313   // Merge comments if they are on same or consecutive lines.
 314   if ((C1.isTrailingComment() == C2.isTrailingComment() ||
 315        (C1.isTrailingComment() && !C2.isTrailingComment() &&
 316         isOrdinaryKind(C2.getKind()) &&
 317         commentsStartOnSameColumn(SourceMgr, C1, C2))) &&
 318       onlyWhitespaceBetween(SourceMgr, C1.getEndLoc(), C2.getBeginLoc(),
 319                             /*MaxNewlinesAllowed=*/1)) {
 320     SourceRange MergedRange(C1.getBeginLoc(), C2.getEndLoc());
 321     *OrderedComments[CommentFile].rbegin()->second =
 322         RawComment(SourceMgr, MergedRange, CommentOpts, true);
 323   } else {
 324     OrderedComments[CommentFile][CommentOffset] =
 325         new (Allocator) RawComment(RC);
 326   }
 327 }
 328
 329 const std::map<unsigned, RawComment *> *
 330 RawCommentList::getCommentsInFile(FileID File) const {
 331   auto CommentsInFile = OrderedComments.find(File);
 332   if (CommentsInFile == OrderedComments.end())
 333     return nullptr;
 334
 335   return &CommentsInFile->second;
 336 }
 337
 338 bool RawCommentList::empty() const { return OrderedComments.empty(); }
 339
 340 unsigned RawCommentList::getCommentBeginLine(RawComment *C, FileID File,
 341                                              unsigned Offset) const {
 342   auto Cached = CommentBeginLine.find(C);
 343   if (Cached != CommentBeginLine.end())
 344     return Cached->second;
 345   const unsigned Line = SourceMgr.getLineNumber(File, Offset);
 346   CommentBeginLine[C] = Line;
 347   return Line;
 348 }
 349
 350 unsigned RawCommentList::getCommentEndOffset(RawComment *C) const {
 351   auto Cached = CommentEndOffset.find(C);
 352   if (Cached != CommentEndOffset.end())
 353     return Cached->second;
 354   const unsigned Offset =
 355       SourceMgr.getDecomposedLoc(C->getSourceRange().getEnd()).second;
 356   CommentEndOffset[C] = Offset;
 357   return Offset;
 358 }
 359
 360 std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
 361                                          DiagnosticsEngine &Diags) const {
 362   llvm::StringRef CommentText = getRawText(SourceMgr);
 363   if (CommentText.empty())
 364     return "";
 365
 366   std::string Result;
 367   for (const RawComment::CommentLine &Line :
 368        getFormattedLines(SourceMgr, Diags))
 369     Result += Line.Text + "\n";
 370
 371   auto LastChar = Result.find_last_not_of('\n');
 372   Result.erase(LastChar + 1, Result.size());
 373
 374   return Result;
 375 }
 376
 377 std::vector<RawComment::CommentLine>
 378 RawComment::getFormattedLines(const SourceManager &SourceMgr,
 379                               DiagnosticsEngine &Diags) const {
 380   llvm::StringRef CommentText = getRawText(SourceMgr);
 381   if (CommentText.empty())
 382     return {};
 383
 384   llvm::BumpPtrAllocator Allocator;
 385   // We do not parse any commands, so CommentOptions are ignored by
 386   // comments::Lexer. Therefore, we just use default-constructed options.
 387   CommentOptions DefOpts;
 388   comments::CommandTraits EmptyTraits(Allocator, DefOpts);
 389   comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
 390                     CommentText.begin(), CommentText.end(),
 391                     /*ParseCommands=*/false);
 392
 393   std::vector<RawComment::CommentLine> Result;
 394   // A column number of the first non-whitespace token in the comment text.
 395   // We skip whitespace up to this column, but keep the whitespace after this
 396   // column. IndentColumn is calculated when lexing the first line and reused
 397   // for the rest of lines.
 398   unsigned IndentColumn = 0;
 399
 400   // Record the line number of the last processed comment line.
 401   // For block-style comments, an extra newline token will be produced after
 402   // the end-comment marker, e.g.:
 403   //   /** This is a multi-line comment block.
 404   //       The lexer will produce two newline tokens here > */
 405   // previousLine will record the line number when we previously saw a newline
 406   // token and recorded a comment line. If we see another newline token on the
 407   // same line, don't record anything in between.
 408   unsigned PreviousLine = 0;
 409
 410   // Processes one line of the comment and adds it to the result.
 411   // Handles skipping the indent at the start of the line.
 412   // Returns false when eof is reached and true otherwise.
 413   auto LexLine = [&](bool IsFirstLine) -> bool {
 414     comments::Token Tok;
 415     // Lex the first token on the line. We handle it separately, because we to
 416     // fix up its indentation.
 417     L.lex(Tok);
 418     if (Tok.is(comments::tok::eof))
 419       return false;
 420     if (Tok.is(comments::tok::newline)) {
 421       PresumedLoc Loc = SourceMgr.getPresumedLoc(Tok.getLocation());
 422       if (Loc.getLine() != PreviousLine) {
 423         Result.emplace_back("", Loc, Loc);
 424         PreviousLine = Loc.getLine();
 425       }
 426       return true;
 427     }
 428     SmallString<124> Line;
 429     llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
 430     bool LocInvalid = false;
 431     unsigned TokColumn =
 432         SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
 433     assert(!LocInvalid && "getFormattedText for invalid location");
 434
 435     // Amount of leading whitespace in TokText.
 436     size_t WhitespaceLen = TokText.find_first_not_of(" \t");
 437     if (WhitespaceLen == StringRef::npos)
 438       WhitespaceLen = TokText.size();
 439     // Remember the amount of whitespace we skipped in the first line to remove
 440     // indent up to that column in the following lines.
 441     if (IsFirstLine)
 442       IndentColumn = TokColumn + WhitespaceLen;
 443
 444     // Amount of leading whitespace we actually want to skip.
 445     // For the first line we skip all the whitespace.
 446     // For the rest of the lines, we skip whitespace up to IndentColumn.
 447     unsigned SkipLen =
 448         IsFirstLine
 449             ? WhitespaceLen
 450             : std::min<size_t>(
 451                   WhitespaceLen,
 452                   std::max<int>(static_cast<int>(IndentColumn) - TokColumn, 0));
 453     llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
 454     Line += Trimmed;
 455     // Get the beginning location of the adjusted comment line.
 456     PresumedLoc Begin =
 457         SourceMgr.getPresumedLoc(Tok.getLocation().getLocWithOffset(SkipLen));
 458
 459     // Lex all tokens in the rest of the line.
 460     for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
 461       if (Tok.is(comments::tok::newline)) {
 462         // Get the ending location of the comment line.
 463         PresumedLoc End = SourceMgr.getPresumedLoc(Tok.getLocation());
 464         if (End.getLine() != PreviousLine) {
 465           Result.emplace_back(Line, Begin, End);
 466           PreviousLine = End.getLine();
 467         }
 468         return true;
 469       }
 470       Line += L.getSpelling(Tok, SourceMgr);
 471     }
 472     PresumedLoc End = SourceMgr.getPresumedLoc(Tok.getLocation());
 473     Result.emplace_back(Line, Begin, End);
 474     // We've reached the end of file token.
 475     return false;
 476   };
 477
 478   // Process first line separately to remember indent for the following lines.
 479   if (!LexLine(/*IsFirstLine=*/true))
 480     return Result;
 481   // Process the rest of the lines.
 482   while (LexLine(/*IsFirstLine=*/false))
 483     ;
 484   return Result;
 485 }