1 //===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "clang/AST/RawCommentList.h"
10 #include "clang/AST/ASTContext.h"
11 #include "clang/AST/Comment.h"
12 #include "clang/AST/CommentBriefParser.h"
13 #include "clang/AST/CommentCommandTraits.h"
14 #include "clang/AST/CommentLexer.h"
15 #include "clang/AST/CommentParser.h"
16 #include "clang/AST/CommentSema.h"
17 #include "clang/Basic/CharInfo.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/StringExtras.h"
20 #include "llvm/Support/Allocator.h"
22 using namespace clang
;
25 /// Get comment kind and bool describing if it is a trailing comment.
26 std::pair
<RawComment::CommentKind
, bool> getCommentKind(StringRef Comment
,
27 bool ParseAllComments
) {
28 const size_t MinCommentLength
= ParseAllComments
? 2 : 3;
29 if ((Comment
.size() < MinCommentLength
) || Comment
[0] != '/')
30 return std::make_pair(RawComment::RCK_Invalid
, false);
32 RawComment::CommentKind K
;
33 if (Comment
[1] == '/') {
34 if (Comment
.size() < 3)
35 return std::make_pair(RawComment::RCK_OrdinaryBCPL
, false);
37 if (Comment
[2] == '/')
38 K
= RawComment::RCK_BCPLSlash
;
39 else if (Comment
[2] == '!')
40 K
= RawComment::RCK_BCPLExcl
;
42 return std::make_pair(RawComment::RCK_OrdinaryBCPL
, false);
44 assert(Comment
.size() >= 4);
46 // Comment lexer does not understand escapes in comment markers, so pretend
47 // that this is not a comment.
48 if (Comment
[1] != '*' ||
49 Comment
[Comment
.size() - 2] != '*' ||
50 Comment
[Comment
.size() - 1] != '/')
51 return std::make_pair(RawComment::RCK_Invalid
, false);
53 if (Comment
[2] == '*')
54 K
= RawComment::RCK_JavaDoc
;
55 else if (Comment
[2] == '!')
56 K
= RawComment::RCK_Qt
;
58 return std::make_pair(RawComment::RCK_OrdinaryC
, false);
60 const bool TrailingComment
= (Comment
.size() > 3) && (Comment
[3] == '<');
61 return std::make_pair(K
, TrailingComment
);
64 bool mergedCommentIsTrailingComment(StringRef Comment
) {
65 return (Comment
.size() > 3) && (Comment
[3] == '<');
68 /// Returns true if R1 and R2 both have valid locations that start on the same
70 bool commentsStartOnSameColumn(const SourceManager
&SM
, const RawComment
&R1
,
71 const RawComment
&R2
) {
72 SourceLocation L1
= R1
.getBeginLoc();
73 SourceLocation L2
= R2
.getBeginLoc();
75 unsigned C1
= SM
.getPresumedColumnNumber(L1
, &Invalid
);
77 unsigned C2
= SM
.getPresumedColumnNumber(L2
, &Invalid
);
78 return !Invalid
&& (C1
== C2
);
82 } // unnamed namespace
84 /// Determines whether there is only whitespace in `Buffer` between `P`
85 /// and the previous line.
86 /// \param Buffer The buffer to search in.
87 /// \param P The offset from the beginning of `Buffer` to start from.
88 /// \return true if all of the characters in `Buffer` ranging from the closest
89 /// line-ending character before `P` (or the beginning of `Buffer`) to `P - 1`
91 static bool onlyWhitespaceOnLineBefore(const char *Buffer
, unsigned P
) {
92 // Search backwards until we see linefeed or carriage return.
93 for (unsigned I
= P
; I
!= 0; --I
) {
94 char C
= Buffer
[I
- 1];
95 if (isVerticalWhitespace(C
))
97 if (!isHorizontalWhitespace(C
))
100 // We hit the beginning of the buffer.
104 /// Returns whether `K` is an ordinary comment kind.
105 static bool isOrdinaryKind(RawComment::CommentKind K
) {
106 return (K
== RawComment::RCK_OrdinaryBCPL
) ||
107 (K
== RawComment::RCK_OrdinaryC
);
110 RawComment::RawComment(const SourceManager
&SourceMgr
, SourceRange SR
,
111 const CommentOptions
&CommentOpts
, bool Merged
) :
112 Range(SR
), RawTextValid(false), BriefTextValid(false),
113 IsAttached(false), IsTrailingComment(false),
114 IsAlmostTrailingComment(false) {
115 // Extract raw comment text, if possible.
116 if (SR
.getBegin() == SR
.getEnd() || getRawText(SourceMgr
).empty()) {
121 // Guess comment kind.
122 std::pair
<CommentKind
, bool> K
=
123 getCommentKind(RawText
, CommentOpts
.ParseAllComments
);
125 // Guess whether an ordinary comment is trailing.
126 if (CommentOpts
.ParseAllComments
&& isOrdinaryKind(K
.first
)) {
128 unsigned BeginOffset
;
129 std::tie(BeginFileID
, BeginOffset
) =
130 SourceMgr
.getDecomposedLoc(Range
.getBegin());
131 if (BeginOffset
!= 0) {
132 bool Invalid
= false;
134 SourceMgr
.getBufferData(BeginFileID
, &Invalid
).data();
136 (!Invalid
&& !onlyWhitespaceOnLineBefore(Buffer
, BeginOffset
));
142 IsTrailingComment
|= K
.second
;
144 IsAlmostTrailingComment
= RawText
.startswith("//<") ||
145 RawText
.startswith("/*<");
149 IsTrailingComment
|| mergedCommentIsTrailingComment(RawText
);
153 StringRef
RawComment::getRawTextSlow(const SourceManager
&SourceMgr
) const {
156 unsigned BeginOffset
;
159 std::tie(BeginFileID
, BeginOffset
) =
160 SourceMgr
.getDecomposedLoc(Range
.getBegin());
161 std::tie(EndFileID
, EndOffset
) = SourceMgr
.getDecomposedLoc(Range
.getEnd());
163 const unsigned Length
= EndOffset
- BeginOffset
;
167 // The comment can't begin in one file and end in another.
168 assert(BeginFileID
== EndFileID
);
170 bool Invalid
= false;
171 const char *BufferStart
= SourceMgr
.getBufferData(BeginFileID
,
176 return StringRef(BufferStart
+ BeginOffset
, Length
);
179 const char *RawComment::extractBriefText(const ASTContext
&Context
) const {
180 // Lazily initialize RawText using the accessor before using it.
181 (void)getRawText(Context
.getSourceManager());
183 // Since we will be copying the resulting text, all allocations made during
184 // parsing are garbage after resulting string is formed. Thus we can use
185 // a separate allocator for all temporary stuff.
186 llvm::BumpPtrAllocator Allocator
;
188 comments::Lexer
L(Allocator
, Context
.getDiagnostics(),
189 Context
.getCommentCommandTraits(),
191 RawText
.begin(), RawText
.end());
192 comments::BriefParser
P(L
, Context
.getCommentCommandTraits());
194 const std::string Result
= P
.Parse();
195 const unsigned BriefTextLength
= Result
.size();
196 char *BriefTextPtr
= new (Context
) char[BriefTextLength
+ 1];
197 memcpy(BriefTextPtr
, Result
.c_str(), BriefTextLength
+ 1);
198 BriefText
= BriefTextPtr
;
199 BriefTextValid
= true;
204 comments::FullComment
*RawComment::parse(const ASTContext
&Context
,
205 const Preprocessor
*PP
,
206 const Decl
*D
) const {
207 // Lazily initialize RawText using the accessor before using it.
208 (void)getRawText(Context
.getSourceManager());
210 comments::Lexer
L(Context
.getAllocator(), Context
.getDiagnostics(),
211 Context
.getCommentCommandTraits(),
212 getSourceRange().getBegin(),
213 RawText
.begin(), RawText
.end());
214 comments::Sema
S(Context
.getAllocator(), Context
.getSourceManager(),
215 Context
.getDiagnostics(),
216 Context
.getCommentCommandTraits(),
219 comments::Parser
P(L
, S
, Context
.getAllocator(), Context
.getSourceManager(),
220 Context
.getDiagnostics(),
221 Context
.getCommentCommandTraits());
223 return P
.parseFullComment();
226 static bool onlyWhitespaceBetween(SourceManager
&SM
,
227 SourceLocation Loc1
, SourceLocation Loc2
,
228 unsigned MaxNewlinesAllowed
) {
229 std::pair
<FileID
, unsigned> Loc1Info
= SM
.getDecomposedLoc(Loc1
);
230 std::pair
<FileID
, unsigned> Loc2Info
= SM
.getDecomposedLoc(Loc2
);
232 // Question does not make sense if locations are in different files.
233 if (Loc1Info
.first
!= Loc2Info
.first
)
236 bool Invalid
= false;
237 const char *Buffer
= SM
.getBufferData(Loc1Info
.first
, &Invalid
).data();
241 unsigned NumNewlines
= 0;
242 assert(Loc1Info
.second
<= Loc2Info
.second
&& "Loc1 after Loc2!");
243 // Look for non-whitespace characters and remember any newlines seen.
244 for (unsigned I
= Loc1Info
.second
; I
!= Loc2Info
.second
; ++I
) {
257 // Check if we have found more than the maximum allowed number of
259 if (NumNewlines
> MaxNewlinesAllowed
)
262 // Collapse \r\n and \n\r into a single newline.
263 if (I
+ 1 != Loc2Info
.second
&&
264 (Buffer
[I
+ 1] == '\n' || Buffer
[I
+ 1] == '\r') &&
265 Buffer
[I
] != Buffer
[I
+ 1])
274 void RawCommentList::addComment(const RawComment
&RC
,
275 const CommentOptions
&CommentOpts
,
276 llvm::BumpPtrAllocator
&Allocator
) {
280 // Ordinary comments are not interesting for us.
281 if (RC
.isOrdinary() && !CommentOpts
.ParseAllComments
)
284 std::pair
<FileID
, unsigned> Loc
=
285 SourceMgr
.getDecomposedLoc(RC
.getBeginLoc());
287 const FileID CommentFile
= Loc
.first
;
288 const unsigned CommentOffset
= Loc
.second
;
290 // If this is the first Doxygen comment, save it (because there isn't
291 // anything to merge it with).
292 if (OrderedComments
[CommentFile
].empty()) {
293 OrderedComments
[CommentFile
][CommentOffset
] =
294 new (Allocator
) RawComment(RC
);
298 const RawComment
&C1
= *OrderedComments
[CommentFile
].rbegin()->second
;
299 const RawComment
&C2
= RC
;
301 // Merge comments only if there is only whitespace between them.
302 // Can't merge trailing and non-trailing comments unless the second is
303 // non-trailing ordinary in the same column, as in the case:
304 // int x; // documents x
307 // int x; // documents x
308 // int y; // documents y
310 // int x; // documents x
313 // Merge comments if they are on same or consecutive lines.
314 if ((C1
.isTrailingComment() == C2
.isTrailingComment() ||
315 (C1
.isTrailingComment() && !C2
.isTrailingComment() &&
316 isOrdinaryKind(C2
.getKind()) &&
317 commentsStartOnSameColumn(SourceMgr
, C1
, C2
))) &&
318 onlyWhitespaceBetween(SourceMgr
, C1
.getEndLoc(), C2
.getBeginLoc(),
319 /*MaxNewlinesAllowed=*/1)) {
320 SourceRange
MergedRange(C1
.getBeginLoc(), C2
.getEndLoc());
321 *OrderedComments
[CommentFile
].rbegin()->second
=
322 RawComment(SourceMgr
, MergedRange
, CommentOpts
, true);
324 OrderedComments
[CommentFile
][CommentOffset
] =
325 new (Allocator
) RawComment(RC
);
329 const std::map
<unsigned, RawComment
*> *
330 RawCommentList::getCommentsInFile(FileID File
) const {
331 auto CommentsInFile
= OrderedComments
.find(File
);
332 if (CommentsInFile
== OrderedComments
.end())
335 return &CommentsInFile
->second
;
338 bool RawCommentList::empty() const { return OrderedComments
.empty(); }
340 unsigned RawCommentList::getCommentBeginLine(RawComment
*C
, FileID File
,
341 unsigned Offset
) const {
342 auto Cached
= CommentBeginLine
.find(C
);
343 if (Cached
!= CommentBeginLine
.end())
344 return Cached
->second
;
345 const unsigned Line
= SourceMgr
.getLineNumber(File
, Offset
);
346 CommentBeginLine
[C
] = Line
;
350 unsigned RawCommentList::getCommentEndOffset(RawComment
*C
) const {
351 auto Cached
= CommentEndOffset
.find(C
);
352 if (Cached
!= CommentEndOffset
.end())
353 return Cached
->second
;
354 const unsigned Offset
=
355 SourceMgr
.getDecomposedLoc(C
->getSourceRange().getEnd()).second
;
356 CommentEndOffset
[C
] = Offset
;
360 std::string
RawComment::getFormattedText(const SourceManager
&SourceMgr
,
361 DiagnosticsEngine
&Diags
) const {
362 llvm::StringRef CommentText
= getRawText(SourceMgr
);
363 if (CommentText
.empty())
367 for (const RawComment::CommentLine
&Line
:
368 getFormattedLines(SourceMgr
, Diags
))
369 Result
+= Line
.Text
+ "\n";
371 auto LastChar
= Result
.find_last_not_of('\n');
372 Result
.erase(LastChar
+ 1, Result
.size());
377 std::vector
<RawComment::CommentLine
>
378 RawComment::getFormattedLines(const SourceManager
&SourceMgr
,
379 DiagnosticsEngine
&Diags
) const {
380 llvm::StringRef CommentText
= getRawText(SourceMgr
);
381 if (CommentText
.empty())
384 llvm::BumpPtrAllocator Allocator
;
385 // We do not parse any commands, so CommentOptions are ignored by
386 // comments::Lexer. Therefore, we just use default-constructed options.
387 CommentOptions DefOpts
;
388 comments::CommandTraits
EmptyTraits(Allocator
, DefOpts
);
389 comments::Lexer
L(Allocator
, Diags
, EmptyTraits
, getSourceRange().getBegin(),
390 CommentText
.begin(), CommentText
.end(),
391 /*ParseCommands=*/false);
393 std::vector
<RawComment::CommentLine
> Result
;
394 // A column number of the first non-whitespace token in the comment text.
395 // We skip whitespace up to this column, but keep the whitespace after this
396 // column. IndentColumn is calculated when lexing the first line and reused
397 // for the rest of lines.
398 unsigned IndentColumn
= 0;
400 // Record the line number of the last processed comment line.
401 // For block-style comments, an extra newline token will be produced after
402 // the end-comment marker, e.g.:
403 // /** This is a multi-line comment block.
404 // The lexer will produce two newline tokens here > */
405 // previousLine will record the line number when we previously saw a newline
406 // token and recorded a comment line. If we see another newline token on the
407 // same line, don't record anything in between.
408 unsigned PreviousLine
= 0;
410 // Processes one line of the comment and adds it to the result.
411 // Handles skipping the indent at the start of the line.
412 // Returns false when eof is reached and true otherwise.
413 auto LexLine
= [&](bool IsFirstLine
) -> bool {
415 // Lex the first token on the line. We handle it separately, because we to
416 // fix up its indentation.
418 if (Tok
.is(comments::tok::eof
))
420 if (Tok
.is(comments::tok::newline
)) {
421 PresumedLoc Loc
= SourceMgr
.getPresumedLoc(Tok
.getLocation());
422 if (Loc
.getLine() != PreviousLine
) {
423 Result
.emplace_back("", Loc
, Loc
);
424 PreviousLine
= Loc
.getLine();
428 SmallString
<124> Line
;
429 llvm::StringRef TokText
= L
.getSpelling(Tok
, SourceMgr
);
430 bool LocInvalid
= false;
432 SourceMgr
.getSpellingColumnNumber(Tok
.getLocation(), &LocInvalid
);
433 assert(!LocInvalid
&& "getFormattedText for invalid location");
435 // Amount of leading whitespace in TokText.
436 size_t WhitespaceLen
= TokText
.find_first_not_of(" \t");
437 if (WhitespaceLen
== StringRef::npos
)
438 WhitespaceLen
= TokText
.size();
439 // Remember the amount of whitespace we skipped in the first line to remove
440 // indent up to that column in the following lines.
442 IndentColumn
= TokColumn
+ WhitespaceLen
;
444 // Amount of leading whitespace we actually want to skip.
445 // For the first line we skip all the whitespace.
446 // For the rest of the lines, we skip whitespace up to IndentColumn.
452 std::max
<int>(static_cast<int>(IndentColumn
) - TokColumn
, 0));
453 llvm::StringRef Trimmed
= TokText
.drop_front(SkipLen
);
455 // Get the beginning location of the adjusted comment line.
457 SourceMgr
.getPresumedLoc(Tok
.getLocation().getLocWithOffset(SkipLen
));
459 // Lex all tokens in the rest of the line.
460 for (L
.lex(Tok
); Tok
.isNot(comments::tok::eof
); L
.lex(Tok
)) {
461 if (Tok
.is(comments::tok::newline
)) {
462 // Get the ending location of the comment line.
463 PresumedLoc End
= SourceMgr
.getPresumedLoc(Tok
.getLocation());
464 if (End
.getLine() != PreviousLine
) {
465 Result
.emplace_back(Line
, Begin
, End
);
466 PreviousLine
= End
.getLine();
470 Line
+= L
.getSpelling(Tok
, SourceMgr
);
472 PresumedLoc End
= SourceMgr
.getPresumedLoc(Tok
.getLocation());
473 Result
.emplace_back(Line
, Begin
, End
);
474 // We've reached the end of file token.
478 // Process first line separately to remember indent for the following lines.
479 if (!LexLine(/*IsFirstLine=*/true))
481 // Process the rest of the lines.
482 while (LexLine(/*IsFirstLine=*/false))