1 //===--- CommentParser.cpp - Doxygen comment parser -----------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "clang/AST/CommentParser.h"
10 #include "clang/AST/CommentCommandTraits.h"
11 #include "clang/AST/CommentDiagnostic.h"
12 #include "clang/AST/CommentSema.h"
13 #include "clang/Basic/CharInfo.h"
14 #include "clang/Basic/SourceManager.h"
15 #include "llvm/Support/ErrorHandling.h"
// Whole-string whitespace test: walks S from begin() to end() and applies the
// single-character isWhitespace() overload to each element.
// NOTE(review): the statements that follow the per-character check (and both
// return statements) are not present in this excerpt.
19 static inline bool isWhitespace(llvm::StringRef S
) {
20 for (StringRef::const_iterator I
= S
.begin(), E
= S
.end(); I
!= E
; ++I
) {
// Tests the current character; the action taken on a non-whitespace
// character is on a line that is not visible here.
21 if (!isWhitespace(*I
))
29 /// Re-lexes a sequence of tok::text tokens.
// NOTE(review): this excerpt omits many original lines (see the embedded line
// numbers), so several member-function headers and statements are missing.
// The comments below describe only the statements that are actually present.
30 class TextTokenRetokenizer
{
// Allocator from which lexWord() and lexDelimitedSeq() obtain storage for the
// text of the tokens they form.
31 llvm::BumpPtrAllocator
&Allocator
;
34 /// This flag is set when there are no more tokens we can fetch from lexer.
35 bool NoMoreInterestingTokens
;
37 /// Token buffer: tokens we have processed and lookahead.
38 SmallVector
<Token
, 16> Toks
;
40 /// A position in \c Toks.
// [BufferStart, BufferEnd) is filled from the current token's getText()
// range, BufferPtr is the read cursor inside that range, and BufferStartLoc
// is the token's getLocation().
42 const char *BufferStart
;
43 const char *BufferEnd
;
44 const char *BufferPtr
;
45 SourceLocation BufferStartLoc
;
49 /// Current position in Toks.
// Body of a predicate whose header is not visible here: true once
// Pos.CurToken has reached (or passed) Toks.size().
53 return Pos
.CurToken
>= Toks
.size();
56 /// Sets up the buffer pointers to point to current token.
// Reads the token at index Pos.CurToken and copies its text range and
// location into Pos; the cursor is reset to the start of the text.
59 const Token
&Tok
= Toks
[Pos
.CurToken
];
61 Pos
.BufferStart
= Tok
.getText().begin();
62 Pos
.BufferEnd
= Tok
.getText().end();
63 Pos
.BufferPtr
= Pos
.BufferStart
;
64 Pos
.BufferStartLoc
= Tok
.getLocation();
// Source location of the read cursor: BufferStartLoc advanced by the
// cursor's character offset from BufferStart.
67 SourceLocation
getSourceLocation() const {
68 const unsigned CharNo
= Pos
.BufferPtr
- Pos
.BufferStart
;
69 return Pos
.BufferStartLoc
.getLocWithOffset(CharNo
);
// Returns the character under the cursor; the cursor must not be at
// BufferEnd. (Function header not visible in this excerpt.)
74 assert(Pos
.BufferPtr
!= Pos
.BufferEnd
);
75 return *Pos
.BufferPtr
;
// Same precondition as above. When the cursor equals BufferEnd, another token
// is requested through addToken() only if isEnd() reports that the buffered
// tokens are exhausted.
// NOTE(review): the statement between the assert and the comparison, and the
// branch bodies, are not visible here.
80 assert(Pos
.BufferPtr
!= Pos
.BufferEnd
);
82 if (Pos
.BufferPtr
== Pos
.BufferEnd
) {
84 if (isEnd() && !addToken())
93 /// Returns true on success, false if there are no interesting tokens to
// Checks the sticky NoMoreInterestingTokens flag first; the action taken when
// it is set is not visible.
96 if (NoMoreInterestingTokens
)
99 if (P
.Tok
.is(tok::newline
)) {
100 // If we see a single newline token between text tokens, skip it.
// The newline is copied into Newline before the parser's current token is
// re-examined; if that token is not tok::text the flag is set.
101 Token Newline
= P
.Tok
;
103 if (P
.Tok
.isNot(tok::text
)) {
105 NoMoreInterestingTokens
= true;
// Second check of the parser's current token: a non-text token also sets the
// flag.
109 if (P
.Tok
.isNot(tok::text
)) {
110 NoMoreInterestingTokens
= true;
// The parser's current token is appended to the buffer.
114 Toks
.push_back(P
.Tok
);
// Special case for the first buffered token; the guarded statement is not
// visible in this excerpt.
116 if (Toks
.size() == 1)
// Examines the character under the cursor with isWhitespace(); the loop
// structure and the consuming statement are not visible here.
121 void consumeWhitespace() {
123 if (isWhitespace(peek()))
// Initializes Result as a tok::text token with the given location, length
// and text.
// NOTE(review): some parameters (Loc, TokLength, Text) are declared on lines
// not present in this excerpt.
130 void formTokenWithChars(Token
&Result
,
132 const char *TokBegin
,
135 Result
.setLocation(Loc
);
136 Result
.setKind(tok::text
);
137 Result
.setLength(TokLength
);
// Placeholder text pointer, overwritten by setText() below.
// NOTE(review): any preprocessor guard around this line is not visible.
139 Result
.TextPtr
= "<UNSET>";
142 Result
.setText(Text
);
// Stores the allocator and parser references and starts with
// NoMoreInterestingTokens cleared; the constructor body is not visible.
146 TextTokenRetokenizer(llvm::BumpPtrAllocator
&Allocator
, Parser
&P
):
147 Allocator(Allocator
), P(P
), NoMoreInterestingTokens(false) {
152 /// Extract a word -- sequence of non-whitespace characters.
153 bool lexWord(Token
&Tok
) {
// Snapshot of the position taken before lexing starts.
// NOTE(review): where SavedPos is restored is not visible in this excerpt.
157 Position SavedPos
= Pos
;
160 SmallString
<32> WordText
;
161 const char *WordBegin
= Pos
.BufferPtr
;
162 SourceLocation Loc
= getSourceLocation();
// Non-whitespace characters are appended to WordText.
164 const char C
= peek();
165 if (!isWhitespace(C
)) {
166 WordText
.push_back(C
);
171 const unsigned Length
= WordText
.size();
// Length + 1 bytes are allocated and copied from c_str(), so the terminating
// NUL is copied too; the StringRef itself covers only Length characters.
177 char *TextPtr
= Allocator
.Allocate
<char>(Length
+ 1);
179 memcpy(TextPtr
, WordText
.c_str(), Length
+ 1);
180 StringRef Text
= StringRef(TextPtr
, Length
);
182 formTokenWithChars(Tok
, Loc
, WordBegin
, Length
, Text
);
// Lexes a sequence that must start with OpenDelim and is checked against
// CloseDelim; mirrors lexWord() in how the text is accumulated and copied.
186 bool lexDelimitedSeq(Token
&Tok
, char OpenDelim
, char CloseDelim
) {
190 Position SavedPos
= Pos
;
193 SmallString
<32> WordText
;
194 const char *WordBegin
= Pos
.BufferPtr
;
195 SourceLocation Loc
= getSourceLocation();
// The first character is accepted only if it equals OpenDelim.
198 const char C
= peek();
199 if (C
== OpenDelim
) {
200 WordText
.push_back(C
);
// Accumulates characters while no error has been recorded and tokens remain.
// NOTE(review): the declaration of Error and the loop's exit on CloseDelim
// are on lines not visible here.
206 while (!Error
&& !isEnd()) {
208 WordText
.push_back(C
);
// Reached when the sequence did not end on CloseDelim; the guarded statement
// is not visible.
213 if (!Error
&& C
!= CloseDelim
)
221 const unsigned Length
= WordText
.size();
222 char *TextPtr
= Allocator
.Allocate
<char>(Length
+ 1);
224 memcpy(TextPtr
, WordText
.c_str(), Length
+ 1);
225 StringRef Text
= StringRef(TextPtr
, Length
);
// Unlike lexWord(), the token length passed here is the cursor distance from
// WordBegin rather than Length.
227 formTokenWithChars(Tok
, Loc
, WordBegin
,
228 Pos
.BufferPtr
- WordBegin
, Text
);
232 /// Put back tokens that we didn't consume.
233 void putBackLeftoverTokens() {
237 bool HavePartialTok
= false;
// If the cursor has moved past the start of the current token, the unread
// remainder [BufferPtr, BufferEnd) is turned into a new text token.
// NOTE(review): the declaration of PartialTok is not visible in this excerpt.
239 if (Pos
.BufferPtr
!= Pos
.BufferStart
) {
240 formTokenWithChars(PartialTok
, getSourceLocation(),
241 Pos
.BufferPtr
, Pos
.BufferEnd
- Pos
.BufferPtr
,
242 StringRef(Pos
.BufferPtr
,
243 Pos
.BufferEnd
- Pos
.BufferPtr
));
244 HavePartialTok
= true;
// Buffered tokens from index Pos.CurToken to the end are handed back to the
// parser, and the position is then moved to the end of the buffer.
248 P
.putBack(llvm::ArrayRef(Toks
.begin() + Pos
.CurToken
, Toks
.end()));
249 Pos
.CurToken
= Toks
.size();
// The partial token is handed back as well; the condition guarding this call
// is not visible (presumably HavePartialTok -- confirm).
252 P
.putBack(PartialTok
);
// Constructs a Parser from the lexer, semantic-action object, allocator,
// source manager, diagnostics engine and command traits; each visible
// initializer stores the same-named argument.
// NOTE(review): the end of the initializer list (including Traits) and the
// constructor body are not visible in this excerpt.
256 Parser::Parser(Lexer
&L
, Sema
&S
, llvm::BumpPtrAllocator
&Allocator
,
257 const SourceManager
&SourceMgr
, DiagnosticsEngine
&Diags
,
258 const CommandTraits
&Traits
):
259 L(L
), S(S
), Allocator(Allocator
), SourceMgr(SourceMgr
), Diags(Diags
),
// Parses the arguments of a parameter command through Retokenizer and reports
// them to Sema: first an optional bracketed direction, then the parameter
// name.
// NOTE(review): the declaration of Arg and the remaining call arguments are
// on lines not visible in this excerpt.
264 void Parser::parseParamCommandArgs(ParamCommandComment
*PC
,
265 TextTokenRetokenizer
&Retokenizer
) {
267 // Check if argument looks like direction specification: [dir]
268 // e.g., [in], [out], [in,out]
269 if (Retokenizer
.lexDelimitedSeq(Arg
, '[', ']'))
270 S
.actOnParamCommandDirectionArg(PC
,
272 Arg
.getEndLocation(),
// The parameter name is lexed as a single word.
275 if (Retokenizer
.lexWord(Arg
))
276 S
.actOnParamCommandParamNameArg(PC
,
278 Arg
.getEndLocation(),
// Parses the argument of a template-parameter command: a single word lexed
// through Retokenizer and reported to Sema as the parameter name.
// NOTE(review): the declaration of Arg and the remaining call arguments are
// on lines not visible in this excerpt.
282 void Parser::parseTParamCommandArgs(TParamCommandComment
*TPC
,
283 TextTokenRetokenizer
&Retokenizer
) {
285 if (Retokenizer
.lexWord(Arg
))
286 S
.actOnTParamCommandParamNameArg(TPC
,
288 Arg
.getEndLocation(),
// Lexes up to NumArgs word arguments through Retokenizer and returns them as
// an ArrayRef of Comment::Argument.
292 ArrayRef
<Comment::Argument
>
293 Parser::parseCommandArgs(TextTokenRetokenizer
&Retokenizer
, unsigned NumArgs
) {
// Storage for NumArgs arguments is obtained from Allocator and the array is
// constructed in it with placement new.
294 auto *Args
= new (Allocator
.Allocate
<Comment::Argument
>(NumArgs
))
295 Comment::Argument
[NumArgs
];
296 unsigned ParsedArgs
= 0;
// Stops at NumArgs arguments or at the first lexWord() failure.
// NOTE(review): the declaration of Arg and the increment of ParsedArgs are on
// lines not visible in this excerpt.
298 while (ParsedArgs
< NumArgs
&& Retokenizer
.lexWord(Arg
)) {
// Each argument records the word's source range and its text.
299 Args
[ParsedArgs
] = Comment::Argument
{
300 SourceRange(Arg
.getLocation(), Arg
.getEndLocation()), Arg
.getText()};
// Only the first ParsedArgs entries are exposed to the caller.
304 return llvm::ArrayRef(Args
, ParsedArgs
);
// Parses a block command starting at the current token, which must be a
// backslash or at command. Three Sema start/finish families appear below:
// param, tparam and generic block commands.
// NOTE(review): several branch conditions, token-consuming calls and the
// return statements are on lines not visible in this excerpt.
307 BlockCommandComment
*Parser::parseBlockCommand() {
308 assert(Tok
.is(tok::backslash_command
) || Tok
.is(tok::at_command
));
// PC, TPC and BC all start null; each is assigned only in its own branch of
// the if/else chain below.
310 ParamCommandComment
*PC
= nullptr;
311 TParamCommandComment
*TPC
= nullptr;
312 BlockCommandComment
*BC
= nullptr;
313 const CommandInfo
*Info
= Traits
.getCommandInfo(Tok
.getCommandID());
// Records whether the command was written with a backslash or an at sign.
314 CommandMarkerKind CommandMarker
=
315 Tok
.is(tok::backslash_command
) ? CMK_Backslash
: CMK_At
;
316 if (Info
->IsParamCommand
) {
317 PC
= S
.actOnParamCommandStart(Tok
.getLocation(),
318 Tok
.getEndLocation(),
321 } else if (Info
->IsTParamCommand
) {
322 TPC
= S
.actOnTParamCommandStart(Tok
.getLocation(),
323 Tok
.getEndLocation(),
327 BC
= S
.actOnBlockCommandStart(Tok
.getLocation(),
328 Tok
.getEndLocation(),
334 if (isTokBlockCommand()) {
335 // Block command ahead. We can't nest block commands, so pretend that this
336 // command has an empty argument.
337 ParagraphComment
*Paragraph
= S
.actOnParagraphComment(std::nullopt
);
// One finish call per command kind; the conditions selecting among them are
// not visible.
339 S
.actOnParamCommandFinish(PC
, Paragraph
);
342 S
.actOnTParamCommandFinish(TPC
, Paragraph
);
345 S
.actOnBlockCommandFinish(BC
, Paragraph
);
// Arguments are parsed for param/tparam commands and for any command whose
// CommandInfo declares at least one argument.
350 if (PC
|| TPC
|| Info
->NumArgs
> 0) {
351 // In order to parse command arguments we need to retokenize a few
352 // following text tokens.
353 TextTokenRetokenizer
Retokenizer(Allocator
, *this);
356 parseParamCommandArgs(PC
, Retokenizer
);
358 parseTParamCommandArgs(TPC
, Retokenizer
);
360 S
.actOnBlockCommandArgs(BC
, parseCommandArgs(Retokenizer
, Info
->NumArgs
));
// Whatever the retokenizer buffered but did not consume goes back to the
// parser.
362 Retokenizer
.putBackLeftoverTokens();
365 // If there's a block command ahead, we will attach an empty paragraph to
367 bool EmptyParagraph
= false;
368 if (isTokBlockCommand())
369 EmptyParagraph
= true;
370 else if (Tok
.is(tok::newline
)) {
// Re-checked after the newline case; the statements between the newline test
// and this assignment are not visible.
373 EmptyParagraph
= isTokBlockCommand();
377 ParagraphComment
*Paragraph
;
// Paragraph is either an empty paragraph or the result of
// parseParagraphOrBlockCommand(); the condition choosing between the two is
// not visible (presumably EmptyParagraph -- confirm).
379 Paragraph
= S
.actOnParagraphComment(std::nullopt
);
381 BlockContentComment
*Block
= parseParagraphOrBlockCommand();
382 // Since we have checked for a block command, we should have parsed a
384 Paragraph
= cast
<ParagraphComment
>(Block
);
// One finish call per command kind, as in the early-exit path above.
388 S
.actOnParamCommandFinish(PC
, Paragraph
);
391 S
.actOnTParamCommandFinish(TPC
, Paragraph
);
394 S
.actOnBlockCommandFinish(BC
, Paragraph
);
// Parses an inline command starting at the current token, which must be a
// backslash or at command, together with up to Info->NumArgs word arguments.
// NOTE(review): the token-consuming call and the return statement are on
// lines not visible in this excerpt.
399 InlineCommandComment
*Parser::parseInlineCommand() {
400 assert(Tok
.is(tok::backslash_command
) || Tok
.is(tok::at_command
));
401 const CommandInfo
*Info
= Traits
.getCommandInfo(Tok
.getCommandID());
// The command token is copied because it is still needed after argument
// parsing.
403 const Token CommandTok
= Tok
;
406 TextTokenRetokenizer
Retokenizer(Allocator
, *this);
407 ArrayRef
<Comment::Argument
> Args
=
408 parseCommandArgs(Retokenizer
, Info
->NumArgs
);
410 InlineCommandComment
*IC
= S
.actOnInlineCommand(
411 CommandTok
.getLocation(), CommandTok
.getEndLocation(),
412 CommandTok
.getCommandID(), Args
);
// Fewer arguments than the command declares: a warning is emitted one
// position past the end of the command token, with the command's range
// attached.
414 if (Args
.size() < Info
->NumArgs
) {
415 Diag(CommandTok
.getEndLocation().getLocWithOffset(1),
416 diag::warn_doc_inline_command_not_enough_arguments
)
417 << CommandTok
.is(tok::at_command
) << Info
->Name
<< Args
.size()
419 << SourceRange(CommandTok
.getLocation(), CommandTok
.getEndLocation());
// Unconsumed buffered tokens are handed back to the parser.
422 Retokenizer
.putBackLeftoverTokens();
// Parses an HTML start tag and its attributes; the current token must be
// tok::html_start_tag. Attributes are collected in Attrs and handed to Sema
// when the tag is finished.
// NOTE(review): the enclosing loop, the token-consuming calls, the
// declarations of Ident and Equals, and the return statements are on lines
// not visible in this excerpt.
427 HTMLStartTagComment
*Parser::parseHTMLStartTag() {
428 assert(Tok
.is(tok::html_start_tag
));
429 HTMLStartTagComment
*HST
=
430 S
.actOnHTMLStartTagStart(Tok
.getLocation(),
431 Tok
.getHTMLTagStartName());
434 SmallVector
<HTMLStartTagComment::Attribute
, 2> Attrs
;
436 switch (Tok
.getKind()) {
437 case tok::html_ident
: {
// Identifier not followed by '=': recorded as an attribute with only a name
// location and name.
440 if (Tok
.isNot(tok::html_equals
)) {
441 Attrs
.push_back(HTMLStartTagComment::Attribute(Ident
.getLocation(),
442 Ident
.getHTMLIdent()));
// '=' not followed by a quoted string: warn, keep the attribute name only,
// and examine any run of '=' / quoted-string tokens (the loop body is not
// visible).
447 if (Tok
.isNot(tok::html_quoted_string
)) {
448 Diag(Tok
.getLocation(),
449 diag::warn_doc_html_start_tag_expected_quoted_string
)
450 << SourceRange(Equals
.getLocation());
451 Attrs
.push_back(HTMLStartTagComment::Attribute(Ident
.getLocation(),
452 Ident
.getHTMLIdent()));
453 while (Tok
.is(tok::html_equals
) ||
454 Tok
.is(tok::html_quoted_string
))
// Full attribute: name, '=' location, value range and value text.
458 Attrs
.push_back(HTMLStartTagComment::Attribute(
460 Ident
.getHTMLIdent(),
461 Equals
.getLocation(),
462 SourceRange(Tok
.getLocation(),
463 Tok
.getEndLocation()),
464 Tok
.getHTMLQuotedString()));
// '>' finishes the tag as not self-closing.
469 case tok::html_greater
:
470 S
.actOnHTMLStartTagFinish(HST
, S
.copyArray(llvm::ArrayRef(Attrs
)),
472 /* IsSelfClosing = */ false);
// '/>' finishes the tag as self-closing.
476 case tok::html_slash_greater
:
477 S
.actOnHTMLStartTagFinish(HST
, S
.copyArray(llvm::ArrayRef(Attrs
)),
479 /* IsSelfClosing = */ true);
// Stray '=' or quoted string where an identifier or '>' was expected: warn
// and examine the run of such tokens (the loop body is not visible).
483 case tok::html_equals
:
484 case tok::html_quoted_string
:
485 Diag(Tok
.getLocation(),
486 diag::warn_doc_html_start_tag_expected_ident_or_greater
);
487 while (Tok
.is(tok::html_equals
) ||
488 Tok
.is(tok::html_quoted_string
))
// If a token that can continue the tag follows, the guarded statement (not
// visible) applies; otherwise the tag is finished as not self-closing.
490 if (Tok
.is(tok::html_ident
) ||
491 Tok
.is(tok::html_greater
) ||
492 Tok
.is(tok::html_slash_greater
))
495 S
.actOnHTMLStartTagFinish(HST
, S
.copyArray(llvm::ArrayRef(Attrs
)),
497 /* IsSelfClosing = */ false);
501 // Not a token from an HTML start tag. Thus HTML tag prematurely ended.
502 S
.actOnHTMLStartTagFinish(HST
, S
.copyArray(llvm::ArrayRef(Attrs
)),
504 /* IsSelfClosing = */ false);
// Presumed line numbers of two locations are compared to choose the
// diagnostic form.
// NOTE(review): the arguments of both getPresumedLineNumber() calls and the
// declaration of EndLineInvalid are not visible.
505 bool StartLineInvalid
;
506 const unsigned StartLine
= SourceMgr
.getPresumedLineNumber(
510 const unsigned EndLine
= SourceMgr
.getPresumedLineNumber(
// Same line (or an invalid line number): a single warning with the tag's
// source range attached.
513 if (StartLineInvalid
|| EndLineInvalid
|| StartLine
== EndLine
)
514 Diag(Tok
.getLocation(),
515 diag::warn_doc_html_start_tag_expected_ident_or_greater
)
516 << HST
->getSourceRange();
// Warning plus a note pointing at where the tag started; presumably the else
// branch of the check above (the line between them is not visible).
518 Diag(Tok
.getLocation(),
519 diag::warn_doc_html_start_tag_expected_ident_or_greater
);
520 Diag(HST
->getLocation(), diag::note_doc_html_tag_started_here
)
521 << HST
->getSourceRange();
// Parses an HTML end tag; the current token must be tok::html_end_tag. The
// end-tag token is copied so that its location and tag name remain available
// for the Sema call.
// NOTE(review): the declaration of Loc, the token-consuming calls and the
// middle argument of actOnHTMLEndTag() are on lines not visible here.
528 HTMLEndTagComment
*Parser::parseHTMLEndTag() {
529 assert(Tok
.is(tok::html_end_tag
));
530 Token TokEndTag
= Tok
;
// Loc is set to the location of the '>' token when one is present.
533 if (Tok
.is(tok::html_greater
)) {
534 Loc
= Tok
.getLocation();
538 return S
.actOnHTMLEndTag(TokEndTag
.getLocation(),
540 TokEndTag
.getHTMLTagEndName());
// Collects inline content (text, inline and unknown commands, HTML tags) into
// Content and returns it as a paragraph. If a block command is the very first
// thing seen, it is parsed and returned instead.
// NOTE(review): the enclosing loop, several case labels and the
// token-consuming calls are on lines not visible in this excerpt.
543 BlockContentComment
*Parser::parseParagraphOrBlockCommand() {
544 SmallVector
<InlineContentComment
*, 8> Content
;
547 switch (Tok
.getKind()) {
548 case tok::verbatim_block_begin
:
549 case tok::verbatim_line_name
:
551 break; // Block content or EOF ahead, finish this paragraph.
553 case tok::unknown_command
:
554 Content
.push_back(S
.actOnUnknownCommand(Tok
.getLocation(),
555 Tok
.getEndLocation(),
556 Tok
.getUnknownCommandName()));
560 case tok::backslash_command
:
561 case tok::at_command
: {
562 const CommandInfo
*Info
= Traits
.getCommandInfo(Tok
.getCommandID());
// A block command starts a new block: with no content collected yet it is
// parsed directly, otherwise the current paragraph ends here.
563 if (Info
->IsBlockCommand
) {
564 if (Content
.size() == 0)
565 return parseBlockCommand();
566 break; // Block command ahead, finish this paragraph.
// A verbatim-block end command with no matching start is diagnosed.
568 if (Info
->IsVerbatimBlockEndCommand
) {
569 Diag(Tok
.getLocation(),
570 diag::warn_verbatim_block_end_without_start
)
571 << Tok
.is(tok::at_command
)
573 << SourceRange(Tok
.getLocation(), Tok
.getEndLocation());
577 if (Info
->IsUnknownCommand
) {
578 Content
.push_back(S
.actOnUnknownCommand(Tok
.getLocation(),
579 Tok
.getEndLocation(),
// Anything left at this point must be an inline command.
584 assert(Info
->IsInlineCommand
);
585 Content
.push_back(parseInlineCommand());
// Newline handling; the case label for this section is not visible.
591 if (Tok
.is(tok::newline
) || Tok
.is(tok::eof
)) {
593 break; // Two newlines -- end of paragraph.
595 // Also allow [tok::newline, tok::text, tok::newline] if the middle
596 // tok::text is just whitespace.
597 if (Tok
.is(tok::text
) && isWhitespace(Tok
.getText())) {
// The whitespace token is copied so that it can be put back if no newline or
// EOF follows it.
598 Token WhitespaceTok
= Tok
;
600 if (Tok
.is(tok::newline
) || Tok
.is(tok::eof
)) {
604 // We have [tok::newline, tok::text, non-newline]. Put back tok::text.
605 putBack(WhitespaceTok
);
// A single newline is recorded on the most recent content item, if any.
607 if (Content
.size() > 0)
608 Content
.back()->addTrailingNewline();
612 // Don't deal with HTML tag soup now.
613 case tok::html_start_tag
:
614 Content
.push_back(parseHTMLStartTag());
617 case tok::html_end_tag
:
618 Content
.push_back(parseHTMLEndTag());
// Plain text becomes a text node; the case label and the final call argument
// are not visible.
622 Content
.push_back(S
.actOnText(Tok
.getLocation(),
623 Tok
.getEndLocation(),
// These token kinds are handled by other parse functions and are not
// expected here.
628 case tok::verbatim_block_line
:
629 case tok::verbatim_block_end
:
630 case tok::verbatim_line_text
:
631 case tok::html_ident
:
632 case tok::html_equals
:
633 case tok::html_quoted_string
:
634 case tok::html_greater
:
635 case tok::html_slash_greater
:
636 llvm_unreachable("should not see this token");
// The collected content is copied through Sema and wrapped into a paragraph.
641 return S
.actOnParagraphComment(S
.copyArray(llvm::ArrayRef(Content
)));
// Parses a verbatim block; the current token must be
// tok::verbatim_block_begin. Lines are gathered in Lines and handed to Sema
// together with the closing command's location and name.
// NOTE(review): the token-consuming calls, the else between the two finish
// calls and the return statement are on lines not visible in this excerpt.
644 VerbatimBlockComment
*Parser::parseVerbatimBlock() {
645 assert(Tok
.is(tok::verbatim_block_begin
));
647 VerbatimBlockComment
*VB
=
648 S
.actOnVerbatimBlockStart(Tok
.getLocation(),
649 Tok
.getVerbatimBlockID());
652 // Don't create an empty line if verbatim opening command is followed
654 if (Tok
.is(tok::newline
))
657 SmallVector
<VerbatimBlockLineComment
*, 8> Lines
;
// Each iteration produces one line object from either a verbatim line token
// or a bare newline.
658 while (Tok
.is(tok::verbatim_block_line
) ||
659 Tok
.is(tok::newline
)) {
660 VerbatimBlockLineComment
*Line
;
661 if (Tok
.is(tok::verbatim_block_line
)) {
662 Line
= S
.actOnVerbatimBlockLine(Tok
.getLocation(),
663 Tok
.getVerbatimBlockText());
// Checks for the newline after the line text; the guarded statement is not
// visible.
665 if (Tok
.is(tok::newline
)) {
669 // Empty line, just a tok::newline.
670 Line
= S
.actOnVerbatimBlockLine(Tok
.getLocation(), "");
673 Lines
.push_back(Line
);
// Properly terminated block: the end command's name is looked up through its
// command ID.
676 if (Tok
.is(tok::verbatim_block_end
)) {
677 const CommandInfo
*Info
= Traits
.getCommandInfo(Tok
.getVerbatimBlockID());
678 S
.actOnVerbatimBlockFinish(VB
, Tok
.getLocation(), Info
->Name
,
679 S
.copyArray(llvm::ArrayRef(Lines
)));
682 // Unterminated \\verbatim block
// Finished with a default-constructed location and an empty close name.
683 S
.actOnVerbatimBlockFinish(VB
, SourceLocation(), "",
684 S
.copyArray(llvm::ArrayRef(Lines
)));
// Parses a verbatim line; the current token must be tok::verbatim_line_name.
// NOTE(review): the declarations of NameTok and Text, the token-consuming
// calls, the else keyword between the two TextBegin assignments, the trailing
// call arguments and the return statement are on lines not visible here.
690 VerbatimLineComment
*Parser::parseVerbatimLine() {
691 assert(Tok
.is(tok::verbatim_line_name
));
696 SourceLocation TextBegin
;
698 // Next token might not be a tok::verbatim_line_text if verbatim line
699 // starting command comes just before a newline or comment end.
700 if (Tok
.is(tok::verbatim_line_text
)) {
701 TextBegin
= Tok
.getLocation();
702 Text
= Tok
.getVerbatimLineText();
// Alternative assignment: the text position is the end of the name token.
704 TextBegin
= NameTok
.getEndLocation();
708 VerbatimLineComment
*VL
= S
.actOnVerbatimLine(NameTok
.getLocation(),
709 NameTok
.getVerbatimLineID(),
// Dispatches on the current token kind to the parse function for the block
// that starts there.
// NOTE(review): some case labels are on lines not visible in this excerpt.
716 BlockContentComment
*Parser::parseBlockContent() {
717 switch (Tok
.getKind()) {
// Commands and HTML tags are handled by the paragraph/block-command parser.
719 case tok::unknown_command
:
720 case tok::backslash_command
:
721 case tok::at_command
:
722 case tok::html_start_tag
:
723 case tok::html_end_tag
:
724 return parseParagraphOrBlockCommand();
726 case tok::verbatim_block_begin
:
727 return parseVerbatimBlock();
729 case tok::verbatim_line_name
:
730 return parseVerbatimLine();
// Token kinds that only occur inside a verbatim block/line or an HTML tag are
// not expected at the start of a block.
734 case tok::verbatim_block_line
:
735 case tok::verbatim_block_end
:
736 case tok::verbatim_line_text
:
737 case tok::html_ident
:
738 case tok::html_equals
:
739 case tok::html_quoted_string
:
740 case tok::html_greater
:
741 case tok::html_slash_greater
:
742 llvm_unreachable("should not see this token");
744 llvm_unreachable("bogus token kind");
// Parses a whole comment: block content is parsed repeatedly until tok::eof,
// and the collected blocks are handed to Sema as a full comment.
// NOTE(review): the bodies of the two newline-skipping loops are on lines not
// visible in this excerpt.
747 FullComment
*Parser::parseFullComment() {
748 // Skip newlines at the beginning of the comment.
749 while (Tok
.is(tok::newline
))
752 SmallVector
<BlockContentComment
*, 8> Blocks
;
753 while (Tok
.isNot(tok::eof
)) {
754 Blocks
.push_back(parseBlockContent());
756 // Skip extra newlines after paragraph end.
757 while (Tok
.is(tok::newline
))
// The block list is copied through Sema before being wrapped.
760 return S
.actOnFullComment(S
.copyArray(llvm::ArrayRef(Blocks
)));
763 } // end namespace comments
764 } // end namespace clang