1 //===--- CommentParser.cpp - Doxygen comment parser -----------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 #include "clang/AST/CommentParser.h"
11 #include "clang/AST/CommentCommandTraits.h"
12 #include "clang/AST/CommentDiagnostic.h"
13 #include "clang/AST/CommentSema.h"
14 #include "clang/Basic/CharInfo.h"
15 #include "clang/Basic/SourceManager.h"
16 #include "llvm/Support/ErrorHandling.h"
/// Walks every character of \p S and tests it with the per-character
/// isWhitespace() overload.
/// NOTE(review): the return statements are not visible in this excerpt;
/// presumably the result is true only when every character is whitespace --
/// confirm against the full source.
20 static inline bool isWhitespace(llvm::StringRef S
) {
21 for (StringRef::const_iterator I
= S
.begin(), E
= S
.end(); I
!= E
; ++I
) {
22 if (!isWhitespace(*I
))
30 /// Re-lexes a sequence of tok::text tokens.
///
/// Tokens are taken from the parser (\c P) and buffered in \c Toks; words and
/// delimited sequences are then carved out of the buffered text one character
/// at a time. The text of every token produced by lexWord() and
/// lexDelimitedSeq() is copied into memory obtained from \c Allocator.
31 class TextTokenRetokenizer
{
32 llvm::BumpPtrAllocator
&Allocator
;
35 /// This flag is set when there are no more tokens we can fetch from lexer.
36 bool NoMoreInterestingTokens
;
38 /// Token buffer: tokens we have processed and lookahead.
39 SmallVector
<Token
, 16> Toks
;
41 /// A position in \c Toks.
44 const char *BufferStart
;
45 const char *BufferEnd
;
46 const char *BufferPtr
;
47 SourceLocation BufferStartLoc
;
50 /// Current position in Toks.
54 return Pos
.CurToken
>= Toks
.size();
57 /// Sets up the buffer pointers to point to current token.
60 const Token
&Tok
= Toks
[Pos
.CurToken
];
62 Pos
.BufferStart
= Tok
.getText().begin();
63 Pos
.BufferEnd
= Tok
.getText().end();
64 Pos
.BufferPtr
= Pos
.BufferStart
;
65 Pos
.BufferStartLoc
= Tok
.getLocation();
/// Source location of the character at the current buffer position: the
/// start location of the current token offset by the number of characters
/// already consumed from it.
68 SourceLocation
getSourceLocation() const {
69 const unsigned CharNo
= Pos
.BufferPtr
- Pos
.BufferStart
;
70 return Pos
.BufferStartLoc
.getLocWithOffset(CharNo
);
75 assert(Pos
.BufferPtr
!= Pos
.BufferEnd
);
76 return *Pos
.BufferPtr
;
81 assert(Pos
.BufferPtr
!= Pos
.BufferEnd
);
83 if (Pos
.BufferPtr
== Pos
.BufferEnd
) {
85 if (isEnd() && !addToken())
94 /// Returns true on success, false if there are no interesting tokens to
/// fetch from lexer.
97 if (NoMoreInterestingTokens
)
100 if (P
.Tok
.is(tok::newline
)) {
101 // If we see a single newline token between text tokens, skip it.
102 Token Newline
= P
.Tok
;
104 if (P
.Tok
.isNot(tok::text
)) {
106 NoMoreInterestingTokens
= true;
110 if (P
.Tok
.isNot(tok::text
)) {
111 NoMoreInterestingTokens
= true;
115 Toks
.push_back(P
.Tok
);
117 if (Toks
.size() == 1)
122 void consumeWhitespace() {
124 if (isWhitespace(peek()))
/// Initializes \p Result as a tok::text token with the given location,
/// length and text.
131 void formTokenWithChars(Token
&Result
,
133 const char *TokBegin
,
136 Result
.setLocation(Loc
);
137 Result
.setKind(tok::text
);
138 Result
.setLength(TokLength
);
140 Result
.TextPtr
= "<UNSET>";
143 Result
.setText(Text
);
/// Binds the retokenizer to \p Allocator and \p P; the
/// NoMoreInterestingTokens flag starts out false.
147 TextTokenRetokenizer(llvm::BumpPtrAllocator
&Allocator
, Parser
&P
):
148 Allocator(Allocator
), P(P
), NoMoreInterestingTokens(false) {
153 /// Extract a word -- sequence of non-whitespace characters.
///
/// The collected characters are copied, together with a terminating NUL,
/// into memory from \c Allocator, and \p Tok is formed from that copy.
/// The position is saved in SavedPos on entry.
/// NOTE(review): the code that restores SavedPos and the return statements
/// are not visible in this excerpt.
154 bool lexWord(Token
&Tok
) {
158 Position SavedPos
= Pos
;
161 SmallString
<32> WordText
;
162 const char *WordBegin
= Pos
.BufferPtr
;
163 SourceLocation Loc
= getSourceLocation();
165 const char C
= peek();
166 if (!isWhitespace(C
)) {
167 WordText
.push_back(C
);
172 const unsigned Length
= WordText
.size();
178 char *TextPtr
= Allocator
.Allocate
<char>(Length
+ 1);
180 memcpy(TextPtr
, WordText
.c_str(), Length
+ 1);
181 StringRef Text
= StringRef(TextPtr
, Length
);
183 formTokenWithChars(Tok
, Loc
, WordBegin
, Length
, Text
);
/// Extract a sequence that opens with \p OpenDelim; characters are
/// collected and the last one is checked against \p CloseDelim.
///
/// Unlike lexWord(), the length given to the formed token is measured on
/// the buffer (Pos.BufferPtr - WordBegin) rather than taken from the
/// collected text.
/// NOTE(review): how Error is set and the return statements are not visible
/// in this excerpt.
187 bool lexDelimitedSeq(Token
&Tok
, char OpenDelim
, char CloseDelim
) {
191 Position SavedPos
= Pos
;
194 SmallString
<32> WordText
;
195 const char *WordBegin
= Pos
.BufferPtr
;
196 SourceLocation Loc
= getSourceLocation();
199 const char C
= peek();
200 if (C
== OpenDelim
) {
201 WordText
.push_back(C
);
207 while (!Error
&& !isEnd()) {
209 WordText
.push_back(C
);
214 if (!Error
&& C
!= CloseDelim
)
222 const unsigned Length
= WordText
.size();
223 char *TextPtr
= Allocator
.Allocate
<char>(Length
+ 1);
225 memcpy(TextPtr
, WordText
.c_str(), Length
+ 1);
226 StringRef Text
= StringRef(TextPtr
, Length
);
228 formTokenWithChars(Tok
, Loc
, WordBegin
,
229 Pos
.BufferPtr
- WordBegin
, Text
);
233 /// Put back tokens that we didn't consume.
///
/// If the current token has been partially consumed, its unconsumed tail is
/// re-formed as a separate text token (PartialTok). The buffered tokens from
/// Pos.CurToken onward are handed back to the parser.
234 void putBackLeftoverTokens() {
238 bool HavePartialTok
= false;
240 if (Pos
.BufferPtr
!= Pos
.BufferStart
) {
241 formTokenWithChars(PartialTok
, getSourceLocation(),
242 Pos
.BufferPtr
, Pos
.BufferEnd
- Pos
.BufferPtr
,
243 StringRef(Pos
.BufferPtr
,
244 Pos
.BufferEnd
- Pos
.BufferPtr
));
245 HavePartialTok
= true;
249 P
.putBack(llvm::makeArrayRef(Toks
.begin() + Pos
.CurToken
, Toks
.end()));
250 Pos
.CurToken
= Toks
.size();
253 P
.putBack(PartialTok
);
/// Constructs a comment parser, initializing the L, S, Allocator, SourceMgr
/// and Diags members from the corresponding arguments.
/// NOTE(review): the rest of the initializer list (e.g. for \p Traits) and
/// the constructor body are not visible in this excerpt.
257 Parser::Parser(Lexer
&L
, Sema
&S
, llvm::BumpPtrAllocator
&Allocator
,
258 const SourceManager
&SourceMgr
, DiagnosticsEngine
&Diags
,
259 const CommandTraits
&Traits
):
260 L(L
), S(S
), Allocator(Allocator
), SourceMgr(SourceMgr
), Diags(Diags
),
/// Parses the arguments of a parameter command.
///
/// A bracketed direction specification is lexed first and, if found, reported
/// through actOnParamCommandDirectionArg; a following word, if found, is
/// reported through actOnParamCommandParamNameArg.
265 void Parser::parseParamCommandArgs(ParamCommandComment
*PC
,
266 TextTokenRetokenizer
&Retokenizer
) {
268 // Check if argument looks like direction specification: [dir]
269 // e.g., [in], [out], [in,out]
270 if (Retokenizer
.lexDelimitedSeq(Arg
, '[', ']'))
271 S
.actOnParamCommandDirectionArg(PC
,
273 Arg
.getEndLocation(),
276 if (Retokenizer
.lexWord(Arg
))
277 S
.actOnParamCommandParamNameArg(PC
,
279 Arg
.getEndLocation(),
/// Parses the argument of a template-parameter command: one word is lexed
/// and, if found, reported through actOnTParamCommandParamNameArg.
283 void Parser::parseTParamCommandArgs(TParamCommandComment
*TPC
,
284 TextTokenRetokenizer
&Retokenizer
) {
286 if (Retokenizer
.lexWord(Arg
))
287 S
.actOnTParamCommandParamNameArg(TPC
,
289 Arg
.getEndLocation(),
/// Parses up to NumArgs word arguments of a block command.
///
/// Storage for NumArgs Argument objects is taken from Allocator; each lexed
/// word fills the next slot, and only the ParsedArgs slots that were actually
/// filled are passed to actOnBlockCommandArgs.
293 void Parser::parseBlockCommandArgs(BlockCommandComment
*BC
,
294 TextTokenRetokenizer
&Retokenizer
,
296 typedef BlockCommandComment::Argument Argument
;
298 new (Allocator
.Allocate
<Argument
>(NumArgs
)) Argument
[NumArgs
];
299 unsigned ParsedArgs
= 0;
301 while (ParsedArgs
< NumArgs
&& Retokenizer
.lexWord(Arg
)) {
302 Args
[ParsedArgs
] = Argument(SourceRange(Arg
.getLocation(),
303 Arg
.getEndLocation()),
308 S
.actOnBlockCommandArgs(BC
, llvm::makeArrayRef(Args
, ParsedArgs
));
/// Parses a block command introduced by tok::backslash_command or
/// tok::at_command.
///
/// Depending on the command's CommandInfo, a param, tparam or generic block
/// command is started. If the command takes arguments, they are parsed with
/// a TextTokenRetokenizer and the unconsumed tokens are put back. The command
/// is then finished with a paragraph, which is empty when another block
/// command follows.
/// NOTE(review): the return statements are not visible in this excerpt;
/// presumably whichever of PC / TPC / BC was created is returned.
311 BlockCommandComment
*Parser::parseBlockCommand() {
312 assert(Tok
.is(tok::backslash_command
) || Tok
.is(tok::at_command
));
314 ParamCommandComment
*PC
= nullptr;
315 TParamCommandComment
*TPC
= nullptr;
316 BlockCommandComment
*BC
= nullptr;
317 const CommandInfo
*Info
= Traits
.getCommandInfo(Tok
.getCommandID());
318 CommandMarkerKind CommandMarker
=
319 Tok
.is(tok::backslash_command
) ? CMK_Backslash
: CMK_At
;
320 if (Info
->IsParamCommand
) {
321 PC
= S
.actOnParamCommandStart(Tok
.getLocation(),
322 Tok
.getEndLocation(),
325 } else if (Info
->IsTParamCommand
) {
326 TPC
= S
.actOnTParamCommandStart(Tok
.getLocation(),
327 Tok
.getEndLocation(),
331 BC
= S
.actOnBlockCommandStart(Tok
.getLocation(),
332 Tok
.getEndLocation(),
338 if (isTokBlockCommand()) {
339 // Block command ahead. We can't nest block commands, so pretend that this
340 // command has an empty argument.
341 ParagraphComment
*Paragraph
= S
.actOnParagraphComment(None
);
343 S
.actOnParamCommandFinish(PC
, Paragraph
);
346 S
.actOnTParamCommandFinish(TPC
, Paragraph
);
349 S
.actOnBlockCommandFinish(BC
, Paragraph
);
354 if (PC
|| TPC
|| Info
->NumArgs
> 0) {
355 // In order to parse command arguments we need to retokenize a few
356 // following text tokens.
357 TextTokenRetokenizer
Retokenizer(Allocator
, *this);
360 parseParamCommandArgs(PC
, Retokenizer
);
362 parseTParamCommandArgs(TPC
, Retokenizer
);
364 parseBlockCommandArgs(BC
, Retokenizer
, Info
->NumArgs
);
366 Retokenizer
.putBackLeftoverTokens();
369 // If there's a block command ahead, we will attach an empty paragraph to
// this command.
371 bool EmptyParagraph
= false;
372 if (isTokBlockCommand())
373 EmptyParagraph
= true;
374 else if (Tok
.is(tok::newline
)) {
377 EmptyParagraph
= isTokBlockCommand();
381 ParagraphComment
*Paragraph
;
383 Paragraph
= S
.actOnParagraphComment(None
);
385 BlockContentComment
*Block
= parseParagraphOrBlockCommand();
386 // Since we have checked for a block command, we should have parsed a
// paragraph.
388 Paragraph
= cast
<ParagraphComment
>(Block
);
392 S
.actOnParamCommandFinish(PC
, Paragraph
);
395 S
.actOnTParamCommandFinish(TPC
, Paragraph
);
398 S
.actOnBlockCommandFinish(BC
, Paragraph
);
/// Parses an inline command introduced by tok::backslash_command or
/// tok::at_command.
///
/// The following text is retokenized to lex a single word argument. The
/// node is built via actOnInlineCommand either with the argument's location
/// range or without any argument, and unconsumed tokens are put back.
/// NOTE(review): the branch choosing between the two calls is not visible in
/// this excerpt; presumably it tests ArgTokValid.
403 InlineCommandComment
*Parser::parseInlineCommand() {
404 assert(Tok
.is(tok::backslash_command
) || Tok
.is(tok::at_command
));
406 const Token CommandTok
= Tok
;
409 TextTokenRetokenizer
Retokenizer(Allocator
, *this);
412 bool ArgTokValid
= Retokenizer
.lexWord(ArgTok
);
414 InlineCommandComment
*IC
;
416 IC
= S
.actOnInlineCommand(CommandTok
.getLocation(),
417 CommandTok
.getEndLocation(),
418 CommandTok
.getCommandID(),
419 ArgTok
.getLocation(),
420 ArgTok
.getEndLocation(),
423 IC
= S
.actOnInlineCommand(CommandTok
.getLocation(),
424 CommandTok
.getEndLocation(),
425 CommandTok
.getCommandID());
428 Retokenizer
.putBackLeftoverTokens();
/// Parses an HTML start tag together with its attributes.
///
/// The tag is opened via actOnHTMLStartTagStart, attributes are accumulated
/// in Attrs, and the tag is closed with actOnHTMLStartTagFinish, which is
/// told the tag is self-closing only for tok::html_slash_greater.
/// Warnings are issued for a missing quoted string after '=', for stray '='
/// or quoted-string tokens, and for a token that cannot belong to a start
/// tag. In the last case a note pointing at the tag start is added when the
/// tag start and the offending token are on different presumed lines.
433 HTMLStartTagComment
*Parser::parseHTMLStartTag() {
434 assert(Tok
.is(tok::html_start_tag
));
435 HTMLStartTagComment
*HST
=
436 S
.actOnHTMLStartTagStart(Tok
.getLocation(),
437 Tok
.getHTMLTagStartName());
440 SmallVector
<HTMLStartTagComment::Attribute
, 2> Attrs
;
442 switch (Tok
.getKind()) {
443 case tok::html_ident
: {
446 if (Tok
.isNot(tok::html_equals
)) {
// Attribute without a value: only the identifier is recorded.
447 Attrs
.push_back(HTMLStartTagComment::Attribute(Ident
.getLocation(),
448 Ident
.getHTMLIdent()));
453 if (Tok
.isNot(tok::html_quoted_string
)) {
454 Diag(Tok
.getLocation(),
455 diag::warn_doc_html_start_tag_expected_quoted_string
)
456 << SourceRange(Equals
.getLocation());
457 Attrs
.push_back(HTMLStartTagComment::Attribute(Ident
.getLocation(),
458 Ident
.getHTMLIdent()));
459 while (Tok
.is(tok::html_equals
) ||
460 Tok
.is(tok::html_quoted_string
))
464 Attrs
.push_back(HTMLStartTagComment::Attribute(
466 Ident
.getHTMLIdent(),
467 Equals
.getLocation(),
468 SourceRange(Tok
.getLocation(),
469 Tok
.getEndLocation()),
470 Tok
.getHTMLQuotedString()));
475 case tok::html_greater
:
476 S
.actOnHTMLStartTagFinish(HST
,
477 S
.copyArray(llvm::makeArrayRef(Attrs
)),
479 /* IsSelfClosing = */ false);
483 case tok::html_slash_greater
:
484 S
.actOnHTMLStartTagFinish(HST
,
485 S
.copyArray(llvm::makeArrayRef(Attrs
)),
487 /* IsSelfClosing = */ true);
491 case tok::html_equals
:
492 case tok::html_quoted_string
:
493 Diag(Tok
.getLocation(),
494 diag::warn_doc_html_start_tag_expected_ident_or_greater
);
495 while (Tok
.is(tok::html_equals
) ||
496 Tok
.is(tok::html_quoted_string
))
498 if (Tok
.is(tok::html_ident
) ||
499 Tok
.is(tok::html_greater
) ||
500 Tok
.is(tok::html_slash_greater
))
503 S
.actOnHTMLStartTagFinish(HST
,
504 S
.copyArray(llvm::makeArrayRef(Attrs
)),
506 /* IsSelfClosing = */ false);
510 // Not a token from an HTML start tag. Thus HTML tag prematurely ended.
511 S
.actOnHTMLStartTagFinish(HST
,
512 S
.copyArray(llvm::makeArrayRef(Attrs
)),
514 /* IsSelfClosing = */ false);
515 bool StartLineInvalid
;
516 const unsigned StartLine
= SourceMgr
.getPresumedLineNumber(
520 const unsigned EndLine
= SourceMgr
.getPresumedLineNumber(
523 if (StartLineInvalid
|| EndLineInvalid
|| StartLine
== EndLine
)
524 Diag(Tok
.getLocation(),
525 diag::warn_doc_html_start_tag_expected_ident_or_greater
)
526 << HST
->getSourceRange();
528 Diag(Tok
.getLocation(),
529 diag::warn_doc_html_start_tag_expected_ident_or_greater
);
530 Diag(HST
->getLocation(), diag::note_doc_html_tag_started_here
)
531 << HST
->getSourceRange();
/// Parses an HTML end tag and builds its node via actOnHTMLEndTag, using the
/// location and tag name of the end-tag token.
/// If a tok::html_greater follows, its location is recorded in Loc.
/// NOTE(review): the declaration of Loc and how it is passed on are not
/// visible in this excerpt.
538 HTMLEndTagComment
*Parser::parseHTMLEndTag() {
539 assert(Tok
.is(tok::html_end_tag
));
540 Token TokEndTag
= Tok
;
543 if (Tok
.is(tok::html_greater
)) {
544 Loc
= Tok
.getLocation();
548 return S
.actOnHTMLEndTag(TokEndTag
.getLocation(),
550 TokEndTag
.getHTMLTagEndName());
/// Parses a paragraph of inline content, or a block command when one starts
/// the paragraph.
///
/// Inline content (text, inline and unknown commands, HTML tags) is collected
/// in Content and finally wrapped by actOnParagraphComment. A block command
/// seen before any content is delegated to parseBlockCommand(); one seen
/// later ends the paragraph, as do two consecutive newlines (optionally
/// separated by whitespace-only text).
553 BlockContentComment
*Parser::parseParagraphOrBlockCommand() {
554 SmallVector
<InlineContentComment
*, 8> Content
;
557 switch (Tok
.getKind()) {
558 case tok::verbatim_block_begin
:
559 case tok::verbatim_line_name
:
561 assert(Content
.size() != 0);
562 break; // Block content or EOF ahead, finish this paragraph.
564 case tok::unknown_command
:
565 Content
.push_back(S
.actOnUnknownCommand(Tok
.getLocation(),
566 Tok
.getEndLocation(),
567 Tok
.getUnknownCommandName()));
571 case tok::backslash_command
:
572 case tok::at_command
: {
573 const CommandInfo
*Info
= Traits
.getCommandInfo(Tok
.getCommandID());
574 if (Info
->IsBlockCommand
) {
575 if (Content
.size() == 0)
576 return parseBlockCommand();
577 break; // Block command ahead, finish this paragraph.
579 if (Info
->IsVerbatimBlockEndCommand
) {
580 Diag(Tok
.getLocation(),
581 diag::warn_verbatim_block_end_without_start
)
582 << Tok
.is(tok::at_command
)
584 << SourceRange(Tok
.getLocation(), Tok
.getEndLocation());
588 if (Info
->IsUnknownCommand
) {
589 Content
.push_back(S
.actOnUnknownCommand(Tok
.getLocation(),
590 Tok
.getEndLocation(),
595 assert(Info
->IsInlineCommand
);
596 Content
.push_back(parseInlineCommand());
602 if (Tok
.is(tok::newline
) || Tok
.is(tok::eof
)) {
604 break; // Two newlines -- end of paragraph.
606 // Also allow [tok::newline, tok::text, tok::newline] if the middle
607 // tok::text is just whitespace.
608 if (Tok
.is(tok::text
) && isWhitespace(Tok
.getText())) {
609 Token WhitespaceTok
= Tok
;
611 if (Tok
.is(tok::newline
) || Tok
.is(tok::eof
)) {
615 // We have [tok::newline, tok::text, non-newline]. Put back tok::text.
616 putBack(WhitespaceTok
);
618 if (Content
.size() > 0)
619 Content
.back()->addTrailingNewline();
623 // Don't deal with HTML tag soup now.
624 case tok::html_start_tag
:
625 Content
.push_back(parseHTMLStartTag());
628 case tok::html_end_tag
:
629 Content
.push_back(parseHTMLEndTag());
633 Content
.push_back(S
.actOnText(Tok
.getLocation(),
634 Tok
.getEndLocation(),
639 case tok::verbatim_block_line
:
640 case tok::verbatim_block_end
:
641 case tok::verbatim_line_text
:
642 case tok::html_ident
:
643 case tok::html_equals
:
644 case tok::html_quoted_string
:
645 case tok::html_greater
:
646 case tok::html_slash_greater
:
647 llvm_unreachable("should not see this token");
652 return S
.actOnParagraphComment(S
.copyArray(llvm::makeArrayRef(Content
)));
/// Parses a verbatim block: the opening command, its lines, and the closing
/// command if present.
///
/// Each tok::verbatim_block_line becomes a line via actOnVerbatimBlockLine; a
/// bare tok::newline yields an empty line. If no tok::verbatim_block_end
/// follows, the block is finished with a default-constructed SourceLocation
/// and an empty string.
655 VerbatimBlockComment
*Parser::parseVerbatimBlock() {
656 assert(Tok
.is(tok::verbatim_block_begin
));
658 VerbatimBlockComment
*VB
=
659 S
.actOnVerbatimBlockStart(Tok
.getLocation(),
660 Tok
.getVerbatimBlockID());
663 // Don't create an empty line if verbatim opening command is followed
// by a newline.
665 if (Tok
.is(tok::newline
))
668 SmallVector
<VerbatimBlockLineComment
*, 8> Lines
;
669 while (Tok
.is(tok::verbatim_block_line
) ||
670 Tok
.is(tok::newline
)) {
671 VerbatimBlockLineComment
*Line
;
672 if (Tok
.is(tok::verbatim_block_line
)) {
673 Line
= S
.actOnVerbatimBlockLine(Tok
.getLocation(),
674 Tok
.getVerbatimBlockText());
676 if (Tok
.is(tok::newline
)) {
680 // Empty line, just a tok::newline.
681 Line
= S
.actOnVerbatimBlockLine(Tok
.getLocation(), "");
684 Lines
.push_back(Line
);
687 if (Tok
.is(tok::verbatim_block_end
)) {
688 const CommandInfo
*Info
= Traits
.getCommandInfo(Tok
.getVerbatimBlockID());
689 S
.actOnVerbatimBlockFinish(VB
, Tok
.getLocation(),
691 S
.copyArray(llvm::makeArrayRef(Lines
)));
694 // Unterminated \\verbatim block
695 S
.actOnVerbatimBlockFinish(VB
, SourceLocation(), "",
696 S
.copyArray(llvm::makeArrayRef(Lines
)));
/// Parses a verbatim line command and its optional text.
///
/// When a tok::verbatim_line_text token follows, its location and text are
/// used; otherwise the text location falls back to the end of the name token.
/// The node is built via actOnVerbatimLine.
702 VerbatimLineComment
*Parser::parseVerbatimLine() {
703 assert(Tok
.is(tok::verbatim_line_name
));
708 SourceLocation TextBegin
;
710 // Next token might not be a tok::verbatim_line_text if verbatim line
711 // starting command comes just before a newline or comment end.
712 if (Tok
.is(tok::verbatim_line_text
)) {
713 TextBegin
= Tok
.getLocation();
714 Text
= Tok
.getVerbatimLineText();
716 TextBegin
= NameTok
.getEndLocation();
720 VerbatimLineComment
*VL
= S
.actOnVerbatimLine(NameTok
.getLocation(),
721 NameTok
.getVerbatimLineID(),
/// Dispatches on the current token kind to the matching block-level parser:
/// parseParagraphOrBlockCommand(), parseVerbatimBlock() or
/// parseVerbatimLine().
/// The verbatim-body and HTML-attribute token kinds listed at the end are
/// treated as unreachable here.
728 BlockContentComment
*Parser::parseBlockContent() {
729 switch (Tok
.getKind()) {
731 case tok::unknown_command
:
732 case tok::backslash_command
:
733 case tok::at_command
:
734 case tok::html_start_tag
:
735 case tok::html_end_tag
:
736 return parseParagraphOrBlockCommand();
738 case tok::verbatim_block_begin
:
739 return parseVerbatimBlock();
741 case tok::verbatim_line_name
:
742 return parseVerbatimLine();
746 case tok::verbatim_block_line
:
747 case tok::verbatim_block_end
:
748 case tok::verbatim_line_text
:
749 case tok::html_ident
:
750 case tok::html_equals
:
751 case tok::html_quoted_string
:
752 case tok::html_greater
:
753 case tok::html_slash_greater
:
754 llvm_unreachable("should not see this token");
756 llvm_unreachable("bogus token kind");
/// Parses a whole comment.
///
/// Leading newlines are skipped, then block content is parsed repeatedly
/// until tok::eof (skipping newlines after each block), and the collected
/// blocks are handed to actOnFullComment.
759 FullComment
*Parser::parseFullComment() {
760 // Skip newlines at the beginning of the comment.
761 while (Tok
.is(tok::newline
))
764 SmallVector
<BlockContentComment
*, 8> Blocks
;
765 while (Tok
.isNot(tok::eof
)) {
766 Blocks
.push_back(parseBlockContent());
768 // Skip extra newlines after paragraph end.
769 while (Tok
.is(tok::newline
))
772 return S
.actOnFullComment(S
.copyArray(llvm::makeArrayRef(Blocks
)));
775 } // end namespace comments
776 } // end namespace clang