1 //===--- CommentLexer.cpp -------------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "clang/AST/CommentLexer.h"
10 #include "clang/AST/CommentCommandTraits.h"
11 #include "clang/AST/CommentDiagnostic.h"
12 #include "clang/Basic/CharInfo.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/StringSwitch.h"
15 #include "llvm/Support/ConvertUTF.h"
16 #include "llvm/Support/ErrorHandling.h"
21 void Token::dump(const Lexer
&L
, const SourceManager
&SM
) const {
22 llvm::errs() << "comments::Token Kind=" << Kind
<< " ";
23 Loc
.print(llvm::errs(), SM
);
24 llvm::errs() << " " << Length
<< " \"" << L
.getSpelling(*this, SM
) << "\"\n";
27 static inline bool isHTMLNamedCharacterReferenceCharacter(char C
) {
31 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C
) {
35 static inline bool isHTMLHexCharacterReferenceCharacter(char C
) {
39 static inline StringRef
convertCodePointToUTF8(
40 llvm::BumpPtrAllocator
&Allocator
,
42 char *Resolved
= Allocator
.Allocate
<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT
);
43 char *ResolvedPtr
= Resolved
;
44 if (llvm::ConvertCodePointToUTF8(CodePoint
, ResolvedPtr
))
45 return StringRef(Resolved
, ResolvedPtr
- Resolved
);
52 #include "clang/AST/CommentHTMLTags.inc"
53 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
55 } // end anonymous namespace
57 StringRef
Lexer::resolveHTMLNamedCharacterReference(StringRef Name
) const {
58 // Fast path: first check a few of the most widely used named character references.
59 return llvm::StringSwitch
<StringRef
>(Name
)
66 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name
));
69 StringRef
Lexer::resolveHTMLDecimalCharacterReference(StringRef Name
) const {
70 unsigned CodePoint
= 0;
71 for (unsigned i
= 0, e
= Name
.size(); i
!= e
; ++i
) {
72 assert(isHTMLDecimalCharacterReferenceCharacter(Name
[i
]));
74 CodePoint
+= Name
[i
] - '0';
76 return convertCodePointToUTF8(Allocator
, CodePoint
);
79 StringRef
Lexer::resolveHTMLHexCharacterReference(StringRef Name
) const {
80 unsigned CodePoint
= 0;
81 for (unsigned i
= 0, e
= Name
.size(); i
!= e
; ++i
) {
83 const char C
= Name
[i
];
84 assert(isHTMLHexCharacterReferenceCharacter(C
));
85 CodePoint
+= llvm::hexDigitValue(C
);
87 return convertCodePointToUTF8(Allocator
, CodePoint
);
90 void Lexer::skipLineStartingDecorations() {
91 // This function should be called only for C comments
92 assert(CommentState
== LCS_InsideCComment
);
94 if (BufferPtr
== CommentEnd
)
97 const char *NewBufferPtr
= BufferPtr
;
98 while (isHorizontalWhitespace(*NewBufferPtr
))
99 if (++NewBufferPtr
== CommentEnd
)
101 if (*NewBufferPtr
== '*')
102 BufferPtr
= NewBufferPtr
+ 1;
106 /// Returns pointer to the first newline character in the string.
107 const char *findNewline(const char *BufferPtr
, const char *BufferEnd
) {
108 for ( ; BufferPtr
!= BufferEnd
; ++BufferPtr
) {
109 if (isVerticalWhitespace(*BufferPtr
))
115 const char *skipNewline(const char *BufferPtr
, const char *BufferEnd
) {
116 if (BufferPtr
== BufferEnd
)
119 if (*BufferPtr
== '\n')
122 assert(*BufferPtr
== '\r');
124 if (BufferPtr
!= BufferEnd
&& *BufferPtr
== '\n')
130 const char *skipNamedCharacterReference(const char *BufferPtr
,
131 const char *BufferEnd
) {
132 for ( ; BufferPtr
!= BufferEnd
; ++BufferPtr
) {
133 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr
))
139 const char *skipDecimalCharacterReference(const char *BufferPtr
,
140 const char *BufferEnd
) {
141 for ( ; BufferPtr
!= BufferEnd
; ++BufferPtr
) {
142 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr
))
148 const char *skipHexCharacterReference(const char *BufferPtr
,
149 const char *BufferEnd
) {
150 for ( ; BufferPtr
!= BufferEnd
; ++BufferPtr
) {
151 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr
))
157 bool isHTMLIdentifierStartingCharacter(char C
) {
161 bool isHTMLIdentifierCharacter(char C
) {
162 return isAlphanumeric(C
);
165 const char *skipHTMLIdentifier(const char *BufferPtr
, const char *BufferEnd
) {
166 for ( ; BufferPtr
!= BufferEnd
; ++BufferPtr
) {
167 if (!isHTMLIdentifierCharacter(*BufferPtr
))
173 /// Skip HTML string quoted in single or double quotes. Escaping quotes inside
176 /// Returns pointer to closing quote.
177 const char *skipHTMLQuotedString(const char *BufferPtr
, const char *BufferEnd
)
179 const char Quote
= *BufferPtr
;
180 assert(Quote
== '\"' || Quote
== '\'');
183 for ( ; BufferPtr
!= BufferEnd
; ++BufferPtr
) {
184 const char C
= *BufferPtr
;
185 if (C
== Quote
&& BufferPtr
[-1] != '\\')
191 const char *skipWhitespace(const char *BufferPtr
, const char *BufferEnd
) {
192 for ( ; BufferPtr
!= BufferEnd
; ++BufferPtr
) {
193 if (!isWhitespace(*BufferPtr
))
199 bool isWhitespace(const char *BufferPtr
, const char *BufferEnd
) {
200 return skipWhitespace(BufferPtr
, BufferEnd
) == BufferEnd
;
203 bool isCommandNameStartCharacter(char C
) {
207 bool isCommandNameCharacter(char C
) {
208 return isAlphanumeric(C
);
211 const char *skipCommandName(const char *BufferPtr
, const char *BufferEnd
) {
212 for ( ; BufferPtr
!= BufferEnd
; ++BufferPtr
) {
213 if (!isCommandNameCharacter(*BufferPtr
))
219 /// Return the one past end pointer for BCPL comments.
220 /// Handles newlines escaped with backslash or trigraph for backslash.
221 const char *findBCPLCommentEnd(const char *BufferPtr
, const char *BufferEnd
) {
222 const char *CurPtr
= BufferPtr
;
223 while (CurPtr
!= BufferEnd
) {
224 while (!isVerticalWhitespace(*CurPtr
)) {
226 if (CurPtr
== BufferEnd
)
229 // We found a newline, check if it is escaped.
230 const char *EscapePtr
= CurPtr
- 1;
231 while(isHorizontalWhitespace(*EscapePtr
))
234 if (*EscapePtr
== '\\' ||
235 (EscapePtr
- 2 >= BufferPtr
&& EscapePtr
[0] == '/' &&
236 EscapePtr
[-1] == '?' && EscapePtr
[-2] == '?')) {
237 // We found an escaped newline.
238 CurPtr
= skipNewline(CurPtr
, BufferEnd
);
240 return CurPtr
; // Not an escaped newline.
245 /// Return the one past end pointer for C comments.
246 /// Very dumb, does not handle escaped newlines or trigraphs.
247 const char *findCCommentEnd(const char *BufferPtr
, const char *BufferEnd
) {
248 for ( ; BufferPtr
!= BufferEnd
; ++BufferPtr
) {
249 if (*BufferPtr
== '*') {
250 assert(BufferPtr
+ 1 != BufferEnd
);
251 if (*(BufferPtr
+ 1) == '/')
255 llvm_unreachable("buffer end hit before '*/' was seen");
258 } // end anonymous namespace
260 void Lexer::formTokenWithChars(Token
&Result
, const char *TokEnd
,
261 tok::TokenKind Kind
) {
262 const unsigned TokLen
= TokEnd
- BufferPtr
;
263 Result
.setLocation(getSourceLocation(BufferPtr
));
264 Result
.setKind(Kind
);
265 Result
.setLength(TokLen
);
267 Result
.TextPtr
= "<UNSET>";
273 const char *Lexer::skipTextToken() {
274 const char *TokenPtr
= BufferPtr
;
275 assert(TokenPtr
< CommentEnd
);
276 StringRef TokStartSymbols
= ParseCommands
? "\n\r\\@\"&<" : "\n\r";
280 StringRef(TokenPtr
, CommentEnd
- TokenPtr
).find_first_of(TokStartSymbols
);
281 if (End
== StringRef::npos
)
284 // Doxygen doesn't recognize any commands in a one-line double quotation.
285 // If we don't find an ending quotation mark, we pretend it never began.
286 if (*(TokenPtr
+ End
) == '\"') {
288 End
= StringRef(TokenPtr
, CommentEnd
- TokenPtr
).find_first_of("\n\r\"");
289 if (End
!= StringRef::npos
&& *(TokenPtr
+ End
) == '\"')
293 return TokenPtr
+ End
;
296 void Lexer::lexCommentText(Token
&T
) {
297 assert(CommentState
== LCS_InsideBCPLComment
||
298 CommentState
== LCS_InsideCComment
);
300 // Handles lexing non-command text, i.e. text and newline.
301 auto HandleNonCommandToken
= [&]() -> void {
302 assert(State
== LS_Normal
);
304 const char *TokenPtr
= BufferPtr
;
305 assert(TokenPtr
< CommentEnd
);
309 TokenPtr
= skipNewline(TokenPtr
, CommentEnd
);
310 formTokenWithChars(T
, TokenPtr
, tok::newline
);
312 if (CommentState
== LCS_InsideCComment
)
313 skipLineStartingDecorations();
317 return formTextToken(T
, skipTextToken());
322 return HandleNonCommandToken();
327 case LS_VerbatimBlockFirstLine
:
328 lexVerbatimBlockFirstLine(T
);
330 case LS_VerbatimBlockBody
:
331 lexVerbatimBlockBody(T
);
333 case LS_VerbatimLineText
:
334 lexVerbatimLineText(T
);
336 case LS_HTMLStartTag
:
344 assert(State
== LS_Normal
);
345 const char *TokenPtr
= BufferPtr
;
346 assert(TokenPtr
< CommentEnd
);
350 // Commands that start with a backslash and commands that start with
351 // 'at' have equivalent semantics. But we keep information about the
352 // exact syntax in AST for comments.
353 tok::TokenKind CommandKind
=
354 (*TokenPtr
== '@') ? tok::at_command
: tok::backslash_command
;
356 if (TokenPtr
== CommentEnd
) {
357 formTextToken(T
, TokenPtr
);
365 case '\\': case '@': case '&': case '$':
366 case '#': case '<': case '>': case '%':
367 case '\"': case '.': case ':':
368 // This is one of \\ \@ \& \$ etc escape sequences.
370 if (C
== ':' && TokenPtr
!= CommentEnd
&& *TokenPtr
== ':') {
371 // This is the \:: escape sequence.
374 StringRef
UnescapedText(BufferPtr
+ 1, TokenPtr
- (BufferPtr
+ 1));
375 formTokenWithChars(T
, TokenPtr
, tok::text
);
376 T
.setText(UnescapedText
);
380 // Don't make zero-length commands.
381 if (!isCommandNameStartCharacter(*TokenPtr
)) {
382 formTextToken(T
, TokenPtr
);
386 TokenPtr
= skipCommandName(TokenPtr
, CommentEnd
);
387 unsigned Length
= TokenPtr
- (BufferPtr
+ 1);
389 // Hardcoded support for lexing LaTeX formula commands
390 // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
391 if (Length
== 1 && TokenPtr
[-1] == 'f' && TokenPtr
!= CommentEnd
) {
393 if (C
== '$' || C
== '(' || C
== ')' || C
== '[' || C
== ']' ||
394 C
== '{' || C
== '}') {
400 StringRef
CommandName(BufferPtr
+ 1, Length
);
402 const CommandInfo
*Info
= Traits
.getCommandInfoOrNULL(CommandName
);
404 if ((Info
= Traits
.getTypoCorrectCommandInfo(CommandName
))) {
405 StringRef CorrectedName
= Info
->Name
;
406 SourceLocation Loc
= getSourceLocation(BufferPtr
);
407 SourceLocation EndLoc
= getSourceLocation(TokenPtr
);
408 SourceRange FullRange
= SourceRange(Loc
, EndLoc
);
409 SourceRange
CommandRange(Loc
.getLocWithOffset(1), EndLoc
);
410 Diag(Loc
, diag::warn_correct_comment_command_name
)
411 << FullRange
<< CommandName
<< CorrectedName
412 << FixItHint::CreateReplacement(CommandRange
, CorrectedName
);
414 formTokenWithChars(T
, TokenPtr
, tok::unknown_command
);
415 T
.setUnknownCommandName(CommandName
);
416 Diag(T
.getLocation(), diag::warn_unknown_comment_command_name
)
417 << SourceRange(T
.getLocation(), T
.getEndLocation());
421 if (Info
->IsVerbatimBlockCommand
) {
422 setupAndLexVerbatimBlock(T
, TokenPtr
, *BufferPtr
, Info
);
425 if (Info
->IsVerbatimLineCommand
) {
426 setupAndLexVerbatimLine(T
, TokenPtr
, Info
);
429 formTokenWithChars(T
, TokenPtr
, CommandKind
);
430 T
.setCommandID(Info
->getID());
435 lexHTMLCharacterReference(T
);
440 if (TokenPtr
== CommentEnd
) {
441 formTextToken(T
, TokenPtr
);
444 const char C
= *TokenPtr
;
445 if (isHTMLIdentifierStartingCharacter(C
))
446 setupAndLexHTMLStartTag(T
);
448 setupAndLexHTMLEndTag(T
);
450 formTextToken(T
, TokenPtr
);
455 return HandleNonCommandToken();
459 void Lexer::setupAndLexVerbatimBlock(Token
&T
,
460 const char *TextBegin
,
461 char Marker
, const CommandInfo
*Info
) {
462 assert(Info
->IsVerbatimBlockCommand
);
464 VerbatimBlockEndCommandName
.clear();
465 VerbatimBlockEndCommandName
.append(Marker
== '\\' ? "\\" : "@");
466 VerbatimBlockEndCommandName
.append(Info
->EndCommandName
);
468 formTokenWithChars(T
, TextBegin
, tok::verbatim_block_begin
);
469 T
.setVerbatimBlockID(Info
->getID());
471 // If there is a newline following the verbatim opening command, skip the
472 // newline so that we don't create a tok::verbatim_block_line with empty
474 if (BufferPtr
!= CommentEnd
&&
475 isVerticalWhitespace(*BufferPtr
)) {
476 BufferPtr
= skipNewline(BufferPtr
, CommentEnd
);
477 State
= LS_VerbatimBlockBody
;
481 State
= LS_VerbatimBlockFirstLine
;
484 void Lexer::lexVerbatimBlockFirstLine(Token
&T
) {
486 assert(BufferPtr
< CommentEnd
);
488 // FIXME: It would be better to scan the text once, finding either the block
489 // end command or newline.
491 // Extract current line.
492 const char *Newline
= findNewline(BufferPtr
, CommentEnd
);
493 StringRef
Line(BufferPtr
, Newline
- BufferPtr
);
495 // Look for end command in current line.
496 size_t Pos
= Line
.find(VerbatimBlockEndCommandName
);
498 const char *NextLine
;
499 if (Pos
== StringRef::npos
) {
500 // Current line is completely verbatim.
502 NextLine
= skipNewline(Newline
, CommentEnd
);
503 } else if (Pos
== 0) {
504 // Current line contains just an end command.
505 const char *End
= BufferPtr
+ VerbatimBlockEndCommandName
.size();
506 StringRef
Name(BufferPtr
+ 1, End
- (BufferPtr
+ 1));
507 formTokenWithChars(T
, End
, tok::verbatim_block_end
);
508 T
.setVerbatimBlockID(Traits
.getCommandInfo(Name
)->getID());
512 // There is some text, followed by end command. Extract text first.
513 TextEnd
= BufferPtr
+ Pos
;
515 // If there is only whitespace before end command, skip whitespace.
516 if (isWhitespace(BufferPtr
, TextEnd
)) {
522 StringRef
Text(BufferPtr
, TextEnd
- BufferPtr
);
523 formTokenWithChars(T
, NextLine
, tok::verbatim_block_line
);
524 T
.setVerbatimBlockText(Text
);
526 State
= LS_VerbatimBlockBody
;
529 void Lexer::lexVerbatimBlockBody(Token
&T
) {
530 assert(State
== LS_VerbatimBlockBody
);
532 if (CommentState
== LCS_InsideCComment
)
533 skipLineStartingDecorations();
535 if (BufferPtr
== CommentEnd
) {
536 formTokenWithChars(T
, BufferPtr
, tok::verbatim_block_line
);
537 T
.setVerbatimBlockText("");
541 lexVerbatimBlockFirstLine(T
);
544 void Lexer::setupAndLexVerbatimLine(Token
&T
, const char *TextBegin
,
545 const CommandInfo
*Info
) {
546 assert(Info
->IsVerbatimLineCommand
);
547 formTokenWithChars(T
, TextBegin
, tok::verbatim_line_name
);
548 T
.setVerbatimLineID(Info
->getID());
550 State
= LS_VerbatimLineText
;
553 void Lexer::lexVerbatimLineText(Token
&T
) {
554 assert(State
== LS_VerbatimLineText
);
556 // Extract current line.
557 const char *Newline
= findNewline(BufferPtr
, CommentEnd
);
558 StringRef
Text(BufferPtr
, Newline
- BufferPtr
);
559 formTokenWithChars(T
, Newline
, tok::verbatim_line_text
);
560 T
.setVerbatimLineText(Text
);
565 void Lexer::lexHTMLCharacterReference(Token
&T
) {
566 const char *TokenPtr
= BufferPtr
;
567 assert(*TokenPtr
== '&');
569 if (TokenPtr
== CommentEnd
) {
570 formTextToken(T
, TokenPtr
);
574 bool isNamed
= false;
575 bool isDecimal
= false;
577 if (isHTMLNamedCharacterReferenceCharacter(C
)) {
579 TokenPtr
= skipNamedCharacterReference(TokenPtr
, CommentEnd
);
581 } else if (C
== '#') {
583 if (TokenPtr
== CommentEnd
) {
584 formTextToken(T
, TokenPtr
);
588 if (isHTMLDecimalCharacterReferenceCharacter(C
)) {
590 TokenPtr
= skipDecimalCharacterReference(TokenPtr
, CommentEnd
);
592 } else if (C
== 'x' || C
== 'X') {
595 TokenPtr
= skipHexCharacterReference(TokenPtr
, CommentEnd
);
597 formTextToken(T
, TokenPtr
);
601 formTextToken(T
, TokenPtr
);
604 if (NamePtr
== TokenPtr
|| TokenPtr
== CommentEnd
||
606 formTextToken(T
, TokenPtr
);
609 StringRef
Name(NamePtr
, TokenPtr
- NamePtr
);
610 TokenPtr
++; // Skip semicolon.
613 Resolved
= resolveHTMLNamedCharacterReference(Name
);
615 Resolved
= resolveHTMLDecimalCharacterReference(Name
);
617 Resolved
= resolveHTMLHexCharacterReference(Name
);
619 if (Resolved
.empty()) {
620 formTextToken(T
, TokenPtr
);
623 formTokenWithChars(T
, TokenPtr
, tok::text
);
627 void Lexer::setupAndLexHTMLStartTag(Token
&T
) {
628 assert(BufferPtr
[0] == '<' &&
629 isHTMLIdentifierStartingCharacter(BufferPtr
[1]));
630 const char *TagNameEnd
= skipHTMLIdentifier(BufferPtr
+ 2, CommentEnd
);
631 StringRef
Name(BufferPtr
+ 1, TagNameEnd
- (BufferPtr
+ 1));
632 if (!isHTMLTagName(Name
)) {
633 formTextToken(T
, TagNameEnd
);
637 formTokenWithChars(T
, TagNameEnd
, tok::html_start_tag
);
638 T
.setHTMLTagStartName(Name
);
640 BufferPtr
= skipWhitespace(BufferPtr
, CommentEnd
);
642 const char C
= *BufferPtr
;
643 if (BufferPtr
!= CommentEnd
&&
644 (C
== '>' || C
== '/' || isHTMLIdentifierStartingCharacter(C
)))
645 State
= LS_HTMLStartTag
;
648 void Lexer::lexHTMLStartTag(Token
&T
) {
649 assert(State
== LS_HTMLStartTag
);
651 const char *TokenPtr
= BufferPtr
;
653 if (isHTMLIdentifierCharacter(C
)) {
654 TokenPtr
= skipHTMLIdentifier(TokenPtr
, CommentEnd
);
655 StringRef
Ident(BufferPtr
, TokenPtr
- BufferPtr
);
656 formTokenWithChars(T
, TokenPtr
, tok::html_ident
);
657 T
.setHTMLIdent(Ident
);
662 formTokenWithChars(T
, TokenPtr
, tok::html_equals
);
666 const char *OpenQuote
= TokenPtr
;
667 TokenPtr
= skipHTMLQuotedString(TokenPtr
, CommentEnd
);
668 const char *ClosingQuote
= TokenPtr
;
669 if (TokenPtr
!= CommentEnd
) // Skip closing quote.
671 formTokenWithChars(T
, TokenPtr
, tok::html_quoted_string
);
672 T
.setHTMLQuotedString(StringRef(OpenQuote
+ 1,
673 ClosingQuote
- (OpenQuote
+ 1)));
678 formTokenWithChars(T
, TokenPtr
, tok::html_greater
);
683 if (TokenPtr
!= CommentEnd
&& *TokenPtr
== '>') {
685 formTokenWithChars(T
, TokenPtr
, tok::html_slash_greater
);
687 formTextToken(T
, TokenPtr
);
694 // Now look ahead and return to normal state if we don't see any HTML tokens
696 BufferPtr
= skipWhitespace(BufferPtr
, CommentEnd
);
697 if (BufferPtr
== CommentEnd
) {
703 if (!isHTMLIdentifierStartingCharacter(C
) &&
704 C
!= '=' && C
!= '\"' && C
!= '\'' && C
!= '>' && C
!= '/') {
710 void Lexer::setupAndLexHTMLEndTag(Token
&T
) {
711 assert(BufferPtr
[0] == '<' && BufferPtr
[1] == '/');
713 const char *TagNameBegin
= skipWhitespace(BufferPtr
+ 2, CommentEnd
);
714 const char *TagNameEnd
= skipHTMLIdentifier(TagNameBegin
, CommentEnd
);
715 StringRef
Name(TagNameBegin
, TagNameEnd
- TagNameBegin
);
716 if (!isHTMLTagName(Name
)) {
717 formTextToken(T
, TagNameEnd
);
721 const char *End
= skipWhitespace(TagNameEnd
, CommentEnd
);
723 formTokenWithChars(T
, End
, tok::html_end_tag
);
724 T
.setHTMLTagEndName(Name
);
726 if (BufferPtr
!= CommentEnd
&& *BufferPtr
== '>')
727 State
= LS_HTMLEndTag
;
730 void Lexer::lexHTMLEndTag(Token
&T
) {
731 assert(BufferPtr
!= CommentEnd
&& *BufferPtr
== '>');
733 formTokenWithChars(T
, BufferPtr
+ 1, tok::html_greater
);
/// Constructs a comment lexer over the buffer delimited by BufferStart and
/// BufferEnd.
///
/// The allocator, diagnostics engine, command traits, file location and
/// ParseCommands flag are stored as given. The read position (BufferPtr)
/// starts at BufferStart, the comment state starts as LCS_BeforeComment and
/// the lexer state as LS_Normal. The constructor body is empty; no lexing
/// is performed here.
737 Lexer::Lexer(llvm::BumpPtrAllocator
&Allocator
, DiagnosticsEngine
&Diags
,
738 const CommandTraits
&Traits
, SourceLocation FileLoc
,
739 const char *BufferStart
, const char *BufferEnd
, bool ParseCommands
)
740 : Allocator(Allocator
), Diags(Diags
), Traits(Traits
),
741 BufferStart(BufferStart
), BufferEnd(BufferEnd
), BufferPtr(BufferStart
),
742 FileLoc(FileLoc
), ParseCommands(ParseCommands
),
743 CommentState(LCS_BeforeComment
), State(LS_Normal
) {}
745 void Lexer::lex(Token
&T
) {
747 switch (CommentState
) {
748 case LCS_BeforeComment
:
749 if (BufferPtr
== BufferEnd
) {
750 formTokenWithChars(T
, BufferPtr
, tok::eof
);
754 assert(*BufferPtr
== '/');
755 BufferPtr
++; // Skip first slash.
757 case '/': { // BCPL comment.
758 BufferPtr
++; // Skip second slash.
760 if (BufferPtr
!= BufferEnd
) {
761 // Skip Doxygen magic marker, if it is present.
762 // It might be missing because of a typo //< or /*<, or because we
763 // merged this non-Doxygen comment into a bunch of Doxygen comments
764 // around it: /** ... */ /* ... */ /** ... */
765 const char C
= *BufferPtr
;
766 if (C
== '/' || C
== '!')
770 // Skip less-than symbol that marks trailing comments.
771 // Skip it even if the comment is not a Doxygen one, because //< and /*<
772 // are frequent typos.
773 if (BufferPtr
!= BufferEnd
&& *BufferPtr
== '<')
776 CommentState
= LCS_InsideBCPLComment
;
777 if (State
!= LS_VerbatimBlockBody
&& State
!= LS_VerbatimBlockFirstLine
)
779 CommentEnd
= findBCPLCommentEnd(BufferPtr
, BufferEnd
);
782 case '*': { // C comment.
783 BufferPtr
++; // Skip star.
785 // Skip Doxygen magic marker.
786 const char C
= *BufferPtr
;
787 if ((C
== '*' && *(BufferPtr
+ 1) != '/') || C
== '!')
790 // Skip less-than symbol that marks trailing comments.
791 if (BufferPtr
!= BufferEnd
&& *BufferPtr
== '<')
794 CommentState
= LCS_InsideCComment
;
796 CommentEnd
= findCCommentEnd(BufferPtr
, BufferEnd
);
800 llvm_unreachable("second character of comment should be '/' or '*'");
803 case LCS_BetweenComments
: {
804 // Consecutive comments are extracted only if there is only whitespace
805 // between them. So we can search for the start of the next comment.
806 const char *EndWhitespace
= BufferPtr
;
807 while(EndWhitespace
!= BufferEnd
&& *EndWhitespace
!= '/')
810 // Turn any whitespace between comments (and there is only whitespace
811 // between them -- guaranteed by comment extraction) into a newline. We
812 // have two newlines between C comments in total (first one was synthesized
814 formTokenWithChars(T
, EndWhitespace
, tok::newline
);
816 CommentState
= LCS_BeforeComment
;
820 case LCS_InsideBCPLComment
:
821 case LCS_InsideCComment
:
822 if (BufferPtr
!= CommentEnd
) {
826 // Skip C comment closing sequence.
827 if (CommentState
== LCS_InsideCComment
) {
828 assert(BufferPtr
[0] == '*' && BufferPtr
[1] == '/');
830 assert(BufferPtr
<= BufferEnd
);
832 // Synthesize newline just after the C comment, regardless of whether there is
833 // actually a newline.
834 formTokenWithChars(T
, BufferPtr
, tok::newline
);
836 CommentState
= LCS_BetweenComments
;
839 // Don't synthesize a newline after a BCPL comment.
840 CommentState
= LCS_BetweenComments
;
847 StringRef
Lexer::getSpelling(const Token
&Tok
,
848 const SourceManager
&SourceMgr
) const {
849 SourceLocation Loc
= Tok
.getLocation();
850 std::pair
<FileID
, unsigned> LocInfo
= SourceMgr
.getDecomposedLoc(Loc
);
852 bool InvalidTemp
= false;
853 StringRef File
= SourceMgr
.getBufferData(LocInfo
.first
, &InvalidTemp
);
857 const char *Begin
= File
.data() + LocInfo
.second
;
858 return StringRef(Begin
, Tok
.getLength());
861 } // end namespace comments
862 } // end namespace clang