1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
13 //===----------------------------------------------------------------------===//
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
25 FormatTokenLexer::FormatTokenLexer(
26 const SourceManager
&SourceMgr
, FileID ID
, unsigned Column
,
27 const FormatStyle
&Style
, encoding::Encoding Encoding
,
28 llvm::SpecificBumpPtrAllocator
<FormatToken
> &Allocator
,
29 IdentifierTable
&IdentTable
)
30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL
}),
31 Column(Column
), TrailingWhitespace(0),
32 LangOpts(getFormattingLangOpts(Style
)), SourceMgr(SourceMgr
), ID(ID
),
33 Style(Style
), IdentTable(IdentTable
), Keywords(IdentTable
),
34 Encoding(Encoding
), Allocator(Allocator
), FirstInLineIndex(0),
35 FormattingDisabled(false), MacroBlockBeginRegex(Style
.MacroBlockBegin
),
36 MacroBlockEndRegex(Style
.MacroBlockEnd
) {
37 Lex
.reset(new Lexer(ID
, SourceMgr
.getBufferOrFake(ID
), SourceMgr
, LangOpts
));
38 Lex
->SetKeepWhitespaceMode(true);
40 for (const std::string
&ForEachMacro
: Style
.ForEachMacros
) {
41 auto Identifier
= &IdentTable
.get(ForEachMacro
);
42 Macros
.insert({Identifier
, TT_ForEachMacro
});
44 for (const std::string
&IfMacro
: Style
.IfMacros
) {
45 auto Identifier
= &IdentTable
.get(IfMacro
);
46 Macros
.insert({Identifier
, TT_IfMacro
});
48 for (const std::string
&AttributeMacro
: Style
.AttributeMacros
) {
49 auto Identifier
= &IdentTable
.get(AttributeMacro
);
50 Macros
.insert({Identifier
, TT_AttributeMacro
});
52 for (const std::string
&StatementMacro
: Style
.StatementMacros
) {
53 auto Identifier
= &IdentTable
.get(StatementMacro
);
54 Macros
.insert({Identifier
, TT_StatementMacro
});
56 for (const std::string
&TypenameMacro
: Style
.TypenameMacros
) {
57 auto Identifier
= &IdentTable
.get(TypenameMacro
);
58 Macros
.insert({Identifier
, TT_TypenameMacro
});
60 for (const std::string
&NamespaceMacro
: Style
.NamespaceMacros
) {
61 auto Identifier
= &IdentTable
.get(NamespaceMacro
);
62 Macros
.insert({Identifier
, TT_NamespaceMacro
});
64 for (const std::string
&WhitespaceSensitiveMacro
:
65 Style
.WhitespaceSensitiveMacros
) {
66 auto Identifier
= &IdentTable
.get(WhitespaceSensitiveMacro
);
67 Macros
.insert({Identifier
, TT_UntouchableMacroFunc
});
69 for (const std::string
&StatementAttributeLikeMacro
:
70 Style
.StatementAttributeLikeMacros
) {
71 auto Identifier
= &IdentTable
.get(StatementAttributeLikeMacro
);
72 Macros
.insert({Identifier
, TT_StatementAttributeLikeMacro
});
75 for (const auto &TypeName
: Style
.TypeNames
)
76 TypeNames
.insert(&IdentTable
.get(TypeName
));
79 ArrayRef
<FormatToken
*> FormatTokenLexer::lex() {
80 assert(Tokens
.empty());
81 assert(FirstInLineIndex
== 0);
83 Tokens
.push_back(getNextToken());
84 if (Style
.isJavaScript()) {
85 tryParseJSRegexLiteral();
86 handleTemplateStrings();
88 if (Style
.Language
== FormatStyle::LK_TextProto
)
89 tryParsePythonComment();
90 tryMergePreviousTokens();
91 if (Style
.isCSharp()) {
92 // This needs to come after tokens have been merged so that C#
93 // string literals are correctly identified.
94 handleCSharpVerbatimAndInterpolatedStrings();
96 if (Tokens
.back()->NewlinesBefore
> 0 || Tokens
.back()->IsMultiline
)
97 FirstInLineIndex
= Tokens
.size() - 1;
98 } while (Tokens
.back()->isNot(tok::eof
));
102 void FormatTokenLexer::tryMergePreviousTokens() {
103 if (tryMerge_TMacro())
105 if (tryMergeConflictMarkers())
107 if (tryMergeLessLess())
109 if (tryMergeGreaterGreater())
111 if (tryMergeForEach())
113 if (Style
.isCpp() && tryTransformTryUsageForC())
116 if (Style
.isJavaScript() || Style
.isCSharp()) {
117 static const tok::TokenKind NullishCoalescingOperator
[] = {tok::question
,
119 static const tok::TokenKind NullPropagatingOperator
[] = {tok::question
,
121 static const tok::TokenKind FatArrow
[] = {tok::equal
, tok::greater
};
123 if (tryMergeTokens(FatArrow
, TT_FatArrow
))
125 if (tryMergeTokens(NullishCoalescingOperator
, TT_NullCoalescingOperator
)) {
126 // Treat like the "||" operator (as opposed to the ternary ?).
127 Tokens
.back()->Tok
.setKind(tok::pipepipe
);
130 if (tryMergeTokens(NullPropagatingOperator
, TT_NullPropagatingOperator
)) {
131 // Treat like a regular "." access.
132 Tokens
.back()->Tok
.setKind(tok::period
);
135 if (tryMergeNullishCoalescingEqual())
139 if (Style
.isCSharp()) {
140 static const tok::TokenKind CSharpNullConditionalLSquare
[] = {
141 tok::question
, tok::l_square
};
143 if (tryMergeCSharpKeywordVariables())
145 if (tryMergeCSharpStringLiteral())
147 if (tryTransformCSharpForEach())
149 if (tryMergeTokens(CSharpNullConditionalLSquare
,
150 TT_CSharpNullConditionalLSquare
)) {
151 // Treat like a regular "[" operator.
152 Tokens
.back()->Tok
.setKind(tok::l_square
);
157 if (tryMergeNSStringLiteral())
160 if (Style
.isJavaScript()) {
161 static const tok::TokenKind JSIdentity
[] = {tok::equalequal
, tok::equal
};
162 static const tok::TokenKind JSNotIdentity
[] = {tok::exclaimequal
,
164 static const tok::TokenKind JSShiftEqual
[] = {tok::greater
, tok::greater
,
166 static const tok::TokenKind JSExponentiation
[] = {tok::star
, tok::star
};
167 static const tok::TokenKind JSExponentiationEqual
[] = {tok::star
,
169 static const tok::TokenKind JSPipePipeEqual
[] = {tok::pipepipe
, tok::equal
};
170 static const tok::TokenKind JSAndAndEqual
[] = {tok::ampamp
, tok::equal
};
172 // FIXME: Investigate what token type gives the correct operator priority.
173 if (tryMergeTokens(JSIdentity
, TT_BinaryOperator
))
175 if (tryMergeTokens(JSNotIdentity
, TT_BinaryOperator
))
177 if (tryMergeTokens(JSShiftEqual
, TT_BinaryOperator
))
179 if (tryMergeTokens(JSExponentiation
, TT_JsExponentiation
))
181 if (tryMergeTokens(JSExponentiationEqual
, TT_JsExponentiationEqual
)) {
182 Tokens
.back()->Tok
.setKind(tok::starequal
);
185 if (tryMergeTokens(JSAndAndEqual
, TT_JsAndAndEqual
) ||
186 tryMergeTokens(JSPipePipeEqual
, TT_JsPipePipeEqual
)) {
187 // Treat like the "=" assignment operator.
188 Tokens
.back()->Tok
.setKind(tok::equal
);
191 if (tryMergeJSPrivateIdentifier())
195 if (Style
.Language
== FormatStyle::LK_Java
) {
196 static const tok::TokenKind JavaRightLogicalShiftAssign
[] = {
197 tok::greater
, tok::greater
, tok::greaterequal
};
198 if (tryMergeTokens(JavaRightLogicalShiftAssign
, TT_BinaryOperator
))
202 if (Style
.isVerilog()) {
203 // Merge the number following a base like `'h?a0`.
204 if (Tokens
.size() >= 3 && Tokens
.end()[-3]->is(TT_VerilogNumberBase
) &&
205 Tokens
.end()[-2]->is(tok::numeric_constant
) &&
206 Tokens
.back()->isOneOf(tok::numeric_constant
, tok::identifier
,
208 tryMergeTokens(2, TT_Unknown
)) {
212 if (tryMergeTokensAny({{tok::minus
, tok::colon
}, {tok::plus
, tok::colon
}},
216 // Xnor. The combined token is treated as a caret which can also be either a
217 // unary or binary operator. The actual type is determined in
218 // TokenAnnotator. We also check the token length so we know it is not
219 // already a merged token.
220 if (Tokens
.back()->TokenText
.size() == 1 &&
221 tryMergeTokensAny({{tok::caret
, tok::tilde
}, {tok::tilde
, tok::caret
}},
222 TT_BinaryOperator
)) {
223 Tokens
.back()->Tok
.setKind(tok::caret
);
226 // Signed shift and distribution weight.
227 if (tryMergeTokens({tok::less
, tok::less
}, TT_BinaryOperator
)) {
228 Tokens
.back()->Tok
.setKind(tok::lessless
);
231 if (tryMergeTokens({tok::greater
, tok::greater
}, TT_BinaryOperator
)) {
232 Tokens
.back()->Tok
.setKind(tok::greatergreater
);
235 if (tryMergeTokensAny({{tok::lessless
, tok::equal
},
236 {tok::lessless
, tok::lessequal
},
237 {tok::greatergreater
, tok::equal
},
238 {tok::greatergreater
, tok::greaterequal
},
239 {tok::colon
, tok::equal
},
240 {tok::colon
, tok::slash
}},
241 TT_BinaryOperator
)) {
242 Tokens
.back()->ForcedPrecedence
= prec::Assignment
;
245 // Exponentiation, signed shift, case equality, and wildcard equality.
246 if (tryMergeTokensAny({{tok::star
, tok::star
},
247 {tok::lessless
, tok::less
},
248 {tok::greatergreater
, tok::greater
},
249 {tok::exclaimequal
, tok::equal
},
250 {tok::exclaimequal
, tok::question
},
251 {tok::equalequal
, tok::equal
},
252 {tok::equalequal
, tok::question
}},
253 TT_BinaryOperator
)) {
256 // Module paths in specify blocks and the implication and boolean equality
258 if (tryMergeTokensAny({{tok::plusequal
, tok::greater
},
259 {tok::plus
, tok::star
, tok::greater
},
260 {tok::minusequal
, tok::greater
},
261 {tok::minus
, tok::star
, tok::greater
},
262 {tok::less
, tok::arrow
},
263 {tok::equal
, tok::greater
},
264 {tok::star
, tok::greater
},
265 {tok::pipeequal
, tok::greater
},
266 {tok::pipe
, tok::arrow
},
267 {tok::hash
, tok::minus
, tok::hash
},
268 {tok::hash
, tok::equal
, tok::hash
}},
269 TT_BinaryOperator
) ||
270 Tokens
.back()->is(tok::arrow
)) {
271 Tokens
.back()->ForcedPrecedence
= prec::Comma
;
277 bool FormatTokenLexer::tryMergeNSStringLiteral() {
278 if (Tokens
.size() < 2)
280 auto &At
= *(Tokens
.end() - 2);
281 auto &String
= *(Tokens
.end() - 1);
282 if (At
->isNot(tok::at
) || String
->isNot(tok::string_literal
))
284 At
->Tok
.setKind(tok::string_literal
);
285 At
->TokenText
= StringRef(At
->TokenText
.begin(),
286 String
->TokenText
.end() - At
->TokenText
.begin());
287 At
->ColumnWidth
+= String
->ColumnWidth
;
288 At
->setType(TT_ObjCStringLiteral
);
289 Tokens
.erase(Tokens
.end() - 1);
293 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
294 // Merges #idenfier into a single identifier with the text #identifier
295 // but the token tok::identifier.
296 if (Tokens
.size() < 2)
298 auto &Hash
= *(Tokens
.end() - 2);
299 auto &Identifier
= *(Tokens
.end() - 1);
300 if (Hash
->isNot(tok::hash
) || Identifier
->isNot(tok::identifier
))
302 Hash
->Tok
.setKind(tok::identifier
);
304 StringRef(Hash
->TokenText
.begin(),
305 Identifier
->TokenText
.end() - Hash
->TokenText
.begin());
306 Hash
->ColumnWidth
+= Identifier
->ColumnWidth
;
307 Hash
->setType(TT_JsPrivateIdentifier
);
308 Tokens
.erase(Tokens
.end() - 1);
312 // Search for verbatim or interpolated string literals @"ABC" or
313 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
314 // prevent splitting of @, $ and ".
315 // Merging of multiline verbatim strings with embedded '"' is handled in
316 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
317 bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
318 if (Tokens
.size() < 2)
321 // Look for @"aaaaaa" or $"aaaaaa".
322 const auto String
= *(Tokens
.end() - 1);
323 if (String
->isNot(tok::string_literal
))
326 auto Prefix
= *(Tokens
.end() - 2);
327 if (Prefix
->isNot(tok::at
) && Prefix
->TokenText
!= "$")
330 if (Tokens
.size() > 2) {
331 const auto Tok
= *(Tokens
.end() - 3);
332 if ((Tok
->TokenText
== "$" && Prefix
->is(tok::at
)) ||
333 (Tok
->is(tok::at
) && Prefix
->TokenText
== "$")) {
334 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
335 Tok
->ColumnWidth
+= Prefix
->ColumnWidth
;
336 Tokens
.erase(Tokens
.end() - 2);
341 // Convert back into just a string_literal.
342 Prefix
->Tok
.setKind(tok::string_literal
);
344 StringRef(Prefix
->TokenText
.begin(),
345 String
->TokenText
.end() - Prefix
->TokenText
.begin());
346 Prefix
->ColumnWidth
+= String
->ColumnWidth
;
347 Prefix
->setType(TT_CSharpStringLiteral
);
348 Tokens
.erase(Tokens
.end() - 1);
352 // Valid C# attribute targets:
353 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
354 const llvm::StringSet
<> FormatTokenLexer::CSharpAttributeTargets
= {
355 "assembly", "module", "field", "event", "method",
356 "param", "property", "return", "type",
359 bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
360 if (Tokens
.size() < 2)
362 auto &NullishCoalescing
= *(Tokens
.end() - 2);
363 auto &Equal
= *(Tokens
.end() - 1);
364 if (NullishCoalescing
->getType() != TT_NullCoalescingOperator
||
365 Equal
->isNot(tok::equal
)) {
368 NullishCoalescing
->Tok
.setKind(tok::equal
); // no '??=' in clang tokens.
369 NullishCoalescing
->TokenText
=
370 StringRef(NullishCoalescing
->TokenText
.begin(),
371 Equal
->TokenText
.end() - NullishCoalescing
->TokenText
.begin());
372 NullishCoalescing
->ColumnWidth
+= Equal
->ColumnWidth
;
373 NullishCoalescing
->setType(TT_NullCoalescingEqual
);
374 Tokens
.erase(Tokens
.end() - 1);
378 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
379 if (Tokens
.size() < 2)
381 const auto At
= *(Tokens
.end() - 2);
382 if (At
->isNot(tok::at
))
384 const auto Keyword
= *(Tokens
.end() - 1);
385 if (Keyword
->TokenText
== "$")
387 if (!Keywords
.isCSharpKeyword(*Keyword
))
390 At
->Tok
.setKind(tok::identifier
);
391 At
->TokenText
= StringRef(At
->TokenText
.begin(),
392 Keyword
->TokenText
.end() - At
->TokenText
.begin());
393 At
->ColumnWidth
+= Keyword
->ColumnWidth
;
394 At
->setType(Keyword
->getType());
395 Tokens
.erase(Tokens
.end() - 1);
399 // In C# transform identifier foreach into kw_foreach
400 bool FormatTokenLexer::tryTransformCSharpForEach() {
401 if (Tokens
.size() < 1)
403 auto &Identifier
= *(Tokens
.end() - 1);
404 if (Identifier
->isNot(tok::identifier
))
406 if (Identifier
->TokenText
!= "foreach")
409 Identifier
->setType(TT_ForEachMacro
);
410 Identifier
->Tok
.setKind(tok::kw_for
);
414 bool FormatTokenLexer::tryMergeForEach() {
415 if (Tokens
.size() < 2)
417 auto &For
= *(Tokens
.end() - 2);
418 auto &Each
= *(Tokens
.end() - 1);
419 if (For
->isNot(tok::kw_for
))
421 if (Each
->isNot(tok::identifier
))
423 if (Each
->TokenText
!= "each")
426 For
->setType(TT_ForEachMacro
);
427 For
->Tok
.setKind(tok::kw_for
);
429 For
->TokenText
= StringRef(For
->TokenText
.begin(),
430 Each
->TokenText
.end() - For
->TokenText
.begin());
431 For
->ColumnWidth
+= Each
->ColumnWidth
;
432 Tokens
.erase(Tokens
.end() - 1);
436 bool FormatTokenLexer::tryTransformTryUsageForC() {
437 if (Tokens
.size() < 2)
439 auto &Try
= *(Tokens
.end() - 2);
440 if (Try
->isNot(tok::kw_try
))
442 auto &Next
= *(Tokens
.end() - 1);
443 if (Next
->isOneOf(tok::l_brace
, tok::colon
, tok::hash
, tok::comment
))
446 if (Tokens
.size() > 2) {
447 auto &At
= *(Tokens
.end() - 3);
452 Try
->Tok
.setKind(tok::identifier
);
456 bool FormatTokenLexer::tryMergeLessLess() {
457 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
458 if (Tokens
.size() < 3)
461 auto First
= Tokens
.end() - 3;
462 if (First
[0]->isNot(tok::less
) || First
[1]->isNot(tok::less
))
465 // Only merge if there currently is no whitespace between the two "<".
466 if (First
[1]->hasWhitespaceBefore())
469 auto X
= Tokens
.size() > 3 ? First
[-1] : nullptr;
470 if (X
&& X
->is(tok::less
))
474 if ((!X
|| X
->isNot(tok::kw_operator
)) && Y
->is(tok::less
))
477 First
[0]->Tok
.setKind(tok::lessless
);
478 First
[0]->TokenText
= "<<";
479 First
[0]->ColumnWidth
+= 1;
480 Tokens
.erase(Tokens
.end() - 2);
484 bool FormatTokenLexer::tryMergeGreaterGreater() {
485 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
486 if (Tokens
.size() < 2)
489 auto First
= Tokens
.end() - 2;
490 if (First
[0]->isNot(tok::greater
) || First
[1]->isNot(tok::greater
))
493 // Only merge if there currently is no whitespace between the first two ">".
494 if (First
[1]->hasWhitespaceBefore())
497 auto Tok
= Tokens
.size() > 2 ? First
[-1] : nullptr;
498 if (Tok
&& Tok
->isNot(tok::kw_operator
))
501 First
[0]->Tok
.setKind(tok::greatergreater
);
502 First
[0]->TokenText
= ">>";
503 First
[0]->ColumnWidth
+= 1;
504 Tokens
.erase(Tokens
.end() - 1);
508 bool FormatTokenLexer::tryMergeTokens(ArrayRef
<tok::TokenKind
> Kinds
,
510 if (Tokens
.size() < Kinds
.size())
513 SmallVectorImpl
<FormatToken
*>::const_iterator First
=
514 Tokens
.end() - Kinds
.size();
515 for (unsigned i
= 0; i
< Kinds
.size(); ++i
)
516 if (First
[i
]->isNot(Kinds
[i
]))
519 return tryMergeTokens(Kinds
.size(), NewType
);
522 bool FormatTokenLexer::tryMergeTokens(size_t Count
, TokenType NewType
) {
523 if (Tokens
.size() < Count
)
526 SmallVectorImpl
<FormatToken
*>::const_iterator First
= Tokens
.end() - Count
;
527 unsigned AddLength
= 0;
528 for (size_t i
= 1; i
< Count
; ++i
) {
529 // If there is whitespace separating the token and the previous one,
530 // they should not be merged.
531 if (First
[i
]->hasWhitespaceBefore())
533 AddLength
+= First
[i
]->TokenText
.size();
536 Tokens
.resize(Tokens
.size() - Count
+ 1);
537 First
[0]->TokenText
= StringRef(First
[0]->TokenText
.data(),
538 First
[0]->TokenText
.size() + AddLength
);
539 First
[0]->ColumnWidth
+= AddLength
;
540 First
[0]->setType(NewType
);
544 bool FormatTokenLexer::tryMergeTokensAny(
545 ArrayRef
<ArrayRef
<tok::TokenKind
>> Kinds
, TokenType NewType
) {
546 return llvm::any_of(Kinds
, [this, NewType
](ArrayRef
<tok::TokenKind
> Kinds
) {
547 return tryMergeTokens(Kinds
, NewType
);
551 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
552 bool FormatTokenLexer::precedesOperand(FormatToken
*Tok
) {
553 // NB: This is not entirely correct, as an r_paren can introduce an operand
554 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
555 // corner case to not matter in practice, though.
556 return Tok
->isOneOf(tok::period
, tok::l_paren
, tok::comma
, tok::l_brace
,
557 tok::r_brace
, tok::l_square
, tok::semi
, tok::exclaim
,
558 tok::colon
, tok::question
, tok::tilde
) ||
559 Tok
->isOneOf(tok::kw_return
, tok::kw_do
, tok::kw_case
, tok::kw_throw
,
560 tok::kw_else
, tok::kw_new
, tok::kw_delete
, tok::kw_void
,
561 tok::kw_typeof
, Keywords
.kw_instanceof
, Keywords
.kw_in
) ||
562 Tok
->isBinaryOperator();
565 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken
*Prev
) {
569 // Regex literals can only follow after prefix unary operators, not after
570 // postfix unary operators. If the '++' is followed by a non-operand
571 // introducing token, the slash here is the operand and not the start of a
573 // `!` is an unary prefix operator, but also a post-fix operator that casts
574 // away nullability, so the same check applies.
575 if (Prev
->isOneOf(tok::plusplus
, tok::minusminus
, tok::exclaim
))
576 return Tokens
.size() < 3 || precedesOperand(Tokens
[Tokens
.size() - 3]);
578 // The previous token must introduce an operand location where regex
579 // literals can occur.
580 if (!precedesOperand(Prev
))
586 // Tries to parse a JavaScript Regex literal starting at the current token,
587 // if that begins with a slash and is in a location where JavaScript allows
588 // regex literals. Changes the current token to a regex literal and updates
589 // its text if successful.
590 void FormatTokenLexer::tryParseJSRegexLiteral() {
591 FormatToken
*RegexToken
= Tokens
.back();
592 if (!RegexToken
->isOneOf(tok::slash
, tok::slashequal
))
595 FormatToken
*Prev
= nullptr;
596 for (FormatToken
*FT
: llvm::drop_begin(llvm::reverse(Tokens
))) {
597 // NB: Because previous pointers are not initialized yet, this cannot use
598 // Token.getPreviousNonComment.
599 if (FT
->isNot(tok::comment
)) {
605 if (!canPrecedeRegexLiteral(Prev
))
608 // 'Manually' lex ahead in the current file buffer.
609 const char *Offset
= Lex
->getBufferLocation();
610 const char *RegexBegin
= Offset
- RegexToken
->TokenText
.size();
611 StringRef Buffer
= Lex
->getBuffer();
612 bool InCharacterClass
= false;
613 bool HaveClosingSlash
= false;
614 for (; !HaveClosingSlash
&& Offset
!= Buffer
.end(); ++Offset
) {
615 // Regular expressions are terminated with a '/', which can only be
616 // escaped using '\' or a character class between '[' and ']'.
617 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
620 // Skip the escaped character.
624 InCharacterClass
= true;
627 InCharacterClass
= false;
630 if (!InCharacterClass
)
631 HaveClosingSlash
= true;
636 RegexToken
->setType(TT_RegexLiteral
);
637 // Treat regex literals like other string_literals.
638 RegexToken
->Tok
.setKind(tok::string_literal
);
639 RegexToken
->TokenText
= StringRef(RegexBegin
, Offset
- RegexBegin
);
640 RegexToken
->ColumnWidth
= RegexToken
->TokenText
.size();
642 resetLexer(SourceMgr
.getFileOffset(Lex
->getSourceLocation(Offset
)));
// Scans [Begin, End) for the '"' that terminates a C# string literal and
// returns a pointer to it, or \p End if no terminator is found (or a stray
// '}' is met in an interpolated string).
//
// \p Begin must point just past the opening quote. \p Verbatim disables
// backslash escapes and treats "" as an escaped quote; \p Interpolated makes
// braces significant.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True if the character after the current one repeats it.
  const auto IsDoubled = [&Begin, End] {
    return Begin + 1 < End && Begin[1] == Begin[0];
  };

  // Look for a terminating '"' in the current file buffer.
  // Make no effort to format code within an interpolated or verbatim string.
  //
  // Interpolated strings could contain { } with " characters inside.
  // $"{x ?? "null"}"
  // should not be split into $"{x ?? ", null, "}" but should be treated as a
  // single string-literal.
  //
  // We opt not to try and format expressions inside {} within a C#
  // interpolated string. Formatting expressions within an interpolated string
  // would require similar work as that done for JavaScript template strings
  // in `handleTemplateStrings()`.
  int OpenBraces = 0;
  for (; Begin < End; ++Begin) {
    const char Current = *Begin;
    if (Current == '\\') {
      // Outside verbatim strings a backslash escapes the next character.
      if (!Verbatim)
        ++Begin;
    } else if (Current == '{') {
      if (Interpolated) {
        // {{ inside an interpolated string is escaped, so skip it.
        if (IsDoubled())
          ++Begin;
        else
          ++OpenBraces;
      }
    } else if (Current == '}') {
      if (Interpolated) {
        // }} inside an interpolated string is escaped, so skip it.
        if (IsDoubled())
          ++Begin;
        else if (OpenBraces > 0)
          --OpenBraces;
        else
          return End;
      }
    } else if (Current == '"' && OpenBraces == 0) {
      // "" within a verbatim string is an escaped double quote: skip it.
      if (!(Verbatim && IsDoubled()))
        return Begin;
      ++Begin;
    }
  }

  return End;
}
704 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
705 FormatToken
*CSharpStringLiteral
= Tokens
.back();
707 if (CSharpStringLiteral
->isNot(TT_CSharpStringLiteral
))
710 auto &TokenText
= CSharpStringLiteral
->TokenText
;
712 bool Verbatim
= false;
713 bool Interpolated
= false;
714 if (TokenText
.starts_with(R
"($@")") || TokenText.starts_with(R"(@$
")")) {
717 } else if (TokenText
.starts_with(R
"(@")")) {
719 } else if (TokenText.starts_with(R"($
")")) {
723 // Deal with multiline strings.
724 if (!Verbatim
&& !Interpolated
)
727 const char *StrBegin
= Lex
->getBufferLocation() - TokenText
.size();
728 const char *Offset
= StrBegin
;
729 if (Verbatim
&& Interpolated
)
734 const auto End
= Lex
->getBuffer().end();
735 Offset
= lexCSharpString(Offset
, End
, Verbatim
, Interpolated
);
737 // Make no attempt to format code properly if a verbatim string is
742 StringRef
LiteralText(StrBegin
, Offset
- StrBegin
+ 1);
743 TokenText
= LiteralText
;
745 // Adjust width for potentially multiline string literals.
746 size_t FirstBreak
= LiteralText
.find('\n');
747 StringRef FirstLineText
= FirstBreak
== StringRef::npos
749 : LiteralText
.substr(0, FirstBreak
);
750 CSharpStringLiteral
->ColumnWidth
= encoding::columnWidthWithTabs(
751 FirstLineText
, CSharpStringLiteral
->OriginalColumn
, Style
.TabWidth
,
753 size_t LastBreak
= LiteralText
.rfind('\n');
754 if (LastBreak
!= StringRef::npos
) {
755 CSharpStringLiteral
->IsMultiline
= true;
756 unsigned StartColumn
= 0;
757 CSharpStringLiteral
->LastLineColumnWidth
=
758 encoding::columnWidthWithTabs(LiteralText
.substr(LastBreak
+ 1),
759 StartColumn
, Style
.TabWidth
, Encoding
);
762 assert(Offset
< End
);
763 resetLexer(SourceMgr
.getFileOffset(Lex
->getSourceLocation(Offset
+ 1)));
766 void FormatTokenLexer::handleTemplateStrings() {
767 FormatToken
*BacktickToken
= Tokens
.back();
769 if (BacktickToken
->is(tok::l_brace
)) {
770 StateStack
.push(LexerState::NORMAL
);
773 if (BacktickToken
->is(tok::r_brace
)) {
774 if (StateStack
.size() == 1)
777 if (StateStack
.top() != LexerState::TEMPLATE_STRING
)
779 // If back in TEMPLATE_STRING, fallthrough and continue parsing the
780 } else if (BacktickToken
->is(tok::unknown
) &&
781 BacktickToken
->TokenText
== "`") {
782 StateStack
.push(LexerState::TEMPLATE_STRING
);
784 return; // Not actually a template
787 // 'Manually' lex ahead in the current file buffer.
788 const char *Offset
= Lex
->getBufferLocation();
789 const char *TmplBegin
= Offset
- BacktickToken
->TokenText
.size(); // at "`"
790 for (; Offset
!= Lex
->getBuffer().end(); ++Offset
) {
791 if (Offset
[0] == '`') {
796 if (Offset
[0] == '\\') {
797 ++Offset
; // Skip the escaped character.
798 } else if (Offset
+ 1 < Lex
->getBuffer().end() && Offset
[0] == '$' &&
800 // '${' introduces an expression interpolation in the template string.
801 StateStack
.push(LexerState::NORMAL
);
807 StringRef
LiteralText(TmplBegin
, Offset
- TmplBegin
);
808 BacktickToken
->setType(TT_TemplateString
);
809 BacktickToken
->Tok
.setKind(tok::string_literal
);
810 BacktickToken
->TokenText
= LiteralText
;
812 // Adjust width for potentially multiline string literals.
813 size_t FirstBreak
= LiteralText
.find('\n');
814 StringRef FirstLineText
= FirstBreak
== StringRef::npos
816 : LiteralText
.substr(0, FirstBreak
);
817 BacktickToken
->ColumnWidth
= encoding::columnWidthWithTabs(
818 FirstLineText
, BacktickToken
->OriginalColumn
, Style
.TabWidth
, Encoding
);
819 size_t LastBreak
= LiteralText
.rfind('\n');
820 if (LastBreak
!= StringRef::npos
) {
821 BacktickToken
->IsMultiline
= true;
822 unsigned StartColumn
= 0; // The template tail spans the entire line.
823 BacktickToken
->LastLineColumnWidth
=
824 encoding::columnWidthWithTabs(LiteralText
.substr(LastBreak
+ 1),
825 StartColumn
, Style
.TabWidth
, Encoding
);
828 SourceLocation loc
= Lex
->getSourceLocation(Offset
);
829 resetLexer(SourceMgr
.getFileOffset(loc
));
832 void FormatTokenLexer::tryParsePythonComment() {
833 FormatToken
*HashToken
= Tokens
.back();
834 if (!HashToken
->isOneOf(tok::hash
, tok::hashhash
))
836 // Turn the remainder of this line into a comment.
837 const char *CommentBegin
=
838 Lex
->getBufferLocation() - HashToken
->TokenText
.size(); // at "#"
839 size_t From
= CommentBegin
- Lex
->getBuffer().begin();
840 size_t To
= Lex
->getBuffer().find_first_of('\n', From
);
841 if (To
== StringRef::npos
)
842 To
= Lex
->getBuffer().size();
843 size_t Len
= To
- From
;
844 HashToken
->setType(TT_LineComment
);
845 HashToken
->Tok
.setKind(tok::comment
);
846 HashToken
->TokenText
= Lex
->getBuffer().substr(From
, Len
);
847 SourceLocation Loc
= To
< Lex
->getBuffer().size()
848 ? Lex
->getSourceLocation(CommentBegin
+ Len
)
849 : SourceMgr
.getLocForEndOfFile(ID
);
850 resetLexer(SourceMgr
.getFileOffset(Loc
));
853 bool FormatTokenLexer::tryMerge_TMacro() {
854 if (Tokens
.size() < 4)
856 FormatToken
*Last
= Tokens
.back();
857 if (Last
->isNot(tok::r_paren
))
860 FormatToken
*String
= Tokens
[Tokens
.size() - 2];
861 if (String
->isNot(tok::string_literal
) || String
->IsMultiline
)
864 if (Tokens
[Tokens
.size() - 3]->isNot(tok::l_paren
))
867 FormatToken
*Macro
= Tokens
[Tokens
.size() - 4];
868 if (Macro
->TokenText
!= "_T")
871 const char *Start
= Macro
->TokenText
.data();
872 const char *End
= Last
->TokenText
.data() + Last
->TokenText
.size();
873 String
->TokenText
= StringRef(Start
, End
- Start
);
874 String
->IsFirst
= Macro
->IsFirst
;
875 String
->LastNewlineOffset
= Macro
->LastNewlineOffset
;
876 String
->WhitespaceRange
= Macro
->WhitespaceRange
;
877 String
->OriginalColumn
= Macro
->OriginalColumn
;
878 String
->ColumnWidth
= encoding::columnWidthWithTabs(
879 String
->TokenText
, String
->OriginalColumn
, Style
.TabWidth
, Encoding
);
880 String
->NewlinesBefore
= Macro
->NewlinesBefore
;
881 String
->HasUnescapedNewline
= Macro
->HasUnescapedNewline
;
886 Tokens
.back() = String
;
887 if (FirstInLineIndex
>= Tokens
.size())
888 FirstInLineIndex
= Tokens
.size() - 1;
892 bool FormatTokenLexer::tryMergeConflictMarkers() {
893 if (Tokens
.back()->NewlinesBefore
== 0 && Tokens
.back()->isNot(tok::eof
))
896 // Conflict lines look like:
897 // <marker> <text from the vcs>
899 // >>>>>>> /file/in/file/system at revision 1234
901 // We merge all tokens in a line that starts with a conflict marker
902 // into a single token with a special token type that the unwrapped line
903 // parser will use to correctly rebuild the underlying code.
906 // Get the position of the first token in the line.
907 unsigned FirstInLineOffset
;
908 std::tie(ID
, FirstInLineOffset
) = SourceMgr
.getDecomposedLoc(
909 Tokens
[FirstInLineIndex
]->getStartOfNonWhitespace());
910 StringRef Buffer
= SourceMgr
.getBufferOrFake(ID
).getBuffer();
911 // Calculate the offset of the start of the current line.
912 auto LineOffset
= Buffer
.rfind('\n', FirstInLineOffset
);
913 if (LineOffset
== StringRef::npos
)
918 auto FirstSpace
= Buffer
.find_first_of(" \n", LineOffset
);
920 if (FirstSpace
== StringRef::npos
)
921 LineStart
= Buffer
.substr(LineOffset
);
923 LineStart
= Buffer
.substr(LineOffset
, FirstSpace
- LineOffset
);
925 TokenType Type
= TT_Unknown
;
926 if (LineStart
== "<<<<<<<" || LineStart
== ">>>>") {
927 Type
= TT_ConflictStart
;
928 } else if (LineStart
== "|||||||" || LineStart
== "=======" ||
929 LineStart
== "====") {
930 Type
= TT_ConflictAlternative
;
931 } else if (LineStart
== ">>>>>>>" || LineStart
== "<<<<") {
932 Type
= TT_ConflictEnd
;
935 if (Type
!= TT_Unknown
) {
936 FormatToken
*Next
= Tokens
.back();
938 Tokens
.resize(FirstInLineIndex
+ 1);
939 // We do not need to build a complete token here, as we will skip it
940 // during parsing anyway (as we must not touch whitespace around conflict
942 Tokens
.back()->setType(Type
);
943 Tokens
.back()->Tok
.setKind(tok::kw___unknown_anytype
);
945 Tokens
.push_back(Next
);
952 FormatToken
*FormatTokenLexer::getStashedToken() {
953 // Create a synthesized second '>' or '<' token.
954 Token Tok
= FormatTok
->Tok
;
955 StringRef TokenText
= FormatTok
->TokenText
;
957 unsigned OriginalColumn
= FormatTok
->OriginalColumn
;
958 FormatTok
= new (Allocator
.Allocate()) FormatToken
;
959 FormatTok
->Tok
= Tok
;
960 SourceLocation TokLocation
=
961 FormatTok
->Tok
.getLocation().getLocWithOffset(Tok
.getLength() - 1);
962 FormatTok
->Tok
.setLocation(TokLocation
);
963 FormatTok
->WhitespaceRange
= SourceRange(TokLocation
, TokLocation
);
964 FormatTok
->TokenText
= TokenText
;
965 FormatTok
->ColumnWidth
= 1;
966 FormatTok
->OriginalColumn
= OriginalColumn
+ 1;
971 /// Truncate the current token to the new length and make the lexer continue
972 /// from the end of the truncated token. Used for other languages that have
973 /// different token boundaries, like JavaScript in which a comment ends at a
974 /// line break regardless of whether the line break follows a backslash. Also
975 /// used to set the lexer to the end of whitespace if the lexer regards
976 /// whitespace and an unrecognized symbol as one token.
977 void FormatTokenLexer::truncateToken(size_t NewLen
) {
978 assert(NewLen
<= FormatTok
->TokenText
.size());
979 resetLexer(SourceMgr
.getFileOffset(Lex
->getSourceLocation(
980 Lex
->getBufferLocation() - FormatTok
->TokenText
.size() + NewLen
)));
981 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, NewLen
);
982 FormatTok
->ColumnWidth
= encoding::columnWidthWithTabs(
983 FormatTok
->TokenText
, FormatTok
->OriginalColumn
, Style
.TabWidth
,
985 FormatTok
->Tok
.setLength(NewLen
);
988 /// Count the length of leading whitespace in a token.
989 static size_t countLeadingWhitespace(StringRef Text
) {
990 // Basically counting the length matched by this regex.
991 // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
992 // Directly using the regex turned out to be slow. With the regex
993 // version formatting all files in this directory took about 1.25
994 // seconds. This version took about 0.5 seconds.
995 const unsigned char *const Begin
= Text
.bytes_begin();
996 const unsigned char *const End
= Text
.bytes_end();
997 const unsigned char *Cur
= Begin
;
999 if (isspace(Cur
[0])) {
1001 } else if (Cur
[0] == '\\' && (Cur
[1] == '\n' || Cur
[1] == '\r')) {
1002 // A '\' followed by a newline always escapes the newline, regardless
1003 // of whether there is another '\' before it.
1004 // The source has a null byte at the end. So the end of the entire input
1005 // isn't reached yet. Also the lexer doesn't break apart an escaped
1007 assert(End
- Cur
>= 2);
1009 } else if (Cur
[0] == '?' && Cur
[1] == '?' && Cur
[2] == '/' &&
1010 (Cur
[3] == '\n' || Cur
[3] == '\r')) {
1011 // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1012 // characters are quoted individually in this comment because if we write
1013 // them together some compilers warn that we have a trigraph in the code.
1014 assert(End
- Cur
>= 4);
1023 FormatToken
*FormatTokenLexer::getNextToken() {
1024 if (StateStack
.top() == LexerState::TOKEN_STASHED
) {
1026 return getStashedToken();
1029 FormatTok
= new (Allocator
.Allocate()) FormatToken
;
1030 readRawToken(*FormatTok
);
1031 SourceLocation WhitespaceStart
=
1032 FormatTok
->Tok
.getLocation().getLocWithOffset(-TrailingWhitespace
);
1033 FormatTok
->IsFirst
= IsFirstToken
;
1034 IsFirstToken
= false;
1036 // Consume and record whitespace until we find a significant token.
1037 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1038 // followed by a symbol such as backtick. Those symbols may be
1039 // significant in other languages.
1040 unsigned WhitespaceLength
= TrailingWhitespace
;
1041 while (FormatTok
->isNot(tok::eof
)) {
1042 auto LeadingWhitespace
= countLeadingWhitespace(FormatTok
->TokenText
);
1043 if (LeadingWhitespace
== 0)
1045 if (LeadingWhitespace
< FormatTok
->TokenText
.size())
1046 truncateToken(LeadingWhitespace
);
1047 StringRef Text
= FormatTok
->TokenText
;
1048 bool InEscape
= false;
1049 for (int i
= 0, e
= Text
.size(); i
!= e
; ++i
) {
1052 // If this is a CRLF sequence, break here and the LF will be handled on
1053 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1054 // the same as a single LF.
1055 if (i
+ 1 < e
&& Text
[i
+ 1] == '\n')
1059 ++FormatTok
->NewlinesBefore
;
1061 FormatTok
->HasUnescapedNewline
= true;
1064 FormatTok
->LastNewlineOffset
= WhitespaceLength
+ i
+ 1;
1076 Style
.TabWidth
- (Style
.TabWidth
? Column
% Style
.TabWidth
: 0);
1081 // The text was entirely whitespace when this loop was entered. Thus
1082 // this has to be an escape sequence.
1083 assert(Text
.substr(i
, 2) == "\\\r" || Text
.substr(i
, 2) == "\\\n" ||
1084 Text
.substr(i
, 4) == "\?\?/\r" ||
1085 Text
.substr(i
, 4) == "\?\?/\n" ||
1086 (i
>= 1 && (Text
.substr(i
- 1, 4) == "\?\?/\r" ||
1087 Text
.substr(i
- 1, 4) == "\?\?/\n")) ||
1088 (i
>= 2 && (Text
.substr(i
- 2, 4) == "\?\?/\r" ||
1089 Text
.substr(i
- 2, 4) == "\?\?/\n")));
1093 // This shouldn't happen.
1098 WhitespaceLength
+= Text
.size();
1099 readRawToken(*FormatTok
);
1102 if (FormatTok
->is(tok::unknown
))
1103 FormatTok
->setType(TT_ImplicitStringLiteral
);
1105 // JavaScript and Java do not allow to escape the end of the line with a
1106 // backslash. Backslashes are syntax errors in plain source, but can occur in
1107 // comments. When a single line comment ends with a \, it'll cause the next
1108 // line of code to be lexed as a comment, breaking formatting. The code below
1109 // finds comments that contain a backslash followed by a line break, truncates
1110 // the comment token at the backslash, and resets the lexer to restart behind
1112 if ((Style
.isJavaScript() || Style
.Language
== FormatStyle::LK_Java
) &&
1113 FormatTok
->is(tok::comment
) && FormatTok
->TokenText
.starts_with("//")) {
1114 size_t BackslashPos
= FormatTok
->TokenText
.find('\\');
1115 while (BackslashPos
!= StringRef::npos
) {
1116 if (BackslashPos
+ 1 < FormatTok
->TokenText
.size() &&
1117 FormatTok
->TokenText
[BackslashPos
+ 1] == '\n') {
1118 truncateToken(BackslashPos
+ 1);
1121 BackslashPos
= FormatTok
->TokenText
.find('\\', BackslashPos
+ 1);
1125 if (Style
.isVerilog()) {
1126 static const llvm::Regex
NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase
);
1127 SmallVector
<StringRef
, 1> Matches
;
1128 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1129 // And it uses the hash for delays and parameter lists. In order to continue
1130 // using `tok::hash` in other places, the backtick gets marked as the hash
1131 // here. And in order to tell the backtick and hash apart for
1132 // Verilog-specific stuff, the hash becomes an identifier.
1133 if (FormatTok
->is(tok::numeric_constant
)) {
1134 // In Verilog the quote is not part of a number.
1135 auto Quote
= FormatTok
->TokenText
.find('\'');
1136 if (Quote
!= StringRef::npos
)
1137 truncateToken(Quote
);
1138 } else if (FormatTok
->isOneOf(tok::hash
, tok::hashhash
)) {
1139 FormatTok
->Tok
.setKind(tok::raw_identifier
);
1140 } else if (FormatTok
->is(tok::raw_identifier
)) {
1141 if (FormatTok
->TokenText
== "`") {
1142 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1143 FormatTok
->Tok
.setKind(tok::hash
);
1144 } else if (FormatTok
->TokenText
== "``") {
1145 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1146 FormatTok
->Tok
.setKind(tok::hashhash
);
1147 } else if (Tokens
.size() > 0 &&
1148 Tokens
.back()->is(Keywords
.kw_apostrophe
) &&
1149 NumberBase
.match(FormatTok
->TokenText
, &Matches
)) {
1150 // In Verilog in a based number literal like `'b10`, there may be
1151 // whitespace between `'b` and `10`. Therefore we handle the base and
1152 // the rest of the number literal as two tokens. But if there is no
1153 // space in the input code, we need to manually separate the two parts.
1154 truncateToken(Matches
[0].size());
1155 FormatTok
->setFinalizedType(TT_VerilogNumberBase
);
1160 FormatTok
->WhitespaceRange
= SourceRange(
1161 WhitespaceStart
, WhitespaceStart
.getLocWithOffset(WhitespaceLength
));
1163 FormatTok
->OriginalColumn
= Column
;
1165 TrailingWhitespace
= 0;
1166 if (FormatTok
->is(tok::comment
)) {
1167 // FIXME: Add the trimmed whitespace to Column.
1168 StringRef UntrimmedText
= FormatTok
->TokenText
;
1169 FormatTok
->TokenText
= FormatTok
->TokenText
.rtrim(" \t\v\f");
1170 TrailingWhitespace
= UntrimmedText
.size() - FormatTok
->TokenText
.size();
1171 } else if (FormatTok
->is(tok::raw_identifier
)) {
1172 IdentifierInfo
&Info
= IdentTable
.get(FormatTok
->TokenText
);
1173 FormatTok
->Tok
.setIdentifierInfo(&Info
);
1174 FormatTok
->Tok
.setKind(Info
.getTokenID());
1175 if (Style
.Language
== FormatStyle::LK_Java
&&
1176 FormatTok
->isOneOf(tok::kw_struct
, tok::kw_union
, tok::kw_delete
,
1177 tok::kw_operator
)) {
1178 FormatTok
->Tok
.setKind(tok::identifier
);
1179 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1180 } else if (Style
.isJavaScript() &&
1181 FormatTok
->isOneOf(tok::kw_struct
, tok::kw_union
,
1182 tok::kw_operator
)) {
1183 FormatTok
->Tok
.setKind(tok::identifier
);
1184 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1186 } else if (FormatTok
->is(tok::greatergreater
)) {
1187 FormatTok
->Tok
.setKind(tok::greater
);
1188 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, 1);
1190 StateStack
.push(LexerState::TOKEN_STASHED
);
1191 } else if (FormatTok
->is(tok::lessless
)) {
1192 FormatTok
->Tok
.setKind(tok::less
);
1193 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, 1);
1195 StateStack
.push(LexerState::TOKEN_STASHED
);
1198 if (Style
.isVerilog() && Tokens
.size() > 0 &&
1199 Tokens
.back()->is(TT_VerilogNumberBase
) &&
1200 FormatTok
->Tok
.isOneOf(tok::identifier
, tok::question
)) {
1201 // Mark the number following a base like `'h?a0` as a number.
1202 FormatTok
->Tok
.setKind(tok::numeric_constant
);
1205 // Now FormatTok is the next non-whitespace token.
1207 StringRef Text
= FormatTok
->TokenText
;
1208 size_t FirstNewlinePos
= Text
.find('\n');
1209 if (FirstNewlinePos
== StringRef::npos
) {
1210 // FIXME: ColumnWidth actually depends on the start column, we need to
1211 // take this into account when the token is moved.
1212 FormatTok
->ColumnWidth
=
1213 encoding::columnWidthWithTabs(Text
, Column
, Style
.TabWidth
, Encoding
);
1214 Column
+= FormatTok
->ColumnWidth
;
1216 FormatTok
->IsMultiline
= true;
1217 // FIXME: ColumnWidth actually depends on the start column, we need to
1218 // take this into account when the token is moved.
1219 FormatTok
->ColumnWidth
= encoding::columnWidthWithTabs(
1220 Text
.substr(0, FirstNewlinePos
), Column
, Style
.TabWidth
, Encoding
);
1222 // The last line of the token always starts in column 0.
1223 // Thus, the length can be precomputed even in the presence of tabs.
1224 FormatTok
->LastLineColumnWidth
= encoding::columnWidthWithTabs(
1225 Text
.substr(Text
.find_last_of('\n') + 1), 0, Style
.TabWidth
, Encoding
);
1226 Column
= FormatTok
->LastLineColumnWidth
;
1229 if (Style
.isCpp()) {
1230 auto *Identifier
= FormatTok
->Tok
.getIdentifierInfo();
1231 auto it
= Macros
.find(Identifier
);
1232 if (!(Tokens
.size() > 0 && Tokens
.back()->Tok
.getIdentifierInfo() &&
1233 Tokens
.back()->Tok
.getIdentifierInfo()->getPPKeywordID() ==
1235 it
!= Macros
.end()) {
1236 FormatTok
->setType(it
->second
);
1237 if (it
->second
== TT_IfMacro
) {
1238 // The lexer token currently has type tok::kw_unknown. However, for this
1239 // substitution to be treated correctly in the TokenAnnotator, faking
1240 // the tok value seems to be needed. Not sure if there's a more elegant
1242 FormatTok
->Tok
.setKind(tok::kw_if
);
1244 } else if (FormatTok
->is(tok::identifier
)) {
1245 if (MacroBlockBeginRegex
.match(Text
))
1246 FormatTok
->setType(TT_MacroBlockBegin
);
1247 else if (MacroBlockEndRegex
.match(Text
))
1248 FormatTok
->setType(TT_MacroBlockEnd
);
1249 else if (TypeNames
.contains(Identifier
))
1250 FormatTok
->setFinalizedType(TT_TypeName
);
1257 bool FormatTokenLexer::readRawTokenVerilogSpecific(Token
&Tok
) {
1258 // In Verilog the quote is not a character literal.
1260 // Make the backtick and double backtick identifiers to match against them
1263 // In Verilog an escaped identifier starts with backslash and ends with
1264 // whitespace. Unless that whitespace is an escaped newline. A backslash can
1265 // also begin an escaped newline outside of an escaped identifier. We check
1266 // for that outside of the Regex since we can't use negative lookhead
1267 // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1268 // identifier may have a length of 0 according to Section A.9.3.
1269 // FIXME: If there is an escaped newline in the middle of an escaped
1270 // identifier, allow for pasting the two lines together, But escaped
1271 // identifiers usually occur only in generated code anyway.
1272 static const llvm::Regex
VerilogToken(R
"re(^('|``?|\\(\\)re"
1273 "(\r?\n|\r)|[^[:space:]])*)");
1275 SmallVector
<StringRef
, 4> Matches
;
1276 const char *Start
= Lex
->getBufferLocation();
1277 if (!VerilogToken
.match(StringRef(Start
, Lex
->getBuffer().end() - Start
),
1281 // There is a null byte at the end of the buffer, so we don't have to check
1282 // Start[1] is within the buffer.
1283 if (Start
[0] == '\\' && (Start
[1] == '\r' || Start
[1] == '\n'))
1285 size_t Len
= Matches
[0].size();
1287 // The kind has to be an identifier so we can match it against those defined
1288 // in Keywords. The kind has to be set before the length because the setLength
1289 // function checks that the kind is not an annotation.
1290 Tok
.setKind(tok::raw_identifier
);
1292 Tok
.setLocation(Lex
->getSourceLocation(Start
, Len
));
1293 Tok
.setRawIdentifierData(Start
);
1294 Lex
->seek(Lex
->getCurrentBufferOffset() + Len
, /*IsAtStartofline=*/false);
1298 void FormatTokenLexer::readRawToken(FormatToken
&Tok
) {
1299 // For Verilog, first see if there is a special token, and fall back to the
1300 // normal lexer if there isn't one.
1301 if (!Style
.isVerilog() || !readRawTokenVerilogSpecific(Tok
.Tok
))
1302 Lex
->LexFromRawLexer(Tok
.Tok
);
1303 Tok
.TokenText
= StringRef(SourceMgr
.getCharacterData(Tok
.Tok
.getLocation()),
1304 Tok
.Tok
.getLength());
1305 // For formatting, treat unterminated string literals like normal string
1307 if (Tok
.is(tok::unknown
)) {
1308 if (!Tok
.TokenText
.empty() && Tok
.TokenText
[0] == '"') {
1309 Tok
.Tok
.setKind(tok::string_literal
);
1310 Tok
.IsUnterminatedLiteral
= true;
1311 } else if (Style
.isJavaScript() && Tok
.TokenText
== "''") {
1312 Tok
.Tok
.setKind(tok::string_literal
);
1316 if ((Style
.isJavaScript() || Style
.isProto()) && Tok
.is(tok::char_constant
))
1317 Tok
.Tok
.setKind(tok::string_literal
);
1319 if (Tok
.is(tok::comment
) && isClangFormatOn(Tok
.TokenText
))
1320 FormattingDisabled
= false;
1322 Tok
.Finalized
= FormattingDisabled
;
1324 if (Tok
.is(tok::comment
) && isClangFormatOff(Tok
.TokenText
))
1325 FormattingDisabled
= true;
1328 void FormatTokenLexer::resetLexer(unsigned Offset
) {
1329 StringRef Buffer
= SourceMgr
.getBufferData(ID
);
1330 LangOpts
= getFormattingLangOpts(Style
);
1331 Lex
.reset(new Lexer(SourceMgr
.getLocForStartOfFile(ID
), LangOpts
,
1332 Buffer
.begin(), Buffer
.begin() + Offset
, Buffer
.end()));
1333 Lex
->SetKeepWhitespaceMode(true);
1334 TrailingWhitespace
= 0;
1337 } // namespace format
1338 } // namespace clang