1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
13 //===----------------------------------------------------------------------===//
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
25 FormatTokenLexer::FormatTokenLexer(
26 const SourceManager
&SourceMgr
, FileID ID
, unsigned Column
,
27 const FormatStyle
&Style
, encoding::Encoding Encoding
,
28 llvm::SpecificBumpPtrAllocator
<FormatToken
> &Allocator
,
29 IdentifierTable
&IdentTable
)
30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL
}),
31 Column(Column
), TrailingWhitespace(0),
32 LangOpts(getFormattingLangOpts(Style
)), SourceMgr(SourceMgr
), ID(ID
),
33 Style(Style
), IdentTable(IdentTable
), Keywords(IdentTable
),
34 Encoding(Encoding
), Allocator(Allocator
), FirstInLineIndex(0),
35 FormattingDisabled(false), MacroBlockBeginRegex(Style
.MacroBlockBegin
),
36 MacroBlockEndRegex(Style
.MacroBlockEnd
) {
37 Lex
.reset(new Lexer(ID
, SourceMgr
.getBufferOrFake(ID
), SourceMgr
, LangOpts
));
38 Lex
->SetKeepWhitespaceMode(true);
40 for (const std::string
&ForEachMacro
: Style
.ForEachMacros
) {
41 auto Identifier
= &IdentTable
.get(ForEachMacro
);
42 Macros
.insert({Identifier
, TT_ForEachMacro
});
44 for (const std::string
&IfMacro
: Style
.IfMacros
) {
45 auto Identifier
= &IdentTable
.get(IfMacro
);
46 Macros
.insert({Identifier
, TT_IfMacro
});
48 for (const std::string
&AttributeMacro
: Style
.AttributeMacros
) {
49 auto Identifier
= &IdentTable
.get(AttributeMacro
);
50 Macros
.insert({Identifier
, TT_AttributeMacro
});
52 for (const std::string
&StatementMacro
: Style
.StatementMacros
) {
53 auto Identifier
= &IdentTable
.get(StatementMacro
);
54 Macros
.insert({Identifier
, TT_StatementMacro
});
56 for (const std::string
&TypenameMacro
: Style
.TypenameMacros
) {
57 auto Identifier
= &IdentTable
.get(TypenameMacro
);
58 Macros
.insert({Identifier
, TT_TypenameMacro
});
60 for (const std::string
&NamespaceMacro
: Style
.NamespaceMacros
) {
61 auto Identifier
= &IdentTable
.get(NamespaceMacro
);
62 Macros
.insert({Identifier
, TT_NamespaceMacro
});
64 for (const std::string
&WhitespaceSensitiveMacro
:
65 Style
.WhitespaceSensitiveMacros
) {
66 auto Identifier
= &IdentTable
.get(WhitespaceSensitiveMacro
);
67 Macros
.insert({Identifier
, TT_UntouchableMacroFunc
});
69 for (const std::string
&StatementAttributeLikeMacro
:
70 Style
.StatementAttributeLikeMacros
) {
71 auto Identifier
= &IdentTable
.get(StatementAttributeLikeMacro
);
72 Macros
.insert({Identifier
, TT_StatementAttributeLikeMacro
});
75 for (const auto &TypeName
: Style
.TypeNames
)
76 TypeNames
.insert(&IdentTable
.get(TypeName
));
79 ArrayRef
<FormatToken
*> FormatTokenLexer::lex() {
80 assert(Tokens
.empty());
81 assert(FirstInLineIndex
== 0);
83 Tokens
.push_back(getNextToken());
84 if (Style
.isJavaScript()) {
85 tryParseJSRegexLiteral();
86 handleTemplateStrings();
88 if (Style
.Language
== FormatStyle::LK_TextProto
)
89 tryParsePythonComment();
90 tryMergePreviousTokens();
91 if (Style
.isCSharp()) {
92 // This needs to come after tokens have been merged so that C#
93 // string literals are correctly identified.
94 handleCSharpVerbatimAndInterpolatedStrings();
96 if (Tokens
.back()->NewlinesBefore
> 0 || Tokens
.back()->IsMultiline
)
97 FirstInLineIndex
= Tokens
.size() - 1;
98 } while (Tokens
.back()->isNot(tok::eof
));
102 void FormatTokenLexer::tryMergePreviousTokens() {
103 if (tryMerge_TMacro())
105 if (tryMergeConflictMarkers())
107 if (tryMergeLessLess())
109 if (tryMergeGreaterGreater())
111 if (tryMergeForEach())
113 if (Style
.isCpp() && tryTransformTryUsageForC())
116 if (Style
.isJavaScript() || Style
.isCSharp()) {
117 static const tok::TokenKind NullishCoalescingOperator
[] = {tok::question
,
119 static const tok::TokenKind NullPropagatingOperator
[] = {tok::question
,
121 static const tok::TokenKind FatArrow
[] = {tok::equal
, tok::greater
};
123 if (tryMergeTokens(FatArrow
, TT_FatArrow
))
125 if (tryMergeTokens(NullishCoalescingOperator
, TT_NullCoalescingOperator
)) {
126 // Treat like the "||" operator (as opposed to the ternary ?).
127 Tokens
.back()->Tok
.setKind(tok::pipepipe
);
130 if (tryMergeTokens(NullPropagatingOperator
, TT_NullPropagatingOperator
)) {
131 // Treat like a regular "." access.
132 Tokens
.back()->Tok
.setKind(tok::period
);
135 if (tryMergeNullishCoalescingEqual())
139 if (Style
.isCSharp()) {
140 static const tok::TokenKind CSharpNullConditionalLSquare
[] = {
141 tok::question
, tok::l_square
};
143 if (tryMergeCSharpKeywordVariables())
145 if (tryMergeCSharpStringLiteral())
147 if (tryTransformCSharpForEach())
149 if (tryMergeTokens(CSharpNullConditionalLSquare
,
150 TT_CSharpNullConditionalLSquare
)) {
151 // Treat like a regular "[" operator.
152 Tokens
.back()->Tok
.setKind(tok::l_square
);
157 if (tryMergeNSStringLiteral())
160 if (Style
.isJavaScript()) {
161 static const tok::TokenKind JSIdentity
[] = {tok::equalequal
, tok::equal
};
162 static const tok::TokenKind JSNotIdentity
[] = {tok::exclaimequal
,
164 static const tok::TokenKind JSShiftEqual
[] = {tok::greater
, tok::greater
,
166 static const tok::TokenKind JSExponentiation
[] = {tok::star
, tok::star
};
167 static const tok::TokenKind JSExponentiationEqual
[] = {tok::star
,
169 static const tok::TokenKind JSPipePipeEqual
[] = {tok::pipepipe
, tok::equal
};
170 static const tok::TokenKind JSAndAndEqual
[] = {tok::ampamp
, tok::equal
};
172 // FIXME: Investigate what token type gives the correct operator priority.
173 if (tryMergeTokens(JSIdentity
, TT_BinaryOperator
))
175 if (tryMergeTokens(JSNotIdentity
, TT_BinaryOperator
))
177 if (tryMergeTokens(JSShiftEqual
, TT_BinaryOperator
))
179 if (tryMergeTokens(JSExponentiation
, TT_JsExponentiation
))
181 if (tryMergeTokens(JSExponentiationEqual
, TT_JsExponentiationEqual
)) {
182 Tokens
.back()->Tok
.setKind(tok::starequal
);
185 if (tryMergeTokens(JSAndAndEqual
, TT_JsAndAndEqual
) ||
186 tryMergeTokens(JSPipePipeEqual
, TT_JsPipePipeEqual
)) {
187 // Treat like the "=" assignment operator.
188 Tokens
.back()->Tok
.setKind(tok::equal
);
191 if (tryMergeJSPrivateIdentifier())
195 if (Style
.Language
== FormatStyle::LK_Java
) {
196 static const tok::TokenKind JavaRightLogicalShiftAssign
[] = {
197 tok::greater
, tok::greater
, tok::greaterequal
};
198 if (tryMergeTokens(JavaRightLogicalShiftAssign
, TT_BinaryOperator
))
202 if (Style
.isVerilog()) {
203 // Merge the number following a base like `'h?a0`.
204 if (Tokens
.size() >= 3 && Tokens
.end()[-3]->is(TT_VerilogNumberBase
) &&
205 Tokens
.end()[-2]->is(tok::numeric_constant
) &&
206 Tokens
.back()->isOneOf(tok::numeric_constant
, tok::identifier
,
208 tryMergeTokens(2, TT_Unknown
)) {
212 if (tryMergeTokensAny({{tok::minus
, tok::colon
}, {tok::plus
, tok::colon
}},
216 // Xnor. The combined token is treated as a caret which can also be either a
217 // unary or binary operator. The actual type is determined in
218 // TokenAnnotator. We also check the token length so we know it is not
219 // already a merged token.
220 if (Tokens
.back()->TokenText
.size() == 1 &&
221 tryMergeTokensAny({{tok::caret
, tok::tilde
}, {tok::tilde
, tok::caret
}},
222 TT_BinaryOperator
)) {
223 Tokens
.back()->Tok
.setKind(tok::caret
);
226 // Signed shift and distribution weight.
227 if (tryMergeTokens({tok::less
, tok::less
}, TT_BinaryOperator
)) {
228 Tokens
.back()->Tok
.setKind(tok::lessless
);
231 if (tryMergeTokens({tok::greater
, tok::greater
}, TT_BinaryOperator
)) {
232 Tokens
.back()->Tok
.setKind(tok::greatergreater
);
235 if (tryMergeTokensAny({{tok::lessless
, tok::equal
},
236 {tok::lessless
, tok::lessequal
},
237 {tok::greatergreater
, tok::equal
},
238 {tok::greatergreater
, tok::greaterequal
},
239 {tok::colon
, tok::equal
},
240 {tok::colon
, tok::slash
}},
241 TT_BinaryOperator
)) {
242 Tokens
.back()->ForcedPrecedence
= prec::Assignment
;
245 // Exponentiation, signed shift, case equality, and wildcard equality.
246 if (tryMergeTokensAny({{tok::star
, tok::star
},
247 {tok::lessless
, tok::less
},
248 {tok::greatergreater
, tok::greater
},
249 {tok::exclaimequal
, tok::equal
},
250 {tok::exclaimequal
, tok::question
},
251 {tok::equalequal
, tok::equal
},
252 {tok::equalequal
, tok::question
}},
253 TT_BinaryOperator
)) {
256 // Module paths in specify blocks and implications in properties.
257 if (tryMergeTokensAny({{tok::plusequal
, tok::greater
},
258 {tok::plus
, tok::star
, tok::greater
},
259 {tok::minusequal
, tok::greater
},
260 {tok::minus
, tok::star
, tok::greater
},
261 {tok::less
, tok::arrow
},
262 {tok::equal
, tok::greater
},
263 {tok::star
, tok::greater
},
264 {tok::pipeequal
, tok::greater
},
265 {tok::pipe
, tok::arrow
},
266 {tok::hash
, tok::minus
, tok::hash
},
267 {tok::hash
, tok::equal
, tok::hash
}},
268 TT_BinaryOperator
)) {
269 Tokens
.back()->ForcedPrecedence
= prec::Comma
;
275 bool FormatTokenLexer::tryMergeNSStringLiteral() {
276 if (Tokens
.size() < 2)
278 auto &At
= *(Tokens
.end() - 2);
279 auto &String
= *(Tokens
.end() - 1);
280 if (At
->isNot(tok::at
) || String
->isNot(tok::string_literal
))
282 At
->Tok
.setKind(tok::string_literal
);
283 At
->TokenText
= StringRef(At
->TokenText
.begin(),
284 String
->TokenText
.end() - At
->TokenText
.begin());
285 At
->ColumnWidth
+= String
->ColumnWidth
;
286 At
->setType(TT_ObjCStringLiteral
);
287 Tokens
.erase(Tokens
.end() - 1);
291 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
292 // Merges #idenfier into a single identifier with the text #identifier
293 // but the token tok::identifier.
294 if (Tokens
.size() < 2)
296 auto &Hash
= *(Tokens
.end() - 2);
297 auto &Identifier
= *(Tokens
.end() - 1);
298 if (Hash
->isNot(tok::hash
) || Identifier
->isNot(tok::identifier
))
300 Hash
->Tok
.setKind(tok::identifier
);
302 StringRef(Hash
->TokenText
.begin(),
303 Identifier
->TokenText
.end() - Hash
->TokenText
.begin());
304 Hash
->ColumnWidth
+= Identifier
->ColumnWidth
;
305 Hash
->setType(TT_JsPrivateIdentifier
);
306 Tokens
.erase(Tokens
.end() - 1);
310 // Search for verbatim or interpolated string literals @"ABC" or
311 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
312 // prevent splitting of @, $ and ".
313 // Merging of multiline verbatim strings with embedded '"' is handled in
314 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
315 bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
316 if (Tokens
.size() < 2)
319 // Look for @"aaaaaa" or $"aaaaaa".
320 const auto String
= *(Tokens
.end() - 1);
321 if (String
->isNot(tok::string_literal
))
324 auto Prefix
= *(Tokens
.end() - 2);
325 if (Prefix
->isNot(tok::at
) && Prefix
->TokenText
!= "$")
328 if (Tokens
.size() > 2) {
329 const auto Tok
= *(Tokens
.end() - 3);
330 if ((Tok
->TokenText
== "$" && Prefix
->is(tok::at
)) ||
331 (Tok
->is(tok::at
) && Prefix
->TokenText
== "$")) {
332 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
333 Tok
->ColumnWidth
+= Prefix
->ColumnWidth
;
334 Tokens
.erase(Tokens
.end() - 2);
339 // Convert back into just a string_literal.
340 Prefix
->Tok
.setKind(tok::string_literal
);
342 StringRef(Prefix
->TokenText
.begin(),
343 String
->TokenText
.end() - Prefix
->TokenText
.begin());
344 Prefix
->ColumnWidth
+= String
->ColumnWidth
;
345 Prefix
->setType(TT_CSharpStringLiteral
);
346 Tokens
.erase(Tokens
.end() - 1);
350 // Valid C# attribute targets:
351 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
352 const llvm::StringSet
<> FormatTokenLexer::CSharpAttributeTargets
= {
353 "assembly", "module", "field", "event", "method",
354 "param", "property", "return", "type",
357 bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
358 if (Tokens
.size() < 2)
360 auto &NullishCoalescing
= *(Tokens
.end() - 2);
361 auto &Equal
= *(Tokens
.end() - 1);
362 if (NullishCoalescing
->getType() != TT_NullCoalescingOperator
||
363 Equal
->isNot(tok::equal
)) {
366 NullishCoalescing
->Tok
.setKind(tok::equal
); // no '??=' in clang tokens.
367 NullishCoalescing
->TokenText
=
368 StringRef(NullishCoalescing
->TokenText
.begin(),
369 Equal
->TokenText
.end() - NullishCoalescing
->TokenText
.begin());
370 NullishCoalescing
->ColumnWidth
+= Equal
->ColumnWidth
;
371 NullishCoalescing
->setType(TT_NullCoalescingEqual
);
372 Tokens
.erase(Tokens
.end() - 1);
376 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
377 if (Tokens
.size() < 2)
379 const auto At
= *(Tokens
.end() - 2);
380 if (At
->isNot(tok::at
))
382 const auto Keyword
= *(Tokens
.end() - 1);
383 if (Keyword
->TokenText
== "$")
385 if (!Keywords
.isCSharpKeyword(*Keyword
))
388 At
->Tok
.setKind(tok::identifier
);
389 At
->TokenText
= StringRef(At
->TokenText
.begin(),
390 Keyword
->TokenText
.end() - At
->TokenText
.begin());
391 At
->ColumnWidth
+= Keyword
->ColumnWidth
;
392 At
->setType(Keyword
->getType());
393 Tokens
.erase(Tokens
.end() - 1);
397 // In C# transform identifier foreach into kw_foreach
398 bool FormatTokenLexer::tryTransformCSharpForEach() {
399 if (Tokens
.size() < 1)
401 auto &Identifier
= *(Tokens
.end() - 1);
402 if (Identifier
->isNot(tok::identifier
))
404 if (Identifier
->TokenText
!= "foreach")
407 Identifier
->setType(TT_ForEachMacro
);
408 Identifier
->Tok
.setKind(tok::kw_for
);
412 bool FormatTokenLexer::tryMergeForEach() {
413 if (Tokens
.size() < 2)
415 auto &For
= *(Tokens
.end() - 2);
416 auto &Each
= *(Tokens
.end() - 1);
417 if (For
->isNot(tok::kw_for
))
419 if (Each
->isNot(tok::identifier
))
421 if (Each
->TokenText
!= "each")
424 For
->setType(TT_ForEachMacro
);
425 For
->Tok
.setKind(tok::kw_for
);
427 For
->TokenText
= StringRef(For
->TokenText
.begin(),
428 Each
->TokenText
.end() - For
->TokenText
.begin());
429 For
->ColumnWidth
+= Each
->ColumnWidth
;
430 Tokens
.erase(Tokens
.end() - 1);
434 bool FormatTokenLexer::tryTransformTryUsageForC() {
435 if (Tokens
.size() < 2)
437 auto &Try
= *(Tokens
.end() - 2);
438 if (Try
->isNot(tok::kw_try
))
440 auto &Next
= *(Tokens
.end() - 1);
441 if (Next
->isOneOf(tok::l_brace
, tok::colon
, tok::hash
, tok::comment
))
444 if (Tokens
.size() > 2) {
445 auto &At
= *(Tokens
.end() - 3);
450 Try
->Tok
.setKind(tok::identifier
);
454 bool FormatTokenLexer::tryMergeLessLess() {
455 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
456 if (Tokens
.size() < 3)
459 auto First
= Tokens
.end() - 3;
460 if (First
[0]->isNot(tok::less
) || First
[1]->isNot(tok::less
))
463 // Only merge if there currently is no whitespace between the two "<".
464 if (First
[1]->hasWhitespaceBefore())
467 auto X
= Tokens
.size() > 3 ? First
[-1] : nullptr;
468 if (X
&& X
->is(tok::less
))
472 if ((!X
|| X
->isNot(tok::kw_operator
)) && Y
->is(tok::less
))
475 First
[0]->Tok
.setKind(tok::lessless
);
476 First
[0]->TokenText
= "<<";
477 First
[0]->ColumnWidth
+= 1;
478 Tokens
.erase(Tokens
.end() - 2);
482 bool FormatTokenLexer::tryMergeGreaterGreater() {
483 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
484 if (Tokens
.size() < 2)
487 auto First
= Tokens
.end() - 2;
488 if (First
[0]->isNot(tok::greater
) || First
[1]->isNot(tok::greater
))
491 // Only merge if there currently is no whitespace between the first two ">".
492 if (First
[1]->hasWhitespaceBefore())
495 auto Tok
= Tokens
.size() > 2 ? First
[-1] : nullptr;
496 if (Tok
&& Tok
->isNot(tok::kw_operator
))
499 First
[0]->Tok
.setKind(tok::greatergreater
);
500 First
[0]->TokenText
= ">>";
501 First
[0]->ColumnWidth
+= 1;
502 Tokens
.erase(Tokens
.end() - 1);
506 bool FormatTokenLexer::tryMergeTokens(ArrayRef
<tok::TokenKind
> Kinds
,
508 if (Tokens
.size() < Kinds
.size())
511 SmallVectorImpl
<FormatToken
*>::const_iterator First
=
512 Tokens
.end() - Kinds
.size();
513 for (unsigned i
= 0; i
< Kinds
.size(); ++i
)
514 if (First
[i
]->isNot(Kinds
[i
]))
517 return tryMergeTokens(Kinds
.size(), NewType
);
520 bool FormatTokenLexer::tryMergeTokens(size_t Count
, TokenType NewType
) {
521 if (Tokens
.size() < Count
)
524 SmallVectorImpl
<FormatToken
*>::const_iterator First
= Tokens
.end() - Count
;
525 unsigned AddLength
= 0;
526 for (size_t i
= 1; i
< Count
; ++i
) {
527 // If there is whitespace separating the token and the previous one,
528 // they should not be merged.
529 if (First
[i
]->hasWhitespaceBefore())
531 AddLength
+= First
[i
]->TokenText
.size();
534 Tokens
.resize(Tokens
.size() - Count
+ 1);
535 First
[0]->TokenText
= StringRef(First
[0]->TokenText
.data(),
536 First
[0]->TokenText
.size() + AddLength
);
537 First
[0]->ColumnWidth
+= AddLength
;
538 First
[0]->setType(NewType
);
542 bool FormatTokenLexer::tryMergeTokensAny(
543 ArrayRef
<ArrayRef
<tok::TokenKind
>> Kinds
, TokenType NewType
) {
544 return llvm::any_of(Kinds
, [this, NewType
](ArrayRef
<tok::TokenKind
> Kinds
) {
545 return tryMergeTokens(Kinds
, NewType
);
549 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
550 bool FormatTokenLexer::precedesOperand(FormatToken
*Tok
) {
551 // NB: This is not entirely correct, as an r_paren can introduce an operand
552 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
553 // corner case to not matter in practice, though.
554 return Tok
->isOneOf(tok::period
, tok::l_paren
, tok::comma
, tok::l_brace
,
555 tok::r_brace
, tok::l_square
, tok::semi
, tok::exclaim
,
556 tok::colon
, tok::question
, tok::tilde
) ||
557 Tok
->isOneOf(tok::kw_return
, tok::kw_do
, tok::kw_case
, tok::kw_throw
,
558 tok::kw_else
, tok::kw_new
, tok::kw_delete
, tok::kw_void
,
559 tok::kw_typeof
, Keywords
.kw_instanceof
, Keywords
.kw_in
) ||
560 Tok
->isBinaryOperator();
563 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken
*Prev
) {
567 // Regex literals can only follow after prefix unary operators, not after
568 // postfix unary operators. If the '++' is followed by a non-operand
569 // introducing token, the slash here is the operand and not the start of a
571 // `!` is an unary prefix operator, but also a post-fix operator that casts
572 // away nullability, so the same check applies.
573 if (Prev
->isOneOf(tok::plusplus
, tok::minusminus
, tok::exclaim
))
574 return Tokens
.size() < 3 || precedesOperand(Tokens
[Tokens
.size() - 3]);
576 // The previous token must introduce an operand location where regex
577 // literals can occur.
578 if (!precedesOperand(Prev
))
584 // Tries to parse a JavaScript Regex literal starting at the current token,
585 // if that begins with a slash and is in a location where JavaScript allows
586 // regex literals. Changes the current token to a regex literal and updates
587 // its text if successful.
588 void FormatTokenLexer::tryParseJSRegexLiteral() {
589 FormatToken
*RegexToken
= Tokens
.back();
590 if (!RegexToken
->isOneOf(tok::slash
, tok::slashequal
))
593 FormatToken
*Prev
= nullptr;
594 for (FormatToken
*FT
: llvm::drop_begin(llvm::reverse(Tokens
))) {
595 // NB: Because previous pointers are not initialized yet, this cannot use
596 // Token.getPreviousNonComment.
597 if (FT
->isNot(tok::comment
)) {
603 if (!canPrecedeRegexLiteral(Prev
))
606 // 'Manually' lex ahead in the current file buffer.
607 const char *Offset
= Lex
->getBufferLocation();
608 const char *RegexBegin
= Offset
- RegexToken
->TokenText
.size();
609 StringRef Buffer
= Lex
->getBuffer();
610 bool InCharacterClass
= false;
611 bool HaveClosingSlash
= false;
612 for (; !HaveClosingSlash
&& Offset
!= Buffer
.end(); ++Offset
) {
613 // Regular expressions are terminated with a '/', which can only be
614 // escaped using '\' or a character class between '[' and ']'.
615 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
618 // Skip the escaped character.
622 InCharacterClass
= true;
625 InCharacterClass
= false;
628 if (!InCharacterClass
)
629 HaveClosingSlash
= true;
634 RegexToken
->setType(TT_RegexLiteral
);
635 // Treat regex literals like other string_literals.
636 RegexToken
->Tok
.setKind(tok::string_literal
);
637 RegexToken
->TokenText
= StringRef(RegexBegin
, Offset
- RegexBegin
);
638 RegexToken
->ColumnWidth
= RegexToken
->TokenText
.size();
640 resetLexer(SourceMgr
.getFileOffset(Lex
->getSourceLocation(Offset
)));
// Scans [Begin, End) for the '"' that terminates a C# string literal whose
// contents start at \p Begin. Returns a pointer to that quote, or \p End if
// the string is unterminated or an interpolated string contains an unmatched
// '}'. \p Verbatim and \p Interpolated select the escaping rules.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True if the character after *Begin exists and equals *Begin.
  auto Repeated = [&Begin, End]() {
    return Begin + 1 < End && Begin[1] == Begin[0];
  };

  // Look for a terminating '"' in the current file buffer.
  // Make no effort to format code within an interpolated or verbatim string.
  //
  // Interpolated strings could contain { } with " characters inside.
  // $"{x ?? "null"}"
  // should not be split into $"{x ?? ", null, "}" but should be treated as a
  // single string-literal.
  //
  // We opt not to try and format expressions inside {} within a C#
  // interpolated string. Formatting expressions within an interpolated string
  // would require similar work as that done for JavaScript template strings
  // in `handleTemplateStrings()`.
  for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) {
    switch (*Begin) {
    case '\\':
      // Backslash escapes only apply outside verbatim strings.
      if (!Verbatim)
        ++Begin;
      break;
    case '{':
      if (Interpolated) {
        // {{ inside an interpolated string is escaped, so skip it.
        if (Repeated())
          ++Begin;
        else
          ++UnmatchedOpeningBraceCount;
      }
      break;
    case '}':
      if (Interpolated) {
        // }} inside an interpolated string is escaped, so skip it.
        if (Repeated())
          ++Begin;
        else if (UnmatchedOpeningBraceCount > 0)
          --UnmatchedOpeningBraceCount;
        else
          return End;
      }
      break;
    case '"':
      // Quotes inside an open {...} expression do not end the literal.
      if (UnmatchedOpeningBraceCount > 0)
        break;
      // "" within a verbatim string is an escaped double quote: skip it.
      if (Verbatim && Repeated()) {
        ++Begin;
        break;
      }
      return Begin;
    }
  }

  return End;
}
702 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
703 FormatToken
*CSharpStringLiteral
= Tokens
.back();
705 if (CSharpStringLiteral
->isNot(TT_CSharpStringLiteral
))
708 auto &TokenText
= CSharpStringLiteral
->TokenText
;
710 bool Verbatim
= false;
711 bool Interpolated
= false;
712 if (TokenText
.startswith(R
"($@")") || TokenText.startswith(R"(@$
")")) {
715 } else if (TokenText
.startswith(R
"(@")")) {
717 } else if (TokenText.startswith(R"($
")")) {
721 // Deal with multiline strings.
722 if (!Verbatim
&& !Interpolated
)
725 const char *StrBegin
= Lex
->getBufferLocation() - TokenText
.size();
726 const char *Offset
= StrBegin
;
727 if (Verbatim
&& Interpolated
)
732 const auto End
= Lex
->getBuffer().end();
733 Offset
= lexCSharpString(Offset
, End
, Verbatim
, Interpolated
);
735 // Make no attempt to format code properly if a verbatim string is
740 StringRef
LiteralText(StrBegin
, Offset
- StrBegin
+ 1);
741 TokenText
= LiteralText
;
743 // Adjust width for potentially multiline string literals.
744 size_t FirstBreak
= LiteralText
.find('\n');
745 StringRef FirstLineText
= FirstBreak
== StringRef::npos
747 : LiteralText
.substr(0, FirstBreak
);
748 CSharpStringLiteral
->ColumnWidth
= encoding::columnWidthWithTabs(
749 FirstLineText
, CSharpStringLiteral
->OriginalColumn
, Style
.TabWidth
,
751 size_t LastBreak
= LiteralText
.rfind('\n');
752 if (LastBreak
!= StringRef::npos
) {
753 CSharpStringLiteral
->IsMultiline
= true;
754 unsigned StartColumn
= 0;
755 CSharpStringLiteral
->LastLineColumnWidth
=
756 encoding::columnWidthWithTabs(LiteralText
.substr(LastBreak
+ 1),
757 StartColumn
, Style
.TabWidth
, Encoding
);
760 assert(Offset
< End
);
761 resetLexer(SourceMgr
.getFileOffset(Lex
->getSourceLocation(Offset
+ 1)));
764 void FormatTokenLexer::handleTemplateStrings() {
765 FormatToken
*BacktickToken
= Tokens
.back();
767 if (BacktickToken
->is(tok::l_brace
)) {
768 StateStack
.push(LexerState::NORMAL
);
771 if (BacktickToken
->is(tok::r_brace
)) {
772 if (StateStack
.size() == 1)
775 if (StateStack
.top() != LexerState::TEMPLATE_STRING
)
777 // If back in TEMPLATE_STRING, fallthrough and continue parsing the
778 } else if (BacktickToken
->is(tok::unknown
) &&
779 BacktickToken
->TokenText
== "`") {
780 StateStack
.push(LexerState::TEMPLATE_STRING
);
782 return; // Not actually a template
785 // 'Manually' lex ahead in the current file buffer.
786 const char *Offset
= Lex
->getBufferLocation();
787 const char *TmplBegin
= Offset
- BacktickToken
->TokenText
.size(); // at "`"
788 for (; Offset
!= Lex
->getBuffer().end(); ++Offset
) {
789 if (Offset
[0] == '`') {
794 if (Offset
[0] == '\\') {
795 ++Offset
; // Skip the escaped character.
796 } else if (Offset
+ 1 < Lex
->getBuffer().end() && Offset
[0] == '$' &&
798 // '${' introduces an expression interpolation in the template string.
799 StateStack
.push(LexerState::NORMAL
);
805 StringRef
LiteralText(TmplBegin
, Offset
- TmplBegin
);
806 BacktickToken
->setType(TT_TemplateString
);
807 BacktickToken
->Tok
.setKind(tok::string_literal
);
808 BacktickToken
->TokenText
= LiteralText
;
810 // Adjust width for potentially multiline string literals.
811 size_t FirstBreak
= LiteralText
.find('\n');
812 StringRef FirstLineText
= FirstBreak
== StringRef::npos
814 : LiteralText
.substr(0, FirstBreak
);
815 BacktickToken
->ColumnWidth
= encoding::columnWidthWithTabs(
816 FirstLineText
, BacktickToken
->OriginalColumn
, Style
.TabWidth
, Encoding
);
817 size_t LastBreak
= LiteralText
.rfind('\n');
818 if (LastBreak
!= StringRef::npos
) {
819 BacktickToken
->IsMultiline
= true;
820 unsigned StartColumn
= 0; // The template tail spans the entire line.
821 BacktickToken
->LastLineColumnWidth
=
822 encoding::columnWidthWithTabs(LiteralText
.substr(LastBreak
+ 1),
823 StartColumn
, Style
.TabWidth
, Encoding
);
826 SourceLocation loc
= Lex
->getSourceLocation(Offset
);
827 resetLexer(SourceMgr
.getFileOffset(loc
));
830 void FormatTokenLexer::tryParsePythonComment() {
831 FormatToken
*HashToken
= Tokens
.back();
832 if (!HashToken
->isOneOf(tok::hash
, tok::hashhash
))
834 // Turn the remainder of this line into a comment.
835 const char *CommentBegin
=
836 Lex
->getBufferLocation() - HashToken
->TokenText
.size(); // at "#"
837 size_t From
= CommentBegin
- Lex
->getBuffer().begin();
838 size_t To
= Lex
->getBuffer().find_first_of('\n', From
);
839 if (To
== StringRef::npos
)
840 To
= Lex
->getBuffer().size();
841 size_t Len
= To
- From
;
842 HashToken
->setType(TT_LineComment
);
843 HashToken
->Tok
.setKind(tok::comment
);
844 HashToken
->TokenText
= Lex
->getBuffer().substr(From
, Len
);
845 SourceLocation Loc
= To
< Lex
->getBuffer().size()
846 ? Lex
->getSourceLocation(CommentBegin
+ Len
)
847 : SourceMgr
.getLocForEndOfFile(ID
);
848 resetLexer(SourceMgr
.getFileOffset(Loc
));
851 bool FormatTokenLexer::tryMerge_TMacro() {
852 if (Tokens
.size() < 4)
854 FormatToken
*Last
= Tokens
.back();
855 if (Last
->isNot(tok::r_paren
))
858 FormatToken
*String
= Tokens
[Tokens
.size() - 2];
859 if (String
->isNot(tok::string_literal
) || String
->IsMultiline
)
862 if (Tokens
[Tokens
.size() - 3]->isNot(tok::l_paren
))
865 FormatToken
*Macro
= Tokens
[Tokens
.size() - 4];
866 if (Macro
->TokenText
!= "_T")
869 const char *Start
= Macro
->TokenText
.data();
870 const char *End
= Last
->TokenText
.data() + Last
->TokenText
.size();
871 String
->TokenText
= StringRef(Start
, End
- Start
);
872 String
->IsFirst
= Macro
->IsFirst
;
873 String
->LastNewlineOffset
= Macro
->LastNewlineOffset
;
874 String
->WhitespaceRange
= Macro
->WhitespaceRange
;
875 String
->OriginalColumn
= Macro
->OriginalColumn
;
876 String
->ColumnWidth
= encoding::columnWidthWithTabs(
877 String
->TokenText
, String
->OriginalColumn
, Style
.TabWidth
, Encoding
);
878 String
->NewlinesBefore
= Macro
->NewlinesBefore
;
879 String
->HasUnescapedNewline
= Macro
->HasUnescapedNewline
;
884 Tokens
.back() = String
;
885 if (FirstInLineIndex
>= Tokens
.size())
886 FirstInLineIndex
= Tokens
.size() - 1;
890 bool FormatTokenLexer::tryMergeConflictMarkers() {
891 if (Tokens
.back()->NewlinesBefore
== 0 && Tokens
.back()->isNot(tok::eof
))
894 // Conflict lines look like:
895 // <marker> <text from the vcs>
897 // >>>>>>> /file/in/file/system at revision 1234
899 // We merge all tokens in a line that starts with a conflict marker
900 // into a single token with a special token type that the unwrapped line
901 // parser will use to correctly rebuild the underlying code.
904 // Get the position of the first token in the line.
905 unsigned FirstInLineOffset
;
906 std::tie(ID
, FirstInLineOffset
) = SourceMgr
.getDecomposedLoc(
907 Tokens
[FirstInLineIndex
]->getStartOfNonWhitespace());
908 StringRef Buffer
= SourceMgr
.getBufferOrFake(ID
).getBuffer();
909 // Calculate the offset of the start of the current line.
910 auto LineOffset
= Buffer
.rfind('\n', FirstInLineOffset
);
911 if (LineOffset
== StringRef::npos
)
916 auto FirstSpace
= Buffer
.find_first_of(" \n", LineOffset
);
918 if (FirstSpace
== StringRef::npos
)
919 LineStart
= Buffer
.substr(LineOffset
);
921 LineStart
= Buffer
.substr(LineOffset
, FirstSpace
- LineOffset
);
923 TokenType Type
= TT_Unknown
;
924 if (LineStart
== "<<<<<<<" || LineStart
== ">>>>") {
925 Type
= TT_ConflictStart
;
926 } else if (LineStart
== "|||||||" || LineStart
== "=======" ||
927 LineStart
== "====") {
928 Type
= TT_ConflictAlternative
;
929 } else if (LineStart
== ">>>>>>>" || LineStart
== "<<<<") {
930 Type
= TT_ConflictEnd
;
933 if (Type
!= TT_Unknown
) {
934 FormatToken
*Next
= Tokens
.back();
936 Tokens
.resize(FirstInLineIndex
+ 1);
937 // We do not need to build a complete token here, as we will skip it
938 // during parsing anyway (as we must not touch whitespace around conflict
940 Tokens
.back()->setType(Type
);
941 Tokens
.back()->Tok
.setKind(tok::kw___unknown_anytype
);
943 Tokens
.push_back(Next
);
950 FormatToken
*FormatTokenLexer::getStashedToken() {
951 // Create a synthesized second '>' or '<' token.
952 Token Tok
= FormatTok
->Tok
;
953 StringRef TokenText
= FormatTok
->TokenText
;
955 unsigned OriginalColumn
= FormatTok
->OriginalColumn
;
956 FormatTok
= new (Allocator
.Allocate()) FormatToken
;
957 FormatTok
->Tok
= Tok
;
958 SourceLocation TokLocation
=
959 FormatTok
->Tok
.getLocation().getLocWithOffset(Tok
.getLength() - 1);
960 FormatTok
->Tok
.setLocation(TokLocation
);
961 FormatTok
->WhitespaceRange
= SourceRange(TokLocation
, TokLocation
);
962 FormatTok
->TokenText
= TokenText
;
963 FormatTok
->ColumnWidth
= 1;
964 FormatTok
->OriginalColumn
= OriginalColumn
+ 1;
969 /// Truncate the current token to the new length and make the lexer continue
970 /// from the end of the truncated token. Used for other languages that have
971 /// different token boundaries, like JavaScript in which a comment ends at a
972 /// line break regardless of whether the line break follows a backslash. Also
973 /// used to set the lexer to the end of whitespace if the lexer regards
974 /// whitespace and an unrecognized symbol as one token.
975 void FormatTokenLexer::truncateToken(size_t NewLen
) {
976 assert(NewLen
<= FormatTok
->TokenText
.size());
977 resetLexer(SourceMgr
.getFileOffset(Lex
->getSourceLocation(
978 Lex
->getBufferLocation() - FormatTok
->TokenText
.size() + NewLen
)));
979 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, NewLen
);
980 FormatTok
->ColumnWidth
= encoding::columnWidthWithTabs(
981 FormatTok
->TokenText
, FormatTok
->OriginalColumn
, Style
.TabWidth
,
983 FormatTok
->Tok
.setLength(NewLen
);
986 /// Count the length of leading whitespace in a token.
987 static size_t countLeadingWhitespace(StringRef Text
) {
988 // Basically counting the length matched by this regex.
989 // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
990 // Directly using the regex turned out to be slow. With the regex
991 // version formatting all files in this directory took about 1.25
992 // seconds. This version took about 0.5 seconds.
993 const unsigned char *const Begin
= Text
.bytes_begin();
994 const unsigned char *const End
= Text
.bytes_end();
995 const unsigned char *Cur
= Begin
;
997 if (isspace(Cur
[0])) {
999 } else if (Cur
[0] == '\\' && (Cur
[1] == '\n' || Cur
[1] == '\r')) {
1000 // A '\' followed by a newline always escapes the newline, regardless
1001 // of whether there is another '\' before it.
1002 // The source has a null byte at the end. So the end of the entire input
1003 // isn't reached yet. Also the lexer doesn't break apart an escaped
1005 assert(End
- Cur
>= 2);
1007 } else if (Cur
[0] == '?' && Cur
[1] == '?' && Cur
[2] == '/' &&
1008 (Cur
[3] == '\n' || Cur
[3] == '\r')) {
1009 // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1010 // characters are quoted individually in this comment because if we write
1011 // them together some compilers warn that we have a trigraph in the code.
1012 assert(End
- Cur
>= 4);
1021 FormatToken
*FormatTokenLexer::getNextToken() {
1022 if (StateStack
.top() == LexerState::TOKEN_STASHED
) {
1024 return getStashedToken();
1027 FormatTok
= new (Allocator
.Allocate()) FormatToken
;
1028 readRawToken(*FormatTok
);
1029 SourceLocation WhitespaceStart
=
1030 FormatTok
->Tok
.getLocation().getLocWithOffset(-TrailingWhitespace
);
1031 FormatTok
->IsFirst
= IsFirstToken
;
1032 IsFirstToken
= false;
1034 // Consume and record whitespace until we find a significant token.
1035 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1036 // followed by a symbol such as backtick. Those symbols may be
1037 // significant in other languages.
1038 unsigned WhitespaceLength
= TrailingWhitespace
;
1039 while (FormatTok
->isNot(tok::eof
)) {
1040 auto LeadingWhitespace
= countLeadingWhitespace(FormatTok
->TokenText
);
1041 if (LeadingWhitespace
== 0)
1043 if (LeadingWhitespace
< FormatTok
->TokenText
.size())
1044 truncateToken(LeadingWhitespace
);
1045 StringRef Text
= FormatTok
->TokenText
;
1046 bool InEscape
= false;
1047 for (int i
= 0, e
= Text
.size(); i
!= e
; ++i
) {
1050 // If this is a CRLF sequence, break here and the LF will be handled on
1051 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1052 // the same as a single LF.
1053 if (i
+ 1 < e
&& Text
[i
+ 1] == '\n')
1057 ++FormatTok
->NewlinesBefore
;
1059 FormatTok
->HasUnescapedNewline
= true;
1062 FormatTok
->LastNewlineOffset
= WhitespaceLength
+ i
+ 1;
1074 Style
.TabWidth
- (Style
.TabWidth
? Column
% Style
.TabWidth
: 0);
1079 // The text was entirely whitespace when this loop was entered. Thus
1080 // this has to be an escape sequence.
1081 assert(Text
.substr(i
, 2) == "\\\r" || Text
.substr(i
, 2) == "\\\n" ||
1082 Text
.substr(i
, 4) == "\?\?/\r" ||
1083 Text
.substr(i
, 4) == "\?\?/\n" ||
1084 (i
>= 1 && (Text
.substr(i
- 1, 4) == "\?\?/\r" ||
1085 Text
.substr(i
- 1, 4) == "\?\?/\n")) ||
1086 (i
>= 2 && (Text
.substr(i
- 2, 4) == "\?\?/\r" ||
1087 Text
.substr(i
- 2, 4) == "\?\?/\n")));
1091 // This shouldn't happen.
1096 WhitespaceLength
+= Text
.size();
1097 readRawToken(*FormatTok
);
1100 if (FormatTok
->is(tok::unknown
))
1101 FormatTok
->setType(TT_ImplicitStringLiteral
);
1103 // JavaScript and Java do not allow to escape the end of the line with a
1104 // backslash. Backslashes are syntax errors in plain source, but can occur in
1105 // comments. When a single line comment ends with a \, it'll cause the next
1106 // line of code to be lexed as a comment, breaking formatting. The code below
1107 // finds comments that contain a backslash followed by a line break, truncates
1108 // the comment token at the backslash, and resets the lexer to restart behind
1110 if ((Style
.isJavaScript() || Style
.Language
== FormatStyle::LK_Java
) &&
1111 FormatTok
->is(tok::comment
) && FormatTok
->TokenText
.startswith("//")) {
1112 size_t BackslashPos
= FormatTok
->TokenText
.find('\\');
1113 while (BackslashPos
!= StringRef::npos
) {
1114 if (BackslashPos
+ 1 < FormatTok
->TokenText
.size() &&
1115 FormatTok
->TokenText
[BackslashPos
+ 1] == '\n') {
1116 truncateToken(BackslashPos
+ 1);
1119 BackslashPos
= FormatTok
->TokenText
.find('\\', BackslashPos
+ 1);
1123 if (Style
.isVerilog()) {
1124 static const llvm::Regex
NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase
);
1125 SmallVector
<StringRef
, 1> Matches
;
1126 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1127 // And it uses the hash for delays and parameter lists. In order to continue
1128 // using `tok::hash` in other places, the backtick gets marked as the hash
1129 // here. And in order to tell the backtick and hash apart for
1130 // Verilog-specific stuff, the hash becomes an identifier.
1131 if (FormatTok
->is(tok::numeric_constant
)) {
1132 // In Verilog the quote is not part of a number.
1133 auto Quote
= FormatTok
->TokenText
.find('\'');
1134 if (Quote
!= StringRef::npos
)
1135 truncateToken(Quote
);
1136 } else if (FormatTok
->isOneOf(tok::hash
, tok::hashhash
)) {
1137 FormatTok
->Tok
.setKind(tok::raw_identifier
);
1138 } else if (FormatTok
->is(tok::raw_identifier
)) {
1139 if (FormatTok
->TokenText
== "`") {
1140 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1141 FormatTok
->Tok
.setKind(tok::hash
);
1142 } else if (FormatTok
->TokenText
== "``") {
1143 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1144 FormatTok
->Tok
.setKind(tok::hashhash
);
1145 } else if (Tokens
.size() > 0 &&
1146 Tokens
.back()->is(Keywords
.kw_apostrophe
) &&
1147 NumberBase
.match(FormatTok
->TokenText
, &Matches
)) {
1148 // In Verilog in a based number literal like `'b10`, there may be
1149 // whitespace between `'b` and `10`. Therefore we handle the base and
1150 // the rest of the number literal as two tokens. But if there is no
1151 // space in the input code, we need to manually separate the two parts.
1152 truncateToken(Matches
[0].size());
1153 FormatTok
->setFinalizedType(TT_VerilogNumberBase
);
1158 FormatTok
->WhitespaceRange
= SourceRange(
1159 WhitespaceStart
, WhitespaceStart
.getLocWithOffset(WhitespaceLength
));
1161 FormatTok
->OriginalColumn
= Column
;
1163 TrailingWhitespace
= 0;
1164 if (FormatTok
->is(tok::comment
)) {
1165 // FIXME: Add the trimmed whitespace to Column.
1166 StringRef UntrimmedText
= FormatTok
->TokenText
;
1167 FormatTok
->TokenText
= FormatTok
->TokenText
.rtrim(" \t\v\f");
1168 TrailingWhitespace
= UntrimmedText
.size() - FormatTok
->TokenText
.size();
1169 } else if (FormatTok
->is(tok::raw_identifier
)) {
1170 IdentifierInfo
&Info
= IdentTable
.get(FormatTok
->TokenText
);
1171 FormatTok
->Tok
.setIdentifierInfo(&Info
);
1172 FormatTok
->Tok
.setKind(Info
.getTokenID());
1173 if (Style
.Language
== FormatStyle::LK_Java
&&
1174 FormatTok
->isOneOf(tok::kw_struct
, tok::kw_union
, tok::kw_delete
,
1175 tok::kw_operator
)) {
1176 FormatTok
->Tok
.setKind(tok::identifier
);
1177 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1178 } else if (Style
.isJavaScript() &&
1179 FormatTok
->isOneOf(tok::kw_struct
, tok::kw_union
,
1180 tok::kw_operator
)) {
1181 FormatTok
->Tok
.setKind(tok::identifier
);
1182 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1184 } else if (FormatTok
->is(tok::greatergreater
)) {
1185 FormatTok
->Tok
.setKind(tok::greater
);
1186 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, 1);
1188 StateStack
.push(LexerState::TOKEN_STASHED
);
1189 } else if (FormatTok
->is(tok::lessless
)) {
1190 FormatTok
->Tok
.setKind(tok::less
);
1191 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, 1);
1193 StateStack
.push(LexerState::TOKEN_STASHED
);
1196 if (Style
.isVerilog() && Tokens
.size() > 0 &&
1197 Tokens
.back()->is(TT_VerilogNumberBase
) &&
1198 FormatTok
->Tok
.isOneOf(tok::identifier
, tok::question
)) {
1199 // Mark the number following a base like `'h?a0` as a number.
1200 FormatTok
->Tok
.setKind(tok::numeric_constant
);
1203 // Now FormatTok is the next non-whitespace token.
1205 StringRef Text
= FormatTok
->TokenText
;
1206 size_t FirstNewlinePos
= Text
.find('\n');
1207 if (FirstNewlinePos
== StringRef::npos
) {
1208 // FIXME: ColumnWidth actually depends on the start column, we need to
1209 // take this into account when the token is moved.
1210 FormatTok
->ColumnWidth
=
1211 encoding::columnWidthWithTabs(Text
, Column
, Style
.TabWidth
, Encoding
);
1212 Column
+= FormatTok
->ColumnWidth
;
1214 FormatTok
->IsMultiline
= true;
1215 // FIXME: ColumnWidth actually depends on the start column, we need to
1216 // take this into account when the token is moved.
1217 FormatTok
->ColumnWidth
= encoding::columnWidthWithTabs(
1218 Text
.substr(0, FirstNewlinePos
), Column
, Style
.TabWidth
, Encoding
);
1220 // The last line of the token always starts in column 0.
1221 // Thus, the length can be precomputed even in the presence of tabs.
1222 FormatTok
->LastLineColumnWidth
= encoding::columnWidthWithTabs(
1223 Text
.substr(Text
.find_last_of('\n') + 1), 0, Style
.TabWidth
, Encoding
);
1224 Column
= FormatTok
->LastLineColumnWidth
;
1227 if (Style
.isCpp()) {
1228 auto *Identifier
= FormatTok
->Tok
.getIdentifierInfo();
1229 auto it
= Macros
.find(Identifier
);
1230 if (!(Tokens
.size() > 0 && Tokens
.back()->Tok
.getIdentifierInfo() &&
1231 Tokens
.back()->Tok
.getIdentifierInfo()->getPPKeywordID() ==
1233 it
!= Macros
.end()) {
1234 FormatTok
->setType(it
->second
);
1235 if (it
->second
== TT_IfMacro
) {
1236 // The lexer token currently has type tok::kw_unknown. However, for this
1237 // substitution to be treated correctly in the TokenAnnotator, faking
1238 // the tok value seems to be needed. Not sure if there's a more elegant
1240 FormatTok
->Tok
.setKind(tok::kw_if
);
1242 } else if (FormatTok
->is(tok::identifier
)) {
1243 if (MacroBlockBeginRegex
.match(Text
))
1244 FormatTok
->setType(TT_MacroBlockBegin
);
1245 else if (MacroBlockEndRegex
.match(Text
))
1246 FormatTok
->setType(TT_MacroBlockEnd
);
1247 else if (TypeNames
.contains(Identifier
))
1248 FormatTok
->setFinalizedType(TT_TypeName
);
1255 bool FormatTokenLexer::readRawTokenVerilogSpecific(Token
&Tok
) {
1256 // In Verilog the quote is not a character literal.
1258 // Make the backtick and double backtick identifiers to match against them
1261 // In Verilog an escaped identifier starts with backslash and ends with
1262 // whitespace. Unless that whitespace is an escaped newline. A backslash can
1263 // also begin an escaped newline outside of an escaped identifier. We check
1264 // for that outside of the Regex since we can't use negative lookhead
1265 // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1266 // identifier may have a length of 0 according to Section A.9.3.
1267 // FIXME: If there is an escaped newline in the middle of an escaped
1268 // identifier, allow for pasting the two lines together, But escaped
1269 // identifiers usually occur only in generated code anyway.
1270 static const llvm::Regex
VerilogToken(R
"re(^('|``?|\\(\\)re"
1271 "(\r?\n|\r)|[^[:space:]])*)");
1273 SmallVector
<StringRef
, 4> Matches
;
1274 const char *Start
= Lex
->getBufferLocation();
1275 if (!VerilogToken
.match(StringRef(Start
, Lex
->getBuffer().end() - Start
),
1279 // There is a null byte at the end of the buffer, so we don't have to check
1280 // Start[1] is within the buffer.
1281 if (Start
[0] == '\\' && (Start
[1] == '\r' || Start
[1] == '\n'))
1283 size_t Len
= Matches
[0].size();
1285 // The kind has to be an identifier so we can match it against those defined
1286 // in Keywords. The kind has to be set before the length because the setLength
1287 // function checks that the kind is not an annotation.
1288 Tok
.setKind(tok::raw_identifier
);
1290 Tok
.setLocation(Lex
->getSourceLocation(Start
, Len
));
1291 Tok
.setRawIdentifierData(Start
);
1292 Lex
->seek(Lex
->getCurrentBufferOffset() + Len
, /*IsAtStartofline=*/false);
1296 void FormatTokenLexer::readRawToken(FormatToken
&Tok
) {
1297 // For Verilog, first see if there is a special token, and fall back to the
1298 // normal lexer if there isn't one.
1299 if (!Style
.isVerilog() || !readRawTokenVerilogSpecific(Tok
.Tok
))
1300 Lex
->LexFromRawLexer(Tok
.Tok
);
1301 Tok
.TokenText
= StringRef(SourceMgr
.getCharacterData(Tok
.Tok
.getLocation()),
1302 Tok
.Tok
.getLength());
1303 // For formatting, treat unterminated string literals like normal string
1305 if (Tok
.is(tok::unknown
)) {
1306 if (!Tok
.TokenText
.empty() && Tok
.TokenText
[0] == '"') {
1307 Tok
.Tok
.setKind(tok::string_literal
);
1308 Tok
.IsUnterminatedLiteral
= true;
1309 } else if (Style
.isJavaScript() && Tok
.TokenText
== "''") {
1310 Tok
.Tok
.setKind(tok::string_literal
);
1314 if ((Style
.isJavaScript() || Style
.Language
== FormatStyle::LK_Proto
||
1315 Style
.Language
== FormatStyle::LK_TextProto
) &&
1316 Tok
.is(tok::char_constant
)) {
1317 Tok
.Tok
.setKind(tok::string_literal
);
1320 if (Tok
.is(tok::comment
) && isClangFormatOn(Tok
.TokenText
))
1321 FormattingDisabled
= false;
1323 Tok
.Finalized
= FormattingDisabled
;
1325 if (Tok
.is(tok::comment
) && isClangFormatOff(Tok
.TokenText
))
1326 FormattingDisabled
= true;
1329 void FormatTokenLexer::resetLexer(unsigned Offset
) {
1330 StringRef Buffer
= SourceMgr
.getBufferData(ID
);
1331 LangOpts
= getFormattingLangOpts(Style
);
1332 Lex
.reset(new Lexer(SourceMgr
.getLocForStartOfFile(ID
), LangOpts
,
1333 Buffer
.begin(), Buffer
.begin() + Offset
, Buffer
.end()));
1334 Lex
->SetKeepWhitespaceMode(true);
1335 TrailingWhitespace
= 0;
1338 } // namespace format
1339 } // namespace clang