1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
13 //===----------------------------------------------------------------------===//
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
25 FormatTokenLexer::FormatTokenLexer(
26 const SourceManager
&SourceMgr
, FileID ID
, unsigned Column
,
27 const FormatStyle
&Style
, encoding::Encoding Encoding
,
28 llvm::SpecificBumpPtrAllocator
<FormatToken
> &Allocator
,
29 IdentifierTable
&IdentTable
)
30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL
}),
31 Column(Column
), TrailingWhitespace(0),
32 LangOpts(getFormattingLangOpts(Style
)), SourceMgr(SourceMgr
), ID(ID
),
33 Style(Style
), IdentTable(IdentTable
), Keywords(IdentTable
),
34 Encoding(Encoding
), Allocator(Allocator
), FirstInLineIndex(0),
35 FormattingDisabled(false), MacroBlockBeginRegex(Style
.MacroBlockBegin
),
36 MacroBlockEndRegex(Style
.MacroBlockEnd
) {
37 Lex
.reset(new Lexer(ID
, SourceMgr
.getBufferOrFake(ID
), SourceMgr
, LangOpts
));
38 Lex
->SetKeepWhitespaceMode(true);
40 for (const std::string
&ForEachMacro
: Style
.ForEachMacros
) {
41 auto Identifier
= &IdentTable
.get(ForEachMacro
);
42 Macros
.insert({Identifier
, TT_ForEachMacro
});
44 for (const std::string
&IfMacro
: Style
.IfMacros
) {
45 auto Identifier
= &IdentTable
.get(IfMacro
);
46 Macros
.insert({Identifier
, TT_IfMacro
});
48 for (const std::string
&AttributeMacro
: Style
.AttributeMacros
) {
49 auto Identifier
= &IdentTable
.get(AttributeMacro
);
50 Macros
.insert({Identifier
, TT_AttributeMacro
});
52 for (const std::string
&StatementMacro
: Style
.StatementMacros
) {
53 auto Identifier
= &IdentTable
.get(StatementMacro
);
54 Macros
.insert({Identifier
, TT_StatementMacro
});
56 for (const std::string
&TypenameMacro
: Style
.TypenameMacros
) {
57 auto Identifier
= &IdentTable
.get(TypenameMacro
);
58 Macros
.insert({Identifier
, TT_TypenameMacro
});
60 for (const std::string
&NamespaceMacro
: Style
.NamespaceMacros
) {
61 auto Identifier
= &IdentTable
.get(NamespaceMacro
);
62 Macros
.insert({Identifier
, TT_NamespaceMacro
});
64 for (const std::string
&WhitespaceSensitiveMacro
:
65 Style
.WhitespaceSensitiveMacros
) {
66 auto Identifier
= &IdentTable
.get(WhitespaceSensitiveMacro
);
67 Macros
.insert({Identifier
, TT_UntouchableMacroFunc
});
69 for (const std::string
&StatementAttributeLikeMacro
:
70 Style
.StatementAttributeLikeMacros
) {
71 auto Identifier
= &IdentTable
.get(StatementAttributeLikeMacro
);
72 Macros
.insert({Identifier
, TT_StatementAttributeLikeMacro
});
76 ArrayRef
<FormatToken
*> FormatTokenLexer::lex() {
77 assert(Tokens
.empty());
78 assert(FirstInLineIndex
== 0);
80 Tokens
.push_back(getNextToken());
81 if (Style
.isJavaScript()) {
82 tryParseJSRegexLiteral();
83 handleTemplateStrings();
85 if (Style
.Language
== FormatStyle::LK_TextProto
)
86 tryParsePythonComment();
87 tryMergePreviousTokens();
88 if (Style
.isCSharp()) {
89 // This needs to come after tokens have been merged so that C#
90 // string literals are correctly identified.
91 handleCSharpVerbatimAndInterpolatedStrings();
93 if (Tokens
.back()->NewlinesBefore
> 0 || Tokens
.back()->IsMultiline
)
94 FirstInLineIndex
= Tokens
.size() - 1;
95 } while (Tokens
.back()->isNot(tok::eof
));
99 void FormatTokenLexer::tryMergePreviousTokens() {
100 if (tryMerge_TMacro())
102 if (tryMergeConflictMarkers())
104 if (tryMergeLessLess())
106 if (tryMergeGreaterGreater())
108 if (tryMergeForEach())
110 if (Style
.isCpp() && tryTransformTryUsageForC())
113 if (Style
.isJavaScript() || Style
.isCSharp()) {
114 static const tok::TokenKind NullishCoalescingOperator
[] = {tok::question
,
116 static const tok::TokenKind NullPropagatingOperator
[] = {tok::question
,
118 static const tok::TokenKind FatArrow
[] = {tok::equal
, tok::greater
};
120 if (tryMergeTokens(FatArrow
, TT_FatArrow
))
122 if (tryMergeTokens(NullishCoalescingOperator
, TT_NullCoalescingOperator
)) {
123 // Treat like the "||" operator (as opposed to the ternary ?).
124 Tokens
.back()->Tok
.setKind(tok::pipepipe
);
127 if (tryMergeTokens(NullPropagatingOperator
, TT_NullPropagatingOperator
)) {
128 // Treat like a regular "." access.
129 Tokens
.back()->Tok
.setKind(tok::period
);
132 if (tryMergeNullishCoalescingEqual())
136 if (Style
.isCSharp()) {
137 static const tok::TokenKind CSharpNullConditionalLSquare
[] = {
138 tok::question
, tok::l_square
};
140 if (tryMergeCSharpKeywordVariables())
142 if (tryMergeCSharpStringLiteral())
144 if (tryTransformCSharpForEach())
146 if (tryMergeTokens(CSharpNullConditionalLSquare
,
147 TT_CSharpNullConditionalLSquare
)) {
148 // Treat like a regular "[" operator.
149 Tokens
.back()->Tok
.setKind(tok::l_square
);
154 if (tryMergeNSStringLiteral())
157 if (Style
.isJavaScript()) {
158 static const tok::TokenKind JSIdentity
[] = {tok::equalequal
, tok::equal
};
159 static const tok::TokenKind JSNotIdentity
[] = {tok::exclaimequal
,
161 static const tok::TokenKind JSShiftEqual
[] = {tok::greater
, tok::greater
,
163 static const tok::TokenKind JSExponentiation
[] = {tok::star
, tok::star
};
164 static const tok::TokenKind JSExponentiationEqual
[] = {tok::star
,
166 static const tok::TokenKind JSPipePipeEqual
[] = {tok::pipepipe
, tok::equal
};
167 static const tok::TokenKind JSAndAndEqual
[] = {tok::ampamp
, tok::equal
};
169 // FIXME: Investigate what token type gives the correct operator priority.
170 if (tryMergeTokens(JSIdentity
, TT_BinaryOperator
))
172 if (tryMergeTokens(JSNotIdentity
, TT_BinaryOperator
))
174 if (tryMergeTokens(JSShiftEqual
, TT_BinaryOperator
))
176 if (tryMergeTokens(JSExponentiation
, TT_JsExponentiation
))
178 if (tryMergeTokens(JSExponentiationEqual
, TT_JsExponentiationEqual
)) {
179 Tokens
.back()->Tok
.setKind(tok::starequal
);
182 if (tryMergeTokens(JSAndAndEqual
, TT_JsAndAndEqual
) ||
183 tryMergeTokens(JSPipePipeEqual
, TT_JsPipePipeEqual
)) {
184 // Treat like the "=" assignment operator.
185 Tokens
.back()->Tok
.setKind(tok::equal
);
188 if (tryMergeJSPrivateIdentifier())
192 if (Style
.Language
== FormatStyle::LK_Java
) {
193 static const tok::TokenKind JavaRightLogicalShiftAssign
[] = {
194 tok::greater
, tok::greater
, tok::greaterequal
};
195 if (tryMergeTokens(JavaRightLogicalShiftAssign
, TT_BinaryOperator
))
199 if (Style
.isVerilog()) {
200 // Merge the number following a base like `'h?a0`.
201 if (Tokens
.size() >= 3 && Tokens
.end()[-3]->is(TT_VerilogNumberBase
) &&
202 Tokens
.end()[-2]->is(tok::numeric_constant
) &&
203 Tokens
.back()->isOneOf(tok::numeric_constant
, tok::identifier
,
205 tryMergeTokens(2, TT_Unknown
)) {
209 if (tryMergeTokensAny({{tok::minus
, tok::colon
}, {tok::plus
, tok::colon
}},
213 // Xnor. The combined token is treated as a caret which can also be either a
214 // unary or binary operator. The actual type is determined in
215 // TokenAnnotator. We also check the token length so we know it is not
216 // already a merged token.
217 if (Tokens
.back()->TokenText
.size() == 1 &&
218 tryMergeTokensAny({{tok::caret
, tok::tilde
}, {tok::tilde
, tok::caret
}},
219 TT_BinaryOperator
)) {
220 Tokens
.back()->Tok
.setKind(tok::caret
);
223 // Signed shift and distribution weight.
224 if (tryMergeTokens({tok::less
, tok::less
}, TT_BinaryOperator
)) {
225 Tokens
.back()->Tok
.setKind(tok::lessless
);
228 if (tryMergeTokens({tok::greater
, tok::greater
}, TT_BinaryOperator
)) {
229 Tokens
.back()->Tok
.setKind(tok::greatergreater
);
232 if (tryMergeTokensAny({{tok::lessless
, tok::equal
},
233 {tok::lessless
, tok::lessequal
},
234 {tok::greatergreater
, tok::equal
},
235 {tok::greatergreater
, tok::greaterequal
},
236 {tok::colon
, tok::equal
},
237 {tok::colon
, tok::slash
}},
238 TT_BinaryOperator
)) {
239 Tokens
.back()->ForcedPrecedence
= prec::Assignment
;
242 // Exponentiation, signed shift, case equality, and wildcard equality.
243 if (tryMergeTokensAny({{tok::star
, tok::star
},
244 {tok::lessless
, tok::less
},
245 {tok::greatergreater
, tok::greater
},
246 {tok::exclaimequal
, tok::equal
},
247 {tok::exclaimequal
, tok::question
},
248 {tok::equalequal
, tok::equal
},
249 {tok::equalequal
, tok::question
}},
250 TT_BinaryOperator
)) {
253 // Module paths in specify blocks and implications in properties.
254 if (tryMergeTokensAny({{tok::plusequal
, tok::greater
},
255 {tok::plus
, tok::star
, tok::greater
},
256 {tok::minusequal
, tok::greater
},
257 {tok::minus
, tok::star
, tok::greater
},
258 {tok::less
, tok::arrow
},
259 {tok::equal
, tok::greater
},
260 {tok::star
, tok::greater
},
261 {tok::pipeequal
, tok::greater
},
262 {tok::pipe
, tok::arrow
},
263 {tok::hash
, tok::minus
, tok::hash
},
264 {tok::hash
, tok::equal
, tok::hash
}},
265 TT_BinaryOperator
)) {
266 Tokens
.back()->ForcedPrecedence
= prec::Comma
;
272 bool FormatTokenLexer::tryMergeNSStringLiteral() {
273 if (Tokens
.size() < 2)
275 auto &At
= *(Tokens
.end() - 2);
276 auto &String
= *(Tokens
.end() - 1);
277 if (!At
->is(tok::at
) || !String
->is(tok::string_literal
))
279 At
->Tok
.setKind(tok::string_literal
);
280 At
->TokenText
= StringRef(At
->TokenText
.begin(),
281 String
->TokenText
.end() - At
->TokenText
.begin());
282 At
->ColumnWidth
+= String
->ColumnWidth
;
283 At
->setType(TT_ObjCStringLiteral
);
284 Tokens
.erase(Tokens
.end() - 1);
288 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
289 // Merges #idenfier into a single identifier with the text #identifier
290 // but the token tok::identifier.
291 if (Tokens
.size() < 2)
293 auto &Hash
= *(Tokens
.end() - 2);
294 auto &Identifier
= *(Tokens
.end() - 1);
295 if (!Hash
->is(tok::hash
) || !Identifier
->is(tok::identifier
))
297 Hash
->Tok
.setKind(tok::identifier
);
299 StringRef(Hash
->TokenText
.begin(),
300 Identifier
->TokenText
.end() - Hash
->TokenText
.begin());
301 Hash
->ColumnWidth
+= Identifier
->ColumnWidth
;
302 Hash
->setType(TT_JsPrivateIdentifier
);
303 Tokens
.erase(Tokens
.end() - 1);
307 // Search for verbatim or interpolated string literals @"ABC" or
308 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
309 // prevent splitting of @, $ and ".
310 // Merging of multiline verbatim strings with embedded '"' is handled in
311 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
312 bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
313 if (Tokens
.size() < 2)
316 // Look for @"aaaaaa" or $"aaaaaa".
317 const auto String
= *(Tokens
.end() - 1);
318 if (String
->isNot(tok::string_literal
))
321 auto Prefix
= *(Tokens
.end() - 2);
322 if (Prefix
->isNot(tok::at
) && Prefix
->TokenText
!= "$")
325 if (Tokens
.size() > 2) {
326 const auto Tok
= *(Tokens
.end() - 3);
327 if ((Tok
->TokenText
== "$" && Prefix
->is(tok::at
)) ||
328 (Tok
->is(tok::at
) && Prefix
->TokenText
== "$")) {
329 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
330 Tok
->ColumnWidth
+= Prefix
->ColumnWidth
;
331 Tokens
.erase(Tokens
.end() - 2);
336 // Convert back into just a string_literal.
337 Prefix
->Tok
.setKind(tok::string_literal
);
339 StringRef(Prefix
->TokenText
.begin(),
340 String
->TokenText
.end() - Prefix
->TokenText
.begin());
341 Prefix
->ColumnWidth
+= String
->ColumnWidth
;
342 Prefix
->setType(TT_CSharpStringLiteral
);
343 Tokens
.erase(Tokens
.end() - 1);
347 // Valid C# attribute targets:
348 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
349 const llvm::StringSet
<> FormatTokenLexer::CSharpAttributeTargets
= {
350 "assembly", "module", "field", "event", "method",
351 "param", "property", "return", "type",
354 bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
355 if (Tokens
.size() < 2)
357 auto &NullishCoalescing
= *(Tokens
.end() - 2);
358 auto &Equal
= *(Tokens
.end() - 1);
359 if (NullishCoalescing
->getType() != TT_NullCoalescingOperator
||
360 !Equal
->is(tok::equal
)) {
363 NullishCoalescing
->Tok
.setKind(tok::equal
); // no '??=' in clang tokens.
364 NullishCoalescing
->TokenText
=
365 StringRef(NullishCoalescing
->TokenText
.begin(),
366 Equal
->TokenText
.end() - NullishCoalescing
->TokenText
.begin());
367 NullishCoalescing
->ColumnWidth
+= Equal
->ColumnWidth
;
368 NullishCoalescing
->setType(TT_NullCoalescingEqual
);
369 Tokens
.erase(Tokens
.end() - 1);
373 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
374 if (Tokens
.size() < 2)
376 const auto At
= *(Tokens
.end() - 2);
377 if (At
->isNot(tok::at
))
379 const auto Keyword
= *(Tokens
.end() - 1);
380 if (Keyword
->TokenText
== "$")
382 if (!Keywords
.isCSharpKeyword(*Keyword
))
385 At
->Tok
.setKind(tok::identifier
);
386 At
->TokenText
= StringRef(At
->TokenText
.begin(),
387 Keyword
->TokenText
.end() - At
->TokenText
.begin());
388 At
->ColumnWidth
+= Keyword
->ColumnWidth
;
389 At
->setType(Keyword
->getType());
390 Tokens
.erase(Tokens
.end() - 1);
394 // In C# transform identifier foreach into kw_foreach
395 bool FormatTokenLexer::tryTransformCSharpForEach() {
396 if (Tokens
.size() < 1)
398 auto &Identifier
= *(Tokens
.end() - 1);
399 if (!Identifier
->is(tok::identifier
))
401 if (Identifier
->TokenText
!= "foreach")
404 Identifier
->setType(TT_ForEachMacro
);
405 Identifier
->Tok
.setKind(tok::kw_for
);
409 bool FormatTokenLexer::tryMergeForEach() {
410 if (Tokens
.size() < 2)
412 auto &For
= *(Tokens
.end() - 2);
413 auto &Each
= *(Tokens
.end() - 1);
414 if (!For
->is(tok::kw_for
))
416 if (!Each
->is(tok::identifier
))
418 if (Each
->TokenText
!= "each")
421 For
->setType(TT_ForEachMacro
);
422 For
->Tok
.setKind(tok::kw_for
);
424 For
->TokenText
= StringRef(For
->TokenText
.begin(),
425 Each
->TokenText
.end() - For
->TokenText
.begin());
426 For
->ColumnWidth
+= Each
->ColumnWidth
;
427 Tokens
.erase(Tokens
.end() - 1);
431 bool FormatTokenLexer::tryTransformTryUsageForC() {
432 if (Tokens
.size() < 2)
434 auto &Try
= *(Tokens
.end() - 2);
435 if (!Try
->is(tok::kw_try
))
437 auto &Next
= *(Tokens
.end() - 1);
438 if (Next
->isOneOf(tok::l_brace
, tok::colon
, tok::hash
, tok::comment
))
441 if (Tokens
.size() > 2) {
442 auto &At
= *(Tokens
.end() - 3);
447 Try
->Tok
.setKind(tok::identifier
);
451 bool FormatTokenLexer::tryMergeLessLess() {
452 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
453 if (Tokens
.size() < 3)
456 auto First
= Tokens
.end() - 3;
457 if (First
[0]->isNot(tok::less
) || First
[1]->isNot(tok::less
))
460 // Only merge if there currently is no whitespace between the two "<".
461 if (First
[1]->hasWhitespaceBefore())
464 auto X
= Tokens
.size() > 3 ? First
[-1] : nullptr;
465 if (X
&& X
->is(tok::less
))
469 if ((!X
|| X
->isNot(tok::kw_operator
)) && Y
->is(tok::less
))
472 First
[0]->Tok
.setKind(tok::lessless
);
473 First
[0]->TokenText
= "<<";
474 First
[0]->ColumnWidth
+= 1;
475 Tokens
.erase(Tokens
.end() - 2);
479 bool FormatTokenLexer::tryMergeGreaterGreater() {
480 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
481 if (Tokens
.size() < 2)
484 auto First
= Tokens
.end() - 2;
485 if (First
[0]->isNot(tok::greater
) || First
[1]->isNot(tok::greater
))
488 // Only merge if there currently is no whitespace between the first two ">".
489 if (First
[1]->hasWhitespaceBefore())
492 auto Tok
= Tokens
.size() > 2 ? First
[-1] : nullptr;
493 if (Tok
&& Tok
->isNot(tok::kw_operator
))
496 First
[0]->Tok
.setKind(tok::greatergreater
);
497 First
[0]->TokenText
= ">>";
498 First
[0]->ColumnWidth
+= 1;
499 Tokens
.erase(Tokens
.end() - 1);
503 bool FormatTokenLexer::tryMergeTokens(ArrayRef
<tok::TokenKind
> Kinds
,
505 if (Tokens
.size() < Kinds
.size())
508 SmallVectorImpl
<FormatToken
*>::const_iterator First
=
509 Tokens
.end() - Kinds
.size();
510 for (unsigned i
= 0; i
< Kinds
.size(); ++i
)
511 if (!First
[i
]->is(Kinds
[i
]))
514 return tryMergeTokens(Kinds
.size(), NewType
);
517 bool FormatTokenLexer::tryMergeTokens(size_t Count
, TokenType NewType
) {
518 if (Tokens
.size() < Count
)
521 SmallVectorImpl
<FormatToken
*>::const_iterator First
= Tokens
.end() - Count
;
522 unsigned AddLength
= 0;
523 for (size_t i
= 1; i
< Count
; ++i
) {
524 // If there is whitespace separating the token and the previous one,
525 // they should not be merged.
526 if (First
[i
]->hasWhitespaceBefore())
528 AddLength
+= First
[i
]->TokenText
.size();
531 Tokens
.resize(Tokens
.size() - Count
+ 1);
532 First
[0]->TokenText
= StringRef(First
[0]->TokenText
.data(),
533 First
[0]->TokenText
.size() + AddLength
);
534 First
[0]->ColumnWidth
+= AddLength
;
535 First
[0]->setType(NewType
);
539 bool FormatTokenLexer::tryMergeTokensAny(
540 ArrayRef
<ArrayRef
<tok::TokenKind
>> Kinds
, TokenType NewType
) {
541 return llvm::any_of(Kinds
, [this, NewType
](ArrayRef
<tok::TokenKind
> Kinds
) {
542 return tryMergeTokens(Kinds
, NewType
);
546 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
547 bool FormatTokenLexer::precedesOperand(FormatToken
*Tok
) {
548 // NB: This is not entirely correct, as an r_paren can introduce an operand
549 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
550 // corner case to not matter in practice, though.
551 return Tok
->isOneOf(tok::period
, tok::l_paren
, tok::comma
, tok::l_brace
,
552 tok::r_brace
, tok::l_square
, tok::semi
, tok::exclaim
,
553 tok::colon
, tok::question
, tok::tilde
) ||
554 Tok
->isOneOf(tok::kw_return
, tok::kw_do
, tok::kw_case
, tok::kw_throw
,
555 tok::kw_else
, tok::kw_new
, tok::kw_delete
, tok::kw_void
,
556 tok::kw_typeof
, Keywords
.kw_instanceof
, Keywords
.kw_in
) ||
557 Tok
->isBinaryOperator();
560 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken
*Prev
) {
564 // Regex literals can only follow after prefix unary operators, not after
565 // postfix unary operators. If the '++' is followed by a non-operand
566 // introducing token, the slash here is the operand and not the start of a
568 // `!` is an unary prefix operator, but also a post-fix operator that casts
569 // away nullability, so the same check applies.
570 if (Prev
->isOneOf(tok::plusplus
, tok::minusminus
, tok::exclaim
))
571 return Tokens
.size() < 3 || precedesOperand(Tokens
[Tokens
.size() - 3]);
573 // The previous token must introduce an operand location where regex
574 // literals can occur.
575 if (!precedesOperand(Prev
))
581 // Tries to parse a JavaScript Regex literal starting at the current token,
582 // if that begins with a slash and is in a location where JavaScript allows
583 // regex literals. Changes the current token to a regex literal and updates
584 // its text if successful.
585 void FormatTokenLexer::tryParseJSRegexLiteral() {
586 FormatToken
*RegexToken
= Tokens
.back();
587 if (!RegexToken
->isOneOf(tok::slash
, tok::slashequal
))
590 FormatToken
*Prev
= nullptr;
591 for (FormatToken
*FT
: llvm::drop_begin(llvm::reverse(Tokens
))) {
592 // NB: Because previous pointers are not initialized yet, this cannot use
593 // Token.getPreviousNonComment.
594 if (FT
->isNot(tok::comment
)) {
600 if (!canPrecedeRegexLiteral(Prev
))
603 // 'Manually' lex ahead in the current file buffer.
604 const char *Offset
= Lex
->getBufferLocation();
605 const char *RegexBegin
= Offset
- RegexToken
->TokenText
.size();
606 StringRef Buffer
= Lex
->getBuffer();
607 bool InCharacterClass
= false;
608 bool HaveClosingSlash
= false;
609 for (; !HaveClosingSlash
&& Offset
!= Buffer
.end(); ++Offset
) {
610 // Regular expressions are terminated with a '/', which can only be
611 // escaped using '\' or a character class between '[' and ']'.
612 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
615 // Skip the escaped character.
619 InCharacterClass
= true;
622 InCharacterClass
= false;
625 if (!InCharacterClass
)
626 HaveClosingSlash
= true;
631 RegexToken
->setType(TT_RegexLiteral
);
632 // Treat regex literals like other string_literals.
633 RegexToken
->Tok
.setKind(tok::string_literal
);
634 RegexToken
->TokenText
= StringRef(RegexBegin
, Offset
- RegexBegin
);
635 RegexToken
->ColumnWidth
= RegexToken
->TokenText
.size();
637 resetLexer(SourceMgr
.getFileOffset(Lex
->getSourceLocation(Offset
)));
// Scans a C# string body for its terminating '"', honoring verbatim
// double-quote escapes ("") and interpolated {...} sections; returns a
// pointer to the closing quote, or End if the string is unterminated.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  auto Repeated = [&Begin, End]() {
    return Begin + 1 < End && Begin[1] == Begin[0];
  };

  // Look for a terminating '"' in the current file buffer.
  // Make no effort to format code within an interpolated or verbatim string.
  //
  // Interpolated strings could contain { } with " characters inside.
  // $"{x ?? "null"}"
  // should not be split into $"{x ?? ", null, "}" but should be treated as a
  // single string-literal.
  //
  // We opt not to try and format expressions inside {} within a C#
  // interpolated string. Formatting expressions within an interpolated string
  // would require similar work as that done for JavaScript template strings
  // in `handleTemplateStrings()`.
  for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) {
    switch (*Begin) {
    case '\\':
      // In non-verbatim strings '\' escapes the next character.
      if (!Verbatim)
        ++Begin;
      break;
    case '{':
      if (Interpolated) {
        // {{ inside an interpolated string is escaped, so skip it.
        if (Repeated())
          ++Begin;
        else
          ++UnmatchedOpeningBraceCount;
      }
      break;
    case '}':
      if (Interpolated) {
        // }} inside an interpolated string is escaped, so skip it.
        if (Repeated())
          ++Begin;
        else if (UnmatchedOpeningBraceCount > 0)
          --UnmatchedOpeningBraceCount;
      }
      break;
    case '"':
      if (UnmatchedOpeningBraceCount > 0)
        break;
      // "" within a verbatim string is an escaped double quote: skip it.
      if (Verbatim && Repeated()) {
        ++Begin;
        break;
      }
      return Begin;
    }
  }

  return End;
}
699 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
700 FormatToken
*CSharpStringLiteral
= Tokens
.back();
702 if (CSharpStringLiteral
->isNot(TT_CSharpStringLiteral
))
705 auto &TokenText
= CSharpStringLiteral
->TokenText
;
707 bool Verbatim
= false;
708 bool Interpolated
= false;
709 if (TokenText
.startswith(R
"($@")") || TokenText.startswith(R"(@$
")")) {
712 } else if (TokenText
.startswith(R
"(@")")) {
714 } else if (TokenText.startswith(R"($
")")) {
718 // Deal with multiline strings.
719 if (!Verbatim
&& !Interpolated
)
722 const char *StrBegin
= Lex
->getBufferLocation() - TokenText
.size();
723 const char *Offset
= StrBegin
;
724 if (Verbatim
&& Interpolated
)
729 const auto End
= Lex
->getBuffer().end();
730 Offset
= lexCSharpString(Offset
, End
, Verbatim
, Interpolated
);
732 // Make no attempt to format code properly if a verbatim string is
737 StringRef
LiteralText(StrBegin
, Offset
- StrBegin
+ 1);
738 TokenText
= LiteralText
;
740 // Adjust width for potentially multiline string literals.
741 size_t FirstBreak
= LiteralText
.find('\n');
742 StringRef FirstLineText
= FirstBreak
== StringRef::npos
744 : LiteralText
.substr(0, FirstBreak
);
745 CSharpStringLiteral
->ColumnWidth
= encoding::columnWidthWithTabs(
746 FirstLineText
, CSharpStringLiteral
->OriginalColumn
, Style
.TabWidth
,
748 size_t LastBreak
= LiteralText
.rfind('\n');
749 if (LastBreak
!= StringRef::npos
) {
750 CSharpStringLiteral
->IsMultiline
= true;
751 unsigned StartColumn
= 0;
752 CSharpStringLiteral
->LastLineColumnWidth
=
753 encoding::columnWidthWithTabs(LiteralText
.substr(LastBreak
+ 1),
754 StartColumn
, Style
.TabWidth
, Encoding
);
757 assert(Offset
< End
);
758 resetLexer(SourceMgr
.getFileOffset(Lex
->getSourceLocation(Offset
+ 1)));
761 void FormatTokenLexer::handleTemplateStrings() {
762 FormatToken
*BacktickToken
= Tokens
.back();
764 if (BacktickToken
->is(tok::l_brace
)) {
765 StateStack
.push(LexerState::NORMAL
);
768 if (BacktickToken
->is(tok::r_brace
)) {
769 if (StateStack
.size() == 1)
772 if (StateStack
.top() != LexerState::TEMPLATE_STRING
)
774 // If back in TEMPLATE_STRING, fallthrough and continue parsing the
775 } else if (BacktickToken
->is(tok::unknown
) &&
776 BacktickToken
->TokenText
== "`") {
777 StateStack
.push(LexerState::TEMPLATE_STRING
);
779 return; // Not actually a template
782 // 'Manually' lex ahead in the current file buffer.
783 const char *Offset
= Lex
->getBufferLocation();
784 const char *TmplBegin
= Offset
- BacktickToken
->TokenText
.size(); // at "`"
785 for (; Offset
!= Lex
->getBuffer().end(); ++Offset
) {
786 if (Offset
[0] == '`') {
791 if (Offset
[0] == '\\') {
792 ++Offset
; // Skip the escaped character.
793 } else if (Offset
+ 1 < Lex
->getBuffer().end() && Offset
[0] == '$' &&
795 // '${' introduces an expression interpolation in the template string.
796 StateStack
.push(LexerState::NORMAL
);
802 StringRef
LiteralText(TmplBegin
, Offset
- TmplBegin
);
803 BacktickToken
->setType(TT_TemplateString
);
804 BacktickToken
->Tok
.setKind(tok::string_literal
);
805 BacktickToken
->TokenText
= LiteralText
;
807 // Adjust width for potentially multiline string literals.
808 size_t FirstBreak
= LiteralText
.find('\n');
809 StringRef FirstLineText
= FirstBreak
== StringRef::npos
811 : LiteralText
.substr(0, FirstBreak
);
812 BacktickToken
->ColumnWidth
= encoding::columnWidthWithTabs(
813 FirstLineText
, BacktickToken
->OriginalColumn
, Style
.TabWidth
, Encoding
);
814 size_t LastBreak
= LiteralText
.rfind('\n');
815 if (LastBreak
!= StringRef::npos
) {
816 BacktickToken
->IsMultiline
= true;
817 unsigned StartColumn
= 0; // The template tail spans the entire line.
818 BacktickToken
->LastLineColumnWidth
=
819 encoding::columnWidthWithTabs(LiteralText
.substr(LastBreak
+ 1),
820 StartColumn
, Style
.TabWidth
, Encoding
);
823 SourceLocation loc
= Lex
->getSourceLocation(Offset
);
824 resetLexer(SourceMgr
.getFileOffset(loc
));
827 void FormatTokenLexer::tryParsePythonComment() {
828 FormatToken
*HashToken
= Tokens
.back();
829 if (!HashToken
->isOneOf(tok::hash
, tok::hashhash
))
831 // Turn the remainder of this line into a comment.
832 const char *CommentBegin
=
833 Lex
->getBufferLocation() - HashToken
->TokenText
.size(); // at "#"
834 size_t From
= CommentBegin
- Lex
->getBuffer().begin();
835 size_t To
= Lex
->getBuffer().find_first_of('\n', From
);
836 if (To
== StringRef::npos
)
837 To
= Lex
->getBuffer().size();
838 size_t Len
= To
- From
;
839 HashToken
->setType(TT_LineComment
);
840 HashToken
->Tok
.setKind(tok::comment
);
841 HashToken
->TokenText
= Lex
->getBuffer().substr(From
, Len
);
842 SourceLocation Loc
= To
< Lex
->getBuffer().size()
843 ? Lex
->getSourceLocation(CommentBegin
+ Len
)
844 : SourceMgr
.getLocForEndOfFile(ID
);
845 resetLexer(SourceMgr
.getFileOffset(Loc
));
848 bool FormatTokenLexer::tryMerge_TMacro() {
849 if (Tokens
.size() < 4)
851 FormatToken
*Last
= Tokens
.back();
852 if (!Last
->is(tok::r_paren
))
855 FormatToken
*String
= Tokens
[Tokens
.size() - 2];
856 if (!String
->is(tok::string_literal
) || String
->IsMultiline
)
859 if (!Tokens
[Tokens
.size() - 3]->is(tok::l_paren
))
862 FormatToken
*Macro
= Tokens
[Tokens
.size() - 4];
863 if (Macro
->TokenText
!= "_T")
866 const char *Start
= Macro
->TokenText
.data();
867 const char *End
= Last
->TokenText
.data() + Last
->TokenText
.size();
868 String
->TokenText
= StringRef(Start
, End
- Start
);
869 String
->IsFirst
= Macro
->IsFirst
;
870 String
->LastNewlineOffset
= Macro
->LastNewlineOffset
;
871 String
->WhitespaceRange
= Macro
->WhitespaceRange
;
872 String
->OriginalColumn
= Macro
->OriginalColumn
;
873 String
->ColumnWidth
= encoding::columnWidthWithTabs(
874 String
->TokenText
, String
->OriginalColumn
, Style
.TabWidth
, Encoding
);
875 String
->NewlinesBefore
= Macro
->NewlinesBefore
;
876 String
->HasUnescapedNewline
= Macro
->HasUnescapedNewline
;
881 Tokens
.back() = String
;
882 if (FirstInLineIndex
>= Tokens
.size())
883 FirstInLineIndex
= Tokens
.size() - 1;
887 bool FormatTokenLexer::tryMergeConflictMarkers() {
888 if (Tokens
.back()->NewlinesBefore
== 0 && Tokens
.back()->isNot(tok::eof
))
891 // Conflict lines look like:
892 // <marker> <text from the vcs>
894 // >>>>>>> /file/in/file/system at revision 1234
896 // We merge all tokens in a line that starts with a conflict marker
897 // into a single token with a special token type that the unwrapped line
898 // parser will use to correctly rebuild the underlying code.
901 // Get the position of the first token in the line.
902 unsigned FirstInLineOffset
;
903 std::tie(ID
, FirstInLineOffset
) = SourceMgr
.getDecomposedLoc(
904 Tokens
[FirstInLineIndex
]->getStartOfNonWhitespace());
905 StringRef Buffer
= SourceMgr
.getBufferOrFake(ID
).getBuffer();
906 // Calculate the offset of the start of the current line.
907 auto LineOffset
= Buffer
.rfind('\n', FirstInLineOffset
);
908 if (LineOffset
== StringRef::npos
)
913 auto FirstSpace
= Buffer
.find_first_of(" \n", LineOffset
);
915 if (FirstSpace
== StringRef::npos
)
916 LineStart
= Buffer
.substr(LineOffset
);
918 LineStart
= Buffer
.substr(LineOffset
, FirstSpace
- LineOffset
);
920 TokenType Type
= TT_Unknown
;
921 if (LineStart
== "<<<<<<<" || LineStart
== ">>>>") {
922 Type
= TT_ConflictStart
;
923 } else if (LineStart
== "|||||||" || LineStart
== "=======" ||
924 LineStart
== "====") {
925 Type
= TT_ConflictAlternative
;
926 } else if (LineStart
== ">>>>>>>" || LineStart
== "<<<<") {
927 Type
= TT_ConflictEnd
;
930 if (Type
!= TT_Unknown
) {
931 FormatToken
*Next
= Tokens
.back();
933 Tokens
.resize(FirstInLineIndex
+ 1);
934 // We do not need to build a complete token here, as we will skip it
935 // during parsing anyway (as we must not touch whitespace around conflict
937 Tokens
.back()->setType(Type
);
938 Tokens
.back()->Tok
.setKind(tok::kw___unknown_anytype
);
940 Tokens
.push_back(Next
);
947 FormatToken
*FormatTokenLexer::getStashedToken() {
948 // Create a synthesized second '>' or '<' token.
949 Token Tok
= FormatTok
->Tok
;
950 StringRef TokenText
= FormatTok
->TokenText
;
952 unsigned OriginalColumn
= FormatTok
->OriginalColumn
;
953 FormatTok
= new (Allocator
.Allocate()) FormatToken
;
954 FormatTok
->Tok
= Tok
;
955 SourceLocation TokLocation
=
956 FormatTok
->Tok
.getLocation().getLocWithOffset(Tok
.getLength() - 1);
957 FormatTok
->Tok
.setLocation(TokLocation
);
958 FormatTok
->WhitespaceRange
= SourceRange(TokLocation
, TokLocation
);
959 FormatTok
->TokenText
= TokenText
;
960 FormatTok
->ColumnWidth
= 1;
961 FormatTok
->OriginalColumn
= OriginalColumn
+ 1;
966 /// Truncate the current token to the new length and make the lexer continue
967 /// from the end of the truncated token. Used for other languages that have
968 /// different token boundaries, like JavaScript in which a comment ends at a
969 /// line break regardless of whether the line break follows a backslash. Also
970 /// used to set the lexer to the end of whitespace if the lexer regards
971 /// whitespace and an unrecognized symbol as one token.
972 void FormatTokenLexer::truncateToken(size_t NewLen
) {
973 assert(NewLen
<= FormatTok
->TokenText
.size());
974 resetLexer(SourceMgr
.getFileOffset(Lex
->getSourceLocation(
975 Lex
->getBufferLocation() - FormatTok
->TokenText
.size() + NewLen
)));
976 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, NewLen
);
977 FormatTok
->ColumnWidth
= encoding::columnWidthWithTabs(
978 FormatTok
->TokenText
, FormatTok
->OriginalColumn
, Style
.TabWidth
,
980 FormatTok
->Tok
.setLength(NewLen
);
983 /// Count the length of leading whitespace in a token.
984 static size_t countLeadingWhitespace(StringRef Text
) {
985 // Basically counting the length matched by this regex.
986 // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
987 // Directly using the regex turned out to be slow. With the regex
988 // version formatting all files in this directory took about 1.25
989 // seconds. This version took about 0.5 seconds.
990 const unsigned char *const Begin
= Text
.bytes_begin();
991 const unsigned char *const End
= Text
.bytes_end();
992 const unsigned char *Cur
= Begin
;
994 if (isspace(Cur
[0])) {
996 } else if (Cur
[0] == '\\' && (Cur
[1] == '\n' || Cur
[1] == '\r')) {
997 // A '\' followed by a newline always escapes the newline, regardless
998 // of whether there is another '\' before it.
999 // The source has a null byte at the end. So the end of the entire input
1000 // isn't reached yet. Also the lexer doesn't break apart an escaped
1002 assert(End
- Cur
>= 2);
1004 } else if (Cur
[0] == '?' && Cur
[1] == '?' && Cur
[2] == '/' &&
1005 (Cur
[3] == '\n' || Cur
[3] == '\r')) {
1006 // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1007 // characters are quoted individually in this comment because if we write
1008 // them together some compilers warn that we have a trigraph in the code.
1009 assert(End
- Cur
>= 4);
1018 FormatToken
*FormatTokenLexer::getNextToken() {
1019 if (StateStack
.top() == LexerState::TOKEN_STASHED
) {
1021 return getStashedToken();
1024 FormatTok
= new (Allocator
.Allocate()) FormatToken
;
1025 readRawToken(*FormatTok
);
1026 SourceLocation WhitespaceStart
=
1027 FormatTok
->Tok
.getLocation().getLocWithOffset(-TrailingWhitespace
);
1028 FormatTok
->IsFirst
= IsFirstToken
;
1029 IsFirstToken
= false;
1031 // Consume and record whitespace until we find a significant token.
1032 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1033 // followed by a symbol such as backtick. Those symbols may be
1034 // significant in other languages.
1035 unsigned WhitespaceLength
= TrailingWhitespace
;
1036 while (FormatTok
->isNot(tok::eof
)) {
1037 auto LeadingWhitespace
= countLeadingWhitespace(FormatTok
->TokenText
);
1038 if (LeadingWhitespace
== 0)
1040 if (LeadingWhitespace
< FormatTok
->TokenText
.size())
1041 truncateToken(LeadingWhitespace
);
1042 StringRef Text
= FormatTok
->TokenText
;
1043 bool InEscape
= false;
1044 for (int i
= 0, e
= Text
.size(); i
!= e
; ++i
) {
1047 // If this is a CRLF sequence, break here and the LF will be handled on
1048 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1049 // the same as a single LF.
1050 if (i
+ 1 < e
&& Text
[i
+ 1] == '\n')
1054 ++FormatTok
->NewlinesBefore
;
1056 FormatTok
->HasUnescapedNewline
= true;
1059 FormatTok
->LastNewlineOffset
= WhitespaceLength
+ i
+ 1;
1071 Style
.TabWidth
- (Style
.TabWidth
? Column
% Style
.TabWidth
: 0);
1076 // The text was entirely whitespace when this loop was entered. Thus
1077 // this has to be an escape sequence.
1078 assert(Text
.substr(i
, 2) == "\\\r" || Text
.substr(i
, 2) == "\\\n" ||
1079 Text
.substr(i
, 4) == "\?\?/\r" ||
1080 Text
.substr(i
, 4) == "\?\?/\n" ||
1081 (i
>= 1 && (Text
.substr(i
- 1, 4) == "\?\?/\r" ||
1082 Text
.substr(i
- 1, 4) == "\?\?/\n")) ||
1083 (i
>= 2 && (Text
.substr(i
- 2, 4) == "\?\?/\r" ||
1084 Text
.substr(i
- 2, 4) == "\?\?/\n")));
1088 // This shouldn't happen.
1093 WhitespaceLength
+= Text
.size();
1094 readRawToken(*FormatTok
);
1097 if (FormatTok
->is(tok::unknown
))
1098 FormatTok
->setType(TT_ImplicitStringLiteral
);
1100 // JavaScript and Java do not allow to escape the end of the line with a
1101 // backslash. Backslashes are syntax errors in plain source, but can occur in
1102 // comments. When a single line comment ends with a \, it'll cause the next
1103 // line of code to be lexed as a comment, breaking formatting. The code below
1104 // finds comments that contain a backslash followed by a line break, truncates
1105 // the comment token at the backslash, and resets the lexer to restart behind
1107 if ((Style
.isJavaScript() || Style
.Language
== FormatStyle::LK_Java
) &&
1108 FormatTok
->is(tok::comment
) && FormatTok
->TokenText
.startswith("//")) {
1109 size_t BackslashPos
= FormatTok
->TokenText
.find('\\');
1110 while (BackslashPos
!= StringRef::npos
) {
1111 if (BackslashPos
+ 1 < FormatTok
->TokenText
.size() &&
1112 FormatTok
->TokenText
[BackslashPos
+ 1] == '\n') {
1113 truncateToken(BackslashPos
+ 1);
1116 BackslashPos
= FormatTok
->TokenText
.find('\\', BackslashPos
+ 1);
1120 if (Style
.isVerilog()) {
1121 static const llvm::Regex
NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase
);
1122 SmallVector
<StringRef
, 1> Matches
;
1123 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1124 // And it uses the hash for delays and parameter lists. In order to continue
1125 // using `tok::hash` in other places, the backtick gets marked as the hash
1126 // here. And in order to tell the backtick and hash apart for
1127 // Verilog-specific stuff, the hash becomes an identifier.
1128 if (FormatTok
->is(tok::numeric_constant
)) {
1129 // In Verilog the quote is not part of a number.
1130 auto Quote
= FormatTok
->TokenText
.find('\'');
1131 if (Quote
!= StringRef::npos
)
1132 truncateToken(Quote
);
1133 } else if (FormatTok
->isOneOf(tok::hash
, tok::hashhash
)) {
1134 FormatTok
->Tok
.setKind(tok::raw_identifier
);
1135 } else if (FormatTok
->is(tok::raw_identifier
)) {
1136 if (FormatTok
->TokenText
== "`") {
1137 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1138 FormatTok
->Tok
.setKind(tok::hash
);
1139 } else if (FormatTok
->TokenText
== "``") {
1140 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1141 FormatTok
->Tok
.setKind(tok::hashhash
);
1142 } else if (Tokens
.size() > 0 &&
1143 Tokens
.back()->is(Keywords
.kw_apostrophe
) &&
1144 NumberBase
.match(FormatTok
->TokenText
, &Matches
)) {
1145 // In Verilog in a based number literal like `'b10`, there may be
1146 // whitespace between `'b` and `10`. Therefore we handle the base and
1147 // the rest of the number literal as two tokens. But if there is no
1148 // space in the input code, we need to manually separate the two parts.
1149 truncateToken(Matches
[0].size());
1150 FormatTok
->setFinalizedType(TT_VerilogNumberBase
);
1155 FormatTok
->WhitespaceRange
= SourceRange(
1156 WhitespaceStart
, WhitespaceStart
.getLocWithOffset(WhitespaceLength
));
1158 FormatTok
->OriginalColumn
= Column
;
1160 TrailingWhitespace
= 0;
1161 if (FormatTok
->is(tok::comment
)) {
1162 // FIXME: Add the trimmed whitespace to Column.
1163 StringRef UntrimmedText
= FormatTok
->TokenText
;
1164 FormatTok
->TokenText
= FormatTok
->TokenText
.rtrim(" \t\v\f");
1165 TrailingWhitespace
= UntrimmedText
.size() - FormatTok
->TokenText
.size();
1166 } else if (FormatTok
->is(tok::raw_identifier
)) {
1167 IdentifierInfo
&Info
= IdentTable
.get(FormatTok
->TokenText
);
1168 FormatTok
->Tok
.setIdentifierInfo(&Info
);
1169 FormatTok
->Tok
.setKind(Info
.getTokenID());
1170 if (Style
.Language
== FormatStyle::LK_Java
&&
1171 FormatTok
->isOneOf(tok::kw_struct
, tok::kw_union
, tok::kw_delete
,
1172 tok::kw_operator
)) {
1173 FormatTok
->Tok
.setKind(tok::identifier
);
1174 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1175 } else if (Style
.isJavaScript() &&
1176 FormatTok
->isOneOf(tok::kw_struct
, tok::kw_union
,
1177 tok::kw_operator
)) {
1178 FormatTok
->Tok
.setKind(tok::identifier
);
1179 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1181 } else if (FormatTok
->is(tok::greatergreater
)) {
1182 FormatTok
->Tok
.setKind(tok::greater
);
1183 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, 1);
1185 StateStack
.push(LexerState::TOKEN_STASHED
);
1186 } else if (FormatTok
->is(tok::lessless
)) {
1187 FormatTok
->Tok
.setKind(tok::less
);
1188 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, 1);
1190 StateStack
.push(LexerState::TOKEN_STASHED
);
1193 if (Style
.isVerilog() && Tokens
.size() > 0 &&
1194 Tokens
.back()->is(TT_VerilogNumberBase
) &&
1195 FormatTok
->Tok
.isOneOf(tok::identifier
, tok::question
)) {
1196 // Mark the number following a base like `'h?a0` as a number.
1197 FormatTok
->Tok
.setKind(tok::numeric_constant
);
1200 // Now FormatTok is the next non-whitespace token.
1202 StringRef Text
= FormatTok
->TokenText
;
1203 size_t FirstNewlinePos
= Text
.find('\n');
1204 if (FirstNewlinePos
== StringRef::npos
) {
1205 // FIXME: ColumnWidth actually depends on the start column, we need to
1206 // take this into account when the token is moved.
1207 FormatTok
->ColumnWidth
=
1208 encoding::columnWidthWithTabs(Text
, Column
, Style
.TabWidth
, Encoding
);
1209 Column
+= FormatTok
->ColumnWidth
;
1211 FormatTok
->IsMultiline
= true;
1212 // FIXME: ColumnWidth actually depends on the start column, we need to
1213 // take this into account when the token is moved.
1214 FormatTok
->ColumnWidth
= encoding::columnWidthWithTabs(
1215 Text
.substr(0, FirstNewlinePos
), Column
, Style
.TabWidth
, Encoding
);
1217 // The last line of the token always starts in column 0.
1218 // Thus, the length can be precomputed even in the presence of tabs.
1219 FormatTok
->LastLineColumnWidth
= encoding::columnWidthWithTabs(
1220 Text
.substr(Text
.find_last_of('\n') + 1), 0, Style
.TabWidth
, Encoding
);
1221 Column
= FormatTok
->LastLineColumnWidth
;
1224 if (Style
.isCpp()) {
1225 auto it
= Macros
.find(FormatTok
->Tok
.getIdentifierInfo());
1226 if (!(Tokens
.size() > 0 && Tokens
.back()->Tok
.getIdentifierInfo() &&
1227 Tokens
.back()->Tok
.getIdentifierInfo()->getPPKeywordID() ==
1229 it
!= Macros
.end()) {
1230 FormatTok
->setType(it
->second
);
1231 if (it
->second
== TT_IfMacro
) {
1232 // The lexer token currently has type tok::kw_unknown. However, for this
1233 // substitution to be treated correctly in the TokenAnnotator, faking
1234 // the tok value seems to be needed. Not sure if there's a more elegant
1236 FormatTok
->Tok
.setKind(tok::kw_if
);
1238 } else if (FormatTok
->is(tok::identifier
)) {
1239 if (MacroBlockBeginRegex
.match(Text
))
1240 FormatTok
->setType(TT_MacroBlockBegin
);
1241 else if (MacroBlockEndRegex
.match(Text
))
1242 FormatTok
->setType(TT_MacroBlockEnd
);
1249 bool FormatTokenLexer::readRawTokenVerilogSpecific(Token
&Tok
) {
1250 // In Verilog the quote is not a character literal.
1252 // Make the backtick and double backtick identifiers to match against them
1255 // In Verilog an escaped identifier starts with backslash and ends with
1256 // whitespace. Unless that whitespace is an escaped newline. A backslash can
1257 // also begin an escaped newline outside of an escaped identifier. We check
1258 // for that outside of the Regex since we can't use negative lookhead
1259 // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1260 // identifier may have a length of 0 according to Section A.9.3.
1261 // FIXME: If there is an escaped newline in the middle of an escaped
1262 // identifier, allow for pasting the two lines together, But escaped
1263 // identifiers usually occur only in generated code anyway.
1264 static const llvm::Regex
VerilogToken(R
"re(^('|``?|\\(\\)re"
1265 "(\r?\n|\r)|[^[:space:]])*)");
1267 SmallVector
<StringRef
, 4> Matches
;
1268 const char *Start
= Lex
->getBufferLocation();
1269 if (!VerilogToken
.match(StringRef(Start
, Lex
->getBuffer().end() - Start
),
1273 // There is a null byte at the end of the buffer, so we don't have to check
1274 // Start[1] is within the buffer.
1275 if (Start
[0] == '\\' && (Start
[1] == '\r' || Start
[1] == '\n'))
1277 size_t Len
= Matches
[0].size();
1279 // The kind has to be an identifier so we can match it against those defined
1280 // in Keywords. The kind has to be set before the length because the setLength
1281 // function checks that the kind is not an annotation.
1282 Tok
.setKind(tok::raw_identifier
);
1284 Tok
.setLocation(Lex
->getSourceLocation(Start
, Len
));
1285 Tok
.setRawIdentifierData(Start
);
1286 Lex
->seek(Lex
->getCurrentBufferOffset() + Len
, /*IsAtStartofline=*/false);
1290 void FormatTokenLexer::readRawToken(FormatToken
&Tok
) {
1291 // For Verilog, first see if there is a special token, and fall back to the
1292 // normal lexer if there isn't one.
1293 if (!Style
.isVerilog() || !readRawTokenVerilogSpecific(Tok
.Tok
))
1294 Lex
->LexFromRawLexer(Tok
.Tok
);
1295 Tok
.TokenText
= StringRef(SourceMgr
.getCharacterData(Tok
.Tok
.getLocation()),
1296 Tok
.Tok
.getLength());
1297 // For formatting, treat unterminated string literals like normal string
1299 if (Tok
.is(tok::unknown
)) {
1300 if (!Tok
.TokenText
.empty() && Tok
.TokenText
[0] == '"') {
1301 Tok
.Tok
.setKind(tok::string_literal
);
1302 Tok
.IsUnterminatedLiteral
= true;
1303 } else if (Style
.isJavaScript() && Tok
.TokenText
== "''") {
1304 Tok
.Tok
.setKind(tok::string_literal
);
1308 if ((Style
.isJavaScript() || Style
.Language
== FormatStyle::LK_Proto
||
1309 Style
.Language
== FormatStyle::LK_TextProto
) &&
1310 Tok
.is(tok::char_constant
)) {
1311 Tok
.Tok
.setKind(tok::string_literal
);
1314 if (Tok
.is(tok::comment
) && isClangFormatOn(Tok
.TokenText
))
1315 FormattingDisabled
= false;
1317 Tok
.Finalized
= FormattingDisabled
;
1319 if (Tok
.is(tok::comment
) && isClangFormatOff(Tok
.TokenText
))
1320 FormattingDisabled
= true;
1323 void FormatTokenLexer::resetLexer(unsigned Offset
) {
1324 StringRef Buffer
= SourceMgr
.getBufferData(ID
);
1325 LangOpts
= getFormattingLangOpts(Style
);
1326 Lex
.reset(new Lexer(SourceMgr
.getLocForStartOfFile(ID
), LangOpts
,
1327 Buffer
.begin(), Buffer
.begin() + Offset
, Buffer
.end()));
1328 Lex
->SetKeepWhitespaceMode(true);
1329 TrailingWhitespace
= 0;
1332 } // namespace format
1333 } // namespace clang