1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
13 //===----------------------------------------------------------------------===//
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
25 FormatTokenLexer::FormatTokenLexer(
26 const SourceManager
&SourceMgr
, FileID ID
, unsigned Column
,
27 const FormatStyle
&Style
, encoding::Encoding Encoding
,
28 llvm::SpecificBumpPtrAllocator
<FormatToken
> &Allocator
,
29 IdentifierTable
&IdentTable
)
30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL
}),
31 Column(Column
), TrailingWhitespace(0),
32 LangOpts(getFormattingLangOpts(Style
)), SourceMgr(SourceMgr
), ID(ID
),
33 Style(Style
), IdentTable(IdentTable
), Keywords(IdentTable
),
34 Encoding(Encoding
), Allocator(Allocator
), FirstInLineIndex(0),
35 FormattingDisabled(false), MacroBlockBeginRegex(Style
.MacroBlockBegin
),
36 MacroBlockEndRegex(Style
.MacroBlockEnd
) {
37 Lex
.reset(new Lexer(ID
, SourceMgr
.getBufferOrFake(ID
), SourceMgr
, LangOpts
));
38 Lex
->SetKeepWhitespaceMode(true);
40 for (const std::string
&ForEachMacro
: Style
.ForEachMacros
) {
41 auto Identifier
= &IdentTable
.get(ForEachMacro
);
42 Macros
.insert({Identifier
, TT_ForEachMacro
});
44 for (const std::string
&IfMacro
: Style
.IfMacros
) {
45 auto Identifier
= &IdentTable
.get(IfMacro
);
46 Macros
.insert({Identifier
, TT_IfMacro
});
48 for (const std::string
&AttributeMacro
: Style
.AttributeMacros
) {
49 auto Identifier
= &IdentTable
.get(AttributeMacro
);
50 Macros
.insert({Identifier
, TT_AttributeMacro
});
52 for (const std::string
&StatementMacro
: Style
.StatementMacros
) {
53 auto Identifier
= &IdentTable
.get(StatementMacro
);
54 Macros
.insert({Identifier
, TT_StatementMacro
});
56 for (const std::string
&TypenameMacro
: Style
.TypenameMacros
) {
57 auto Identifier
= &IdentTable
.get(TypenameMacro
);
58 Macros
.insert({Identifier
, TT_TypenameMacro
});
60 for (const std::string
&NamespaceMacro
: Style
.NamespaceMacros
) {
61 auto Identifier
= &IdentTable
.get(NamespaceMacro
);
62 Macros
.insert({Identifier
, TT_NamespaceMacro
});
64 for (const std::string
&WhitespaceSensitiveMacro
:
65 Style
.WhitespaceSensitiveMacros
) {
66 auto Identifier
= &IdentTable
.get(WhitespaceSensitiveMacro
);
67 Macros
.insert({Identifier
, TT_UntouchableMacroFunc
});
69 for (const std::string
&StatementAttributeLikeMacro
:
70 Style
.StatementAttributeLikeMacros
) {
71 auto Identifier
= &IdentTable
.get(StatementAttributeLikeMacro
);
72 Macros
.insert({Identifier
, TT_StatementAttributeLikeMacro
});
76 ArrayRef
<FormatToken
*> FormatTokenLexer::lex() {
77 assert(Tokens
.empty());
78 assert(FirstInLineIndex
== 0);
80 Tokens
.push_back(getNextToken());
81 if (Style
.isJavaScript()) {
82 tryParseJSRegexLiteral();
83 handleTemplateStrings();
85 if (Style
.Language
== FormatStyle::LK_TextProto
)
86 tryParsePythonComment();
87 tryMergePreviousTokens();
88 if (Style
.isCSharp()) {
89 // This needs to come after tokens have been merged so that C#
90 // string literals are correctly identified.
91 handleCSharpVerbatimAndInterpolatedStrings();
93 if (Tokens
.back()->NewlinesBefore
> 0 || Tokens
.back()->IsMultiline
)
94 FirstInLineIndex
= Tokens
.size() - 1;
95 } while (Tokens
.back()->isNot(tok::eof
));
99 void FormatTokenLexer::tryMergePreviousTokens() {
100 if (tryMerge_TMacro())
102 if (tryMergeConflictMarkers())
104 if (tryMergeLessLess())
106 if (tryMergeForEach())
108 if (Style
.isCpp() && tryTransformTryUsageForC())
111 if (Style
.isJavaScript() || Style
.isCSharp()) {
112 static const tok::TokenKind NullishCoalescingOperator
[] = {tok::question
,
114 static const tok::TokenKind NullPropagatingOperator
[] = {tok::question
,
116 static const tok::TokenKind FatArrow
[] = {tok::equal
, tok::greater
};
118 if (tryMergeTokens(FatArrow
, TT_FatArrow
))
120 if (tryMergeTokens(NullishCoalescingOperator
, TT_NullCoalescingOperator
)) {
121 // Treat like the "||" operator (as opposed to the ternary ?).
122 Tokens
.back()->Tok
.setKind(tok::pipepipe
);
125 if (tryMergeTokens(NullPropagatingOperator
, TT_NullPropagatingOperator
)) {
126 // Treat like a regular "." access.
127 Tokens
.back()->Tok
.setKind(tok::period
);
130 if (tryMergeNullishCoalescingEqual())
134 if (Style
.isCSharp()) {
135 static const tok::TokenKind CSharpNullConditionalLSquare
[] = {
136 tok::question
, tok::l_square
};
138 if (tryMergeCSharpKeywordVariables())
140 if (tryMergeCSharpStringLiteral())
142 if (tryTransformCSharpForEach())
144 if (tryMergeTokens(CSharpNullConditionalLSquare
,
145 TT_CSharpNullConditionalLSquare
)) {
146 // Treat like a regular "[" operator.
147 Tokens
.back()->Tok
.setKind(tok::l_square
);
152 if (tryMergeNSStringLiteral())
155 if (Style
.isJavaScript()) {
156 static const tok::TokenKind JSIdentity
[] = {tok::equalequal
, tok::equal
};
157 static const tok::TokenKind JSNotIdentity
[] = {tok::exclaimequal
,
159 static const tok::TokenKind JSShiftEqual
[] = {tok::greater
, tok::greater
,
161 static const tok::TokenKind JSExponentiation
[] = {tok::star
, tok::star
};
162 static const tok::TokenKind JSExponentiationEqual
[] = {tok::star
,
164 static const tok::TokenKind JSPipePipeEqual
[] = {tok::pipepipe
, tok::equal
};
165 static const tok::TokenKind JSAndAndEqual
[] = {tok::ampamp
, tok::equal
};
167 // FIXME: Investigate what token type gives the correct operator priority.
168 if (tryMergeTokens(JSIdentity
, TT_BinaryOperator
))
170 if (tryMergeTokens(JSNotIdentity
, TT_BinaryOperator
))
172 if (tryMergeTokens(JSShiftEqual
, TT_BinaryOperator
))
174 if (tryMergeTokens(JSExponentiation
, TT_JsExponentiation
))
176 if (tryMergeTokens(JSExponentiationEqual
, TT_JsExponentiationEqual
)) {
177 Tokens
.back()->Tok
.setKind(tok::starequal
);
180 if (tryMergeTokens(JSAndAndEqual
, TT_JsAndAndEqual
) ||
181 tryMergeTokens(JSPipePipeEqual
, TT_JsPipePipeEqual
)) {
182 // Treat like the "=" assignment operator.
183 Tokens
.back()->Tok
.setKind(tok::equal
);
186 if (tryMergeJSPrivateIdentifier())
190 if (Style
.Language
== FormatStyle::LK_Java
) {
191 static const tok::TokenKind JavaRightLogicalShiftAssign
[] = {
192 tok::greater
, tok::greater
, tok::greaterequal
};
193 if (tryMergeTokens(JavaRightLogicalShiftAssign
, TT_BinaryOperator
))
197 if (Style
.isVerilog()) {
198 // Merge the number following a base like `'h?a0`.
199 if (Tokens
.size() >= 3 && Tokens
.end()[-3]->is(TT_VerilogNumberBase
) &&
200 Tokens
.end()[-2]->is(tok::numeric_constant
) &&
201 Tokens
.back()->isOneOf(tok::numeric_constant
, tok::identifier
,
203 tryMergeTokens(2, TT_Unknown
)) {
207 if (tryMergeTokensAny({{tok::minus
, tok::colon
}, {tok::plus
, tok::colon
}},
211 // Xnor. The combined token is treated as a caret which can also be either a
212 // unary or binary operator. The actual type is determined in
213 // TokenAnnotator. We also check the token length so we know it is not
214 // already a merged token.
215 if (Tokens
.back()->TokenText
.size() == 1 &&
216 tryMergeTokensAny({{tok::caret
, tok::tilde
}, {tok::tilde
, tok::caret
}},
217 TT_BinaryOperator
)) {
218 Tokens
.back()->Tok
.setKind(tok::caret
);
221 // Signed shift and distribution weight.
222 if (tryMergeTokens({tok::less
, tok::less
}, TT_BinaryOperator
)) {
223 Tokens
.back()->Tok
.setKind(tok::lessless
);
226 if (tryMergeTokens({tok::greater
, tok::greater
}, TT_BinaryOperator
)) {
227 Tokens
.back()->Tok
.setKind(tok::greatergreater
);
230 if (tryMergeTokensAny({{tok::lessless
, tok::equal
},
231 {tok::lessless
, tok::lessequal
},
232 {tok::greatergreater
, tok::equal
},
233 {tok::greatergreater
, tok::greaterequal
},
234 {tok::colon
, tok::equal
},
235 {tok::colon
, tok::slash
}},
236 TT_BinaryOperator
)) {
237 Tokens
.back()->ForcedPrecedence
= prec::Assignment
;
240 // Exponentiation, signed shift, case equality, and wildcard equality.
241 if (tryMergeTokensAny({{tok::star
, tok::star
},
242 {tok::lessless
, tok::less
},
243 {tok::greatergreater
, tok::greater
},
244 {tok::exclaimequal
, tok::equal
},
245 {tok::exclaimequal
, tok::question
},
246 {tok::equalequal
, tok::equal
},
247 {tok::equalequal
, tok::question
}},
248 TT_BinaryOperator
)) {
251 // Module paths in specify blocks and implications in properties.
252 if (tryMergeTokensAny({{tok::plusequal
, tok::greater
},
253 {tok::plus
, tok::star
, tok::greater
},
254 {tok::minusequal
, tok::greater
},
255 {tok::minus
, tok::star
, tok::greater
},
256 {tok::less
, tok::arrow
},
257 {tok::equal
, tok::greater
},
258 {tok::star
, tok::greater
},
259 {tok::pipeequal
, tok::greater
},
260 {tok::pipe
, tok::arrow
},
261 {tok::hash
, tok::minus
, tok::hash
},
262 {tok::hash
, tok::equal
, tok::hash
}},
263 TT_BinaryOperator
)) {
264 Tokens
.back()->ForcedPrecedence
= prec::Comma
;
270 bool FormatTokenLexer::tryMergeNSStringLiteral() {
271 if (Tokens
.size() < 2)
273 auto &At
= *(Tokens
.end() - 2);
274 auto &String
= *(Tokens
.end() - 1);
275 if (!At
->is(tok::at
) || !String
->is(tok::string_literal
))
277 At
->Tok
.setKind(tok::string_literal
);
278 At
->TokenText
= StringRef(At
->TokenText
.begin(),
279 String
->TokenText
.end() - At
->TokenText
.begin());
280 At
->ColumnWidth
+= String
->ColumnWidth
;
281 At
->setType(TT_ObjCStringLiteral
);
282 Tokens
.erase(Tokens
.end() - 1);
286 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
287 // Merges #idenfier into a single identifier with the text #identifier
288 // but the token tok::identifier.
289 if (Tokens
.size() < 2)
291 auto &Hash
= *(Tokens
.end() - 2);
292 auto &Identifier
= *(Tokens
.end() - 1);
293 if (!Hash
->is(tok::hash
) || !Identifier
->is(tok::identifier
))
295 Hash
->Tok
.setKind(tok::identifier
);
297 StringRef(Hash
->TokenText
.begin(),
298 Identifier
->TokenText
.end() - Hash
->TokenText
.begin());
299 Hash
->ColumnWidth
+= Identifier
->ColumnWidth
;
300 Hash
->setType(TT_JsPrivateIdentifier
);
301 Tokens
.erase(Tokens
.end() - 1);
305 // Search for verbatim or interpolated string literals @"ABC" or
306 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
307 // prevent splitting of @, $ and ".
308 // Merging of multiline verbatim strings with embedded '"' is handled in
309 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
310 bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
311 if (Tokens
.size() < 2)
314 // Look for @"aaaaaa" or $"aaaaaa".
315 auto &String
= *(Tokens
.end() - 1);
316 if (!String
->is(tok::string_literal
))
319 auto &At
= *(Tokens
.end() - 2);
320 if (!(At
->is(tok::at
) || At
->TokenText
== "$"))
323 if (Tokens
.size() > 2 && At
->is(tok::at
)) {
324 auto &Dollar
= *(Tokens
.end() - 3);
325 if (Dollar
->TokenText
== "$") {
326 // This looks like $@"aaaaa" so we need to combine all 3 tokens.
327 Dollar
->Tok
.setKind(tok::string_literal
);
329 StringRef(Dollar
->TokenText
.begin(),
330 String
->TokenText
.end() - Dollar
->TokenText
.begin());
331 Dollar
->ColumnWidth
+= (At
->ColumnWidth
+ String
->ColumnWidth
);
332 Dollar
->setType(TT_CSharpStringLiteral
);
333 Tokens
.erase(Tokens
.end() - 2);
334 Tokens
.erase(Tokens
.end() - 1);
339 // Convert back into just a string_literal.
340 At
->Tok
.setKind(tok::string_literal
);
341 At
->TokenText
= StringRef(At
->TokenText
.begin(),
342 String
->TokenText
.end() - At
->TokenText
.begin());
343 At
->ColumnWidth
+= String
->ColumnWidth
;
344 At
->setType(TT_CSharpStringLiteral
);
345 Tokens
.erase(Tokens
.end() - 1);
349 // Valid C# attribute targets:
350 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
351 const llvm::StringSet
<> FormatTokenLexer::CSharpAttributeTargets
= {
352 "assembly", "module", "field", "event", "method",
353 "param", "property", "return", "type",
356 bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
357 if (Tokens
.size() < 2)
359 auto &NullishCoalescing
= *(Tokens
.end() - 2);
360 auto &Equal
= *(Tokens
.end() - 1);
361 if (NullishCoalescing
->getType() != TT_NullCoalescingOperator
||
362 !Equal
->is(tok::equal
)) {
365 NullishCoalescing
->Tok
.setKind(tok::equal
); // no '??=' in clang tokens.
366 NullishCoalescing
->TokenText
=
367 StringRef(NullishCoalescing
->TokenText
.begin(),
368 Equal
->TokenText
.end() - NullishCoalescing
->TokenText
.begin());
369 NullishCoalescing
->ColumnWidth
+= Equal
->ColumnWidth
;
370 NullishCoalescing
->setType(TT_NullCoalescingEqual
);
371 Tokens
.erase(Tokens
.end() - 1);
375 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
376 if (Tokens
.size() < 2)
378 auto &At
= *(Tokens
.end() - 2);
379 auto &Keyword
= *(Tokens
.end() - 1);
380 if (!At
->is(tok::at
))
382 if (!Keywords
.isCSharpKeyword(*Keyword
))
385 At
->Tok
.setKind(tok::identifier
);
386 At
->TokenText
= StringRef(At
->TokenText
.begin(),
387 Keyword
->TokenText
.end() - At
->TokenText
.begin());
388 At
->ColumnWidth
+= Keyword
->ColumnWidth
;
389 At
->setType(Keyword
->getType());
390 Tokens
.erase(Tokens
.end() - 1);
394 // In C# transform identifier foreach into kw_foreach
395 bool FormatTokenLexer::tryTransformCSharpForEach() {
396 if (Tokens
.size() < 1)
398 auto &Identifier
= *(Tokens
.end() - 1);
399 if (!Identifier
->is(tok::identifier
))
401 if (Identifier
->TokenText
!= "foreach")
404 Identifier
->setType(TT_ForEachMacro
);
405 Identifier
->Tok
.setKind(tok::kw_for
);
409 bool FormatTokenLexer::tryMergeForEach() {
410 if (Tokens
.size() < 2)
412 auto &For
= *(Tokens
.end() - 2);
413 auto &Each
= *(Tokens
.end() - 1);
414 if (!For
->is(tok::kw_for
))
416 if (!Each
->is(tok::identifier
))
418 if (Each
->TokenText
!= "each")
421 For
->setType(TT_ForEachMacro
);
422 For
->Tok
.setKind(tok::kw_for
);
424 For
->TokenText
= StringRef(For
->TokenText
.begin(),
425 Each
->TokenText
.end() - For
->TokenText
.begin());
426 For
->ColumnWidth
+= Each
->ColumnWidth
;
427 Tokens
.erase(Tokens
.end() - 1);
431 bool FormatTokenLexer::tryTransformTryUsageForC() {
432 if (Tokens
.size() < 2)
434 auto &Try
= *(Tokens
.end() - 2);
435 if (!Try
->is(tok::kw_try
))
437 auto &Next
= *(Tokens
.end() - 1);
438 if (Next
->isOneOf(tok::l_brace
, tok::colon
, tok::hash
, tok::comment
))
441 if (Tokens
.size() > 2) {
442 auto &At
= *(Tokens
.end() - 3);
447 Try
->Tok
.setKind(tok::identifier
);
451 bool FormatTokenLexer::tryMergeLessLess() {
452 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
453 if (Tokens
.size() < 3)
456 auto First
= Tokens
.end() - 3;
457 if (First
[0]->isNot(tok::less
) || First
[1]->isNot(tok::less
))
460 // Only merge if there currently is no whitespace between the two "<".
461 if (First
[1]->hasWhitespaceBefore())
464 auto X
= Tokens
.size() > 3 ? First
[-1] : nullptr;
466 if ((X
&& X
->is(tok::less
)) || Y
->is(tok::less
))
469 // Do not remove a whitespace between the two "<" e.g. "operator< <>".
470 if (X
&& X
->is(tok::kw_operator
) && Y
->is(tok::greater
))
473 First
[0]->Tok
.setKind(tok::lessless
);
474 First
[0]->TokenText
= "<<";
475 First
[0]->ColumnWidth
+= 1;
476 Tokens
.erase(Tokens
.end() - 2);
480 bool FormatTokenLexer::tryMergeTokens(ArrayRef
<tok::TokenKind
> Kinds
,
482 if (Tokens
.size() < Kinds
.size())
485 SmallVectorImpl
<FormatToken
*>::const_iterator First
=
486 Tokens
.end() - Kinds
.size();
487 for (unsigned i
= 0; i
< Kinds
.size(); ++i
)
488 if (!First
[i
]->is(Kinds
[i
]))
491 return tryMergeTokens(Kinds
.size(), NewType
);
494 bool FormatTokenLexer::tryMergeTokens(size_t Count
, TokenType NewType
) {
495 if (Tokens
.size() < Count
)
498 SmallVectorImpl
<FormatToken
*>::const_iterator First
= Tokens
.end() - Count
;
499 unsigned AddLength
= 0;
500 for (size_t i
= 1; i
< Count
; ++i
) {
501 // If there is whitespace separating the token and the previous one,
502 // they should not be merged.
503 if (First
[i
]->hasWhitespaceBefore())
505 AddLength
+= First
[i
]->TokenText
.size();
508 Tokens
.resize(Tokens
.size() - Count
+ 1);
509 First
[0]->TokenText
= StringRef(First
[0]->TokenText
.data(),
510 First
[0]->TokenText
.size() + AddLength
);
511 First
[0]->ColumnWidth
+= AddLength
;
512 First
[0]->setType(NewType
);
516 bool FormatTokenLexer::tryMergeTokensAny(
517 ArrayRef
<ArrayRef
<tok::TokenKind
>> Kinds
, TokenType NewType
) {
518 return llvm::any_of(Kinds
, [this, NewType
](ArrayRef
<tok::TokenKind
> Kinds
) {
519 return tryMergeTokens(Kinds
, NewType
);
523 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
524 bool FormatTokenLexer::precedesOperand(FormatToken
*Tok
) {
525 // NB: This is not entirely correct, as an r_paren can introduce an operand
526 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
527 // corner case to not matter in practice, though.
528 return Tok
->isOneOf(tok::period
, tok::l_paren
, tok::comma
, tok::l_brace
,
529 tok::r_brace
, tok::l_square
, tok::semi
, tok::exclaim
,
530 tok::colon
, tok::question
, tok::tilde
) ||
531 Tok
->isOneOf(tok::kw_return
, tok::kw_do
, tok::kw_case
, tok::kw_throw
,
532 tok::kw_else
, tok::kw_new
, tok::kw_delete
, tok::kw_void
,
533 tok::kw_typeof
, Keywords
.kw_instanceof
, Keywords
.kw_in
) ||
534 Tok
->isBinaryOperator();
537 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken
*Prev
) {
541 // Regex literals can only follow after prefix unary operators, not after
542 // postfix unary operators. If the '++' is followed by a non-operand
543 // introducing token, the slash here is the operand and not the start of a
545 // `!` is an unary prefix operator, but also a post-fix operator that casts
546 // away nullability, so the same check applies.
547 if (Prev
->isOneOf(tok::plusplus
, tok::minusminus
, tok::exclaim
))
548 return Tokens
.size() < 3 || precedesOperand(Tokens
[Tokens
.size() - 3]);
550 // The previous token must introduce an operand location where regex
551 // literals can occur.
552 if (!precedesOperand(Prev
))
558 // Tries to parse a JavaScript Regex literal starting at the current token,
559 // if that begins with a slash and is in a location where JavaScript allows
560 // regex literals. Changes the current token to a regex literal and updates
561 // its text if successful.
562 void FormatTokenLexer::tryParseJSRegexLiteral() {
563 FormatToken
*RegexToken
= Tokens
.back();
564 if (!RegexToken
->isOneOf(tok::slash
, tok::slashequal
))
567 FormatToken
*Prev
= nullptr;
568 for (FormatToken
*FT
: llvm::drop_begin(llvm::reverse(Tokens
))) {
569 // NB: Because previous pointers are not initialized yet, this cannot use
570 // Token.getPreviousNonComment.
571 if (FT
->isNot(tok::comment
)) {
577 if (!canPrecedeRegexLiteral(Prev
))
580 // 'Manually' lex ahead in the current file buffer.
581 const char *Offset
= Lex
->getBufferLocation();
582 const char *RegexBegin
= Offset
- RegexToken
->TokenText
.size();
583 StringRef Buffer
= Lex
->getBuffer();
584 bool InCharacterClass
= false;
585 bool HaveClosingSlash
= false;
586 for (; !HaveClosingSlash
&& Offset
!= Buffer
.end(); ++Offset
) {
587 // Regular expressions are terminated with a '/', which can only be
588 // escaped using '\' or a character class between '[' and ']'.
589 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
592 // Skip the escaped character.
596 InCharacterClass
= true;
599 InCharacterClass
= false;
602 if (!InCharacterClass
)
603 HaveClosingSlash
= true;
608 RegexToken
->setType(TT_RegexLiteral
);
609 // Treat regex literals like other string_literals.
610 RegexToken
->Tok
.setKind(tok::string_literal
);
611 RegexToken
->TokenText
= StringRef(RegexBegin
, Offset
- RegexBegin
);
612 RegexToken
->ColumnWidth
= RegexToken
->TokenText
.size();
614 resetLexer(SourceMgr
.getFileOffset(Lex
->getSourceLocation(Offset
)));
/// Scans [Begin, End) for the terminating '"' of a C# verbatim and/or
/// interpolated string body and returns a pointer to it, or \p End if the
/// string is unterminated. \p Verbatim enables the `""` escape; \p
/// Interpolated makes `{...}` regions (with `{{`/`}}` escapes) opaque.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True if the current character is doubled (e.g. "" or {{ or }}).
  auto Repeated = [&Begin, End]() {
    return Begin + 1 < End && Begin[1] == Begin[0];
  };

  // Look for a terminating '"' in the current file buffer.
  // Make no effort to format code within an interpolated or verbatim string.
  //
  // Interpolated strings could contain { } with " characters inside.
  // $"{x ?? "null"}"
  // should not be split into $"{x ?? ", null, "}" but should be treated as a
  // single string-literal.
  //
  // We opt not to try and format expressions inside {} within a C#
  // interpolated string. Formatting expressions within an interpolated string
  // would require similar work as that done for JavaScript template strings
  // in `handleTemplateStrings()`.
  for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) {
    switch (*Begin) {
    case '\\':
      // In a non-verbatim string '\' escapes the next character.
      if (!Verbatim)
        ++Begin;
      break;
    case '{':
      if (Interpolated) {
        // {{ inside an interpolated string is escaped, so skip it.
        if (Repeated())
          ++Begin;
        else
          ++UnmatchedOpeningBraceCount;
      }
      break;
    case '}':
      if (Interpolated) {
        // }} inside an interpolated string is escaped, so skip it.
        if (Repeated())
          ++Begin;
        else if (UnmatchedOpeningBraceCount > 0)
          --UnmatchedOpeningBraceCount;
        else
          return End;
      }
      break;
    case '"':
      if (UnmatchedOpeningBraceCount > 0)
        break;
      // "" within a verbatim string is an escaped double quote: skip it.
      if (Verbatim && Repeated()) {
        ++Begin;
        break;
      }
      return Begin;
    }
  }

  return End;
}
676 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
677 FormatToken
*CSharpStringLiteral
= Tokens
.back();
679 if (CSharpStringLiteral
->isNot(TT_CSharpStringLiteral
))
682 auto &TokenText
= CSharpStringLiteral
->TokenText
;
684 bool Verbatim
= false;
685 bool Interpolated
= false;
686 if (TokenText
.startswith(R
"($@")")) {
689 } else if (TokenText.startswith(R"(@
")")) {
691 } else if (TokenText
.startswith(R
"($")")) {
695 // Deal with multiline strings.
696 if (!Verbatim && !Interpolated)
699 const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
700 const char *Offset = StrBegin;
701 if (Verbatim && Interpolated)
706 const auto End = Lex->getBuffer().end();
707 Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
709 // Make no attempt to format code properly if a verbatim string is
714 StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
715 TokenText = LiteralText;
717 // Adjust width for potentially multiline string literals.
718 size_t FirstBreak = LiteralText.find('\n');
719 StringRef FirstLineText = FirstBreak == StringRef::npos
721 : LiteralText.substr(0, FirstBreak);
722 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
723 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
725 size_t LastBreak = LiteralText.rfind('\n');
726 if (LastBreak != StringRef::npos) {
727 CSharpStringLiteral->IsMultiline = true;
728 unsigned StartColumn = 0;
729 CSharpStringLiteral->LastLineColumnWidth =
730 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
731 StartColumn, Style.TabWidth, Encoding);
734 assert(Offset < End);
735 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
738 void FormatTokenLexer::handleTemplateStrings() {
739 FormatToken *BacktickToken = Tokens.back();
741 if (BacktickToken->is(tok::l_brace)) {
742 StateStack.push(LexerState::NORMAL);
745 if (BacktickToken->is(tok::r_brace)) {
746 if (StateStack.size() == 1)
749 if (StateStack.top() != LexerState::TEMPLATE_STRING)
751 // If back in TEMPLATE_STRING, fallthrough and continue parsing the
752 } else if (BacktickToken->is(tok::unknown) &&
753 BacktickToken->TokenText == "`
") {
754 StateStack.push(LexerState::TEMPLATE_STRING);
756 return; // Not actually a template
759 // 'Manually' lex ahead in the current file buffer.
760 const char *Offset = Lex->getBufferLocation();
761 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`
"
762 for (; Offset != Lex->getBuffer().end(); ++Offset) {
763 if (Offset[0] == '`') {
767 if (Offset[0] == '\\') {
768 ++Offset; // Skip the escaped character.
769 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
771 // '${' introduces an expression interpolation in the template string.
772 StateStack.push(LexerState::NORMAL);
778 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
779 BacktickToken->setType(TT_TemplateString);
780 BacktickToken->Tok.setKind(tok::string_literal);
781 BacktickToken->TokenText = LiteralText;
783 // Adjust width for potentially multiline string literals.
784 size_t FirstBreak = LiteralText.find('\n');
785 StringRef FirstLineText = FirstBreak == StringRef::npos
787 : LiteralText.substr(0, FirstBreak);
788 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
789 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
790 size_t LastBreak = LiteralText.rfind('\n');
791 if (LastBreak != StringRef::npos) {
792 BacktickToken->IsMultiline = true;
793 unsigned StartColumn = 0; // The template tail spans the entire line.
794 BacktickToken->LastLineColumnWidth =
795 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
796 StartColumn, Style.TabWidth, Encoding);
799 SourceLocation loc = Offset < Lex->getBuffer().end()
800 ? Lex->getSourceLocation(Offset + 1)
801 : SourceMgr.getLocForEndOfFile(ID);
802 resetLexer(SourceMgr.getFileOffset(loc));
805 void FormatTokenLexer::tryParsePythonComment() {
806 FormatToken *HashToken = Tokens.back();
807 if (!HashToken->isOneOf(tok::hash, tok::hashhash))
809 // Turn the remainder of this line into a comment.
810 const char *CommentBegin =
811 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
812 size_t From = CommentBegin - Lex->getBuffer().begin();
813 size_t To = Lex->getBuffer().find_first_of('\n', From);
814 if (To == StringRef::npos)
815 To = Lex->getBuffer().size();
816 size_t Len = To - From;
817 HashToken->setType(TT_LineComment);
818 HashToken->Tok.setKind(tok::comment);
819 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
820 SourceLocation Loc = To < Lex->getBuffer().size()
821 ? Lex->getSourceLocation(CommentBegin + Len)
822 : SourceMgr.getLocForEndOfFile(ID);
823 resetLexer(SourceMgr.getFileOffset(Loc));
826 bool FormatTokenLexer::tryMerge_TMacro() {
827 if (Tokens.size() < 4)
829 FormatToken *Last = Tokens.back();
830 if (!Last->is(tok::r_paren))
833 FormatToken *String = Tokens[Tokens.size() - 2];
834 if (!String->is(tok::string_literal) || String->IsMultiline)
837 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
840 FormatToken *Macro = Tokens[Tokens.size() - 4];
841 if (Macro->TokenText != "_T")
844 const char *Start = Macro->TokenText.data();
845 const char *End = Last->TokenText.data() + Last->TokenText.size();
846 String->TokenText = StringRef(Start, End - Start);
847 String->IsFirst = Macro->IsFirst;
848 String->LastNewlineOffset = Macro->LastNewlineOffset;
849 String->WhitespaceRange = Macro->WhitespaceRange;
850 String->OriginalColumn = Macro->OriginalColumn;
851 String->ColumnWidth = encoding::columnWidthWithTabs(
852 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
853 String->NewlinesBefore = Macro->NewlinesBefore;
854 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
859 Tokens.back() = String;
860 if (FirstInLineIndex >= Tokens.size())
861 FirstInLineIndex = Tokens.size() - 1;
865 bool FormatTokenLexer::tryMergeConflictMarkers() {
866 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
869 // Conflict lines look like:
870 // <marker> <text from the vcs>
872 // >>>>>>> /file/in/file/system at revision 1234
874 // We merge all tokens in a line that starts with a conflict marker
875 // into a single token with a special token type that the unwrapped line
876 // parser will use to correctly rebuild the underlying code.
879 // Get the position of the first token in the line.
880 unsigned FirstInLineOffset;
881 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
882 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
883 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
884 // Calculate the offset of the start of the current line.
885 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
886 if (LineOffset == StringRef::npos)
891 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
893 if (FirstSpace == StringRef::npos)
894 LineStart = Buffer.substr(LineOffset);
896 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
898 TokenType Type = TT_Unknown;
899 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
900 Type = TT_ConflictStart;
901 } else if (LineStart == "|||||||" || LineStart == "=======" ||
902 LineStart == "====") {
903 Type = TT_ConflictAlternative;
904 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
905 Type = TT_ConflictEnd;
908 if (Type != TT_Unknown) {
909 FormatToken *Next = Tokens.back();
911 Tokens.resize(FirstInLineIndex + 1);
912 // We do not need to build a complete token here, as we will skip it
913 // during parsing anyway (as we must not touch whitespace around conflict
915 Tokens.back()->setType(Type);
916 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
918 Tokens.push_back(Next);
925 FormatToken *FormatTokenLexer::getStashedToken() {
926 // Create a synthesized second '>' or '<' token.
927 Token Tok = FormatTok->Tok;
928 StringRef TokenText = FormatTok->TokenText;
930 unsigned OriginalColumn = FormatTok->OriginalColumn;
931 FormatTok = new (Allocator.Allocate()) FormatToken;
932 FormatTok->Tok = Tok;
933 SourceLocation TokLocation =
934 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
935 FormatTok->Tok.setLocation(TokLocation);
936 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
937 FormatTok->TokenText = TokenText;
938 FormatTok->ColumnWidth = 1;
939 FormatTok->OriginalColumn = OriginalColumn + 1;
944 /// Truncate the current token to the new length and make the lexer continue
945 /// from the end of the truncated token. Used for other languages that have
946 /// different token boundaries, like JavaScript in which a comment ends at a
947 /// line break regardless of whether the line break follows a backslash. Also
948 /// used to set the lexer to the end of whitespace if the lexer regards
949 /// whitespace and an unrecognized symbol as one token.
950 void FormatTokenLexer::truncateToken(size_t NewLen) {
951 assert(NewLen <= FormatTok->TokenText.size());
952 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
953 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
954 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
955 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
956 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
958 FormatTok->Tok.setLength(NewLen);
961 /// Count the length of leading whitespace in a token.
962 static size_t countLeadingWhitespace(StringRef Text) {
963 // Basically counting the length matched by this regex.
964 // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
965 // Directly using the regex turned out to be slow. With the regex
966 // version formatting all files in this directory took about 1.25
967 // seconds. This version took about 0.5 seconds.
968 const unsigned char *const Begin = Text.bytes_begin();
969 const unsigned char *const End = Text.bytes_end();
970 const unsigned char *Cur = Begin;
972 if (isspace(Cur[0])) {
974 } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
975 // A '\' followed by a newline always escapes the newline, regardless
976 // of whether there is another '\' before it.
977 // The source has a null byte at the end. So the end of the entire input
978 // isn't reached yet. Also the lexer doesn't break apart an escaped
980 assert(End - Cur >= 2);
982 } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
983 (Cur[3] == '\n' || Cur[3] == '\r')) {
984 // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
985 // characters are quoted individually in this comment because if we write
986 // them together some compilers warn that we have a trigraph in the code.
987 assert(End - Cur >= 4);
996 FormatToken *FormatTokenLexer::getNextToken() {
997 if (StateStack.top() == LexerState::TOKEN_STASHED) {
999 return getStashedToken();
1002 FormatTok = new (Allocator.Allocate()) FormatToken;
1003 readRawToken(*FormatTok);
1004 SourceLocation WhitespaceStart =
1005 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1006 FormatTok->IsFirst = IsFirstToken;
1007 IsFirstToken = false;
1009 // Consume and record whitespace until we find a significant token.
1010 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1011 // followed by a symbol such as backtick. Those symbols may be
1012 // significant in other languages.
1013 unsigned WhitespaceLength = TrailingWhitespace;
1014 while (FormatTok->isNot(tok::eof)) {
1015 auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1016 if (LeadingWhitespace == 0)
1018 if (LeadingWhitespace < FormatTok->TokenText.size())
1019 truncateToken(LeadingWhitespace);
1020 StringRef Text = FormatTok->TokenText;
1021 bool InEscape = false;
1022 for (int i = 0, e = Text.size(); i != e; ++i) {
1025 // If this is a CRLF sequence, break here and the LF will be handled on
1026 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1027 // the same as a single LF.
1028 if (i + 1 < e && Text[i + 1] == '\n')
1032 ++FormatTok->NewlinesBefore;
1034 FormatTok->HasUnescapedNewline = true;
1037 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1049 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1054 // The text was entirely whitespace when this loop was entered. Thus
1055 // this has to be an escape sequence.
1056 assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1057 Text.substr(i, 4) == "\?\?/\r" ||
1058 Text.substr(i, 4) == "\?\?/\n" ||
1059 (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1060 Text.substr(i - 1, 4) == "\?\?/\n")) ||
1061 (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1062 Text.substr(i - 2, 4) == "\?\?/\n")));
1066 // This shouldn't happen.
1071 WhitespaceLength += Text.size();
1072 readRawToken(*FormatTok);
1075 if (FormatTok->is(tok::unknown))
1076 FormatTok->setType(TT_ImplicitStringLiteral);
1078 // JavaScript and Java do not allow to escape the end of the line with a
1079 // backslash. Backslashes are syntax errors in plain source, but can occur in
1080 // comments. When a single line comment ends with a \, it'll cause the next
1081 // line of code to be lexed as a comment, breaking formatting. The code below
1082 // finds comments that contain a backslash followed by a line break, truncates
1083 // the comment token at the backslash, and resets the lexer to restart behind
1085 if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1086 FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
1087 size_t BackslashPos
= FormatTok
->TokenText
.find('\\');
1088 while (BackslashPos
!= StringRef::npos
) {
1089 if (BackslashPos
+ 1 < FormatTok
->TokenText
.size() &&
1090 FormatTok
->TokenText
[BackslashPos
+ 1] == '\n') {
1091 truncateToken(BackslashPos
+ 1);
1094 BackslashPos
= FormatTok
->TokenText
.find('\\', BackslashPos
+ 1);
1098 if (Style
.isVerilog()) {
1099 static const llvm::Regex
NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase
);
1100 SmallVector
<StringRef
, 1> Matches
;
1101 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1102 // And it uses the hash for delays and parameter lists. In order to continue
1103 // using `tok::hash` in other places, the backtick gets marked as the hash
1104 // here. And in order to tell the backtick and hash apart for
1105 // Verilog-specific stuff, the hash becomes an identifier.
1106 if (FormatTok
->is(tok::numeric_constant
)) {
1107 // In Verilog the quote is not part of a number.
1108 auto Quote
= FormatTok
->TokenText
.find('\'');
1109 if (Quote
!= StringRef::npos
)
1110 truncateToken(Quote
);
1111 } else if (FormatTok
->isOneOf(tok::hash
, tok::hashhash
)) {
1112 FormatTok
->Tok
.setKind(tok::raw_identifier
);
1113 } else if (FormatTok
->is(tok::raw_identifier
)) {
1114 if (FormatTok
->TokenText
== "`") {
1115 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1116 FormatTok
->Tok
.setKind(tok::hash
);
1117 } else if (FormatTok
->TokenText
== "``") {
1118 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1119 FormatTok
->Tok
.setKind(tok::hashhash
);
1120 } else if (Tokens
.size() > 0 &&
1121 Tokens
.back()->is(Keywords
.kw_apostrophe
) &&
1122 NumberBase
.match(FormatTok
->TokenText
, &Matches
)) {
1123 // In Verilog in a based number literal like `'b10`, there may be
1124 // whitespace between `'b` and `10`. Therefore we handle the base and
1125 // the rest of the number literal as two tokens. But if there is no
1126 // space in the input code, we need to manually separate the two parts.
1127 truncateToken(Matches
[0].size());
1128 FormatTok
->setFinalizedType(TT_VerilogNumberBase
);
1133 FormatTok
->WhitespaceRange
= SourceRange(
1134 WhitespaceStart
, WhitespaceStart
.getLocWithOffset(WhitespaceLength
));
1136 FormatTok
->OriginalColumn
= Column
;
1138 TrailingWhitespace
= 0;
1139 if (FormatTok
->is(tok::comment
)) {
1140 // FIXME: Add the trimmed whitespace to Column.
1141 StringRef UntrimmedText
= FormatTok
->TokenText
;
1142 FormatTok
->TokenText
= FormatTok
->TokenText
.rtrim(" \t\v\f");
1143 TrailingWhitespace
= UntrimmedText
.size() - FormatTok
->TokenText
.size();
1144 } else if (FormatTok
->is(tok::raw_identifier
)) {
1145 IdentifierInfo
&Info
= IdentTable
.get(FormatTok
->TokenText
);
1146 FormatTok
->Tok
.setIdentifierInfo(&Info
);
1147 FormatTok
->Tok
.setKind(Info
.getTokenID());
1148 if (Style
.Language
== FormatStyle::LK_Java
&&
1149 FormatTok
->isOneOf(tok::kw_struct
, tok::kw_union
, tok::kw_delete
,
1150 tok::kw_operator
)) {
1151 FormatTok
->Tok
.setKind(tok::identifier
);
1152 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1153 } else if (Style
.isJavaScript() &&
1154 FormatTok
->isOneOf(tok::kw_struct
, tok::kw_union
,
1155 tok::kw_operator
)) {
1156 FormatTok
->Tok
.setKind(tok::identifier
);
1157 FormatTok
->Tok
.setIdentifierInfo(nullptr);
1159 } else if (FormatTok
->is(tok::greatergreater
)) {
1160 FormatTok
->Tok
.setKind(tok::greater
);
1161 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, 1);
1163 StateStack
.push(LexerState::TOKEN_STASHED
);
1164 } else if (FormatTok
->is(tok::lessless
)) {
1165 FormatTok
->Tok
.setKind(tok::less
);
1166 FormatTok
->TokenText
= FormatTok
->TokenText
.substr(0, 1);
1168 StateStack
.push(LexerState::TOKEN_STASHED
);
1171 if (Style
.isVerilog() && Tokens
.size() > 0 &&
1172 Tokens
.back()->is(TT_VerilogNumberBase
) &&
1173 FormatTok
->Tok
.isOneOf(tok::identifier
, tok::question
)) {
1174 // Mark the number following a base like `'h?a0` as a number.
1175 FormatTok
->Tok
.setKind(tok::numeric_constant
);
1178 // Now FormatTok is the next non-whitespace token.
1180 StringRef Text
= FormatTok
->TokenText
;
1181 size_t FirstNewlinePos
= Text
.find('\n');
1182 if (FirstNewlinePos
== StringRef::npos
) {
1183 // FIXME: ColumnWidth actually depends on the start column, we need to
1184 // take this into account when the token is moved.
1185 FormatTok
->ColumnWidth
=
1186 encoding::columnWidthWithTabs(Text
, Column
, Style
.TabWidth
, Encoding
);
1187 Column
+= FormatTok
->ColumnWidth
;
1189 FormatTok
->IsMultiline
= true;
1190 // FIXME: ColumnWidth actually depends on the start column, we need to
1191 // take this into account when the token is moved.
1192 FormatTok
->ColumnWidth
= encoding::columnWidthWithTabs(
1193 Text
.substr(0, FirstNewlinePos
), Column
, Style
.TabWidth
, Encoding
);
1195 // The last line of the token always starts in column 0.
1196 // Thus, the length can be precomputed even in the presence of tabs.
1197 FormatTok
->LastLineColumnWidth
= encoding::columnWidthWithTabs(
1198 Text
.substr(Text
.find_last_of('\n') + 1), 0, Style
.TabWidth
, Encoding
);
1199 Column
= FormatTok
->LastLineColumnWidth
;
1202 if (Style
.isCpp()) {
1203 auto it
= Macros
.find(FormatTok
->Tok
.getIdentifierInfo());
1204 if (!(Tokens
.size() > 0 && Tokens
.back()->Tok
.getIdentifierInfo() &&
1205 Tokens
.back()->Tok
.getIdentifierInfo()->getPPKeywordID() ==
1207 it
!= Macros
.end()) {
1208 FormatTok
->setType(it
->second
);
1209 if (it
->second
== TT_IfMacro
) {
1210 // The lexer token currently has type tok::kw_unknown. However, for this
1211 // substitution to be treated correctly in the TokenAnnotator, faking
1212 // the tok value seems to be needed. Not sure if there's a more elegant
1214 FormatTok
->Tok
.setKind(tok::kw_if
);
1216 } else if (FormatTok
->is(tok::identifier
)) {
1217 if (MacroBlockBeginRegex
.match(Text
))
1218 FormatTok
->setType(TT_MacroBlockBegin
);
1219 else if (MacroBlockEndRegex
.match(Text
))
1220 FormatTok
->setType(TT_MacroBlockEnd
);
1227 bool FormatTokenLexer::readRawTokenVerilogSpecific(Token
&Tok
) {
1228 // In Verilog the quote is not a character literal.
1230 // Make the backtick and double backtick identifiers to match against them
1233 // In Verilog an escaped identifier starts with backslash and ends with
1234 // whitespace. Unless that whitespace is an escaped newline. A backslash can
1235 // also begin an escaped newline outside of an escaped identifier. We check
1236 // for that outside of the Regex since we can't use negative lookhead
1237 // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1238 // identifier may have a length of 0 according to Section A.9.3.
1239 // FIXME: If there is an escaped newline in the middle of an escaped
1240 // identifier, allow for pasting the two lines together, But escaped
1241 // identifiers usually occur only in generated code anyway.
1242 static const llvm::Regex
VerilogToken(R
"re(^('|``?|\\(\\)re"
1243 "(\r?\n|\r)|[^[:space:]])*)");
1245 SmallVector
<StringRef
, 4> Matches
;
1246 const char *Start
= Lex
->getBufferLocation();
1247 if (!VerilogToken
.match(StringRef(Start
, Lex
->getBuffer().end() - Start
),
1251 // There is a null byte at the end of the buffer, so we don't have to check
1252 // Start[1] is within the buffer.
1253 if (Start
[0] == '\\' && (Start
[1] == '\r' || Start
[1] == '\n'))
1255 size_t Len
= Matches
[0].size();
1257 // The kind has to be an identifier so we can match it against those defined
1258 // in Keywords. The kind has to be set before the length because the setLength
1259 // function checks that the kind is not an annotation.
1260 Tok
.setKind(tok::raw_identifier
);
1262 Tok
.setLocation(Lex
->getSourceLocation(Start
, Len
));
1263 Tok
.setRawIdentifierData(Start
);
1264 Lex
->seek(Lex
->getCurrentBufferOffset() + Len
, /*IsAtStartofline=*/false);
1268 void FormatTokenLexer::readRawToken(FormatToken
&Tok
) {
1269 // For Verilog, first see if there is a special token, and fall back to the
1270 // normal lexer if there isn't one.
1271 if (!Style
.isVerilog() || !readRawTokenVerilogSpecific(Tok
.Tok
))
1272 Lex
->LexFromRawLexer(Tok
.Tok
);
1273 Tok
.TokenText
= StringRef(SourceMgr
.getCharacterData(Tok
.Tok
.getLocation()),
1274 Tok
.Tok
.getLength());
1275 // For formatting, treat unterminated string literals like normal string
1277 if (Tok
.is(tok::unknown
)) {
1278 if (!Tok
.TokenText
.empty() && Tok
.TokenText
[0] == '"') {
1279 Tok
.Tok
.setKind(tok::string_literal
);
1280 Tok
.IsUnterminatedLiteral
= true;
1281 } else if (Style
.isJavaScript() && Tok
.TokenText
== "''") {
1282 Tok
.Tok
.setKind(tok::string_literal
);
1286 if ((Style
.isJavaScript() || Style
.Language
== FormatStyle::LK_Proto
||
1287 Style
.Language
== FormatStyle::LK_TextProto
) &&
1288 Tok
.is(tok::char_constant
)) {
1289 Tok
.Tok
.setKind(tok::string_literal
);
1292 if (Tok
.is(tok::comment
) && (Tok
.TokenText
== "// clang-format on" ||
1293 Tok
.TokenText
== "/* clang-format on */")) {
1294 FormattingDisabled
= false;
1297 Tok
.Finalized
= FormattingDisabled
;
1299 if (Tok
.is(tok::comment
) && (Tok
.TokenText
== "// clang-format off" ||
1300 Tok
.TokenText
== "/* clang-format off */")) {
1301 FormattingDisabled
= true;
1305 void FormatTokenLexer::resetLexer(unsigned Offset
) {
1306 StringRef Buffer
= SourceMgr
.getBufferData(ID
);
1307 LangOpts
= getFormattingLangOpts(Style
);
1308 Lex
.reset(new Lexer(SourceMgr
.getLocForStartOfFile(ID
), LangOpts
,
1309 Buffer
.begin(), Buffer
.begin() + Offset
, Buffer
.end()));
1310 Lex
->SetKeepWhitespaceMode(true);
1311 TrailingWhitespace
= 0;
1314 } // namespace format
1315 } // namespace clang