//===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the TokenConcatenation class.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/TokenConcatenation.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Lex/Preprocessor.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstring>
using namespace clang;

20 /// IsStringPrefix - Return true if Str is a string prefix.
21 /// 'L', 'u', 'U', or 'u8'. Including raw versions.
22 static bool IsStringPrefix(StringRef Str
, bool CPlusPlus11
) {
25 (CPlusPlus11
&& (Str
[0] == 'u' || Str
[0] == 'U' || Str
[0] == 'R'))) {
28 return true; // "L", "u", "U", and "R"
30 // Check for raw flavors. Need to make sure the first character wasn't
31 // already R. Need CPlusPlus11 check for "LR".
32 if (Str
[1] == 'R' && Str
[0] != 'R' && Str
.size() == 2 && CPlusPlus11
)
33 return true; // "LR", "uR", "UR"
35 // Check for "u8" and "u8R"
36 if (Str
[0] == 'u' && Str
[1] == '8') {
37 if (Str
.size() == 2) return true; // "u8"
38 if (Str
.size() == 3 && Str
[2] == 'R') return true; // "u8R"
45 /// IsIdentifierStringPrefix - Return true if the spelling of the token
46 /// is literally 'L', 'u', 'U', or 'u8'. Including raw versions.
47 bool TokenConcatenation::IsIdentifierStringPrefix(const Token
&Tok
) const {
48 const LangOptions
&LangOpts
= PP
.getLangOpts();
50 if (!Tok
.needsCleaning()) {
51 if (Tok
.getLength() < 1 || Tok
.getLength() > 3)
53 SourceManager
&SM
= PP
.getSourceManager();
54 const char *Ptr
= SM
.getCharacterData(SM
.getSpellingLoc(Tok
.getLocation()));
55 return IsStringPrefix(StringRef(Ptr
, Tok
.getLength()),
56 LangOpts
.CPlusPlus11
);
59 if (Tok
.getLength() < 256) {
61 const char *TokPtr
= Buffer
;
62 unsigned length
= PP
.getSpelling(Tok
, TokPtr
);
63 return IsStringPrefix(StringRef(TokPtr
, length
), LangOpts
.CPlusPlus11
);
66 return IsStringPrefix(StringRef(PP
.getSpelling(Tok
)), LangOpts
.CPlusPlus11
);
69 TokenConcatenation::TokenConcatenation(const Preprocessor
&pp
) : PP(pp
) {
70 memset(TokenInfo
, 0, sizeof(TokenInfo
));
72 // These tokens have custom code in AvoidConcat.
73 TokenInfo
[tok::identifier
] |= aci_custom
;
74 TokenInfo
[tok::numeric_constant
] |= aci_custom_firstchar
;
75 TokenInfo
[tok::period
] |= aci_custom_firstchar
;
76 TokenInfo
[tok::amp
] |= aci_custom_firstchar
;
77 TokenInfo
[tok::plus
] |= aci_custom_firstchar
;
78 TokenInfo
[tok::minus
] |= aci_custom_firstchar
;
79 TokenInfo
[tok::slash
] |= aci_custom_firstchar
;
80 TokenInfo
[tok::less
] |= aci_custom_firstchar
;
81 TokenInfo
[tok::greater
] |= aci_custom_firstchar
;
82 TokenInfo
[tok::pipe
] |= aci_custom_firstchar
;
83 TokenInfo
[tok::percent
] |= aci_custom_firstchar
;
84 TokenInfo
[tok::colon
] |= aci_custom_firstchar
;
85 TokenInfo
[tok::hash
] |= aci_custom_firstchar
;
86 TokenInfo
[tok::arrow
] |= aci_custom_firstchar
;
88 // These tokens have custom code in C++11 mode.
89 if (PP
.getLangOpts().CPlusPlus11
) {
90 TokenInfo
[tok::string_literal
] |= aci_custom
;
91 TokenInfo
[tok::wide_string_literal
] |= aci_custom
;
92 TokenInfo
[tok::utf8_string_literal
] |= aci_custom
;
93 TokenInfo
[tok::utf16_string_literal
] |= aci_custom
;
94 TokenInfo
[tok::utf32_string_literal
] |= aci_custom
;
95 TokenInfo
[tok::char_constant
] |= aci_custom
;
96 TokenInfo
[tok::wide_char_constant
] |= aci_custom
;
97 TokenInfo
[tok::utf16_char_constant
] |= aci_custom
;
98 TokenInfo
[tok::utf32_char_constant
] |= aci_custom
;
101 // These tokens have custom code in C++17 mode.
102 if (PP
.getLangOpts().CPlusPlus17
)
103 TokenInfo
[tok::utf8_char_constant
] |= aci_custom
;
105 // These tokens have custom code in C++2a mode.
106 if (PP
.getLangOpts().CPlusPlus20
)
107 TokenInfo
[tok::lessequal
] |= aci_custom_firstchar
;
109 // These tokens change behavior if followed by an '='.
110 TokenInfo
[tok::amp
] |= aci_avoid_equal
; // &=
111 TokenInfo
[tok::plus
] |= aci_avoid_equal
; // +=
112 TokenInfo
[tok::minus
] |= aci_avoid_equal
; // -=
113 TokenInfo
[tok::slash
] |= aci_avoid_equal
; // /=
114 TokenInfo
[tok::less
] |= aci_avoid_equal
; // <=
115 TokenInfo
[tok::greater
] |= aci_avoid_equal
; // >=
116 TokenInfo
[tok::pipe
] |= aci_avoid_equal
; // |=
117 TokenInfo
[tok::percent
] |= aci_avoid_equal
; // %=
118 TokenInfo
[tok::star
] |= aci_avoid_equal
; // *=
119 TokenInfo
[tok::exclaim
] |= aci_avoid_equal
; // !=
120 TokenInfo
[tok::lessless
] |= aci_avoid_equal
; // <<=
121 TokenInfo
[tok::greatergreater
] |= aci_avoid_equal
; // >>=
122 TokenInfo
[tok::caret
] |= aci_avoid_equal
; // ^=
123 TokenInfo
[tok::equal
] |= aci_avoid_equal
; // ==
126 /// GetFirstChar - Get the first character of the token \arg Tok,
127 /// avoiding calls to getSpelling where possible.
128 static char GetFirstChar(const Preprocessor
&PP
, const Token
&Tok
) {
129 if (IdentifierInfo
*II
= Tok
.getIdentifierInfo()) {
130 // Avoid spelling identifiers, the most common form of token.
131 return II
->getNameStart()[0];
132 } else if (!Tok
.needsCleaning()) {
133 if (Tok
.isLiteral() && Tok
.getLiteralData()) {
134 return *Tok
.getLiteralData();
136 SourceManager
&SM
= PP
.getSourceManager();
137 return *SM
.getCharacterData(SM
.getSpellingLoc(Tok
.getLocation()));
139 } else if (Tok
.getLength() < 256) {
141 const char *TokPtr
= Buffer
;
142 PP
.getSpelling(Tok
, TokPtr
);
145 return PP
.getSpelling(Tok
)[0];
149 /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause
150 /// the two individual tokens to be lexed as a single token, return true
151 /// (which causes a space to be printed between them). This allows the output
152 /// of -E mode to be lexed to the same token stream as lexing the input
155 /// This code must conservatively return true if it doesn't want to be 100%
156 /// accurate. This will cause the output to include extra space characters,
157 /// but the resulting output won't have incorrect concatenations going on.
158 /// Examples include "..", which we print with a space between, because we
159 /// don't want to track enough to tell "x.." from "...".
160 bool TokenConcatenation::AvoidConcat(const Token
&PrevPrevTok
,
161 const Token
&PrevTok
,
162 const Token
&Tok
) const {
163 // Conservatively assume that every annotation token that has a printable
164 // form requires whitespace.
165 if (PrevTok
.isAnnotation())
168 // First, check to see if the tokens were directly adjacent in the original
169 // source. If they were, it must be okay to stick them together: if there
170 // were an issue, the tokens would have been lexed differently.
171 SourceManager
&SM
= PP
.getSourceManager();
172 SourceLocation PrevSpellLoc
= SM
.getSpellingLoc(PrevTok
.getLocation());
173 SourceLocation SpellLoc
= SM
.getSpellingLoc(Tok
.getLocation());
174 if (PrevSpellLoc
.getLocWithOffset(PrevTok
.getLength()) == SpellLoc
)
177 tok::TokenKind PrevKind
= PrevTok
.getKind();
178 if (!PrevTok
.isAnnotation() && PrevTok
.getIdentifierInfo())
179 PrevKind
= tok::identifier
; // Language keyword or named operator.
181 // Look up information on when we should avoid concatenation with prevtok.
182 unsigned ConcatInfo
= TokenInfo
[PrevKind
];
184 // If prevtok never causes a problem for anything after it, return quickly.
185 if (ConcatInfo
== 0) return false;
187 if (ConcatInfo
& aci_avoid_equal
) {
188 // If the next token is '=' or '==', avoid concatenation.
189 if (Tok
.isOneOf(tok::equal
, tok::equalequal
))
191 ConcatInfo
&= ~aci_avoid_equal
;
193 if (Tok
.isAnnotation()) {
194 // Modules annotation can show up when generated automatically for includes.
195 assert(Tok
.isOneOf(tok::annot_module_include
, tok::annot_module_begin
,
196 tok::annot_module_end
) &&
197 "unexpected annotation in AvoidConcat");
204 // Basic algorithm: we look at the first character of the second token, and
205 // determine whether it, if appended to the first token, would form (or
206 // would contribute) to a larger token if concatenated.
208 if (ConcatInfo
& aci_custom
) {
209 // If the token does not need to know the first character, don't get it.
211 FirstChar
= GetFirstChar(PP
, Tok
);
216 llvm_unreachable("InitAvoidConcatTokenInfo built wrong");
218 case tok::raw_identifier
:
219 llvm_unreachable("tok::raw_identifier in non-raw lexing mode!");
221 case tok::string_literal
:
222 case tok::wide_string_literal
:
223 case tok::utf8_string_literal
:
224 case tok::utf16_string_literal
:
225 case tok::utf32_string_literal
:
226 case tok::char_constant
:
227 case tok::wide_char_constant
:
228 case tok::utf8_char_constant
:
229 case tok::utf16_char_constant
:
230 case tok::utf32_char_constant
:
231 if (!PP
.getLangOpts().CPlusPlus11
)
234 // In C++11, a string or character literal followed by an identifier is a
236 if (Tok
.getIdentifierInfo())
239 // A ud-suffix is an identifier. If the previous token ends with one, treat
240 // it as an identifier.
241 if (!PrevTok
.hasUDSuffix())
244 case tok::identifier
: // id+id or id+number or id+L"foo".
245 // id+'.'... will not append.
246 if (Tok
.is(tok::numeric_constant
))
247 return GetFirstChar(PP
, Tok
) != '.';
249 if (Tok
.getIdentifierInfo() ||
250 Tok
.isOneOf(tok::wide_string_literal
, tok::utf8_string_literal
,
251 tok::utf16_string_literal
, tok::utf32_string_literal
,
252 tok::wide_char_constant
, tok::utf8_char_constant
,
253 tok::utf16_char_constant
, tok::utf32_char_constant
))
256 // If this isn't identifier + string, we're done.
257 if (Tok
.isNot(tok::char_constant
) && Tok
.isNot(tok::string_literal
))
260 // Otherwise, this is a narrow character or string. If the *identifier*
261 // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo".
262 return IsIdentifierStringPrefix(PrevTok
);
264 case tok::numeric_constant
:
265 return isPreprocessingNumberBody(FirstChar
) ||
266 FirstChar
== '+' || FirstChar
== '-';
267 case tok::period
: // ..., .*, .1234
268 return (FirstChar
== '.' && PrevPrevTok
.is(tok::period
)) ||
269 isDigit(FirstChar
) ||
270 (PP
.getLangOpts().CPlusPlus
&& FirstChar
== '*');
272 return FirstChar
== '&';
273 case tok::plus
: // ++
274 return FirstChar
== '+';
275 case tok::minus
: // --, ->, ->*
276 return FirstChar
== '-' || FirstChar
== '>';
277 case tok::slash
: //, /*, //
278 return FirstChar
== '*' || FirstChar
== '/';
279 case tok::less
: // <<, <<=, <:, <%
280 return FirstChar
== '<' || FirstChar
== ':' || FirstChar
== '%';
281 case tok::greater
: // >>, >>=
282 return FirstChar
== '>';
283 case tok::pipe
: // ||
284 return FirstChar
== '|';
285 case tok::percent
: // %>, %:
286 return FirstChar
== '>' || FirstChar
== ':';
287 case tok::colon
: // ::, :>
288 return FirstChar
== '>' ||
289 (PP
.getLangOpts().CPlusPlus
&& FirstChar
== ':');
290 case tok::hash
: // ##, #@, %:%:
291 return FirstChar
== '#' || FirstChar
== '@' || FirstChar
== '%';
292 case tok::arrow
: // ->*
293 return PP
.getLangOpts().CPlusPlus
&& FirstChar
== '*';
294 case tok::lessequal
: // <=> (C++2a)
295 return PP
.getLangOpts().CPlusPlus20
&& FirstChar
== '>';