1 //===--- HeaderIncludes.cpp - Insert/Delete #includes --*- C++ -*----------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "clang/Tooling/Inclusions/HeaderIncludes.h"
10 #include "clang/Basic/FileManager.h"
11 #include "clang/Basic/SourceManager.h"
12 #include "clang/Lex/Lexer.h"
13 #include "llvm/Support/FormatVariadic.h"
14 #include "llvm/Support/Path.h"
21 LangOptions
createLangOpts() {
23 LangOpts
.CPlusPlus
= 1;
24 LangOpts
.CPlusPlus11
= 1;
25 LangOpts
.CPlusPlus14
= 1;
26 LangOpts
.LineComment
= 1;
27 LangOpts
.CXXOperatorNames
= 1;
30 LangOpts
.MicrosoftExt
= 1; // To get kw___try, kw___finally.
31 LangOpts
.DeclSpecKeyword
= 1; // To get __declspec.
32 LangOpts
.WChar
= 1; // To get wchar_t
36 // Returns the offset after skipping a sequence of tokens, matched by \p
37 // GetOffsetAfterSequence, from the start of the code.
38 // \p GetOffsetAfterSequence should be a function that matches a sequence of
39 // tokens and returns an offset after the sequence.
40 unsigned getOffsetAfterTokenSequence(
41 StringRef FileName
, StringRef Code
, const IncludeStyle
&Style
,
42 llvm::function_ref
<unsigned(const SourceManager
&, Lexer
&, Token
&)>
43 GetOffsetAfterSequence
) {
44 SourceManagerForFile
VirtualSM(FileName
, Code
);
45 SourceManager
&SM
= VirtualSM
.get();
46 LangOptions LangOpts
= createLangOpts();
47 Lexer
Lex(SM
.getMainFileID(), SM
.getBufferOrFake(SM
.getMainFileID()), SM
,
50 // Get the first token.
51 Lex
.LexFromRawLexer(Tok
);
52 return GetOffsetAfterSequence(SM
, Lex
, Tok
);
55 // Check if a sequence of tokens is like "#<Name> <raw_identifier>". If it is,
56 // \p Tok will be the token after this directive; otherwise, it can be any token
57 // after the given \p Tok (including \p Tok). If \p RawIDName is provided, the
58 // (second) raw_identifier name is checked.
59 bool checkAndConsumeDirectiveWithName(
60 Lexer
&Lex
, StringRef Name
, Token
&Tok
,
61 std::optional
<StringRef
> RawIDName
= std::nullopt
) {
62 bool Matched
= Tok
.is(tok::hash
) && !Lex
.LexFromRawLexer(Tok
) &&
63 Tok
.is(tok::raw_identifier
) &&
64 Tok
.getRawIdentifier() == Name
&& !Lex
.LexFromRawLexer(Tok
) &&
65 Tok
.is(tok::raw_identifier
) &&
66 (!RawIDName
|| Tok
.getRawIdentifier() == *RawIDName
);
68 Lex
.LexFromRawLexer(Tok
);
72 void skipComments(Lexer
&Lex
, Token
&Tok
) {
73 while (Tok
.is(tok::comment
))
74 if (Lex
.LexFromRawLexer(Tok
))
78 // Returns the offset after header guard directives and any comments
79 // before/after header guards (e.g. #ifndef/#define pair, #pragma once). If no
80 // header guard is present in the code, this will return the offset after
81 // skipping all comments from the start of the code.
82 unsigned getOffsetAfterHeaderGuardsAndComments(StringRef FileName
,
84 const IncludeStyle
&Style
) {
85 // \p Consume returns location after header guard or 0 if no header guard is
87 auto ConsumeHeaderGuardAndComment
=
88 [&](std::function
<unsigned(const SourceManager
&SM
, Lexer
&Lex
,
91 return getOffsetAfterTokenSequence(
92 FileName
, Code
, Style
,
93 [&Consume
](const SourceManager
&SM
, Lexer
&Lex
, Token Tok
) {
94 skipComments(Lex
, Tok
);
95 unsigned InitialOffset
= SM
.getFileOffset(Tok
.getLocation());
96 return std::max(InitialOffset
, Consume(SM
, Lex
, Tok
));
101 ConsumeHeaderGuardAndComment(
102 [](const SourceManager
&SM
, Lexer
&Lex
, Token Tok
) -> unsigned {
103 if (checkAndConsumeDirectiveWithName(Lex
, "ifndef", Tok
)) {
104 skipComments(Lex
, Tok
);
105 if (checkAndConsumeDirectiveWithName(Lex
, "define", Tok
) &&
106 Tok
.isAtStartOfLine())
107 return SM
.getFileOffset(Tok
.getLocation());
112 ConsumeHeaderGuardAndComment(
113 [](const SourceManager
&SM
, Lexer
&Lex
, Token Tok
) -> unsigned {
114 if (checkAndConsumeDirectiveWithName(Lex
, "pragma", Tok
,
116 return SM
.getFileOffset(Tok
.getLocation());
121 // Check if a sequence of tokens is like
122 // "#include ("header.h" | <header.h>)".
123 // If it is, \p Tok will be the token after this directive; otherwise, it can be
124 // any token after the given \p Tok (including \p Tok).
125 bool checkAndConsumeInclusiveDirective(Lexer
&Lex
, Token
&Tok
) {
126 auto Matched
= [&]() {
127 Lex
.LexFromRawLexer(Tok
);
130 if (Tok
.is(tok::hash
) && !Lex
.LexFromRawLexer(Tok
) &&
131 Tok
.is(tok::raw_identifier
) && Tok
.getRawIdentifier() == "include") {
132 if (Lex
.LexFromRawLexer(Tok
))
134 if (Tok
.is(tok::string_literal
))
136 if (Tok
.is(tok::less
)) {
137 while (!Lex
.LexFromRawLexer(Tok
) && Tok
.isNot(tok::greater
)) {
139 if (Tok
.is(tok::greater
))
146 // Returns the offset of the last #include directive after which a new
147 // #include can be inserted. This ignores #include's after the #include block(s)
148 // in the beginning of a file to avoid inserting headers into code sections
149 // where new #include's should not be added by default.
150 // These code sections include:
151 // - raw string literals (containing #include).
153 // - Special #include's among declarations (e.g. functions).
155 // If no #include after which a new #include can be inserted, this returns the
156 // offset after skipping all comments from the start of the code.
157 // Inserting after an #include is not allowed if it comes after code that is not
158 // #include (e.g. pre-processing directive that is not #include, declarations).
159 unsigned getMaxHeaderInsertionOffset(StringRef FileName
, StringRef Code
,
160 const IncludeStyle
&Style
) {
161 return getOffsetAfterTokenSequence(
162 FileName
, Code
, Style
,
163 [](const SourceManager
&SM
, Lexer
&Lex
, Token Tok
) {
164 skipComments(Lex
, Tok
);
165 unsigned MaxOffset
= SM
.getFileOffset(Tok
.getLocation());
166 while (checkAndConsumeInclusiveDirective(Lex
, Tok
))
167 MaxOffset
= SM
.getFileOffset(Tok
.getLocation());
172 inline StringRef
trimInclude(StringRef IncludeName
) {
173 return IncludeName
.trim("\"<>");
176 const char IncludeRegexPattern
[] =
177 R
"(^[\t\ ]*#[\t\ ]*(import|include)[^"<]*(["<][^">]*[">]))";
179 // The filename of Path excluding extension.
180 // Used to match implementation with headers, this differs from sys::path::stem:
181 // - in names with multiple dots (foo.cu.cc) it terminates at the *first*
182 // - an empty stem is never returned: /foo/.bar.x => .bar
183 // - we don't bother to handle . and .. specially
184 StringRef
matchingStem(llvm::StringRef Path
) {
185 StringRef Name
= llvm::sys::path::filename(Path
);
186 return Name
.substr(0, Name
.find('.', 1));
189 } // anonymous namespace
191 IncludeCategoryManager::IncludeCategoryManager(const IncludeStyle
&Style
,
193 : Style(Style
), FileName(FileName
) {
194 for (const auto &Category
: Style
.IncludeCategories
) {
195 CategoryRegexs
.emplace_back(Category
.Regex
, Category
.RegexIsCaseSensitive
196 ? llvm::Regex::NoFlags
197 : llvm::Regex::IgnoreCase
);
199 IsMainFile
= FileName
.ends_with(".c") || FileName
.ends_with(".cc") ||
200 FileName
.ends_with(".cpp") || FileName
.ends_with(".c++") ||
201 FileName
.ends_with(".cxx") || FileName
.ends_with(".m") ||
202 FileName
.ends_with(".mm");
203 if (!Style
.IncludeIsMainSourceRegex
.empty()) {
204 llvm::Regex
MainFileRegex(Style
.IncludeIsMainSourceRegex
);
205 IsMainFile
|= MainFileRegex
.match(FileName
);
209 int IncludeCategoryManager::getIncludePriority(StringRef IncludeName
,
210 bool CheckMainHeader
) const {
212 for (unsigned i
= 0, e
= CategoryRegexs
.size(); i
!= e
; ++i
)
213 if (CategoryRegexs
[i
].match(IncludeName
)) {
214 Ret
= Style
.IncludeCategories
[i
].Priority
;
217 if (CheckMainHeader
&& IsMainFile
&& Ret
> 0 && isMainHeader(IncludeName
))
222 int IncludeCategoryManager::getSortIncludePriority(StringRef IncludeName
,
223 bool CheckMainHeader
) const {
225 for (unsigned i
= 0, e
= CategoryRegexs
.size(); i
!= e
; ++i
)
226 if (CategoryRegexs
[i
].match(IncludeName
)) {
227 Ret
= Style
.IncludeCategories
[i
].SortPriority
;
229 Ret
= Style
.IncludeCategories
[i
].Priority
;
232 if (CheckMainHeader
&& IsMainFile
&& Ret
> 0 && isMainHeader(IncludeName
))
236 bool IncludeCategoryManager::isMainHeader(StringRef IncludeName
) const {
237 if (!IncludeName
.starts_with("\""))
241 IncludeName
.drop_front(1).drop_back(1); // remove the surrounding "" or <>
242 // Not matchingStem: implementation files may have compound extensions but
244 StringRef HeaderStem
= llvm::sys::path::stem(IncludeName
);
245 StringRef FileStem
= llvm::sys::path::stem(FileName
); // foo.cu for foo.cu.cc
246 StringRef MatchingFileStem
= matchingStem(FileName
); // foo for foo.cu.cc
247 // main-header examples:
248 // 1) foo.h => foo.cc
249 // 2) foo.h => foo.cu.cc
250 // 3) foo.proto.h => foo.proto.cc
252 // non-main-header examples:
253 // 1) foo.h => bar.cc
254 // 2) foo.proto.h => foo.cc
256 if (MatchingFileStem
.starts_with_insensitive(HeaderStem
))
257 Matching
= MatchingFileStem
; // example 1), 2)
258 else if (FileStem
.equals_insensitive(HeaderStem
))
259 Matching
= FileStem
; // example 3)
260 if (!Matching
.empty()) {
261 llvm::Regex
MainIncludeRegex(HeaderStem
.str() + Style
.IncludeIsMainRegex
,
262 llvm::Regex::IgnoreCase
);
263 if (MainIncludeRegex
.match(Matching
))
269 const llvm::Regex
HeaderIncludes::IncludeRegex(IncludeRegexPattern
);
271 HeaderIncludes::HeaderIncludes(StringRef FileName
, StringRef Code
,
272 const IncludeStyle
&Style
)
273 : FileName(FileName
), Code(Code
), FirstIncludeOffset(-1),
275 getOffsetAfterHeaderGuardsAndComments(FileName
, Code
, Style
)),
276 MaxInsertOffset(MinInsertOffset
+
277 getMaxHeaderInsertionOffset(
278 FileName
, Code
.drop_front(MinInsertOffset
), Style
)),
279 MainIncludeFound(false),
280 Categories(Style
, FileName
) {
281 // Add 0 for main header and INT_MAX for headers that are not in any
283 Priorities
= {0, INT_MAX
};
284 for (const auto &Category
: Style
.IncludeCategories
)
285 Priorities
.insert(Category
.Priority
);
286 SmallVector
<StringRef
, 32> Lines
;
287 Code
.drop_front(MinInsertOffset
).split(Lines
, "\n");
289 unsigned Offset
= MinInsertOffset
;
290 unsigned NextLineOffset
;
291 SmallVector
<StringRef
, 4> Matches
;
292 for (auto Line
: Lines
) {
293 NextLineOffset
= std::min(Code
.size(), Offset
+ Line
.size() + 1);
294 if (IncludeRegex
.match(Line
, &Matches
)) {
295 // If this is the last line without trailing newline, we need to make
296 // sure we don't delete across the file boundary.
300 Offset
, std::min(Line
.size() + 1, Code
.size() - Offset
)),
301 Matches
[1] == "import" ? tooling::IncludeDirective::Import
302 : tooling::IncludeDirective::Include
),
305 Offset
= NextLineOffset
;
308 // Populate CategoryEndOfssets:
309 // - Ensure that CategoryEndOffset[Highest] is always populated.
310 // - If CategoryEndOffset[Priority] isn't set, use the next higher value
311 // that is set, up to CategoryEndOffset[Highest].
312 auto Highest
= Priorities
.begin();
313 if (CategoryEndOffsets
.find(*Highest
) == CategoryEndOffsets
.end()) {
314 if (FirstIncludeOffset
>= 0)
315 CategoryEndOffsets
[*Highest
] = FirstIncludeOffset
;
317 CategoryEndOffsets
[*Highest
] = MinInsertOffset
;
319 // By this point, CategoryEndOffset[Highest] is always set appropriately:
320 // - to an appropriate location before/after existing #includes, or
321 // - to right after the header guard, or
322 // - to the beginning of the file.
323 for (auto I
= ++Priorities
.begin(), E
= Priorities
.end(); I
!= E
; ++I
)
324 if (CategoryEndOffsets
.find(*I
) == CategoryEndOffsets
.end())
325 CategoryEndOffsets
[*I
] = CategoryEndOffsets
[*std::prev(I
)];
328 // \p Offset: the start of the line following this include directive.
329 void HeaderIncludes::addExistingInclude(Include IncludeToAdd
,
330 unsigned NextLineOffset
) {
332 ExistingIncludes
.try_emplace(trimInclude(IncludeToAdd
.Name
)).first
;
333 Iter
->second
.push_back(std::move(IncludeToAdd
));
334 auto &CurInclude
= Iter
->second
.back();
335 // The header name with quotes or angle brackets.
336 // Only record the offset of current #include if we can insert after it.
337 if (CurInclude
.R
.getOffset() <= MaxInsertOffset
) {
338 int Priority
= Categories
.getIncludePriority(
339 CurInclude
.Name
, /*CheckMainHeader=*/!MainIncludeFound
);
341 MainIncludeFound
= true;
342 CategoryEndOffsets
[Priority
] = NextLineOffset
;
343 IncludesByPriority
[Priority
].push_back(&CurInclude
);
344 if (FirstIncludeOffset
< 0)
345 FirstIncludeOffset
= CurInclude
.R
.getOffset();
349 std::optional
<tooling::Replacement
>
350 HeaderIncludes::insert(llvm::StringRef IncludeName
, bool IsAngled
,
351 IncludeDirective Directive
) const {
352 assert(IncludeName
== trimInclude(IncludeName
));
353 // If a <header> ("header") already exists in code, "header" (<header>) with
354 // different quotation and/or directive will still be inserted.
355 // FIXME: figure out if this is the best behavior.
356 auto It
= ExistingIncludes
.find(IncludeName
);
357 if (It
!= ExistingIncludes
.end()) {
358 for (const auto &Inc
: It
->second
)
359 if (Inc
.Directive
== Directive
&&
360 ((IsAngled
&& StringRef(Inc
.Name
).starts_with("<")) ||
361 (!IsAngled
&& StringRef(Inc
.Name
).starts_with("\""))))
365 std::string(llvm::formatv(IsAngled
? "<{0}>" : "\"{0}\"", IncludeName
));
366 StringRef QuotedName
= Quoted
;
367 int Priority
= Categories
.getIncludePriority(
368 QuotedName
, /*CheckMainHeader=*/!MainIncludeFound
);
369 auto CatOffset
= CategoryEndOffsets
.find(Priority
);
370 assert(CatOffset
!= CategoryEndOffsets
.end());
371 unsigned InsertOffset
= CatOffset
->second
; // Fall back offset
372 auto Iter
= IncludesByPriority
.find(Priority
);
373 if (Iter
!= IncludesByPriority
.end()) {
374 for (const auto *Inc
: Iter
->second
) {
375 if (QuotedName
< Inc
->Name
) {
376 InsertOffset
= Inc
->R
.getOffset();
381 assert(InsertOffset
<= Code
.size());
382 llvm::StringRef DirectiveSpelling
=
383 Directive
== IncludeDirective::Include
? "include" : "import";
384 std::string NewInclude
=
385 llvm::formatv("#{0} {1}\n", DirectiveSpelling
, QuotedName
);
386 // When inserting headers at end of the code, also append '\n' to the code
387 // if it does not end with '\n'.
388 // FIXME: when inserting multiple #includes at the end of code, only one
389 // newline should be added.
390 if (InsertOffset
== Code
.size() && (!Code
.empty() && Code
.back() != '\n'))
391 NewInclude
= "\n" + NewInclude
;
392 return tooling::Replacement(FileName
, InsertOffset
, 0, NewInclude
);
395 tooling::Replacements
HeaderIncludes::remove(llvm::StringRef IncludeName
,
396 bool IsAngled
) const {
397 assert(IncludeName
== trimInclude(IncludeName
));
398 tooling::Replacements Result
;
399 auto Iter
= ExistingIncludes
.find(IncludeName
);
400 if (Iter
== ExistingIncludes
.end())
402 for (const auto &Inc
: Iter
->second
) {
403 if ((IsAngled
&& StringRef(Inc
.Name
).starts_with("\"")) ||
404 (!IsAngled
&& StringRef(Inc
.Name
).starts_with("<")))
406 llvm::Error Err
= Result
.add(tooling::Replacement(
407 FileName
, Inc
.R
.getOffset(), Inc
.R
.getLength(), ""));
409 auto ErrMsg
= "Unexpected conflicts in #include deletions: " +
410 llvm::toString(std::move(Err
));
411 llvm_unreachable(ErrMsg
.c_str());
417 } // namespace tooling