1 //===--- HeaderIncludes.cpp - Insert/Delete #includes --*- C++ -*----------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "clang/Tooling/Inclusions/HeaderIncludes.h"
10 #include "clang/Basic/FileManager.h"
11 #include "clang/Basic/SourceManager.h"
12 #include "clang/Lex/Lexer.h"
13 #include "llvm/Support/FormatVariadic.h"
14 #include "llvm/Support/Path.h"
21 LangOptions
createLangOpts() {
23 LangOpts
.CPlusPlus
= 1;
24 LangOpts
.CPlusPlus11
= 1;
25 LangOpts
.CPlusPlus14
= 1;
26 LangOpts
.LineComment
= 1;
27 LangOpts
.CXXOperatorNames
= 1;
30 LangOpts
.MicrosoftExt
= 1; // To get kw___try, kw___finally.
31 LangOpts
.DeclSpecKeyword
= 1; // To get __declspec.
32 LangOpts
.WChar
= 1; // To get wchar_t
36 // Returns the offset after skipping a sequence of tokens, matched by \p
37 // GetOffsetAfterSequence, from the start of the code.
38 // \p GetOffsetAfterSequence should be a function that matches a sequence of
39 // tokens and returns an offset after the sequence.
40 unsigned getOffsetAfterTokenSequence(
41 StringRef FileName
, StringRef Code
, const IncludeStyle
&Style
,
42 llvm::function_ref
<unsigned(const SourceManager
&, Lexer
&, Token
&)>
43 GetOffsetAfterSequence
) {
44 SourceManagerForFile
VirtualSM(FileName
, Code
);
45 SourceManager
&SM
= VirtualSM
.get();
46 LangOptions LangOpts
= createLangOpts();
47 Lexer
Lex(SM
.getMainFileID(), SM
.getBufferOrFake(SM
.getMainFileID()), SM
,
50 // Get the first token.
51 Lex
.LexFromRawLexer(Tok
);
52 return GetOffsetAfterSequence(SM
, Lex
, Tok
);
55 // Check if a sequence of tokens is like "#<Name> <raw_identifier>". If it is,
56 // \p Tok will be the token after this directive; otherwise, it can be any token
57 // after the given \p Tok (including \p Tok). If \p RawIDName is provided, the
58 // (second) raw_identifier name is checked.
59 bool checkAndConsumeDirectiveWithName(
60 Lexer
&Lex
, StringRef Name
, Token
&Tok
,
61 std::optional
<StringRef
> RawIDName
= std::nullopt
) {
62 bool Matched
= Tok
.is(tok::hash
) && !Lex
.LexFromRawLexer(Tok
) &&
63 Tok
.is(tok::raw_identifier
) &&
64 Tok
.getRawIdentifier() == Name
&& !Lex
.LexFromRawLexer(Tok
) &&
65 Tok
.is(tok::raw_identifier
) &&
66 (!RawIDName
|| Tok
.getRawIdentifier() == *RawIDName
);
68 Lex
.LexFromRawLexer(Tok
);
72 void skipComments(Lexer
&Lex
, Token
&Tok
) {
73 while (Tok
.is(tok::comment
))
74 if (Lex
.LexFromRawLexer(Tok
))
78 // Returns the offset after header guard directives and any comments
79 // before/after header guards (e.g. #ifndef/#define pair, #pragma once). If no
80 // header guard is present in the code, this will return the offset after
81 // skipping all comments from the start of the code.
82 unsigned getOffsetAfterHeaderGuardsAndComments(StringRef FileName
,
84 const IncludeStyle
&Style
) {
85 // \p Consume returns location after header guard or 0 if no header guard is
87 auto ConsumeHeaderGuardAndComment
=
88 [&](std::function
<unsigned(const SourceManager
&SM
, Lexer
&Lex
,
91 return getOffsetAfterTokenSequence(
92 FileName
, Code
, Style
,
93 [&Consume
](const SourceManager
&SM
, Lexer
&Lex
, Token Tok
) {
94 skipComments(Lex
, Tok
);
95 unsigned InitialOffset
= SM
.getFileOffset(Tok
.getLocation());
96 return std::max(InitialOffset
, Consume(SM
, Lex
, Tok
));
101 ConsumeHeaderGuardAndComment(
102 [](const SourceManager
&SM
, Lexer
&Lex
, Token Tok
) -> unsigned {
103 if (checkAndConsumeDirectiveWithName(Lex
, "ifndef", Tok
)) {
104 skipComments(Lex
, Tok
);
105 if (checkAndConsumeDirectiveWithName(Lex
, "define", Tok
) &&
106 Tok
.isAtStartOfLine())
107 return SM
.getFileOffset(Tok
.getLocation());
112 ConsumeHeaderGuardAndComment(
113 [](const SourceManager
&SM
, Lexer
&Lex
, Token Tok
) -> unsigned {
114 if (checkAndConsumeDirectiveWithName(Lex
, "pragma", Tok
,
116 return SM
.getFileOffset(Tok
.getLocation());
121 // Check if a sequence of tokens is like
122 // "#include ("header.h" | <header.h>)".
123 // If it is, \p Tok will be the token after this directive; otherwise, it can be
124 // any token after the given \p Tok (including \p Tok).
125 bool checkAndConsumeInclusiveDirective(Lexer
&Lex
, Token
&Tok
) {
126 auto Matched
= [&]() {
127 Lex
.LexFromRawLexer(Tok
);
130 if (Tok
.is(tok::hash
) && !Lex
.LexFromRawLexer(Tok
) &&
131 Tok
.is(tok::raw_identifier
) && Tok
.getRawIdentifier() == "include") {
132 if (Lex
.LexFromRawLexer(Tok
))
134 if (Tok
.is(tok::string_literal
))
136 if (Tok
.is(tok::less
)) {
137 while (!Lex
.LexFromRawLexer(Tok
) && Tok
.isNot(tok::greater
)) {
139 if (Tok
.is(tok::greater
))
146 // Returns the offset of the last #include directive after which a new
147 // #include can be inserted. This ignores #include's after the #include block(s)
148 // in the beginning of a file to avoid inserting headers into code sections
149 // where new #include's should not be added by default.
150 // These code sections include:
151 // - raw string literals (containing #include).
153 // - Special #include's among declarations (e.g. functions).
155 // If no #include after which a new #include can be inserted, this returns the
156 // offset after skipping all comments from the start of the code.
157 // Inserting after an #include is not allowed if it comes after code that is not
158 // #include (e.g. pre-processing directive that is not #include, declarations).
159 unsigned getMaxHeaderInsertionOffset(StringRef FileName
, StringRef Code
,
160 const IncludeStyle
&Style
) {
161 return getOffsetAfterTokenSequence(
162 FileName
, Code
, Style
,
163 [](const SourceManager
&SM
, Lexer
&Lex
, Token Tok
) {
164 skipComments(Lex
, Tok
);
165 unsigned MaxOffset
= SM
.getFileOffset(Tok
.getLocation());
166 while (checkAndConsumeInclusiveDirective(Lex
, Tok
))
167 MaxOffset
= SM
.getFileOffset(Tok
.getLocation());
172 inline StringRef
trimInclude(StringRef IncludeName
) {
173 return IncludeName
.trim("\"<>");
176 const char IncludeRegexPattern
[] =
177 R
"(^[\t\ ]*#[\t\ ]*(import|include)[^"<]*(["<][^">]*[">]))";
179 // The filename of Path excluding extension.
180 // Used to match implementation with headers, this differs from sys::path::stem:
181 // - in names with multiple dots (foo.cu.cc) it terminates at the *first*
182 // - an empty stem is never returned: /foo/.bar.x => .bar
183 // - we don't bother to handle . and .. specially
184 StringRef
matchingStem(llvm::StringRef Path
) {
185 StringRef Name
= llvm::sys::path::filename(Path
);
186 return Name
.substr(0, Name
.find('.', 1));
189 } // anonymous namespace
191 IncludeCategoryManager::IncludeCategoryManager(const IncludeStyle
&Style
,
193 : Style(Style
), FileName(FileName
) {
194 for (const auto &Category
: Style
.IncludeCategories
) {
195 CategoryRegexs
.emplace_back(Category
.Regex
, Category
.RegexIsCaseSensitive
196 ? llvm::Regex::NoFlags
197 : llvm::Regex::IgnoreCase
);
199 IsMainFile
= FileName
.ends_with(".c") || FileName
.ends_with(".cc") ||
200 FileName
.ends_with(".cpp") || FileName
.ends_with(".c++") ||
201 FileName
.ends_with(".cxx") || FileName
.ends_with(".m") ||
202 FileName
.ends_with(".mm");
203 if (!Style
.IncludeIsMainSourceRegex
.empty()) {
204 llvm::Regex
MainFileRegex(Style
.IncludeIsMainSourceRegex
);
205 IsMainFile
|= MainFileRegex
.match(FileName
);
209 int IncludeCategoryManager::getIncludePriority(StringRef IncludeName
,
210 bool CheckMainHeader
) const {
212 for (unsigned i
= 0, e
= CategoryRegexs
.size(); i
!= e
; ++i
)
213 if (CategoryRegexs
[i
].match(IncludeName
)) {
214 Ret
= Style
.IncludeCategories
[i
].Priority
;
217 if (CheckMainHeader
&& IsMainFile
&& Ret
> 0 && isMainHeader(IncludeName
))
222 int IncludeCategoryManager::getSortIncludePriority(StringRef IncludeName
,
223 bool CheckMainHeader
) const {
225 for (unsigned i
= 0, e
= CategoryRegexs
.size(); i
!= e
; ++i
)
226 if (CategoryRegexs
[i
].match(IncludeName
)) {
227 Ret
= Style
.IncludeCategories
[i
].SortPriority
;
229 Ret
= Style
.IncludeCategories
[i
].Priority
;
232 if (CheckMainHeader
&& IsMainFile
&& Ret
> 0 && isMainHeader(IncludeName
))
236 bool IncludeCategoryManager::isMainHeader(StringRef IncludeName
) const {
237 switch (Style
.MainIncludeChar
) {
238 case IncludeStyle::MICD_Quote
:
239 if (!IncludeName
.starts_with("\""))
242 case IncludeStyle::MICD_AngleBracket
:
243 if (!IncludeName
.starts_with("<"))
246 case IncludeStyle::MICD_Any
:
251 IncludeName
.drop_front(1).drop_back(1); // remove the surrounding "" or <>
252 // Not matchingStem: implementation files may have compound extensions but
254 StringRef HeaderStem
= llvm::sys::path::stem(IncludeName
);
255 StringRef FileStem
= llvm::sys::path::stem(FileName
); // foo.cu for foo.cu.cc
256 StringRef MatchingFileStem
= matchingStem(FileName
); // foo for foo.cu.cc
257 // main-header examples:
258 // 1) foo.h => foo.cc
259 // 2) foo.h => foo.cu.cc
260 // 3) foo.proto.h => foo.proto.cc
262 // non-main-header examples:
263 // 1) foo.h => bar.cc
264 // 2) foo.proto.h => foo.cc
266 if (MatchingFileStem
.starts_with_insensitive(HeaderStem
))
267 Matching
= MatchingFileStem
; // example 1), 2)
268 else if (FileStem
.equals_insensitive(HeaderStem
))
269 Matching
= FileStem
; // example 3)
270 if (!Matching
.empty()) {
271 llvm::Regex
MainIncludeRegex(HeaderStem
.str() + Style
.IncludeIsMainRegex
,
272 llvm::Regex::IgnoreCase
);
273 if (MainIncludeRegex
.match(Matching
))
279 const llvm::Regex
HeaderIncludes::IncludeRegex(IncludeRegexPattern
);
281 HeaderIncludes::HeaderIncludes(StringRef FileName
, StringRef Code
,
282 const IncludeStyle
&Style
)
283 : FileName(FileName
), Code(Code
), FirstIncludeOffset(-1),
285 getOffsetAfterHeaderGuardsAndComments(FileName
, Code
, Style
)),
286 MaxInsertOffset(MinInsertOffset
+
287 getMaxHeaderInsertionOffset(
288 FileName
, Code
.drop_front(MinInsertOffset
), Style
)),
289 MainIncludeFound(false),
290 Categories(Style
, FileName
) {
291 // Add 0 for main header and INT_MAX for headers that are not in any
293 Priorities
= {0, INT_MAX
};
294 for (const auto &Category
: Style
.IncludeCategories
)
295 Priorities
.insert(Category
.Priority
);
296 SmallVector
<StringRef
, 32> Lines
;
297 Code
.drop_front(MinInsertOffset
).split(Lines
, "\n");
299 unsigned Offset
= MinInsertOffset
;
300 unsigned NextLineOffset
;
301 SmallVector
<StringRef
, 4> Matches
;
302 for (auto Line
: Lines
) {
303 NextLineOffset
= std::min(Code
.size(), Offset
+ Line
.size() + 1);
304 if (IncludeRegex
.match(Line
, &Matches
)) {
305 // If this is the last line without trailing newline, we need to make
306 // sure we don't delete across the file boundary.
310 Offset
, std::min(Line
.size() + 1, Code
.size() - Offset
)),
311 Matches
[1] == "import" ? tooling::IncludeDirective::Import
312 : tooling::IncludeDirective::Include
),
315 Offset
= NextLineOffset
;
318 // Populate CategoryEndOfssets:
319 // - Ensure that CategoryEndOffset[Highest] is always populated.
320 // - If CategoryEndOffset[Priority] isn't set, use the next higher value
321 // that is set, up to CategoryEndOffset[Highest].
322 auto Highest
= Priorities
.begin();
323 auto [It
, Inserted
] = CategoryEndOffsets
.try_emplace(*Highest
);
325 It
->second
= FirstIncludeOffset
>= 0 ? FirstIncludeOffset
: MinInsertOffset
;
326 // By this point, CategoryEndOffset[Highest] is always set appropriately:
327 // - to an appropriate location before/after existing #includes, or
328 // - to right after the header guard, or
329 // - to the beginning of the file.
330 for (auto I
= ++Priorities
.begin(), E
= Priorities
.end(); I
!= E
; ++I
)
331 if (CategoryEndOffsets
.find(*I
) == CategoryEndOffsets
.end())
332 CategoryEndOffsets
[*I
] = CategoryEndOffsets
[*std::prev(I
)];
335 // \p Offset: the start of the line following this include directive.
336 void HeaderIncludes::addExistingInclude(Include IncludeToAdd
,
337 unsigned NextLineOffset
) {
338 auto &Incs
= ExistingIncludes
[trimInclude(IncludeToAdd
.Name
)];
339 Incs
.push_back(std::move(IncludeToAdd
));
340 auto &CurInclude
= Incs
.back();
341 // The header name with quotes or angle brackets.
342 // Only record the offset of current #include if we can insert after it.
343 if (CurInclude
.R
.getOffset() <= MaxInsertOffset
) {
344 int Priority
= Categories
.getIncludePriority(
345 CurInclude
.Name
, /*CheckMainHeader=*/!MainIncludeFound
);
347 MainIncludeFound
= true;
348 CategoryEndOffsets
[Priority
] = NextLineOffset
;
349 IncludesByPriority
[Priority
].push_back(&CurInclude
);
350 if (FirstIncludeOffset
< 0)
351 FirstIncludeOffset
= CurInclude
.R
.getOffset();
355 std::optional
<tooling::Replacement
>
356 HeaderIncludes::insert(llvm::StringRef IncludeName
, bool IsAngled
,
357 IncludeDirective Directive
) const {
358 assert(IncludeName
== trimInclude(IncludeName
));
359 // If a <header> ("header") already exists in code, "header" (<header>) with
360 // different quotation and/or directive will still be inserted.
361 // FIXME: figure out if this is the best behavior.
362 auto It
= ExistingIncludes
.find(IncludeName
);
363 if (It
!= ExistingIncludes
.end()) {
364 for (const auto &Inc
: It
->second
)
365 if (Inc
.Directive
== Directive
&&
366 ((IsAngled
&& StringRef(Inc
.Name
).starts_with("<")) ||
367 (!IsAngled
&& StringRef(Inc
.Name
).starts_with("\""))))
371 std::string(llvm::formatv(IsAngled
? "<{0}>" : "\"{0}\"", IncludeName
));
372 StringRef QuotedName
= Quoted
;
373 int Priority
= Categories
.getIncludePriority(
374 QuotedName
, /*CheckMainHeader=*/!MainIncludeFound
);
375 auto CatOffset
= CategoryEndOffsets
.find(Priority
);
376 assert(CatOffset
!= CategoryEndOffsets
.end());
377 unsigned InsertOffset
= CatOffset
->second
; // Fall back offset
378 auto Iter
= IncludesByPriority
.find(Priority
);
379 if (Iter
!= IncludesByPriority
.end()) {
380 for (const auto *Inc
: Iter
->second
) {
381 if (QuotedName
< Inc
->Name
) {
382 InsertOffset
= Inc
->R
.getOffset();
387 assert(InsertOffset
<= Code
.size());
388 llvm::StringRef DirectiveSpelling
=
389 Directive
== IncludeDirective::Include
? "include" : "import";
390 std::string NewInclude
=
391 llvm::formatv("#{0} {1}\n", DirectiveSpelling
, QuotedName
);
392 // When inserting headers at end of the code, also append '\n' to the code
393 // if it does not end with '\n'.
394 // FIXME: when inserting multiple #includes at the end of code, only one
395 // newline should be added.
396 if (InsertOffset
== Code
.size() && (!Code
.empty() && Code
.back() != '\n'))
397 NewInclude
= "\n" + NewInclude
;
398 return tooling::Replacement(FileName
, InsertOffset
, 0, NewInclude
);
401 tooling::Replacements
HeaderIncludes::remove(llvm::StringRef IncludeName
,
402 bool IsAngled
) const {
403 assert(IncludeName
== trimInclude(IncludeName
));
404 tooling::Replacements Result
;
405 auto Iter
= ExistingIncludes
.find(IncludeName
);
406 if (Iter
== ExistingIncludes
.end())
408 for (const auto &Inc
: Iter
->second
) {
409 if ((IsAngled
&& StringRef(Inc
.Name
).starts_with("\"")) ||
410 (!IsAngled
&& StringRef(Inc
.Name
).starts_with("<")))
412 llvm::Error Err
= Result
.add(tooling::Replacement(
413 FileName
, Inc
.R
.getOffset(), Inc
.R
.getLength(), ""));
415 auto ErrMsg
= "Unexpected conflicts in #include deletions: " +
416 llvm::toString(std::move(Err
));
417 llvm_unreachable(ErrMsg
.c_str());
423 } // namespace tooling