clang/lib/Tooling/Inclusions/HeaderIncludes.cpp

   1 //===--- HeaderIncludes.cpp - Insert/Delete #includes --*- C++ -*----------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "clang/Tooling/Inclusions/HeaderIncludes.h"
  10 #include "clang/Basic/FileManager.h"
  11 #include "clang/Basic/SourceManager.h"
  12 #include "clang/Lex/Lexer.h"
  13 #include "llvm/Support/FormatVariadic.h"
  14 #include "llvm/Support/Path.h"
  15 #include <optional>
  16
  17 namespace clang {
  18 namespace tooling {
  19 namespace {
  20
  21 LangOptions createLangOpts() {
  22   LangOptions LangOpts;
  23   LangOpts.CPlusPlus = 1;
  24   LangOpts.CPlusPlus11 = 1;
  25   LangOpts.CPlusPlus14 = 1;
  26   LangOpts.LineComment = 1;
  27   LangOpts.CXXOperatorNames = 1;
  28   LangOpts.Bool = 1;
  29   LangOpts.ObjC = 1;
  30   LangOpts.MicrosoftExt = 1;    // To get kw___try, kw___finally.
  31   LangOpts.DeclSpecKeyword = 1; // To get __declspec.
  32   LangOpts.WChar = 1;           // To get wchar_t
  33   return LangOpts;
  34 }
  35
  36 // Returns the offset after skipping a sequence of tokens, matched by \p
  37 // GetOffsetAfterSequence, from the start of the code.
  38 // \p GetOffsetAfterSequence should be a function that matches a sequence of
  39 // tokens and returns an offset after the sequence.
  40 unsigned getOffsetAfterTokenSequence(
  41     StringRef FileName, StringRef Code, const IncludeStyle &Style,
  42     llvm::function_ref<unsigned(const SourceManager &, Lexer &, Token &)>
  43         GetOffsetAfterSequence) {
  44   SourceManagerForFile VirtualSM(FileName, Code);
  45   SourceManager &SM = VirtualSM.get();
  46   LangOptions LangOpts = createLangOpts();
  47   Lexer Lex(SM.getMainFileID(), SM.getBufferOrFake(SM.getMainFileID()), SM,
  48             LangOpts);
  49   Token Tok;
  50   // Get the first token.
  51   Lex.LexFromRawLexer(Tok);
  52   return GetOffsetAfterSequence(SM, Lex, Tok);
  53 }
  54
  55 // Check if a sequence of tokens is like "#<Name> <raw_identifier>". If it is,
  56 // \p Tok will be the token after this directive; otherwise, it can be any token
  57 // after the given \p Tok (including \p Tok). If \p RawIDName is provided, the
  58 // (second) raw_identifier name is checked.
  59 bool checkAndConsumeDirectiveWithName(
  60     Lexer &Lex, StringRef Name, Token &Tok,
  61     std::optional<StringRef> RawIDName = std::nullopt) {
  62   bool Matched = Tok.is(tok::hash) && !Lex.LexFromRawLexer(Tok) &&
  63                  Tok.is(tok::raw_identifier) &&
  64                  Tok.getRawIdentifier() == Name && !Lex.LexFromRawLexer(Tok) &&
  65                  Tok.is(tok::raw_identifier) &&
  66                  (!RawIDName || Tok.getRawIdentifier() == *RawIDName);
  67   if (Matched)
  68     Lex.LexFromRawLexer(Tok);
  69   return Matched;
  70 }
  71
  72 void skipComments(Lexer &Lex, Token &Tok) {
  73   while (Tok.is(tok::comment))
  74     if (Lex.LexFromRawLexer(Tok))
  75       return;
  76 }
  77
  78 // Returns the offset after header guard directives and any comments
  79 // before/after header guards (e.g. #ifndef/#define pair, #pragma once). If no
  80 // header guard is present in the code, this will return the offset after
  81 // skipping all comments from the start of the code.
  82 unsigned getOffsetAfterHeaderGuardsAndComments(StringRef FileName,
  83                                                StringRef Code,
  84                                                const IncludeStyle &Style) {
  85   // \p Consume returns location after header guard or 0 if no header guard is
  86   // found.
  87   auto ConsumeHeaderGuardAndComment =
  88       [&](std::function<unsigned(const SourceManager &SM, Lexer &Lex,
  89                                  Token Tok)>
  90               Consume) {
  91         return getOffsetAfterTokenSequence(
  92             FileName, Code, Style,
  93             [&Consume](const SourceManager &SM, Lexer &Lex, Token Tok) {
  94               skipComments(Lex, Tok);
  95               unsigned InitialOffset = SM.getFileOffset(Tok.getLocation());
  96               return std::max(InitialOffset, Consume(SM, Lex, Tok));
  97             });
  98       };
  99   return std::max(
 100       // #ifndef/#define
 101       ConsumeHeaderGuardAndComment(
 102           [](const SourceManager &SM, Lexer &Lex, Token Tok) -> unsigned {
 103             if (checkAndConsumeDirectiveWithName(Lex, "ifndef", Tok)) {
 104               skipComments(Lex, Tok);
 105               if (checkAndConsumeDirectiveWithName(Lex, "define", Tok) &&
 106                   Tok.isAtStartOfLine())
 107                 return SM.getFileOffset(Tok.getLocation());
 108             }
 109             return 0;
 110           }),
 111       // #pragma once
 112       ConsumeHeaderGuardAndComment(
 113           [](const SourceManager &SM, Lexer &Lex, Token Tok) -> unsigned {
 114             if (checkAndConsumeDirectiveWithName(Lex, "pragma", Tok,
 115                                                  StringRef("once")))
 116               return SM.getFileOffset(Tok.getLocation());
 117             return 0;
 118           }));
 119 }
 120
 121 // Check if a sequence of tokens is like
 122 //    "#include ("header.h" | <header.h>)".
 123 // If it is, \p Tok will be the token after this directive; otherwise, it can be
 124 // any token after the given \p Tok (including \p Tok).
 125 bool checkAndConsumeInclusiveDirective(Lexer &Lex, Token &Tok) {
 126   auto Matched = [&]() {
 127     Lex.LexFromRawLexer(Tok);
 128     return true;
 129   };
 130   if (Tok.is(tok::hash) && !Lex.LexFromRawLexer(Tok) &&
 131       Tok.is(tok::raw_identifier) && Tok.getRawIdentifier() == "include") {
 132     if (Lex.LexFromRawLexer(Tok))
 133       return false;
 134     if (Tok.is(tok::string_literal))
 135       return Matched();
 136     if (Tok.is(tok::less)) {
 137       while (!Lex.LexFromRawLexer(Tok) && Tok.isNot(tok::greater)) {
 138       }
 139       if (Tok.is(tok::greater))
 140         return Matched();
 141     }
 142   }
 143   return false;
 144 }
 145
 146 // Returns the offset of the last #include directive after which a new
 147 // #include can be inserted. This ignores #include's after the #include block(s)
 148 // in the beginning of a file to avoid inserting headers into code sections
 149 // where new #include's should not be added by default.
 150 // These code sections include:
 151 //      - raw string literals (containing #include).
 152 //      - #if blocks.
 153 //      - Special #include's among declarations (e.g. functions).
 154 //
 155 // If no #include after which a new #include can be inserted, this returns the
 156 // offset after skipping all comments from the start of the code.
 157 // Inserting after an #include is not allowed if it comes after code that is not
 158 // #include (e.g. pre-processing directive that is not #include, declarations).
 159 unsigned getMaxHeaderInsertionOffset(StringRef FileName, StringRef Code,
 160                                      const IncludeStyle &Style) {
 161   return getOffsetAfterTokenSequence(
 162       FileName, Code, Style,
 163       [](const SourceManager &SM, Lexer &Lex, Token Tok) {
 164         skipComments(Lex, Tok);
 165         unsigned MaxOffset = SM.getFileOffset(Tok.getLocation());
 166         while (checkAndConsumeInclusiveDirective(Lex, Tok))
 167           MaxOffset = SM.getFileOffset(Tok.getLocation());
 168         return MaxOffset;
 169       });
 170 }
 171
 172 inline StringRef trimInclude(StringRef IncludeName) {
 173   return IncludeName.trim("\"<>");
 174 }
 175
 176 const char IncludeRegexPattern[] =
 177     R"(^[\t\ ]*#[\t\ ]*(import|include)[^"<]*(["<][^">]*[">]))";
 178
 179 // The filename of Path excluding extension.
 180 // Used to match implementation with headers, this differs from sys::path::stem:
 181 //  - in names with multiple dots (foo.cu.cc) it terminates at the *first*
 182 //  - an empty stem is never returned: /foo/.bar.x => .bar
 183 //  - we don't bother to handle . and .. specially
 184 StringRef matchingStem(llvm::StringRef Path) {
 185   StringRef Name = llvm::sys::path::filename(Path);
 186   return Name.substr(0, Name.find('.', 1));
 187 }
 188
 189 } // anonymous namespace
 190
 191 IncludeCategoryManager::IncludeCategoryManager(const IncludeStyle &Style,
 192                                                StringRef FileName)
 193     : Style(Style), FileName(FileName) {
 194   for (const auto &Category : Style.IncludeCategories) {
 195     CategoryRegexs.emplace_back(Category.Regex, Category.RegexIsCaseSensitive
 196                                                     ? llvm::Regex::NoFlags
 197                                                     : llvm::Regex::IgnoreCase);
 198   }
 199   IsMainFile = FileName.endswith(".c") || FileName.endswith(".cc") ||
 200                FileName.endswith(".cpp") || FileName.endswith(".c++") ||
 201                FileName.endswith(".cxx") || FileName.endswith(".m") ||
 202                FileName.endswith(".mm");
 203   if (!Style.IncludeIsMainSourceRegex.empty()) {
 204     llvm::Regex MainFileRegex(Style.IncludeIsMainSourceRegex);
 205     IsMainFile |= MainFileRegex.match(FileName);
 206   }
 207 }
 208
 209 int IncludeCategoryManager::getIncludePriority(StringRef IncludeName,
 210                                                bool CheckMainHeader) const {
 211   int Ret = INT_MAX;
 212   for (unsigned i = 0, e = CategoryRegexs.size(); i != e; ++i)
 213     if (CategoryRegexs[i].match(IncludeName)) {
 214       Ret = Style.IncludeCategories[i].Priority;
 215       break;
 216     }
 217   if (CheckMainHeader && IsMainFile && Ret > 0 && isMainHeader(IncludeName))
 218     Ret = 0;
 219   return Ret;
 220 }
 221
 222 int IncludeCategoryManager::getSortIncludePriority(StringRef IncludeName,
 223                                                    bool CheckMainHeader) const {
 224   int Ret = INT_MAX;
 225   for (unsigned i = 0, e = CategoryRegexs.size(); i != e; ++i)
 226     if (CategoryRegexs[i].match(IncludeName)) {
 227       Ret = Style.IncludeCategories[i].SortPriority;
 228       if (Ret == 0)
 229         Ret = Style.IncludeCategories[i].Priority;
 230       break;
 231     }
 232   if (CheckMainHeader && IsMainFile && Ret > 0 && isMainHeader(IncludeName))
 233     Ret = 0;
 234   return Ret;
 235 }
 236 bool IncludeCategoryManager::isMainHeader(StringRef IncludeName) const {
 237   if (!IncludeName.startswith("\""))
 238     return false;
 239
 240   IncludeName =
 241       IncludeName.drop_front(1).drop_back(1); // remove the surrounding "" or <>
 242   // Not matchingStem: implementation files may have compound extensions but
 243   // headers may not.
 244   StringRef HeaderStem = llvm::sys::path::stem(IncludeName);
 245   StringRef FileStem = llvm::sys::path::stem(FileName); // foo.cu for foo.cu.cc
 246   StringRef MatchingFileStem = matchingStem(FileName);  // foo for foo.cu.cc
 247   // main-header examples:
 248   //  1) foo.h => foo.cc
 249   //  2) foo.h => foo.cu.cc
 250   //  3) foo.proto.h => foo.proto.cc
 251   //
 252   // non-main-header examples:
 253   //  1) foo.h => bar.cc
 254   //  2) foo.proto.h => foo.cc
 255   StringRef Matching;
 256   if (MatchingFileStem.starts_with_insensitive(HeaderStem))
 257     Matching = MatchingFileStem; // example 1), 2)
 258   else if (FileStem.equals_insensitive(HeaderStem))
 259     Matching = FileStem; // example 3)
 260   if (!Matching.empty()) {
 261     llvm::Regex MainIncludeRegex(HeaderStem.str() + Style.IncludeIsMainRegex,
 262                                  llvm::Regex::IgnoreCase);
 263     if (MainIncludeRegex.match(Matching))
 264       return true;
 265   }
 266   return false;
 267 }
 268
 269 const llvm::Regex HeaderIncludes::IncludeRegex(IncludeRegexPattern);
 270
 271 HeaderIncludes::HeaderIncludes(StringRef FileName, StringRef Code,
 272                                const IncludeStyle &Style)
 273     : FileName(FileName), Code(Code), FirstIncludeOffset(-1),
 274       MinInsertOffset(
 275           getOffsetAfterHeaderGuardsAndComments(FileName, Code, Style)),
 276       MaxInsertOffset(MinInsertOffset +
 277                       getMaxHeaderInsertionOffset(
 278                           FileName, Code.drop_front(MinInsertOffset), Style)),
 279       MainIncludeFound(false),
 280       Categories(Style, FileName) {
 281   // Add 0 for main header and INT_MAX for headers that are not in any
 282   // category.
 283   Priorities = {0, INT_MAX};
 284   for (const auto &Category : Style.IncludeCategories)
 285     Priorities.insert(Category.Priority);
 286   SmallVector<StringRef, 32> Lines;
 287   Code.drop_front(MinInsertOffset).split(Lines, "\n");
 288
 289   unsigned Offset = MinInsertOffset;
 290   unsigned NextLineOffset;
 291   SmallVector<StringRef, 4> Matches;
 292   for (auto Line : Lines) {
 293     NextLineOffset = std::min(Code.size(), Offset + Line.size() + 1);
 294     if (IncludeRegex.match(Line, &Matches)) {
 295       // If this is the last line without trailing newline, we need to make
 296       // sure we don't delete across the file boundary.
 297       addExistingInclude(
 298           Include(Matches[2],
 299                   tooling::Range(
 300                       Offset, std::min(Line.size() + 1, Code.size() - Offset)),
 301                   Matches[1] == "import" ? tooling::IncludeDirective::Import
 302                                          : tooling::IncludeDirective::Include),
 303           NextLineOffset);
 304     }
 305     Offset = NextLineOffset;
 306   }
 307
 308   // Populate CategoryEndOfssets:
 309   // - Ensure that CategoryEndOffset[Highest] is always populated.
 310   // - If CategoryEndOffset[Priority] isn't set, use the next higher value
 311   // that is set, up to CategoryEndOffset[Highest].
 312   auto Highest = Priorities.begin();
 313   if (CategoryEndOffsets.find(*Highest) == CategoryEndOffsets.end()) {
 314     if (FirstIncludeOffset >= 0)
 315       CategoryEndOffsets[*Highest] = FirstIncludeOffset;
 316     else
 317       CategoryEndOffsets[*Highest] = MinInsertOffset;
 318   }
 319   // By this point, CategoryEndOffset[Highest] is always set appropriately:
 320   //  - to an appropriate location before/after existing #includes, or
 321   //  - to right after the header guard, or
 322   //  - to the beginning of the file.
 323   for (auto I = ++Priorities.begin(), E = Priorities.end(); I != E; ++I)
 324     if (CategoryEndOffsets.find(*I) == CategoryEndOffsets.end())
 325       CategoryEndOffsets[*I] = CategoryEndOffsets[*std::prev(I)];
 326 }
 327
 328 // \p Offset: the start of the line following this include directive.
 329 void HeaderIncludes::addExistingInclude(Include IncludeToAdd,
 330                                         unsigned NextLineOffset) {
 331   auto Iter =
 332       ExistingIncludes.try_emplace(trimInclude(IncludeToAdd.Name)).first;
 333   Iter->second.push_back(std::move(IncludeToAdd));
 334   auto &CurInclude = Iter->second.back();
 335   // The header name with quotes or angle brackets.
 336   // Only record the offset of current #include if we can insert after it.
 337   if (CurInclude.R.getOffset() <= MaxInsertOffset) {
 338     int Priority = Categories.getIncludePriority(
 339         CurInclude.Name, /*CheckMainHeader=*/!MainIncludeFound);
 340     if (Priority == 0)
 341       MainIncludeFound = true;
 342     CategoryEndOffsets[Priority] = NextLineOffset;
 343     IncludesByPriority[Priority].push_back(&CurInclude);
 344     if (FirstIncludeOffset < 0)
 345       FirstIncludeOffset = CurInclude.R.getOffset();
 346   }
 347 }
 348
 349 std::optional<tooling::Replacement>
 350 HeaderIncludes::insert(llvm::StringRef IncludeName, bool IsAngled,
 351                        IncludeDirective Directive) const {
 352   assert(IncludeName == trimInclude(IncludeName));
 353   // If a <header> ("header") already exists in code, "header" (<header>) with
 354   // different quotation and/or directive will still be inserted.
 355   // FIXME: figure out if this is the best behavior.
 356   auto It = ExistingIncludes.find(IncludeName);
 357   if (It != ExistingIncludes.end()) {
 358     for (const auto &Inc : It->second)
 359       if (Inc.Directive == Directive &&
 360           ((IsAngled && StringRef(Inc.Name).startswith("<")) ||
 361            (!IsAngled && StringRef(Inc.Name).startswith("\""))))
 362         return std::nullopt;
 363   }
 364   std::string Quoted =
 365       std::string(llvm::formatv(IsAngled ? "<{0}>" : "\"{0}\"", IncludeName));
 366   StringRef QuotedName = Quoted;
 367   int Priority = Categories.getIncludePriority(
 368       QuotedName, /*CheckMainHeader=*/!MainIncludeFound);
 369   auto CatOffset = CategoryEndOffsets.find(Priority);
 370   assert(CatOffset != CategoryEndOffsets.end());
 371   unsigned InsertOffset = CatOffset->second; // Fall back offset
 372   auto Iter = IncludesByPriority.find(Priority);
 373   if (Iter != IncludesByPriority.end()) {
 374     for (const auto *Inc : Iter->second) {
 375       if (QuotedName < Inc->Name) {
 376         InsertOffset = Inc->R.getOffset();
 377         break;
 378       }
 379     }
 380   }
 381   assert(InsertOffset <= Code.size());
 382   llvm::StringRef DirectiveSpelling =
 383       Directive == IncludeDirective::Include ? "include" : "import";
 384   std::string NewInclude =
 385       llvm::formatv("#{0} {1}\n", DirectiveSpelling, QuotedName);
 386   // When inserting headers at end of the code, also append '\n' to the code
 387   // if it does not end with '\n'.
 388   // FIXME: when inserting multiple #includes at the end of code, only one
 389   // newline should be added.
 390   if (InsertOffset == Code.size() && (!Code.empty() && Code.back() != '\n'))
 391     NewInclude = "\n" + NewInclude;
 392   return tooling::Replacement(FileName, InsertOffset, 0, NewInclude);
 393 }
 394
 395 tooling::Replacements HeaderIncludes::remove(llvm::StringRef IncludeName,
 396                                              bool IsAngled) const {
 397   assert(IncludeName == trimInclude(IncludeName));
 398   tooling::Replacements Result;
 399   auto Iter = ExistingIncludes.find(IncludeName);
 400   if (Iter == ExistingIncludes.end())
 401     return Result;
 402   for (const auto &Inc : Iter->second) {
 403     if ((IsAngled && StringRef(Inc.Name).startswith("\"")) ||
 404         (!IsAngled && StringRef(Inc.Name).startswith("<")))
 405       continue;
 406     llvm::Error Err = Result.add(tooling::Replacement(
 407         FileName, Inc.R.getOffset(), Inc.R.getLength(), ""));
 408     if (Err) {
 409       auto ErrMsg = "Unexpected conflicts in #include deletions: " +
 410                     llvm::toString(std::move(Err));
 411       llvm_unreachable(ErrMsg.c_str());
 412     }
 413   }
 414   return Result;
 415 }
 416
 417 } // namespace tooling
 418 } // namespace clang