clang/lib/Tooling/Transformer/SourceCode.cpp

   1 //===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 //  This file provides functions that simplify extraction of source code.
  10 //
  11 //===----------------------------------------------------------------------===//
  12 #include "clang/Tooling/Transformer/SourceCode.h"
  13 #include "clang/AST/ASTContext.h"
  14 #include "clang/AST/Attr.h"
  15 #include "clang/AST/Comment.h"
  16 #include "clang/AST/Decl.h"
  17 #include "clang/AST/DeclCXX.h"
  18 #include "clang/AST/DeclTemplate.h"
  19 #include "clang/AST/Expr.h"
  20 #include "clang/Basic/SourceManager.h"
  21 #include "clang/Lex/Lexer.h"
  22 #include "llvm/Support/Errc.h"
  23 #include "llvm/Support/Error.h"
  24 #include <set>
  25
  26 using namespace clang;
  27
  28 using llvm::errc;
  29 using llvm::StringError;
  30
  31 StringRef clang::tooling::getText(CharSourceRange Range,
  32                                   const ASTContext &Context) {
  33   return Lexer::getSourceText(Range, Context.getSourceManager(),
  34                               Context.getLangOpts());
  35 }
  36
  37 CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range,
  38                                                  tok::TokenKind Next,
  39                                                  ASTContext &Context) {
  40   CharSourceRange R = Lexer::getAsCharRange(Range, Context.getSourceManager(),
  41                                             Context.getLangOpts());
  42   if (R.isInvalid())
  43     return Range;
  44   Token Tok;
  45   bool Err =
  46       Lexer::getRawToken(R.getEnd(), Tok, Context.getSourceManager(),
  47                          Context.getLangOpts(), /*IgnoreWhiteSpace=*/true);
  48   if (Err || !Tok.is(Next))
  49     return Range;
  50   return CharSourceRange::getTokenRange(Range.getBegin(), Tok.getLocation());
  51 }
  52
  53 llvm::Error clang::tooling::validateRange(const CharSourceRange &Range,
  54                                           const SourceManager &SM,
  55                                           bool AllowSystemHeaders) {
  56   if (Range.isInvalid())
  57     return llvm::make_error<StringError>(errc::invalid_argument,
  58                                          "Invalid range");
  59
  60   if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
  61     return llvm::make_error<StringError>(
  62         errc::invalid_argument, "Range starts or ends in a macro expansion");
  63
  64   if (!AllowSystemHeaders) {
  65     if (SM.isInSystemHeader(Range.getBegin()) ||
  66         SM.isInSystemHeader(Range.getEnd()))
  67       return llvm::make_error<StringError>(errc::invalid_argument,
  68                                            "Range is in system header");
  69   }
  70
  71   std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
  72   std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
  73   if (BeginInfo.first != EndInfo.first)
  74     return llvm::make_error<StringError>(
  75         errc::invalid_argument, "Range begins and ends in different files");
  76
  77   if (BeginInfo.second > EndInfo.second)
  78     return llvm::make_error<StringError>(errc::invalid_argument,
  79                                          "Range's begin is past its end");
  80
  81   return llvm::Error::success();
  82 }
  83
  84 llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range,
  85                                               const SourceManager &SM) {
  86   return validateRange(Range, SM, /*AllowSystemHeaders=*/false);
  87 }
  88
  89 static bool spelledInMacroDefinition(SourceLocation Loc,
  90                                      const SourceManager &SM) {
  91   while (Loc.isMacroID()) {
  92     const auto &Expansion = SM.getSLocEntry(SM.getFileID(Loc)).getExpansion();
  93     if (Expansion.isMacroArgExpansion()) {
  94       // Check the spelling location of the macro arg, in case the arg itself is
  95       // in a macro expansion.
  96       Loc = Expansion.getSpellingLoc();
  97     } else {
  98       return true;
  99     }
 100   }
 101   return false;
 102 }
 103
 104 static CharSourceRange getRange(const CharSourceRange &EditRange,
 105                                 const SourceManager &SM,
 106                                 const LangOptions &LangOpts,
 107                                 bool IncludeMacroExpansion) {
 108   CharSourceRange Range;
 109   if (IncludeMacroExpansion) {
 110     Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);
 111   } else {
 112     if (spelledInMacroDefinition(EditRange.getBegin(), SM) ||
 113         spelledInMacroDefinition(EditRange.getEnd(), SM))
 114       return {};
 115
 116     auto B = SM.getSpellingLoc(EditRange.getBegin());
 117     auto E = SM.getSpellingLoc(EditRange.getEnd());
 118     if (EditRange.isTokenRange())
 119       E = Lexer::getLocForEndOfToken(E, 0, SM, LangOpts);
 120     Range = CharSourceRange::getCharRange(B, E);
 121   }
 122   return Range;
 123 }
 124
 125 std::optional<CharSourceRange> clang::tooling::getFileRangeForEdit(
 126     const CharSourceRange &EditRange, const SourceManager &SM,
 127     const LangOptions &LangOpts, bool IncludeMacroExpansion) {
 128   CharSourceRange Range =
 129       getRange(EditRange, SM, LangOpts, IncludeMacroExpansion);
 130   bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));
 131   if (IsInvalid)
 132     return std::nullopt;
 133   return Range;
 134 }
 135
 136 std::optional<CharSourceRange> clang::tooling::getFileRange(
 137     const CharSourceRange &EditRange, const SourceManager &SM,
 138     const LangOptions &LangOpts, bool IncludeMacroExpansion) {
 139   CharSourceRange Range =
 140       getRange(EditRange, SM, LangOpts, IncludeMacroExpansion);
 141   bool IsInvalid =
 142       llvm::errorToBool(validateRange(Range, SM, /*AllowSystemHeaders=*/true));
 143   if (IsInvalid)
 144     return std::nullopt;
 145   return Range;
 146 }
 147
 148 static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
 149   return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);
 150 }
 151
 152 static bool contains(const std::set<tok::TokenKind> &Terminators,
 153                      const Token &Tok) {
 154   return Terminators.count(Tok.getKind()) > 0;
 155 }
 156
 157 // Returns the exclusive, *file* end location of the entity whose last token is
 158 // at location 'EntityLast'. That is, it returns the location one past the last
 159 // relevant character.
 160 //
 161 // Associated tokens include comments, horizontal whitespace and 'Terminators'
 162 // -- optional tokens, which, if any are found, will be included; if
 163 // 'Terminators' is empty, we will not include any extra tokens beyond comments
 164 // and horizontal whitespace.
 165 static SourceLocation
 166 getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
 167                 const std::set<tok::TokenKind> &Terminators,
 168                 const LangOptions &LangOpts) {
 169   assert(EntityLast.isValid() && "Invalid end location found.");
 170
 171   // We remember the last location of a non-horizontal-whitespace token we have
 172   // lexed; this is the location up to which we will want to delete.
 173   // FIXME: Support using the spelling loc here for cases where we want to
 174   // analyze the macro text.
 175
 176   CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
 177   // FIXME: Should check isTokenRange(), for the (rare) case that
 178   // `ExpansionRange` is a character range.
 179   std::unique_ptr<Lexer> Lexer = [&]() {
 180     bool Invalid = false;
 181     auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
 182     llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
 183     assert(!Invalid && "Cannot get file/offset");
 184     return std::make_unique<clang::Lexer>(
 185         SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
 186         File.data() + FileOffset.second, File.end());
 187   }();
 188
 189   // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
 190   Lexer->SetKeepWhitespaceMode(true);
 191
 192   // Generally, the code we want to include looks like this ([] are optional),
 193   // If Terminators is empty:
 194   //   [ <comment> ] [ <newline> ]
 195   // Otherwise:
 196   //   ... <terminator> [ <comment> ] [ <newline> ]
 197
 198   Token Tok;
 199   bool Terminated = false;
 200
 201   // First, lex to the current token (which is the last token of the range that
 202   // is definitely associated with the decl). Then, we process the first token
 203   // separately from the rest based on conditions that hold specifically for
 204   // that first token.
 205   //
 206   // We do not search for a terminator if none is required or we've already
 207   // encountered it. Otherwise, if the original `EntityLast` location was in a
 208   // macro expansion, we don't have visibility into the text, so we assume we've
 209   // already terminated. However, we note this assumption with
 210   // `TerminatedByMacro`, because we'll want to handle it somewhat differently
 211   // for the terminators semicolon and comma. These terminators can be safely
 212   // associated with the entity when they appear after the macro -- extra
 213   // semicolons have no effect on the program and a well-formed program won't
 214   // have multiple commas in a row, so we're guaranteed that there is only one.
 215   //
 216   // FIXME: This handling of macros is more conservative than necessary. When
 217   // the end of the expansion coincides with the end of the node, we can still
 218   // safely analyze the code. But, it is more complicated, because we need to
 219   // start by lexing the spelling loc for the first token and then switch to the
 220   // expansion loc.
 221   bool TerminatedByMacro = false;
 222   Lexer->LexFromRawLexer(Tok);
 223   if (Terminators.empty() || contains(Terminators, Tok))
 224     Terminated = true;
 225   else if (EntityLast.isMacroID()) {
 226     Terminated = true;
 227     TerminatedByMacro = true;
 228   }
 229
 230   // We save the most recent candidate for the exclusive end location.
 231   SourceLocation End = Tok.getEndLoc();
 232
 233   while (!Terminated) {
 234     // Lex the next token we want to possibly expand the range with.
 235     Lexer->LexFromRawLexer(Tok);
 236
 237     switch (Tok.getKind()) {
 238     case tok::eof:
 239     // Unexpected separators.
 240     case tok::l_brace:
 241     case tok::r_brace:
 242     case tok::comma:
 243       return End;
 244     // Whitespace pseudo-tokens.
 245     case tok::unknown:
 246       if (startsWithNewline(SM, Tok))
 247         // Include at least until the end of the line.
 248         End = Tok.getEndLoc();
 249       break;
 250     default:
 251       if (contains(Terminators, Tok))
 252         Terminated = true;
 253       End = Tok.getEndLoc();
 254       break;
 255     }
 256   }
 257
 258   do {
 259     // Lex the next token we want to possibly expand the range with.
 260     Lexer->LexFromRawLexer(Tok);
 261
 262     switch (Tok.getKind()) {
 263     case tok::unknown:
 264       if (startsWithNewline(SM, Tok))
 265         // We're done, but include this newline.
 266         return Tok.getEndLoc();
 267       break;
 268     case tok::comment:
 269       // Include any comments we find on the way.
 270       End = Tok.getEndLoc();
 271       break;
 272     case tok::semi:
 273     case tok::comma:
 274       if (TerminatedByMacro && contains(Terminators, Tok)) {
 275         End = Tok.getEndLoc();
 276         // We've found a real terminator.
 277         TerminatedByMacro = false;
 278         break;
 279       }
 280       // Found an unrelated token; stop and don't include it.
 281       return End;
 282     default:
 283       // Found an unrelated token; stop and don't include it.
 284       return End;
 285     }
 286   } while (true);
 287 }
 288
 289 // Returns the expected terminator tokens for the given declaration.
 290 //
 291 // If we do not know the correct terminator token, returns an empty set.
 292 //
 293 // There are cases where we have more than one possible terminator (for example,
 294 // we find either a comma or a semicolon after a VarDecl).
 295 static std::set<tok::TokenKind> getTerminators(const Decl &D) {
 296   if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
 297     return {tok::semi};
 298
 299   if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
 300     return {tok::r_brace, tok::semi};
 301
 302   if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
 303     return {tok::comma, tok::semi};
 304
 305   return {};
 306 }
 307
 308 // Starting from `Loc`, skips whitespace up to, and including, a single
 309 // newline. Returns the (exclusive) end of any skipped whitespace (that is, the
 310 // location immediately after the whitespace).
 311 static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,
 312                                                SourceLocation Loc,
 313                                                const LangOptions &LangOpts) {
 314   const char *LocChars = SM.getCharacterData(Loc);
 315   int i = 0;
 316   while (isHorizontalWhitespace(LocChars[i]))
 317     ++i;
 318   if (isVerticalWhitespace(LocChars[i]))
 319     ++i;
 320   return Loc.getLocWithOffset(i);
 321 }
 322
 323 // Is `Loc` separated from any following decl by something meaningful (e.g. an
 324 // empty line, a comment), ignoring horizontal whitespace?  Since this is a
 325 // heuristic, we return false when in doubt.  `Loc` cannot be the first location
 326 // in the file.
 327 static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,
 328                                  const LangOptions &LangOpts) {
 329   // If the preceding character is a newline, we'll check for an empty line as a
 330   // separator. However, we can't identify an empty line using tokens, so we
 331   // analyse the characters. If we try to use tokens, we'll just end up with a
 332   // whitespace token, whose characters we'd have to analyse anyhow.
 333   bool Invalid = false;
 334   const char *LocChars =
 335       SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
 336   assert(!Invalid &&
 337          "Loc must be a valid character and not the first of the source file.");
 338   if (isVerticalWhitespace(LocChars[0])) {
 339     for (int i = 1; isWhitespace(LocChars[i]); ++i)
 340       if (isVerticalWhitespace(LocChars[i]))
 341         return true;
 342   }
 343   // We didn't find an empty line, so lex the next token, skipping past any
 344   // whitespace we just scanned.
 345   Token Tok;
 346   bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
 347                                    /*IgnoreWhiteSpace=*/true);
 348   if (Failed)
 349     // Any text that confuses the lexer seems fair to consider a separation.
 350     return true;
 351
 352   switch (Tok.getKind()) {
 353   case tok::comment:
 354   case tok::l_brace:
 355   case tok::r_brace:
 356   case tok::eof:
 357     return true;
 358   default:
 359     return false;
 360   }
 361 }
 362
 363 CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
 364                                             ASTContext &Context) {
 365   const SourceManager &SM = Context.getSourceManager();
 366   const LangOptions &LangOpts = Context.getLangOpts();
 367   CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());
 368
 369   // First, expand to the start of the template<> declaration if necessary.
 370   if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
 371     if (const auto *T = Record->getDescribedClassTemplate())
 372       if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
 373         Range.setBegin(T->getBeginLoc());
 374   } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
 375     if (const auto *T = F->getDescribedFunctionTemplate())
 376       if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
 377         Range.setBegin(T->getBeginLoc());
 378   }
 379
 380   // Next, expand the end location past trailing comments to include a potential
 381   // newline at the end of the decl's line.
 382   Range.setEnd(
 383       getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
 384   Range.setTokenRange(false);
 385
 386   // Expand to include preceeding associated comments. We ignore any comments
 387   // that are not preceeding the decl, since we've already skipped trailing
 388   // comments with getEntityEndLoc.
 389   if (const RawComment *Comment =
 390           Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
 391     // Only include a preceding comment if:
 392     // * it is *not* separate from the declaration (not including any newline
 393     //   that immediately follows the comment),
 394     // * the decl *is* separate from any following entity (so, there are no
 395     //   other entities the comment could refer to), and
 396     // * it is not a IfThisThenThat lint check.
 397     if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
 398                                      Range.getBegin()) &&
 399         !atOrBeforeSeparation(
 400             SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
 401             LangOpts) &&
 402         atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
 403       const StringRef CommentText = Comment->getRawText(SM);
 404       if (!CommentText.contains("LINT.IfChange") &&
 405           !CommentText.contains("LINT.ThenChange"))
 406         Range.setBegin(Comment->getBeginLoc());
 407     }
 408   // Add leading attributes.
 409   for (auto *Attr : Decl.attrs()) {
 410     if (Attr->getLocation().isInvalid() ||
 411         !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
 412       continue;
 413     Range.setBegin(Attr->getLocation());
 414
 415     // Extend to the left '[[' or '__attribute((' if we saw the attribute,
 416     // unless it is not a valid location.
 417     bool Invalid;
 418     StringRef Source =
 419         SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
 420     if (Invalid)
 421       continue;
 422     llvm::StringRef BeforeAttr =
 423         Source.substr(0, SM.getFileOffset(Range.getBegin()));
 424     llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();
 425
 426     for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
 427       // Handle whitespace between attribute prefix and attribute value.
 428       if (BeforeAttrStripped.endswith(Prefix)) {
 429         // Move start to start position of prefix, which is
 430         // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
 431         // positions to the left.
 432         Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
 433             -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
 434         break;
 435         // If we didn't see '[[' or '__attribute' it's probably coming from a
 436         // macro expansion which is already handled by makeFileCharRange(),
 437         // below.
 438       }
 439     }
 440   }
 441
 442   // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
 443   // Range.getBegin() may be inside an expansion.
 444   return Lexer::makeFileCharRange(Range, SM, LangOpts);
 445 }