clang/lib/Tooling/Transformer/SourceCode.cpp

   1 //===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 //  This file provides functions that simplify extraction of source code.
  10 //
  11 //===----------------------------------------------------------------------===//
  12 #include "clang/Tooling/Transformer/SourceCode.h"
  13 #include "clang/AST/ASTContext.h"
  14 #include "clang/AST/Attr.h"
  15 #include "clang/AST/Comment.h"
  16 #include "clang/AST/Decl.h"
  17 #include "clang/AST/DeclCXX.h"
  18 #include "clang/AST/DeclTemplate.h"
  19 #include "clang/AST/Expr.h"
  20 #include "clang/Basic/SourceManager.h"
  21 #include "clang/Lex/Lexer.h"
  22 #include "llvm/Support/Errc.h"
  23 #include "llvm/Support/Error.h"
  24 #include <set>
  25
  26 using namespace clang;
  27
  28 using llvm::errc;
  29 using llvm::StringError;
  30
  31 StringRef clang::tooling::getText(CharSourceRange Range,
  32                                   const ASTContext &Context) {
  33   return Lexer::getSourceText(Range, Context.getSourceManager(),
  34                               Context.getLangOpts());
  35 }
  36
  37 CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range,
  38                                                  tok::TokenKind Next,
  39                                                  ASTContext &Context) {
  40   CharSourceRange R = Lexer::getAsCharRange(Range, Context.getSourceManager(),
  41                                             Context.getLangOpts());
  42   if (R.isInvalid())
  43     return Range;
  44   Token Tok;
  45   bool Err =
  46       Lexer::getRawToken(R.getEnd(), Tok, Context.getSourceManager(),
  47                          Context.getLangOpts(), /*IgnoreWhiteSpace=*/true);
  48   if (Err || !Tok.is(Next))
  49     return Range;
  50   return CharSourceRange::getTokenRange(Range.getBegin(), Tok.getLocation());
  51 }
  52
  53 llvm::Error clang::tooling::validateRange(const CharSourceRange &Range,
  54                                           const SourceManager &SM,
  55                                           bool AllowSystemHeaders) {
  56   if (Range.isInvalid())
  57     return llvm::make_error<StringError>(errc::invalid_argument,
  58                                          "Invalid range");
  59
  60   if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
  61     return llvm::make_error<StringError>(
  62         errc::invalid_argument, "Range starts or ends in a macro expansion");
  63
  64   if (!AllowSystemHeaders) {
  65     if (SM.isInSystemHeader(Range.getBegin()) ||
  66         SM.isInSystemHeader(Range.getEnd()))
  67       return llvm::make_error<StringError>(errc::invalid_argument,
  68                                            "Range is in system header");
  69   }
  70
  71   std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
  72   std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
  73   if (BeginInfo.first != EndInfo.first)
  74     return llvm::make_error<StringError>(
  75         errc::invalid_argument, "Range begins and ends in different files");
  76
  77   if (BeginInfo.second > EndInfo.second)
  78     return llvm::make_error<StringError>(errc::invalid_argument,
  79                                          "Range's begin is past its end");
  80
  81   return llvm::Error::success();
  82 }
  83
  84 llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range,
  85                                               const SourceManager &SM) {
  86   return validateRange(Range, SM, /*AllowSystemHeaders=*/false);
  87 }
  88
  89 static bool spelledInMacroDefinition(SourceLocation Loc,
  90                                      const SourceManager &SM) {
  91   while (Loc.isMacroID()) {
  92     const auto &Expansion = SM.getSLocEntry(SM.getFileID(Loc)).getExpansion();
  93     if (Expansion.isMacroArgExpansion()) {
  94       // Check the spelling location of the macro arg, in case the arg itself is
  95       // in a macro expansion.
  96       Loc = Expansion.getSpellingLoc();
  97     } else {
  98       return true;
  99     }
 100   }
 101   return false;
 102 }
 103
 104 // Returns the expansion char-range of `Loc` if `Loc` is a split token. For
 105 // example, `>>` in nested templates needs the first `>` to be split, otherwise
 106 // the `SourceLocation` of the token would lex as `>>` instead of `>`.
 107 static std::optional<CharSourceRange>
 108 getExpansionForSplitToken(SourceLocation Loc, const SourceManager &SM,
 109                           const LangOptions &LangOpts) {
 110   if (Loc.isMacroID()) {
 111     bool Invalid = false;
 112     auto &SLoc = SM.getSLocEntry(SM.getFileID(Loc), &Invalid);
 113     if (Invalid)
 114       return std::nullopt;
 115     if (auto &Expansion = SLoc.getExpansion();
 116         !Expansion.isExpansionTokenRange()) {
 117       // A char-range expansion is only used where a token-range would be
 118       // incorrect, and so identifies this as a split token (and importantly,
 119       // not as a macro).
 120       return Expansion.getExpansionLocRange();
 121     }
 122   }
 123   return std::nullopt;
 124 }
 125
 126 // If `Range` covers a split token, returns the expansion range, otherwise
 127 // returns `Range`.
 128 static CharSourceRange getRangeForSplitTokens(CharSourceRange Range,
 129                                               const SourceManager &SM,
 130                                               const LangOptions &LangOpts) {
 131   if (Range.isTokenRange()) {
 132     auto BeginToken = getExpansionForSplitToken(Range.getBegin(), SM, LangOpts);
 133     auto EndToken = getExpansionForSplitToken(Range.getEnd(), SM, LangOpts);
 134     if (EndToken) {
 135       SourceLocation BeginLoc =
 136           BeginToken ? BeginToken->getBegin() : Range.getBegin();
 137       // We can't use the expansion location with a token-range, because that
 138       // will incorrectly lex the end token, so use a char-range that ends at
 139       // the split.
 140       return CharSourceRange::getCharRange(BeginLoc, EndToken->getEnd());
 141     } else if (BeginToken) {
 142       // Since the end token is not split, the whole range covers the split, so
 143       // the only adjustment we make is to use the expansion location of the
 144       // begin token.
 145       return CharSourceRange::getTokenRange(BeginToken->getBegin(),
 146                                             Range.getEnd());
 147     }
 148   }
 149   return Range;
 150 }
 151
 152 static CharSourceRange getRange(const CharSourceRange &EditRange,
 153                                 const SourceManager &SM,
 154                                 const LangOptions &LangOpts,
 155                                 bool IncludeMacroExpansion) {
 156   CharSourceRange Range;
 157   if (IncludeMacroExpansion) {
 158     Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);
 159   } else {
 160     auto AdjustedRange = getRangeForSplitTokens(EditRange, SM, LangOpts);
 161     if (spelledInMacroDefinition(AdjustedRange.getBegin(), SM) ||
 162         spelledInMacroDefinition(AdjustedRange.getEnd(), SM))
 163       return {};
 164
 165     auto B = SM.getSpellingLoc(AdjustedRange.getBegin());
 166     auto E = SM.getSpellingLoc(AdjustedRange.getEnd());
 167     if (AdjustedRange.isTokenRange())
 168       E = Lexer::getLocForEndOfToken(E, 0, SM, LangOpts);
 169     Range = CharSourceRange::getCharRange(B, E);
 170   }
 171   return Range;
 172 }
 173
 174 std::optional<CharSourceRange> clang::tooling::getFileRangeForEdit(
 175     const CharSourceRange &EditRange, const SourceManager &SM,
 176     const LangOptions &LangOpts, bool IncludeMacroExpansion) {
 177   CharSourceRange Range =
 178       getRange(EditRange, SM, LangOpts, IncludeMacroExpansion);
 179   bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));
 180   if (IsInvalid)
 181     return std::nullopt;
 182   return Range;
 183 }
 184
 185 std::optional<CharSourceRange> clang::tooling::getFileRange(
 186     const CharSourceRange &EditRange, const SourceManager &SM,
 187     const LangOptions &LangOpts, bool IncludeMacroExpansion) {
 188   CharSourceRange Range =
 189       getRange(EditRange, SM, LangOpts, IncludeMacroExpansion);
 190   bool IsInvalid =
 191       llvm::errorToBool(validateRange(Range, SM, /*AllowSystemHeaders=*/true));
 192   if (IsInvalid)
 193     return std::nullopt;
 194   return Range;
 195 }
 196
 197 static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
 198   return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);
 199 }
 200
 201 static bool contains(const std::set<tok::TokenKind> &Terminators,
 202                      const Token &Tok) {
 203   return Terminators.count(Tok.getKind()) > 0;
 204 }
 205
 206 // Returns the exclusive, *file* end location of the entity whose last token is
 207 // at location 'EntityLast'. That is, it returns the location one past the last
 208 // relevant character.
 209 //
 210 // Associated tokens include comments, horizontal whitespace and 'Terminators'
 211 // -- optional tokens, which, if any are found, will be included; if
 212 // 'Terminators' is empty, we will not include any extra tokens beyond comments
 213 // and horizontal whitespace.
 214 static SourceLocation
 215 getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
 216                 const std::set<tok::TokenKind> &Terminators,
 217                 const LangOptions &LangOpts) {
 218   assert(EntityLast.isValid() && "Invalid end location found.");
 219
 220   // We remember the last location of a non-horizontal-whitespace token we have
 221   // lexed; this is the location up to which we will want to delete.
 222   // FIXME: Support using the spelling loc here for cases where we want to
 223   // analyze the macro text.
 224
 225   CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
 226   // FIXME: Should check isTokenRange(), for the (rare) case that
 227   // `ExpansionRange` is a character range.
 228   std::unique_ptr<Lexer> Lexer = [&]() {
 229     bool Invalid = false;
 230     auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
 231     llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
 232     assert(!Invalid && "Cannot get file/offset");
 233     return std::make_unique<clang::Lexer>(
 234         SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
 235         File.data() + FileOffset.second, File.end());
 236   }();
 237
 238   // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
 239   Lexer->SetKeepWhitespaceMode(true);
 240
 241   // Generally, the code we want to include looks like this ([] are optional),
 242   // If Terminators is empty:
 243   //   [ <comment> ] [ <newline> ]
 244   // Otherwise:
 245   //   ... <terminator> [ <comment> ] [ <newline> ]
 246
 247   Token Tok;
 248   bool Terminated = false;
 249
 250   // First, lex to the current token (which is the last token of the range that
 251   // is definitely associated with the decl). Then, we process the first token
 252   // separately from the rest based on conditions that hold specifically for
 253   // that first token.
 254   //
 255   // We do not search for a terminator if none is required or we've already
 256   // encountered it. Otherwise, if the original `EntityLast` location was in a
 257   // macro expansion, we don't have visibility into the text, so we assume we've
 258   // already terminated. However, we note this assumption with
 259   // `TerminatedByMacro`, because we'll want to handle it somewhat differently
 260   // for the terminators semicolon and comma. These terminators can be safely
 261   // associated with the entity when they appear after the macro -- extra
 262   // semicolons have no effect on the program and a well-formed program won't
 263   // have multiple commas in a row, so we're guaranteed that there is only one.
 264   //
 265   // FIXME: This handling of macros is more conservative than necessary. When
 266   // the end of the expansion coincides with the end of the node, we can still
 267   // safely analyze the code. But, it is more complicated, because we need to
 268   // start by lexing the spelling loc for the first token and then switch to the
 269   // expansion loc.
 270   bool TerminatedByMacro = false;
 271   Lexer->LexFromRawLexer(Tok);
 272   if (Terminators.empty() || contains(Terminators, Tok))
 273     Terminated = true;
 274   else if (EntityLast.isMacroID()) {
 275     Terminated = true;
 276     TerminatedByMacro = true;
 277   }
 278
 279   // We save the most recent candidate for the exclusive end location.
 280   SourceLocation End = Tok.getEndLoc();
 281
 282   while (!Terminated) {
 283     // Lex the next token we want to possibly expand the range with.
 284     Lexer->LexFromRawLexer(Tok);
 285
 286     switch (Tok.getKind()) {
 287     case tok::eof:
 288     // Unexpected separators.
 289     case tok::l_brace:
 290     case tok::r_brace:
 291     case tok::comma:
 292       return End;
 293     // Whitespace pseudo-tokens.
 294     case tok::unknown:
 295       if (startsWithNewline(SM, Tok))
 296         // Include at least until the end of the line.
 297         End = Tok.getEndLoc();
 298       break;
 299     default:
 300       if (contains(Terminators, Tok))
 301         Terminated = true;
 302       End = Tok.getEndLoc();
 303       break;
 304     }
 305   }
 306
 307   do {
 308     // Lex the next token we want to possibly expand the range with.
 309     Lexer->LexFromRawLexer(Tok);
 310
 311     switch (Tok.getKind()) {
 312     case tok::unknown:
 313       if (startsWithNewline(SM, Tok))
 314         // We're done, but include this newline.
 315         return Tok.getEndLoc();
 316       break;
 317     case tok::comment:
 318       // Include any comments we find on the way.
 319       End = Tok.getEndLoc();
 320       break;
 321     case tok::semi:
 322     case tok::comma:
 323       if (TerminatedByMacro && contains(Terminators, Tok)) {
 324         End = Tok.getEndLoc();
 325         // We've found a real terminator.
 326         TerminatedByMacro = false;
 327         break;
 328       }
 329       // Found an unrelated token; stop and don't include it.
 330       return End;
 331     default:
 332       // Found an unrelated token; stop and don't include it.
 333       return End;
 334     }
 335   } while (true);
 336 }
 337
 338 // Returns the expected terminator tokens for the given declaration.
 339 //
 340 // If we do not know the correct terminator token, returns an empty set.
 341 //
 342 // There are cases where we have more than one possible terminator (for example,
 343 // we find either a comma or a semicolon after a VarDecl).
 344 static std::set<tok::TokenKind> getTerminators(const Decl &D) {
 345   if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
 346     return {tok::semi};
 347
 348   if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
 349     return {tok::r_brace, tok::semi};
 350
 351   if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
 352     return {tok::comma, tok::semi};
 353
 354   return {};
 355 }
 356
 357 // Starting from `Loc`, skips whitespace up to, and including, a single
 358 // newline. Returns the (exclusive) end of any skipped whitespace (that is, the
 359 // location immediately after the whitespace).
 360 static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,
 361                                                SourceLocation Loc,
 362                                                const LangOptions &LangOpts) {
 363   const char *LocChars = SM.getCharacterData(Loc);
 364   int i = 0;
 365   while (isHorizontalWhitespace(LocChars[i]))
 366     ++i;
 367   if (isVerticalWhitespace(LocChars[i]))
 368     ++i;
 369   return Loc.getLocWithOffset(i);
 370 }
 371
 372 // Is `Loc` separated from any following decl by something meaningful (e.g. an
 373 // empty line, a comment), ignoring horizontal whitespace?  Since this is a
 374 // heuristic, we return false when in doubt.  `Loc` cannot be the first location
 375 // in the file.
 376 static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,
 377                                  const LangOptions &LangOpts) {
 378   // If the preceding character is a newline, we'll check for an empty line as a
 379   // separator. However, we can't identify an empty line using tokens, so we
 380   // analyse the characters. If we try to use tokens, we'll just end up with a
 381   // whitespace token, whose characters we'd have to analyse anyhow.
 382   bool Invalid = false;
 383   const char *LocChars =
 384       SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
 385   assert(!Invalid &&
 386          "Loc must be a valid character and not the first of the source file.");
 387   if (isVerticalWhitespace(LocChars[0])) {
 388     for (int i = 1; isWhitespace(LocChars[i]); ++i)
 389       if (isVerticalWhitespace(LocChars[i]))
 390         return true;
 391   }
 392   // We didn't find an empty line, so lex the next token, skipping past any
 393   // whitespace we just scanned.
 394   Token Tok;
 395   bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
 396                                    /*IgnoreWhiteSpace=*/true);
 397   if (Failed)
 398     // Any text that confuses the lexer seems fair to consider a separation.
 399     return true;
 400
 401   switch (Tok.getKind()) {
 402   case tok::comment:
 403   case tok::l_brace:
 404   case tok::r_brace:
 405   case tok::eof:
 406     return true;
 407   default:
 408     return false;
 409   }
 410 }
 411
 412 CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
 413                                             ASTContext &Context) {
 414   const SourceManager &SM = Context.getSourceManager();
 415   const LangOptions &LangOpts = Context.getLangOpts();
 416   CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());
 417
 418   // First, expand to the start of the template<> declaration if necessary.
 419   if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
 420     if (const auto *T = Record->getDescribedClassTemplate())
 421       if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
 422         Range.setBegin(T->getBeginLoc());
 423   } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
 424     if (const auto *T = F->getDescribedFunctionTemplate())
 425       if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
 426         Range.setBegin(T->getBeginLoc());
 427   }
 428
 429   // Next, expand the end location past trailing comments to include a potential
 430   // newline at the end of the decl's line.
 431   Range.setEnd(
 432       getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
 433   Range.setTokenRange(false);
 434
 435   // Expand to include preceeding associated comments. We ignore any comments
 436   // that are not preceeding the decl, since we've already skipped trailing
 437   // comments with getEntityEndLoc.
 438   if (const RawComment *Comment =
 439           Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
 440     // Only include a preceding comment if:
 441     // * it is *not* separate from the declaration (not including any newline
 442     //   that immediately follows the comment),
 443     // * the decl *is* separate from any following entity (so, there are no
 444     //   other entities the comment could refer to), and
 445     // * it is not a IfThisThenThat lint check.
 446     if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
 447                                      Range.getBegin()) &&
 448         !atOrBeforeSeparation(
 449             SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
 450             LangOpts) &&
 451         atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
 452       const StringRef CommentText = Comment->getRawText(SM);
 453       if (!CommentText.contains("LINT.IfChange") &&
 454           !CommentText.contains("LINT.ThenChange"))
 455         Range.setBegin(Comment->getBeginLoc());
 456     }
 457   // Add leading attributes.
 458   for (auto *Attr : Decl.attrs()) {
 459     if (Attr->getLocation().isInvalid() ||
 460         !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
 461       continue;
 462     Range.setBegin(Attr->getLocation());
 463
 464     // Extend to the left '[[' or '__attribute((' if we saw the attribute,
 465     // unless it is not a valid location.
 466     bool Invalid;
 467     StringRef Source =
 468         SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
 469     if (Invalid)
 470       continue;
 471     llvm::StringRef BeforeAttr =
 472         Source.substr(0, SM.getFileOffset(Range.getBegin()));
 473     llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();
 474
 475     for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
 476       // Handle whitespace between attribute prefix and attribute value.
 477       if (BeforeAttrStripped.ends_with(Prefix)) {
 478         // Move start to start position of prefix, which is
 479         // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
 480         // positions to the left.
 481         Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
 482             -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
 483         break;
 484         // If we didn't see '[[' or '__attribute' it's probably coming from a
 485         // macro expansion which is already handled by makeFileCharRange(),
 486         // below.
 487       }
 488     }
 489   }
 490
 491   // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
 492   // Range.getBegin() may be inside an expansion.
 493   return Lexer::makeFileCharRange(Range, SM, LangOpts);
 494 }