1 //===--- SourceCode.h - Manipulating source code as strings -----*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // Various code that examines C++ source code without using heavy AST machinery
10 // (and often not even the lexer). To be used sparingly!
12 //===----------------------------------------------------------------------===//
13 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H
14 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H
17 #include "support/Context.h"
18 #include "support/ThreadsafeFS.h"
19 #include "clang/Basic/CharInfo.h"
20 #include "clang/Basic/Diagnostic.h"
21 #include "clang/Basic/LangOptions.h"
22 #include "clang/Basic/SourceLocation.h"
23 #include "clang/Basic/SourceManager.h"
24 #include "clang/Format/Format.h"
25 #include "clang/Lex/HeaderSearch.h"
26 #include "clang/Tooling/Core/Replacement.h"
27 #include "clang/Tooling/Syntax/Tokens.h"
28 #include "llvm/ADT/StringRef.h"
29 #include "llvm/ADT/StringSet.h"
30 #include "llvm/Support/Error.h"
39 // We tend to generate digests for source code in a lot of different places.
40 // This represents the type for those digests to prevent us hard-coding details
41 // of the hashing function at every place that needs to store this information.
42 using FileDigest
= std::array
<uint8_t, 8>;
43 FileDigest
digest(StringRef Content
);
44 std::optional
<FileDigest
> digestFile(const SourceManager
&SM
, FileID FID
);
46 // This context variable controls the behavior of functions in this file
47 // that convert between LSP offsets and native clang byte offsets.
48 // If not set, defaults to UTF-16 for backwards-compatibility.
49 extern Key
<OffsetEncoding
> kCurrentOffsetEncoding
;
51 // Counts the number of UTF-16 code units needed to represent a string (LSP
52 // specifies string lengths in UTF-16 code units).
53 // Use of UTF-16 may be overridden by kCurrentOffsetEncoding.
54 size_t lspLength(StringRef Code
);
56 /// Turn a [line, column] pair into an offset in Code.
58 /// If P.character exceeds the line length, returns the offset at end-of-line.
59 /// (If !AllowColumnsBeyondLineLength, then returns an error instead).
60 /// If the line number is out of range, returns an error.
62 /// The returned value is in the range [0, Code.size()].
63 llvm::Expected
<size_t>
64 positionToOffset(llvm::StringRef Code
, Position P
,
65 bool AllowColumnsBeyondLineLength
= true);
67 /// Turn an offset in Code into a [line, column] pair.
68 /// The offset must be in range [0, Code.size()].
69 Position
offsetToPosition(llvm::StringRef Code
, size_t Offset
);
71 /// Turn a SourceLocation into a [line, column] pair.
72 /// FIXME: This should return an error if the location is invalid.
73 Position
sourceLocToPosition(const SourceManager
&SM
, SourceLocation Loc
);
75 /// Return the file location, corresponding to \p P. Note that one should take
76 /// care to avoid comparing the result with expansion locations.
77 llvm::Expected
<SourceLocation
> sourceLocationInMainFile(const SourceManager
&SM
,
80 /// Returns true iff \p Loc is inside the main file. This function handles
81 /// file & macro locations. For macro locations, returns true iff the macro is being
82 /// expanded inside the main file.
84 /// The function is usually used to check whether a declaration is inside the
86 bool isInsideMainFile(SourceLocation Loc
, const SourceManager
&SM
);
88 /// Returns the #include location through which IncludedFile was loaded.
89 /// Where SM.getIncludeLoc() returns the location of the *filename*, which may
90 /// be in a macro, includeHashLoc() returns the location of the #.
91 SourceLocation
includeHashLoc(FileID IncludedFile
, const SourceManager
&SM
);
93 /// Returns true if the token at Loc is spelled in the source code.
94 /// This is not the case for:
95 /// * symbols formed via macro concatenation, the spelling location will
96 /// be "<scratch space>"
97 /// * symbols controlled and defined by a compile command-line option
98 /// `-DName=foo`, the spelling location will be "<command line>".
99 bool isSpelledInSource(SourceLocation Loc
, const SourceManager
&SM
);
101 /// Turns a token range into a half-open range and checks its correctness.
102 /// The resulting range will have only valid source location on both sides, both
103 /// of which are file locations.
105 /// File locations always point to a particular offset in a file, i.e. they
106 /// never refer to a location inside a macro expansion. Turning locations from
107 /// macro expansions into file locations is ambiguous - one can use
108 /// SourceManager::{getExpansion|getFile|getSpelling}Loc. This function
109 /// calls SourceManager::getFileLoc on both ends of \p R to do the conversion.
111 /// User input (e.g. cursor position) is expressed as a file location, so this
112 /// function can be viewed as a way to normalize the ranges used in the clang
113 /// AST so that they are comparable with ranges coming from the user input.
114 std::optional
<SourceRange
> toHalfOpenFileRange(const SourceManager
&Mgr
,
115 const LangOptions
&LangOpts
,
118 /// Returns true iff all of the following conditions hold:
119 /// - start and end locations are valid,
120 /// - start and end locations are file locations from the same file
121 /// (i.e. expansion locations are not taken into account).
122 /// - start offset <= end offset.
123 /// FIXME: introduce a type for source range with this invariant.
124 bool isValidFileRange(const SourceManager
&Mgr
, SourceRange R
);
126 /// Returns the source code covered by the source range.
127 /// EXPECTS: isValidFileRange(R) == true.
128 llvm::StringRef
toSourceCode(const SourceManager
&SM
, SourceRange R
);
130 // Converts a half-open clang source range to an LSP range.
131 // Note that clang also uses closed source ranges, which this can't handle!
132 Range
halfOpenToRange(const SourceManager
&SM
, CharSourceRange R
);
134 // Expand range `A` to also contain `B`.
135 void unionRanges(Range
&A
, Range B
);
137 // Converts an offset to a clang line/column (1-based, columns are bytes).
138 // The offset must be in range [0, Code.size()].
139 // Prefer to use SourceManager if one is available.
140 std::pair
<size_t, size_t> offsetToClangLineColumn(llvm::StringRef Code
,
143 /// From "a::b::c", return {"a::b::", "c"}. Scope is empty if there's no
145 std::pair
<llvm::StringRef
, llvm::StringRef
>
146 splitQualifiedName(llvm::StringRef QName
);
148 TextEdit
replacementToEdit(StringRef Code
, const tooling::Replacement
&R
);
150 std::vector
<TextEdit
> replacementsToEdits(StringRef Code
,
151 const tooling::Replacements
&Repls
);
153 TextEdit
toTextEdit(const FixItHint
&FixIt
, const SourceManager
&M
,
154 const LangOptions
&L
);
156 /// Get the canonical path of \p F. This means:
159 /// - Symlinks resolved
160 /// - No "." or ".." component
161 /// - No duplicate or trailing directory separator
163 /// This function should be used when paths need to be used outside the
164 /// component that generates it, so that paths are normalized as much as
166 std::optional
<std::string
> getCanonicalPath(const FileEntryRef F
,
167 FileManager
&FileMgr
);
169 /// Choose the clang-format style we should apply to a certain file.
170 /// This will usually use FS to look for .clang-format directories.
171 /// FIXME: should we be caching the .clang-format file search?
172 /// This uses format::DefaultFormatStyle and format::DefaultFallbackStyle,
173 /// though the latter may have been overridden in main()!
174 /// \p FormatFile indicates whether the returned FormatStyle is used
175 /// to format the entire main file (or a range selected by the user
176 /// which can be arbitrarily long).
177 format::FormatStyle
getFormatStyleForFile(llvm::StringRef File
,
178 llvm::StringRef Content
,
179 const ThreadsafeFS
&TFS
,
182 /// Cleanup and format the given replacements.
183 llvm::Expected
<tooling::Replacements
>
184 cleanupAndFormat(StringRef Code
, const tooling::Replacements
&Replaces
,
185 const format::FormatStyle
&Style
);
187 /// A set of edits generated for a single file. Can verify whether it is safe to
188 /// apply these edits to a code block.
190 tooling::Replacements Replacements
;
191 std::string InitialCode
;
195 Edit(llvm::StringRef Code
, tooling::Replacements Reps
)
196 : Replacements(std::move(Reps
)), InitialCode(Code
) {}
198 /// Returns the file contents after changes are applied.
199 llvm::Expected
<std::string
> apply() const;
201 /// Represents Replacements as TextEdits that are available for use in LSP.
202 std::vector
<TextEdit
> asTextEdits() const;
204 /// Checks whether the Replacements are applicable to given Code.
205 bool canApplyTo(llvm::StringRef Code
) const;
207 /// A mapping from absolute file path (the one used for accessing the underlying
209 using FileEdits
= llvm::StringMap
<Edit
>;
211 /// Formats the edits and code around it according to Style. Changes
212 /// Replacements to formatted ones if succeeds.
213 llvm::Error
reformatEdit(Edit
&E
, const format::FormatStyle
&Style
);
215 /// Apply an incremental update to a text document.
216 llvm::Error
applyChange(std::string
&Contents
,
217 const TextDocumentContentChangeEvent
&Change
);
219 /// Collects identifiers with counts in the source code.
220 llvm::StringMap
<unsigned> collectIdentifiers(llvm::StringRef Content
,
221 const format::FormatStyle
&Style
);
223 /// Collects all ranges of the given identifier in the source code.
224 std::vector
<Range
> collectIdentifierRanges(llvm::StringRef Identifier
,
225 llvm::StringRef Content
,
226 const LangOptions
&LangOpts
);
228 /// Collects words from the source code.
229 /// Unlike collectIdentifiers:
230 /// - also finds text in comments:
231 /// - splits text into words
232 /// - drops stopwords like "get" and "for"
233 llvm::StringSet
<> collectWords(llvm::StringRef Content
);
235 // Something that looks like a word in the source code.
236 // Could be a "real" token that's "live" in the AST, a spelled token consumed by
237 // the preprocessor, or part of a spelled token (e.g. word in a comment).
239 // (Spelling) location of the start of the word.
240 SourceLocation Location
;
241 // The range of the word itself, excluding any quotes.
242 // This is a subrange of the file buffer.
243 llvm::StringRef Text
;
244 // Whether this word is likely to refer to an identifier. True if:
245 // - the word is a spelled identifier token
246 // - Text is identifier-like (e.g. "foo_bar")
247 // - Text is surrounded by backticks (e.g. Foo in "// returns `Foo`")
248 bool LikelyIdentifier
= false;
249 // Set if the word is contained in a token spelled in the file.
250 // (This should always be true, but comments aren't retained by TokenBuffer).
251 const syntax::Token
*PartOfSpelledToken
= nullptr;
252 // Set if the word is exactly a token spelled in the file.
253 const syntax::Token
*SpelledToken
= nullptr;
254 // Set if the word is a token spelled in the file, and that token survives
255 // preprocessing to emit an expanded token spelled the same way.
256 const syntax::Token
*ExpandedToken
= nullptr;
258 // Find the unique word that contains SpelledLoc or starts/ends there.
259 static std::optional
<SpelledWord
> touching(SourceLocation SpelledLoc
,
260 const syntax::TokenBuffer
&TB
,
261 const LangOptions
&LangOpts
);
264 /// Return true if the \p TokenName is in the list of reserved keywords of the
266 bool isKeyword(llvm::StringRef TokenName
, const LangOptions
&LangOpts
);
268 /// Heuristically determine namespaces visible at a point, without parsing Code.
269 /// This considers using-directives and enclosing namespace-declarations that
270 /// are visible (and not obfuscated) in the file itself (not headers).
271 /// Code should be truncated at the point of interest.
273 /// The returned vector is always non-empty.
274 /// - The first element is the namespace that encloses the point: a declaration
275 /// near the point would be within this namespace.
276 /// - The elements are the namespaces in scope at the point: an unqualified
277 /// lookup would search within these namespaces.
279 /// Using directives are resolved against all enclosing scopes, but no other
280 /// namespace directives.
283 /// using namespace a;
285 /// using namespace b;
287 /// visibleNamespaces are {"foo::", "", "a::", "b::", "foo::b::"}, not "a::b::".
288 std::vector
<std::string
> visibleNamespaces(llvm::StringRef Code
,
289 const LangOptions
&LangOpts
);
291 /// Represents locations that can accept a definition.
292 struct EligibleRegion
{
293 /// Namespace that owns all of the EligiblePoints, e.g.
294 /// namespace a{ namespace b {^ void foo();^} }
295 /// It will be "a::b" for both caret (^) locations.
296 std::string EnclosingNamespace
;
297 /// Positions in the code marking eligible points to insert a definition.
299 std::vector
<Position
> EligiblePoints
;
302 /// Returns most eligible region to insert a definition for \p
303 /// FullyQualifiedName in the \p Code.
304 /// Pseudo parses \p Code under the hood to determine namespace decls and
305 /// possible insertion points. Chooses the region that matches the longest prefix
306 /// of \p FullyQualifiedName. Returns EOF if there are no shared namespaces.
307 /// \p FullyQualifiedName should not contain anonymous namespaces.
308 EligibleRegion
getEligiblePoints(llvm::StringRef Code
,
309 llvm::StringRef FullyQualifiedName
,
310 const LangOptions
&LangOpts
);
/// Bundles what is known about a macro: its name, its MacroInfo, and the
/// location of the identifier that names it.
312 struct DefinedMacro
{
/// Name of the macro.
313 llvm::StringRef Name
;
/// The MacroInfo associated with the macro.
314 const MacroInfo
*Info
;
315 /// Location of the identifier that names the macro.
316 /// Unlike Info->Location, this translates preamble-patch locations to
317 /// main-file locations.
318 SourceLocation NameLoc
;
320 /// Gets the macro referenced by \p SpelledTok. It must be a spelled token
321 /// aligned to the beginning of an identifier.
322 std::optional
<DefinedMacro
> locateMacroAt(const syntax::Token
&SpelledTok
,
325 /// Infers whether this is a header from the FileName and LangOpts (if
327 bool isHeaderFile(llvm::StringRef FileName
,
328 std::optional
<LangOptions
> LangOpts
= std::nullopt
);
330 /// Returns true if the given location is in a generated protobuf file.
331 bool isProtoFile(SourceLocation Loc
, const SourceManager
&SourceMgr
);
333 /// Returns true if Name is reserved, like _Foo or __Vector_base.
/// Concretely: Name has at least two characters, starts with '_', and its
/// second character either satisfies isUppercase() or is another '_'.
334 inline bool isReservedName(llvm::StringRef Name
) {
335 // This doesn't catch all cases, but the most common.
// The size check comes first so that Name[0] and Name[1] are only read when
// both characters exist.
336 return Name
.size() >= 2 && Name
[0] == '_' &&
337 (isUppercase(Name
[1]) || Name
[1] == '_');
340 /// Translates locations inside preamble patch to their main-file equivalent
341 /// using presumed locations. Returns \p Loc if it isn't inside preamble patch.
342 SourceLocation
translatePreamblePatchLocation(SourceLocation Loc
,
343 const SourceManager
&SM
);
345 /// Returns the range starting at offset and spanning the whole line. Escaped
346 /// newlines are not handled.
347 clangd::Range
rangeTillEOL(llvm::StringRef Code
, unsigned HashOffset
);
348 } // namespace clangd