1 //===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 #include "FileDistance.h"
11 #include "FuzzyMatch.h"
14 #include "index/Index.h"
15 #include "index/dex/Iterator.h"
16 #include "index/dex/Token.h"
17 #include "index/dex/Trigram.h"
18 #include "support/Logger.h"
19 #include "support/Trace.h"
20 #include "llvm/ADT/DenseMap.h"
21 #include "llvm/ADT/StringMap.h"
22 #include "llvm/ADT/StringSet.h"
23 #include "llvm/Support/Path.h"
24 #include "llvm/Support/ScopedPrinter.h"
35 std::unique_ptr
<SymbolIndex
> Dex::build(SymbolSlab Symbols
, RefSlab Refs
,
37 auto Size
= Symbols
.bytes() + Refs
.bytes();
38 // There is no need to include "Rels" in Data because the relations are self-
39 // contained, without references into a backing store.
40 auto Data
= std::make_pair(std::move(Symbols
), std::move(Refs
));
41 return std::make_unique
<Dex
>(Data
.first
, Data
.second
, Rels
, std::move(Data
),
47 // Mark symbols which are can be used for code completion.
48 const Token RestrictedForCodeCompletion
=
49 Token(Token::Kind::Sentinel
, "Restricted For Code Completion");
51 // Helper to efficiently assemble the inverse index (token -> matching docs).
52 // The output is a nice uniform structure keyed on Token, but constructing
53 // the Token object every time we want to insert into the map is wasteful.
54 // Instead we have various maps keyed on things that are cheap to compute,
55 // and produce the Token keys once at the end.
57 llvm::DenseMap
<Trigram
, std::vector
<DocID
>> TrigramDocs
;
58 std::vector
<DocID
> RestrictedCCDocs
;
59 llvm::StringMap
<std::vector
<DocID
>> TypeDocs
;
60 llvm::StringMap
<std::vector
<DocID
>> ScopeDocs
;
61 llvm::StringMap
<std::vector
<DocID
>> ProximityDocs
;
62 std::vector
<Trigram
> TrigramScratch
;
65 // Add the tokens which are given symbol's characteristics.
66 // This includes fuzzy matching trigrams, symbol's scope, etc.
67 // FIXME(kbobyrev): Support more token types:
68 // * Namespace proximity
69 void add(const Symbol
&Sym
, DocID D
) {
70 generateIdentifierTrigrams(Sym
.Name
, TrigramScratch
);
71 for (Trigram T
: TrigramScratch
)
72 TrigramDocs
[T
].push_back(D
);
73 ScopeDocs
[Sym
.Scope
].push_back(D
);
74 if (!llvm::StringRef(Sym
.CanonicalDeclaration
.FileURI
).empty())
75 for (const auto &ProximityURI
:
76 generateProximityURIs(Sym
.CanonicalDeclaration
.FileURI
))
77 ProximityDocs
[ProximityURI
].push_back(D
);
78 if (Sym
.Flags
& Symbol::IndexedForCodeCompletion
)
79 RestrictedCCDocs
.push_back(D
);
80 if (!Sym
.Type
.empty())
81 TypeDocs
[Sym
.Type
].push_back(D
);
84 // Assemble the final compressed posting lists for the added symbols.
85 llvm::DenseMap
<Token
, PostingList
> build() && {
86 llvm::DenseMap
<Token
, PostingList
> Result(/*InitialReserve=*/
88 RestrictedCCDocs
.size() +
91 ProximityDocs
.size());
92 // Tear down intermediate structs as we go to reduce memory usage.
93 // Since we're trying to get rid of underlying allocations, clearing the
94 // containers is not enough.
95 auto CreatePostingList
=
96 [&Result
](Token::Kind TK
, llvm::StringMap
<std::vector
<DocID
>> &Docs
) {
97 for (auto &E
: Docs
) {
98 Result
.try_emplace(Token(TK
, E
.first()), E
.second
);
103 CreatePostingList(Token::Kind::Type
, TypeDocs
);
104 CreatePostingList(Token::Kind::Scope
, ScopeDocs
);
105 CreatePostingList(Token::Kind::ProximityURI
, ProximityDocs
);
107 // TrigramDocs are stored in a DenseMap and RestrictedCCDocs is not even a
108 // map, treat them specially.
109 for (auto &E
: TrigramDocs
) {
110 Result
.try_emplace(Token(Token::Kind::Trigram
, E
.first
.str()), E
.second
);
113 TrigramDocs
= llvm::DenseMap
<Trigram
, std::vector
<DocID
>>{};
114 if (!RestrictedCCDocs
.empty())
115 Result
.try_emplace(RestrictedForCodeCompletion
,
116 std::move(RestrictedCCDocs
));
123 void Dex::buildIndex() {
124 this->Corpus
= dex::Corpus(Symbols
.size());
125 std::vector
<std::pair
<float, const Symbol
*>> ScoredSymbols(Symbols
.size());
127 for (size_t I
= 0; I
< Symbols
.size(); ++I
) {
128 const Symbol
*Sym
= Symbols
[I
];
129 LookupTable
[Sym
->ID
] = Sym
;
130 ScoredSymbols
[I
] = {quality(*Sym
), Sym
};
133 // Symbols are sorted by symbol qualities so that items in the posting lists
134 // are stored in the descending order of symbol quality.
135 llvm::sort(ScoredSymbols
, std::greater
<std::pair
<float, const Symbol
*>>());
137 // SymbolQuality was empty up until now.
138 SymbolQuality
.resize(Symbols
.size());
139 // Populate internal storage using Symbol + Score pairs.
140 for (size_t I
= 0; I
< ScoredSymbols
.size(); ++I
) {
141 SymbolQuality
[I
] = ScoredSymbols
[I
].first
;
142 Symbols
[I
] = ScoredSymbols
[I
].second
;
145 // Build posting lists for symbols.
146 IndexBuilder Builder
;
147 for (DocID SymbolRank
= 0; SymbolRank
< Symbols
.size(); ++SymbolRank
)
148 Builder
.add(*Symbols
[SymbolRank
], SymbolRank
);
149 InvertedIndex
= std::move(Builder
).build();
152 std::unique_ptr
<Iterator
> Dex::iterator(const Token
&Tok
) const {
153 auto It
= InvertedIndex
.find(Tok
);
154 return It
== InvertedIndex
.end() ? Corpus
.none()
155 : It
->second
.iterator(&It
->first
);
158 // Constructs BOOST iterators for Path Proximities.
159 std::unique_ptr
<Iterator
> Dex::createFileProximityIterator(
160 llvm::ArrayRef
<std::string
> ProximityPaths
) const {
161 std::vector
<std::unique_ptr
<Iterator
>> BoostingIterators
;
162 // Deduplicate parent URIs extracted from the ProximityPaths.
163 llvm::StringSet
<> ParentURIs
;
164 llvm::StringMap
<SourceParams
> Sources
;
165 for (const auto &Path
: ProximityPaths
) {
166 Sources
[Path
] = SourceParams();
167 auto PathURI
= URI::create(Path
).toString();
168 const auto PathProximityURIs
= generateProximityURIs(PathURI
.c_str());
169 for (const auto &ProximityURI
: PathProximityURIs
)
170 ParentURIs
.insert(ProximityURI
);
172 // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
173 // for all parameters except for Proximity Path distance signal.
174 SymbolRelevanceSignals PathProximitySignals
;
175 // DistanceCalculator will find the shortest distance from ProximityPaths to
176 // any URI extracted from the ProximityPaths.
177 URIDistance
DistanceCalculator(Sources
);
178 PathProximitySignals
.FileProximityMatch
= &DistanceCalculator
;
179 // Try to build BOOST iterator for each Proximity Path provided by
180 // ProximityPaths. Boosting factor should depend on the distance to the
181 // Proximity Path: the closer processed path is, the higher boosting factor.
182 for (const auto &ParentURI
: ParentURIs
.keys()) {
183 // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
184 auto It
= iterator(Token(Token::Kind::ProximityURI
, ParentURI
));
185 if (It
->kind() != Iterator::Kind::False
) {
186 PathProximitySignals
.SymbolURI
= ParentURI
;
187 BoostingIterators
.push_back(Corpus
.boost(
188 std::move(It
), PathProximitySignals
.evaluateHeuristics()));
191 BoostingIterators
.push_back(Corpus
.all());
192 return Corpus
.unionOf(std::move(BoostingIterators
));
195 // Constructs BOOST iterators for preferred types.
196 std::unique_ptr
<Iterator
>
197 Dex::createTypeBoostingIterator(llvm::ArrayRef
<std::string
> Types
) const {
198 std::vector
<std::unique_ptr
<Iterator
>> BoostingIterators
;
199 SymbolRelevanceSignals PreferredTypeSignals
;
200 PreferredTypeSignals
.TypeMatchesPreferred
= true;
201 auto Boost
= PreferredTypeSignals
.evaluateHeuristics();
202 for (const auto &T
: Types
)
203 BoostingIterators
.push_back(
204 Corpus
.boost(iterator(Token(Token::Kind::Type
, T
)), Boost
));
205 BoostingIterators
.push_back(Corpus
.all());
206 return Corpus
.unionOf(std::move(BoostingIterators
));
209 /// Constructs iterators over tokens extracted from the query and exhausts it
210 /// while applying Callback to each symbol in the order of decreasing quality
211 /// of the matched symbols.
212 bool Dex::fuzzyFind(const FuzzyFindRequest
&Req
,
213 llvm::function_ref
<void(const Symbol
&)> Callback
) const {
214 assert(!StringRef(Req
.Query
).contains("::") &&
215 "There must be no :: in query.");
216 trace::Span
Tracer("Dex fuzzyFind");
217 FuzzyMatcher
Filter(Req
.Query
);
218 // For short queries we use specialized trigrams that don't yield all results.
219 // Prevent clients from postfiltering them for longer queries.
220 bool More
= !Req
.Query
.empty() && Req
.Query
.size() < 3;
222 std::vector
<std::unique_ptr
<Iterator
>> Criteria
;
223 const auto TrigramTokens
= generateQueryTrigrams(Req
.Query
);
225 // Generate query trigrams and construct AND iterator over all query
227 std::vector
<std::unique_ptr
<Iterator
>> TrigramIterators
;
228 for (const auto &Trigram
: TrigramTokens
)
229 TrigramIterators
.push_back(iterator(Trigram
));
230 Criteria
.push_back(Corpus
.intersect(std::move(TrigramIterators
)));
232 // Generate scope tokens for search query.
233 std::vector
<std::unique_ptr
<Iterator
>> ScopeIterators
;
234 for (const auto &Scope
: Req
.Scopes
)
235 ScopeIterators
.push_back(iterator(Token(Token::Kind::Scope
, Scope
)));
237 ScopeIterators
.push_back(
238 Corpus
.boost(Corpus
.all(), ScopeIterators
.empty() ? 1.0 : 0.2));
239 Criteria
.push_back(Corpus
.unionOf(std::move(ScopeIterators
)));
241 // Add proximity paths boosting (all symbols, some boosted).
242 Criteria
.push_back(createFileProximityIterator(Req
.ProximityPaths
));
243 // Add boosting for preferred types.
244 Criteria
.push_back(createTypeBoostingIterator(Req
.PreferredTypes
));
246 if (Req
.RestrictForCodeCompletion
)
247 Criteria
.push_back(iterator(RestrictedForCodeCompletion
));
249 // Use TRUE iterator if both trigrams and scopes from the query are not
250 // present in the symbol index.
251 auto Root
= Corpus
.intersect(std::move(Criteria
));
252 // Retrieve more items than it was requested: some of the items with high
253 // final score might not be retrieved otherwise.
254 // FIXME(kbobyrev): Tune this ratio.
256 Root
= Corpus
.limit(std::move(Root
), *Req
.Limit
* 100);
257 SPAN_ATTACH(Tracer
, "query", llvm::to_string(*Root
));
258 vlog("Dex query tree: {0}", *Root
);
260 using IDAndScore
= std::pair
<DocID
, float>;
261 std::vector
<IDAndScore
> IDAndScores
= consume(*Root
);
263 auto Compare
= [](const IDAndScore
&LHS
, const IDAndScore
&RHS
) {
264 return LHS
.second
> RHS
.second
;
266 TopN
<IDAndScore
, decltype(Compare
)> Top(
267 Req
.Limit
.value_or(std::numeric_limits
<size_t>::max()), Compare
);
268 for (const auto &IDAndScore
: IDAndScores
) {
269 const DocID SymbolDocID
= IDAndScore
.first
;
270 const auto *Sym
= Symbols
[SymbolDocID
];
271 const std::optional
<float> Score
= Filter
.match(Sym
->Name
);
274 // Combine Fuzzy Matching score, precomputed symbol quality and boosting
275 // score for a cumulative final symbol score.
276 const float FinalScore
=
277 (*Score
) * SymbolQuality
[SymbolDocID
] * IDAndScore
.second
;
278 // If Top.push(...) returns true, it means that it had to pop an item. In
279 // this case, it is possible to retrieve more symbols.
280 if (Top
.push({SymbolDocID
, FinalScore
}))
284 // Apply callback to the top Req.Limit items in the descending
285 // order of cumulative score.
286 for (const auto &Item
: std::move(Top
).items())
287 Callback(*Symbols
[Item
.first
]);
291 void Dex::lookup(const LookupRequest
&Req
,
292 llvm::function_ref
<void(const Symbol
&)> Callback
) const {
293 trace::Span
Tracer("Dex lookup");
294 for (const auto &ID
: Req
.IDs
) {
295 auto I
= LookupTable
.find(ID
);
296 if (I
!= LookupTable
.end())
297 Callback(*I
->second
);
301 bool Dex::refs(const RefsRequest
&Req
,
302 llvm::function_ref
<void(const Ref
&)> Callback
) const {
303 trace::Span
Tracer("Dex refs");
304 uint32_t Remaining
= Req
.Limit
.value_or(std::numeric_limits
<uint32_t>::max());
305 for (const auto &ID
: Req
.IDs
)
306 for (const auto &Ref
: Refs
.lookup(ID
)) {
307 if (!static_cast<int>(Req
.Filter
& Ref
.Kind
))
310 return true; // More refs were available.
314 return false; // We reported all refs.
318 const RelationsRequest
&Req
,
319 llvm::function_ref
<void(const SymbolID
&, const Symbol
&)> Callback
) const {
320 trace::Span
Tracer("Dex relations");
321 uint32_t Remaining
= Req
.Limit
.value_or(std::numeric_limits
<uint32_t>::max());
322 for (const SymbolID
&Subject
: Req
.Subjects
) {
323 LookupRequest LookupReq
;
324 auto It
= Relations
.find(
325 std::make_pair(Subject
, static_cast<uint8_t>(Req
.Predicate
)));
326 if (It
!= Relations
.end()) {
327 for (const auto &Object
: It
->second
) {
330 LookupReq
.IDs
.insert(Object
);
334 lookup(LookupReq
, [&](const Symbol
&Object
) { Callback(Subject
, Object
); });
338 llvm::unique_function
<IndexContents(llvm::StringRef
) const>
339 Dex::indexedFiles() const {
340 return [this](llvm::StringRef FileURI
) {
341 return Files
.contains(FileURI
) ? IdxContents
: IndexContents::None
;
345 size_t Dex::estimateMemoryUsage() const {
346 size_t Bytes
= Symbols
.size() * sizeof(const Symbol
*);
347 Bytes
+= SymbolQuality
.size() * sizeof(float);
348 Bytes
+= LookupTable
.getMemorySize();
349 Bytes
+= InvertedIndex
.getMemorySize();
350 for (const auto &TokenToPostingList
: InvertedIndex
)
351 Bytes
+= TokenToPostingList
.second
.bytes();
352 Bytes
+= Refs
.getMemorySize();
353 Bytes
+= Relations
.getMemorySize();
354 return Bytes
+ BackingDataSize
;
357 // Given foo://bar/one/two
358 // Returns ~~~~~~~~ (or empty for bad URI)
359 llvm::StringRef
findPathInURI(llvm::StringRef S
) {
360 S
= S
.split(':').second
; // Eat scheme.
361 if (S
.consume_front("//")) // Eat authority.
362 S
= S
.drop_until([](char C
) { return C
== '/'; });
366 // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
367 // size of resulting vector. Some projects might want to have higher limit if
368 // the file hierarchy is deeper. For the generic case, it would be useful to
369 // calculate Limit in the index build stage by calculating the maximum depth
370 // of the project source tree at runtime.
371 constexpr unsigned ProximityURILimit
= 5;
373 llvm::SmallVector
<llvm::StringRef
, ProximityURILimit
>
374 generateProximityURIs(llvm::StringRef URI
) {
375 // This function is hot when indexing, so don't parse/reserialize URIPath,
376 // just emit substrings of it instead.
381 llvm::StringRef Path
= findPathInURI(URI
);
383 return {}; // Bad URI.
384 assert(Path
.begin() >= URI
.begin() && Path
.begin() < URI
.end() &&
385 Path
.end() == URI
.end());
387 // The original is a proximity URI.
388 llvm::SmallVector
<llvm::StringRef
, ProximityURILimit
> Result
= {URI
};
389 // Form proximity URIs by chopping before each slash in the path part.
390 for (auto Slash
= Path
.rfind('/'); Slash
> 0 && Slash
!= StringRef::npos
;
391 Slash
= Path
.rfind('/')) {
392 Path
= Path
.substr(0, Slash
);
393 Result
.push_back(URI
.substr(0, Path
.end() - URI
.data()));
394 if (Result
.size() == ProximityURILimit
)
397 // The root foo://bar/ is a proximity URI.
398 if (Path
.starts_with("/"))
399 Result
.push_back(URI
.substr(0, Path
.begin() + 1 - URI
.data()));
404 } // namespace clangd