1 //===--- Iterator.h - Query Symbol Retrieval --------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// Symbol index queries consist of specific requirements for the requested
11 /// symbol, such as high fuzzy matching score, scope, type etc. The lists of all
12 /// symbols matching some criteria (e.g. belonging to "clang::clangd::" scope)
13 /// are expressed in a form of Search Tokens which are stored in the inverted
14 /// index. Inverted index maps these tokens to the posting lists - sorted (by
15 /// symbol quality) sequences of symbol IDs matching the token, e.g. scope token
16 /// "clangd::clangd::" is mapped to the list of IDs of all symbols which are
17 /// declared in this namespace. Search queries are build from a set of
18 /// requirements which can be combined with each other forming the query trees.
19 /// The leafs of such trees are posting lists, and the nodes are operations on
20 /// these posting lists, e.g. intersection or union. Efficient processing of
21 /// these multi-level queries is handled by Iterators. Iterators advance through
22 /// all leaf posting lists producing the result of search query, which preserves
23 /// the sorted order of IDs. Having the resulting IDs sorted is important,
24 /// because it allows receiving a certain number of the most valuable items
25 /// (e.g. symbols with highest quality which was the sorting key in the first
26 /// place) without processing all items with requested properties (this might
27 /// not be computationally effective if search request is not very restrictive).
29 //===----------------------------------------------------------------------===//
31 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
32 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
34 #include "llvm/Support/raw_ostream.h"
43 /// Symbol position in the list of all index symbols sorted by a pre-computed
45 using DocID
= uint32_t;
47 /// Iterator is the interface for Query Tree node. The simplest type of Iterator
48 /// is DocumentIterator which is simply a wrapper around PostingList iterator
49 /// and serves as the Query Tree leaf. More sophisticated examples of iterators
50 /// can manage intersection, union of the elements produced by other iterators
51 /// (their children) to form a multi-level Query Tree. The interface is designed
52 /// to be extensible in order to support multiple types of iterators.
55 /// Returns true if all valid DocIDs were processed and hence the iterator is
57 virtual bool reachedEnd() const = 0;
58 /// Moves to next valid DocID. If it doesn't exist, the iterator is exhausted
59 /// and proceeds to the END.
61 /// Note: reachedEnd() must be false.
62 virtual void advance() = 0;
63 /// Moves to the first valid DocID which is equal or higher than given ID. If
64 /// it doesn't exist, the iterator is exhausted and proceeds to the END.
66 /// Note: reachedEnd() must be false.
67 virtual void advanceTo(DocID ID
) = 0;
68 /// Returns the current element this iterator points to.
70 /// Note: reachedEnd() must be false.
71 virtual DocID
peek() const = 0;
72 /// Informs the iterator that the current document was consumed, and returns
75 /// Note: If this iterator has any child iterators that contain the document,
76 /// consume() should be called on those and their boosts incorporated.
77 /// consume() must *not* be called on children that don't contain the current
79 virtual float consume() = 0;
80 /// Returns an estimate of advance() calls before the iterator is exhausted.
81 virtual size_t estimateSize() const = 0;
83 virtual ~Iterator() {}
85 /// Prints a convenient human-readable iterator representation by recursively
86 /// dumping iterators in the following format:
88 /// (Type Child1 Child2 ...)
90 /// Where Type is the iterator type representation: "&" for And, "|" for Or,
91 /// ChildN is N-th iterator child. Raw iterators over PostingList are
92 /// represented as "[... CurID ...]" where CurID is the current PostingList
93 /// entry being inspected.
94 friend llvm::raw_ostream
&operator<<(llvm::raw_ostream
&OS
,
95 const Iterator
&Iterator
) {
96 return Iterator
.dump(OS
);
99 /// Inspect iterator type, used internally for optimizing query trees.
100 enum class Kind
{ And
, Or
, True
, False
, Other
};
101 Kind
kind() const { return MyKind
; }
104 Iterator(Kind MyKind
= Kind::Other
) : MyKind(MyKind
) {}
107 virtual llvm::raw_ostream
&dump(llvm::raw_ostream
&OS
) const = 0;
111 /// Advances the iterator until it is exhausted. Returns pairs of document IDs
112 /// with the corresponding boosting score.
114 /// Boosting can be seen as a compromise between retrieving too many items and
115 /// calculating finals score for each of them (which might be very expensive)
116 /// and not retrieving enough items so that items with very high final score
117 /// would not be processed. Boosting score is a computationally efficient way
118 /// to acquire preliminary scores of requested items.
119 std::vector
<std::pair
<DocID
, float>> consume(Iterator
&It
);
122 // Variadic template machinery.
123 inline void populateChildren(std::vector
<std::unique_ptr
<Iterator
>> &) {}
124 template <typename
... TailT
>
125 void populateChildren(std::vector
<std::unique_ptr
<Iterator
>> &Children
,
126 std::unique_ptr
<Iterator
> Head
, TailT
... Tail
) {
127 Children
.push_back(std::move(Head
));
128 populateChildren(Children
, std::move(Tail
)...);
130 } // namespace detail
132 // A corpus is a set of documents, and a factory for iterators over them.
137 explicit Corpus(DocID Size
) : Size(Size
) {}
139 /// Returns AND Iterator which performs the intersection of the PostingLists
142 /// consume(): AND Iterator returns the product of Childrens' boosting
144 std::unique_ptr
<Iterator
>
145 intersect(std::vector
<std::unique_ptr
<Iterator
>> Children
) const;
147 /// Returns OR Iterator which performs the union of the PostingLists of its
150 /// consume(): OR Iterator returns the highest boost value among children
151 /// containing the requested item.
152 std::unique_ptr
<Iterator
>
153 unionOf(std::vector
<std::unique_ptr
<Iterator
>> Children
) const;
155 /// Returns TRUE Iterator which iterates over "virtual" PostingList
156 /// containing all items in range [0, Size) in an efficient manner.
157 std::unique_ptr
<Iterator
> all() const;
159 /// Returns FALSE Iterator which iterates over no documents.
160 std::unique_ptr
<Iterator
> none() const;
162 /// Returns BOOST iterator which multiplies the score of each item by given
163 /// factor. Boosting can be used as a computationally inexpensive filtering.
164 /// Users can return significantly more items using consumeAndBoost() and
165 /// then trim Top K using retrieval score.
166 std::unique_ptr
<Iterator
> boost(std::unique_ptr
<Iterator
> Child
,
169 /// Returns LIMIT iterator, which yields up to N elements of its child
170 /// iterator. Elements only count towards the limit if they are part of the
171 /// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2)
172 /// 1)) yields (2), not ().
173 std::unique_ptr
<Iterator
> limit(std::unique_ptr
<Iterator
> Child
,
176 /// This allows intersect(create(...), create(...)) syntax.
177 template <typename
... Args
>
178 std::unique_ptr
<Iterator
> intersect(Args
... args
) const {
179 std::vector
<std::unique_ptr
<Iterator
>> Children
;
180 detail::populateChildren(Children
, std::forward
<Args
>(args
)...);
181 return intersect(std::move(Children
));
184 /// This allows unionOf(create(...), create(...)) syntax.
185 template <typename
... Args
>
186 std::unique_ptr
<Iterator
> unionOf(Args
... args
) const {
187 std::vector
<std::unique_ptr
<Iterator
>> Children
;
188 detail::populateChildren(Children
, std::forward
<Args
>(args
)...);
189 return unionOf(std::move(Children
));
194 } // namespace clangd
197 #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H