1 //===--- PostingList.cpp - Symbol identifiers storage interface -----------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "PostingList.h"
10 #include "index/dex/Iterator.h"
11 #include "index/dex/Token.h"
12 #include "llvm/Support/MathExtras.h"
20 /// Implements iterator of PostingList chunks. This requires iterating over two
21 /// levels: the first level iterator iterates over the chunks and decompresses
22 /// them on-the-fly when the contents of chunk are to be seen.
23 class ChunkIterator
: public Iterator
{
25 explicit ChunkIterator(const Token
*Tok
, llvm::ArrayRef
<Chunk
> Chunks
)
26 : Tok(Tok
), Chunks(Chunks
), CurrentChunk(Chunks
.begin()) {
27 if (!Chunks
.empty()) {
28 DecompressedChunk
= CurrentChunk
->decompress();
29 CurrentID
= DecompressedChunk
.begin();
33 bool reachedEnd() const override
{ return CurrentChunk
== Chunks
.end(); }
35 /// Advances cursor to the next item.
36 void advance() override
{
37 assert(!reachedEnd() &&
38 "Posting List iterator can't advance() at the end.");
43 /// Applies binary search to advance cursor to the next item with DocID
44 /// equal or higher than the given one.
45 void advanceTo(DocID ID
) override
{
46 assert(!reachedEnd() &&
47 "Posting List iterator can't advance() at the end.");
51 // Try to find ID within current chunk.
52 CurrentID
= std::partition_point(CurrentID
, DecompressedChunk
.end(),
53 [&](const DocID D
) { return D
< ID
; });
57 DocID
peek() const override
{
58 assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
62 float consume() override
{
63 assert(!reachedEnd() &&
64 "Posting List iterator can't consume() at the end.");
68 size_t estimateSize() const override
{
69 return Chunks
.size() * ApproxEntriesPerChunk
;
73 llvm::raw_ostream
&dump(llvm::raw_ostream
&OS
) const override
{
78 for (const Chunk
&C
: Chunks
)
79 for (const DocID Doc
: C
.decompress()) {
86 /// If the cursor is at the end of a chunk, place it at the start of the next
88 void normalizeCursor() {
89 // Invariant is already established if examined chunk is not exhausted.
90 if (CurrentID
!= std::end(DecompressedChunk
))
92 // Advance to next chunk if current one is exhausted.
94 if (CurrentChunk
== Chunks
.end()) // Reached the end of PostingList.
96 DecompressedChunk
= CurrentChunk
->decompress();
97 CurrentID
= DecompressedChunk
.begin();
100 /// Advances CurrentChunk to the chunk which might contain ID.
101 void advanceToChunk(DocID ID
) {
102 if ((CurrentChunk
!= Chunks
.end() - 1) &&
103 ((CurrentChunk
+ 1)->Head
<= ID
)) {
105 std::partition_point(CurrentChunk
+ 1, Chunks
.end(),
106 [&](const Chunk
&C
) { return C
.Head
< ID
; });
108 DecompressedChunk
= CurrentChunk
->decompress();
109 CurrentID
= DecompressedChunk
.begin();
114 llvm::ArrayRef
<Chunk
> Chunks
;
115 /// Iterator over chunks.
116 /// If CurrentChunk is valid, then DecompressedChunk is
117 /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
119 decltype(Chunks
)::const_iterator CurrentChunk
;
120 llvm::SmallVector
<DocID
, Chunk::PayloadSize
+ 1> DecompressedChunk
;
121 /// Iterator over DecompressedChunk.
122 decltype(DecompressedChunk
)::iterator CurrentID
;
124 static constexpr size_t ApproxEntriesPerChunk
= 15;
127 static constexpr size_t BitsPerEncodingByte
= 7;
129 /// Writes a variable length DocID into the buffer and updates the buffer size.
130 /// If it doesn't fit, returns false and doesn't write to the buffer.
131 bool encodeVByte(DocID Delta
, llvm::MutableArrayRef
<uint8_t> &Payload
) {
132 assert(Delta
!= 0 && "0 is not a valid PostingList delta.");
133 // Calculate number of bytes Delta encoding would take by examining the
135 unsigned Width
= 1 + llvm::Log2_64(Delta
) / BitsPerEncodingByte
;
136 if (Width
> Payload
.size())
140 uint8_t Encoding
= Delta
& 0x7f;
142 Payload
.front() = Delta
? Encoding
| 0x80 : Encoding
;
143 Payload
= Payload
.drop_front();
144 } while (Delta
!= 0);
148 /// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
149 /// DocIDs. The compression stores deltas (differences) between subsequent
150 /// DocIDs and encodes these deltas utilizing the least possible number of
153 /// Each encoding byte consists of two parts: the first bit (continuation bit)
154 /// indicates whether this is the last byte (0 if this byte is the last) of
155 /// current encoding and seven bytes a piece of DocID (payload). DocID contains
156 /// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit
157 /// payloads and one 4-bit payload), but in practice it is expected that gaps
158 /// (deltas) between subsequent DocIDs are not large enough to require 5 bytes.
159 /// In very dense posting lists (with average gaps less than 128) this
160 /// representation would be 4 times more efficient than raw DocID array.
162 /// PostingList encoding example:
164 /// DocIDs 42 47 7000
166 /// Encoding (raw number) 00000101 10110110 00101110
167 std::vector
<Chunk
> encodeStream(llvm::ArrayRef
<DocID
> Documents
) {
168 assert(!Documents
.empty() && "Can't encode empty sequence.");
169 std::vector
<Chunk
> Result
;
170 Result
.emplace_back();
171 DocID Last
= Result
.back().Head
= Documents
.front();
172 llvm::MutableArrayRef
<uint8_t> RemainingPayload
= Result
.back().Payload
;
173 for (DocID Doc
: Documents
.drop_front()) {
174 if (!encodeVByte(Doc
- Last
, RemainingPayload
)) { // didn't fit, flush chunk
175 Result
.emplace_back();
176 Result
.back().Head
= Doc
;
177 RemainingPayload
= Result
.back().Payload
;
181 return std::vector
<Chunk
>(Result
); // no move, shrink-to-fit
184 /// Reads variable length DocID from the buffer and updates the buffer size. If
185 /// the stream is terminated, return std::nullopt.
186 std::optional
<DocID
> readVByte(llvm::ArrayRef
<uint8_t> &Bytes
) {
187 if (Bytes
.front() == 0 || Bytes
.empty())
190 bool HasNextByte
= true;
191 for (size_t Length
= 0; HasNextByte
&& !Bytes
.empty(); ++Length
) {
192 assert(Length
<= 5 && "Malformed VByte encoding sequence.");
193 // Write meaningful bits to the correct place in the document decoding.
194 Result
|= (Bytes
.front() & 0x7f) << (BitsPerEncodingByte
* Length
);
195 if ((Bytes
.front() & 0x80) == 0)
197 Bytes
= Bytes
.drop_front();
204 llvm::SmallVector
<DocID
, Chunk::PayloadSize
+ 1> Chunk::decompress() const {
205 llvm::SmallVector
<DocID
, Chunk::PayloadSize
+ 1> Result
{Head
};
206 llvm::ArrayRef
<uint8_t> Bytes(Payload
);
208 for (DocID Current
= Head
; !Bytes
.empty(); Current
+= Delta
) {
209 auto MaybeDelta
= readVByte(Bytes
);
213 Result
.push_back(Current
+ Delta
);
215 return llvm::SmallVector
<DocID
, Chunk::PayloadSize
+ 1>{Result
};
218 PostingList::PostingList(llvm::ArrayRef
<DocID
> Documents
)
219 : Chunks(encodeStream(Documents
)) {}
221 std::unique_ptr
<Iterator
> PostingList::iterator(const Token
*Tok
) const {
222 return std::make_unique
<ChunkIterator
>(Tok
, Chunks
);
226 } // namespace clangd