clang-tools-extra/clangd/index/dex/PostingList.cpp

   1 //===--- PostingList.cpp - Symbol identifiers storage interface -----------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "PostingList.h"
  10 #include "index/dex/Iterator.h"
  11 #include "index/dex/Token.h"
  12 #include "llvm/Support/MathExtras.h"
  13 #include <optional>
  14
  15 namespace clang {
  16 namespace clangd {
  17 namespace dex {
  18 namespace {
  19
  20 /// Implements iterator of PostingList chunks. This requires iterating over two
  21 /// levels: the first level iterator iterates over the chunks and decompresses
  22 /// them on-the-fly when the contents of chunk are to be seen.
  23 class ChunkIterator : public Iterator {
  24 public:
  25   explicit ChunkIterator(const Token *Tok, llvm::ArrayRef<Chunk> Chunks)
  26       : Tok(Tok), Chunks(Chunks), CurrentChunk(Chunks.begin()) {
  27     if (!Chunks.empty()) {
  28       DecompressedChunk = CurrentChunk->decompress();
  29       CurrentID = DecompressedChunk.begin();
  30     }
  31   }
  32
  33   bool reachedEnd() const override { return CurrentChunk == Chunks.end(); }
  34
  35   /// Advances cursor to the next item.
  36   void advance() override {
  37     assert(!reachedEnd() &&
  38            "Posting List iterator can't advance() at the end.");
  39     ++CurrentID;
  40     normalizeCursor();
  41   }
  42
  43   /// Applies binary search to advance cursor to the next item with DocID
  44   /// equal or higher than the given one.
  45   void advanceTo(DocID ID) override {
  46     assert(!reachedEnd() &&
  47            "Posting List iterator can't advance() at the end.");
  48     if (ID <= peek())
  49       return;
  50     advanceToChunk(ID);
  51     // Try to find ID within current chunk.
  52     CurrentID = std::partition_point(CurrentID, DecompressedChunk.end(),
  53                                      [&](const DocID D) { return D < ID; });
  54     normalizeCursor();
  55   }
  56
  57   DocID peek() const override {
  58     assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
  59     return *CurrentID;
  60   }
  61
  62   float consume() override {
  63     assert(!reachedEnd() &&
  64            "Posting List iterator can't consume() at the end.");
  65     return 1;
  66   }
  67
  68   size_t estimateSize() const override {
  69     return Chunks.size() * ApproxEntriesPerChunk;
  70   }
  71
  72 private:
  73   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
  74     if (Tok != nullptr)
  75       return OS << *Tok;
  76     OS << '[';
  77     const char *Sep = "";
  78     for (const Chunk &C : Chunks)
  79       for (const DocID Doc : C.decompress()) {
  80         OS << Sep << Doc;
  81         Sep = " ";
  82       }
  83     return OS << ']';
  84   }
  85
  86   /// If the cursor is at the end of a chunk, place it at the start of the next
  87   /// chunk.
  88   void normalizeCursor() {
  89     // Invariant is already established if examined chunk is not exhausted.
  90     if (CurrentID != std::end(DecompressedChunk))
  91       return;
  92     // Advance to next chunk if current one is exhausted.
  93     ++CurrentChunk;
  94     if (CurrentChunk == Chunks.end()) // Reached the end of PostingList.
  95       return;
  96     DecompressedChunk = CurrentChunk->decompress();
  97     CurrentID = DecompressedChunk.begin();
  98   }
  99
 100   /// Advances CurrentChunk to the chunk which might contain ID.
 101   void advanceToChunk(DocID ID) {
 102     if ((CurrentChunk != Chunks.end() - 1) &&
 103         ((CurrentChunk + 1)->Head <= ID)) {
 104       CurrentChunk =
 105           std::partition_point(CurrentChunk + 1, Chunks.end(),
 106                                [&](const Chunk &C) { return C.Head < ID; });
 107       --CurrentChunk;
 108       DecompressedChunk = CurrentChunk->decompress();
 109       CurrentID = DecompressedChunk.begin();
 110     }
 111   }
 112
 113   const Token *Tok;
 114   llvm::ArrayRef<Chunk> Chunks;
 115   /// Iterator over chunks.
 116   /// If CurrentChunk is valid, then DecompressedChunk is
 117   /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
 118   /// into it.
 119   decltype(Chunks)::const_iterator CurrentChunk;
 120   llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
 121   /// Iterator over DecompressedChunk.
 122   decltype(DecompressedChunk)::iterator CurrentID;
 123
 124   static constexpr size_t ApproxEntriesPerChunk = 15;
 125 };
 126
 127 static constexpr size_t BitsPerEncodingByte = 7;
 128
 129 /// Writes a variable length DocID into the buffer and updates the buffer size.
 130 /// If it doesn't fit, returns false and doesn't write to the buffer.
 131 bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) {
 132   assert(Delta != 0 && "0 is not a valid PostingList delta.");
 133   // Calculate number of bytes Delta encoding would take by examining the
 134   // meaningful bits.
 135   unsigned Width = 1 + llvm::Log2_64(Delta) / BitsPerEncodingByte;
 136   if (Width > Payload.size())
 137     return false;
 138
 139   do {
 140     uint8_t Encoding = Delta & 0x7f;
 141     Delta >>= 7;
 142     Payload.front() = Delta ? Encoding | 0x80 : Encoding;
 143     Payload = Payload.drop_front();
 144   } while (Delta != 0);
 145   return true;
 146 }
 147
 148 /// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
 149 /// DocIDs. The compression stores deltas (differences) between subsequent
 150 /// DocIDs and encodes these deltas utilizing the least possible number of
 151 /// bytes.
 152 ///
 153 /// Each encoding byte consists of two parts: the first bit (continuation bit)
 154 /// indicates whether this is the last byte (0 if this byte is the last) of
 155 /// current encoding and seven bytes a piece of DocID (payload). DocID contains
 156 /// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit
 157 /// payloads and one 4-bit payload), but in practice it is expected that gaps
 158 /// (deltas) between subsequent DocIDs are not large enough to require 5 bytes.
 159 /// In very dense posting lists (with average gaps less than 128) this
 160 /// representation would be 4 times more efficient than raw DocID array.
 161 ///
 162 /// PostingList encoding example:
 163 ///
 164 /// DocIDs    42            47        7000
 165 /// gaps                    5         6958
 166 /// Encoding  (raw number)  00000101  10110110 00101110
 167 std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
 168   assert(!Documents.empty() && "Can't encode empty sequence.");
 169   std::vector<Chunk> Result;
 170   Result.emplace_back();
 171   DocID Last = Result.back().Head = Documents.front();
 172   llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
 173   for (DocID Doc : Documents.drop_front()) {
 174     if (!encodeVByte(Doc - Last, RemainingPayload)) { // didn't fit, flush chunk
 175       Result.emplace_back();
 176       Result.back().Head = Doc;
 177       RemainingPayload = Result.back().Payload;
 178     }
 179     Last = Doc;
 180   }
 181   return std::vector<Chunk>(Result); // no move, shrink-to-fit
 182 }
 183
 184 /// Reads variable length DocID from the buffer and updates the buffer size. If
 185 /// the stream is terminated, return std::nullopt.
 186 std::optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
 187   if (Bytes.front() == 0 || Bytes.empty())
 188     return std::nullopt;
 189   DocID Result = 0;
 190   bool HasNextByte = true;
 191   for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) {
 192     assert(Length <= 5 && "Malformed VByte encoding sequence.");
 193     // Write meaningful bits to the correct place in the document decoding.
 194     Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte * Length);
 195     if ((Bytes.front() & 0x80) == 0)
 196       HasNextByte = false;
 197     Bytes = Bytes.drop_front();
 198   }
 199   return Result;
 200 }
 201
 202 } // namespace
 203
 204 llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const {
 205   llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{Head};
 206   llvm::ArrayRef<uint8_t> Bytes(Payload);
 207   DocID Delta;
 208   for (DocID Current = Head; !Bytes.empty(); Current += Delta) {
 209     auto MaybeDelta = readVByte(Bytes);
 210     if (!MaybeDelta)
 211       break;
 212     Delta = *MaybeDelta;
 213     Result.push_back(Current + Delta);
 214   }
 215   return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result};
 216 }
 217
 218 PostingList::PostingList(llvm::ArrayRef<DocID> Documents)
 219     : Chunks(encodeStream(Documents)) {}
 220
 221 std::unique_ptr<Iterator> PostingList::iterator(const Token *Tok) const {
 222   return std::make_unique<ChunkIterator>(Tok, Chunks);
 223 }
 224
 225 } // namespace dex
 226 } // namespace clangd
 227 } // namespace clang