//===--- PostingList.cpp - Symbol identifiers storage interface -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PostingList.h"
#include "Iterator.h"
#include "Token.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MathExtras.h"

namespace clang {
namespace clangd {
namespace dex {
namespace {

/// Implements an iterator over PostingList chunks. Iteration happens on two
/// levels: the top level walks over the chunks, and each chunk is decompressed
/// on the fly when its contents need to be examined.
class ChunkIterator : public Iterator {
public:
  explicit ChunkIterator(const Token *Tok, llvm::ArrayRef<Chunk> Chunks)
      : Tok(Tok), Chunks(Chunks), CurrentChunk(Chunks.begin()) {
    if (!Chunks.empty()) {
      DecompressedChunk = CurrentChunk->decompress();
      CurrentID = DecompressedChunk.begin();
    }
  }

  bool reachedEnd() const override { return CurrentChunk == Chunks.end(); }

  /// Advances cursor to the next item.
  void advance() override {
    assert(!reachedEnd() &&
           "Posting List iterator can't advance() at the end.");
    ++CurrentID;
    normalizeCursor();
  }

  /// Applies binary search to advance cursor to the next item with DocID
  /// equal or higher than the given one.
  void advanceTo(DocID ID) override {
    assert(!reachedEnd() &&
           "Posting List iterator can't advance() at the end.");
    if (ID <= peek())
      return;
    advanceToChunk(ID);
    // Try to find ID within current chunk.
    CurrentID = std::partition_point(CurrentID, DecompressedChunk.end(),
                                     [&](const DocID D) { return D < ID; });
    normalizeCursor();
  }

  DocID peek() const override {
    assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
    return *CurrentID;
  }

  float consume() override {
    assert(!reachedEnd() &&
           "Posting List iterator can't consume() at the end.");
    return 1;
  }

  size_t estimateSize() const override {
    return Chunks.size() * ApproxEntriesPerChunk;
  }

private:
  llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
    if (Tok != nullptr)
      return OS << *Tok;
    OS << '[';
    const char *Sep = "";
    for (const Chunk &C : Chunks)
      for (const DocID Doc : C.decompress()) {
        OS << Sep << Doc;
        Sep = " ";
      }
    return OS << ']';
  }

  /// If the cursor is at the end of a chunk, place it at the start of the next
  /// chunk.
  void normalizeCursor() {
    // Invariant is already established if examined chunk is not exhausted.
    if (CurrentID != std::end(DecompressedChunk))
      return;
    // Advance to next chunk if current one is exhausted.
    ++CurrentChunk;
    if (CurrentChunk == Chunks.end()) // Reached the end of PostingList.
      return;
    DecompressedChunk = CurrentChunk->decompress();
    CurrentID = DecompressedChunk.begin();
  }

  /// Advances CurrentChunk to the chunk which might contain ID.
  void advanceToChunk(DocID ID) {
    if ((CurrentChunk != Chunks.end() - 1) &&
        ((CurrentChunk + 1)->Head <= ID)) {
      CurrentChunk =
          std::partition_point(CurrentChunk + 1, Chunks.end(),
                               [&](const Chunk &C) { return C.Head <= ID; });
      --CurrentChunk;
      DecompressedChunk = CurrentChunk->decompress();
      CurrentID = DecompressedChunk.begin();
    }
  }
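
  // For illustration (hypothetical data): suppose the chunk heads are
  // {4, 10, 50}, CurrentChunk points at the chunk with Head == 4, and
  // advanceTo(12) is called. The partition_point above stops at the chunk
  // with Head == 50 (the first head greater than 12), and the decrement
  // selects the chunk with Head == 10, the last chunk whose Head <= 12 and
  // therefore the only chunk that could contain DocID 12.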

  const Token *Tok;
  llvm::ArrayRef<Chunk> Chunks;
  /// Iterator over chunks.
  /// If CurrentChunk is valid, then DecompressedChunk is
  /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
  /// into it.
  decltype(Chunks)::const_iterator CurrentChunk;
  llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
  /// Iterator over DecompressedChunk.
  decltype(DecompressedChunk)::iterator CurrentID;

  static constexpr size_t ApproxEntriesPerChunk = 15;
};

static constexpr size_t BitsPerEncodingByte = 7;

/// Writes a variable length DocID into the buffer and updates the buffer size.
/// If it doesn't fit, returns false and doesn't write to the buffer.
bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) {
  assert(Delta != 0 && "0 is not a valid PostingList delta.");
  // Calculate number of bytes Delta encoding would take by examining the
  // meaningful bits.
  unsigned Width = 1 + llvm::findLastSet(Delta) / BitsPerEncodingByte;
  if (Width > Payload.size())
    return false;

  do {
    uint8_t Encoding = Delta & 0x7f;
    Delta >>= 7;
    Payload.front() = Delta ? Encoding | 0x80 : Encoding;
    Payload = Payload.drop_front();
  } while (Delta != 0);
  return true;
}
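
// Worked example (hypothetical values, for illustration): encoding
// Delta = 6953 with the loop above. First iteration: 6953 & 0x7f == 41
// (0b0101001); the remaining value 6953 >> 7 == 54 is nonzero, so the byte
// 0b10101001 is written with its continuation bit set. Second iteration:
// 54 & 0x7f == 54 (0b0110110) and nothing remains, so the final byte is
// 0b00110110. The payload therefore stores the least significant 7-bit group
// first: 54 * 128 + 41 == 6953.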

/// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
/// DocIDs. The compression stores deltas (differences) between subsequent
/// DocIDs and encodes these deltas utilizing the least possible number of
/// bytes.
///
/// Each encoding byte consists of two parts: the first bit (continuation bit)
/// indicates whether this is the last byte of the current encoding (0 if this
/// byte is the last), and the remaining seven bits carry a piece of the DocID
/// (payload). A DocID contains 32 bits, so it takes up to 5 bytes to encode
/// (4 full 7-bit payloads and one 4-bit payload), but in practice the gaps
/// (deltas) between subsequent DocIDs are expected to be small enough not to
/// need 5 bytes. In very dense posting lists (average gaps below 128, so that
/// each delta fits into a single byte) this representation is roughly 4 times
/// smaller than a raw DocID array.
///
/// PostingList encoding example:
///
/// DocIDs    42            47        7000
/// gaps                    5         6953
/// Encoding  (raw number)  00000101  10101001 00110110
std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
  assert(!Documents.empty() && "Can't encode empty sequence.");
  std::vector<Chunk> Result;
  Result.emplace_back();
  DocID Last = Result.back().Head = Documents.front();
  llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
  for (DocID Doc : Documents.drop_front()) {
    if (!encodeVByte(Doc - Last, RemainingPayload)) { // didn't fit, flush chunk
      Result.emplace_back();
      Result.back().Head = Doc;
      RemainingPayload = Result.back().Payload;
    }
    Last = Doc;
  }
  return std::vector<Chunk>(Result); // no move, shrink-to-fit
}
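
// For illustration: the sequence {42, 47, 7000} from the comment above fits
// into a single chunk with Head = 42 and payload bytes
// {0b00000101, 0b10101001, 0b00110110} (the deltas 5 and 6953). The unused
// payload bytes are expected to stay zero, which readVByte below treats as
// the end of the encoded stream.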

/// Reads variable length DocID from the buffer and updates the buffer size. If
/// the stream is terminated, return None.
llvm::Optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
  if (Bytes.empty() || Bytes.front() == 0)
    return llvm::None;
  DocID Result = 0;
  bool HasNextByte = true;
  for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) {
    assert(Length <= 5 && "Malformed VByte encoding sequence.");
    // Write meaningful bits to the correct place in the document decoding.
    Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte * Length);
    if ((Bytes.front() & 0x80) == 0)
      HasNextByte = false;
    Bytes = Bytes.drop_front();
  }
  return Result;
}
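
// For illustration, decoding the two bytes from the example above: 0b10101001
// contributes payload 41 at shift 0 and has its continuation bit set, so
// reading continues; 0b00110110 contributes payload 54 at shift 7 and
// terminates the value, giving 41 | (54 << 7) == 6953.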

} // namespace

llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const {
  llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{Head};
  llvm::ArrayRef<uint8_t> Bytes(Payload);
  DocID Delta;
  for (DocID Current = Head; !Bytes.empty(); Current += Delta) {
    auto MaybeDelta = readVByte(Bytes);
    if (!MaybeDelta)
      break;
    Delta = *MaybeDelta;
    Result.push_back(Current + Delta);
  }
  return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result};
}
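
// For illustration: a chunk with Head = 42 and payload bytes
// {0b00000101, 0b10101001, 0b00110110, 0, ...} decompresses back to
// {42, 47, 7000}; the first zero byte makes readVByte return llvm::None,
// which terminates the loop above.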

PostingList::PostingList(llvm::ArrayRef<DocID> Documents)
    : Chunks(encodeStream(Documents)) {}

std::unique_ptr<Iterator> PostingList::iterator(const Token *Tok) const {
  return llvm::make_unique<ChunkIterator>(Tok, Chunks);
}
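
// Illustrative usage sketch (hypothetical caller code; assumes only the
// interface visible in this file, with the token argument used solely for
// dump() output):
//
//   PostingList List({42, 47, 7000});
//   for (auto It = List.iterator(/*Tok=*/nullptr); !It->reachedEnd();
//        It->advance())
//     llvm::outs() << It->peek() << ' '; // prints: 42 47 7000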

} // namespace dex
} // namespace clangd
} // namespace clang