//===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// /// /// \file /// This defines posting list interface: a storage for identifiers of symbols /// which can be characterized by a specific feature (such as fuzzy-find /// trigram, scope, type or any other Search Token). Posting lists can be /// traversed in order using an iterator and are values for inverted index, /// which maps search tokens to corresponding posting lists. /// /// In order to decrease size of Index in-memory representation, Variable Byte /// Encoding (VByte) is used for PostingLists compression. An overview of VByte /// algorithm can be found in "Introduction to Information Retrieval" book: /// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html /// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H #include "Iterator.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include #include namespace clang { namespace clangd { namespace dex { struct Token; /// NOTE: This is an implementation detail. /// /// Chunk is a fixed-width piece of PostingList which contains the first DocID /// in uncompressed format (Head) and delta-encoded Payload. It can be /// decompressed upon request. struct Chunk { /// Keep sizeof(Chunk) == 32. static constexpr size_t PayloadSize = 32 - sizeof(DocID); llvm::SmallVector decompress() const; /// The first element of decompressed Chunk. DocID Head; /// VByte-encoded deltas. std::array Payload; }; static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory."); /// PostingList is the storage of DocIDs which can be inserted to the Query /// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs /// are stored in underlying chunks. Compression saves memory at a small cost /// in access time, which is still fast enough in practice. class PostingList { public: explicit PostingList(llvm::ArrayRef Documents); /// Constructs DocumentIterator over given posting list. DocumentIterator will /// go through the chunks and decompress them on-the-fly when necessary. /// If given, Tok is only used for the string representation. std::unique_ptr iterator(const Token *Tok = nullptr) const; /// Returns in-memory size of external storage. size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); } private: const std::vector Chunks; }; } // namespace dex } // namespace clangd } // namespace clang #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H