summaryrefslogtreecommitdiffstats
path: root/clang-tools-extra/clangd/index/Serialization.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'clang-tools-extra/clangd/index/Serialization.cpp')
-rw-r--r--clang-tools-extra/clangd/index/Serialization.cpp366
1 files changed, 366 insertions, 0 deletions
diff --git a/clang-tools-extra/clangd/index/Serialization.cpp b/clang-tools-extra/clangd/index/Serialization.cpp
new file mode 100644
index 00000000000..df88142d102
--- /dev/null
+++ b/clang-tools-extra/clangd/index/Serialization.cpp
@@ -0,0 +1,366 @@
+//===-- Serialization.cpp - Binary serialization of index data ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "Serialization.h"
+#include "../RIFF.h"
+#include "llvm/Support/Compression.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+
+using namespace llvm;
+namespace clang {
+namespace clangd {
+namespace {
+Error makeError(const Twine &Msg) {
+ return make_error<StringError>(Msg, inconvertibleErrorCode());
+}
+
+// IO PRIMITIVES
+// We use little-endian 32 bit ints, sometimes with variable-length encoding.
+
+StringRef consume(StringRef &Data, int N) {
+ StringRef Ret = Data.take_front(N);
+ Data = Data.drop_front(N);
+ return Ret;
+}
+
+uint8_t consume8(StringRef &Data) {
+ uint8_t Ret = Data.front();
+ Data = Data.drop_front();
+ return Ret;
+}
+
+uint32_t consume32(StringRef &Data) {
+ auto Ret = support::endian::read32le(Data.bytes_begin());
+ Data = Data.drop_front(4);
+ return Ret;
+}
+
+void write32(uint32_t I, raw_ostream &OS) {
+ char buf[4];
+ support::endian::write32le(buf, I);
+ OS.write(buf, sizeof(buf));
+}
+
+// Variable-length int encoding (varint) uses the bottom 7 bits of each byte
+// to encode the number, and the top bit to indicate whether more bytes follow.
+// e.g. 9a 2f means [0x1a and keep reading, 0x2f and stop].
+// This represents 0x1a | 0x2f<<7 = 6042.
+// A 32-bit integer takes 1-5 bytes to encode; small numbers are more compact.
+void writeVar(uint32_t I, raw_ostream &OS) {
+ constexpr static uint8_t More = 1 << 7;
+ if (LLVM_LIKELY(I < 1 << 7)) {
+ OS.write(I);
+ return;
+ }
+ for (;;) {
+ OS.write(I | More);
+ I >>= 7;
+ if (I < 1 << 7) {
+ OS.write(I);
+ return;
+ }
+ }
+}
+
+uint32_t consumeVar(StringRef &Data) {
+ constexpr static uint8_t More = 1 << 7;
+ uint8_t B = consume8(Data);
+ if (LLVM_LIKELY(!(B & More)))
+ return B;
+ uint32_t Val = B & ~More;
+ for (int Shift = 7; B & More && Shift < 32; Shift += 7) {
+ B = consume8(Data);
+ Val |= (B & ~More) << Shift;
+ }
+ return Val;
+}
+
+// STRING TABLE ENCODING
+// Index data has many string fields, and many strings are identical.
+// We store each string once, and refer to them by index.
+//
+// The string table's format is:
+// - UncompressedSize : uint32
+// - CompressedData : byte[CompressedSize]
+//
+// CompressedData is a zlib-compressed byte[UncompressedSize].
+// It contains a sequence of null-terminated strings, e.g. "foo\0bar\0".
+// These are sorted to improve compression.
+
+// Maps each string to a canonical representation.
+// Strings remain owned externally (e.g. by SymbolSlab).
+class StringTableOut {
+ DenseSet<StringRef> Unique;
+ std::vector<StringRef> Sorted;
+ // Since strings are interned, look up can be by pointer.
+ DenseMap<std::pair<const char *, size_t>, unsigned> Index;
+
+public:
+ // Add a string to the table. Overwrites S if an identical string exists.
+ void intern(StringRef &S) { S = *Unique.insert(S).first; };
+ // Finalize the table and write it to OS. No more strings may be added.
+ void finalize(raw_ostream &OS) {
+ Sorted = {Unique.begin(), Unique.end()};
+ std::sort(Sorted.begin(), Sorted.end());
+ for (unsigned I = 0; I < Sorted.size(); ++I)
+ Index.try_emplace({Sorted[I].data(), Sorted[I].size()}, I);
+
+ std::string RawTable;
+ for (StringRef S : Sorted) {
+ RawTable.append(S);
+ RawTable.push_back(0);
+ }
+ SmallString<1> Compressed;
+ cantFail(zlib::compress(RawTable, Compressed));
+ write32(RawTable.size(), OS);
+ OS << Compressed;
+ }
+ // Get the ID of an string, which must be interned. Table must be finalized.
+ unsigned index(StringRef S) const {
+ assert(!Sorted.empty() && "table not finalized");
+ assert(Index.count({S.data(), S.size()}) && "string not interned");
+ return Index.find({S.data(), S.size()})->second;
+ }
+};
+
+struct StringTableIn {
+ BumpPtrAllocator Arena;
+ std::vector<StringRef> Strings;
+};
+
+Expected<StringTableIn> readStringTable(StringRef Data) {
+ if (Data.size() < 4)
+ return makeError("Bad string table: not enough metadata");
+ size_t UncompressedSize = consume32(Data);
+ SmallString<1> Uncompressed;
+ if (Error E = llvm::zlib::uncompress(Data, Uncompressed, UncompressedSize))
+ return std::move(E);
+
+ StringTableIn Table;
+ StringSaver Saver(Table.Arena);
+ for (StringRef Rest = Uncompressed; !Rest.empty();) {
+ auto Len = Rest.find(0);
+ if (Len == StringRef::npos)
+ return makeError("Bad string table: not null terminated");
+ Table.Strings.push_back(Saver.save(consume(Rest, Len)));
+ Rest = Rest.drop_front();
+ }
+ return Table;
+}
+
+// SYMBOL ENCODING
+// Each field of clangd::Symbol is encoded in turn (see implementation).
+// - StringRef fields encode as varint (index into the string table)
+// - enums encode as the underlying type
+// - most numbers encode as varint
+
+// It's useful to the implementation to assume symbols have a bounded size.
+constexpr size_t SymbolSizeBound = 512;
+// To ensure the bounded size, restrict the number of include headers stored.
+constexpr unsigned MaxIncludes = 50;
+
+void writeSymbol(const Symbol &Sym, const StringTableOut &Strings,
+ raw_ostream &OS) {
+ auto StartOffset = OS.tell();
+ OS << Sym.ID.raw(); // TODO: once we start writing xrefs and posting lists,
+ // symbol IDs should probably be in a string table.
+ OS.write(static_cast<uint8_t>(Sym.SymInfo.Kind));
+ OS.write(static_cast<uint8_t>(Sym.SymInfo.Lang));
+ writeVar(Strings.index(Sym.Name), OS);
+ writeVar(Strings.index(Sym.Scope), OS);
+ for (const auto &Loc : {Sym.Definition, Sym.CanonicalDeclaration}) {
+ writeVar(Strings.index(Loc.FileURI), OS);
+ for (const auto &Endpoint : {Loc.Start, Loc.End}) {
+ writeVar(Endpoint.Line, OS);
+ writeVar(Endpoint.Column, OS);
+ }
+ }
+ writeVar(Sym.References, OS);
+ OS.write(Sym.IsIndexedForCodeCompletion);
+ OS.write(static_cast<uint8_t>(Sym.Origin));
+ writeVar(Strings.index(Sym.Signature), OS);
+ writeVar(Strings.index(Sym.CompletionSnippetSuffix), OS);
+ writeVar(Strings.index(Sym.Documentation), OS);
+ writeVar(Strings.index(Sym.ReturnType), OS);
+
+ auto WriteInclude = [&](const Symbol::IncludeHeaderWithReferences &Include) {
+ writeVar(Strings.index(Include.IncludeHeader), OS);
+ writeVar(Include.References, OS);
+ };
+ // There are almost certainly few includes, so we can just write them.
+ if (LLVM_LIKELY(Sym.IncludeHeaders.size() <= MaxIncludes)) {
+ writeVar(Sym.IncludeHeaders.size(), OS);
+ for (const auto &Include : Sym.IncludeHeaders)
+ WriteInclude(Include);
+ } else {
+ // If there are too many, make sure we truncate the least important.
+ using Pointer = const Symbol::IncludeHeaderWithReferences *;
+ std::vector<Pointer> Pointers;
+ for (const auto &Include : Sym.IncludeHeaders)
+ Pointers.push_back(&Include);
+ std::sort(Pointers.begin(), Pointers.end(), [](Pointer L, Pointer R) {
+ return L->References > R->References;
+ });
+ Pointers.resize(MaxIncludes);
+
+ writeVar(MaxIncludes, OS);
+ for (Pointer P : Pointers)
+ WriteInclude(*P);
+ }
+
+ assert(OS.tell() - StartOffset < SymbolSizeBound && "Symbol length unsafe!");
+ (void)StartOffset; // Unused in NDEBUG;
+}
+
+Expected<Symbol> readSymbol(StringRef &Data, const StringTableIn &Strings) {
+ // Usually we can skip bounds checks because the buffer is huge.
+ // Near the end of the buffer, this would be unsafe. In this rare case, copy
+ // the data into a bigger buffer so we can again skip the checks.
+ if (LLVM_UNLIKELY(Data.size() < SymbolSizeBound)) {
+ std::string Buf(Data);
+ Buf.resize(SymbolSizeBound);
+ StringRef ExtendedData = Buf;
+ auto Ret = readSymbol(ExtendedData, Strings);
+ unsigned BytesRead = Buf.size() - ExtendedData.size();
+ if (BytesRead > Data.size())
+ return makeError("read past end of data");
+ Data = Data.drop_front(BytesRead);
+ return Ret;
+ }
+
+#define READ_STRING(Field) \
+ do { \
+ auto StringIndex = consumeVar(Data); \
+ if (LLVM_UNLIKELY(StringIndex >= Strings.Strings.size())) \
+ return makeError("Bad string index"); \
+ Field = Strings.Strings[StringIndex]; \
+ } while (0)
+
+ Symbol Sym;
+ Sym.ID = SymbolID::fromRaw(consume(Data, 20));
+ Sym.SymInfo.Kind = static_cast<index::SymbolKind>(consume8(Data));
+ Sym.SymInfo.Lang = static_cast<index::SymbolLanguage>(consume8(Data));
+ READ_STRING(Sym.Name);
+ READ_STRING(Sym.Scope);
+ for (SymbolLocation *Loc : {&Sym.Definition, &Sym.CanonicalDeclaration}) {
+ READ_STRING(Loc->FileURI);
+ for (auto &Endpoint : {&Loc->Start, &Loc->End}) {
+ Endpoint->Line = consumeVar(Data);
+ Endpoint->Column = consumeVar(Data);
+ }
+ }
+ Sym.References = consumeVar(Data);
+ Sym.IsIndexedForCodeCompletion = consume8(Data);
+ Sym.Origin = static_cast<SymbolOrigin>(consume8(Data));
+ READ_STRING(Sym.Signature);
+ READ_STRING(Sym.CompletionSnippetSuffix);
+ READ_STRING(Sym.Documentation);
+ READ_STRING(Sym.ReturnType);
+ unsigned IncludeHeaderN = consumeVar(Data);
+ if (IncludeHeaderN > MaxIncludes)
+ return makeError("too many IncludeHeaders");
+ Sym.IncludeHeaders.resize(IncludeHeaderN);
+ for (auto &I : Sym.IncludeHeaders) {
+ READ_STRING(I.IncludeHeader);
+ I.References = consumeVar(Data);
+ }
+
+#undef READ_STRING
+ return Sym;
+}
+
+} // namespace
+
+// FILE ENCODING
+// A file is a RIFF chunk with type 'CdIx'.
+// It contains the sections:
+// - meta: version number
+// - stri: string table
+// - symb: symbols
+
+// The current versioning scheme is simple - non-current versions are rejected.
+// This allows arbitrary format changes, which invalidate stored data.
+// Later we may want to support some backward compatibility.
+constexpr static uint32_t Version = 1;
+
+Expected<IndexFileIn> readIndexFile(StringRef Data) {
+ auto RIFF = riff::readFile(Data);
+ if (!RIFF)
+ return RIFF.takeError();
+ if (RIFF->Type != riff::fourCC("CdIx"))
+ return makeError("wrong RIFF type");
+ StringMap<StringRef> Chunks;
+ for (const auto &Chunk : RIFF->Chunks)
+ Chunks.try_emplace(StringRef(Chunk.ID.data(), Chunk.ID.size()), Chunk.Data);
+
+ for (StringRef RequiredChunk : {"meta", "stri"})
+ if (!Chunks.count(RequiredChunk))
+ return makeError("missing required chunk " + RequiredChunk);
+
+ StringRef Meta = Chunks.lookup("meta");
+ if (Meta.size() < 4 || consume32(Meta) != Version)
+ return makeError("wrong version");
+
+ auto Strings = readStringTable(Chunks.lookup("stri"));
+ if (!Strings)
+ return Strings.takeError();
+
+ IndexFileIn Result;
+ if (Chunks.count("symb")) {
+ StringRef SymbolData = Chunks.lookup("symb");
+ SymbolSlab::Builder Symbols;
+ while (!SymbolData.empty())
+ if (auto Sym = readSymbol(SymbolData, *Strings))
+ Symbols.insert(*Sym);
+ else
+ return Sym.takeError();
+ Result.Symbols = std::move(Symbols).build();
+ }
+ return Result;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const IndexFileOut &Data) {
+ assert(Data.Symbols && "An index file without symbols makes no sense!");
+ riff::File RIFF;
+ RIFF.Type = riff::fourCC("CdIx");
+
+ SmallString<4> Meta;
+ {
+ raw_svector_ostream MetaOS(Meta);
+ write32(Version, MetaOS);
+ }
+ RIFF.Chunks.push_back({riff::fourCC("meta"), Meta});
+
+ StringTableOut Strings;
+ std::vector<Symbol> Symbols;
+ for (const auto &Sym : *Data.Symbols) {
+ Symbols.emplace_back(Sym);
+ visitStrings(Symbols.back(), [&](StringRef &S) { Strings.intern(S); });
+ }
+
+ std::string StringSection;
+ {
+ raw_string_ostream StringOS(StringSection);
+ Strings.finalize(StringOS);
+ }
+ RIFF.Chunks.push_back({riff::fourCC("stri"), StringSection});
+
+ std::string SymbolSection;
+ {
+ raw_string_ostream SymbolOS(SymbolSection);
+ for (const auto &Sym : Symbols)
+ writeSymbol(Sym, Strings, SymbolOS);
+ }
+ RIFF.Chunks.push_back({riff::fourCC("symb"), SymbolSection});
+
+ return OS << RIFF;
+}
+
+} // namespace clangd
+} // namespace clang
OpenPOWER on IntegriCloud