1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
|
//===--- DexIndex.cpp - Dex Symbol Index Implementation ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "DexIndex.h"
#include "../../FuzzyMatch.h"
#include "../../Logger.h"
#include <algorithm>
#include <queue>
namespace clang {
namespace clangd {
namespace dex {
namespace {
// Returns the tokens which are given symbol's characteristics. Currently, the
// generated tokens only contain fuzzy matching trigrams and symbol's scope,
// but in the future this will also return path proximity tokens and other
// types of tokens such as symbol type (if applicable).
// Returns the tokens which are given symbols's characteristics. For example,
// trigrams and scopes.
// FIXME(kbobyrev): Support more token types:
// * Path proximity
// * Types
std::vector<Token> generateSearchTokens(const Symbol &Sym) {
std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
Result.push_back(Token(Token::Kind::Scope, Sym.Scope));
return Result;
}
} // namespace
void DexIndex::build(std::shared_ptr<std::vector<const Symbol *>> Syms) {
llvm::DenseMap<SymbolID, const Symbol *> TempLookupTable;
llvm::DenseMap<const Symbol *, float> TempSymbolQuality;
for (const Symbol *Sym : *Syms) {
TempLookupTable[Sym->ID] = Sym;
TempSymbolQuality[Sym] = quality(*Sym);
}
// Symbols are sorted by symbol qualities so that items in the posting lists
// are stored in the descending order of symbol quality.
std::sort(begin(*Syms), end(*Syms),
[&](const Symbol *LHS, const Symbol *RHS) {
return TempSymbolQuality[LHS] > TempSymbolQuality[RHS];
});
llvm::DenseMap<Token, PostingList> TempInvertedIndex;
// Populate TempInvertedIndex with posting lists for index symbols.
for (DocID SymbolRank = 0; SymbolRank < Syms->size(); ++SymbolRank) {
const auto *Sym = (*Syms)[SymbolRank];
for (const auto &Token : generateSearchTokens(*Sym))
TempInvertedIndex[Token].push_back(SymbolRank);
}
{
std::lock_guard<std::mutex> Lock(Mutex);
// Replace outdated index with the new one.
LookupTable = std::move(TempLookupTable);
Symbols = std::move(Syms);
InvertedIndex = std::move(TempInvertedIndex);
SymbolQuality = std::move(TempSymbolQuality);
}
vlog("Built DexIndex with estimated memory usage {0} bytes.",
estimateMemoryUsage());
}
std::unique_ptr<SymbolIndex> DexIndex::build(SymbolSlab Slab) {
auto Idx = llvm::make_unique<DexIndex>();
Idx->build(getSymbolsFromSlab(std::move(Slab)));
return std::move(Idx);
}
/// Constructs iterators over tokens extracted from the query and exhausts it
/// while applying Callback to each symbol in the order of decreasing quality
/// of the matched symbols.
bool DexIndex::fuzzyFind(
const FuzzyFindRequest &Req,
llvm::function_ref<void(const Symbol &)> Callback) const {
assert(!StringRef(Req.Query).contains("::") &&
"There must be no :: in query.");
FuzzyMatcher Filter(Req.Query);
bool More = false;
std::vector<std::unique_ptr<Iterator>> TopLevelChildren;
const auto TrigramTokens = generateIdentifierTrigrams(Req.Query);
{
std::lock_guard<std::mutex> Lock(Mutex);
// Generate query trigrams and construct AND iterator over all query
// trigrams.
std::vector<std::unique_ptr<Iterator>> TrigramIterators;
for (const auto &Trigram : TrigramTokens) {
const auto It = InvertedIndex.find(Trigram);
if (It != InvertedIndex.end())
TrigramIterators.push_back(create(It->second));
}
if (!TrigramIterators.empty())
TopLevelChildren.push_back(createAnd(move(TrigramIterators)));
// Generate scope tokens for search query.
std::vector<std::unique_ptr<Iterator>> ScopeIterators;
for (const auto &Scope : Req.Scopes) {
const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope));
if (It != InvertedIndex.end())
ScopeIterators.push_back(create(It->second));
}
// Add OR iterator for scopes if there are any Scope Iterators.
if (!ScopeIterators.empty())
TopLevelChildren.push_back(createOr(move(ScopeIterators)));
// Use TRUE iterator if both trigrams and scopes from the query are not
// present in the symbol index.
auto QueryIterator = TopLevelChildren.empty()
? createTrue(Symbols->size())
: createAnd(move(TopLevelChildren));
// Retrieve more items than it was requested: some of the items with high
// final score might not be retrieved otherwise.
// FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as
// using 100x of the requested number might not be good in practice, e.g.
// when the requested number of items is small.
const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount;
auto Root = createLimit(move(QueryIterator), ItemsToRetrieve);
// FIXME(kbobyrev): Add boosting to the query and utilize retrieved
// boosting scores.
std::vector<std::pair<DocID, float>> SymbolDocIDs = consume(*Root);
// Retrieve top Req.MaxCandidateCount items.
std::priority_queue<std::pair<float, const Symbol *>> Top;
for (const auto &P : SymbolDocIDs) {
const DocID SymbolDocID = P.first;
const auto *Sym = (*Symbols)[SymbolDocID];
const llvm::Optional<float> Score = Filter.match(Sym->Name);
if (!Score)
continue;
// Multiply score by a negative factor so that Top stores items with the
// highest actual score.
Top.emplace(-(*Score) * SymbolQuality.find(Sym)->second, Sym);
if (Top.size() > Req.MaxCandidateCount) {
More = true;
Top.pop();
}
}
// Apply callback to the top Req.MaxCandidateCount items.
for (; !Top.empty(); Top.pop())
Callback(*Top.top().second);
}
return More;
}
void DexIndex::lookup(const LookupRequest &Req,
llvm::function_ref<void(const Symbol &)> Callback) const {
std::lock_guard<std::mutex> Lock(Mutex);
for (const auto &ID : Req.IDs) {
auto I = LookupTable.find(ID);
if (I != LookupTable.end())
Callback(*I->second);
}
}
void DexIndex::findOccurrences(
const OccurrencesRequest &Req,
llvm::function_ref<void(const SymbolOccurrence &)> Callback) const {
log("findOccurrences is not implemented.");
}
size_t DexIndex::estimateMemoryUsage() const {
std::lock_guard<std::mutex> Lock(Mutex);
size_t Bytes =
LookupTable.size() * sizeof(std::pair<SymbolID, const Symbol *>);
Bytes += SymbolQuality.size() * sizeof(std::pair<const Symbol *, float>);
Bytes += InvertedIndex.size() * sizeof(Token);
for (const auto &P : InvertedIndex) {
Bytes += P.second.size() * sizeof(DocID);
}
return Bytes;
}
} // namespace dex
} // namespace clangd
} // namespace clang
|