summaryrefslogtreecommitdiffstats
path: root/clang-tools-extra/clangd/index
diff options
context:
space:
mode:
Diffstat (limited to 'clang-tools-extra/clangd/index')
-rw-r--r--clang-tools-extra/clangd/index/dex/Dex.cpp4
-rw-r--r--clang-tools-extra/clangd/index/dex/Trigram.cpp98
-rw-r--r--clang-tools-extra/clangd/index/dex/Trigram.h29
3 files changed, 43 insertions, 88 deletions
diff --git a/clang-tools-extra/clangd/index/dex/Dex.cpp b/clang-tools-extra/clangd/index/dex/Dex.cpp
index 08f168dddef..59cd8fc4304 100644
--- a/clang-tools-extra/clangd/index/dex/Dex.cpp
+++ b/clang-tools-extra/clangd/index/dex/Dex.cpp
@@ -156,7 +156,9 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
"There must be no :: in query.");
trace::Span Tracer("Dex fuzzyFind");
FuzzyMatcher Filter(Req.Query);
- bool More = false;
+ // For short queries we use specialized trigrams that don't yield all results.
+ // Prevent clients from postfiltering them for longer queries.
+ bool More = !Req.Query.empty() && Req.Query.size() < 3;
std::vector<std::unique_ptr<Iterator>> TopLevelChildren;
const auto TrigramTokens = generateQueryTrigrams(Req.Query);
diff --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp
index a646a38de9d..eb5473ddd53 100644
--- a/clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -23,11 +23,6 @@ namespace clang {
namespace clangd {
namespace dex {
-/// This is used to mark unigrams and bigrams and distinct them from complete
-/// trigrams. Since '$' is not present in valid identifier names, it is safe to
-/// use it as the special symbol.
-static const char END_MARKER = '$';
-
std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
// Apply fuzzy matching text segmentation.
std::vector<CharRole> Roles(Identifier.size());
@@ -47,40 +42,22 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
// not available then 0 is stored.
std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
- // Store two first HEAD characters in the identifier (if present).
- std::deque<char> TwoHeads;
for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
Next[I] = {{NextTail, NextHead, NextNextHead}};
NextTail = Roles[I] == Tail ? I : 0;
if (Roles[I] == Head) {
NextNextHead = NextHead;
NextHead = I;
- TwoHeads.push_front(LowercaseIdentifier[I]);
- if (TwoHeads.size() > 2)
- TwoHeads.pop_back();
}
}
DenseSet<Token> UniqueTrigrams;
- auto add = [&](std::string Chars) {
+ auto Add = [&](std::string Chars) {
UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
};
- if (TwoHeads.size() == 2)
- add({{TwoHeads.front(), TwoHeads.back(), END_MARKER}});
-
- if (!LowercaseIdentifier.empty())
- add({{LowercaseIdentifier.front(), END_MARKER, END_MARKER}});
-
- if (LowercaseIdentifier.size() >= 2)
- add({{LowercaseIdentifier[0], LowercaseIdentifier[1], END_MARKER}});
-
- if (LowercaseIdentifier.size() >= 3)
- add({{LowercaseIdentifier[0], LowercaseIdentifier[1],
- LowercaseIdentifier[2]}});
-
- // Iterate through valid seqneces of three characters Fuzzy Matcher can
+ // Iterate through valid sequneces of three characters Fuzzy Matcher can
// process.
for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
// Skip delimiters.
@@ -92,66 +69,47 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
for (const unsigned K : Next[J]) {
if (K == 0)
continue;
- add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
+ Add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
LowercaseIdentifier[K]}});
}
}
}
+ // Emit short-query trigrams: FooBar -> f, fo, fb.
+ if (!LowercaseIdentifier.empty())
+ Add({LowercaseIdentifier[0]});
+ if (LowercaseIdentifier.size() >= 2)
+ Add({LowercaseIdentifier[0], LowercaseIdentifier[1]});
+ for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
+ if (Roles[I] == Head) {
+ Add({LowercaseIdentifier[0], LowercaseIdentifier[I]});
+ break;
+ }
- std::vector<Token> Result;
- for (const auto &Trigram : UniqueTrigrams)
- Result.push_back(Trigram);
-
- return Result;
+ return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
}
std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
+ std::string LowercaseQuery = Query.lower();
+ if (Query.size() < 3) // short-query trigrams only
+ return {Token(Token::Kind::Trigram, LowercaseQuery)};
+
// Apply fuzzy matching text segmentation.
std::vector<CharRole> Roles(Query.size());
calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
- // Additional pass is necessary to count valid identifier characters.
- // Depending on that, this function might return incomplete trigram.
- unsigned ValidSymbolsCount = 0;
- for (const auto Role : Roles)
- if (Role == Head || Role == Tail)
- ++ValidSymbolsCount;
-
- std::string LowercaseQuery = Query.lower();
-
DenseSet<Token> UniqueTrigrams;
-
- // If the number of symbols which can form fuzzy matching trigram is not
- // sufficient, generate a single incomplete trigram for query.
- if (ValidSymbolsCount < 3) {
- std::string Chars =
- LowercaseQuery.substr(0, std::min<size_t>(3UL, Query.size()));
- Chars.append(3 - Chars.size(), END_MARKER);
- UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
- } else {
- std::deque<char> Chars;
- for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
- // If current symbol is delimiter, just skip it.
- if (Roles[I] != Head && Roles[I] != Tail)
- continue;
-
- Chars.push_back(LowercaseQuery[I]);
-
- if (Chars.size() > 3)
- Chars.pop_front();
-
- if (Chars.size() == 3) {
- UniqueTrigrams.insert(
- Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))));
- }
- }
+ std::string Chars;
+ for (unsigned I = 0; I < Query.size(); ++I) {
+ if (Roles[I] != Head && Roles[I] != Tail)
+ continue; // Skip delimiters.
+ Chars.push_back(LowercaseQuery[I]);
+ if (Chars.size() > 3)
+ Chars.erase(Chars.begin());
+ if (Chars.size() == 3)
+ UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
}
- std::vector<Token> Result;
- for (const auto &Trigram : UniqueTrigrams)
- Result.push_back(Trigram);
-
- return Result;
+ return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
}
} // namespace dex
diff --git a/clang-tools-extra/clangd/index/dex/Trigram.h b/clang-tools-extra/clangd/index/dex/Trigram.h
index a01c1063dc1..d0fb474ffd7 100644
--- a/clang-tools-extra/clangd/index/dex/Trigram.h
+++ b/clang-tools-extra/clangd/index/dex/Trigram.h
@@ -33,26 +33,21 @@ namespace clangd {
namespace dex {
/// Returns list of unique fuzzy-search trigrams from unqualified symbol.
+/// The trigrams give the 3-character query substrings this symbol can match.
///
-/// First, given Identifier (unqualified symbol name) is segmented using
-/// FuzzyMatch API and lowercased. After segmentation, the following technique
-/// is applied for generating trigrams: for each letter or digit in the input
-/// string the algorithms looks for the possible next and skip-1-next characters
-/// which can be jumped to during fuzzy matching. Each combination of such three
-/// characters is inserted into the result.
-///
+/// The symbol's name is broken into segments, e.g. "FooBar" has two segments.
/// Trigrams can start at any character in the input. Then we can choose to move
-/// to the next character, move to the start of the next segment, or skip over a
-/// segment.
+/// to the next character, move to the start of the next segment, or stop.
///
-/// This also generates incomplete trigrams for short query scenarios:
-/// * Empty trigram: "$$$".
-/// * Unigram: the first character of the identifier.
-/// * Bigrams: a 2-char prefix of the identifier and a bigram of the first two
-/// HEAD characters (if they exist).
-//
-/// Note: the returned list of trigrams does not have duplicates, if any trigram
-/// belongs to more than one class it is only inserted once.
+/// Short trigrams (length 1-2) are used for short queries. These are:
+/// - prefixes of the identifier, of length 1 and 2
+/// - the first character + next head character
+///
+/// For "FooBar" we get the following trigrams:
+/// {f, fo, fb, foo, fob, fba, oob, oba, bar}.
+///
+/// Trigrams are lowercase, as trigram matching is case-insensitive.
+/// Trigrams in the returned list are deduplicated.
std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
/// Returns list of unique fuzzy-search trigrams given a query.
OpenPOWER on IntegriCloud