diff options
Diffstat (limited to 'clang-tools-extra/clangd/index')
-rw-r--r-- | clang-tools-extra/clangd/index/dex/Dex.cpp | 4 | ||||
-rw-r--r-- | clang-tools-extra/clangd/index/dex/Trigram.cpp | 98 | ||||
-rw-r--r-- | clang-tools-extra/clangd/index/dex/Trigram.h | 29 |
3 files changed, 43 insertions, 88 deletions
diff --git a/clang-tools-extra/clangd/index/dex/Dex.cpp b/clang-tools-extra/clangd/index/dex/Dex.cpp index 08f168dddef..59cd8fc4304 100644 --- a/clang-tools-extra/clangd/index/dex/Dex.cpp +++ b/clang-tools-extra/clangd/index/dex/Dex.cpp @@ -156,7 +156,9 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req, "There must be no :: in query."); trace::Span Tracer("Dex fuzzyFind"); FuzzyMatcher Filter(Req.Query); - bool More = false; + // For short queries we use specialized trigrams that don't yield all results. + // Prevent clients from postfiltering them for longer queries. + bool More = !Req.Query.empty() && Req.Query.size() < 3; std::vector<std::unique_ptr<Iterator>> TopLevelChildren; const auto TrigramTokens = generateQueryTrigrams(Req.Query); diff --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp index a646a38de9d..eb5473ddd53 100644 --- a/clang-tools-extra/clangd/index/dex/Trigram.cpp +++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -23,11 +23,6 @@ namespace clang { namespace clangd { namespace dex { -/// This is used to mark unigrams and bigrams and distinct them from complete -/// trigrams. Since '$' is not present in valid identifier names, it is safe to -/// use it as the special symbol. -static const char END_MARKER = '$'; - std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) { // Apply fuzzy matching text segmentation. std::vector<CharRole> Roles(Identifier.size()); @@ -47,40 +42,22 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) { // not available then 0 is stored. std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size()); unsigned NextTail = 0, NextHead = 0, NextNextHead = 0; - // Store two first HEAD characters in the identifier (if present). - std::deque<char> TwoHeads; for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { Next[I] = {{NextTail, NextHead, NextNextHead}}; NextTail = Roles[I] == Tail ? I : 0; if (Roles[I] == Head) { NextNextHead = NextHead; NextHead = I; - TwoHeads.push_front(LowercaseIdentifier[I]); - if (TwoHeads.size() > 2) - TwoHeads.pop_back(); } } DenseSet<Token> UniqueTrigrams; - auto add = [&](std::string Chars) { + auto Add = [&](std::string Chars) { UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); }; - if (TwoHeads.size() == 2) - add({{TwoHeads.front(), TwoHeads.back(), END_MARKER}}); - - if (!LowercaseIdentifier.empty()) - add({{LowercaseIdentifier.front(), END_MARKER, END_MARKER}}); - - if (LowercaseIdentifier.size() >= 2) - add({{LowercaseIdentifier[0], LowercaseIdentifier[1], END_MARKER}}); - - if (LowercaseIdentifier.size() >= 3) - add({{LowercaseIdentifier[0], LowercaseIdentifier[1], - LowercaseIdentifier[2]}}); - - // Iterate through valid seqneces of three characters Fuzzy Matcher can + // Iterate through valid sequneces of three characters Fuzzy Matcher can // process. for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { // Skip delimiters. @@ -92,66 +69,47 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) { for (const unsigned K : Next[J]) { if (K == 0) continue; - add({{LowercaseIdentifier[I], LowercaseIdentifier[J], + Add({{LowercaseIdentifier[I], LowercaseIdentifier[J], LowercaseIdentifier[K]}}); } } } + // Emit short-query trigrams: FooBar -> f, fo, fb. + if (!LowercaseIdentifier.empty()) + Add({LowercaseIdentifier[0]}); + if (LowercaseIdentifier.size() >= 2) + Add({LowercaseIdentifier[0], LowercaseIdentifier[1]}); + for (size_t I = 1; I < LowercaseIdentifier.size(); ++I) + if (Roles[I] == Head) { + Add({LowercaseIdentifier[0], LowercaseIdentifier[I]}); + break; + } - std::vector<Token> Result; - for (const auto &Trigram : UniqueTrigrams) - Result.push_back(Trigram); - - return Result; + return {UniqueTrigrams.begin(), UniqueTrigrams.end()}; } std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) { + std::string LowercaseQuery = Query.lower(); + if (Query.size() < 3) // short-query trigrams only + return {Token(Token::Kind::Trigram, LowercaseQuery)}; + // Apply fuzzy matching text segmentation. std::vector<CharRole> Roles(Query.size()); calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); - // Additional pass is necessary to count valid identifier characters. - // Depending on that, this function might return incomplete trigram. - unsigned ValidSymbolsCount = 0; - for (const auto Role : Roles) - if (Role == Head || Role == Tail) - ++ValidSymbolsCount; - - std::string LowercaseQuery = Query.lower(); - DenseSet<Token> UniqueTrigrams; - - // If the number of symbols which can form fuzzy matching trigram is not - // sufficient, generate a single incomplete trigram for query. - if (ValidSymbolsCount < 3) { - std::string Chars = - LowercaseQuery.substr(0, std::min<size_t>(3UL, Query.size())); - Chars.append(3 - Chars.size(), END_MARKER); - UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); - } else { - std::deque<char> Chars; - for (size_t I = 0; I < LowercaseQuery.size(); ++I) { - // If current symbol is delimiter, just skip it. - if (Roles[I] != Head && Roles[I] != Tail) - continue; - - Chars.push_back(LowercaseQuery[I]); - - if (Chars.size() > 3) - Chars.pop_front(); - - if (Chars.size() == 3) { - UniqueTrigrams.insert( - Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)))); - } - } + std::string Chars; + for (unsigned I = 0; I < Query.size(); ++I) { + if (Roles[I] != Head && Roles[I] != Tail) + continue; // Skip delimiters. + Chars.push_back(LowercaseQuery[I]); + if (Chars.size() > 3) + Chars.erase(Chars.begin()); + if (Chars.size() == 3) + UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); } - std::vector<Token> Result; - for (const auto &Trigram : UniqueTrigrams) - Result.push_back(Trigram); - - return Result; + return {UniqueTrigrams.begin(), UniqueTrigrams.end()}; } } // namespace dex diff --git a/clang-tools-extra/clangd/index/dex/Trigram.h b/clang-tools-extra/clangd/index/dex/Trigram.h index a01c1063dc1..d0fb474ffd7 100644 --- a/clang-tools-extra/clangd/index/dex/Trigram.h +++ b/clang-tools-extra/clangd/index/dex/Trigram.h @@ -33,26 +33,21 @@ namespace clangd { namespace dex { /// Returns list of unique fuzzy-search trigrams from unqualified symbol. +/// The trigrams give the 3-character query substrings this symbol can match. /// -/// First, given Identifier (unqualified symbol name) is segmented using -/// FuzzyMatch API and lowercased. After segmentation, the following technique -/// is applied for generating trigrams: for each letter or digit in the input -/// string the algorithms looks for the possible next and skip-1-next characters -/// which can be jumped to during fuzzy matching. Each combination of such three -/// characters is inserted into the result. -/// +/// The symbol's name is broken into segments, e.g. "FooBar" has two segments. /// Trigrams can start at any character in the input. Then we can choose to move -/// to the next character, move to the start of the next segment, or skip over a -/// segment. +/// to the next character, move to the start of the next segment, or stop. /// -/// This also generates incomplete trigrams for short query scenarios: -/// * Empty trigram: "$$$". -/// * Unigram: the first character of the identifier. -/// * Bigrams: a 2-char prefix of the identifier and a bigram of the first two -/// HEAD characters (if they exist). -// -/// Note: the returned list of trigrams does not have duplicates, if any trigram -/// belongs to more than one class it is only inserted once. +/// Short trigrams (length 1-2) are used for short queries. These are: +/// - prefixes of the identifier, of length 1 and 2 +/// - the first character + next head character +/// +/// For "FooBar" we get the following trigrams: +/// {f, fo, fb, foo, fob, fba, oob, oba, bar}. +/// +/// Trigrams are lowercase, as trigram matching is case-insensitive. +/// Trigrams in the returned list are deduplicated. std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier); /// Returns list of unique fuzzy-search trigrams given a query. |