summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--clang-tools-extra/clangd/index/dex/Iterator.h8
-rw-r--r--clang-tools-extra/clangd/index/dex/Trigram.cpp94
-rw-r--r--clang-tools-extra/clangd/index/dex/Trigram.h14
-rw-r--r--clang-tools-extra/unittests/clangd/DexIndexTests.cpp60
4 files changed, 118 insertions, 58 deletions
diff --git a/clang-tools-extra/clangd/index/dex/Iterator.h b/clang-tools-extra/clangd/index/dex/Iterator.h
index 5e13b17af34..317b5496470 100644
--- a/clang-tools-extra/clangd/index/dex/Iterator.h
+++ b/clang-tools-extra/clangd/index/dex/Iterator.h
@@ -47,6 +47,14 @@ using DocID = uint32_t;
/// Contains sorted sequence of DocIDs all of which belong to symbols matching
/// certain criteria, i.e. containing a Search Token. PostingLists are values
/// for the inverted index.
+// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are
+// likely to be very dense and hence require special attention so that the index
+// doesn't use too much memory. Possible solution would be to construct
+// compressed posting lists which consist of ranges of DocIDs instead of
+// distinct DocIDs. A special case would be the empty query: for that case
+// TrueIterator should be implemented - an iterator which doesn't actually store
+// any PostingList within itself, but "contains" all DocIDs in range
+// [0, IndexSize).
using PostingList = std::vector<DocID>;
/// Immutable reference to PostingList object.
using PostingListRef = llvm::ArrayRef<DocID>;
diff --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp
index 549093df744..370ca29ea6d 100644
--- a/clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -10,11 +10,9 @@
#include "Trigram.h"
#include "../../FuzzyMatch.h"
#include "Token.h"
-
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringExtras.h"
-
#include <cctype>
#include <queue>
#include <string>
@@ -25,12 +23,11 @@ namespace clang {
namespace clangd {
namespace dex {
-// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
-// be generating unigrams and bigrams here, too. This would prevent symbol index
-// from applying fuzzy matching on a tremendous number of symbols and allow
-// supplementary retrieval for short queries.
-//
-// Short names (total segment length <3 characters) are currently ignored.
+/// This is used to mark unigrams and bigrams and distinct them from complete
+/// trigrams. Since '$' is not present in valid identifier names, it is safe to
+/// use it as the special symbol.
+static const char END_MARKER = '$';
+
std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
// Apply fuzzy matching text segmentation.
std::vector<CharRole> Roles(Identifier.size());
@@ -50,17 +47,45 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
// not available then 0 is stored.
std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
+ // Store two first HEAD characters in the identifier (if present).
+ std::deque<char> TwoHeads;
for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
Next[I] = {{NextTail, NextHead, NextNextHead}};
NextTail = Roles[I] == Tail ? I : 0;
if (Roles[I] == Head) {
NextNextHead = NextHead;
NextHead = I;
+ TwoHeads.push_front(LowercaseIdentifier[I]);
+ if (TwoHeads.size() > 2)
+ TwoHeads.pop_back();
}
}
DenseSet<Token> UniqueTrigrams;
- std::array<char, 4> Chars;
+
+ auto add = [&](std::string Chars) {
+ UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
+ };
+
+ // FIXME(kbobyrev): Instead of producing empty trigram for each identifier,
+ // just use True Iterator on the query side when the query string is empty.
+ add({{END_MARKER, END_MARKER, END_MARKER}});
+
+ if (TwoHeads.size() == 2)
+ add({{TwoHeads.front(), TwoHeads.back(), END_MARKER}});
+
+ if (!LowercaseIdentifier.empty())
+ add({{LowercaseIdentifier.front(), END_MARKER, END_MARKER}});
+
+ if (LowercaseIdentifier.size() >= 2)
+ add({{LowercaseIdentifier[0], LowercaseIdentifier[1], END_MARKER}});
+
+ if (LowercaseIdentifier.size() >= 3)
+ add({{LowercaseIdentifier[0], LowercaseIdentifier[1],
+ LowercaseIdentifier[2]}});
+
+ // Iterate through valid seqneces of three characters Fuzzy Matcher can
+ // process.
for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
// Skip delimiters.
if (Roles[I] != Head && Roles[I] != Tail)
@@ -71,13 +96,8 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
for (const unsigned K : Next[J]) {
if (!K)
continue;
- Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J],
- LowercaseIdentifier[K], 0}};
- auto Trigram = Token(Token::Kind::Trigram, Chars.data());
- // Push unique trigrams to the result.
- if (!UniqueTrigrams.count(Trigram)) {
- UniqueTrigrams.insert(Trigram);
- }
+ add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
+ LowercaseIdentifier[K]}});
}
}
}
@@ -89,33 +109,43 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
return Result;
}
-// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short
-// inputs (total segment length <3 characters).
std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
// Apply fuzzy matching text segmentation.
std::vector<CharRole> Roles(Query.size());
calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
+ // Additional pass is necessary to count valid identifier characters.
+ // Depending on that, this function might return incomplete trigram.
+ unsigned ValidSymbolsCount = 0;
+ for (size_t I = 0; I < Roles.size(); ++I)
+ if (Roles[I] == Head || Roles[I] == Tail)
+ ++ValidSymbolsCount;
+
std::string LowercaseQuery = Query.lower();
DenseSet<Token> UniqueTrigrams;
- std::deque<char> Chars;
- for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
- // If current symbol is delimiter, just skip it.
- if (Roles[I] != Head && Roles[I] != Tail)
- continue;
+ // If the number of symbols which can form fuzzy matching trigram is not
+ // sufficient, generate a single incomplete trigram for query.
+ if (ValidSymbolsCount < 3) {
+ std::string Chars = LowercaseQuery.substr(0, std::min(3UL, Query.size()));
+ Chars.append(3 - Chars.size(), END_MARKER);
+ UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
+ } else {
+ std::deque<char> Chars;
+ for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
+ // If current symbol is delimiter, just skip it.
+ if (Roles[I] != Head && Roles[I] != Tail)
+ continue;
+
+ Chars.push_back(LowercaseQuery[I]);
- Chars.push_back(LowercaseQuery[I]);
+ if (Chars.size() > 3)
+ Chars.pop_front();
- if (Chars.size() > 3)
- Chars.pop_front();
- if (Chars.size() == 3) {
- auto Trigram =
- Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
- // Push unique trigrams to the result.
- if (!UniqueTrigrams.count(Trigram)) {
- UniqueTrigrams.insert(Trigram);
+ if (Chars.size() == 3) {
+ UniqueTrigrams.insert(
+ Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))));
}
}
}
diff --git a/clang-tools-extra/clangd/index/dex/Trigram.h b/clang-tools-extra/clangd/index/dex/Trigram.h
index 0e21c7d131c..00614edf56c 100644
--- a/clang-tools-extra/clangd/index/dex/Trigram.h
+++ b/clang-tools-extra/clangd/index/dex/Trigram.h
@@ -36,14 +36,20 @@ namespace dex {
/// First, given Identifier (unqualified symbol name) is segmented using
/// FuzzyMatch API and lowercased. After segmentation, the following technique
/// is applied for generating trigrams: for each letter or digit in the input
-/// string the algorithms looks for the possible next and skip-1-next symbols
+/// string the algorithms looks for the possible next and skip-1-next characters
/// which can be jumped to during fuzzy matching. Each combination of such three
-/// symbols is inserted into the result.
+/// characters is inserted into the result.
///
/// Trigrams can start at any character in the input. Then we can choose to move
/// to the next character, move to the start of the next segment, or skip over a
/// segment.
///
+/// This also generates incomplete trigrams for short query scenarios:
+/// * Empty trigram: "$$$".
+/// * Unigram: the first character of the identifier.
+/// * Bigrams: a 2-char prefix of the identifier and a bigram of the first two
+/// HEAD characters (if they exist).
+//
/// Note: the returned list of trigrams does not have duplicates, if any trigram
/// belongs to more than one class it is only inserted once.
std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
@@ -53,6 +59,10 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
/// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then,
/// the simplest trigrams - sequences of three consecutive letters and digits
/// are extracted and returned after deduplication.
+///
+/// For short queries (less than 3 characters with Head or Tail roles in Fuzzy
+/// Matching segmentation) this returns a single trigram with the first
+/// characters (up to 3) to perfrom prefix match.
std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
} // namespace dex
diff --git a/clang-tools-extra/unittests/clangd/DexIndexTests.cpp b/clang-tools-extra/unittests/clangd/DexIndexTests.cpp
index d5db97ce1d0..f23192fbe80 100644
--- a/clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ b/clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -271,45 +271,57 @@ trigramsAre(std::initializer_list<std::string> Trigrams) {
}
TEST(DexIndexTrigrams, IdentifierTrigrams) {
- EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"}));
+ EXPECT_THAT(generateIdentifierTrigrams("X86"),
+ trigramsAre({"x86", "x$$", "x8$", "$$$"}));
- EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({}));
+ EXPECT_THAT(generateIdentifierTrigrams("nl"),
+ trigramsAre({"nl$", "n$$", "$$$"}));
+
+ EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$", "$$$"}));
EXPECT_THAT(generateIdentifierTrigrams("clangd"),
- trigramsAre({"cla", "lan", "ang", "ngd"}));
+ trigramsAre({"c$$", "cl$", "cla", "lan", "ang", "ngd", "$$$"}));
EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
- trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"}));
+ trigramsAre({"a$$", "abc", "abd", "ade", "bcd", "bde", "cde",
+ "def", "ab$", "ad$", "$$$"}));
- EXPECT_THAT(
- generateIdentifierTrigrams("a_b_c_d_e_"),
- trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+ EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"),
+ trigramsAre({"a$$", "a_$", "a_b", "abc", "abd", "acd", "ace",
+ "bcd", "bce", "bde", "cde", "ab$", "$$$"}));
- EXPECT_THAT(
- generateIdentifierTrigrams("unique_ptr"),
- trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp",
- "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"}));
+ EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"),
+ trigramsAre({"u$$", "uni", "unp", "upt", "niq", "nip", "npt",
+ "iqu", "iqp", "ipt", "que", "qup", "qpt", "uep",
+ "ept", "ptr", "un$", "up$", "$$$"}));
EXPECT_THAT(generateIdentifierTrigrams("TUDecl"),
- trigramsAre({"tud", "tde", "ude", "dec", "ecl"}));
+ trigramsAre({"t$$", "tud", "tde", "ude", "dec", "ecl", "tu$",
+ "td$", "$$$"}));
EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
- trigramsAre({"iso", "iok", "sok"}));
-
- EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"),
- trigramsAre({
- "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd",
- "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk",
- "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg",
- "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk",
- "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm",
- }));
+ trigramsAre({"i$$", "iso", "iok", "sok", "is$", "io$", "$$$"}));
+
+ EXPECT_THAT(
+ generateIdentifierTrigrams("abc_defGhij__klm"),
+ trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh",
+ "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk",
+ "cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek",
+ "dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl",
+ "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik",
+ "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$", "$$$"}));
}
TEST(DexIndexTrigrams, QueryTrigrams) {
- EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
+ EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"}));
+ EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"}));
+ EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"}));
- EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({}));
+ EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_$$"}));
+ EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"}));
+ EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"}));
+
+ EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
EXPECT_THAT(generateQueryTrigrams("clangd"),
trigramsAre({"cla", "lan", "ang", "ngd"}));
OpenPOWER on IntegriCloud