[clangd] Fix unintentionally loose fuzzy matching, and the tests masking it.

Summary: The intent was that [ar] doesn't match "FooBar"; the first character must match a Head character (hard requirement, not just a low score). This matches VSCode, and was "tested" but the tests were defective. The tests expected matches("FooBar") to fail for lack of a match. But instead it fails because the string should be annotated - matches("FooB[ar]"). This patch makes matches("FooBar") ignore annotations, as was intended. Fixing the code to reject weak matches for the first char causes problems: - [bre] no longer matches "HTMLBRElement". We allow matching against an uppercase char even if we don't think it's head. Only do this if there's at least one lowercase, to avoid triggering on MACROS - [print] no longer matches "sprintf". This is hard to fix without false positives (e.g. [int] vs "sprintf"]) This patch leaves this case broken. A future patch will add a dictionary providing custom segmentation to common names from the standard library. Fixed a couple of index tests that indirectly relied on broken fuzzy matching. Added const in a couple of missing places for consistency with new code. Subscribers: klimek, ilya-biryukov, jkorous-apple, ioeric, cfe-commits Differential Revision: https://reviews.llvm.org/D44003 llvm-svn: 326721
author: Sam McCall <sam.mccall@gmail.com> 2018-03-05 17:34:33 +0000
committer: Sam McCall <sam.mccall@gmail.com> 2018-03-05 17:34:33 +0000
commit: 77a719cb9ebc01db7d6936d1b4dd2bfcfa38b39b (patch)
tree: 58c8947de2fffb697be9a5676460eac9c8d9d51f
parent: 0b7c6422fbd0c837142f347ab92a650706ebcedf (diff)
download: bcm5719-llvm-77a719cb9ebc01db7d6936d1b4dd2bfcfa38b39b.tar.gz
bcm5719-llvm-77a719cb9ebc01db7d6936d1b4dd2bfcfa38b39b.zip
4 files changed, 76 insertions, 47 deletions
diff --git a/clang-tools-extra/clangd/FuzzyMatch.cpp b/clang-tools-extra/clangd/FuzzyMatch.cpp
index 68ba695fe64..152f1799777 100644
--- a/clang-tools-extra/clangd/FuzzyMatch.cpp
+++ b/clang-tools-extra/clangd/FuzzyMatch.cpp
@@ -33,6 +33,8 @@
 //    Legal if the characters match.
 //  - Moving down (consuming a pattern character) is never legal.
 //    Never legal: all pattern characters must match something.
+// Characters are matched case-insensitively.
+// The first pattern character may only match the start of a word segment.
 //
 // The scoring is based on heuristics:
 //  - when matching a character, apply a bonus or penalty depending on the
@@ -74,13 +76,11 @@ static bool isAwful(int S) { return S < AwfulScore / 2; }
 static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
 
 FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
-    : PatN(std::min<int>(MaxPat, Pattern.size())), CaseSensitive(false),
+    : PatN(std::min<int>(MaxPat, Pattern.size())),
       ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
   std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
-  for (int I = 0; I < PatN; ++I) {
+  for (int I = 0; I < PatN; ++I)
     LowPat[I] = lower(Pat[I]);
-    CaseSensitive |= LowPat[I] != Pat[I];
-  }
   Scores[0][0][Miss] = {0, Miss};
   Scores[0][0][Match] = {AwfulScore, Miss};
   for (int P = 0; P <= PatN; ++P)
@@ -88,7 +88,7 @@ FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
       for (Action A : {Miss, Match})
         Scores[P][W][A] = {AwfulScore, Miss};
   if (PatN > 0)
-    calculateRoles(Pat, PatRole, PatN);
+    calculateRoles(Pat, PatRole, PatTypeSet, PatN);
 }
 
 Optional<float> FuzzyMatcher::match(StringRef Word) {
@@ -177,16 +177,21 @@ constexpr static uint8_t CharRoles[] = {
 template <typename T> static T packedLookup(const uint8_t *Data, int I) {
   return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
 }
-void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
+void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int &TypeSet,
+                                  int N) {
   assert(N > 0);
+  CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
+  TypeSet = 1 << Type;
   // Types holds a sliding window of (Prev, Curr, Next) types.
   // Initial value is (Empty, Empty, type of Text[0]).
-  int Types = packedLookup<CharType>(CharTypes, Text[0]);
+  int Types = Type;
   // Rotate slides in the type of the next character.
   auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
   for (int I = 0; I < N - 1; ++I) {
     // For each character, rotate in the next, and look up the role.
-    Rotate(packedLookup<CharType>(CharTypes, Text[I + 1]));
+    Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
+    TypeSet |= 1 << Type;
+    Rotate(Type);
     *Out++ = packedLookup<CharRole>(CharRoles, Types);
   }
   // For the last character, the "next character" is Empty.
@@ -214,7 +219,10 @@ bool FuzzyMatcher::init(StringRef NewWord) {
       ++P;
   }
 
-  calculateRoles(Word, WordRole, WordN);
+  // FIXME: some words are hard to tokenize algorithmically.
+  // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
+  // We could add a tokenization dictionary for common stdlib names.
+  calculateRoles(Word, WordRole, WordTypeSet, WordN);
   return true;
 }
 
@@ -247,7 +255,7 @@ void FuzzyMatcher::buildGraph() {
                         ? ScoreInfo{MatchMissScore, Match}
                         : ScoreInfo{MissMissScore, Miss};
 
-      if (LowPat[P] != LowWord[W]) { // No match possible.
+      if (!allowMatch(P, W)) {
         Score[Match] = {AwfulScore, Miss};
       } else {
         auto &PreMatch = Scores[P][W];
@@ -261,7 +269,22 @@ void FuzzyMatcher::buildGraph() {
   }
 }
 
-int FuzzyMatcher::skipPenalty(int W, Action Last) {
+bool FuzzyMatcher::allowMatch(int P, int W) const {
+  if (LowPat[P] != LowWord[W])
+    return false;
+  // We require a "strong" match for the first pattern character only.
+  if (P > 0)
+    return true;
+  // Obvious "strong match" for first char: match against a word head.
+  // We're banning matches outright, so conservatively accept some other cases
+  // where our segmentation might be wrong:
+  //  - allow matching B in ABCDef (but not in NDEBUG)
+  //  - we'd like to accept print in sprintf, but too many false positives
+  return WordRole[W] != Tail ||
+         (Word[W] != LowWord[W] && WordTypeSet & 1 << Lower);
+}
+
+int FuzzyMatcher::skipPenalty(int W, Action Last) const {
   int S = 0;
   if (WordRole[W] == Head) // Skipping a segment.
     S += 1;
@@ -270,14 +293,14 @@ int FuzzyMatcher::skipPenalty(int W, Action Last) {
   return S;
 }
 
-int FuzzyMatcher::matchBonus(int P, int W, Action Last) {
+int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
   assert(LowPat[P] == LowWord[W]);
   int S = 1;
   // Bonus: pattern so far is a (case-insensitive) prefix of the word.
   if (P == W) // We can't skip pattern characters, so we must have matched all.
     ++S;
   // Bonus: case matches, or a Head in the pattern aligns with one in the word.
-  if ((Pat[P] == Word[W] && (CaseSensitive || P == W)) ||
+  if ((Pat[P] == Word[W] && ((PatTypeSet & 1 << Upper) || P == W)) ||
       (PatRole[P] == Head && WordRole[W] == Head))
     ++S;
   // Penalty: matching inside a segment (and previous char wasn't matched).
@@ -312,7 +335,7 @@ llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
                               Scores[PatN][WordN][Miss].Score))) {
     OS << "Substring check passed, but all matches are forbidden\n";
   }
-  if (!CaseSensitive)
+  if (!(PatTypeSet & 1 << Upper))
     OS << "Lowercase query, so scoring ignores case\n";
 
   // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
diff --git a/clang-tools-extra/clangd/FuzzyMatch.h b/clang-tools-extra/clangd/FuzzyMatch.h
index 21bf92ce256..13a25e28311 100644
--- a/clang-tools-extra/clangd/FuzzyMatch.h
+++ b/clang-tools-extra/clangd/FuzzyMatch.h
@@ -56,16 +56,17 @@ private:
 
   bool init(llvm::StringRef Word);
   void buildGraph();
-  void calculateRoles(const char *Text, CharRole *Out, int N);
-  int skipPenalty(int W, Action Last);
-  int matchBonus(int P, int W, Action Last);
+  void calculateRoles(const char *Text, CharRole *Out, int &Types, int N);
+  bool allowMatch(int P, int W) const;
+  int skipPenalty(int W, Action Last) const;
+  int matchBonus(int P, int W, Action Last) const;
 
   // Pattern data is initialized by the constructor, then constant.
   char Pat[MaxPat];         // Pattern data
   int PatN;                 // Length
   char LowPat[MaxPat];      // Pattern in lowercase
   CharRole PatRole[MaxPat]; // Pattern segmentation info
-  bool CaseSensitive;       // Case-sensitive match if pattern has uppercase
+  int PatTypeSet;           // Bitmask of 1<<CharType
   float ScoreScale;         // Normalizes scores for the pattern length.
 
   // Word data is initialized on each call to match(), mostly by init().
@@ -73,6 +74,7 @@ private:
   int WordN;                  // Length
   char LowWord[MaxWord];      // Word in lowercase
   CharRole WordRole[MaxWord]; // Word segmentation info
+  int WordTypeSet;            // Bitmask of 1<<CharType
   bool WordContainsPattern;   // Simple substring check
 
   // Cumulative best-match score table.
diff --git a/clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp b/clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp
index 6efcdbe8077..b665fe36e4d 100644
--- a/clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp
+++ b/clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp
@@ -20,16 +20,27 @@ using namespace llvm;
 using testing::Not;
 
 struct ExpectedMatch {
-  ExpectedMatch(StringRef Annotated) : Word(Annotated), Annotated(Annotated) {
+  // Annotations are optional, and will not be asserted if absent.
+  ExpectedMatch(StringRef Match) : Word(Match), Annotated(Match) {
     for (char C : "[]")
       Word.erase(std::remove(Word.begin(), Word.end(), C), Word.end());
+    if (Word.size() == Annotated->size())
+      Annotated = None;
+  }
+  bool accepts(StringRef ActualAnnotated) const {
+    return !Annotated || ActualAnnotated == *Annotated;
   }
   std::string Word;
-  StringRef Annotated;
+
+  friend raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
+    return OS << "'" << M.Word;
+    if (M.Annotated)
+      OS << "' as " << *M.Annotated;
+  }
+
+private:
+  Optional<StringRef> Annotated;
 };
-raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
-  return OS << "'" << M.Word << "' as " << M.Annotated;
-}
 
 struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
   ExpectedMatch Candidate;
@@ -47,7 +58,7 @@ struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
     FuzzyMatcher Matcher(Pattern);
     auto Result = Matcher.match(Candidate.Word);
     auto AnnotatedMatch = Matcher.dumpLast(*OS << "\n");
-    return Result && AnnotatedMatch == Candidate.Annotated;
+    return Result && Candidate.accepts(AnnotatedMatch);
   }
 };
 
@@ -122,6 +133,7 @@ TEST(FuzzyMatch, Matches) {
   EXPECT_THAT("sl", matches("[S]Visual[L]oggerLogsList"));
   EXPECT_THAT("sllll", matches("[S]Visua[lL]ogger[L]ogs[L]ist"));
   EXPECT_THAT("Three", matches("H[T]ML[HRE]l[e]ment"));
+  EXPECT_THAT("b", Not(matches("NDEBUG")));
   EXPECT_THAT("Three", matches("[Three]"));
   EXPECT_THAT("fo", Not(matches("barfoo")));
   EXPECT_THAT("fo", matches("bar_[fo]o"));
@@ -151,8 +163,9 @@ TEST(FuzzyMatch, Matches) {
   EXPECT_THAT("g", matches("zz[G]roup"));
 
   EXPECT_THAT("aaaa", matches("_a_[aaaa]")); // Prefer consecutive.
-  EXPECT_THAT("printf", matches("s[printf]"));
-  EXPECT_THAT("str", matches("o[str]eam"));
+  // These would ideally match, but would need special segmentation rules.
+  EXPECT_THAT("printf", Not(matches("s[printf]")));
+  EXPECT_THAT("str", Not(matches("o[str]eam")));
 }
 
 struct RankMatcher : public testing::MatcherInterface<StringRef> {
@@ -188,9 +201,8 @@ struct RankMatcher : public testing::MatcherInterface<StringRef> {
         llvm::raw_string_ostream Info(Buf);
         auto AnnotatedMatch = Matcher.dumpLast(Info);
 
-        if (AnnotatedMatch != Str.Annotated) {
-          *OS << "\nMatched " << Str.Word << " as " << AnnotatedMatch
-              << " instead of " << Str.Annotated << "\n"
+        if (!Str.accepts(AnnotatedMatch)) {
+          *OS << "\nDoesn't match " << Str << ", but " << AnnotatedMatch << "\n"
               << Info.str();
           Ok = false;
         } else if (LastScore && *LastScore < *Score) {
diff --git a/clang-tools-extra/unittests/clangd/IndexTests.cpp b/clang-tools-extra/unittests/clangd/IndexTests.cpp
index 3c61533818e..20eb4527d9f 100644
--- a/clang-tools-extra/unittests/clangd/IndexTests.cpp
+++ b/clang-tools-extra/unittests/clangd/IndexTests.cpp
@@ -116,14 +116,6 @@ TEST(MemIndexTest, MemIndexSymbolsRecycled) {
   EXPECT_TRUE(Symbols.expired());
 }
 
-TEST(MemIndexTest, MemIndexMatchSubstring) {
-  MemIndex I;
-  I.build(generateNumSymbols(5, 25));
-  FuzzyFindRequest Req;
-  Req.Query = "5";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("5", "15", "25"));
-}
-
 TEST(MemIndexTest, MemIndexDeduplicate) {
   auto Symbols = generateNumSymbols(0, 10);
 
@@ -166,46 +158,46 @@ TEST(MemIndexTest, FuzzyMatch) {
 
 TEST(MemIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "b::yz", "yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithGlobalScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {""};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithOneScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithMultipleScopes) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a", "b"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy", "b::yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
 }
 
 TEST(MemIndexTest, NoMatchNestedScopes) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::b::yy"}));
+  I.build(generateSymbols({"a::y1", "a::b::y2"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1"));
 }
 
 TEST(MemIndexTest, IgnoreCases) {
author	Sam McCall <sam.mccall@gmail.com>	2018-03-05 17:34:33 +0000
committer	Sam McCall <sam.mccall@gmail.com>	2018-03-05 17:34:33 +0000
commit	77a719cb9ebc01db7d6936d1b4dd2bfcfa38b39b (patch)
tree	58c8947de2fffb697be9a5676460eac9c8d9d51f
parent	0b7c6422fbd0c837142f347ab92a650706ebcedf (diff)
download	bcm5719-llvm-77a719cb9ebc01db7d6936d1b4dd2bfcfa38b39b.tar.gz bcm5719-llvm-77a719cb9ebc01db7d6936d1b4dd2bfcfa38b39b.zip