diff options
Diffstat (limited to 'clang-tools-extra/clangd/FuzzyMatch.h')
-rw-r--r-- | clang-tools-extra/clangd/FuzzyMatch.h | 50 |
1 files changed, 45 insertions, 5 deletions
diff --git a/clang-tools-extra/clangd/FuzzyMatch.h b/clang-tools-extra/clangd/FuzzyMatch.h index 552ab78e8e1..f0c7e722ab3 100644 --- a/clang-tools-extra/clangd/FuzzyMatch.h +++ b/clang-tools-extra/clangd/FuzzyMatch.h @@ -16,6 +16,7 @@ #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" @@ -24,6 +25,48 @@ namespace clang { namespace clangd { +// Utilities for word segmentation. +// FuzzyMatcher already incorporates this logic, so most users don't need this. +// +// A name like "fooBar_baz" consists of several parts foo, bar, baz. +// Aligning segmentation of word and pattern improves the fuzzy-match. +// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation" +// +// First we classify each character into types (uppercase, lowercase, etc). +// Then we look at the sequence: e.g. [upper, lower] is the start of a segment. + +// We distinguish the types of characters that affect segmentation. +// It's not obvious how to segment digits, we treat them as lowercase letters. +// As we don't decode UTF-8, we treat bytes over 127 as lowercase too. +// This means we require exact (case-sensitive) match for those characters. +enum CharType : unsigned char { + Empty = 0, // Before-the-start and after-the-end (and control chars). + Lower = 1, // Lowercase letters, digits, and non-ASCII bytes. + Upper = 2, // Uppercase letters. + Punctuation = 3, // ASCII punctuation (including Space) +}; +// A CharTypeSet is a bitfield representing all the character types in a word. +// Its bits are 1<<Empty, 1<<Lower, etc. +using CharTypeSet = unsigned char; + +// Each character's Role is the Head or Tail of a segment, or a Separator. +// e.g. XMLHttpRequest_Async +// +--+---+------ +---- +// ^Head ^Tail ^Separator +enum CharRole : unsigned char { + Unknown = 0, // Stray control characters or impossible states. + Tail = 1, // Part of a word segment, but not the first character. + Head = 2, // The first character of a word segment. + Separator = 3, // Punctuation characters that separate word segments. +}; + +// Compute segmentation of Text. +// Character roles are stored in Roles (Roles.size() must equal Text.size()). +// The set of character types encountered is returned, this may inform +// heuristics for dealing with poorly-segmented identifiers like "strndup". +CharTypeSet calculateRoles(llvm::StringRef Text, + llvm::MutableArrayRef<CharRole> Roles); + // A matcher capable of matching and scoring strings against a single pattern. // It's optimized for matching against many strings - match() does not allocate. class FuzzyMatcher { @@ -48,8 +91,6 @@ public: private: // We truncate the pattern and the word to bound the cost of matching. constexpr static int MaxPat = 63, MaxWord = 127; - enum CharRole : unsigned char; // For segmentation. - enum CharType : unsigned char; // For segmentation. // Action describes how a word character was matched to the pattern. // It should be an enum, but this causes bitfield problems: // - for MSVC the enum type must be explicitly unsigned for correctness @@ -60,7 +101,6 @@ private: bool init(llvm::StringRef Word); void buildGraph(); - void calculateRoles(const char *Text, CharRole *Out, int &Types, int N); bool allowMatch(int P, int W, Action Last) const; int skipPenalty(int W, Action Last) const; int matchBonus(int P, int W, Action Last) const; @@ -70,7 +110,7 @@ private: int PatN; // Length char LowPat[MaxPat]; // Pattern in lowercase CharRole PatRole[MaxPat]; // Pattern segmentation info - int PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters + CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters float ScoreScale; // Normalizes scores for the pattern length. // Word data is initialized on each call to match(), mostly by init(). @@ -78,7 +118,7 @@ private: int WordN; // Length char LowWord[MaxWord]; // Word in lowercase CharRole WordRole[MaxWord]; // Word segmentation info - int WordTypeSet; // Bitmask of 1<<CharType for all Word characters + CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters bool WordContainsPattern; // Simple substring check // Cumulative best-match score table. |