summaryrefslogtreecommitdiffstats
path: root/clang-tools-extra/clangd/FuzzyMatch.h
diff options
context:
space:
mode:
Diffstat (limited to 'clang-tools-extra/clangd/FuzzyMatch.h')
-rw-r--r--clang-tools-extra/clangd/FuzzyMatch.h50
1 files changed, 45 insertions, 5 deletions
diff --git a/clang-tools-extra/clangd/FuzzyMatch.h b/clang-tools-extra/clangd/FuzzyMatch.h
index 552ab78e8e1..f0c7e722ab3 100644
--- a/clang-tools-extra/clangd/FuzzyMatch.h
+++ b/clang-tools-extra/clangd/FuzzyMatch.h
@@ -16,6 +16,7 @@
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
@@ -24,6 +25,48 @@
namespace clang {
namespace clangd {
+// Utilities for word segmentation.
+// FuzzyMatcher already incorporates this logic, so most users don't need this.
+//
+// A name like "fooBar_baz" consists of several parts foo, bar, baz.
+// Aligning segmentation of word and pattern improves the fuzzy-match.
+// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
+//
+// First we classify each character into types (uppercase, lowercase, etc).
+// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
+
+// We distinguish the types of characters that affect segmentation.
+// It's not obvious how to segment digits, we treat them as lowercase letters.
+// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
+// This means we require exact (case-sensitive) match for those characters.
+enum CharType : unsigned char {
+ Empty = 0, // Before-the-start and after-the-end (and control chars).
+ Lower = 1, // Lowercase letters, digits, and non-ASCII bytes.
+ Upper = 2, // Uppercase letters.
+ Punctuation = 3, // ASCII punctuation (including Space)
+};
+// A CharTypeSet is a bitfield representing all the character types in a word.
+// Its bits are 1<<Empty, 1<<Lower, etc.
+using CharTypeSet = unsigned char;
+
+// Each character's Role is the Head or Tail of a segment, or a Separator.
+// e.g. XMLHttpRequest_Async
+// +--+---+------ +----
+// ^Head ^Tail ^Separator
+enum CharRole : unsigned char {
+ Unknown = 0, // Stray control characters or impossible states.
+ Tail = 1, // Part of a word segment, but not the first character.
+ Head = 2, // The first character of a word segment.
+ Separator = 3, // Punctuation characters that separate word segments.
+};
+
+// Compute segmentation of Text.
+// Character roles are stored in Roles (Roles.size() must equal Text.size()).
+// The set of character types encountered is returned, this may inform
+// heuristics for dealing with poorly-segmented identifiers like "strndup".
+CharTypeSet calculateRoles(llvm::StringRef Text,
+ llvm::MutableArrayRef<CharRole> Roles);
+
// A matcher capable of matching and scoring strings against a single pattern.
// It's optimized for matching against many strings - match() does not allocate.
class FuzzyMatcher {
@@ -48,8 +91,6 @@ public:
private:
// We truncate the pattern and the word to bound the cost of matching.
constexpr static int MaxPat = 63, MaxWord = 127;
- enum CharRole : unsigned char; // For segmentation.
- enum CharType : unsigned char; // For segmentation.
// Action describes how a word character was matched to the pattern.
// It should be an enum, but this causes bitfield problems:
// - for MSVC the enum type must be explicitly unsigned for correctness
@@ -60,7 +101,6 @@ private:
bool init(llvm::StringRef Word);
void buildGraph();
- void calculateRoles(const char *Text, CharRole *Out, int &Types, int N);
bool allowMatch(int P, int W, Action Last) const;
int skipPenalty(int W, Action Last) const;
int matchBonus(int P, int W, Action Last) const;
@@ -70,7 +110,7 @@ private:
int PatN; // Length
char LowPat[MaxPat]; // Pattern in lowercase
CharRole PatRole[MaxPat]; // Pattern segmentation info
- int PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters
+ CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters
float ScoreScale; // Normalizes scores for the pattern length.
// Word data is initialized on each call to match(), mostly by init().
@@ -78,7 +118,7 @@ private:
int WordN; // Length
char LowWord[MaxWord]; // Word in lowercase
CharRole WordRole[MaxWord]; // Word segmentation info
- int WordTypeSet; // Bitmask of 1<<CharType for all Word characters
+ CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters
bool WordContainsPattern; // Simple substring check
// Cumulative best-match score table.
OpenPOWER on IntegriCloud