summaryrefslogtreecommitdiffstats
path: root/clang-tools-extra/clangd/FuzzyMatch.h
blob: 13a25e28311811dce123b2836a0f1459e9c3c353 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements fuzzy-matching of strings against identifiers.
// It indicates both the existence and quality of a match:
// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"

namespace clang {
namespace clangd {

// A matcher capable of matching and scoring strings against a single pattern.
// It's optimized for matching against many strings - match() does not allocate.
class FuzzyMatcher {
public:
  // Characters beyond MaxPat are ignored.
  FuzzyMatcher(llvm::StringRef Pattern);

  // If Word matches the pattern, return a score in [0,1] (higher is better).
  // Characters beyond MaxWord are ignored.
  llvm::Optional<float> match(llvm::StringRef Word);

  llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); }
  bool empty() const { return PatN == 0; }

  // Dump internal state from the last match() to the stream, for debugging.
  // Returns the pattern with [] around matched characters, e.g.
  //   [u_p] + "unique_ptr" --> "[u]nique[_p]tr"
  llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const;

private:
  // We truncate the pattern and the word to bound the cost of matching.
  constexpr static int MaxPat = 63, MaxWord = 127;
  enum CharRole : unsigned char; // For segmentation.
  enum CharType : unsigned char; // For segmentation.
  // Action should be an enum, but this causes bitfield problems:
  //   - for MSVC the enum type must be explicitly unsigned for correctness
  //   - GCC 4.8 complains not all values fit if the type is unsigned
  using Action = bool;
  constexpr static Action Miss = false, Match = true;

  bool init(llvm::StringRef Word);
  void buildGraph();
  void calculateRoles(const char *Text, CharRole *Out, int &Types, int N);
  bool allowMatch(int P, int W) const;
  int skipPenalty(int W, Action Last) const;
  int matchBonus(int P, int W, Action Last) const;

  // Pattern data is initialized by the constructor, then constant.
  char Pat[MaxPat];         // Pattern data
  int PatN;                 // Length
  char LowPat[MaxPat];      // Pattern in lowercase
  CharRole PatRole[MaxPat]; // Pattern segmentation info
  int PatTypeSet;           // Bitmask of 1<<CharType
  float ScoreScale;         // Normalizes scores for the pattern length.

  // Word data is initialized on each call to match(), mostly by init().
  char Word[MaxWord];         // Word data
  int WordN;                  // Length
  char LowWord[MaxWord];      // Word in lowercase
  CharRole WordRole[MaxWord]; // Word segmentation info
  int WordTypeSet;            // Bitmask of 1<<CharType
  bool WordContainsPattern;   // Simple substring check

  // Cumulative best-match score table.
  // Boundary conditions are filled in by the constructor.
  // The rest is repopulated for each match(), by buildGraph().
  struct ScoreInfo {
    signed int Score : 15;
    Action Prev : 1;
  };
  ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2];
};

} // namespace clangd
} // namespace clang

#endif
OpenPOWER on IntegriCloud