clang-tools-extra/clangd/FuzzyMatch.h

   1 //===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file implements fuzzy-matching of strings against identifiers.
  10 // It indicates both the existence and quality of a match:
  11 // 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
  16 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
  17
  18 #include "llvm/ADT/ArrayRef.h"
  19 #include "llvm/ADT/SmallString.h"
  20 #include "llvm/ADT/StringRef.h"
  21 #include "llvm/Support/raw_ostream.h"
  22 #include <optional>
  23
  24 namespace clang {
  25 namespace clangd {
  26
  27 // Utilities for word segmentation.
  28 // FuzzyMatcher already incorporates this logic, so most users don't need this.
  29 //
  30 // A name like "fooBar_baz" consists of several parts: foo, bar, baz.
  31 // Aligning segmentation of word and pattern improves the fuzzy-match.
  32 // For example: [lol] matches "LaughingOutLoud" better than "LionPopulation".
  33 //
  34 // First we classify each character into types (uppercase, lowercase, etc).
  35 // Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
  36
  37 // We distinguish the types of characters that affect segmentation.
  38 // It's not obvious how to segment digits; we treat them as lowercase letters.
  39 // As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
  40 // This means we require an exact (case-sensitive) match for those characters.
     // The numeric values are also used as bit positions in a CharTypeSet
     // (1 << value), so renumbering them changes the meaning of those bits.
  41 enum CharType : unsigned char {
  42   Empty = 0,       // Before-the-start and after-the-end (and control chars).
  43   Lower = 1,       // Lowercase letters, digits, and non-ASCII bytes.
  44   Upper = 2,       // Uppercase letters.
  45   Punctuation = 3, // ASCII punctuation (including Space).
  46 };
  47 // A CharTypeSet is a bitfield representing all the character types in a word.
  48 // Its bits are 1<<Empty, 1<<Lower, etc.
     // With the four CharType values, only the low four bits are meaningful:
     // 0x1 = Empty, 0x2 = Lower, 0x4 = Upper, 0x8 = Punctuation.
  49 using CharTypeSet = unsigned char;
  50
  51 // Each character's Role is the Head or Tail of a segment, or a Separator.
  52 // e.g. XMLHttpRequest_Async
  53 //      +--+---+------ +----
  54 //      ^Head   ^Tail ^Separator
     // In the diagram, '+' marks a Head, '-' a Tail, and ' ' a Separator.
     // Roles are written, one per character, by calculateRoles().
  55 enum CharRole : unsigned char {
  56   Unknown = 0,   // Stray control characters or impossible states.
  57   Tail = 1,      // Part of a word segment, but not the first character.
  58   Head = 2,      // The first character of a word segment.
  59   Separator = 3, // Punctuation characters that separate word segments.
  60 };
  61
  62 // Computes the segmentation of Text.
  63 //   Text:  the string to segment.
  64 //   Roles: output; receives the character roles. The caller must size it
  65 //          so that Roles.size() == Text.size().
     // Returns the set of character types encountered, as a CharTypeSet bitmask.
     // This may inform heuristics for dealing with poorly-segmented identifiers
     // like "strndup".
  66 CharTypeSet calculateRoles(llvm::StringRef Text,
  67                            llvm::MutableArrayRef<CharRole> Roles);
  68
  69 // A matcher capable of matching and scoring strings against a single pattern.
  70 // It's optimized for matching against many strings - match() does not allocate.
     // All working state, including the (MaxPat+1) x (MaxWord+1) x 2 score table
     // (16384 entries), is stored inline in the object, so a FuzzyMatcher is
     // large. match() is non-const: each call overwrites the per-word members,
     // and dumpLast() reports on whatever the most recent match() left behind.
  71 class FuzzyMatcher {
  72 public:
  73   // Builds a matcher for Pattern. Characters beyond MaxPat (63) are ignored.
       // NOTE(review): the constructor is not marked explicit, so a StringRef
       // converts to a FuzzyMatcher implicitly; confirm callers rely on that
       // before changing it.
  74   FuzzyMatcher(llvm::StringRef Pattern);
  75
  76   // If Word matches the pattern, return a score indicating the match quality.
  77   // Scores usually fall in a [0,1] range, with 1 being a very good score.
  78   // "Super" scores in (1,2] are possible if the pattern is the full word.
  79   // Characters beyond MaxWord (127) are ignored.
       // NOTE(review): presumably returns std::nullopt when Word does not match;
       // confirm against the implementation.
  80   std::optional<float> match(llvm::StringRef Word);
  81
       // Returns the stored pattern (the first PatN bytes of Pat). The StringRef
       // points into this object's own buffer, so it must not outlive the matcher.
  82   llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); }
       // True if the stored pattern has length zero.
  83   bool empty() const { return PatN == 0; }
  84
  85   // Dump internal state from the last match() to the stream, for debugging.
  86   // Judging by the example, the returned string is the last word with []
  87   // around its matched characters:
       //   [u_p] + "unique_ptr" --> "[u]nique[_p]tr"
       // (The previous wording said "pattern"; the example shows the word.)
  88   llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const;
  89
  90 private:
  91   // We truncate the pattern and the word to bound the cost of matching.
  92   constexpr static int MaxPat = 63, MaxWord = 127;
  93   // Action describes how a word character was matched to the pattern.
  94   // It should be an enum, but this causes bitfield problems:
  95   //   - for MSVC the enum type must be explicitly unsigned for correctness
  96   //   - GCC 4.8 complains not all values fit if the type is unsigned
  97   using Action = bool;
  98   constexpr static Action Miss = false; // Word character was skipped.
  99   constexpr static Action Match = true; // Matched against a pattern character.
 100
       // Internal steps of match(); their definitions are not in this header.
       // Per the "Word data" and "score table" comments below, init() fills in
       // most of the per-word members and buildGraph() repopulates Scores.
       // NOTE(review): P and W look like positions in the pattern and the word,
       // and Last like the Action taken for the previous character; the meaning
       // of init()'s bool result is not visible here. Confirm in the .cpp.
 101   bool init(llvm::StringRef Word);
 102   void buildGraph();
 103   bool allowMatch(int P, int W, Action Last) const;
 104   int skipPenalty(int W, Action Last) const;
 105   int matchBonus(int P, int W, Action Last) const;
 106
 107   // Pattern data is initialized by the constructor, then constant.
 108   char Pat[MaxPat];         // Pattern data
 109   int PatN;                 // Length
 110   char LowPat[MaxPat];      // Pattern in lowercase
 111   CharRole PatRole[MaxPat]; // Pattern segmentation info
 112   CharTypeSet PatTypeSet;   // Bitmask of 1<<CharType for all Pattern characters
 113   float ScoreScale;         // Normalizes scores for the pattern length.
 114
 115   // Word data is initialized on each call to match(), mostly by init().
 116   char Word[MaxWord];         // Word data
 117   int WordN;                  // Length
 118   char LowWord[MaxWord];      // Word in lowercase
 119   CharRole WordRole[MaxWord]; // Word segmentation info
 120   CharTypeSet WordTypeSet;    // Bitmask of 1<<CharType for all Word characters
 121   bool WordContainsPattern;   // Simple substring check
 122
 123   // Cumulative best-match score table.
 124   // Boundary conditions are filled in by the constructor.
 125   // The rest is repopulated for each match(), by buildGraph().
       // Each cell packs a 15-bit signed score with a 1-bit Action. The two
       // bitfields have different declared types, so how tightly they pack is
       // compiler-dependent (see the Action comment above).
 126   struct ScoreInfo {
 127     signed int Score : 15; // Cumulative score for this cell.
 128     Action Prev : 1;       // An Action (Miss or Match) recorded with the score.
 129   };
       // Dimensions: MaxPat+1 by MaxWord+1 by the two possible Action values.
 130   ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2];
 131 };
 132
 133 } // namespace clangd
 134 } // namespace clang
 135
 136 #endif