clang-tools-extra/clangd/FuzzyMatch.cpp

//===--- FuzzyMatch.cpp - Approximate identifier matching  -------*- C++-*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
  10 // we consider the possible partial match states:
  11 //
  12 //     u n i q u e _ p t r
  13 //   +---------------------
  14 //   |A . . . . . . . . . .
  15 //  u|
  16 //   |. . . . . . . . . . .
  17 //  _|
  18 //   |. . . . . . . O . . .
  19 //  p|
  20 //   |. . . . . . . . . . B
  21 //
  22 // Each dot represents some prefix of the pattern being matched against some
  23 // prefix of the word.
  24 //   - A is the initial state: '' matched against ''
  25 //   - O is an intermediate state: 'u_' matched against 'unique_'
  26 //   - B is the target state: 'u_p' matched against 'unique_ptr'
  27 //
  28 // We aim to find the best path from A->B.
  29 //  - Moving right (consuming a word character)
  30 //    Always legal: not all word characters must match.
  31 //  - Moving diagonally (consuming both a word and pattern character)
  32 //    Legal if the characters match.
//  - Moving down (consuming a pattern character)
//    Never legal: all pattern characters must match something.
  35 // Characters are matched case-insensitively.
  36 // The first pattern character may only match the start of a word segment.
  37 //
  38 // The scoring is based on heuristics:
  39 //  - when matching a character, apply a bonus or penalty depending on the
  40 //    match quality (does case match, do word segments align, etc)
  41 //  - when skipping a character, apply a penalty if it hurts the match
  42 //    (it starts a word segment, or splits the matched region, etc)
  43 //
  44 // These heuristics require the ability to "look backward" one character, to
  45 // see whether it was matched or not. Therefore the dynamic-programming matrix
  46 // has an extra dimension (last character matched).
  47 // Each entry also has an additional flag indicating whether the last-but-one
  48 // character matched, which is needed to trace back through the scoring table
  49 // and reconstruct the match.
  50 //
  51 // We treat strings as byte-sequences, so only ASCII has first-class support.
  52 //
  53 // This algorithm was inspired by VS code's client-side filtering, and aims
  54 // to be mostly-compatible.
  55 //
  56 //===----------------------------------------------------------------------===//
  57
  58 #include "FuzzyMatch.h"
  59 #include "llvm/Support/Format.h"
  60 #include <optional>
  61
  62 namespace clang {
  63 namespace clangd {
  64
// Out-of-class definitions of the static constexpr data members.
// They are odr-used in this file (bound to the const-reference parameters of
// std::min), which requires a definition before C++17.
// NOTE(review): redundant (but harmless) once C++17 inline variables apply.
constexpr int FuzzyMatcher::MaxPat;
constexpr int FuzzyMatcher::MaxWord;
  67
  68 static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
  69 // A "negative infinity" score that won't overflow.
  70 // We use this to mark unreachable states and forbidden solutions.
  71 // Score field is 15 bits wide, min value is -2^14, we use half of that.
  72 static constexpr int AwfulScore = -(1 << 13);
  73 static bool isAwful(int S) { return S < AwfulScore / 2; }
// Perfect per-pattern-char score.
// matchBonus() asserts it never returns more than this, and match() clamps and
// normalizes the final score against PerfectBonus * PatN.
static constexpr int PerfectBonus = 4;
  75
  76 FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
  77     : PatN(std::min<int>(MaxPat, Pattern.size())),
  78       ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
  79   std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
  80   for (int I = 0; I < PatN; ++I)
  81     LowPat[I] = lower(Pat[I]);
  82   Scores[0][0][Miss] = {0, Miss};
  83   Scores[0][0][Match] = {AwfulScore, Miss};
  84   for (int P = 0; P <= PatN; ++P)
  85     for (int W = 0; W < P; ++W)
  86       for (Action A : {Miss, Match})
  87         Scores[P][W][A] = {AwfulScore, Miss};
  88   PatTypeSet = calculateRoles(llvm::StringRef(Pat, PatN),
  89                               llvm::MutableArrayRef(PatRole, PatN));
  90 }
  91
  92 std::optional<float> FuzzyMatcher::match(llvm::StringRef Word) {
  93   if (!(WordContainsPattern = init(Word)))
  94     return std::nullopt;
  95   if (!PatN)
  96     return 1;
  97   buildGraph();
  98   auto Best = std::max(Scores[PatN][WordN][Miss].Score,
  99                        Scores[PatN][WordN][Match].Score);
 100   if (isAwful(Best))
 101     return std::nullopt;
 102   float Score =
 103       ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best));
 104   // If the pattern is as long as the word, we have an exact string match,
 105   // since every pattern character must match something.
 106   if (WordN == PatN)
 107     Score *= 2; // May not be perfect 2 if case differs in a significant way.
 108   return Score;
 109 }
 110
// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
// The top 6 bits of the char select the byte, the bottom 2 select the offset.
// e.g. 'q' = 011100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
// The table has 64 bytes, i.e. one entry for each of the 256 byte values, so
// it must be indexed with the unsigned value of the character.
constexpr static uint8_t CharTypes[] = {
    0x00, 0x00, 0x00, 0x00, // Control characters
    0x00, 0x00, 0x00, 0x00, // Control characters
    0xff, 0xff, 0xff, 0xff, // Punctuation
    0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
    0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
    0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
    0x57, 0x55, 0x55, 0x55, // ` and a-o
    0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
};
 128
// The Role can be determined from the Type of a character and its neighbors:
//
//   Example  | Chars | Type | Role
//   ---------+-------+------+-----
//   F(o)oBar | Foo   | Ull  | Tail
//   Foo(B)ar | oBa   | lUl  | Head
//   (f)oo    | ^fo   | Ell  | Head
//   H(T)TP   | HTT   | UUU  | Tail
//
// Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
// The key is laid out as (Prev << 4) | (Curr << 2) | Next.
// A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
constexpr static uint8_t CharRoles[] = {
    // clang-format off
    //         Curr= Empty Lower Upper Separ
    /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
    /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
    // 0x59: an Upper after an Upper is a Tail unless Next is Lower (U(U)l),
    // in which case it starts a new segment (Head).
    /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
    /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
    // clang-format on
};
 150
 151 template <typename T> static T packedLookup(const uint8_t *Data, int I) {
 152   return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
 153 }
 154 CharTypeSet calculateRoles(llvm::StringRef Text,
 155                            llvm::MutableArrayRef<CharRole> Roles) {
 156   assert(Text.size() == Roles.size());
 157   if (Text.size() == 0)
 158     return 0;
 159   CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
 160   CharTypeSet TypeSet = 1 << Type;
 161   // Types holds a sliding window of (Prev, Curr, Next) types.
 162   // Initial value is (Empty, Empty, type of Text[0]).
 163   int Types = Type;
 164   // Rotate slides in the type of the next character.
 165   auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
 166   for (unsigned I = 0; I < Text.size() - 1; ++I) {
 167     // For each character, rotate in the next, and look up the role.
 168     Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
 169     TypeSet |= 1 << Type;
 170     Rotate(Type);
 171     Roles[I] = packedLookup<CharRole>(CharRoles, Types);
 172   }
 173   // For the last character, the "next character" is Empty.
 174   Rotate(Empty);
 175   Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types);
 176   return TypeSet;
 177 }
 178
 179 // Sets up the data structures matching Word.
 180 // Returns false if we can cheaply determine that no match is possible.
 181 bool FuzzyMatcher::init(llvm::StringRef NewWord) {
 182   WordN = std::min<int>(MaxWord, NewWord.size());
 183   if (PatN > WordN)
 184     return false;
 185   std::copy(NewWord.begin(), NewWord.begin() + WordN, Word);
 186   if (PatN == 0)
 187     return true;
 188   for (int I = 0; I < WordN; ++I)
 189     LowWord[I] = lower(Word[I]);
 190
 191   // Cheap subsequence check.
 192   for (int W = 0, P = 0; P != PatN; ++W) {
 193     if (W == WordN)
 194       return false;
 195     if (LowWord[W] == LowPat[P])
 196       ++P;
 197   }
 198
 199   // FIXME: some words are hard to tokenize algorithmically.
 200   // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
 201   // We could add a tokenization dictionary for common stdlib names.
 202   WordTypeSet = calculateRoles(llvm::StringRef(Word, WordN),
 203                                llvm::MutableArrayRef(WordRole, WordN));
 204   return true;
 205 }
 206
 207 // The forwards pass finds the mappings of Pattern onto Word.
 208 // Score = best score achieved matching Word[..W] against Pat[..P].
 209 // Unlike other tables, indices range from 0 to N *inclusive*
 210 // Matched = whether we chose to match Word[W] with Pat[P] or not.
 211 //
 212 // Points are mostly assigned to matched characters, with 1 being a good score
 213 // and 3 being a great one. So we treat the score range as [0, 3 * PatN].
 214 // This range is not strict: we can apply larger bonuses/penalties, or penalize
 215 // non-matched characters.
 216 void FuzzyMatcher::buildGraph() {
 217   for (int W = 0; W < WordN; ++W) {
 218     Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
 219                               Miss};
 220     Scores[0][W + 1][Match] = {AwfulScore, Miss};
 221   }
 222   for (int P = 0; P < PatN; ++P) {
 223     for (int W = P; W < WordN; ++W) {
 224       auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
 225
 226       auto MatchMissScore = PreMiss[Match].Score;
 227       auto MissMissScore = PreMiss[Miss].Score;
 228       if (P < PatN - 1) { // Skipping trailing characters is always free.
 229         MatchMissScore -= skipPenalty(W, Match);
 230         MissMissScore -= skipPenalty(W, Miss);
 231       }
 232       Score[Miss] = (MatchMissScore > MissMissScore)
 233                         ? ScoreInfo{MatchMissScore, Match}
 234                         : ScoreInfo{MissMissScore, Miss};
 235
 236       auto &PreMatch = Scores[P][W];
 237       auto MatchMatchScore =
 238           allowMatch(P, W, Match)
 239               ? PreMatch[Match].Score + matchBonus(P, W, Match)
 240               : AwfulScore;
 241       auto MissMatchScore = allowMatch(P, W, Miss)
 242                                 ? PreMatch[Miss].Score + matchBonus(P, W, Miss)
 243                                 : AwfulScore;
 244       Score[Match] = (MatchMatchScore > MissMatchScore)
 245                          ? ScoreInfo{MatchMatchScore, Match}
 246                          : ScoreInfo{MissMatchScore, Miss};
 247     }
 248   }
 249 }
 250
 251 bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
 252   if (LowPat[P] != LowWord[W])
 253     return false;
 254   // We require a "strong" match:
 255   // - for the first pattern character.  [foo] !~ "barefoot"
 256   // - after a gap.                      [pat] !~ "patnther"
 257   if (Last == Miss) {
 258     // We're banning matches outright, so conservatively accept some other cases
 259     // where our segmentation might be wrong:
 260     //  - allow matching B in ABCDef (but not in NDEBUG)
 261     //  - we'd like to accept print in sprintf, but too many false positives
 262     if (WordRole[W] == Tail &&
 263         (Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower)))
 264       return false;
 265   }
 266   return true;
 267 }
 268
 269 int FuzzyMatcher::skipPenalty(int W, Action Last) const {
 270   if (W == 0) // Skipping the first character.
 271     return 3;
 272   if (WordRole[W] == Head) // Skipping a segment.
 273     return 1; // We want to keep this lower than a consecutive match bonus.
 274   // Instead of penalizing non-consecutive matches, we give a bonus to a
 275   // consecutive match in matchBonus. This produces a better score distribution
 276   // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
 277   return 0;
 278 }
 279
 280 int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
 281   assert(LowPat[P] == LowWord[W]);
 282   int S = 1;
 283   bool IsPatSingleCase =
 284       (PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper);
 285   // Bonus: case matches, or a Head in the pattern aligns with one in the word.
 286   // Single-case patterns lack segmentation signals and we assume any character
 287   // can be a head of a segment.
 288   if (Pat[P] == Word[W] ||
 289       (WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head)))
 290     ++S;
 291   // Bonus: a consecutive match. First character match also gets a bonus to
 292   // ensure prefix final match score normalizes to 1.0.
 293   if (W == 0 || Last == Match)
 294     S += 2;
 295   // Penalty: matching inside a segment (and previous char wasn't matched).
 296   if (WordRole[W] == Tail && P && Last == Miss)
 297     S -= 3;
 298   // Penalty: a Head in the pattern matches in the middle of a word segment.
 299   if (PatRole[P] == Head && WordRole[W] == Tail)
 300     --S;
 301   // Penalty: matching the first pattern character in the middle of a segment.
 302   if (P == 0 && WordRole[W] == Tail)
 303     S -= 4;
 304   assert(S <= PerfectBonus);
 305   return S;
 306 }
 307
// Debugging aid: writes a human-readable explanation of the most recent
// match() to OS, and returns the word annotated with the matched ranges in
// square brackets (e.g. "[u]nique[_p]tr").
// For an empty pattern the plain word is returned; for an empty word or a
// failed subsequence check the result is empty.
// Reads the state left behind by the last init()/buildGraph(); it does not
// modify the matcher.
llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
  llvm::SmallString<256> Result;
  OS << "=== Match \"" << llvm::StringRef(Word, WordN) << "\" against ["
     << llvm::StringRef(Pat, PatN) << "] ===\n";
  // Cases where no score table was built: nothing more to show.
  if (PatN == 0) {
    OS << "Pattern is empty: perfect match.\n";
    return Result = llvm::StringRef(Word, WordN);
  }
  if (WordN == 0) {
    OS << "Word is empty: no match.\n";
    return Result;
  }
  if (!WordContainsPattern) {
    OS << "Substring check failed.\n";
    return Result;
  }
  // No early return here: the tables below are still printed.
  // NOTE(review): the traceback then follows awful cells; confirm P cannot
  // drop below -1 in that case.
  if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
                       Scores[PatN][WordN][Miss].Score))) {
    OS << "Substring check passed, but all matches are forbidden\n";
  }
  if (!(PatTypeSet & 1 << Upper))
    OS << "Lowercase query, so scoring ignores case\n";

  // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
  // The Score table has cumulative scores, subtracting along this path gives
  // us the per-letter scores.
  Action Last =
      (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
          ? Match
          : Miss;
  int S[MaxWord];    // Per-character score contribution along the best path.
  Action A[MaxWord]; // Whether each word character was matched or skipped.
  for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
    A[W] = Last;
    const auto &Cell = Scores[P + 1][W + 1][Last];
    // A match consumed a pattern character too, so step up a row.
    if (Last == Match)
      --P;
    const auto &Prev = Scores[P + 1][W][Cell.Prev];
    S[W] = Cell.Score - Prev.Score;
    Last = Cell.Prev;
  }
  // Build the annotated word: '[' opens and ']' closes each matched run.
  for (int I = 0; I < WordN; ++I) {
    if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
      Result.push_back('[');
    if (A[I] == Miss && I > 0 && A[I - 1] == Match)
      Result.push_back(']');
    Result.push_back(Word[I]);
  }
  if (A[WordN - 1] == Match)
    Result.push_back(']');

  // Three aligned rows: the word, the pattern characters under the word
  // characters they matched, and the per-character scores.
  for (char C : llvm::StringRef(Word, WordN))
    OS << " " << C << " ";
  OS << "\n";
  for (int I = 0, J = 0; I < WordN; I++)
    OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
  OS << "\n";
  for (int I = 0; I < WordN; I++)
    OS << llvm::format("%2d ", S[I]);
  OS << "\n";

  // One marker per character, selected by its role's numeric value (0..3)
  // from the string "?-+ ".
  OS << "\nSegmentation:";
  OS << "\n'" << llvm::StringRef(Word, WordN) << "'\n ";
  for (int I = 0; I < WordN; ++I)
    OS << "?-+ "[static_cast<int>(WordRole[I])];
  OS << "\n[" << llvm::StringRef(Pat, PatN) << "]\n ";
  for (int I = 0; I < PatN; ++I)
    OS << "?-+ "[static_cast<int>(PatRole[I])];
  OS << "\n";

  // Full score table: two lines per pattern row (Miss first, then Match).
  // Awful cells are left blank; '*' marks cells whose predecessor was a Match.
  OS << "\nScoring table (last-Miss, last-Match):\n";
  OS << " |    ";
  for (char C : llvm::StringRef(Word, WordN))
    OS << "  " << C << " ";
  OS << "\n";
  OS << "-+----" << std::string(WordN * 4, '-') << "\n";
  for (int I = 0; I <= PatN; ++I) {
    // Note: this A shadows the Action array A declared above.
    for (Action A : {Miss, Match}) {
      OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|";
      for (int J = 0; J <= WordN; ++J) {
        if (!isAwful(Scores[I][J][A].Score))
          OS << llvm::format("%3d%c", Scores[I][J][A].Score,
                             Scores[I][J][A].Prev == Match ? '*' : ' ');
        else
          OS << "    ";
      }
      OS << "\n";
    }
  }

  return Result;
}
 400
 401 } // namespace clangd
 402 } // namespace clang