1 //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
10 // we consider the possible partial match states:
12 // u n i q u e _ p t r
13 // +---------------------
14 // |A . . . . . . . . . .
16 // |. . . . . . . . . . .
18 // |. . . . . . . O . . .
20 // |. . . . . . . . . . B
22 // Each dot represents some prefix of the pattern being matched against some
23 // prefix of the word.
24 // - A is the initial state: '' matched against ''
25 // - O is an intermediate state: 'u_' matched against 'unique_'
26 // - B is the target state: 'u_p' matched against 'unique_ptr'
28 // We aim to find the best path from A->B.
29 // - Moving right (consuming a word character)
30 // Always legal: not all word characters must match.
31 // - Moving diagonally (consuming both a word and pattern character)
32 // Legal if the characters match.
33 // - Moving down (consuming a pattern character) is never legal.
34 // Never legal: all pattern characters must match something.
35 // Characters are matched case-insensitively.
36 // The first pattern character may only match the start of a word segment.
38 // The scoring is based on heuristics:
39 // - when matching a character, apply a bonus or penalty depending on the
40 // match quality (does case match, do word segments align, etc)
41 // - when skipping a character, apply a penalty if it hurts the match
42 // (it starts a word segment, or splits the matched region, etc)
44 // These heuristics require the ability to "look backward" one character, to
45 // see whether it was matched or not. Therefore the dynamic-programming matrix
46 // has an extra dimension (last character matched).
47 // Each entry also has an additional flag indicating whether the last-but-one
48 // character matched, which is needed to trace back through the scoring table
49 // and reconstruct the match.
51 // We treat strings as byte-sequences, so only ASCII has first-class support.
53 // This algorithm was inspired by VS code's client-side filtering, and aims
54 // to be mostly-compatible.
56 //===----------------------------------------------------------------------===//
58 #include "FuzzyMatch.h"
59 #include "llvm/ADT/Optional.h"
60 #include "llvm/Support/Format.h"
65 constexpr int FuzzyMatcher::MaxPat
;
66 constexpr int FuzzyMatcher::MaxWord
;
68 static char lower(char C
) { return C
>= 'A' && C
<= 'Z' ? C
+ ('a' - 'A') : C
; }
69 // A "negative infinity" score that won't overflow.
70 // We use this to mark unreachable states and forbidden solutions.
71 // Score field is 15 bits wide, min value is -2^14, we use half of that.
72 static constexpr int AwfulScore
= -(1 << 13);
73 static bool isAwful(int S
) { return S
< AwfulScore
/ 2; }
74 static constexpr int PerfectBonus
= 4; // Perfect per-pattern-char score.
76 FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern
)
77 : PatN(std::min
<int>(MaxPat
, Pattern
.size())),
78 ScoreScale(PatN
? float{1} / (PerfectBonus
* PatN
) : 0), WordN(0) {
79 std::copy(Pattern
.begin(), Pattern
.begin() + PatN
, Pat
);
80 for (int I
= 0; I
< PatN
; ++I
)
81 LowPat
[I
] = lower(Pat
[I
]);
82 Scores
[0][0][Miss
] = {0, Miss
};
83 Scores
[0][0][Match
] = {AwfulScore
, Miss
};
84 for (int P
= 0; P
<= PatN
; ++P
)
85 for (int W
= 0; W
< P
; ++W
)
86 for (Action A
: {Miss
, Match
})
87 Scores
[P
][W
][A
] = {AwfulScore
, Miss
};
88 PatTypeSet
= calculateRoles(llvm::StringRef(Pat
, PatN
),
89 llvm::makeMutableArrayRef(PatRole
, PatN
));
92 llvm::Optional
<float> FuzzyMatcher::match(llvm::StringRef Word
) {
93 if (!(WordContainsPattern
= init(Word
)))
98 auto Best
= std::max(Scores
[PatN
][WordN
][Miss
].Score
,
99 Scores
[PatN
][WordN
][Match
].Score
);
103 ScoreScale
* std::min(PerfectBonus
* PatN
, std::max
<int>(0, Best
));
104 // If the pattern is as long as the word, we have an exact string match,
105 // since every pattern character must match something.
107 Score
*= 2; // May not be perfect 2 if case differs in a significant way.
111 // We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
112 // The top 6 bits of the char select the byte, the bottom 2 select the offset.
113 // e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
114 constexpr static uint8_t CharTypes
[] = {
115 0x00, 0x00, 0x00, 0x00, // Control characters
116 0x00, 0x00, 0x00, 0x00, // Control characters
117 0xff, 0xff, 0xff, 0xff, // Punctuation
118 0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
119 0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
120 0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
121 0x57, 0x55, 0x55, 0x55, // ` and a-o
122 0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
123 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
124 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
125 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
126 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
129 // The Role can be determined from the Type of a character and its neighbors:
131 // Example | Chars | Type | Role
132 // ---------+--------------+-----
133 // F(o)oBar | Foo | Ull | Tail
134 // Foo(B)ar | oBa | lUl | Head
135 // (f)oo | ^fo | Ell | Head
136 // H(T)TP | HTT | UUU | Tail
138 // Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
139 // A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
140 // e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
141 constexpr static uint8_t CharRoles
[] = {
143 // Curr= Empty Lower Upper Separ
144 /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
145 /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
146 /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
147 /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
151 template <typename T
> static T
packedLookup(const uint8_t *Data
, int I
) {
152 return static_cast<T
>((Data
[I
>> 2] >> ((I
& 3) * 2)) & 3);
154 CharTypeSet
calculateRoles(llvm::StringRef Text
,
155 llvm::MutableArrayRef
<CharRole
> Roles
) {
156 assert(Text
.size() == Roles
.size());
157 if (Text
.size() == 0)
159 CharType Type
= packedLookup
<CharType
>(CharTypes
, Text
[0]);
160 CharTypeSet TypeSet
= 1 << Type
;
161 // Types holds a sliding window of (Prev, Curr, Next) types.
162 // Initial value is (Empty, Empty, type of Text[0]).
164 // Rotate slides in the type of the next character.
165 auto Rotate
= [&](CharType T
) { Types
= ((Types
<< 2) | T
) & 0x3f; };
166 for (unsigned I
= 0; I
< Text
.size() - 1; ++I
) {
167 // For each character, rotate in the next, and look up the role.
168 Type
= packedLookup
<CharType
>(CharTypes
, Text
[I
+ 1]);
169 TypeSet
|= 1 << Type
;
171 Roles
[I
] = packedLookup
<CharRole
>(CharRoles
, Types
);
173 // For the last character, the "next character" is Empty.
175 Roles
[Text
.size() - 1] = packedLookup
<CharRole
>(CharRoles
, Types
);
179 // Sets up the data structures matching Word.
180 // Returns false if we can cheaply determine that no match is possible.
181 bool FuzzyMatcher::init(llvm::StringRef NewWord
) {
182 WordN
= std::min
<int>(MaxWord
, NewWord
.size());
185 std::copy(NewWord
.begin(), NewWord
.begin() + WordN
, Word
);
188 for (int I
= 0; I
< WordN
; ++I
)
189 LowWord
[I
] = lower(Word
[I
]);
191 // Cheap subsequence check.
192 for (int W
= 0, P
= 0; P
!= PatN
; ++W
) {
195 if (LowWord
[W
] == LowPat
[P
])
199 // FIXME: some words are hard to tokenize algorithmically.
200 // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
201 // We could add a tokenization dictionary for common stdlib names.
202 WordTypeSet
= calculateRoles(llvm::StringRef(Word
, WordN
),
203 llvm::makeMutableArrayRef(WordRole
, WordN
));
207 // The forwards pass finds the mappings of Pattern onto Word.
208 // Score = best score achieved matching Word[..W] against Pat[..P].
209 // Unlike other tables, indices range from 0 to N *inclusive*
210 // Matched = whether we chose to match Word[W] with Pat[P] or not.
212 // Points are mostly assigned to matched characters, with 1 being a good score
213 // and 3 being a great one. So we treat the score range as [0, 3 * PatN].
214 // This range is not strict: we can apply larger bonuses/penalties, or penalize
215 // non-matched characters.
216 void FuzzyMatcher::buildGraph() {
217 for (int W
= 0; W
< WordN
; ++W
) {
218 Scores
[0][W
+ 1][Miss
] = {Scores
[0][W
][Miss
].Score
- skipPenalty(W
, Miss
),
220 Scores
[0][W
+ 1][Match
] = {AwfulScore
, Miss
};
222 for (int P
= 0; P
< PatN
; ++P
) {
223 for (int W
= P
; W
< WordN
; ++W
) {
224 auto &Score
= Scores
[P
+ 1][W
+ 1], &PreMiss
= Scores
[P
+ 1][W
];
226 auto MatchMissScore
= PreMiss
[Match
].Score
;
227 auto MissMissScore
= PreMiss
[Miss
].Score
;
228 if (P
< PatN
- 1) { // Skipping trailing characters is always free.
229 MatchMissScore
-= skipPenalty(W
, Match
);
230 MissMissScore
-= skipPenalty(W
, Miss
);
232 Score
[Miss
] = (MatchMissScore
> MissMissScore
)
233 ? ScoreInfo
{MatchMissScore
, Match
}
234 : ScoreInfo
{MissMissScore
, Miss
};
236 auto &PreMatch
= Scores
[P
][W
];
237 auto MatchMatchScore
=
238 allowMatch(P
, W
, Match
)
239 ? PreMatch
[Match
].Score
+ matchBonus(P
, W
, Match
)
241 auto MissMatchScore
= allowMatch(P
, W
, Miss
)
242 ? PreMatch
[Miss
].Score
+ matchBonus(P
, W
, Miss
)
244 Score
[Match
] = (MatchMatchScore
> MissMatchScore
)
245 ? ScoreInfo
{MatchMatchScore
, Match
}
246 : ScoreInfo
{MissMatchScore
, Miss
};
251 bool FuzzyMatcher::allowMatch(int P
, int W
, Action Last
) const {
252 if (LowPat
[P
] != LowWord
[W
])
254 // We require a "strong" match:
255 // - for the first pattern character. [foo] !~ "barefoot"
256 // - after a gap. [pat] !~ "patnther"
258 // We're banning matches outright, so conservatively accept some other cases
259 // where our segmentation might be wrong:
260 // - allow matching B in ABCDef (but not in NDEBUG)
261 // - we'd like to accept print in sprintf, but too many false positives
262 if (WordRole
[W
] == Tail
&&
263 (Word
[W
] == LowWord
[W
] || !(WordTypeSet
& 1 << Lower
)))
269 int FuzzyMatcher::skipPenalty(int W
, Action Last
) const {
270 if (W
== 0) // Skipping the first character.
272 if (WordRole
[W
] == Head
) // Skipping a segment.
273 return 1; // We want to keep this lower than a consecutive match bonus.
274 // Instead of penalizing non-consecutive matches, we give a bonus to a
275 // consecutive match in matchBonus. This produces a better score distribution
276 // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
280 int FuzzyMatcher::matchBonus(int P
, int W
, Action Last
) const {
281 assert(LowPat
[P
] == LowWord
[W
]);
283 bool IsPatSingleCase
=
284 (PatTypeSet
== 1 << Lower
) || (PatTypeSet
== 1 << Upper
);
285 // Bonus: case matches, or a Head in the pattern aligns with one in the word.
286 // Single-case patterns lack segmentation signals and we assume any character
287 // can be a head of a segment.
288 if (Pat
[P
] == Word
[W
] ||
289 (WordRole
[W
] == Head
&& (IsPatSingleCase
|| PatRole
[P
] == Head
)))
291 // Bonus: a consecutive match. First character match also gets a bonus to
292 // ensure prefix final match score normalizes to 1.0.
293 if (W
== 0 || Last
== Match
)
295 // Penalty: matching inside a segment (and previous char wasn't matched).
296 if (WordRole
[W
] == Tail
&& P
&& Last
== Miss
)
298 // Penalty: a Head in the pattern matches in the middle of a word segment.
299 if (PatRole
[P
] == Head
&& WordRole
[W
] == Tail
)
301 // Penalty: matching the first pattern character in the middle of a segment.
302 if (P
== 0 && WordRole
[W
] == Tail
)
304 assert(S
<= PerfectBonus
);
308 llvm::SmallString
<256> FuzzyMatcher::dumpLast(llvm::raw_ostream
&OS
) const {
309 llvm::SmallString
<256> Result
;
310 OS
<< "=== Match \"" << llvm::StringRef(Word
, WordN
) << "\" against ["
311 << llvm::StringRef(Pat
, PatN
) << "] ===\n";
313 OS
<< "Pattern is empty: perfect match.\n";
314 return Result
= llvm::StringRef(Word
, WordN
);
317 OS
<< "Word is empty: no match.\n";
320 if (!WordContainsPattern
) {
321 OS
<< "Substring check failed.\n";
324 if (isAwful(std::max(Scores
[PatN
][WordN
][Match
].Score
,
325 Scores
[PatN
][WordN
][Miss
].Score
))) {
326 OS
<< "Substring check passed, but all matches are forbidden\n";
328 if (!(PatTypeSet
& 1 << Upper
))
329 OS
<< "Lowercase query, so scoring ignores case\n";
331 // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
332 // The Score table has cumulative scores, subtracting along this path gives
333 // us the per-letter scores.
335 (Scores
[PatN
][WordN
][Match
].Score
> Scores
[PatN
][WordN
][Miss
].Score
)
340 for (int W
= WordN
- 1, P
= PatN
- 1; W
>= 0; --W
) {
342 const auto &Cell
= Scores
[P
+ 1][W
+ 1][Last
];
345 const auto &Prev
= Scores
[P
+ 1][W
][Cell
.Prev
];
346 S
[W
] = Cell
.Score
- Prev
.Score
;
349 for (int I
= 0; I
< WordN
; ++I
) {
350 if (A
[I
] == Match
&& (I
== 0 || A
[I
- 1] == Miss
))
351 Result
.push_back('[');
352 if (A
[I
] == Miss
&& I
> 0 && A
[I
- 1] == Match
)
353 Result
.push_back(']');
354 Result
.push_back(Word
[I
]);
356 if (A
[WordN
- 1] == Match
)
357 Result
.push_back(']');
359 for (char C
: llvm::StringRef(Word
, WordN
))
360 OS
<< " " << C
<< " ";
362 for (int I
= 0, J
= 0; I
< WordN
; I
++)
363 OS
<< " " << (A
[I
] == Match
? Pat
[J
++] : ' ') << " ";
365 for (int I
= 0; I
< WordN
; I
++)
366 OS
<< llvm::format("%2d ", S
[I
]);
369 OS
<< "\nSegmentation:";
370 OS
<< "\n'" << llvm::StringRef(Word
, WordN
) << "'\n ";
371 for (int I
= 0; I
< WordN
; ++I
)
372 OS
<< "?-+ "[static_cast<int>(WordRole
[I
])];
373 OS
<< "\n[" << llvm::StringRef(Pat
, PatN
) << "]\n ";
374 for (int I
= 0; I
< PatN
; ++I
)
375 OS
<< "?-+ "[static_cast<int>(PatRole
[I
])];
378 OS
<< "\nScoring table (last-Miss, last-Match):\n";
380 for (char C
: llvm::StringRef(Word
, WordN
))
381 OS
<< " " << C
<< " ";
383 OS
<< "-+----" << std::string(WordN
* 4, '-') << "\n";
384 for (int I
= 0; I
<= PatN
; ++I
) {
385 for (Action A
: {Miss
, Match
}) {
386 OS
<< ((I
&& A
== Miss
) ? Pat
[I
- 1] : ' ') << "|";
387 for (int J
= 0; J
<= WordN
; ++J
) {
388 if (!isAwful(Scores
[I
][J
][A
].Score
))
389 OS
<< llvm::format("%3d%c", Scores
[I
][J
][A
].Score
,
390 Scores
[I
][J
][A
].Prev
== Match
? '*' : ' ');
401 } // namespace clangd