1 //===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 #include "FuzzyMatch.h"
11 #include "index/dex/Token.h"
12 #include "llvm/ADT/ArrayRef.h"
13 #include "llvm/ADT/DenseSet.h"
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/SmallVector.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
28 // Produce trigrams (including duplicates) and pass them to Out().
29 template <typename Func
>
30 static void identifierTrigrams(llvm::StringRef Identifier
, Func Out
) {
31 assert(!Identifier
.empty());
32 // Apply fuzzy matching text segmentation.
33 llvm::SmallVector
<CharRole
> Roles(Identifier
.size());
34 calculateRoles(Identifier
,
35 llvm::MutableArrayRef(Roles
.data(), Identifier
.size()));
37 std::string LowercaseIdentifier
= Identifier
.lower();
39 // For each character, store indices of the characters to which fuzzy matching
40 // algorithm can jump. There are 2 possible variants:
42 // * Next Tail - next character from the same segment
43 // * Next Head - front character of the next segment
45 // Next stores tuples of three indices in the presented order, if a variant is
46 // not available then 0 is stored.
47 llvm::SmallVector
<std::array
<unsigned, 2>, 12> Next(
48 LowercaseIdentifier
.size());
49 unsigned NextTail
= 0, NextHead
= 0;
50 for (int I
= LowercaseIdentifier
.size() - 1; I
>= 0; --I
) {
51 Next
[I
] = {{NextTail
, NextHead
}};
52 NextTail
= Roles
[I
] == Tail
? I
: 0;
53 if (Roles
[I
] == Head
) {
58 // Iterate through valid sequences of three characters Fuzzy Matcher can
60 for (unsigned I
= 0; I
< LowercaseIdentifier
.size(); ++I
) {
62 if (Roles
[I
] != Head
&& Roles
[I
] != Tail
)
64 for (unsigned J
: Next
[I
]) {
67 for (unsigned K
: Next
[J
]) {
70 Out(Trigram(LowercaseIdentifier
[I
], LowercaseIdentifier
[J
],
71 LowercaseIdentifier
[K
]));
75 // Short queries semantics are different. When the user dosn't type enough
76 // symbols to form trigrams, we still want to serve meaningful results. To
77 // achieve that, we form incomplete trigrams (bi- and unigrams) for the
78 // identifiers and also generate short trigrams on the query side from what
79 // is available. We allow a small number of short trigram types in order to
80 // prevent excessive memory usage and increase the quality of the search.
81 // Only the first few symbols are allowed to be used in incomplete trigrams.
83 // Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams:
84 // "_", "_a", "a", "ab", "ad", "d", "de", "dg"
85 for (unsigned Position
= 0, HeadsSeen
= 0; HeadsSeen
< 2;) {
86 // The first symbol might be a separator, so the loop condition should be
87 // stopping as soon as there is no next head or we have seen two heads.
88 if (Roles
[Position
] == Head
)
90 Out(Trigram(LowercaseIdentifier
[Position
]));
91 for (unsigned I
: Next
[Position
])
93 Out(Trigram(LowercaseIdentifier
[Position
], LowercaseIdentifier
[I
]));
94 Position
= Next
[Position
][1];
100 void generateIdentifierTrigrams(llvm::StringRef Identifier
,
101 std::vector
<Trigram
> &Result
) {
102 // Empirically, scanning for duplicates is faster with fewer trigrams, and
103 // a hashtable is faster with more. This is a hot path, so dispatch based on
104 // expected number of trigrams. Longer identifiers produce more trigrams.
105 // The magic number was tuned by running IndexBenchmark.DexBuild.
106 constexpr unsigned ManyTrigramsIdentifierThreshold
= 14;
108 if (Identifier
.empty())
111 if (Identifier
.size() < ManyTrigramsIdentifierThreshold
) {
112 identifierTrigrams(Identifier
, [&](Trigram T
) {
113 if (!llvm::is_contained(Result
, T
))
117 identifierTrigrams(Identifier
, [&](Trigram T
) { Result
.push_back(T
); });
119 Result
.erase(std::unique(Result
.begin(), Result
.end()), Result
.end());
123 std::vector
<Token
> generateQueryTrigrams(llvm::StringRef Query
) {
127 // Apply fuzzy matching text segmentation.
128 llvm::SmallVector
<CharRole
> Roles(Query
.size());
129 calculateRoles(Query
, llvm::MutableArrayRef(Roles
.data(), Query
.size()));
131 std::string LowercaseQuery
= Query
.lower();
133 llvm::DenseSet
<Token
> UniqueTrigrams
;
135 for (unsigned I
= 0; I
< LowercaseQuery
.size(); ++I
) {
136 if (Roles
[I
] != Head
&& Roles
[I
] != Tail
)
137 continue; // Skip delimiters.
138 Chars
.push_back(LowercaseQuery
[I
]);
139 if (Chars
.size() > 3)
140 Chars
.erase(Chars
.begin());
141 if (Chars
.size() == 3)
142 UniqueTrigrams
.insert(Token(Token::Kind::Trigram
, Chars
));
145 // For queries with very few letters, generateIdentifierTrigrams emulates
146 // outputs of this function to match the semantics.
147 if (UniqueTrigrams
.empty()) {
148 // If bigram can't be formed out of letters/numbers, we prepend separator.
149 std::string
Result(1, LowercaseQuery
.front());
150 for (unsigned I
= 1; I
< LowercaseQuery
.size(); ++I
)
151 if (Roles
[I
] == Head
|| Roles
[I
] == Tail
)
152 Result
+= LowercaseQuery
[I
];
153 UniqueTrigrams
.insert(
154 Token(Token::Kind::Trigram
, llvm::StringRef(Result
).take_back(2)));
157 return {UniqueTrigrams
.begin(), UniqueTrigrams
.end()};
161 } // namespace clangd