Roll src/third_party/WebKit d9c6159:8139f33 (svn 201974:201975)
[chromium-blink-merge.git] / ui / app_list / search / tokenized_string_match.cc
blobc7035bbf5658aee2ebfd6754a3deae8668cfac0e
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "ui/app_list/search/tokenized_string_match.h"
7 #include <cmath>
9 #include "base/i18n/string_search.h"
10 #include "base/logging.h"
11 #include "ui/app_list/search/tokenized_string_char_iterator.h"
13 namespace app_list {
15 namespace {
17 // The factors below are applied when the current char of query matches
18 // the current char of the text to be matched. Different factors are chosen
19 // based on where the match happens. kIsPrefixMultiplier is used when the
20 // matched portion is a prefix of both the query and the text, which implies
21 // that the matched chars are at the same position in query and text. This is
22 // the most preferred case thus it has the highest score. When the current char
23 // of the query and the text does not match, the algorithm moves to the next
24 // token in the text and try to match from there. kIsFrontOfWordMultipler will
25 // be used if the first char of the token matches the current char of the query.
26 // Otherwise, the match is considered as weak and kIsWeakHitMultiplier is
27 // used.
28 // Examples:
29 // Suppose the text to be matched is 'Google Chrome'.
30 // Query 'go' would yield kIsPrefixMultiplier for each char.
31 // Query 'gc' would use kIsPrefixMultiplier for 'g' and
32 // kIsFrontOfWordMultipler for 'c'.
33 // Query 'ch' would use kIsFrontOfWordMultipler for 'c' and
34 // kIsWeakHitMultiplier for 'h'.
35 // Query 'oo' does not match any prefix and would use the substring match
36 // fallback, thus kIsSubstringMultiplier is used for each char.
37 const double kIsPrefixMultiplier = 1.0;
38 const double kIsFrontOfWordMultipler = 0.8;
39 const double kIsWeakHitMultiplier = 0.6;
40 const double kIsSubstringMultiplier = 0.4;
42 // A relevance score that represents no match.
43 const double kNoMatchScore = 0.0;
45 // PrefixMatcher matches the chars of a given query as prefix of tokens in
46 // a given text or as prefix of the acronyms of those text tokens.
47 class PrefixMatcher {
48 public:
49 PrefixMatcher(const TokenizedString& query,
50 const TokenizedString& text)
51 : query_iter_(query),
52 text_iter_(text),
53 current_match_(gfx::Range::InvalidRange()),
54 current_relevance_(kNoMatchScore) {
57 // Invokes RunMatch to perform prefix match. Use |states_| as a stack to
58 // perform DFS (depth first search) so that all possible matches are
59 // attempted. Stops on the first full match and returns true. Otherwise,
60 // returns false to indicate no match.
61 bool Match() {
62 while (!RunMatch()) {
63 // No match found and no more states to try. Bail out.
64 if (states_.empty()) {
65 current_relevance_ = kNoMatchScore;
66 current_hits_.clear();
67 return false;
70 PopState();
72 // Skip restored match to try other possibilites.
73 AdvanceToNextTextToken();
76 if (current_match_.IsValid())
77 current_hits_.push_back(current_match_);
79 return true;
82 double relevance() const { return current_relevance_; }
83 const TokenizedStringMatch::Hits& hits() const { return current_hits_; }
85 private:
86 // Context record of a match.
87 struct State {
88 State() : relevance(kNoMatchScore) {}
89 State(double relevance,
90 const gfx::Range& current_match,
91 const TokenizedStringMatch::Hits& hits,
92 const TokenizedStringCharIterator& query_iter,
93 const TokenizedStringCharIterator& text_iter)
94 : relevance(relevance),
95 current_match(current_match),
96 hits(hits.begin(), hits.end()),
97 query_iter_state(query_iter.GetState()),
98 text_iter_state(text_iter.GetState()) {}
100 // The current score of the processed query chars.
101 double relevance;
103 // Current matching range.
104 gfx::Range current_match;
106 // Completed matching ranges of the processed query chars.
107 TokenizedStringMatch::Hits hits;
109 // States of the processed query and text chars.
110 TokenizedStringCharIterator::State query_iter_state;
111 TokenizedStringCharIterator::State text_iter_state;
113 typedef std::vector<State> States;
115 // Match chars from the query and text one by one. For each matching char,
116 // calculate relevance and matching ranges. And the current stats is
117 // recorded so that the match could be skipped later to try other
118 // possiblities. Repeat until any of the iterators run out. Return true if
119 // query iterator runs out, i.e. all chars in query are matched.
120 bool RunMatch() {
121 while (!query_iter_.end() && !text_iter_.end()) {
122 if (query_iter_.Get() == text_iter_.Get()) {
123 PushState();
125 if (query_iter_.GetArrayPos() == text_iter_.GetArrayPos())
126 current_relevance_ += kIsPrefixMultiplier;
127 else if (text_iter_.IsFirstCharOfToken())
128 current_relevance_ += kIsFrontOfWordMultipler;
129 else
130 current_relevance_ += kIsWeakHitMultiplier;
132 if (!current_match_.IsValid())
133 current_match_.set_start(text_iter_.GetArrayPos());
134 current_match_.set_end(text_iter_.GetArrayPos() +
135 text_iter_.GetCharSize());
137 query_iter_.NextChar();
138 text_iter_.NextChar();
139 } else {
140 AdvanceToNextTextToken();
144 return query_iter_.end();
147 // Skip to the next text token and close current match. Invoked when a
148 // mismatch happens or to skip a restored match.
149 void AdvanceToNextTextToken() {
150 if (current_match_.IsValid()) {
151 current_hits_.push_back(current_match_);
152 current_match_ = gfx::Range::InvalidRange();
155 text_iter_.NextToken();
158 void PushState() {
159 states_.push_back(State(current_relevance_,
160 current_match_,
161 current_hits_,
162 query_iter_,
163 text_iter_));
166 void PopState() {
167 DCHECK(!states_.empty());
169 State& last_match = states_.back();
170 current_relevance_ = last_match.relevance;
171 current_match_ = last_match.current_match;
172 current_hits_.swap(last_match.hits);
173 query_iter_.SetState(last_match.query_iter_state);
174 text_iter_.SetState(last_match.text_iter_state);
176 states_.pop_back();
179 TokenizedStringCharIterator query_iter_;
180 TokenizedStringCharIterator text_iter_;
182 States states_;
183 gfx::Range current_match_;
185 double current_relevance_;
186 TokenizedStringMatch::Hits current_hits_;
188 DISALLOW_COPY_AND_ASSIGN(PrefixMatcher);
191 } // namespace
193 TokenizedStringMatch::TokenizedStringMatch()
194 : relevance_(kNoMatchScore) {}
196 TokenizedStringMatch::~TokenizedStringMatch() {}
198 bool TokenizedStringMatch::Calculate(const TokenizedString& query,
199 const TokenizedString& text) {
200 relevance_ = kNoMatchScore;
201 hits_.clear();
203 PrefixMatcher matcher(query, text);
204 if (matcher.Match()) {
205 relevance_ = matcher.relevance();
206 hits_.assign(matcher.hits().begin(), matcher.hits().end());
209 // Substring match as a fallback.
210 if (relevance_ == kNoMatchScore) {
211 size_t substr_match_start = 0;
212 size_t substr_match_length = 0;
213 if (base::i18n::StringSearchIgnoringCaseAndAccents(query.text(),
214 text.text(),
215 &substr_match_start,
216 &substr_match_length)) {
217 relevance_ = kIsSubstringMultiplier * substr_match_length;
218 hits_.push_back(gfx::Range(substr_match_start,
219 substr_match_start + substr_match_length));
223 // Temper the relevance score with an exponential curve. Each point of
224 // relevance (roughly, each keystroke) is worth less than the last. This means
225 // that typing a few characters of a word is enough to promote matches very
226 // high, with any subsequent characters being worth comparatively less.
227 // TODO(mgiuca): This doesn't really play well with Omnibox results, since as
228 // you type more characters, the app/omnibox results tend to jump over each
229 // other.
230 relevance_ = 1.0 - std::pow(0.5, relevance_);
232 return relevance_ > kNoMatchScore;
235 bool TokenizedStringMatch::Calculate(const base::string16& query,
236 const base::string16& text) {
237 const TokenizedString tokenized_query(query);
238 const TokenizedString tokenized_text(text);
239 return Calculate(tokenized_query, tokenized_text);
242 } // namespace app_list