ui/app_list/search/tokenized_string_match.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "ui/app_list/search/tokenized_string_match.h"
   6
   7 #include <cmath>
   8
   9 #include "base/i18n/string_search.h"
  10 #include "base/logging.h"
  11 #include "ui/app_list/search/tokenized_string_char_iterator.h"
  12
  13 namespace app_list {
  14
  15 namespace {
  16
  17 // The factors below are applied when the current char of query matches
  18 // the current char of the text to be matched. Different factors are chosen
  19 // based on where the match happens. kIsPrefixMultiplier is used when the
  20 // matched portion is a prefix of both the query and the text, which implies
  21 // that the matched chars are at the same position in query and text. This is
  22 // the most preferred case thus it has the highest score. When the current char
  23 // of the query and the text does not match, the algorithm moves to the next
  24 // token in the text and try to match from there. kIsFrontOfWordMultipler will
  25 // be used if the first char of the token matches the current char of the query.
  26 // Otherwise, the match is considered as weak and kIsWeakHitMultiplier is
  27 // used.
  28 // Examples:
  29 //   Suppose the text to be matched is 'Google Chrome'.
  30 //   Query 'go' would yield kIsPrefixMultiplier for each char.
  31 //   Query 'gc' would use kIsPrefixMultiplier for 'g' and
  32 //       kIsFrontOfWordMultipler for 'c'.
  33 //   Query 'ch' would use kIsFrontOfWordMultipler for 'c' and
  34 //       kIsWeakHitMultiplier for 'h'.
  35 //   Query 'oo' does not match any prefix and would use the substring match
  36 //       fallback, thus kIsSubstringMultiplier is used for each char.
  37 const double kIsPrefixMultiplier = 1.0;
  38 const double kIsFrontOfWordMultipler = 0.8;
  39 const double kIsWeakHitMultiplier = 0.6;
  40 const double kIsSubstringMultiplier = 0.4;
  41
  42 // A relevance score that represents no match.
  43 const double kNoMatchScore = 0.0;
  44
  45 // PrefixMatcher matches the chars of a given query as prefix of tokens in
  46 // a given text or as prefix of the acronyms of those text tokens.
  47 class PrefixMatcher {
  48  public:
  49   PrefixMatcher(const TokenizedString& query,
  50                 const TokenizedString& text)
  51       : query_iter_(query),
  52         text_iter_(text),
  53         current_match_(gfx::Range::InvalidRange()),
  54         current_relevance_(kNoMatchScore) {
  55   }
  56
  57   // Invokes RunMatch to perform prefix match. Use |states_| as a stack to
  58   // perform DFS (depth first search) so that all possible matches are
  59   // attempted. Stops on the first full match and returns true. Otherwise,
  60   // returns false to indicate no match.
  61   bool Match() {
  62     while (!RunMatch()) {
  63       // No match found and no more states to try. Bail out.
  64       if (states_.empty()) {
  65         current_relevance_ = kNoMatchScore;
  66         current_hits_.clear();
  67         return false;
  68       }
  69
  70       PopState();
  71
  72       // Skip restored match to try other possibilites.
  73       AdvanceToNextTextToken();
  74     }
  75
  76     if (current_match_.IsValid())
  77       current_hits_.push_back(current_match_);
  78
  79     return true;
  80   }
  81
  82   double relevance() const { return current_relevance_; }
  83   const TokenizedStringMatch::Hits& hits() const { return current_hits_; }
  84
  85  private:
  86   // Context record of a match.
  87   struct State {
  88     State() : relevance(kNoMatchScore) {}
  89     State(double relevance,
  90           const gfx::Range& current_match,
  91           const TokenizedStringMatch::Hits& hits,
  92           const TokenizedStringCharIterator& query_iter,
  93           const TokenizedStringCharIterator& text_iter)
  94         : relevance(relevance),
  95           current_match(current_match),
  96           hits(hits.begin(), hits.end()),
  97           query_iter_state(query_iter.GetState()),
  98           text_iter_state(text_iter.GetState()) {}
  99
 100     // The current score of the processed query chars.
 101     double relevance;
 102
 103     // Current matching range.
 104     gfx::Range current_match;
 105
 106     // Completed matching ranges of the processed query chars.
 107     TokenizedStringMatch::Hits hits;
 108
 109     // States of the processed query and text chars.
 110     TokenizedStringCharIterator::State query_iter_state;
 111     TokenizedStringCharIterator::State text_iter_state;
 112   };
 113   typedef std::vector<State> States;
 114
 115   // Match chars from the query and text one by one. For each matching char,
 116   // calculate relevance and matching ranges. And the current stats is
 117   // recorded so that the match could be skipped later to try other
 118   // possiblities. Repeat until any of the iterators run out. Return true if
 119   // query iterator runs out, i.e. all chars in query are matched.
 120   bool RunMatch() {
 121     while (!query_iter_.end() && !text_iter_.end()) {
 122       if (query_iter_.Get() == text_iter_.Get()) {
 123         PushState();
 124
 125         if (query_iter_.GetArrayPos() == text_iter_.GetArrayPos())
 126           current_relevance_ += kIsPrefixMultiplier;
 127         else if (text_iter_.IsFirstCharOfToken())
 128           current_relevance_ += kIsFrontOfWordMultipler;
 129         else
 130           current_relevance_ += kIsWeakHitMultiplier;
 131
 132         if (!current_match_.IsValid())
 133           current_match_.set_start(text_iter_.GetArrayPos());
 134         current_match_.set_end(text_iter_.GetArrayPos() +
 135                               text_iter_.GetCharSize());
 136
 137         query_iter_.NextChar();
 138         text_iter_.NextChar();
 139       } else {
 140         AdvanceToNextTextToken();
 141       }
 142     }
 143
 144     return query_iter_.end();
 145   }
 146
 147   // Skip to the next text token and close current match. Invoked when a
 148   // mismatch happens or to skip a restored match.
 149   void AdvanceToNextTextToken() {
 150     if (current_match_.IsValid()) {
 151       current_hits_.push_back(current_match_);
 152       current_match_ = gfx::Range::InvalidRange();
 153     }
 154
 155     text_iter_.NextToken();
 156   }
 157
 158   void PushState() {
 159     states_.push_back(State(current_relevance_,
 160                             current_match_,
 161                             current_hits_,
 162                             query_iter_,
 163                             text_iter_));
 164   }
 165
 166   void PopState() {
 167     DCHECK(!states_.empty());
 168
 169     State& last_match = states_.back();
 170     current_relevance_ = last_match.relevance;
 171     current_match_ = last_match.current_match;
 172     current_hits_.swap(last_match.hits);
 173     query_iter_.SetState(last_match.query_iter_state);
 174     text_iter_.SetState(last_match.text_iter_state);
 175
 176     states_.pop_back();
 177   }
 178
 179   TokenizedStringCharIterator query_iter_;
 180   TokenizedStringCharIterator text_iter_;
 181
 182   States states_;
 183   gfx::Range current_match_;
 184
 185   double current_relevance_;
 186   TokenizedStringMatch::Hits current_hits_;
 187
 188   DISALLOW_COPY_AND_ASSIGN(PrefixMatcher);
 189 };
 190
 191 }  // namespace
 192
 193 TokenizedStringMatch::TokenizedStringMatch()
 194     : relevance_(kNoMatchScore) {}
 195
 196 TokenizedStringMatch::~TokenizedStringMatch() {}
 197
 198 bool TokenizedStringMatch::Calculate(const TokenizedString& query,
 199                                      const TokenizedString& text) {
 200   relevance_ = kNoMatchScore;
 201   hits_.clear();
 202
 203   PrefixMatcher matcher(query, text);
 204   if (matcher.Match()) {
 205     relevance_ = matcher.relevance();
 206     hits_.assign(matcher.hits().begin(), matcher.hits().end());
 207   }
 208
 209   // Substring match as a fallback.
 210   if (relevance_ == kNoMatchScore) {
 211     size_t substr_match_start = 0;
 212     size_t substr_match_length = 0;
 213     if (base::i18n::StringSearchIgnoringCaseAndAccents(query.text(),
 214                                                        text.text(),
 215                                                        &substr_match_start,
 216                                                        &substr_match_length)) {
 217       relevance_ = kIsSubstringMultiplier * substr_match_length;
 218       hits_.push_back(gfx::Range(substr_match_start,
 219                                  substr_match_start + substr_match_length));
 220     }
 221   }
 222
 223   // Temper the relevance score with an exponential curve. Each point of
 224   // relevance (roughly, each keystroke) is worth less than the last. This means
 225   // that typing a few characters of a word is enough to promote matches very
 226   // high, with any subsequent characters being worth comparatively less.
 227   // TODO(mgiuca): This doesn't really play well with Omnibox results, since as
 228   // you type more characters, the app/omnibox results tend to jump over each
 229   // other.
 230   relevance_ = 1.0 - std::pow(0.5, relevance_);
 231
 232   return relevance_ > kNoMatchScore;
 233 }
 234
 235 bool TokenizedStringMatch::Calculate(const base::string16& query,
 236                                      const base::string16& text) {
 237   const TokenizedString tokenized_query(query);
 238   const TokenizedString tokenized_text(text);
 239   return Calculate(tokenized_query, tokenized_text);
 240 }
 241
 242 }  // namespace app_list