ui/app_list/search/tokenized_string_match.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "ui/app_list/search/tokenized_string_match.h"
   6
   7 #include "base/i18n/string_search.h"
   8 #include "base/logging.h"
   9 #include "ui/app_list/search/tokenized_string_char_iterator.h"
  10
  11 namespace app_list {
  12
  13 namespace {
  14
  15 // The factors below are applied when the current char of query matches
  16 // the current char of the text to be matched. Different factors are chosen
  17 // based on where the match happens. kIsPrefixMultiplier is used when the
  18 // matched portion is a prefix of both the query and the text, which implies
  19 // that the matched chars are at the same position in query and text. This is
  20 // the most preferred case thus it has the highest score. When the current char
  21 // of the query and the text does not match, the algorithm moves to the next
  22 // token in the text and try to match from there. kIsFrontOfWordMultipler will
  23 // be used if the first char of the token matches the current char of the query.
  24 // Otherwise, the match is considered as weak and kIsWeakHitMultiplier is
  25 // used.
  26 // Examples:
  27 //   Suppose the text to be matched is 'Google Chrome'.
  28 //   Query 'go' would yield kIsPrefixMultiplier for each char.
  29 //   Query 'gc' would use kIsPrefixMultiplier for 'g' and
  30 //       kIsFrontOfWordMultipler for 'c'.
  31 //   Query 'ch' would use kIsFrontOfWordMultipler for 'c' and
  32 //       kIsWeakHitMultiplier for 'h'.
  33 //   Query 'oo' does not match any prefix and would use the substring match
  34 //       fallback, thus kIsSubstringMultiplier is used for each char.
  35 const double kIsPrefixMultiplier = 1.0;
  36 const double kIsFrontOfWordMultipler = 0.8;
  37 const double kIsWeakHitMultiplier = 0.6;
  38 const double kIsSubstringMultiplier = 0.4;
  39
  40 // A relevance score that represents no match.
  41 const double kNoMatchScore = 0.0;
  42
  43 // PrefixMatcher matches the chars of a given query as prefix of tokens in
  44 // a given text or as prefix of the acronyms of those text tokens.
  45 class PrefixMatcher {
  46  public:
  47   PrefixMatcher(const TokenizedString& query,
  48                 const TokenizedString& text)
  49       : query_iter_(query),
  50         text_iter_(text),
  51         current_match_(gfx::Range::InvalidRange()),
  52         current_relevance_(kNoMatchScore) {
  53   }
  54
  55   // Invokes RunMatch to perform prefix match. Use |states_| as a stack to
  56   // perform DFS (depth first search) so that all possible matches are
  57   // attempted. Stops on the first full match and returns true. Otherwise,
  58   // returns false to indicate no match.
  59   bool Match() {
  60     while (!RunMatch()) {
  61       // No match found and no more states to try. Bail out.
  62       if (states_.empty()) {
  63         current_relevance_ = kNoMatchScore;
  64         current_hits_.clear();
  65         return false;
  66       }
  67
  68       PopState();
  69
  70       // Skip restored match to try other possibilites.
  71       AdvanceToNextTextToken();
  72     }
  73
  74     if (current_match_.IsValid())
  75       current_hits_.push_back(current_match_);
  76
  77     return true;
  78   }
  79
  80   double relevance() const { return current_relevance_; }
  81   const TokenizedStringMatch::Hits& hits() const { return current_hits_; }
  82
  83  private:
  84   // Context record of a match.
  85   struct State {
  86     State() : relevance(kNoMatchScore) {}
  87     State(double relevance,
  88           const gfx::Range& current_match,
  89           const TokenizedStringMatch::Hits& hits,
  90           const TokenizedStringCharIterator& query_iter,
  91           const TokenizedStringCharIterator& text_iter)
  92         : relevance(relevance),
  93           current_match(current_match),
  94           hits(hits.begin(), hits.end()),
  95           query_iter_state(query_iter.GetState()),
  96           text_iter_state(text_iter.GetState()) {}
  97
  98     // The current score of the processed query chars.
  99     double relevance;
 100
 101     // Current matching range.
 102     gfx::Range current_match;
 103
 104     // Completed matching ranges of the processed query chars.
 105     TokenizedStringMatch::Hits hits;
 106
 107     // States of the processed query and text chars.
 108     TokenizedStringCharIterator::State query_iter_state;
 109     TokenizedStringCharIterator::State text_iter_state;
 110   };
 111   typedef std::vector<State> States;
 112
 113   // Match chars from the query and text one by one. For each matching char,
 114   // calculate relevance and matching ranges. And the current stats is
 115   // recorded so that the match could be skipped later to try other
 116   // possiblities. Repeat until any of the iterators run out. Return true if
 117   // query iterator runs out, i.e. all chars in query are matched.
 118   bool RunMatch() {
 119     while (!query_iter_.end() && !text_iter_.end()) {
 120       if (query_iter_.Get() == text_iter_.Get()) {
 121         PushState();
 122
 123         if (query_iter_.GetArrayPos() == text_iter_.GetArrayPos())
 124           current_relevance_ += kIsPrefixMultiplier;
 125         else if (text_iter_.IsFirstCharOfToken())
 126           current_relevance_ += kIsFrontOfWordMultipler;
 127         else
 128           current_relevance_ += kIsWeakHitMultiplier;
 129
 130         if (!current_match_.IsValid())
 131           current_match_.set_start(text_iter_.GetArrayPos());
 132         current_match_.set_end(text_iter_.GetArrayPos() +
 133                               text_iter_.GetCharSize());
 134
 135         query_iter_.NextChar();
 136         text_iter_.NextChar();
 137       } else {
 138         AdvanceToNextTextToken();
 139       }
 140     }
 141
 142     return query_iter_.end();
 143   }
 144
 145   // Skip to the next text token and close current match. Invoked when a
 146   // mismatch happens or to skip a restored match.
 147   void AdvanceToNextTextToken() {
 148     if (current_match_.IsValid()) {
 149       current_hits_.push_back(current_match_);
 150       current_match_ = gfx::Range::InvalidRange();
 151     }
 152
 153     text_iter_.NextToken();
 154   }
 155
 156   void PushState() {
 157     states_.push_back(State(current_relevance_,
 158                             current_match_,
 159                             current_hits_,
 160                             query_iter_,
 161                             text_iter_));
 162   }
 163
 164   void PopState() {
 165     DCHECK(!states_.empty());
 166
 167     State& last_match = states_.back();
 168     current_relevance_ = last_match.relevance;
 169     current_match_ = last_match.current_match;
 170     current_hits_.swap(last_match.hits);
 171     query_iter_.SetState(last_match.query_iter_state);
 172     text_iter_.SetState(last_match.text_iter_state);
 173
 174     states_.pop_back();
 175   }
 176
 177   TokenizedStringCharIterator query_iter_;
 178   TokenizedStringCharIterator text_iter_;
 179
 180   States states_;
 181   gfx::Range current_match_;
 182
 183   double current_relevance_;
 184   TokenizedStringMatch::Hits current_hits_;
 185
 186   DISALLOW_COPY_AND_ASSIGN(PrefixMatcher);
 187 };
 188
 189 }  // namespace
 190
 191 TokenizedStringMatch::TokenizedStringMatch()
 192     : relevance_(kNoMatchScore) {}
 193
 194 TokenizedStringMatch::~TokenizedStringMatch() {}
 195
 196 bool TokenizedStringMatch::Calculate(const TokenizedString& query,
 197                                      const TokenizedString& text) {
 198   relevance_ = kNoMatchScore;
 199   hits_.clear();
 200
 201   PrefixMatcher matcher(query, text);
 202   if (matcher.Match()) {
 203     relevance_ = matcher.relevance();
 204     hits_.assign(matcher.hits().begin(), matcher.hits().end());
 205   }
 206
 207   // Substring match as a fallback.
 208   if (relevance_ == kNoMatchScore) {
 209     size_t substr_match_start = 0;
 210     size_t substr_match_length = 0;
 211     if (base::i18n::StringSearchIgnoringCaseAndAccents(query.text(),
 212                                                        text.text(),
 213                                                        &substr_match_start,
 214                                                        &substr_match_length)) {
 215       relevance_ = kIsSubstringMultiplier * substr_match_length;
 216       hits_.push_back(gfx::Range(substr_match_start,
 217                                  substr_match_start + substr_match_length));
 218     }
 219   }
 220
 221   // Using length() for normalizing is not 100% correct but should be good
 222   // enough compared with using real char count of the text.
 223   if (text.text().length())
 224     relevance_ /= text.text().length();
 225
 226   return relevance_ > kNoMatchScore;
 227 }
 228
 229 bool TokenizedStringMatch::Calculate(const base::string16& query,
 230                                      const base::string16& text) {
 231   const TokenizedString tokenized_query(query);
 232   const TokenizedString tokenized_text(text);
 233   return Calculate(tokenized_query, tokenized_text);
 234 }
 235
 236 }  // namespace app_list