Roll src/third_party/WebKit 605a979:06cb9e9 (svn 202556:202558)
[chromium-blink-merge.git] / components / query_parser / query_parser.cc
blobbd9d0954df2d21fce034f37008bcb041f332a3b7
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/query_parser/query_parser.h"
7 #include <algorithm>
9 #include "base/compiler_specific.h"
10 #include "base/i18n/break_iterator.h"
11 #include "base/i18n/case_conversion.h"
12 #include "base/logging.h"
13 #include "base/stl_util.h"
14 #include "base/strings/utf_string_conversions.h"
16 namespace query_parser {
17 namespace {
19 // Returns true if |mp1.first| is less than |mp2.first|. This is used to
20 // sort match positions.
21 int CompareMatchPosition(const Snippet::MatchPosition& mp1,
22 const Snippet::MatchPosition& mp2) {
23 return mp1.first < mp2.first;
26 // Returns true if |mp2| intersects |mp1|. This is intended for use by
27 // CoalesceMatchesFrom and isn't meant as a general intersection comparison
28 // function.
29 bool SnippetIntersects(const Snippet::MatchPosition& mp1,
30 const Snippet::MatchPosition& mp2) {
31 return mp2.first >= mp1.first && mp2.first <= mp1.second;
34 // Coalesces match positions in |matches| after index that intersect the match
35 // position at |index|.
36 void CoalesceMatchesFrom(size_t index, Snippet::MatchPositions* matches) {
37 Snippet::MatchPosition& mp = (*matches)[index];
38 for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
39 i != matches->end(); ) {
40 if (SnippetIntersects(mp, *i)) {
41 mp.second = std::max(mp.second, i->second);
42 i = matches->erase(i);
43 } else {
44 return;
49 // Returns true if the character is considered a quote.
50 bool IsQueryQuote(wchar_t ch) {
51 return ch == '"' ||
52 ch == 0xab || // left pointing double angle bracket
53 ch == 0xbb || // right pointing double angle bracket
54 ch == 0x201c || // left double quotation mark
55 ch == 0x201d || // right double quotation mark
56 ch == 0x201e; // double low-9 quotation mark
59 } // namespace
61 // Inheritance structure:
62 // Queries are represented as trees of QueryNodes.
63 // QueryNodes are either a collection of subnodes (a QueryNodeList)
64 // or a single word (a QueryNodeWord).
66 // A QueryNodeWord is a single word in the query.
67 class QueryNodeWord : public QueryNode {
68 public:
69 explicit QueryNodeWord(const base::string16& word,
70 MatchingAlgorithm matching_algorithm);
71 ~QueryNodeWord() override;
73 const base::string16& word() const { return word_; }
75 bool literal() const { return literal_; };
76 void set_literal(bool literal) { literal_ = literal; }
78 // QueryNode:
79 int AppendToSQLiteQuery(base::string16* query) const override;
80 bool IsWord() const override;
81 bool Matches(const base::string16& word, bool exact) const override;
82 bool HasMatchIn(const QueryWordVector& words,
83 Snippet::MatchPositions* match_positions) const override;
84 bool HasMatchIn(const QueryWordVector& words) const override;
85 void AppendWords(std::vector<base::string16>* words) const override;
87 private:
88 base::string16 word_;
89 bool literal_;
90 const MatchingAlgorithm matching_algorithm_;
92 DISALLOW_COPY_AND_ASSIGN(QueryNodeWord);
95 QueryNodeWord::QueryNodeWord(const base::string16& word,
96 MatchingAlgorithm matching_algorithm)
97 : word_(word),
98 literal_(false),
99 matching_algorithm_(matching_algorithm) {}
101 QueryNodeWord::~QueryNodeWord() {}
103 int QueryNodeWord::AppendToSQLiteQuery(base::string16* query) const {
104 query->append(word_);
106 // Use prefix search if we're not literal and long enough.
107 if (!literal_ &&
108 QueryParser::IsWordLongEnoughForPrefixSearch(word_, matching_algorithm_))
109 *query += L'*';
110 return 1;
113 bool QueryNodeWord::IsWord() const {
114 return true;
117 bool QueryNodeWord::Matches(const base::string16& word, bool exact) const {
118 if (exact ||
119 !QueryParser::IsWordLongEnoughForPrefixSearch(word_, matching_algorithm_))
120 return word == word_;
121 return word.size() >= word_.size() &&
122 (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
125 bool QueryNodeWord::HasMatchIn(const QueryWordVector& words,
126 Snippet::MatchPositions* match_positions) const {
127 bool matched = false;
128 for (size_t i = 0; i < words.size(); ++i) {
129 if (Matches(words[i].word, false)) {
130 size_t match_start = words[i].position;
131 match_positions->push_back(
132 Snippet::MatchPosition(match_start,
133 match_start + static_cast<int>(word_.size())));
134 matched = true;
137 return matched;
140 bool QueryNodeWord::HasMatchIn(const QueryWordVector& words) const {
141 for (size_t i = 0; i < words.size(); ++i) {
142 if (Matches(words[i].word, false))
143 return true;
145 return false;
148 void QueryNodeWord::AppendWords(std::vector<base::string16>* words) const {
149 words->push_back(word_);
152 // A QueryNodeList has a collection of QueryNodes which are deleted in the end.
153 class QueryNodeList : public QueryNode {
154 public:
155 QueryNodeList();
156 ~QueryNodeList() override;
158 QueryNodeStarVector* children() { return &children_; }
160 void AddChild(QueryNode* node);
162 // Remove empty subnodes left over from other parsing.
163 void RemoveEmptySubnodes();
165 // QueryNode:
166 int AppendToSQLiteQuery(base::string16* query) const override;
167 bool IsWord() const override;
168 bool Matches(const base::string16& word, bool exact) const override;
169 bool HasMatchIn(const QueryWordVector& words,
170 Snippet::MatchPositions* match_positions) const override;
171 bool HasMatchIn(const QueryWordVector& words) const override;
172 void AppendWords(std::vector<base::string16>* words) const override;
174 protected:
175 int AppendChildrenToString(base::string16* query) const;
177 QueryNodeStarVector children_;
179 private:
180 DISALLOW_COPY_AND_ASSIGN(QueryNodeList);
183 QueryNodeList::QueryNodeList() {}
185 QueryNodeList::~QueryNodeList() {
186 STLDeleteElements(&children_);
189 void QueryNodeList::AddChild(QueryNode* node) {
190 children_.push_back(node);
193 void QueryNodeList::RemoveEmptySubnodes() {
194 for (size_t i = 0; i < children_.size(); ++i) {
195 if (children_[i]->IsWord())
196 continue;
198 QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
199 list_node->RemoveEmptySubnodes();
200 if (list_node->children()->empty()) {
201 children_.erase(children_.begin() + i);
202 --i;
203 delete list_node;
208 int QueryNodeList::AppendToSQLiteQuery(base::string16* query) const {
209 return AppendChildrenToString(query);
212 bool QueryNodeList::IsWord() const {
213 return false;
216 bool QueryNodeList::Matches(const base::string16& word, bool exact) const {
217 NOTREACHED();
218 return false;
221 bool QueryNodeList::HasMatchIn(const QueryWordVector& words,
222 Snippet::MatchPositions* match_positions) const {
223 NOTREACHED();
224 return false;
227 bool QueryNodeList::HasMatchIn(const QueryWordVector& words) const {
228 NOTREACHED();
229 return false;
232 void QueryNodeList::AppendWords(std::vector<base::string16>* words) const {
233 for (size_t i = 0; i < children_.size(); ++i)
234 children_[i]->AppendWords(words);
237 int QueryNodeList::AppendChildrenToString(base::string16* query) const {
238 int num_words = 0;
239 for (QueryNodeStarVector::const_iterator node = children_.begin();
240 node != children_.end(); ++node) {
241 if (node != children_.begin())
242 query->push_back(L' ');
243 num_words += (*node)->AppendToSQLiteQuery(query);
245 return num_words;
248 // A QueryNodePhrase is a phrase query ("quoted").
249 class QueryNodePhrase : public QueryNodeList {
250 public:
251 QueryNodePhrase();
252 ~QueryNodePhrase() override;
254 // QueryNodeList:
255 int AppendToSQLiteQuery(base::string16* query) const override;
256 bool HasMatchIn(const QueryWordVector& words,
257 Snippet::MatchPositions* match_positions) const override;
258 bool HasMatchIn(const QueryWordVector& words) const override;
260 private:
261 bool MatchesAll(const QueryWordVector& words,
262 const QueryWord** first_word,
263 const QueryWord** last_word) const;
264 DISALLOW_COPY_AND_ASSIGN(QueryNodePhrase);
267 QueryNodePhrase::QueryNodePhrase() {}
269 QueryNodePhrase::~QueryNodePhrase() {}
271 int QueryNodePhrase::AppendToSQLiteQuery(base::string16* query) const {
272 query->push_back(L'"');
273 int num_words = AppendChildrenToString(query);
274 query->push_back(L'"');
275 return num_words;
278 bool QueryNodePhrase::MatchesAll(const QueryWordVector& words,
279 const QueryWord** first_word,
280 const QueryWord** last_word) const {
281 if (words.size() < children_.size())
282 return false;
284 for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
285 bool matched_all = true;
286 for (size_t j = 0; j < children_.size(); ++j) {
287 if (!children_[j]->Matches(words[i + j].word, true)) {
288 matched_all = false;
289 break;
292 if (matched_all) {
293 *first_word = &words[i];
294 *last_word = &words[i + children_.size() - 1];
295 return true;
298 return false;
301 bool QueryNodePhrase::HasMatchIn(
302 const QueryWordVector& words,
303 Snippet::MatchPositions* match_positions) const {
304 const QueryWord* first_word;
305 const QueryWord* last_word;
307 if (MatchesAll(words, &first_word, &last_word)) {
308 match_positions->push_back(
309 Snippet::MatchPosition(first_word->position,
310 last_word->position + last_word->word.length()));
311 return true;
313 return false;
316 bool QueryNodePhrase::HasMatchIn(const QueryWordVector& words) const {
317 const QueryWord* first_word;
318 const QueryWord* last_word;
319 return MatchesAll(words, &first_word, &last_word);
322 QueryParser::QueryParser() {}
324 // static
325 bool QueryParser::IsWordLongEnoughForPrefixSearch(
326 const base::string16& word, MatchingAlgorithm matching_algorithm) {
327 if (matching_algorithm == MatchingAlgorithm::ALWAYS_PREFIX_SEARCH)
328 return true;
330 DCHECK(!word.empty());
331 size_t minimum_length = 3;
332 // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
333 // because they 'behave like' Latin letters. Moreover, we should
334 // normalize the former before reaching here.
335 if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
336 minimum_length = 2;
337 return word.size() >= minimum_length;
340 int QueryParser::ParseQuery(const base::string16& query,
341 MatchingAlgorithm matching_algorithm,
342 base::string16* sqlite_query) {
343 QueryNodeList root;
344 if (!ParseQueryImpl(query, matching_algorithm, &root))
345 return 0;
346 return root.AppendToSQLiteQuery(sqlite_query);
349 void QueryParser::ParseQueryWords(const base::string16& query,
350 MatchingAlgorithm matching_algorithm,
351 std::vector<base::string16>* words) {
352 QueryNodeList root;
353 if (!ParseQueryImpl(query, matching_algorithm, &root))
354 return;
355 root.AppendWords(words);
358 void QueryParser::ParseQueryNodes(const base::string16& query,
359 MatchingAlgorithm matching_algorithm,
360 QueryNodeStarVector* nodes) {
361 QueryNodeList root;
362 if (ParseQueryImpl(base::i18n::ToLower(query), matching_algorithm, &root))
363 nodes->swap(*root.children());
366 bool QueryParser::DoesQueryMatch(const base::string16& text,
367 const QueryNodeStarVector& query_nodes,
368 Snippet::MatchPositions* match_positions) {
369 if (query_nodes.empty())
370 return false;
372 QueryWordVector query_words;
373 base::string16 lower_text = base::i18n::ToLower(text);
374 ExtractQueryWords(lower_text, &query_words);
376 if (query_words.empty())
377 return false;
379 Snippet::MatchPositions matches;
380 for (size_t i = 0; i < query_nodes.size(); ++i) {
381 if (!query_nodes[i]->HasMatchIn(query_words, &matches))
382 return false;
384 if (lower_text.length() != text.length()) {
385 // The lower case string differs from the original string. The matches are
386 // meaningless.
387 // TODO(sky): we need a better way to align the positions so that we don't
388 // completely punt here.
389 match_positions->clear();
390 } else {
391 SortAndCoalesceMatchPositions(&matches);
392 match_positions->swap(matches);
394 return true;
397 bool QueryParser::DoesQueryMatch(const QueryWordVector& query_words,
398 const QueryNodeStarVector& query_nodes) {
399 if (query_nodes.empty() || query_words.empty())
400 return false;
402 for (size_t i = 0; i < query_nodes.size(); ++i) {
403 if (!query_nodes[i]->HasMatchIn(query_words))
404 return false;
406 return true;
409 bool QueryParser::ParseQueryImpl(const base::string16& query,
410 MatchingAlgorithm matching_algorithm,
411 QueryNodeList* root) {
412 base::i18n::BreakIterator iter(query, base::i18n::BreakIterator::BREAK_WORD);
413 // TODO(evanm): support a locale here
414 if (!iter.Init())
415 return false;
417 // To handle nesting, we maintain a stack of QueryNodeLists.
418 // The last element (back) of the stack contains the current, deepest node.
419 std::vector<QueryNodeList*> query_stack;
420 query_stack.push_back(root);
422 bool in_quotes = false; // whether we're currently in a quoted phrase
423 while (iter.Advance()) {
424 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
425 // is not necessarily a word, but could also be a sequence of punctuation
426 // or whitespace.
427 if (iter.IsWord()) {
428 QueryNodeWord* word_node = new QueryNodeWord(iter.GetString(),
429 matching_algorithm);
430 if (in_quotes)
431 word_node->set_literal(true);
432 query_stack.back()->AddChild(word_node);
433 } else { // Punctuation.
434 if (IsQueryQuote(query[iter.prev()])) {
435 if (!in_quotes) {
436 QueryNodeList* quotes_node = new QueryNodePhrase;
437 query_stack.back()->AddChild(quotes_node);
438 query_stack.push_back(quotes_node);
439 in_quotes = true;
440 } else {
441 query_stack.pop_back(); // Stop adding to the quoted phrase.
442 in_quotes = false;
448 root->RemoveEmptySubnodes();
449 return true;
452 void QueryParser::ExtractQueryWords(const base::string16& text,
453 QueryWordVector* words) {
454 base::i18n::BreakIterator iter(text, base::i18n::BreakIterator::BREAK_WORD);
455 // TODO(evanm): support a locale here
456 if (!iter.Init())
457 return;
459 while (iter.Advance()) {
460 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
461 // is not necessarily a word, but could also be a sequence of punctuation
462 // or whitespace.
463 if (iter.IsWord()) {
464 base::string16 word = iter.GetString();
465 if (!word.empty()) {
466 words->push_back(QueryWord());
467 words->back().word = word;
468 words->back().position = iter.prev();
474 // static
475 void QueryParser::SortAndCoalesceMatchPositions(
476 Snippet::MatchPositions* matches) {
477 std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
478 // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
479 // from matches.
480 for (size_t i = 0; i < matches->size(); ++i)
481 CoalesceMatchesFrom(i, matches);
484 } // namespace query_parser