1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/query_parser/query_parser.h"
9 #include "base/compiler_specific.h"
10 #include "base/i18n/break_iterator.h"
11 #include "base/i18n/case_conversion.h"
12 #include "base/logging.h"
13 #include "base/stl_util.h"
14 #include "base/strings/utf_string_conversions.h"
16 namespace query_parser
{
19 // Returns true if |mp1.first| is less than |mp2.first|. This is used to
20 // sort match positions.
21 int CompareMatchPosition(const Snippet::MatchPosition
& mp1
,
22 const Snippet::MatchPosition
& mp2
) {
23 return mp1
.first
< mp2
.first
;
26 // Returns true if |mp2| intersects |mp1|. This is intended for use by
27 // CoalesceMatchesFrom and isn't meant as a general intersection comparison
29 bool SnippetIntersects(const Snippet::MatchPosition
& mp1
,
30 const Snippet::MatchPosition
& mp2
) {
31 return mp2
.first
>= mp1
.first
&& mp2
.first
<= mp1
.second
;
34 // Coalesces match positions in |matches| after index that intersect the match
35 // position at |index|.
36 void CoalesceMatchesFrom(size_t index
, Snippet::MatchPositions
* matches
) {
37 Snippet::MatchPosition
& mp
= (*matches
)[index
];
38 for (Snippet::MatchPositions::iterator i
= matches
->begin() + index
+ 1;
39 i
!= matches
->end(); ) {
40 if (SnippetIntersects(mp
, *i
)) {
41 mp
.second
= std::max(mp
.second
, i
->second
);
42 i
= matches
->erase(i
);
49 // Returns true if the character is considered a quote.
50 bool IsQueryQuote(wchar_t ch
) {
52 ch
== 0xab || // left pointing double angle bracket
53 ch
== 0xbb || // right pointing double angle bracket
54 ch
== 0x201c || // left double quotation mark
55 ch
== 0x201d || // right double quotation mark
56 ch
== 0x201e; // double low-9 quotation mark
61 // Inheritance structure:
62 // Queries are represented as trees of QueryNodes.
63 // QueryNodes are either a collection of subnodes (a QueryNodeList)
64 // or a single word (a QueryNodeWord).
66 // A QueryNodeWord is a single word in the query.
67 class QueryNodeWord
: public QueryNode
{
69 explicit QueryNodeWord(const base::string16
& word
);
70 virtual ~QueryNodeWord();
72 const base::string16
& word() const { return word_
; }
74 void set_literal(bool literal
) { literal_
= literal
; }
77 virtual int AppendToSQLiteQuery(base::string16
* query
) const OVERRIDE
;
78 virtual bool IsWord() const OVERRIDE
;
79 virtual bool Matches(const base::string16
& word
, bool exact
) const OVERRIDE
;
80 virtual bool HasMatchIn(
81 const QueryWordVector
& words
,
82 Snippet::MatchPositions
* match_positions
) const OVERRIDE
;
83 virtual bool HasMatchIn(
84 const QueryWordVector
& words
) const OVERRIDE
;
85 virtual void AppendWords(std::vector
<base::string16
>* words
) const OVERRIDE
;
91 DISALLOW_COPY_AND_ASSIGN(QueryNodeWord
);
94 QueryNodeWord::QueryNodeWord(const base::string16
& word
)
98 QueryNodeWord::~QueryNodeWord() {}
100 int QueryNodeWord::AppendToSQLiteQuery(base::string16
* query
) const {
101 query
->append(word_
);
103 // Use prefix search if we're not literal and long enough.
104 if (!literal_
&& QueryParser::IsWordLongEnoughForPrefixSearch(word_
))
109 bool QueryNodeWord::IsWord() const {
113 bool QueryNodeWord::Matches(const base::string16
& word
, bool exact
) const {
114 if (exact
|| !QueryParser::IsWordLongEnoughForPrefixSearch(word_
))
115 return word
== word_
;
116 return word
.size() >= word_
.size() &&
117 (word_
.compare(0, word_
.size(), word
, 0, word_
.size()) == 0);
120 bool QueryNodeWord::HasMatchIn(const QueryWordVector
& words
,
121 Snippet::MatchPositions
* match_positions
) const {
122 bool matched
= false;
123 for (size_t i
= 0; i
< words
.size(); ++i
) {
124 if (Matches(words
[i
].word
, false)) {
125 size_t match_start
= words
[i
].position
;
126 match_positions
->push_back(
127 Snippet::MatchPosition(match_start
,
128 match_start
+ static_cast<int>(word_
.size())));
135 bool QueryNodeWord::HasMatchIn(const QueryWordVector
& words
) const {
136 for (size_t i
= 0; i
< words
.size(); ++i
) {
137 if (Matches(words
[i
].word
, false))
143 void QueryNodeWord::AppendWords(std::vector
<base::string16
>* words
) const {
144 words
->push_back(word_
);
147 // A QueryNodeList has a collection of QueryNodes which are deleted in the end.
148 class QueryNodeList
: public QueryNode
{
151 virtual ~QueryNodeList();
153 QueryNodeStarVector
* children() { return &children_
; }
155 void AddChild(QueryNode
* node
);
157 // Remove empty subnodes left over from other parsing.
158 void RemoveEmptySubnodes();
161 virtual int AppendToSQLiteQuery(base::string16
* query
) const OVERRIDE
;
162 virtual bool IsWord() const OVERRIDE
;
163 virtual bool Matches(const base::string16
& word
, bool exact
) const OVERRIDE
;
164 virtual bool HasMatchIn(
165 const QueryWordVector
& words
,
166 Snippet::MatchPositions
* match_positions
) const OVERRIDE
;
167 virtual bool HasMatchIn(const QueryWordVector
& words
) const OVERRIDE
;
168 virtual void AppendWords(std::vector
<base::string16
>* words
) const OVERRIDE
;
171 int AppendChildrenToString(base::string16
* query
) const;
173 QueryNodeStarVector children_
;
176 DISALLOW_COPY_AND_ASSIGN(QueryNodeList
);
179 QueryNodeList::QueryNodeList() {}
181 QueryNodeList::~QueryNodeList() {
182 STLDeleteElements(&children_
);
185 void QueryNodeList::AddChild(QueryNode
* node
) {
186 children_
.push_back(node
);
189 void QueryNodeList::RemoveEmptySubnodes() {
190 for (size_t i
= 0; i
< children_
.size(); ++i
) {
191 if (children_
[i
]->IsWord())
194 QueryNodeList
* list_node
= static_cast<QueryNodeList
*>(children_
[i
]);
195 list_node
->RemoveEmptySubnodes();
196 if (list_node
->children()->empty()) {
197 children_
.erase(children_
.begin() + i
);
204 int QueryNodeList::AppendToSQLiteQuery(base::string16
* query
) const {
205 return AppendChildrenToString(query
);
208 bool QueryNodeList::IsWord() const {
212 bool QueryNodeList::Matches(const base::string16
& word
, bool exact
) const {
217 bool QueryNodeList::HasMatchIn(const QueryWordVector
& words
,
218 Snippet::MatchPositions
* match_positions
) const {
223 bool QueryNodeList::HasMatchIn(const QueryWordVector
& words
) const {
228 void QueryNodeList::AppendWords(std::vector
<base::string16
>* words
) const {
229 for (size_t i
= 0; i
< children_
.size(); ++i
)
230 children_
[i
]->AppendWords(words
);
233 int QueryNodeList::AppendChildrenToString(base::string16
* query
) const {
235 for (QueryNodeStarVector::const_iterator node
= children_
.begin();
236 node
!= children_
.end(); ++node
) {
237 if (node
!= children_
.begin())
238 query
->push_back(L
' ');
239 num_words
+= (*node
)->AppendToSQLiteQuery(query
);
244 // A QueryNodePhrase is a phrase query ("quoted").
245 class QueryNodePhrase
: public QueryNodeList
{
248 virtual ~QueryNodePhrase();
251 virtual int AppendToSQLiteQuery(base::string16
* query
) const OVERRIDE
;
252 virtual bool HasMatchIn(
253 const QueryWordVector
& words
,
254 Snippet::MatchPositions
* match_positions
) const OVERRIDE
;
255 virtual bool HasMatchIn(const QueryWordVector
& words
) const OVERRIDE
;
258 bool MatchesAll(const QueryWordVector
& words
,
259 const QueryWord
** first_word
,
260 const QueryWord
** last_word
) const;
261 DISALLOW_COPY_AND_ASSIGN(QueryNodePhrase
);
264 QueryNodePhrase::QueryNodePhrase() {}
266 QueryNodePhrase::~QueryNodePhrase() {}
268 int QueryNodePhrase::AppendToSQLiteQuery(base::string16
* query
) const {
269 query
->push_back(L
'"');
270 int num_words
= AppendChildrenToString(query
);
271 query
->push_back(L
'"');
275 bool QueryNodePhrase::MatchesAll(const QueryWordVector
& words
,
276 const QueryWord
** first_word
,
277 const QueryWord
** last_word
) const {
278 if (words
.size() < children_
.size())
281 for (size_t i
= 0, max
= words
.size() - children_
.size() + 1; i
< max
; ++i
) {
282 bool matched_all
= true;
283 for (size_t j
= 0; j
< children_
.size(); ++j
) {
284 if (!children_
[j
]->Matches(words
[i
+ j
].word
, true)) {
290 *first_word
= &words
[i
];
291 *last_word
= &words
[i
+ children_
.size() - 1];
298 bool QueryNodePhrase::HasMatchIn(
299 const QueryWordVector
& words
,
300 Snippet::MatchPositions
* match_positions
) const {
301 const QueryWord
* first_word
;
302 const QueryWord
* last_word
;
304 if (MatchesAll(words
, &first_word
, &last_word
)) {
305 match_positions
->push_back(
306 Snippet::MatchPosition(first_word
->position
,
307 last_word
->position
+ last_word
->word
.length()));
313 bool QueryNodePhrase::HasMatchIn(const QueryWordVector
& words
) const {
314 const QueryWord
* first_word
;
315 const QueryWord
* last_word
;
316 return MatchesAll(words
, &first_word
, &last_word
);
319 QueryParser::QueryParser() {}
322 bool QueryParser::IsWordLongEnoughForPrefixSearch(const base::string16
& word
) {
323 DCHECK(!word
.empty());
324 size_t minimum_length
= 3;
325 // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
326 // because they 'behave like' Latin letters. Moreover, we should
327 // normalize the former before reaching here.
328 if (0xAC00 <= word
[0] && word
[0] <= 0xD7A3)
330 return word
.size() >= minimum_length
;
333 int QueryParser::ParseQuery(const base::string16
& query
,
334 base::string16
* sqlite_query
) {
336 if (!ParseQueryImpl(query
, &root
))
338 return root
.AppendToSQLiteQuery(sqlite_query
);
341 void QueryParser::ParseQueryWords(const base::string16
& query
,
342 std::vector
<base::string16
>* words
) {
344 if (!ParseQueryImpl(query
, &root
))
346 root
.AppendWords(words
);
349 void QueryParser::ParseQueryNodes(const base::string16
& query
,
350 QueryNodeStarVector
* nodes
) {
352 if (ParseQueryImpl(base::i18n::ToLower(query
), &root
))
353 nodes
->swap(*root
.children());
356 bool QueryParser::DoesQueryMatch(const base::string16
& text
,
357 const QueryNodeStarVector
& query_nodes
,
358 Snippet::MatchPositions
* match_positions
) {
359 if (query_nodes
.empty())
362 QueryWordVector query_words
;
363 base::string16 lower_text
= base::i18n::ToLower(text
);
364 ExtractQueryWords(lower_text
, &query_words
);
366 if (query_words
.empty())
369 Snippet::MatchPositions matches
;
370 for (size_t i
= 0; i
< query_nodes
.size(); ++i
) {
371 if (!query_nodes
[i
]->HasMatchIn(query_words
, &matches
))
374 if (lower_text
.length() != text
.length()) {
375 // The lower case string differs from the original string. The matches are
377 // TODO(sky): we need a better way to align the positions so that we don't
378 // completely punt here.
379 match_positions
->clear();
381 SortAndCoalesceMatchPositions(&matches
);
382 match_positions
->swap(matches
);
387 bool QueryParser::DoesQueryMatch(const QueryWordVector
& query_words
,
388 const QueryNodeStarVector
& query_nodes
) {
389 if (query_nodes
.empty() || query_words
.empty())
392 for (size_t i
= 0; i
< query_nodes
.size(); ++i
) {
393 if (!query_nodes
[i
]->HasMatchIn(query_words
))
399 bool QueryParser::ParseQueryImpl(const base::string16
& query
,
400 QueryNodeList
* root
) {
401 base::i18n::BreakIterator
iter(query
, base::i18n::BreakIterator::BREAK_WORD
);
402 // TODO(evanm): support a locale here
406 // To handle nesting, we maintain a stack of QueryNodeLists.
407 // The last element (back) of the stack contains the current, deepest node.
408 std::vector
<QueryNodeList
*> query_stack
;
409 query_stack
.push_back(root
);
411 bool in_quotes
= false; // whether we're currently in a quoted phrase
412 while (iter
.Advance()) {
413 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
414 // is not necessarily a word, but could also be a sequence of punctuation
417 QueryNodeWord
* word_node
= new QueryNodeWord(iter
.GetString());
419 word_node
->set_literal(true);
420 query_stack
.back()->AddChild(word_node
);
421 } else { // Punctuation.
422 if (IsQueryQuote(query
[iter
.prev()])) {
424 QueryNodeList
* quotes_node
= new QueryNodePhrase
;
425 query_stack
.back()->AddChild(quotes_node
);
426 query_stack
.push_back(quotes_node
);
429 query_stack
.pop_back(); // Stop adding to the quoted phrase.
436 root
->RemoveEmptySubnodes();
440 void QueryParser::ExtractQueryWords(const base::string16
& text
,
441 QueryWordVector
* words
) {
442 base::i18n::BreakIterator
iter(text
, base::i18n::BreakIterator::BREAK_WORD
);
443 // TODO(evanm): support a locale here
447 while (iter
.Advance()) {
448 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
449 // is not necessarily a word, but could also be a sequence of punctuation
452 base::string16 word
= iter
.GetString();
454 words
->push_back(QueryWord());
455 words
->back().word
= word
;
456 words
->back().position
= iter
.prev();
463 void QueryParser::SortAndCoalesceMatchPositions(
464 Snippet::MatchPositions
* matches
) {
465 std::sort(matches
->begin(), matches
->end(), &CompareMatchPosition
);
466 // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
468 for (size_t i
= 0; i
< matches
->size(); ++i
)
469 CoalesceMatchesFrom(i
, matches
);
472 } // namespace query_parser