1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/query_parser/snippet.h"
9 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversions.h"
12 #include "testing/gtest/include/gtest/gtest.h"
14 namespace query_parser
{
17 // A sample document to compute snippets of.
18 // The \x bits after the first "Google" are UTF-8 of U+2122 TRADE MARK SIGN,
19 // and are useful for verifying we don't screw up in UTF-8/UTF-16 conversion.
20 const char* kSampleDocument
= "Google\xe2\x84\xa2 Terms of Service "
22 "1. Your relationship with Google "
23 "1.1 Your use of Google's products, software, services and web sites "
24 "(referred to collectively as the \"Services\" in this document and excluding "
25 "any services provided to you by Google under a separate written agreement) "
26 "is subject to the terms of a legal agreement between you and Google. "
27 "\"Google\" means Google Inc., whose principal place of business is at 1600 "
28 "Amphitheatre Parkway, Mountain View, CA 94043, United States. This document "
29 "explains how the agreement is made up, and sets out some of the terms of "
33 // Thai sample taken from http://www.google.co.th/intl/th/privacy.html
34 // TODO(jungshik) : Add more samples (e.g. Hindi) after porting
35 // ICU 4.0's character iterator changes to our copy of ICU 3.8 to get
36 // grapheme clusters in Indic scripts handled more reasonably.
37 const char* kThaiSample
= "Google \xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87"
38 "\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7"
39 "\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
40 "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A"
41 "\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1"
42 "\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93"
43 "\xE0\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0\xB8\x9A"
44 "\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7"
45 "\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A"
46 "\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82"
47 "\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7"
48 "\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x82\xE0\xB9\x89"
49 "\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x94\xE0\xB8\xB1"
50 "\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7"
51 "\xE0\xB9\x82\xE0\xB8\x94\xE0\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1"
52 "\xE0\xB8\x84\xE0\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3"
53 "\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3\xE0\xB8\xA7"
54 "\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
55 "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A"
56 "\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5"
57 "\xE0\xB9\x88\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3"
58 "\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x88"
59 "\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0\xB9\x80"
60 "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A"
61 "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5"
62 "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4"
63 "\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD\xE0\xB8\xB7\xE0\xB9\x88"
64 "\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB"
65 "\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84"
66 "\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA"
67 "\xE0\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88"
68 "\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x9C\xE0\xB8\xB9"
69 "\xE0\xB9\x89\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94"
70 "\xE0\xB9\x89\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3"
71 "\xE0\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3"
72 "\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\x94"
73 "\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3"
74 "\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87"
75 "\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95"
76 "\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7\xE0\xB9\x89"
77 "\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89"
78 "\xE0\xB9\x80\xE0\xB8\xAB\xE0\xB8\xA1\xE0\xB8\xB2\xE0\xB8\xB0\xE0\xB8\xAA"
79 "\xE0\xB8\xB3\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x84"
80 "\xE0\xB8\xB8\xE0\xB8\x93";
82 // Comparator for sorting by the first element in a pair.
83 bool ComparePair1st(const Snippet::MatchPosition
& a
,
84 const Snippet::MatchPosition
& b
) {
85 return a
.first
< b
.first
;
88 // For testing, we'll compute the match positions manually instead of using
89 // sqlite's FTS matching. BuildSnippet returns the snippet for matching
90 // |query| against |document|. Matches are surrounded by "**".
91 base::string16
BuildSnippet(const std::string
& document
,
92 const std::string
& query
) {
93 // This function assumes that |document| does not contain
94 // any character for which lowercasing changes its length. Further,
95 // it's assumed that lowercasing only the ASCII-portion works for
96 // |document|. We need to add more test cases and change this function
97 // to be more generic depending on how we deal with 'folding for match'
99 const std::string document_folded
= StringToLowerASCII(std::string(document
));
101 std::vector
<std::string
> query_words
;
102 base::SplitString(query
, ' ', &query_words
);
104 // Manually construct match_positions of the document.
105 Snippet::MatchPositions match_positions
;
106 match_positions
.clear();
107 for (std::vector
<std::string
>::iterator qw
= query_words
.begin();
108 qw
!= query_words
.end(); ++qw
) {
109 // Insert all instances of this word into match_pairs.
111 while ((ofs
= document_folded
.find(*qw
, ofs
)) != std::string::npos
) {
112 match_positions
.push_back(std::make_pair(ofs
, ofs
+ qw
->size()));
116 // Sort match_positions in order of increasing offset.
117 std::sort(match_positions
.begin(), match_positions
.end(), ComparePair1st
);
119 // Compute the snippet.
121 snippet
.ComputeSnippet(match_positions
, document
);
123 // Now "highlight" all matches in the snippet with **.
124 base::string16 star_snippet
;
125 Snippet::MatchPositions::const_iterator match
;
127 for (match
= snippet
.matches().begin();
128 match
!= snippet
.matches().end(); ++match
) {
129 star_snippet
+= snippet
.text().substr(pos
, match
->first
- pos
);
130 star_snippet
+= base::UTF8ToUTF16("**");
131 star_snippet
+= snippet
.text().substr(match
->first
,
132 match
->second
- match
->first
);
133 star_snippet
+= base::UTF8ToUTF16("**");
136 star_snippet
+= snippet
.text().substr(pos
);
141 TEST(Snippets
, SimpleQuery
) {
142 ASSERT_EQ(" ... eferred to collectively as the \"Services\" in this "
143 "**document** and excluding any services provided to you by "
144 "Goo ... ... way, Mountain View, CA 94043, United States. This "
145 "**document** explains how the agreement is made up, and sets "
147 base::UTF16ToUTF8(BuildSnippet(kSampleDocument
, "document")));
150 // Test that two words that are near each other don't produce two elided bits.
151 TEST(Snippets
, NearbyWords
) {
152 ASSERT_EQ(" ... lace of business is at 1600 Amphitheatre Parkway, "
153 "**Mountain** **View**, CA 94043, United States. This "
154 "document explains ... ",
155 base::UTF16ToUTF8(BuildSnippet(kSampleDocument
, "mountain view")));
158 // The above tests already test that we get byte offsets correct, but here's
159 // one that gets the "TM" in its snippet.
160 TEST(Snippets
, UTF8
) {
161 ASSERT_EQ(" ... ogle\xe2\x84\xa2 Terms of Service Welcome to Google! "
162 "1. Your **relationship** with Google 1.1 Your use of Google's "
164 base::UTF16ToUTF8(BuildSnippet(kSampleDocument
, "relationship")));
167 TEST(Snippets
, ThaiUTF8
) {
168 // There are 3 instances of '\u0E43\u0E2B\u0E49'
169 // (\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89) in kThaiSample.
170 // The 1st is more than |kSniipetContext| graphemes away from the
171 // 2nd while the 2nd and 3rd are within that window. However, with
172 // the 2nd match added, the snippet goes over the size limit so that
173 // the snippet ends right before the 3rd match.
175 "\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7"
176 "\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8"
177 "\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8"
178 "\xA7\xE0\xB8\x99\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8"
179 "\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1\xE0\xB8\xB7\xE0"
180 "\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0"
181 "\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0"
182 "\xB8\x9A\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0"
183 "\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0"
184 "\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4\xE0"
185 "\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82\xE0\xB8\xAD\xE0"
186 "\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8"
187 "\xAD**\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89**\xE0\xB8\x82\xE0"
188 "\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0"
189 "\xB8\x94\xE0\xB8\xB1\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0"
190 "\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7\xE0\xB9\x82\xE0\xB8\x94\xE0"
191 "\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1\xE0\xB8\x84\xE0"
192 "\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3"
193 "\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3"
194 "\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD"
195 "\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88"
196 "\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84"
197 "\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88"
198 "\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3"
199 "\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1"
200 "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8"
201 "\xE0\xB8\x93\xE0\xB9\x80\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2"
202 "\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A ... ... \xE0\xB8\x82"
203 "\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5"
204 "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3"
205 "\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD"
206 "\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD"
207 "\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0"
208 "\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0"
209 "\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA\xE0"
210 "\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7"
211 "\xE0\xB9\x88\xE0\xB8\xAD**\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9"
212 "\x89**\xE0\xB8\x9C\xE0\xB8\xB9\xE0\xB9\x89\xE0\xB9\x83\xE0"
213 "\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94\xE0\xB9\x89\xE0"
214 "\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3\xE0"
215 "\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0"
216 "\xB8\xA3\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0"
217 "\xB9\x88\xE0\xB8\x94\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0"
218 "\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1"
219 "\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87\xE0\xB8\x9B"
220 "\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95"
221 "\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7"
222 "\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2",
223 base::UTF16ToUTF8(BuildSnippet(kThaiSample
,
224 "\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89")));
227 TEST(Snippets
, ExtractMatchPositions
) {
229 const std::string offsets_string
;
230 const size_t expected_match_count
;
231 const size_t expected_matches
[10];
233 { "0 0 1 2 0 0 4 1 0 0 1 5", 1, { 1, 6 } },
234 { "0 0 1 4 0 0 2 1", 1, { 1, 5 } },
235 { "0 0 4 1 0 0 2 1", 2, { 2, 3, 4, 5 } },
236 { "0 0 0 1", 1, { 0, 1 } },
237 { "0 0 0 1 0 0 0 2", 1, { 0, 2 } },
238 { "0 0 1 1 0 0 1 2", 1, { 1, 3 } },
239 { "0 0 1 2 0 0 4 3 0 0 3 1", 1, { 1, 7 } },
240 { "0 0 1 4 0 0 2 5", 1, { 1, 7 } },
241 { "0 0 1 2 0 0 1 1", 1, { 1, 3 } },
242 { "0 0 1 1 0 0 5 2 0 0 10 1 0 0 3 10", 2, { 1, 2, 3, 13 } },
244 for (size_t i
= 0; i
< ARRAYSIZE_UNSAFE(data
); ++i
) {
245 Snippet::MatchPositions matches
;
246 Snippet::ExtractMatchPositions(data
[i
].offsets_string
, "0", &matches
);
247 EXPECT_EQ(data
[i
].expected_match_count
, matches
.size());
248 for (size_t j
= 0; j
< data
[i
].expected_match_count
; ++j
) {
249 EXPECT_EQ(data
[i
].expected_matches
[2 * j
], matches
[j
].first
);
250 EXPECT_EQ(data
[i
].expected_matches
[2 * j
+ 1], matches
[j
].second
);
255 } // namespace query_parser