1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/query_parser/snippet.h"
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "third_party/icu/source/common/unicode/brkiter.h"
15 #include "third_party/icu/source/common/unicode/utext.h"
16 #include "third_party/icu/source/common/unicode/utf8.h"
18 namespace query_parser
{
21 bool PairFirstLessThan(const Snippet::MatchPosition
& a
,
22 const Snippet::MatchPosition
& b
) {
23 return a
.first
< b
.first
;
26 // Combines all pairs after offset in match_positions that are contained
27 // or touch the pair at offset.
28 void CoalescePositionsFrom(size_t offset
,
29 Snippet::MatchPositions
* match_positions
) {
30 DCHECK(offset
< match_positions
->size());
31 Snippet::MatchPosition
& pair((*match_positions
)[offset
]);
33 while (offset
< match_positions
->size() &&
34 pair
.second
>= (*match_positions
)[offset
].first
) {
35 pair
.second
= std::max(pair
.second
, (*match_positions
)[offset
].second
);
36 match_positions
->erase(match_positions
->begin() + offset
);
40 // Makes sure there is a pair in match_positions that contains the specified
41 // range. This keeps the pairs ordered in match_positions by first, and makes
42 // sure none of the pairs in match_positions touch each other.
43 void AddMatch(size_t start
,
45 Snippet::MatchPositions
* match_positions
) {
47 DCHECK(match_positions
);
48 Snippet::MatchPosition
pair(start
, end
);
49 if (match_positions
->empty()) {
50 match_positions
->push_back(pair
);
53 // There's at least one match. Find the position of the new match,
54 // potentially extending pairs around it.
55 Snippet::MatchPositions::iterator i
=
56 std::lower_bound(match_positions
->begin(), match_positions
->end(),
57 pair
, &PairFirstLessThan
);
58 if (i
!= match_positions
->end() && i
->first
== start
) {
59 // Match not at the end and there is already a pair with the same
61 if (end
> i
->second
) {
62 // New pair extends beyond existing pair. Extend existing pair and
63 // coalesce matches after it.
65 CoalescePositionsFrom(i
- match_positions
->begin(), match_positions
);
66 } // else case, new pair completely contained in existing pair, nothing
68 } else if (i
== match_positions
->begin()) {
69 // Match at the beginning and the first pair doesn't have the same
70 // start. Insert new pair and coalesce matches after it.
71 match_positions
->insert(i
, pair
);
72 CoalescePositionsFrom(0, match_positions
);
74 // Not at the beginning (but may be at the end).
76 if (start
<= i
->second
&& end
> i
->second
) {
77 // Previous element contains match. Extend it and coalesce.
79 CoalescePositionsFrom(i
- match_positions
->begin(), match_positions
);
80 } else if (end
> i
->second
) {
81 // Region doesn't touch previous element. See if region touches current
84 if (i
== match_positions
->end() || end
< i
->first
) {
85 match_positions
->insert(i
, pair
);
89 CoalescePositionsFrom(i
- match_positions
->begin(), match_positions
);
95 // Converts an index in a utf8 string into the index in the corresponding utf16
96 // string and returns the utf16 index. This is intended to be called in a loop
97 // iterating through a utf8 string.
99 // utf8_string: the utf8 string.
100 // utf8_length: length of the utf8 string.
101 // offset: the utf8 offset to convert.
102 // utf8_pos: current offset in the utf8 string. This is modified and on return
104 // wide_pos: current index in the wide string. This is the same as the return
106 size_t AdvanceAndReturnUTF16Pos(const char* utf8_string
,
111 DCHECK(offset
>= *utf8_pos
&& offset
<= utf8_length
);
114 while (*utf8_pos
< offset
) {
115 U8_NEXT(utf8_string
, *utf8_pos
, utf8_length
, wide_char
);
116 *utf16_pos
+= (wide_char
<= 0xFFFF) ? 1 : 2;
121 // Given a character break iterator over a UTF-8 string, set the iterator
122 // position to |*utf8_pos| and move by |count| characters. |count| can
123 // be either positive or negative.
124 void MoveByNGraphemes(icu::BreakIterator
* bi
, int count
, size_t* utf8_pos
) {
125 // Ignore the return value. A side effect of the current position
126 // being set at or following |*utf8_pos| is exploited here.
127 // It's simpler than calling following(n) and then previous().
128 // isBoundary() is not very fast, but should be good enough for the
129 // snippet generation. If not, revisit the way we scan in ComputeSnippet.
130 bi
->isBoundary(static_cast<int32_t>(*utf8_pos
));
132 *utf8_pos
= static_cast<size_t>(bi
->current());
135 // The amount of context to include for a given hit. Note that it's counted
136 // in terms of graphemes rather than bytes.
137 const int kSnippetContext
= 50;
139 // Returns true if next match falls within a snippet window
140 // from the previous match. The window size is counted in terms
141 // of graphemes rather than bytes in UTF-8.
142 bool IsNextMatchWithinSnippetWindow(icu::BreakIterator
* bi
,
143 size_t previous_match_end
,
144 size_t next_match_start
) {
145 // If it's within a window in terms of bytes, it's certain
146 // that it's within a window in terms of graphemes as well.
147 if (next_match_start
< previous_match_end
+ kSnippetContext
)
149 bi
->isBoundary(static_cast<int32_t>(previous_match_end
));
150 // An alternative to this is to call |bi->next()| at most
151 // kSnippetContext times, compare |bi->current()| with |next_match_start|
152 // after each call and return early if possible. There are other
153 // heuristics to speed things up if necessary, but it's not likely that
154 // we need to bother.
155 bi
->next(kSnippetContext
);
156 int64 current
= bi
->current();
157 return (next_match_start
< static_cast<uint64
>(current
) ||
158 current
== icu::BreakIterator::DONE
);
164 void Snippet::ExtractMatchPositions(const std::string
& offsets_str
,
165 const std::string
& column_num
,
166 MatchPositions
* match_positions
) {
167 DCHECK(match_positions
);
168 if (offsets_str
.empty())
170 std::vector
<std::string
> offsets
= base::SplitString(
171 offsets_str
, " ", base::TRIM_WHITESPACE
, base::SPLIT_WANT_ALL
);
172 // SQLite offsets are sets of four integers:
173 // column, query term, match offset, match length
174 // Matches within a string are marked by (start, end) pairs.
175 for (size_t i
= 0; i
< offsets
.size() - 3; i
+= 4) {
176 if (offsets
[i
] != column_num
)
178 const size_t start
= atoi(offsets
[i
+ 2].c_str());
179 const size_t end
= start
+ atoi(offsets
[i
+ 3].c_str());
180 // Switch to DCHECK after debugging http://crbug.com/15261.
182 AddMatch(start
, end
, match_positions
);
187 void Snippet::ConvertMatchPositionsToWide(
188 const std::string
& utf8_string
,
189 Snippet::MatchPositions
* match_positions
) {
190 DCHECK(match_positions
);
191 int32_t utf8_pos
= 0;
192 size_t utf16_pos
= 0;
193 const char* utf8_cstring
= utf8_string
.c_str();
194 const int32_t utf8_length
= static_cast<int32_t>(utf8_string
.size());
195 for (Snippet::MatchPositions::iterator i
= match_positions
->begin();
196 i
!= match_positions
->end(); ++i
) {
197 i
->first
= AdvanceAndReturnUTF16Pos(utf8_cstring
, utf8_length
,
198 static_cast<int32_t>(i
->first
),
199 &utf8_pos
, &utf16_pos
);
200 i
->second
= AdvanceAndReturnUTF16Pos(utf8_cstring
, utf8_length
,
201 static_cast<int32_t>(i
->second
),
202 &utf8_pos
, &utf16_pos
);
209 Snippet::~Snippet() {
212 void Snippet::ComputeSnippet(const MatchPositions
& match_positions
,
213 const std::string
& document
) {
214 // The length of snippets we try to produce.
215 // We can generate longer snippets but stop once we cross kSnippetMaxLength.
216 const size_t kSnippetMaxLength
= 200;
217 const base::string16 kEllipsis
= base::ASCIIToUTF16(" ... ");
219 UText
* document_utext
= NULL
;
220 UErrorCode status
= U_ZERO_ERROR
;
221 document_utext
= utext_openUTF8(document_utext
, document
.data(),
222 document
.size(), &status
);
223 // Locale does not matter because there's no per-locale customization
224 // for character iterator.
225 scoped_ptr
<icu::BreakIterator
> bi(icu::BreakIterator::createCharacterInstance(
226 icu::Locale::getDefault(), status
));
227 bi
->setText(document_utext
, status
);
228 DCHECK(U_SUCCESS(status
));
230 // We build the snippet by iterating through the matches and then grabbing
231 // context around each match. If matches are near enough each other (within
232 // kSnippetContext), we skip the "..." between them.
233 base::string16 snippet
;
235 for (size_t i
= 0; i
< match_positions
.size(); ++i
) {
236 // Some shorter names for the current match.
237 const size_t match_start
= match_positions
[i
].first
;
238 const size_t match_end
= match_positions
[i
].second
;
240 // Switch to DCHECK after debugging http://crbug.com/15261.
241 CHECK(match_end
> match_start
);
242 CHECK(match_end
<= document
.size());
244 // Add the context, if any, to show before the match.
245 size_t context_start
= match_start
;
246 MoveByNGraphemes(bi
.get(), -kSnippetContext
, &context_start
);
247 start
= std::max(start
, context_start
);
248 if (start
< match_start
) {
250 snippet
+= kEllipsis
;
251 // Switch to DCHECK after debugging http://crbug.com/15261.
252 CHECK(start
< document
.size());
253 snippet
+= base::UTF8ToUTF16(document
.substr(start
, match_start
- start
));
257 const size_t first
= snippet
.size();
258 snippet
+= base::UTF8ToUTF16(document
.substr(match_start
,
259 match_end
- match_start
));
260 matches_
.push_back(std::make_pair(first
, snippet
.size()));
262 // Compute the context, if any, to show after the match.
264 // Check if the next match falls within our snippet window.
265 if (i
+ 1 < match_positions
.size() &&
266 IsNextMatchWithinSnippetWindow(bi
.get(), match_end
,
267 match_positions
[i
+ 1].first
)) {
268 // Yes, it's within the window. Make the end context extend just up
269 // to the next match.
270 end
= match_positions
[i
+ 1].first
;
271 // Switch to DCHECK after debugging http://crbug.com/15261.
272 CHECK(end
>= match_end
);
273 CHECK(end
<= document
.size());
274 snippet
+= base::UTF8ToUTF16(document
.substr(match_end
, end
- match_end
));
276 // No, there's either no next match or the next match is too far away.
278 MoveByNGraphemes(bi
.get(), kSnippetContext
, &end
);
279 // Switch to DCHECK after debugging http://crbug.com/15261.
280 CHECK(end
>= match_end
);
281 CHECK(end
<= document
.size());
282 snippet
+= base::UTF8ToUTF16(document
.substr(match_end
, end
- match_end
));
283 if (end
< document
.size())
284 snippet
+= kEllipsis
;
288 // Stop here if we have enough snippet computed.
289 if (snippet
.size() >= kSnippetMaxLength
)
293 utext_close(document_utext
);
294 swap(text_
, snippet
);
297 void Snippet::Swap(Snippet
* other
) {
298 text_
.swap(other
->text_
);
299 matches_
.swap(other
->matches_
);
302 } // namespace query_parser