2 * @brief Generate a sample from a UTF-8 string.
4 /* Copyright (C) 2007,2013 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 generate_sample(const string
& input
, size_t maxlen
,
33 const string
& ind
, const string
& ind2
)
37 // Reserve an appropriate amount of space to avoid repeated reallocation as
39 if (input
.size() <= maxlen
) {
40 output
.reserve(input
.size());
42 // Add 3 to allow for a 4-byte UTF-8 sequence being appended when
43 // output is maxlen - 1 bytes long.
44 output
.reserve(maxlen
+ 3);
47 size_t last_word_end
= 0;
49 Xapian::Utf8Iterator
i(input
);
50 for ( ; i
!= Xapian::Utf8Iterator(); ++i
) {
51 if (output
.size() >= maxlen
) {
52 // Need to truncate output.
53 if (last_word_end
<= maxlen
/ 2) {
54 // Handle maxlen < ind.size(): the unsigned maxlen - ind.size() would wrap around.
56 if (maxlen
< ind
.size()) {
59 // Monster word! We'll have to just split it.
60 output
.replace(maxlen
- ind
.size(), string::npos
, ind
);
63 output
.replace(last_word_end
, string::npos
, ind2
);
69 if (ch
<= ' ' || ch
== 0xa0) {
70 // FIXME: if all the whitespace characters between two words are
71 // 0xa0 (non-breaking space) then perhaps we should output 0xa0.
74 last_word_end
= output
.size();
80 Xapian::Unicode::append_utf8(output
, ch
);