[ci] Test Tcl bindings for dragonfly/freebsd
[xapian.git] / xapian-applications / omega / sample.cc
blob4475e6756b017cc03cd8cc60dadaeffb3793932c
1 /** @file
2 * @brief Generate a sample from a UTF-8 string.
3 */
4 /* Copyright (C) 2007,2013 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include <xapian.h>
25 #include "sample.h"
27 #include <string>
29 using namespace std;
31 string
32 generate_sample(const string & input, size_t maxlen,
33 const string & ind, const string & ind2)
35 string output;
37 // Reserve an appropriate amount of space to repeated reallocation as
38 // output grows.
39 if (input.size() <= maxlen) {
40 output.reserve(input.size());
41 } else {
42 // Add 3 to allow for a 4 byte utf-8 sequence being appended when
43 // output is maxlen - 1 bytes long.
44 output.reserve(maxlen + 3);
47 size_t last_word_end = 0;
48 bool in_space = true;
49 Xapian::Utf8Iterator i(input);
50 for ( ; i != Xapian::Utf8Iterator(); ++i) {
51 if (output.size() >= maxlen) {
52 // Need to truncate output.
53 if (last_word_end <= maxlen / 2) {
54 // Fixed when maxlen < ind.size leading to a negative
55 // reference
56 if (maxlen < ind.size()) {
57 output.resize(0);
58 } else {
59 // Monster word! We'll have to just split it.
60 output.replace(maxlen - ind.size(), string::npos, ind);
62 } else {
63 output.replace(last_word_end, string::npos, ind2);
65 break;
68 unsigned ch = *i;
69 if (ch <= ' ' || ch == 0xa0) {
70 // FIXME: if all the whitespace characters between two words are
71 // 0xa0 (non-breaking space) then perhaps we should output 0xa0.
72 if (!in_space) {
73 in_space = true;
74 last_word_end = output.size();
75 output += ' ';
77 continue;
80 Xapian::Unicode::append_utf8(output, ch);
81 in_space = false;
84 return output;