Add regression test for previous commit
[xapian.git] / xapian-core / languages / steminternal.cc
blob693fb613acc63d34eacb8856230d46f41dc1a956
1 /** @file
2 * @brief Base class for implementations of stemming algorithms
3 */
4 /* Derived from snowball's runtime/api.c:
6 * Copyright (c) 2001, Dr Martin Porter
7 * Copyright (c) 2004,2005, Richard Boulton
8 * Copyright (c) 2006,2007,2008,2009,2016 Olly Betts
9 * All rights reserved.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions are met:
14 * * Redistributions of source code must retain the above copyright notice,
15 * this list of conditions and the following disclaimer.
17 * * Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
21 * * Neither the name of the <ORGANIZATION> nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
37 /* Copyright (C) 2007,2010 Olly Betts
38 * Copyright (C) 2010 Evgeny Sizikov
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License as
42 * published by the Free Software Foundation; either version 2 of the
43 * License, or (at your option) any later version.
45 * This program is distributed in the hope that it will be useful,
46 * but WITHOUT ANY WARRANTY; without even the implied warranty of
47 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
48 * GNU General Public License for more details.
50 * You should have received a copy of the GNU General Public License
51 * along with this program; if not, write to the Free Software
52 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
55 #include <config.h>
57 #include "steminternal.h"
59 #include <xapian/error.h>
61 #include "omassert.h"
63 #include <cstdlib>
64 #include <cstring>
66 #include <string>
68 using namespace std;
70 namespace Xapian {
72 #define CREATE_SIZE 16
74 symbol *
75 SnowballStemImplementation::create_s()
77 void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
78 if (mem == NULL) throw std::bad_alloc();
79 symbol * p = reinterpret_cast<symbol*>(HEAD + static_cast<char *>(mem));
80 SET_CAPACITY(p, CREATE_SIZE);
81 SET_SIZE(p, 0);
82 return p;
86 new_c = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
87 if n +ve, or n characters backwards from p + c - 1 if n -ve. new_c is the new
88 value of the cursor c, or -1 on failure.
90 -- used to implement hop and next in the utf8 case.
93 int
94 SnowballStemImplementation::skip_utf8(const symbol * p, int c, int lb, int l, int n)
96 if (n >= 0) {
97 for (; n > 0; --n) {
98 if (c >= l) return -1;
99 if (p[c++] >= 0xC0) { /* 1100 0000 */
100 while (c < l) {
101 /* break unless p[c] is 10------ */
102 if (p[c] >> 6 != 2) break;
103 c++;
107 } else {
108 for (; n < 0; ++n) {
109 if (c <= lb) return -1;
110 if (p[--c] >= 0x80) { /* 1000 0000 */
111 while (c > lb) {
112 if (p[c] >= 0xC0) break; /* 1100 0000 */
113 c--;
118 return c;
122 /* Increase the size of the buffer pointed to by p to at least n symbols.
123 * If insufficient memory, throw std::bad_alloc().
125 symbol *
126 SnowballStemImplementation::increase_size(symbol * p, int n)
128 int new_size = n + 20;
129 void * mem = realloc(reinterpret_cast<char *>(p) - HEAD,
130 HEAD + (new_size + 1) * sizeof(symbol));
131 if (mem == NULL) {
132 throw std::bad_alloc();
134 symbol * q = reinterpret_cast<symbol*>(HEAD + static_cast<char *>(mem));
135 SET_CAPACITY(q, new_size);
136 return q;
140 StemImplementation::~StemImplementation() { }
142 SnowballStemImplementation::~SnowballStemImplementation()
144 lose_s(p);
147 string
148 SnowballStemImplementation::operator()(const string & word)
150 const symbol * s = reinterpret_cast<const symbol *>(word.data());
151 replace_s(0, l, word.size(), s);
152 c = 0;
153 if (stem() < 0) {
154 // FIXME: Is there a better choice of exception class?
155 throw Xapian::InternalError("stemming exception!");
157 return string(reinterpret_cast<const char *>(p), l);
160 /* Code for character groupings: utf8 cases */
162 int SnowballStemImplementation::get_utf8(int * slot) {
163 int b0, b1, b2;
164 int tmp = c;
165 if (tmp >= l) return 0;
166 b0 = p[tmp++];
167 if (b0 < 0xC0 || tmp == l) { /* 1100 0000 */
168 *slot = b0;
169 return 1;
171 b1 = p[tmp++] & 0x3F;
172 if (b0 < 0xE0 || tmp == l) { /* 1110 0000 */
173 *slot = (b0 & 0x1F) << 6 | b1;
174 return 2;
176 b2 = p[tmp++] & 0x3F;
177 if (b0 < 0xF0 || tmp == l) { /* 1111 0000 */
178 *slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
179 return 3;
181 *slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[tmp] & 0x3F);
182 return 4;
185 int SnowballStemImplementation::get_b_utf8(int * slot) {
186 int a, b;
187 int tmp = c;
188 if (tmp <= lb) return 0;
189 b = p[--tmp];
190 if (b < 0x80 || tmp == lb) { /* 1000 0000 */
191 *slot = b;
192 return 1;
194 a = b & 0x3F;
195 b = p[--tmp];
196 if (b >= 0xC0 || tmp == lb) { /* 1100 0000 */
197 *slot = (b & 0x1F) << 6 | a;
198 return 2;
200 a |= (b & 0x3F) << 6;
201 b = p[--tmp];
202 if (b >= 0xE0 || tmp == lb) { /* 1110 0000 */
203 *slot = (b & 0xF) << 12 | a;
204 return 3;
206 *slot = (p[--tmp] & 0xE) << 18 | (b & 0x3F) << 12 | a;
207 return 4;
211 SnowballStemImplementation::in_grouping_U(const unsigned char * s, int min,
212 int max, int repeat)
214 do {
215 int ch;
216 int w = get_utf8(&ch);
217 if (!w) return -1;
218 if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
219 return w;
220 c += w;
221 } while (repeat);
222 return 0;
226 SnowballStemImplementation::in_grouping_b_U(const unsigned char * s, int min,
227 int max, int repeat)
229 do {
230 int ch;
231 int w = get_b_utf8(&ch);
232 if (!w) return -1;
233 if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
234 return w;
235 c -= w;
236 } while (repeat);
237 return 0;
241 SnowballStemImplementation::out_grouping_U(const unsigned char * s, int min,
242 int max, int repeat)
244 do {
245 int ch;
246 int w = get_utf8(&ch);
247 if (!w) return -1;
248 if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
249 /* FIXME: try adding this so gopast in generated code is simpler: if (repeat == 2) c += w; */ return w;
250 c += w;
251 } while (repeat);
252 return 0;
256 SnowballStemImplementation::out_grouping_b_U(const unsigned char * s, int min,
257 int max, int repeat)
259 do {
260 int ch;
261 int w = get_b_utf8(&ch);
262 if (!w) return -1;
263 if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
264 return w;
265 c -= w;
266 } while (repeat);
267 return 0;
270 int SnowballStemImplementation::eq_s(int s_size, const symbol * s) {
271 if (l - c < s_size || memcmp(p + c, s, s_size * sizeof(symbol)) != 0)
272 return 0;
273 c += s_size;
274 return 1;
277 int SnowballStemImplementation::eq_s_b(int s_size, const symbol * s) {
278 if (c - lb < s_size || memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0)
279 return 0;
280 c -= s_size;
281 return 1;
285 SnowballStemImplementation::find_among(const symbol * pool,
286 const struct among * v, int v_size,
287 const unsigned char * fnum,
288 const among_function * f)
290 int i = 0;
291 int j = v_size;
293 const symbol * q = p + c;
294 int c_orig = c;
296 int common_i = 0;
297 int common_j = 0;
299 int first_key_inspected = 0;
301 while (1) {
302 int k = i + ((j - i) >> 1);
303 int diff = 0;
304 int common = common_i < common_j ? common_i : common_j; /* smaller */
305 const struct among * w = v + k;
306 for (int x = common; x < w->s_size; ++x) {
307 if (c_orig + common == l) { diff = -1; break; }
308 diff = q[common] - (pool + w->s)[x];
309 if (diff != 0) break;
310 ++common;
312 if (diff < 0) {
313 j = k;
314 common_j = common;
315 } else {
316 i = k;
317 common_i = common;
319 if (j - i <= 1) {
320 if (i > 0) break; /* v->s has been inspected */
321 if (j == i) break; /* only one item in v */
323 /* - but now we need to go round once more to get
324 v->s inspected. This looks messy, but is actually
325 the optimal approach. */
327 if (first_key_inspected) break;
328 first_key_inspected = 1;
331 while (1) {
332 const struct among * w = v + i;
333 if (common_i >= w->s_size) {
334 c = c_orig + w->s_size;
335 if (!fnum || !fnum[i]) return w->result;
337 int res = f[fnum[i] - 1](this);
338 c = c_orig + w->s_size;
339 if (res) return w->result;
342 i = w->substring_i;
343 if (i < 0) return 0;
347 /* find_among_b is for backwards processing. Same comments apply */
349 SnowballStemImplementation::find_among_b(const symbol * pool,
350 const struct among * v, int v_size,
351 const unsigned char * fnum,
352 const among_function * f)
354 int i = 0;
355 int j = v_size;
357 const symbol * q = p + c - 1;
358 int c_orig = c;
360 int common_i = 0;
361 int common_j = 0;
363 int first_key_inspected = 0;
365 while (1) {
366 int k = i + ((j - i) >> 1);
367 int diff = 0;
368 int common = common_i < common_j ? common_i : common_j;
369 const struct among * w = v + k;
370 for (int x = w->s_size - 1 - common; x >= 0; --x) {
371 if (c_orig - common == lb) { diff = -1; break; }
372 diff = q[- common] - (pool + w->s)[x];
373 if (diff != 0) break;
374 ++common;
376 if (diff < 0) { j = k; common_j = common; }
377 else { i = k; common_i = common; }
378 if (j - i <= 1) {
379 if (i > 0) break;
380 if (j == i) break;
381 if (first_key_inspected) break;
382 first_key_inspected = 1;
385 while (1) {
386 const struct among * w = v + i;
387 if (common_i >= w->s_size) {
388 c = c_orig - w->s_size;
389 if (!fnum || !fnum[i]) return w->result;
391 int res = f[fnum[i] - 1](this);
392 c = c_orig - w->s_size;
393 if (res) return w->result;
396 i = w->substring_i;
397 if (i < 0) return 0;
402 SnowballStemImplementation::replace_s(int c_bra, int c_ket, int s_size,
403 const symbol * s)
405 int adjustment;
406 int len;
407 Assert(p);
408 adjustment = s_size - (c_ket - c_bra);
409 len = SIZE(p);
410 if (adjustment != 0) {
411 if (adjustment + len > CAPACITY(p)) {
412 p = increase_size(p, adjustment + len);
414 memmove(p + c_ket + adjustment,
415 p + c_ket,
416 (len - c_ket) * sizeof(symbol));
417 SET_SIZE(p, adjustment + len);
418 l += adjustment;
419 if (c >= c_ket)
420 c += adjustment;
421 else if (c > c_bra)
422 c = c_bra;
424 if (s_size) memmove(p + c_bra, s, s_size * sizeof(symbol));
425 return adjustment;
428 int SnowballStemImplementation::slice_check() {
429 Assert(p);
430 if (bra < 0 || bra > ket || ket > l) {
431 #if 0
432 fprintf(stderr, "faulty slice operation:\n");
433 debug(z, -1, 0);
434 #endif
435 return -1;
437 return 0;
440 int SnowballStemImplementation::slice_from_s(int s_size, const symbol * s) {
441 if (slice_check()) return -1;
442 replace_s(bra, ket, s_size, s);
443 return 0;
446 void
447 SnowballStemImplementation::insert_s(int c_bra, int c_ket, int s_size,
448 const symbol * s)
450 int adjustment = replace_s(c_bra, c_ket, s_size, s);
451 if (c_bra <= bra) bra += adjustment;
452 if (c_bra <= ket) ket += adjustment;
455 symbol * SnowballStemImplementation::slice_to(symbol * v) {
456 if (slice_check()) return NULL;
458 int len = ket - bra;
459 if (CAPACITY(v) < len) {
460 v = increase_size(v, len);
462 memmove(v, p + bra, len * sizeof(symbol));
463 SET_SIZE(v, len);
465 return v;
468 symbol * SnowballStemImplementation::assign_to(symbol * v) {
469 int len = l;
470 if (CAPACITY(v) < len) {
471 v = increase_size(v, len);
473 memmove(v, p, len * sizeof(symbol));
474 SET_SIZE(v, len);
475 return v;
478 int SnowballStemImplementation::len_utf8(const symbol * v) {
479 int size = SIZE(v);
480 int len = 0;
481 while (size--) {
482 symbol b = *v++;
483 if (static_cast<signed char>(b) >= static_cast<signed char>(0xc0))
484 ++len;
486 return len;
489 #if 0
490 void SnowballStemImplementation::debug(int number, int line_count) {
491 int i;
492 int limit = SIZE(p);
493 if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count, limit);
494 for (i = 0; i <= limit; ++i) {
495 if (lb == i) printf("{");
496 if (bra == i) printf("[");
497 if (c == i) printf("|");
498 if (ket == i) printf("]");
499 if (l == i) printf("}");
500 if (i < limit) {
501 int ch = p[i];
502 if (ch == 0) ch = '#';
503 printf("%c", ch);
506 printf("'\n");
508 #endif