2 * @brief Base class for implementations of stemming algorithms
4 /* Derived from snowball's runtime/api.c:
6 * Copyright (c) 2001, Dr Martin Porter
7 * Copyright (c) 2004,2005, Richard Boulton
8 * Copyright (c) 2006,2007,2008,2009,2016 Olly Betts
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions are met:
14 * * Redistributions of source code must retain the above copyright notice,
15 * this list of conditions and the following disclaimer.
17 * * Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
21 * * Neither the name of the <ORGANIZATION> nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
37 /* Copyright (C) 2007,2010 Olly Betts
38 * Copyright (C) 2010 Evgeny Sizikov
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License as
42 * published by the Free Software Foundation; either version 2 of the
43 * License, or (at your option) any later version.
45 * This program is distributed in the hope that it will be useful,
46 * but WITHOUT ANY WARRANTY; without even the implied warranty of
47 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
48 * GNU General Public License for more details.
50 * You should have received a copy of the GNU General Public License
51 * along with this program; if not, write to the Free Software
52 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
57 #include "steminternal.h"
59 #include <xapian/error.h>
72 #define CREATE_SIZE 16
75 SnowballStemImplementation::create_s()
77 void * mem
= malloc(HEAD
+ (CREATE_SIZE
+ 1) * sizeof(symbol
));
78 if (mem
== NULL
) throw std::bad_alloc();
79 symbol
* p
= reinterpret_cast<symbol
*>(HEAD
+ static_cast<char *>(mem
));
80 SET_CAPACITY(p
, CREATE_SIZE
);
86 new_c = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
87 if n +ve, or n characters backwards from p + c - 1 if n -ve. new_c is the new
88 value of the cursor c, or -1 on failure.
90 -- used to implement hop and next in the utf8 case.
94 SnowballStemImplementation::skip_utf8(const symbol
* p
, int c
, int lb
, int l
, int n
)
98 if (c
>= l
) return -1;
99 if (p
[c
++] >= 0xC0) { /* 1100 0000 */
101 /* break unless p[c] is 10------ */
102 if (p
[c
] >> 6 != 2) break;
109 if (c
<= lb
) return -1;
110 if (p
[--c
] >= 0x80) { /* 1000 0000 */
112 if (p
[c
] >= 0xC0) break; /* 1100 0000 */
122 /* Increase the size of the buffer pointed to by p to at least n symbols.
123 * If insufficient memory, throw std::bad_alloc().
126 SnowballStemImplementation::increase_size(symbol
* p
, int n
)
128 int new_size
= n
+ 20;
129 void * mem
= realloc(reinterpret_cast<char *>(p
) - HEAD
,
130 HEAD
+ (new_size
+ 1) * sizeof(symbol
));
132 throw std::bad_alloc();
134 symbol
* q
= reinterpret_cast<symbol
*>(HEAD
+ static_cast<char *>(mem
));
135 SET_CAPACITY(q
, new_size
);
140 StemImplementation::~StemImplementation() { }
142 SnowballStemImplementation::~SnowballStemImplementation()
148 SnowballStemImplementation::operator()(const string
& word
)
150 const symbol
* s
= reinterpret_cast<const symbol
*>(word
.data());
151 replace_s(0, l
, word
.size(), s
);
154 // FIXME: Is there a better choice of exception class?
155 throw Xapian::InternalError("stemming exception!");
157 return string(reinterpret_cast<const char *>(p
), l
);
160 /* Code for character groupings: utf8 cases */
162 int SnowballStemImplementation::get_utf8(int * slot
) {
165 if (tmp
>= l
) return 0;
167 if (b0
< 0xC0 || tmp
== l
) { /* 1100 0000 */
171 b1
= p
[tmp
++] & 0x3F;
172 if (b0
< 0xE0 || tmp
== l
) { /* 1110 0000 */
173 *slot
= (b0
& 0x1F) << 6 | b1
;
176 b2
= p
[tmp
++] & 0x3F;
177 if (b0
< 0xF0 || tmp
== l
) { /* 1111 0000 */
178 *slot
= (b0
& 0xF) << 12 | b1
<< 6 | b2
;
181 *slot
= (b0
& 0xE) << 18 | b1
<< 12 | b2
<< 6 | (p
[tmp
] & 0x3F);
185 int SnowballStemImplementation::get_b_utf8(int * slot
) {
188 if (tmp
<= lb
) return 0;
190 if (b
< 0x80 || tmp
== lb
) { /* 1000 0000 */
196 if (b
>= 0xC0 || tmp
== lb
) { /* 1100 0000 */
197 *slot
= (b
& 0x1F) << 6 | a
;
200 a
|= (b
& 0x3F) << 6;
202 if (b
>= 0xE0 || tmp
== lb
) { /* 1110 0000 */
203 *slot
= (b
& 0xF) << 12 | a
;
206 *slot
= (p
[--tmp
] & 0xE) << 18 | (b
& 0x3F) << 12 | a
;
211 SnowballStemImplementation::in_grouping_U(const unsigned char * s
, int min
,
216 int w
= get_utf8(&ch
);
218 if (ch
> max
|| (ch
-= min
) < 0 || (s
[ch
>> 3] & (0X1 << (ch
& 0X7))) == 0)
226 SnowballStemImplementation::in_grouping_b_U(const unsigned char * s
, int min
,
231 int w
= get_b_utf8(&ch
);
233 if (ch
> max
|| (ch
-= min
) < 0 || (s
[ch
>> 3] & (0X1 << (ch
& 0X7))) == 0)
241 SnowballStemImplementation::out_grouping_U(const unsigned char * s
, int min
,
246 int w
= get_utf8(&ch
);
248 if (!(ch
> max
|| (ch
-= min
) < 0 || (s
[ch
>> 3] & (0X1 << (ch
& 0X7))) == 0))
249 /* FIXME: try adding this so gopast in generated code is simpler: if (repeat == 2) c += w; */ return w
;
256 SnowballStemImplementation::out_grouping_b_U(const unsigned char * s
, int min
,
261 int w
= get_b_utf8(&ch
);
263 if (!(ch
> max
|| (ch
-= min
) < 0 || (s
[ch
>> 3] & (0X1 << (ch
& 0X7))) == 0))
270 int SnowballStemImplementation::eq_s(int s_size
, const symbol
* s
) {
271 if (l
- c
< s_size
|| memcmp(p
+ c
, s
, s_size
* sizeof(symbol
)) != 0)
277 int SnowballStemImplementation::eq_s_b(int s_size
, const symbol
* s
) {
278 if (c
- lb
< s_size
|| memcmp(p
+ c
- s_size
, s
, s_size
* sizeof(symbol
)) != 0)
285 SnowballStemImplementation::find_among(const symbol
* pool
,
286 const struct among
* v
, int v_size
,
287 const unsigned char * fnum
,
288 const among_function
* f
)
293 const symbol
* q
= p
+ c
;
299 int first_key_inspected
= 0;
302 int k
= i
+ ((j
- i
) >> 1);
304 int common
= common_i
< common_j
? common_i
: common_j
; /* smaller */
305 const struct among
* w
= v
+ k
;
306 for (int x
= common
; x
< w
->s_size
; ++x
) {
307 if (c_orig
+ common
== l
) { diff
= -1; break; }
308 diff
= q
[common
] - (pool
+ w
->s
)[x
];
309 if (diff
!= 0) break;
320 if (i
> 0) break; /* v->s has been inspected */
321 if (j
== i
) break; /* only one item in v */
323 /* - but now we need to go round once more to get
324 v->s inspected. This looks messy, but is actually
325 the optimal approach. */
327 if (first_key_inspected
) break;
328 first_key_inspected
= 1;
332 const struct among
* w
= v
+ i
;
333 if (common_i
>= w
->s_size
) {
334 c
= c_orig
+ w
->s_size
;
335 if (!fnum
|| !fnum
[i
]) return w
->result
;
337 int res
= f
[fnum
[i
] - 1](this);
338 c
= c_orig
+ w
->s_size
;
339 if (res
) return w
->result
;
347 /* find_among_b is for backwards processing. Same comments apply */
349 SnowballStemImplementation::find_among_b(const symbol
* pool
,
350 const struct among
* v
, int v_size
,
351 const unsigned char * fnum
,
352 const among_function
* f
)
357 const symbol
* q
= p
+ c
- 1;
363 int first_key_inspected
= 0;
366 int k
= i
+ ((j
- i
) >> 1);
368 int common
= common_i
< common_j
? common_i
: common_j
;
369 const struct among
* w
= v
+ k
;
370 for (int x
= w
->s_size
- 1 - common
; x
>= 0; --x
) {
371 if (c_orig
- common
== lb
) { diff
= -1; break; }
372 diff
= q
[- common
] - (pool
+ w
->s
)[x
];
373 if (diff
!= 0) break;
376 if (diff
< 0) { j
= k
; common_j
= common
; }
377 else { i
= k
; common_i
= common
; }
381 if (first_key_inspected
) break;
382 first_key_inspected
= 1;
386 const struct among
* w
= v
+ i
;
387 if (common_i
>= w
->s_size
) {
388 c
= c_orig
- w
->s_size
;
389 if (!fnum
|| !fnum
[i
]) return w
->result
;
391 int res
= f
[fnum
[i
] - 1](this);
392 c
= c_orig
- w
->s_size
;
393 if (res
) return w
->result
;
402 SnowballStemImplementation::replace_s(int c_bra
, int c_ket
, int s_size
,
408 adjustment
= s_size
- (c_ket
- c_bra
);
410 if (adjustment
!= 0) {
411 if (adjustment
+ len
> CAPACITY(p
)) {
412 p
= increase_size(p
, adjustment
+ len
);
414 memmove(p
+ c_ket
+ adjustment
,
416 (len
- c_ket
) * sizeof(symbol
));
417 SET_SIZE(p
, adjustment
+ len
);
424 if (s_size
) memmove(p
+ c_bra
, s
, s_size
* sizeof(symbol
));
428 int SnowballStemImplementation::slice_check() {
430 if (bra
< 0 || bra
> ket
|| ket
> l
) {
432 fprintf(stderr
, "faulty slice operation:\n");
440 int SnowballStemImplementation::slice_from_s(int s_size
, const symbol
* s
) {
441 if (slice_check()) return -1;
442 replace_s(bra
, ket
, s_size
, s
);
447 SnowballStemImplementation::insert_s(int c_bra
, int c_ket
, int s_size
,
450 int adjustment
= replace_s(c_bra
, c_ket
, s_size
, s
);
451 if (c_bra
<= bra
) bra
+= adjustment
;
452 if (c_bra
<= ket
) ket
+= adjustment
;
455 symbol
* SnowballStemImplementation::slice_to(symbol
* v
) {
456 if (slice_check()) return NULL
;
459 if (CAPACITY(v
) < len
) {
460 v
= increase_size(v
, len
);
462 memmove(v
, p
+ bra
, len
* sizeof(symbol
));
468 symbol
* SnowballStemImplementation::assign_to(symbol
* v
) {
470 if (CAPACITY(v
) < len
) {
471 v
= increase_size(v
, len
);
473 memmove(v
, p
, len
* sizeof(symbol
));
478 int SnowballStemImplementation::len_utf8(const symbol
* v
) {
483 if (static_cast<signed char>(b
) >= static_cast<signed char>(0xc0))
490 void SnowballStemImplementation::debug(int number
, int line_count
) {
493 if (number
>= 0) printf("%3d (line %4d): [%d]'", number
, line_count
, limit
);
494 for (i
= 0; i
<= limit
; ++i
) {
495 if (lb
== i
) printf("{");
496 if (bra
== i
) printf("[");
497 if (c
== i
) printf("|");
498 if (ket
== i
) printf("]");
499 if (l
== i
) printf("}");
502 if (ch
== 0) ch
= '#';