4 * Default wordbreaker module for full text indexing.
17 #include <sys/types.h>
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
24 # include <sys/time.h>
34 #include <libcitadel.h>
37 #include "sysdep_decls.h"
38 #include "citserver.h"
44 #include "ft_wordbreaker.h"
46 #include "ctdl_module.h"
49 * Noise words are not included in search indices.
50 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
51 * must also be changed, so that the index is rebuilt.
/*
 * Table of noise-word list heads with 26 slots.  The code that fills and
 * reads it indexes a slot by (first character of the word - 'a'), so the
 * slots presumably correspond to the lowercase letters a..z.
 */
54 noise_word
*noise_words
[26];
/*
 * Source strings from which the per-slot noise-word lists are built by
 * initialize_noise_words().
 */
56 static char *noise_words_init
[] = {
/*
 * Build the noise-word lookup table from noise_words_init.
 *
 * The table is zeroed first; then, for every source string, a new
 * noise_word node is allocated and pushed onto the front of the list
 * selected by the string's first character.
 */
121 void initialize_noise_words(void)
/* Start from an all-zero (empty) table. */
128 memset (noise_words
, 0, sizeof(noise_words
));
/* One iteration per entry of noise_words_init. */
130 for (i
=0; i
<(sizeof(noise_words_init
)/sizeof(char *)); ++i
)
/*
 * Slot index derived from the first character.
 * NOTE(review): no range check is visible here; an entry that does not
 * start with 'a'..'z' would index outside the 26-slot table.
 */
132 ch
= noise_words_init
[i
][0] - 'a';
/*
 * Length of the word.  Where it is stored is not visible in this excerpt;
 * the lookup in wordbreaker() compares against a ->len field.
 */
133 len
= strlen(noise_words_init
[i
]);
/*
 * NOTE(review): the malloc() and strdup() results are used without a
 * NULL check in the visible lines.
 */
135 next
= malloc(sizeof(noise_word
));
137 next
->word
= strdup(noise_words_init
[i
]);
/* Push the new node onto the front of the slot's list. */
138 next
->next
= noise_words
[ch
];
139 noise_words
[ch
] = next
;
/*
 * Log a cleanup message, then visit each of the 26 noise-word table slots,
 * starting from the head of each slot's list.
 * NOTE(review): the per-entry work done on each list is not visible in this
 * excerpt -- presumably the nodes and their strings are released; confirm
 * against the full file.
 */
144 void noise_word_cleanup(void)
/* cur/next: cursors over one slot's list. */
147 noise_word
*cur
, *next
;
149 CtdlLogPrintf(CTDL_INFO
, "Cleaning up fulltext noise words.\n");
/* 26 matches the declared size of the noise_words table. */
151 for (i
= 0 ; i
< 26 ; i
++)
153 cur
= noise_words
[i
];
/*
 * Comparison callback for qsort() over arrays of int.
 *
 * rec1, rec2  pointers to the two int elements being compared.
 *
 * Returns 1 if *rec1 > *rec2, -1 if *rec1 < *rec2, and 0 if they are equal.
 * Explicit comparisons are used instead of subtraction so the result cannot
 * overflow for operands of opposite sign.
 */
int intcmp(const void *rec1, const void *rec2) {
	const int i1 = *(const int *)rec1;
	const int i2 = *(const int *)rec2;

	if (i1 > i2) return(1);
	if (i1 < i2) return(-1);
	return(0);	/* equal: every path must return a value for qsort() */
}
/*
 * Break `text` into words and build a sorted, de-duplicated array of
 * integer tokens, one per accepted word.
 *
 * text        input buffer to scan; NULL and empty input are special-cased.
 * num_tokens  out: receives the number of tokens produced.
 * tokens      out: NOTE(review) presumably receives the token array; that
 *             assignment is not visible in this excerpt.
 *
 * A word is accepted when its length lies in [WB_MIN, WB_MAX] and it does
 * not match an entry in the noise_words table.
 */
179 void wordbreaker(char *text
, int *num_tokens
, int **tokens
) {
/* Working copies of the result; the count is copied out at the end. */
181 int wb_num_tokens
= 0;
182 int wb_num_alloc
= 0;
183 int *wb_tokens
= NULL
;
196 if (text
== NULL
) { /* no NULL text please */
202 if (text
[0] == 0) { /* no empty text either */
/*
 * A non-alphanumeric character while a word is open ends that word.
 * NOTE(review): the type of ch is not visible; if it is a plain char,
 * <ctype.h> functions require the value to be cast to unsigned char.
 */
219 if ( (!isalnum(ch
)) && (word_start
) ) {
223 /* extract the word */
224 word_len
= word_end
- word_start
;
/*
 * Over-long words are logged and truncated to fit the local buffer.
 * NOTE(review): the declaration of word_len is not visible; it is printed
 * with %d here, so if it is an int this comparison against sizeof is
 * signed versus unsigned.
 */
225 if (word_len
>= sizeof word
) {
226 CtdlLogPrintf(CTDL_DEBUG
, "Invalid word length: %d\n", word_len
);
227 safestrncpy(word
, word_start
, sizeof word
);
228 word
[(sizeof word
) - 1] = 0;
/* Normal case: copy exactly word_len characters plus the terminator. */
231 safestrncpy(word
, word_start
, word_len
+1);
236 /* are we ok with the length? */
237 if ( (word_len
>= WB_MIN
)
238 && (word_len
<= WB_MAX
) ) {
/* Fold the word to lower case in place before lookup. */
239 for (i
=0; i
<word_len
; ++i
) {
240 word
[i
] = tolower(word
[i
]);
242 /* disqualify noise words */
/*
 * NOTE(review): the slot index is word[0]-'a' with no visible range check.
 * Word boundaries are decided with isalnum(), which also accepts digits, so
 * a word starting with a digit would index outside the 26-slot table.
 * Confirm whether a guard exists in the lines not shown here.
 */
243 noise
= noise_words
[(int) (word
[0]-'a')];
/* Cheap length comparison first, full string comparison only on a match. */
246 if (noise
->len
== word_len
)
248 if (!strcmp(word
, noise
->word
))
/* The value stored as the token below is word_crc. */
260 CalcCRC16Bytes(word_len
, word
);
/*
 * Grow the token array when the count exceeds the allocation.
 * NOTE(review): the realloc() result is assigned straight back to
 * wb_tokens and no NULL check is visible; on failure the original pointer
 * would be lost and the store below would dereference NULL.
 */
263 if (wb_num_tokens
> wb_num_alloc
) {
265 wb_tokens
= realloc(wb_tokens
, (sizeof(int) * wb_num_alloc
));
267 wb_tokens
[wb_num_tokens
- 1] = word_crc
;
272 /* sort and purge dups */
273 if (wb_num_tokens
> 1) {
274 qsort(wb_tokens
, wb_num_tokens
, sizeof(int), intcmp
);
/*
 * After sorting, equal tokens are adjacent; a duplicate is removed by
 * shifting the tail of the array down by one element.
 */
275 for (i
=0; i
<(wb_num_tokens
-1); ++i
) {
276 if (wb_tokens
[i
] == wb_tokens
[i
+1]) {
277 memmove(&wb_tokens
[i
], &wb_tokens
[i
+1],
278 ((wb_num_tokens
- i
- 1)*sizeof(int)));
/* Hand the final token count back to the caller. */
285 *num_tokens
= wb_num_tokens
;