* es.po: the correct license for this file was supposed to be GPL v2-or-later, not...
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
blob664a7de8dc99c4b3794b8e051155d72209b23d2a
/*
 * $Id$
 *
 * Default wordbreaker module for full text indexing.
 *
 */
9 #include "sysdep.h"
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15 #include <pwd.h>
16 #include <errno.h>
17 #include <sys/types.h>
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 # include <sys/time.h>
25 # else
26 # include <time.h>
27 # endif
28 #endif
30 #include <sys/wait.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <limits.h>
34 #include <libcitadel.h>
35 #include "citadel.h"
36 #include "server.h"
37 #include "sysdep_decls.h"
38 #include "citserver.h"
39 #include "support.h"
40 #include "config.h"
41 #include "database.h"
42 #include "msgbase.h"
43 #include "control.h"
44 #include "ft_wordbreaker.h"
45 #include "crc16.h"
46 #include "ctdl_module.h"
/*
 * Noise words are not included in search indices.
 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
 * must also be changed, so that the index is rebuilt.
 */
/*
 * Noise-word lookup table: 26 buckets, each the head of a singly linked
 * list of noise_word entries.  initialize_noise_words() files every word
 * under (first character - 'a'); noise_word_cleanup() frees the lists.
 */
54 noise_word *noise_words[26];
/*
 * Words that are excluded from the full text index.
 *
 * Every entry must be lowercase and must begin with an ASCII letter:
 * initialize_noise_words() files each word under (first character - 'a').
 * Both the pointers and the strings they point to are read-only.
 */
static const char *const noise_words_init[] = {
	"about",
	"after",
	"also",
	"another",
	"because",
	"been",
	"before",
	"being",
	"between",
	"both",
	"came",
	"come",
	"could",
	"each",
	"from",
	"have",
	"here",
	"himself",
	"into",
	"like",
	"make",
	"many",
	"might",
	"more",
	"most",
	"much",
	"must",
	"never",
	"only",
	"other",
	"over",
	"said",
	"same",
	"should",
	"since",
	"some",
	"still",
	"such",
	"take",
	"than",
	"that",
	"their",
	"them",
	"then",
	"there",
	"these",
	"they",
	"this",
	"those",
	"through",
	"under",
	"very",
	"well",
	"were",
	"what",
	"where",
	"which",
	"while",
	"with",
	"would",
	"your"
};
121 void initialize_noise_words(void)
123 int i;
124 int len;
125 int ch;
126 noise_word *next;
128 memset (noise_words, 0, sizeof(noise_words));
130 for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
132 ch = noise_words_init[i][0] - 'a';
133 len = strlen(noise_words_init[i]);
135 next = malloc(sizeof(noise_word));
136 next->len = len;
137 next->word = strdup(noise_words_init[i]);
138 next->next = noise_words[ch];
139 noise_words[ch] = next;
144 void noise_word_cleanup(void)
146 int i;
147 noise_word *cur, *next;
149 CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
151 for (i = 0 ; i < 26 ; i++)
153 cur = noise_words[i];
154 while (cur)
156 next = cur->next;
157 free(cur->word);
158 free(cur);
159 cur = next;
/*
 * Comparison callback for qsort() over an array of int.
 *
 * Returns 1 if *rec1 is greater than *rec2, -1 if it is smaller and 0 if
 * both are equal, which yields an ascending sort.
 */
int intcmp(const void *rec1, const void *rec2) {
	const int lhs = *(const int *) rec1;
	const int rhs = *(const int *) rec2;

	/* Each comparison evaluates to 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs > rhs) - (lhs < rhs);
}
179 void wordbreaker(char *text, int *num_tokens, int **tokens) {
181 int wb_num_tokens = 0;
182 int wb_num_alloc = 0;
183 int *wb_tokens = NULL;
185 char *ptr;
186 char *word_start;
187 char *word_end;
188 char ch;
189 int word_len;
190 char word[256];
191 int i;
192 int word_crc;
193 noise_word *noise;
196 if (text == NULL) { /* no NULL text please */
197 *num_tokens = 0;
198 *tokens = NULL;
199 return;
202 if (text[0] == 0) { /* no empty text either */
203 *num_tokens = 0;
204 *tokens = NULL;
205 return;
208 ptr = text;
209 word_start = NULL;
210 while (*ptr) {
211 ch = *ptr;
212 if (isalnum(ch)) {
213 if (!word_start) {
214 word_start = ptr;
217 ++ptr;
218 ch = *ptr;
219 if ( (!isalnum(ch)) && (word_start) ) {
220 word_end = ptr;
221 // --word_end;
223 /* extract the word */
224 word_len = word_end - word_start;
225 if (word_len >= sizeof word) {
226 CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
227 safestrncpy(word, word_start, sizeof word);
228 word[(sizeof word) - 1] = 0;
230 else {
231 safestrncpy(word, word_start, word_len+1);
232 word[word_len] = 0;
234 word_start = NULL;
236 /* are we ok with the length? */
237 if ( (word_len >= WB_MIN)
238 && (word_len <= WB_MAX) ) {
239 for (i=0; i<word_len; ++i) {
240 word[i] = tolower(word[i]);
242 /* disqualify noise words */
243 noise = noise_words[(int) (word[0]-'a')];
244 while (noise)
246 if (noise->len == word_len)
248 if (!strcmp(word, noise->word))
250 word_len = 0;
251 break;
254 noise = noise->next;
256 if (word_len == 0)
257 continue;
259 word_crc = (int)
260 CalcCRC16Bytes(word_len, word);
262 ++wb_num_tokens;
263 if (wb_num_tokens > wb_num_alloc) {
264 wb_num_alloc += 512;
265 wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
267 wb_tokens[wb_num_tokens - 1] = word_crc;
272 /* sort and purge dups */
273 if (wb_num_tokens > 1) {
274 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
275 for (i=0; i<(wb_num_tokens-1); ++i) {
276 if (wb_tokens[i] == wb_tokens[i+1]) {
277 memmove(&wb_tokens[i], &wb_tokens[i+1],
278 ((wb_num_tokens - i - 1)*sizeof(int)));
279 --wb_num_tokens;
280 --i;
285 *num_tokens = wb_num_tokens;
286 *tokens = wb_tokens;