build: skip checking for networks source directories
[mldonkey.git] / src / utils / lib / index.c
blob246ecc880bc14710570627a50ce23c9ba6511194
1 /* Copyright 2001, 2002 b8_bavard, b8_fee_carabine, INRIA */
2 /*
3 This file is part of mldonkey.
5 mldonkey is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
10 mldonkey is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with mldonkey; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 #include <stdlib.h>
20 #include <string.h>
22 typedef char (*converter)(char c);
24 void* index_alloc(int size)
26 return malloc(size);
29 void index_free(void *ptr)
31 free(ptr);
34 typedef enum { WORD_ARRAY, WORD_TREE, WORD_HITS } tree_type;
36 typedef struct tree_struct tree_struct;
37 typedef struct word_struct word_struct;
38 typedef struct hits_struct hits_struct;
39 typedef struct array_struct array_struct;
40 typedef struct suffix_struct suffix_struct;
41 typedef struct index_struct index_struct;
43 struct hits_struct {
44 int hits_type;
45 int hits_nbr;
46 int hits_words[0];
49 struct tree_struct {
50 int tree_type;
51 int tree_word; /* the word corresponding to the
52 current position in the tree */
53 tree_struct* tree_subtrees[0];
56 typedef struct suffix_struct {
57 char *suffix_word;
58 int suffix_num;
61 struct array_struct {
62 int array_type;
63 int array_word;
64 int array_nprefixes;
65 int array_nsuffixes;
66 suffix_struct array_suffixes[0];
69 struct word_struct {
70 int word_num;
71 word_struct *word_other;
72 char *word_string;
75 struct index_struct {
76 word_struct **dict; /* word index --> position dans words */
77 int next_word;
78 int dict_len;
80 tree_struct *tree;
82 int range;
83 converter f;
86 #define DICT_INITIAL_SIZE 16
87 #define WORD_INITIAL_SIZE 30000
88 #define MAX_LINEAR_SEARCH 20
89 #define MAX_WORD_SIZE 100
91 static char stemmed_word[MAX_WORD_SIZE];
92 static int word_len;
95 index_struct *create_index(int range, converter f)
97 index_struct *idx = index_alloc(sizeof(index_struct));
99 idx->dict = index_alloc(sizeof(word_struct*) * DICT_INITIAL_SIZE);
100 idx->next_word = 0;
101 idx->dict_len = DICT_INITIAL_SIZE;
103 idx->tree = NULL;
105 idx->range = range;
106 idx->f = f;
108 return idx;
112 static int add_new_word(index_struct *idx, char *word)
114 int word_len = strlen(word);
115 int w;
116 int left;
117 char *word_pos;
118 word_struct* ws;
120 if(idx->next_word == idx->dict_len){
121 /* must reallocate the dictionnary */
122 int old_size = idx->dict_len;
123 int new_size = old_size * 2;
124 word_struct** new_dict = index_alloc(sizeof(word_struct*) * new_size);
126 memcpy(new_dict, idx->dict, sizeof(word_struct*) * old_size);
127 index_free(idx->dict);
128 idx->dict = new_dict;
129 idx->dict_len = new_size;
131 w = idx->next_word;
132 idx->next_word++;
134 ws = index_alloc(sizeof(word_struct) + word_len + 1);
135 idx->dict[w] = ws;
136 strncpy(ws->word_string, word, word_len);
137 ws->word_string[word_len] = 0;
139 return w;
143 static int add_stemmed_word(index_struct *idx){
144 return add_new_word(idx, stemmed_word);
147 static array_struct *alloc_array(index_struct *idx)
149 array_struct *array= index_alloc(sizeof(tree_struct) +
150 2 * MAX_LINEAR_SEARCH * sizeof(suffix_struct));
151 array->array_type = WORD_ARRAY;
152 array->array_word = -1;
153 array->array_nprefixes = 0;
154 array->array_nsuffixes = 0;
156 return array;
159 static int add_prefix_in_tree(index_struct *idx, char *suffix,
160 tree_struct *tree, tree_struct **tree_ptr)
162 if(tree == NULL){
163 tree = (tree_struct*) alloc_array(idx);
164 *tree_ptr = tree;
167 if(suffix[0] == 0) {
168 if(tree->tree_word == (-1)) tree->tree_word = add_stemmed_word(idx);
169 return tree->tree_word;
172 if(tree->tree_type == WORD_TREE){
174 tree_struct * new_tree = tree->tree_subtrees[ (int) suffix[0]];
175 return add_prefix_in_tree(idx, suffix+1, new_tree,
176 & (tree->tree_subtrees[(int) suffix[0]]));
178 } else {
179 array_struct * array = (array_struct*) tree;
181 int nprefixes = array->array_nprefixes;
182 int suffix_len = strlen(suffix);
183 int i;
185 for(i=0; i<nprefixes; i++){
186 char *word = array->array_suffixes[i].suffix_word;
187 if(!strcmp(suffix, word)){
188 return array->array_suffixes[i].suffix_num;
192 /* not found */
193 if(nprefixes == MAX_LINEAR_SEARCH){
194 int size = idx->range * sizeof(tree_struct*);
195 int nsuffixes = array->array_nsuffixes;
196 tree_struct *new_tree = index_alloc(sizeof(tree_struct) + size);
198 bzero(new_tree->tree_subtrees, size);
199 new_tree->tree_type = WORD_TREE;
200 new_tree->tree_word = -1;
202 *tree_ptr = new_tree;
204 for(i=0; i<nprefixes; i++){
205 char *word = array->array_suffixes[i].suffix_word;
206 int num = array->array_suffixes[i].suffix_num;
208 tree_struct *next_tree = new_tree->tree_subtrees[(int)word[0]];
209 array_struct *nr = (array_struct*) next_tree;
211 if(nr == NULL){
212 nr = alloc_array(idx);
213 new_tree->tree_subtrees[(int) word[0]] = (tree_struct*) nr;
215 nr->array_suffixes[nr->array_nprefixes].suffix_word = word+1;
216 nr->array_suffixes[nr->array_nprefixes].suffix_num = num;
217 nr->array_nprefixes++;
220 for(i=0; i<nsuffixes; i++){
221 char *word = array->array_suffixes[MAX_LINEAR_SEARCH+i].suffix_word;
222 int num = array->array_suffixes[i].suffix_num;
224 tree_struct *next_tree = new_tree->tree_subtrees[(int)word[0]];
225 array_struct *nr = (array_struct*) next_tree;
227 if(nr == NULL){
228 nr = alloc_array(idx);
229 new_tree->tree_subtrees[(int) word[0]] = (tree_struct*) nr;
231 nr->array_suffixes[MAX_LINEAR_SEARCH+nr->array_nsuffixes].suffix_word = word+1;
232 nr->array_suffixes[MAX_LINEAR_SEARCH+nr->array_nsuffixes].suffix_num = num;
233 nr->array_nsuffixes++;
236 index_free(array);
237 return add_prefix_in_tree(idx, suffix+1,
238 new_tree->tree_subtrees[(int) suffix[0]],
239 & new_tree->tree_subtrees[(int) suffix[0]]);
240 } else {
241 int w = add_stemmed_word(idx);
242 array->array_nprefixes = nprefixes+1;
243 array->array_suffixes[nprefixes].suffix_num = w;
244 array->array_suffixes[nprefixes].suffix_word =
245 idx->dict[w]->word_string + word_len - suffix_len;
246 return w;
252 int add_word(index_struct *idx, char *word)
254 word_len = strlen(word);
255 int i;
256 int w;
257 int ww;
259 for(i = 0; i < word_len; i++)
260 stemmed_word[i] = idx->f(word[i]);
261 stemmed_word[word_len] = 0;
263 w = add_prefix_in_tree(idx, stemmed_word, idx->tree,& idx->tree);
264 if(!strcmp(stemmed_word, word)) return w;
266 /* the stemmed word is not the current word */
267 ww = add_new_word(idx, word);
269 idx->dict[ww]->word_other = idx->dict[w]->word_other;
270 idx->dict[w]->word_other = idx->dict[w];
272 return ww;
276 "toto" ---> 464 --> a,b,c,...
279 char *get_word(index_struct *idx, int w)
281 return idx->dict[w]->word_string;
284 static int nhits;
285 static int hits[MAX_LINEAR_SEARCH];
287 static void add_hit(int num)
289 hits[nhits++] = num;
292 static tree_struct*
293 find_in_tree(index_struct *idx, char *suffix, tree_struct *tree)
295 if(tree == NULL) return NULL;
297 if(suffix[0] == 0) return tree;
299 if(tree->tree_type == WORD_TREE){
301 tree_struct * new_tree = tree->tree_subtrees[ (int) suffix[0]];
302 return find_in_tree(idx, suffix+1, new_tree);
304 } else {
305 array_struct * array = (array_struct*) tree;
307 int nprefixes = array->array_nprefixes;
308 int nsuffixes = array->array_nsuffixes;
309 int suffix_len = strlen(suffix);
310 int i;
311 hits_struct *all_hits;
313 nhits = 0;
315 for(i=0; i<nprefixes; i++){
316 char *word = array->array_suffixes[i].suffix_word;
317 if(!strncmp(suffix, word, suffix_len)){
318 add_hit(array->array_suffixes[i].suffix_num);
321 for(i=0; i<nsuffixes; i++){
322 char *word = array->array_suffixes[MAX_LINEAR_SEARCH+i].suffix_word;
323 if(!strncmp(suffix, word, suffix_len)){
324 add_hit(array->array_suffixes[MAX_LINEAR_SEARCH+i].suffix_num);
327 if(nhits == 0) return NULL;
328 all_hits = index_alloc(sizeof(hits_struct) + sizeof(int) * nhits);
329 all_hits->hits_type = WORD_HITS;
330 all_hits->hits_nbr = nhits;
331 memcpy(all_hits->hits_words, hits, nhits * sizeof(int));
332 return (tree_struct*) all_hits;
337 tree_struct *find_word(index_struct *idx, char *word)
339 word_len = strlen(word);
340 int i;
342 for(i = 0; i < word_len; i++)
343 stemmed_word[i] = idx->f(word[i]);
344 stemmed_word[word_len] = 0;
346 return find_in_tree(idx, stemmed_word, idx->tree);