1 /* Copyright 2001, 2002 b8_bavard, b8_fee_carabine, INRIA */
3 This file is part of mldonkey.
5 mldonkey is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
10 mldonkey is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with mldonkey; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 typedef char (*converter
)(char c
);
24 void* index_alloc(int size
)
29 void index_free(void *ptr
)
34 typedef enum { WORD_ARRAY
, WORD_TREE
, WORD_HITS
} tree_type
;
36 typedef struct tree_struct tree_struct
;
37 typedef struct word_struct word_struct
;
38 typedef struct hits_struct hits_struct
;
39 typedef struct array_struct array_struct
;
40 typedef struct suffix_struct suffix_struct
;
41 typedef struct index_struct index_struct
;
51 int tree_word
; /* the word corresponding to the
52 current position in the tree */
53 tree_struct
* tree_subtrees
[0];
56 typedef struct suffix_struct
{
66 suffix_struct array_suffixes
[0];
71 word_struct
*word_other
;
76 word_struct
**dict
; /* word index --> position dans words */
86 #define DICT_INITIAL_SIZE 16
87 #define WORD_INITIAL_SIZE 30000
88 #define MAX_LINEAR_SEARCH 20
89 #define MAX_WORD_SIZE 100
91 static char stemmed_word
[MAX_WORD_SIZE
];
95 index_struct
*create_index(int range
, converter f
)
97 index_struct
*idx
= index_alloc(sizeof(index_struct
));
99 idx
->dict
= index_alloc(sizeof(word_struct
*) * DICT_INITIAL_SIZE
);
101 idx
->dict_len
= DICT_INITIAL_SIZE
;
112 static int add_new_word(index_struct
*idx
, char *word
)
114 int word_len
= strlen(word
);
120 if(idx
->next_word
== idx
->dict_len
){
121 /* must reallocate the dictionnary */
122 int old_size
= idx
->dict_len
;
123 int new_size
= old_size
* 2;
124 word_struct
** new_dict
= index_alloc(sizeof(word_struct
*) * new_size
);
126 memcpy(new_dict
, idx
->dict
, sizeof(word_struct
*) * old_size
);
127 index_free(idx
->dict
);
128 idx
->dict
= new_dict
;
129 idx
->dict_len
= new_size
;
134 ws
= index_alloc(sizeof(word_struct
) + word_len
+ 1);
136 strncpy(ws
->word_string
, word
, word_len
);
137 ws
->word_string
[word_len
] = 0;
143 static int add_stemmed_word(index_struct
*idx
){
144 return add_new_word(idx
, stemmed_word
);
147 static array_struct
*alloc_array(index_struct
*idx
)
149 array_struct
*array
= index_alloc(sizeof(tree_struct
) +
150 2 * MAX_LINEAR_SEARCH
* sizeof(suffix_struct
));
151 array
->array_type
= WORD_ARRAY
;
152 array
->array_word
= -1;
153 array
->array_nprefixes
= 0;
154 array
->array_nsuffixes
= 0;
159 static int add_prefix_in_tree(index_struct
*idx
, char *suffix
,
160 tree_struct
*tree
, tree_struct
**tree_ptr
)
163 tree
= (tree_struct
*) alloc_array(idx
);
168 if(tree
->tree_word
== (-1)) tree
->tree_word
= add_stemmed_word(idx
);
169 return tree
->tree_word
;
172 if(tree
->tree_type
== WORD_TREE
){
174 tree_struct
* new_tree
= tree
->tree_subtrees
[ (int) suffix
[0]];
175 return add_prefix_in_tree(idx
, suffix
+1, new_tree
,
176 & (tree
->tree_subtrees
[(int) suffix
[0]]));
179 array_struct
* array
= (array_struct
*) tree
;
181 int nprefixes
= array
->array_nprefixes
;
182 int suffix_len
= strlen(suffix
);
185 for(i
=0; i
<nprefixes
; i
++){
186 char *word
= array
->array_suffixes
[i
].suffix_word
;
187 if(!strcmp(suffix
, word
)){
188 return array
->array_suffixes
[i
].suffix_num
;
193 if(nprefixes
== MAX_LINEAR_SEARCH
){
194 int size
= idx
->range
* sizeof(tree_struct
*);
195 int nsuffixes
= array
->array_nsuffixes
;
196 tree_struct
*new_tree
= index_alloc(sizeof(tree_struct
) + size
);
198 bzero(new_tree
->tree_subtrees
, size
);
199 new_tree
->tree_type
= WORD_TREE
;
200 new_tree
->tree_word
= -1;
202 *tree_ptr
= new_tree
;
204 for(i
=0; i
<nprefixes
; i
++){
205 char *word
= array
->array_suffixes
[i
].suffix_word
;
206 int num
= array
->array_suffixes
[i
].suffix_num
;
208 tree_struct
*next_tree
= new_tree
->tree_subtrees
[(int)word
[0]];
209 array_struct
*nr
= (array_struct
*) next_tree
;
212 nr
= alloc_array(idx
);
213 new_tree
->tree_subtrees
[(int) word
[0]] = (tree_struct
*) nr
;
215 nr
->array_suffixes
[nr
->array_nprefixes
].suffix_word
= word
+1;
216 nr
->array_suffixes
[nr
->array_nprefixes
].suffix_num
= num
;
217 nr
->array_nprefixes
++;
220 for(i
=0; i
<nsuffixes
; i
++){
221 char *word
= array
->array_suffixes
[MAX_LINEAR_SEARCH
+i
].suffix_word
;
222 int num
= array
->array_suffixes
[i
].suffix_num
;
224 tree_struct
*next_tree
= new_tree
->tree_subtrees
[(int)word
[0]];
225 array_struct
*nr
= (array_struct
*) next_tree
;
228 nr
= alloc_array(idx
);
229 new_tree
->tree_subtrees
[(int) word
[0]] = (tree_struct
*) nr
;
231 nr
->array_suffixes
[MAX_LINEAR_SEARCH
+nr
->array_nsuffixes
].suffix_word
= word
+1;
232 nr
->array_suffixes
[MAX_LINEAR_SEARCH
+nr
->array_nsuffixes
].suffix_num
= num
;
233 nr
->array_nsuffixes
++;
237 return add_prefix_in_tree(idx
, suffix
+1,
238 new_tree
->tree_subtrees
[(int) suffix
[0]],
239 & new_tree
->tree_subtrees
[(int) suffix
[0]]);
241 int w
= add_stemmed_word(idx
);
242 array
->array_nprefixes
= nprefixes
+1;
243 array
->array_suffixes
[nprefixes
].suffix_num
= w
;
244 array
->array_suffixes
[nprefixes
].suffix_word
=
245 idx
->dict
[w
]->word_string
+ word_len
- suffix_len
;
252 int add_word(index_struct
*idx
, char *word
)
254 word_len
= strlen(word
);
259 for(i
= 0; i
< word_len
; i
++)
260 stemmed_word
[i
] = idx
->f(word
[i
]);
261 stemmed_word
[word_len
] = 0;
263 w
= add_prefix_in_tree(idx
, stemmed_word
, idx
->tree
,& idx
->tree
);
264 if(!strcmp(stemmed_word
, word
)) return w
;
266 /* the stemmed word is not the current word */
267 ww
= add_new_word(idx
, word
);
269 idx
->dict
[ww
]->word_other
= idx
->dict
[w
]->word_other
;
270 idx
->dict
[w
]->word_other
= idx
->dict
[w
];
276 "toto" ---> 464 --> a,b,c,...
279 char *get_word(index_struct
*idx
, int w
)
281 return idx
->dict
[w
]->word_string
;
285 static int hits
[MAX_LINEAR_SEARCH
];
287 static void add_hit(int num
)
293 find_in_tree(index_struct
*idx
, char *suffix
, tree_struct
*tree
)
295 if(tree
== NULL
) return NULL
;
297 if(suffix
[0] == 0) return tree
;
299 if(tree
->tree_type
== WORD_TREE
){
301 tree_struct
* new_tree
= tree
->tree_subtrees
[ (int) suffix
[0]];
302 return find_in_tree(idx
, suffix
+1, new_tree
);
305 array_struct
* array
= (array_struct
*) tree
;
307 int nprefixes
= array
->array_nprefixes
;
308 int nsuffixes
= array
->array_nsuffixes
;
309 int suffix_len
= strlen(suffix
);
311 hits_struct
*all_hits
;
315 for(i
=0; i
<nprefixes
; i
++){
316 char *word
= array
->array_suffixes
[i
].suffix_word
;
317 if(!strncmp(suffix
, word
, suffix_len
)){
318 add_hit(array
->array_suffixes
[i
].suffix_num
);
321 for(i
=0; i
<nsuffixes
; i
++){
322 char *word
= array
->array_suffixes
[MAX_LINEAR_SEARCH
+i
].suffix_word
;
323 if(!strncmp(suffix
, word
, suffix_len
)){
324 add_hit(array
->array_suffixes
[MAX_LINEAR_SEARCH
+i
].suffix_num
);
327 if(nhits
== 0) return NULL
;
328 all_hits
= index_alloc(sizeof(hits_struct
) + sizeof(int) * nhits
);
329 all_hits
->hits_type
= WORD_HITS
;
330 all_hits
->hits_nbr
= nhits
;
331 memcpy(all_hits
->hits_words
, hits
, nhits
* sizeof(int));
332 return (tree_struct
*) all_hits
;
337 tree_struct
*find_word(index_struct
*idx
, char *word
)
339 word_len
= strlen(word
);
342 for(i
= 0; i
< word_len
; i
++)
343 stemmed_word
[i
] = idx
->f(word
[i
]);
344 stemmed_word
[word_len
] = 0;
346 return find_in_tree(idx
, stemmed_word
, idx
->tree
);