1 /** Very fast search_keyword_in_list.
5 * It replaces bsearch() + strcasecmp() + callback + ...
7 * Following conditions should be met:
9 * - list keys are C strings.
10 * - keys should not be greater than 255 characters, and optimally < 20
11 * characters. It can work with greater keys but then memory usage will
13 * - each key must be unique and non empty.
14 * - list do not have to be ordered.
15 * - total number of unique characters used in all keys should be <= 128
16 * - idealy total number of keys should be <= 512 (but see below)
18 * (c) 2003 Laurent MONIN (aka Zas)
19 * Feel free to do whatever you want with that code.
22 * These routines use a tree search. First, a big tree is composed from the
23 * keys on input. Then, when searching we just go through the tree. If we will
24 * end up on an 'ending' node, we've got it.
26 * Hm, okay. For keys { 'head', 'h1', 'body', 'bodyrock', 'bodyground' }, it
42 * (the ending nodes are upcased just for this drawing, not in real)
44 * To optimize this for speed, leafs of nodes are organized in per-node arrays
45 * (so-called 'leafsets'), indexed by symbol value of the key's next character.
46 * But to optimize that for memory, we first compose own alphabet consisting
47 * only from the chars we ever use in the key strings. fastfind_info.uniq_chars
48 * holds that alphabet and fastfind_info.idxtab is used to translate between it
51 * Tree building: O((L+M)*N)
52 * (L: mean key length, M: alphabet size,
53 * N: number of items).
54 * String lookup: O(N) (N: string length). */
65 #include "util/conv.h"
66 #include "util/error.h"
67 #include "util/fastfind.h"
68 #include "util/memdebug.h"
69 #include "util/memory.h"
73 /** Define it to generate performance and memory usage statistics to stderr. */
75 #define DEBUG_FASTFIND
78 /** Define whether to use 32 or 64 bits per compressed element. */
83 #define END_LEAF_BITS 1
84 #define COMPRESSED_BITS 1
88 /* Use only 32 bits per element, but has very low limits. */
89 /* Adequate for ELinks tags search. */
91 #define POINTER_INDEX_BITS 10 /* 1024 */
92 #define LEAFSET_INDEX_BITS 13 /* 8192 */
93 #define COMP_CHAR_INDEX_BITS 7 /* 128 */
95 #define ff_node ff_node_c /* Both are 32 bits long. */
97 #if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
98 COMP_CHAR_INDEX_BITS + END_LEAF_BITS + \
100 #error Over 32 bits in struct ff_node !!
103 #else /* !USE_32_BITS */
105 /* Keep this one if there is more than 512 keywords in a list
106 * it eats a bit more memory.
107 * ELinks may need this one if fastfind is used in other
108 * things than tags searching. */
109 /* This will make struct ff_node_c use 64 bits. */
111 #define POINTER_INDEX_BITS 12
112 #define LEAFSET_INDEX_BITS 18
113 #define COMP_CHAR_INDEX_BITS 8
115 #if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
116 + END_LEAF_BITS + COMPRESSED_BITS) > 32
117 #error Over 32 bits in struct ff_node !!
121 /** End leaf -> p is significant */
122 unsigned int e
:END_LEAF_BITS
;
125 unsigned int c
:COMPRESSED_BITS
;
127 /** Index in pointers */
128 unsigned int p
:POINTER_INDEX_BITS
;
130 /** Index in leafsets */
131 unsigned int l
:LEAFSET_INDEX_BITS
;
134 #endif /* USE_32_BITS */
137 #define FF_MAX_KEYS (1 << POINTER_INDEX_BITS)
138 #define FF_MAX_LEAFSETS ((1 << LEAFSET_INDEX_BITS) - 1)
139 #define FF_MAX_CHARS (1 << COMP_CHAR_INDEX_BITS)
140 #define FF_MAX_KEYLEN 255
143 unsigned int e
:END_LEAF_BITS
;
144 unsigned int c
:COMPRESSED_BITS
;
145 unsigned int p
:POINTER_INDEX_BITS
;
146 unsigned int l
:LEAFSET_INDEX_BITS
;
148 /** Index of char when compressed. */
149 unsigned int ch
:COMP_CHAR_INDEX_BITS
;
157 struct fastfind_info
{
158 struct ff_data
*data
;
160 struct ff_node
**leafsets
;
161 struct ff_node
*root_leafset
;
166 int uniq_chars_count
;
171 unsigned int case_aware
:1;
172 unsigned int locale_indep
:1;
173 unsigned int compress
:1;
175 int idxtab
[FF_MAX_CHARS
];
176 unsigned char uniq_chars
[FF_MAX_CHARS
];
178 #ifdef DEBUG_FASTFIND
180 unsigned long searches
;
182 unsigned long itertmp
;
183 unsigned long iterdelta
;
184 unsigned long itermax
;
185 unsigned long iterations
;
187 unsigned long teststmp
;
188 unsigned long testsdelta
;
189 unsigned long testsmax
;
190 unsigned long memory_usage
;
191 unsigned long total_key_len
;
192 unsigned int compressed_nodes
;
193 unsigned char *comment
;
199 #ifdef DEBUG_FASTFIND
200 /* These are for performance testing. */
201 #define FF_DBG_mem(x, size) (x)->debug.memory_usage += (size)
202 #define FF_DBG_test(x) (x)->debug.tests++
203 #define FF_DBG_iter(x) (x)->debug.iterations++
204 #define FF_DBG_cnode(x) (x)->debug.compressed_nodes++
205 #define FF_DBG_found(x) \
207 unsigned long iter = (x)->debug.iterations - (x)->debug.itertmp; \
208 unsigned long tests = (x)->debug.tests - (x)->debug.teststmp; \
210 (x)->debug.iterdelta += iter; \
211 (x)->debug.testsdelta += tests; \
212 if (iter > (x)->debug.itermax) \
213 (x)->debug.itermax = iter; \
214 if (tests > (x)->debug.testsmax) \
215 (x)->debug.testsmax = tests; \
216 (x)->debug.found++; \
218 #define FF_DBG_comment(x, str) do { (x)->debug.comment = empty_string_or_(str); } while (0)
220 /** Update search stats. */
222 FF_DBG_search_stats(struct fastfind_info
*info
, int key_len
)
224 info
->debug
.searches
++;
225 info
->debug
.total_key_len
+= key_len
;
226 info
->debug
.teststmp
= info
->debug
.tests
;
227 info
->debug
.itertmp
= info
->debug
.iterations
;
230 /** Dump all stats. */
232 FF_DBG_dump_stats(struct fastfind_info
*info
)
234 fprintf(stderr
, "------ FastFind Statistics ------\n");
235 fprintf(stderr
, "Comment : %s\n", info
->debug
.comment
);
236 fprintf(stderr
, "Case-aware : %s\n", info
->case_aware
? "yes" : "no");
237 fprintf(stderr
, "Locale-indep: %s\n", info
->locale_indep
? "yes" : "no");
238 fprintf(stderr
, "Compress : %s\n", info
->compress
? "yes" : "no");
239 fprintf(stderr
, "Uniq_chars : %s\n", info
->uniq_chars
);
240 fprintf(stderr
, "Uniq_chars #: %d/%d max.\n", info
->uniq_chars_count
, FF_MAX_CHARS
);
241 fprintf(stderr
, "Min_key_len : %d\n", info
->min_key_len
);
242 fprintf(stderr
, "Max_key_len : %d\n", info
->max_key_len
);
243 fprintf(stderr
, "Entries : %d/%d max.\n", info
->pointers_count
, FF_MAX_KEYS
);
244 fprintf(stderr
, "Leafsets : %d/%d max.\n", info
->leafsets_count
, FF_MAX_LEAFSETS
);
246 fprintf(stderr
, "C. leafsets : %u/%d (%0.2f%%)\n",
247 info
->debug
.compressed_nodes
,
248 info
->leafsets_count
,
249 100 * (double) info
->debug
.compressed_nodes
/ info
->leafsets_count
);
250 fprintf(stderr
, "Memory usage: %lu bytes (cost per entry = %0.2f bytes)\n",
251 info
->debug
.memory_usage
, (double) info
->debug
.memory_usage
/ info
->pointers_count
);
252 fprintf(stderr
, "Struct info : %zu bytes\n", sizeof(*info
) - sizeof(info
->debug
));
253 fprintf(stderr
, "Struct node : %zu bytes\n", sizeof(struct ff_node
));
254 fprintf(stderr
, "Struct cnode: %zu bytes\n", sizeof(struct ff_node_c
));
255 fprintf(stderr
, "Searches : %lu\n", info
->debug
.searches
);
256 fprintf(stderr
, "Found : %lu (%0.2f%%)\n",
257 info
->debug
.found
, 100 * (double) info
->debug
.found
/ info
->debug
.searches
);
258 fprintf(stderr
, "Iterations : %lu (%0.2f per search, %0.2f before found, %lu max)\n",
259 info
->debug
.iterations
, (double) info
->debug
.iterations
/ info
->debug
.searches
,
260 (double) info
->debug
.iterdelta
/ info
->debug
.found
,
261 info
->debug
.itermax
);
262 fprintf(stderr
, "Tests : %lu (%0.2f per search, %0.2f per iter., %0.2f before found, %lu max)\n",
263 info
->debug
.tests
, (double) info
->debug
.tests
/ info
->debug
.searches
,
264 (double) info
->debug
.tests
/ info
->debug
.iterations
,
265 (double) info
->debug
.testsdelta
/ info
->debug
.found
,
266 info
->debug
.testsmax
);
267 fprintf(stderr
, "Total keylen: %lu bytes (%0.2f per search, %0.2f per iter.)\n",
268 info
->debug
.total_key_len
, (double) info
->debug
.total_key_len
/ info
->debug
.searches
,
269 (double) info
->debug
.total_key_len
/ info
->debug
.iterations
);
270 fprintf(stderr
, "\n");
273 #else /* !DEBUG_FASTFIND */
275 #define FF_DBG_mem(x, size)
276 #define FF_DBG_test(x)
277 #define FF_DBG_iter(x)
278 #define FF_DBG_cnode(x)
279 #define FF_DBG_found(x)
280 #define FF_DBG_comment(x, comment)
281 #define FF_DBG_search_stats(info, key_len)
282 #define FF_DBG_dump_stats(info)
284 #endif /* DEBUG_FASTFIND */
287 static struct fastfind_info
*
288 init_fastfind(struct fastfind_index
*index
, enum fastfind_flags flags
)
290 struct fastfind_info
*info
= mem_calloc(1, sizeof(*info
));
292 index
->handle
= info
;
293 if (!info
) return NULL
;
295 info
->min_key_len
= FF_MAX_KEYLEN
;
296 info
->case_aware
= !!(flags
& FF_CASE_AWARE
);
297 info
->locale_indep
= !!(flags
& FF_LOCALE_INDEP
);
298 info
->compress
= !!(flags
& FF_COMPRESS
);
300 FF_DBG_mem(info
, sizeof(*info
) - sizeof(info
->debug
));
301 FF_DBG_comment(info
, index
->comment
);
306 /** @returns 1 on success, 0 on allocation failure */
308 alloc_ff_data(struct fastfind_info
*info
)
310 struct ff_data
*data
;
312 assert(info
->count
< FF_MAX_KEYS
);
313 if_assert_failed
return 0;
315 /* On error, cleanup is done by fastfind_done(). */
317 data
= mem_calloc(info
->count
, sizeof(*data
));
320 FF_DBG_mem(info
, info
->count
* sizeof(*data
));
325 /** Add pointer and its key length to correspondant arrays, incrementing
326 * internal counter. */
328 add_to_ff_data(void *p
, int key_len
, struct fastfind_info
*info
)
330 struct ff_data
*data
= &info
->data
[info
->pointers_count
++];
332 /* Record new pointer and key len, used in search */
334 data
->keylen
= key_len
;
337 /** @returns 1 on success, 0 on allocation failure */
339 alloc_leafset(struct fastfind_info
*info
)
341 struct ff_node
**leafsets
;
342 struct ff_node
*leafset
;
344 assert(info
->leafsets_count
< FF_MAX_LEAFSETS
);
345 if_assert_failed
return 0;
347 /* info->leafsets[0] is never used since l=0 marks no leaf
348 * in struct ff_node. That's the reason of that + 2. */
349 leafsets
= mem_realloc(info
->leafsets
,
350 sizeof(*leafsets
) * (info
->leafsets_count
+ 2));
351 if (!leafsets
) return 0;
352 info
->leafsets
= leafsets
;
354 leafset
= mem_calloc(info
->uniq_chars_count
, sizeof(*leafset
));
355 if (!leafset
) return 0;
357 FF_DBG_mem(info
, sizeof(*leafsets
));
358 FF_DBG_mem(info
, sizeof(*leafset
) * info
->uniq_chars_count
);
360 info
->leafsets_count
++;
361 info
->leafsets
[info
->leafsets_count
] = leafset
;
367 char2idx(unsigned char c
, struct fastfind_info
*info
)
369 unsigned char *idx
= memchr(info
->uniq_chars
, c
, info
->uniq_chars_count
);
371 if (idx
) return (idx
- info
->uniq_chars
);
377 init_idxtab(struct fastfind_info
*info
)
381 for (i
= 0; i
< FF_MAX_CHARS
; i
++)
382 info
->idxtab
[i
] = char2idx((unsigned char) i
, info
);
386 compress_node(struct ff_node
*leafset
, struct fastfind_info
*info
,
389 struct ff_node_c
*new = mem_alloc(sizeof(*new));
394 new->e
= leafset
[pos
].e
;
395 new->p
= leafset
[pos
].p
;
396 new->l
= leafset
[pos
].l
;
399 mem_free_set(&info
->leafsets
[i
], (struct ff_node
*) new);
401 FF_DBG_mem(info
, sizeof(*new));
402 FF_DBG_mem(info
, sizeof(*leafset
) * -info
->uniq_chars_count
);
406 compress_tree(struct ff_node
*leafset
, struct fastfind_info
*info
)
413 if_assert_failed
return;
415 for (i
= 0; i
< info
->uniq_chars_count
; i
++) {
416 if (leafset
[i
].c
) continue;
419 /* There's a leaf leafset, descend to it and recurse */
420 compress_tree(info
->leafsets
[leafset
[i
].l
], info
);
423 if (leafset
[i
].l
|| leafset
[i
].e
) {
429 if (cnt
!= 1 || leafset
[pos
].c
) return;
431 /* Compress if possible ;) */
432 for (i
= 1; i
< info
->leafsets_count
; i
++) {
433 if (info
->leafsets
[i
] == leafset
) {
434 compress_node(leafset
, info
, i
, pos
);
440 #define ifcase(c) ( info->case_aware ? (c) : (info->locale_indep ? c_toupper(c) : toupper(c)) )
442 struct fastfind_index
*
443 fastfind_index(struct fastfind_index
*index
, enum fastfind_flags flags
)
445 struct fastfind_key_value
*p
;
446 struct fastfind_info
*info
;
448 assert(index
&& index
->reset
&& index
->next
);
449 if_assert_failed
goto return_error
;
451 info
= init_fastfind(index
, flags
);
452 if (!info
) goto return_error
;
454 /* First search min, max, count and uniq_chars. */
457 while ((p
= index
->next())) {
458 int key_len
= strlen(p
->key
);
461 assert(key_len
> 0 && key_len
<= FF_MAX_KEYLEN
);
462 if_assert_failed
goto return_error
;
464 if (key_len
< info
->min_key_len
)
465 info
->min_key_len
= key_len
;
467 if (key_len
> info
->max_key_len
)
468 info
->max_key_len
= key_len
;
470 for (i
= 0; i
< key_len
; i
++) {
471 /* ifcase() test should be moved outside loops but
472 * remember we call this routine only once per list.
473 * So I go for code readability vs performance here.
475 int k
= ifcase(p
->key
[i
]);
477 assert(k
< FF_MAX_CHARS
);
478 if_assert_failed
goto return_error
;
480 if (char2idx(k
, info
) == -1) {
481 assert(info
->uniq_chars_count
< FF_MAX_CHARS
);
482 if_assert_failed
goto return_error
;
484 info
->uniq_chars
[info
->uniq_chars_count
++] = k
;
491 if (!info
->count
) return NULL
;
495 /* Root leafset allocation */
496 if (!alloc_leafset(info
)) goto return_error
;
498 info
->root_leafset
= info
->leafsets
[info
->leafsets_count
];
500 if (!alloc_ff_data(info
)) goto return_error
;
505 while ((p
= index
->next())) {
506 int key_len
= strlen(p
->key
);
507 struct ff_node
*leafset
= info
->root_leafset
;
511 fprintf(stderr
, "K: %s\n", p
->key
);
513 for (i
= 0; i
< key_len
- 1; i
++) {
514 /* Convert char to its index value */
515 int idx
= info
->idxtab
[ifcase(p
->key
[i
])];
517 /* leafset[idx] is the desired leaf node's bucket. */
519 if (leafset
[idx
].l
== 0) {
520 /* There's no leaf yet */
521 if (!alloc_leafset(info
)) goto return_error
;
522 leafset
[idx
].l
= info
->leafsets_count
;
525 /* Descend to leaf */
526 leafset
= info
->leafsets
[leafset
[idx
].l
];
529 /* Index final leaf */
530 i
= info
->idxtab
[ifcase(p
->key
[i
])];
534 /* Memorize pointer to data */
535 leafset
[i
].p
= info
->pointers_count
;
536 add_to_ff_data(p
->data
, key_len
, info
);
540 compress_tree(info
->root_leafset
, info
);
545 fastfind_done(index
);
552 /** This macro searchs for the key in indexed list */
553 #define FF_SEARCH(what) do { \
556 for (i = 0; i < key_len; i++) { \
557 int lidx, k = what; \
562 if (k >= FF_MAX_CHARS) return NULL; \
563 lidx = info->idxtab[k]; \
566 if (lidx < 0) return NULL; \
570 /* It is a compressed leaf. */ \
572 if (((struct ff_node_c *) current)->ch != lidx) \
575 current = ¤t[lidx]; \
580 struct ff_data *data = &info->data[current->p]; \
583 if (key_len == data->keylen) { \
584 FF_DBG_found(info); \
585 return data->pointer; \
590 if (!current->l) return NULL; \
591 current = (struct ff_node *) info->leafsets[current->l]; \
596 fastfind_search(struct fastfind_index
*index
,
597 const unsigned char *key
, int key_len
)
599 struct ff_node
*current
;
600 struct fastfind_info
*info
;
603 if_assert_failed
return NULL
;
605 info
= index
->handle
;
607 assertm(info
!= NULL
, "FastFind index %s not initialized", index
->comment
);
608 if_assert_failed
return NULL
;
610 FF_DBG_search_stats(info
, key_len
);
612 FF_DBG_test(info
); if (!key
) return NULL
;
613 FF_DBG_test(info
); if (key_len
> info
->max_key_len
) return NULL
;
614 FF_DBG_test(info
); if (key_len
< info
->min_key_len
) return NULL
;
616 current
= info
->root_leafset
;
618 /* Macro and code redundancy are there to obtain maximum
619 * performance. Do not move it to an inlined function.
620 * Do not even think about it.
621 * If you find a better way (same or better performance) then
622 * propose it and be prepared to defend it. --Zas */
625 if (info
->case_aware
)
628 if (info
->locale_indep
)
629 FF_SEARCH(c_toupper(key
[i
]));
631 FF_SEARCH(toupper(key
[i
]));
639 fastfind_done(struct fastfind_index
*index
)
641 struct fastfind_info
*info
;
644 if_assert_failed
return;
646 info
= index
->handle
;
649 FF_DBG_dump_stats(info
);
651 mem_free_if(info
->data
);
652 while (info
->leafsets_count
) {
653 mem_free_if(info
->leafsets
[info
->leafsets_count
]);
654 info
->leafsets_count
--;
656 mem_free_if(info
->leafsets
);
659 index
->handle
= NULL
;
671 struct list list
[] = {
703 /* {"HEAD", html_skip, 0, 0}, */
737 {NULL
, 0}, /* List terminaison is key = NULL */
741 struct list
*internal_pointer
;
743 /** Reset internal list pointer */
747 internal_pointer
= list
;
750 /** Returns a pointer to a struct that contains
751 * current key and data pointers and increment
753 * It returns NULL when key is NULL. */
754 struct fastfind_key_value
*
757 static struct fastfind_key_value kv
;
759 if (!internal_pointer
->tag
) return NULL
;
761 kv
.key
= internal_pointer
->tag
;
762 kv
.data
= internal_pointer
;
769 static struct fastfind_index ff_index
770 = INIT_FASTFIND_INDEX("example", reset_list
, next_in_list
);
773 main(int argc
, char **argv
)
775 unsigned char *key
= argv
[1];
779 fprintf(stderr
, "Usage: fastfind keyword\n");
783 fprintf(stderr
, "---------- INDEX PHASE -----------\n");
785 fastfind_index(&ff_index
, FF_COMPRESS
);
787 fprintf(stderr
, "---------- SEARCH PHASE ----------\n");
788 /* Without this one ... */
789 result
= (struct list
*) fastfind_search(&ff_index
, key
, strlen(key
));
792 fprintf(stderr
, " Found: '%s' -> %d\n", result
->tag
, result
->val
);
794 fprintf(stderr
, " Not found: '%s'\n", key
);
796 fprintf(stderr
, "---------- CLEANUP PHASE ----------\n");
797 fastfind_done(&ff_index
);
804 #endif /* USE_FASTFIND */