Initial commit
[cgperf.git] / search.c
blob294deac764f48ec12c61f9c5fa25d50b5863011b
1 #ifndef CGPERF_SEARCH_C
2 #define CGPERF_SEARCH_C
3 #include <stdlib.h>
4 #include <string.h>
5 #include <time.h>
6 #include <math.h>
7 #include "c_fixing.h"
8 #include "globals.h"
9 #include "search.h"
10 #include "keyword.h"
11 #include "keyword_list.h"
12 #include "options.h"
13 #include "positions.h"
14 #include "hash-table.h"
15 #include "bool-array.h"
16 /*------------------------------------------------------------------------------------------------*/
17 #include "namespace/globals.h"
18 #include "namespace/search.h"
19 #include "namespace/keyword.h"
20 #include "namespace/keyword_list.h"
21 #include "namespace/options.h"
22 #include "namespace/positions.h"
23 #include "namespace/hash-table.h"
24 #include "namespace/bool-array.h"
25 #include "namespace/search.c"
26 /*------------------------------------------------------------------------------------------------*/
27 /*{{{ THEORY */
28 /* The general form of the hash function is
30 hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
31 + len (keyword)
33 where Pos is a set of byte positions,
34 each alpha_inc[i] is a nonnegative integer,
35 each asso_values[c] is a nonnegative integer,
36 len (keyword) is the keyword's length if _hash_includes_len, or 0 otherwise.
38 Theorem 1: If all keywords are different, there is a set Pos such that
39 all tuples (keyword[i] : i in Pos) are different.
41 Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
42 are nonnegative integers alpha_inc[i] such that all multisets
43 {keyword[i] + alpha_inc[i] : i in Pos} are different.
45 Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
47 Theorem 3: If all multisets selchars[keyword] are different, there are
48 nonnegative integers asso_values[c] such that all hash values
49 sum (asso_values[c] : c in selchars[keyword]) are different.
51 Based on these three facts, we find the hash function in three steps:
53 Step 1 (Finding good byte positions):
54 Find a set Pos, as small as possible, such that all tuples
55 (keyword[i] : i in Pos) are different.
57 Step 2 (Finding good alpha increments):
58 Find nonnegative integers alpha_inc[i], as many of them as possible being
59 zero, and the others being as small as possible, such that all multisets
60 {keyword[i] + alpha_inc[i] : i in Pos} are different.
62 Step 3 (Finding good asso_values):
63 Find asso_values[c] such that all hash (keyword) are different.
65 In other words, each step finds a projection that is injective on the
66 given finite set:
67 proj1 : String --> Map (Pos --> N)
68 proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
69 proj3 : Map (Pos --> N) / S(Pos) --> N
70 where
71 N denotes the set of nonnegative integers,
72 Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
73 S(Pos) is the symmetric group over Pos.
75 This was the theory for !_hash_includes_len; if _hash_includes_len, slight
76 modifications apply:
77 proj1 : String --> Map (Pos --> N) x N
78 proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
79 proj3 : Map (Pos --> N) / S(Pos) x N --> N
81 For a case-insensitive hash function, the general form is
83 hash (keyword) =
84 sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
85 + len (keyword)
87 where alpha_unify[c] is chosen so that an upper/lower case change in
88 keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]].
89 *//*}}} THEORY -- END */
90 /*{{{ finding asso_values[] that fit
91 The idea is to choose the _asso_values[] one by one, in a way that
92 a choice that has been made never needs to be undone later. This
93 means that we split the work into several steps. Each step chooses
94 one or more _asso_values[c]. The result of choosing one or more
95 _asso_values[c] is that the partitioning of the keyword set gets
96 broader.
97 Look at this partitioning: After every step, the _asso_values[] of a
98 certain set C of characters are undetermined. (At the beginning, C
99 is the set of characters c with _occurrences[c] > 0. At the end, C
100 is empty.) To each keyword K, we associate the multiset of _selchars
101 for which the _asso_values[] are undetermined:
102 K --> K->_selchars intersect C.
103 Consider two keywords equivalent if their value under this mapping is
104 the same. This introduces an equivalence relation on the set of
105 keywords. The equivalence classes partition the keyword set. (At the
106 beginning, the partition is the finest possible: each K is an equivalence
107 class by itself, because all K have a different _selchars. At the end,
108 all K have been merged into a single equivalence class.)
109 The partition before a step is always a refinement of the partition
110 after the step.
111 We choose the steps in such a way that the partition really becomes
112 broader at each step. (A step that only chooses an _asso_values[c]
113 without changing the partition is better merged with the previous step,
114 to avoid useless backtracking.) }}}*/
115 /*------------------------------------------------------------------------------------------------*/
116 /*{{{ local */
117 /*{{{ types */
118 struct EquivalenceClass
120 /* the keywords in this equivalence class */
121 struct Keyword_List *keywords;
122 struct Keyword_List *keywords_last;
123 /* the number of keywords in this equivalence class */
124 u32 cardinality;
126 * the undetermined selected characters for the keywords in this equivalence class, as a
127 * canonically reordered multiset
129 u32 *undetermined_chars;
130 u32 undetermined_chars_length;
132 struct EquivalenceClass *next;
135 struct Step
137 /* the characters whose values are being determined in this step */
138 u32 changing_count;
139 u32 *changing;
141 * Exclusive upper bound for the _asso_values[c] of this step. A power
142 * of 2.
144 u32 asso_value_max;
145 /* the characters whose values will be determined after this step */
146 bool *undetermined;
147 /* the keyword set partition after this step */
148 struct EquivalenceClass *partition;
149 /* the expected number of iterations in this step */
150 f64 expected_lower;
151 f64 expected_upper;
153 struct Step *next;
155 /*}}} types -- END */
156 /*{{{ code */
157 /*{{{ equals */
158 static bool equals(u32 *ptr1, u32 *ptr2, u32 len)
160 loop {
161 if (len == 0)
162 break;
163 if (*ptr1 != *ptr2)
164 return false;
165 ++ptr1;
166 ++ptr2;
167 len--;
169 return true;
170 }/*}}}*/
171 static void delete_partition(struct EquivalenceClass *partition)
173 loop {
174 struct EquivalenceClass *equclass;
176 if (partition == 0)
177 break;
178 equclass = partition;
179 partition = equclass->next;
180 delete_list(equclass->keywords);
181 free(equclass);
184 static bool less_by_hash_value(struct Keyword *kw1, struct Keyword *kw2)
186 return kw1->hash_value < kw2->hash_value;
188 /*}}} code -- END */
189 /*}}} local -- END */
190 /*------------------------------------------------------------------------------------------------*/
191 /*{{{ schr_new */
192 static struct Search *schr_new(struct Keyword_List *list)
194 struct Search *t;
196 t = calloc(1, sizeof(*t));
197 t->head = list;
198 t->key_positions = pos_new();
199 return t;
200 }/*}}}*/
201 /*{{{ schr_del */
202 static void schr_del(struct Search *t)
204 ba_del(t->collision_detector);
205 if (OPTS(DEBUG)) {
206 u32 i;
207 s32 field_width;
208 struct Keyword_List *ptr;
210 fprintf(stderr, "\ndumping occurrence and associated values tables\n");
211 i = 0;
212 loop {
213 if (i >= t->alpha_size)
214 break;
215 if (t->occurrences[i])
216 fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n", i, t->asso_values[i], i, t->occurrences[i]);
217 ++i;
219 fprintf(stderr, "end table dumping\n");
220 fprintf(stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n", t->list_len, t->total_keys, t->total_duplicates, t->max_key_len);
221 field_width = t->max_selchars_length;
222 fprintf(stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n", field_width, "selchars");
223 ptr = t->head;
224 loop {
225 s32 j;
227 if (ptr == 0)
228 break;
229 fprintf(stderr, "%11d,%11d,%6d, ", ptr->kw->hash_value, ptr->kw->allchars_length, ptr->kw->final_index);
230 if (field_width > ptr->kw->selchars_length)
231 fprintf(stderr, "%*s", field_width - ptr->kw->selchars_length, "");
232 j = 0;
233 loop {
234 if (j >= ptr->kw->selchars_length)
235 break;
236 putc(ptr->kw->selchars[j], stderr);
237 ++j;
239 fprintf(stderr, ", %.*s\n", ptr->kw->allchars_length, ptr->kw->allchars);
240 ptr = ptr->next;
242 fprintf(stderr, "End dumping list.\n\n");
244 pos_del(t->key_positions);
245 free(t->asso_values);
246 free(t->occurrences);
247 free(t->alpha_unify);
248 free(t->alpha_inc);
249 free(t);
250 }/*}}}*/
251 /*{{{ schr_optimize */
252 static void schr_optimize(struct Search *t)
254 struct Keyword_List *curr_ptr;
255 s32 max_hash_value;
256 u32 c;
258 /* preparations */
259 schr_prepare(t);
261 /* Step 1: Finding good byte positions. */
262 schr_find_positions(t);
264 /* Step 2: Finding good alpha increments. */
265 schr_find_alpha_inc(t);
267 /* Step 3: Finding good asso_values. */
268 schr_find_good_asso_values(t);
269 /* Make one final check, just to make sure nothing weird happened.... */
270 ba_clear(t->collision_detector);
271 curr_ptr = t->head;
272 loop {
273 struct Keyword *curr;
274 u32 hashcode;
276 if (curr_ptr == 0)
277 break;
278 curr = curr_ptr->kw;
279 hashcode = schr_compute_hash(t, curr);
280 if (ba_set_bit(t->collision_detector, hashcode)) {
282 * This shouldn't happen. proj1, proj2, proj3 must have been computed to be
283 * injective on the given keyword set.
285 fprintf(stderr, "\nInternal error, unexpected duplicate hash code\n");
286 if (OPTS(POSITIONS))
287 fprintf(stderr, "try options -m or -r, or use new key positions.\n\n");
288 else
289 fprintf(stderr, "try options -m or -r.\n\n");
290 exit(1);
292 curr_ptr = curr_ptr->next;
294 /* sorts the keyword list by hash value */
295 schr_sort(t);
297 * Set unused asso_values[c] to max_hash_value + 1. This is not absolutely necessary, but
298 * speeds up the lookup function in many cases of lookup failure: no string comparison is
299 * needed once the hash value of a string is larger than the hash value of any keyword.
302 struct Keyword_List *tmp;
304 tmp = t->head;
305 loop {
306 if (tmp->next == 0)
307 break;
308 tmp = tmp->next;
310 max_hash_value = tmp->kw->hash_value;
312 c = 0;
313 loop {
314 if (c >= t->alpha_size)
315 break;
316 if (t->occurrences[c] == 0)
317 t->asso_values[c] = max_hash_value + 1;
318 ++c;
320 /* propagate unified asso_values */
321 if (t->alpha_unify) {
322 u32 c;
324 c = 0;
325 loop {
326 if (c >= t->alpha_size)
327 break;
328 if (t->alpha_unify[c] != c)
329 t->asso_values[c] = t->asso_values[t->alpha_unify[c]];
330 ++c;
333 }/*}}}*/
334 /*{{{ schr_prepare */
335 static void schr_prepare(struct Search *t)
337 struct Keyword_List *tmp;
339 t->total_keys = 0;
340 tmp = t->head;
341 loop {
342 if (tmp == 0)
343 break;
344 ++(t->total_keys);
345 tmp = tmp->next;
347 /* compute the minimum and maximum keyword length */
348 t->max_key_len = S32_MIN;
349 t->min_key_len = S32_MAX;
350 tmp = t->head;
351 loop {
352 struct Keyword *kw;
354 if (tmp == 0)
355 break;
356 kw = tmp->kw;
357 if (t->max_key_len < kw->allchars_length)
358 t->max_key_len = kw->allchars_length;
359 if (t->min_key_len > kw->allchars_length)
360 t->min_key_len = kw->allchars_length;
361 tmp = tmp->next;
364 * exit program if an empty string is used as keyword, since the comparison expressions
365 * don't work correctly for looking up an empty string
367 if (t->min_key_len == 0) {
368 fprintf (stderr, "Empty input keyword is not allowed.\nTo recognize an empty input keyword, your code should check for\nlen == 0 before calling the gperf generated lookup function.\n");
369 exit(1);
371 /* exit program if the characters in the keywords are not in the required range */
372 if (OPTS(SEVENBIT)) {
373 tmp = t->head;
374 loop {
375 struct Keyword *kw;
376 u8 *k;
377 s32 i;
379 if (tmp == 0)
380 break;
381 kw = tmp->kw;
382 k = kw->allchars;
383 i = kw->allchars_length;
384 loop {
385 if (i <= 0)
386 break;
387 if (!(*k < 128)) {
388 fprintf(stderr, "Option --seven-bit has been specified,\nbut keyword \"%.*s\" contains non-ASCII characters.\nTry removing option --seven-bit.\n", kw->allchars_length, kw->allchars);
389 exit(1);
391 i--;
392 ++k;
394 tmp = tmp->next;
397 /* determine whether the hash function shall include the length */
398 t->hash_includes_len = !(OPTS(NOLENGTH) || (t->min_key_len == t->max_key_len));
399 }/*}}}*/
400 /*{{{ schr_find_positions */
401 /* find good key positions */
402 static void schr_find_positions(struct Search *t)
404 u32 *alpha_unify;
405 s32 imax;
406 struct Positions *mandatory;
407 struct Positions *current;
408 u32 current_duplicates_count;
409 /* if the user gave the key positions, we use them */
410 if (OPTS(POSITIONS)) {
411 pos_cpy(t->key_positions, options->key_positions);
412 return;
414 /* compute preliminary alpha_unify table */
415 alpha_unify = schr_compute_alpha_unify(t);
417 /* 1. find positions that must occur in order to distinguish duplicates */
418 mandatory = pos_new();
419 if (!OPTS(DUP)) {
420 struct Keyword_List *l1;
422 l1 = t->head;
423 loop {
424 struct Keyword *kw1;
425 struct Keyword_List *l2;
427 if (l1 == 0 || l1->next == 0)
428 break;
429 kw1 = l1->kw;
430 l2 = l1->next;
431 loop {
432 struct Keyword *kw2;
434 if (l2 == 0)
435 break;
436 kw2 = l2->kw;
438 * if keyword1 and keyword2 have the same length and differ
439 * in just one position, and it is not the last character,
440 * this position is mandatory
442 if (kw1->allchars_length == kw2->allchars_length) {
443 s32 n;
444 s32 i;
446 n = kw1->allchars_length;
447 i = 0;
448 loop {
449 u32 c1;
450 u32 c2;
452 if (i >= (n - 1))
453 break;
454 c1 = kw1->allchars[i];
455 c2 = kw2->allchars[i];
456 if (OPTS(UPPERLOWER)) {
457 if (c1 >= 'A' && c1 <= 'Z')
458 c1 += 'a' - 'A';
459 if (c2 >= 'A' && c2 <= 'Z')
460 c2 += 'a' - 'A';
462 if (c1 != c2)
463 break;
464 ++i;
466 if (i < (n - 1)) {
467 s32 j;
469 j = i + 1;
470 loop {
471 u32 c1;
472 u32 c2;
474 if (j >= n)
475 break;
476 c1 = kw1->allchars[j];
477 c2 = kw2->allchars[j];
478 if (OPTS(UPPERLOWER)) {
479 if (c1 >= 'A' && c1 <= 'Z')
480 c1 += 'a' - 'A';
481 if (c2 >= 'A' && c2 <= 'Z')
482 c2 += 'a' - 'A';
484 if (c1 != c2)
485 break;
486 ++j;
488 if (j >= n) {
489 /* position i is mandatory */
490 if (!pos_contains(mandatory, i))
491 pos_add(mandatory, i);
495 l2 = l2->next;
497 l1 = l1->next;
500 /* 2. add positions, as long as this decreases the duplicates count */
501 imax = (t->max_key_len - 1 < (s32)POS_MAX_KEY_POS - 1 ? t->max_key_len - 1
502 : (s32)POS_MAX_KEY_POS - 1);
503 current = pos_new();
504 pos_cpy(current, mandatory);
505 current_duplicates_count = schr_count_duplicates_tuple_do(t, current, alpha_unify);
506 loop {
507 struct Positions *best;
508 u32 best_duplicates_count;
509 s32 i;
511 best = pos_new();
512 best_duplicates_count = U32_MAX;
513 i = imax;
514 loop {
515 if (i < -1)
516 break;
517 if (!pos_contains(current, i)) {
518 struct Positions *tryal;
519 u32 try_duplicates_count;
521 tryal = pos_new();
522 pos_cpy(tryal, current);
523 pos_add(tryal, i);
524 try_duplicates_count = schr_count_duplicates_tuple_do(t, tryal,
525 alpha_unify);
527 * We prefer 'try' to 'best' if it produces less duplicates, or if
528 * it produces the same number of duplicates but with a more
529 * efficient hash function.
531 if (try_duplicates_count < best_duplicates_count
532 || (try_duplicates_count == best_duplicates_count
533 && i >=0)) {
534 pos_cpy(best, tryal);
535 best_duplicates_count = try_duplicates_count;
537 pos_del(tryal);
539 i--;
541 /* stop adding positions when it gives no improvement */
542 if (best_duplicates_count >= current_duplicates_count)
543 break;
544 pos_cpy(current, best);
545 pos_del(best);
546 current_duplicates_count = best_duplicates_count;
548 /* 3. remove positions, as long as this doesn't increase the duplicates count */
549 loop {
550 struct Positions *best;
551 u32 best_duplicates_count;
552 s32 i;
554 best = pos_new();
555 best_duplicates_count = U32_MAX;
556 i = imax;
557 loop {
558 if (i < -1)
559 break;
560 if (pos_contains(current, i) && !pos_contains(mandatory, i)) {
561 struct Positions *tryal;
562 u32 try_duplicates_count;
564 tryal = pos_new();
565 pos_cpy(tryal, current);
566 pos_remove(tryal, i);
567 try_duplicates_count = schr_count_duplicates_tuple_do(t, tryal,
568 alpha_unify);
570 * We prefer 'try' to 'best' if it produces less duplicates, or if
571 * it produces the same number of duplicates but with a more
572 * efficient hash function.
574 if (try_duplicates_count < best_duplicates_count
575 || (try_duplicates_count == best_duplicates_count
576 && i == -1)) {
577 pos_cpy(best, tryal);
578 best_duplicates_count = try_duplicates_count;
581 i--;
583 /* stop removing positions when it gives no improvement */
584 if (best_duplicates_count > current_duplicates_count)
585 break;
586 pos_cpy(current, best);
587 pos_del(best);
588 current_duplicates_count = best_duplicates_count;
590 /* 4. replace two positions by one, as long as this doesn't increase the duplicates count */
591 loop {
592 struct Positions *best;
593 u32 best_duplicates_count;
594 s32 i1;
596 best = pos_new();
597 best_duplicates_count = U32_MAX;
599 * Loop over all pairs { i1, i2 } of currently selected positions. W.l.o.g. we can
600 * assume i1 > i2.
602 i1 = imax;
603 loop {
604 if (i1 < -1)
605 break;
606 if (pos_contains(current, i1) && !pos_contains(mandatory, i1)) {
607 s32 i2;
609 i2 = i1 - 1;
610 loop {
611 if (i2 < -1)
612 break;
613 if (pos_contains(current, i2) && !pos_contains(mandatory, i2)) {
614 s32 i3;
616 i3 = imax;
617 loop {
618 if (i3 < -1)
619 break;
620 if (!pos_contains(current, i3)) {
621 struct Positions *tryal;
622 u32 try_duplicates_count;
624 tryal = pos_new();
625 pos_cpy(tryal, current);
626 pos_remove(tryal, i1);
627 pos_remove(tryal, i2);
628 pos_add(tryal, i3);
629 try_duplicates_count = schr_count_duplicates_tuple_do(t, tryal, alpha_unify);
631 * We prefer 'try' to 'best' if it produces less
632 * duplicates, or if it produces the same number
633 * of duplicates but with a more efficient hash
634 * function.
636 if (try_duplicates_count < best_duplicates_count
637 || (try_duplicates_count == best_duplicates_count
638 && (i1 == -1 || i2 == -1) && i3 >= 0)) {
639 pos_cpy(best, tryal);
640 best_duplicates_count = try_duplicates_count;
642 pos_del(tryal);
644 i3--;
647 i2--;
650 i1--;
652 /* stop removing positions when it gives no improvement */
653 if (best_duplicates_count > current_duplicates_count)
654 break;
655 pos_cpy(current, best);
656 current_duplicates_count = best_duplicates_count;
657 pos_del(best);
659 /* That's it. Hope it's good enough. */
660 pos_cpy(t->key_positions, current);
661 if (OPTS(DEBUG)) {
662 struct PositionReverseIterator *iter;
663 bool seen_lastchar;
664 bool first;
665 s32 i;
666 /* Print the result. */
667 fprintf(stderr, "\nComputed positions: ");
668 iter = pos_reviterator(t->key_positions);
669 seen_lastchar = false;
670 first = true;
671 loop {
672 i = posrevit_next(iter);
673 if (i == POSREVIT_EOS)
674 break;
675 if (!first)
676 fprintf(stderr, ", ");
677 if (i == POS_LASTCHAR)
678 seen_lastchar = true;
679 else {
680 fprintf(stderr, "%d", i + 1);
681 first = false;
684 if (seen_lastchar) {
685 if (!first)
686 fprintf(stderr, ", ");
687 fprintf(stderr, "$");
689 fprintf(stderr, "\n");
690 posrevit_del(iter);
692 pos_del(current);
693 pos_del(mandatory);
694 free(alpha_unify);
695 }/*}}}*/
696 /*{{{ schr_compute_alpha_size */
697 /* computes the upper bound on the indices passed to asso_values[], assuming no alpha_increments */
698 static u32 schr_compute_alpha_size(void)
700 return (OPTS(SEVENBIT) ? 128 : 256);
701 }/*}}}*/
702 /*{{{ schr_compute_alpha_unify */
703 static u32 *schr_compute_alpha_unify(struct Search *t)
705 if (OPTS(UPPERLOWER)) {
706 /* uppercase to lowercase mapping */
707 u32 alpha_size;
708 u32 *alpha_unify;
709 u32 c;
711 alpha_size = schr_compute_alpha_size();
712 alpha_unify = calloc(alpha_size, sizeof(*alpha_unify));
713 c = 0;
714 loop {
715 if (c >= alpha_size)
716 break;
717 alpha_unify[c] = c;
718 ++c;
720 c = 'A';
721 loop {
722 if (c > 'Z')
723 break;
724 alpha_unify[c] = c + ('a' - 'A');
725 ++c;
727 return alpha_unify;
728 } else
729 /* identity mapping */
730 return 0;
731 }/*}}}*/
732 /*{{{ schr_find_alpha_inc */
733 /* find good alpha_inc[] */
734 static void schr_find_alpha_inc(struct Search *t)
737 * The goal is to choose _alpha_inc[] such that it doesn't introduce artificial duplicates.
738 * In other words, the goal is: # proj2 (proj1 (K)) = # proj1 (K).
740 u32 duplicates_goal;
741 u32 *current;
742 s32 i;
743 u32 current_duplicates_count;
745 duplicates_goal = schr_count_duplicates_tuple(t);
746 current = calloc(t->max_key_len, sizeof(*current));
747 i = 0;
748 loop {
749 if (i >= t->max_key_len)
750 break;
751 current[i] = 0;
752 ++i;
754 current_duplicates_count = schr_count_duplicates_multiset(t, current);
755 if (current_duplicates_count > duplicates_goal) {
756 /* look which _alpha_inc[i] we are free to increment */
757 u32 nindices;
758 u32 *indices;
759 u32 *best;
760 u32 *tryal;
762 struct PositionIterator *iter;
764 nindices = 0;
765 iter = pos_iterator(t->key_positions, t->max_key_len);
766 loop {
767 s32 key_pos;
769 key_pos = positer_next(iter);
770 if (key_pos == POSITER_EOS)
771 break;
772 if (key_pos != POS_LASTCHAR)
773 ++nindices;
775 positer_del(iter);
777 indices = calloc(nindices, sizeof(*indices));
779 u32 j;
780 struct PositionIterator *iter;
782 j = 0;
783 iter = pos_iterator(t->key_positions, t->max_key_len);
784 loop {
785 s32 key_pos;
787 key_pos = positer_next(iter);
788 if (key_pos == POSITER_EOS)
789 break;
790 if (key_pos != POS_LASTCHAR) {
791 indices[j] = key_pos;
792 ++j;
795 if (!(j == nindices))
796 abort();
797 positer_del(iter);
800 * Perform several rounds of searching for a good alpha increment. Each round
801 * reduces the number of artificial collisions by adding an increment in a single
802 * key position.
804 best = calloc(t->max_key_len, sizeof(*best));
805 tryal = calloc(t->max_key_len, sizeof(*best));
806 loop {
807 u32 inc;
808 /* An increment of 1 is not always enough. Try higher increments also. */
809 inc = 1;
810 loop {
811 u32 best_duplicates_count;
812 u32 j;
814 best_duplicates_count = U32_MAX;
815 j = 0;
816 loop {
817 u32 try_duplicates_count;
819 if (j >= nindices)
820 break;
821 memcpy(tryal, current, t->max_key_len * sizeof(u32));
822 tryal[indices[j]] += inc;
823 try_duplicates_count = schr_count_duplicates_multiset(t,
824 tryal);
825 /* we prefer 'try' to 'best' if it produces less duplicates */
826 if (try_duplicates_count < best_duplicates_count) {
827 memcpy(best, tryal, t->max_key_len * sizeof(u32));
828 best_duplicates_count = try_duplicates_count;
830 ++j;
832 /* stop this round when we got an improvement */
833 if (best_duplicates_count < current_duplicates_count) {
834 memcpy(current, best, t->max_key_len * sizeof(u32));
835 current_duplicates_count = best_duplicates_count;
836 break;
838 ++inc;
840 if (current_duplicates_count <= duplicates_goal)
841 break;
843 free(tryal);
844 free(best);
846 if (OPTS(DEBUG)) {
847 bool first;
848 u32 j;
849 /* print the result */
850 fprintf(stderr, "\nComputed alpha increments: ");
851 first = true;
852 j = nindices;
853 loop {
854 if (j <= 0)
855 break;
856 j--;
857 if (current[indices[j]] != 0) {
858 if (!first)
859 fprintf(stderr, ", ");
860 fprintf(stderr, "%u:+%u", indices[j] + 1, current[indices[j]]);
861 first = false;
864 fprintf(stderr, "\n");
866 free(indices);
868 t->alpha_inc = current;
869 t->alpha_size = schr_compute_alpha_size_with_inc(t, t->alpha_inc);
870 t->alpha_unify = schr_compute_alpha_unify_with_inc(t, t->key_positions, t->alpha_inc);
871 }/*}}}*/
872 /*{{{ schr_count_duplicates_tuple_do */
874 * Count the duplicate keywords that occur with a given set of positions. In other words, it
875 * returns the difference # K - # proj1 (K) where K is the multiset of given keywords.
877 static u32 schr_count_duplicates_tuple_do(struct Search *t, struct Positions *positions,
878 u32 *alpha_unify)
880 u32 count;
882 * Run through the keyword list and count the duplicates incrementally. The result does not
883 * depend on the order of the keyword list, thanks to the formula above.
885 schr_init_selchars_tuple(t, positions, alpha_unify);
886 count = 0;
888 struct Hash_Table *representatives;
889 struct Keyword_List *tmp;
891 representatives = ht_new(t->total_keys, !t->hash_includes_len);
892 tmp = t->head;
893 loop {
894 struct Keyword *kw;
896 if (tmp == 0)
897 break;
898 kw = tmp->kw;
899 if (ht_insert(representatives, kw))
900 ++count;
901 tmp = tmp->next;
903 ht_del(representatives);
905 schr_delete_selchars(t);
906 return count;
907 }/*}}}*/
908 /*{{{ schr_init_selchars_tuple */
909 static void schr_init_selchars_tuple(struct Search *t, struct Positions *positions,
910 u32 *alpha_unify)
912 struct Keyword_List *tmp;
914 tmp = t->head;
915 loop {
916 if (tmp == 0)
917 break;
918 kw_init_selchars_tuple(tmp->kw, positions, alpha_unify);
919 tmp = tmp->next;
921 }/*}}}*/
922 /*{{{ schr_delete_selchars */
923 static void schr_delete_selchars(struct Search *t)
925 struct Keyword_List *tmp;
927 tmp = t->head;
928 loop {
929 if (tmp == 0)
930 break;
931 kw_delete_selchars(tmp->kw);
932 tmp = tmp->next;
934 }/*}}}*/
935 /*{{{ schr_count_duplicates_tuple */
937 * Count the duplicate keywords that occur with the found set of positions. In other words, it
938 * returns the difference # K - # proj1 (K) where K is the multiset of given keywords.
940 static u32 schr_count_duplicates_tuple(struct Search *t)
942 u32 *alpha_unify;
943 u32 count;
945 alpha_unify = schr_compute_alpha_unify(t);
946 count = schr_count_duplicates_tuple_do(t, t->key_positions, alpha_unify);
947 free(alpha_unify);
948 return count;
949 }/*}}}*/
950 /*{{{ schr_count_duplicates_multiset */
952 * Count the duplicate keywords that occur with the given set of positions and a given alpha_inc[]
953 * array.
954 * In other words, it returns the difference
955 * # K - # proj2 (proj1 (K))
956 * where K is the multiset of given keywords.
958 static u32 schr_count_duplicates_multiset(struct Search *t, u32 *alpha_inc)
961 * Run through the keyword list and count the duplicates incrementally. The result does not
962 * depend on the order of the keyword list, thanks to the formula above.
964 u32 *alpha_unify;
965 u32 count;
967 alpha_unify = schr_compute_alpha_unify_with_inc(t, t->key_positions, alpha_inc);
968 schr_init_selchars_multiset(t, t->key_positions, alpha_unify, alpha_inc);
969 count = 0;
971 struct Hash_Table *representatives;
972 struct Keyword_List *tmp;
974 representatives = ht_new(t->total_keys, !t->hash_includes_len);
975 tmp = t->head;
976 loop {
977 struct Keyword *kw;
979 if (tmp == 0)
980 break;
981 kw = tmp->kw;
982 if (ht_insert(representatives, kw))
983 ++count;
984 tmp = tmp->next;
986 ht_del(representatives);
988 schr_delete_selchars(t);
989 free(alpha_unify);
990 return count;
991 }/*}}}*/
992 /*{{{ schr_compute_alpha_unify_with_inc */
993 /* computes the unification rules between different asso_values[c] */
994 static u32 *schr_compute_alpha_unify_with_inc(struct Search *t, struct Positions *positions,
995 u32 *alpha_inc)
997 if (OPTS(UPPERLOWER)) {
999 * Without alpha increments, we would simply unify
1000 * 'A' -> 'a', ..., 'Z' -> 'z'.
1001 * But when a keyword contains at position i a character c,
1002 * we have the constraint
1003 * asso_values[tolower(c) + alpha_inc[i]] ==
1004 * asso_values[toupper(c) + alpha_inc[i]].
1005 * This introduces a unification
1006 * toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
1007 * Note that this unification can extend outside the range of
1008 * ASCII letters! But still every unified character pair is at
1009 * a distance of 'a'-'A' = 32, or (after chained unification)
1010 * at a multiple of 32. So in the end the alpha_unify vector
1011 * has the form c -> c + 32 * f(c) where f(c) is a
1012 * nonnegative integer.
1014 u32 alpha_size;
1015 u32 *alpha_unify;
1016 u32 c;
1017 struct Keyword_List *tmp;
1019 alpha_size = schr_compute_alpha_size_with_inc(t, alpha_inc);
1020 alpha_unify = calloc(alpha_size, sizeof(*alpha_unify));
1021 c = 0;
1022 loop {
1023 if (c >= alpha_size)
1024 break;
1025 alpha_unify[c] = c;
1026 ++c;
1028 tmp = t->head;
1029 loop {
1030 struct Keyword *kw;
1031 struct PositionIterator *iter;
1032 s32 i;
1034 if (tmp == 0)
1035 break;
1036 kw = tmp->kw;
1037 iter = pos_iterator(positions, kw->allchars_length);
1038 loop {
1039 u32 c;
1041 i = positer_next(iter);
1042 if (i == POSITER_EOS)
1043 break;
1044 if (i == POS_LASTCHAR)
1045 c = (u8)(kw->allchars[kw->allchars_length - 1]);
1046 else if (i < kw->allchars_length)
1047 c = (u8)(kw->allchars[i]);
1048 else
1049 abort();
1050 if (c >= 'A' && c <= 'Z')
1051 c += 'a' - 'A';
1052 if (c >= 'a' && c <= 'z') {
1053 u32 d;
1054 u32 b;
1055 s32 a;
1057 if (i != POS_LASTCHAR)
1058 c += alpha_inc[i];
1059 /* unify c with c - ('a'-'A') */
1060 d = alpha_unify[c];
1061 b = c - ('a' - 'A');
1062 a = b;
1063 loop {
1064 if (a < 0 || alpha_unify[a] != b)
1065 break;
1066 alpha_unify[a] = d;
1067 a -= ('a' - 'A');
1071 positer_del(iter);
1072 tmp = tmp->next;
1074 return alpha_unify;
1075 } else
1076 /* identity mapping */
1077 return 0;
1078 }/*}}}*/
1079 /*{{{ schr_compute_alpha_size_with_inc */
1080 /* computes the upper bound on the indices passed to asso_values[] */
1081 static u32 schr_compute_alpha_size_with_inc(struct Search *t, u32 *alpha_inc)
1083 u32 max_alpha_inc;
1084 s32 i;
1086 max_alpha_inc = 0;
1087 i = 0;
1088 loop {
1089 if (i >= t->max_key_len)
1090 break;
1091 if (max_alpha_inc < alpha_inc[i])
1092 max_alpha_inc = alpha_inc[i];
1093 ++i;
1095 return (OPTS(SEVENBIT) ? 128 : 256) + max_alpha_inc;
1096 }/*}}}*/
1097 /*{{{ schr_init_selchars_multiset */
1098 static void schr_init_selchars_multiset(struct Search *t, struct Positions *positions,
1099 u32 *alpha_unify, u32 *alpha_inc)
1101 struct Keyword_List *tmp;
1103 tmp = t->head;
1104 loop {
1105 if (tmp == 0)
1106 break;
1107 kw_init_selchars_multiset(tmp->kw, positions, alpha_unify, alpha_inc);
1108 tmp = tmp->next;
1110 }/*}}}*/
1111 /*{{{ schr_find_good_asso_values */
1112 /* finds good _asso_values[] */
1113 static void schr_find_good_asso_values(struct Search *t)
1115 s32 asso_iterations;
1116 struct Keyword_List *saved_head;
1117 s32 best_initial_asso_value;
1118 s32 best_jump;
1119 s32 *best_asso_values;
1120 s32 best_collisions;
1121 s32 best_max_hash_value;
1123 schr_prepare_asso_values(t);
1125 /* search for good _asso_values[] */
1126 asso_iterations = options->asso_iterations;
1127 if (asso_iterations == 0) {
1128 schr_find_asso_values(t);
1129 return;
1132 * Try different pairs of _initial_asso_value and _jump, in the
1133 * following order:
1134 * (0, 1)
1135 * (1, 1)
1136 * (2, 1) (0, 3)
1137 * (3, 1) (1, 3)
1138 * (4, 1) (2, 3) (0, 5)
1139 * (5, 1) (3, 3) (1, 5)
1140 *.....
1142 saved_head = t->head;
1143 best_initial_asso_value = 0;
1144 best_jump = 1;
1145 best_asso_values = calloc(t->alpha_size, sizeof(*best_asso_values));
1146 best_collisions = S32_MAX;
1147 best_max_hash_value = S32_MAX;
1149 t->initial_asso_value = 0;
1150 t->jump = 1;
1151 loop {
1152 s32 collisions;
1153 s32 max_hash_value;
1154 struct Keyword_List *ptr;
1155 /* restore the keyword list in its original order */
1156 t->head = copy_list(saved_head);
1157 /* find good _asso_values[] */
1158 schr_find_asso_values(t);
1159 /* test whether it is the best solution so far */
1160 collisions = 0;
1161 max_hash_value = S32_MIN;
1162 ba_clear(t->collision_detector);
1163 ptr = t->head;
1164 loop {
1165 struct Keyword *kw;
1166 s32 hashcode;
1168 if (ptr == 0)
1169 break;
1170 kw = ptr->kw;
1171 hashcode = schr_compute_hash(t, kw);
1172 if (max_hash_value < hashcode)
1173 max_hash_value = hashcode;
1174 if (ba_set_bit(t->collision_detector, hashcode))
1175 ++collisions;
1176 ptr = ptr->next;
1178 if (collisions < best_collisions || (collisions == best_collisions
1179 && max_hash_value < best_max_hash_value)) {
1180 memcpy(best_asso_values, t->asso_values, t->alpha_size
1181 * sizeof(*(t->asso_values)));
1182 best_collisions = collisions;
1183 best_max_hash_value = max_hash_value;
1185 /* delete the copied keyword list */
1186 delete_list(t->head);
1188 asso_iterations--;
1189 if (asso_iterations == 0)
1190 break;
1191 /* prepare for next iteration */
1192 if (t->initial_asso_value >= 2) {
1193 t->initial_asso_value -= 2;
1194 t->jump += 2;
1195 } else {
1196 t->initial_asso_value += t->jump;
1197 t->jump = 1;
1200 t->head = saved_head;
1201 /* install the best found asso_values */
1202 t->initial_asso_value = best_initial_asso_value;
1203 t->jump = best_jump;
1204 memcpy(t->asso_values, best_asso_values, t->alpha_size * sizeof(*(t->asso_values)));
1205 free(best_asso_values);
1206 }/*}}}*/
1207 /*{{{ schr_find_asso_values */
1208 static void schr_find_asso_values(struct Search *t)
1210 struct Step *steps;
1211 struct Step *step;
1212 u32 stepno;
1214 /* determine the steps, starting with the last one */
1216 bool *undetermined;
1217 bool *determined;
1218 s32 c;
1220 steps = 0;
1222 undetermined = calloc(t->alpha_size, sizeof(*undetermined));
1223 c = 0;
1224 loop {
1225 if (c >= t->alpha_size)
1226 break;
1227 undetermined[c] = false;
1228 ++c;
1230 determined = calloc(t->alpha_size, sizeof(*determined));
1231 c = 0;
1232 loop {
1233 if (c >= t->alpha_size)
1234 break;
1235 determined[c] = true;
1236 ++c;
1238 loop {
1239 /* compute the partition that needs to be refined */
1240 struct EquivalenceClass *partition;
1241 u32 chosen_c;
1242 u32 chosen_possible_collisions;
1243 struct Step *step;
1244 u32 c;
1245 u32 *changing;
1246 u32 changing_count;
1248 partition = schr_compute_partition(t, undetermined);
1250 * Determine the main character to be chosen in this step. Choosing such a
1251 * character c has the effect of splitting every equivalence class
1252 * (according the the frequency of occurrence of c). We choose the c with
1253 * the minimum number of possible collisions, so that characters which lead
1254 * to a large number of collisions get handled early during the search.
1257 u32 best_c;
1258 u32 best_possible_collisions;
1259 u32 c;
1261 best_c = 0;
1262 best_possible_collisions = U32_MAX;
1263 c = 0;
1264 loop {
1265 if (c >= t->alpha_size)
1266 break;
1267 if (t->occurrences[c] > 0 && determined[c]) {
1268 u32 possible_collisions;
1270 possible_collisions =
1271 schr_count_possible_collisions(t,
1272 partition, c);
1273 if (possible_collisions
1274 < best_possible_collisions) {
1275 best_c = c;
1276 best_possible_collisions =
1277 possible_collisions;
1280 ++c;
1282 if (best_possible_collisions == U32_MAX) {
1284 * All c with occurrences[c] > 0 are undetermined. We are
1285 * are the starting situation and don't need any more step.
1287 delete_partition(partition);
1288 break;
1290 chosen_c = best_c;
1291 chosen_possible_collisions = best_possible_collisions;
1293 /* we need one more step */
1294 step = calloc(1, sizeof(*step));
1296 step->undetermined = calloc(t->alpha_size,
1297 sizeof(*(step->undetermined)));
1298 memcpy(step->undetermined, undetermined, t->alpha_size
1299 * sizeof(*(step->undetermined)));
1300 step->partition = partition;
1301 /* now determine how the equivalence classes will be before this step */
1302 undetermined[chosen_c] = true;
1303 partition = schr_compute_partition(t, undetermined);
1305 * Now determine which other characters should be determined in this step,
1306 * because they will not change the equivalence classes at this point. It
1307 * is the set of all c which, for all equivalence classes, have the same
1308 * frequency of occurrence in every keyword of the equivalence class.
1310 c = 0;
1311 loop {
1312 if (c >= t->alpha_size)
1313 break;
1314 if (t->occurrences[c] > 0 && determined[c]
1315 && schr_unchanged_partition(t, partition, c)) {
1316 undetermined[c] = true;
1317 determined[c] = false;
1319 ++c;
1321 /* main_c must be one of these */
1322 if (determined[chosen_c])
1323 abort();
1324 /* now the set of changing characters of this step */
1325 changing_count = 0;
1326 c = 0;
1327 loop {
1328 if (c >= t->alpha_size)
1329 break;
1330 if (undetermined[c] && !step->undetermined[c])
1331 ++changing_count;
1332 ++c;
1334 changing = calloc(changing_count, sizeof(*changing));
1335 changing_count = 0;
1336 c = 0;
1337 loop {
1338 if (c >= t->alpha_size)
1339 break;
1340 if (undetermined[c] && !step->undetermined[c]) {
1341 changing[changing_count] = c;
1342 ++changing_count;
1344 ++c;
1346 step->changing = changing;
1347 step->changing_count = changing_count;
1348 step->asso_value_max = t->asso_value_max;
1349 step->expected_lower = exp((f64)(chosen_possible_collisions)
1350 / (f64)(t->max_hash_value));
1351 step->expected_upper = exp((f64)(chosen_possible_collisions)
1352 / (f64)(t->asso_value_max));
1353 delete_partition(partition);
1354 step->next = steps;
1355 steps = step;
1357 free(determined);
1358 free(undetermined);
1360 if (OPTS(DEBUG)) {
1361 u32 stepno;
1362 struct Step *step;
1364 stepno = 0;
1365 step = steps;
1366 loop {
1367 u32 i;
1368 struct EquivalenceClass *cls;
1370 if (step == 0)
1371 break;
1372 ++stepno;
1373 fprintf(stderr, "Step %u chooses _asso_values[", stepno);
1374 i = 0;
1375 loop {
1376 if (i >= step->changing_count)
1377 break;
1378 if (i > 0)
1379 fprintf(stderr, ",");
1380 fprintf(stderr, "'%c'", step->changing[i]);
1381 ++i;
1383 fprintf(stderr, "], expected number of iterations between %g and %g.\n", step->expected_lower, step->expected_upper);
1384 fprintf(stderr, "Keyword equivalence classes:\n");
1385 cls = step->partition;
1386 loop {
1387 struct Keyword_List *tmp;
1389 if (cls == 0)
1390 break;
1391 fprintf (stderr, "\n");
1392 tmp = cls->keywords;
1393 loop {
1394 struct Keyword *kw;
1396 if (tmp == 0)
1397 break;
1398 kw = tmp->kw;
1399 fprintf(stderr, " %.*s\n", kw->allchars_length, kw->allchars);
1400 tmp = tmp->next;
1402 cls = cls->next;
1404 fprintf(stderr, "\n");
1405 step = step->next;
1409 * Initialize _asso_values[]. (The value given here matters only for those c which occur in
1410 * all keywords with equal multiplicity.)
1412 memset(t->asso_values, 0, t->alpha_size * sizeof(*t->asso_values));
1413 stepno = 0;
1414 step = steps;
1415 loop {
1416 u32 k;
1417 u32 i;
1418 u32 iterations;
1419 u32 *iter;
1420 u32 ii;
1422 if (step == 0)
1423 break;
1424 ++stepno;
1425 /* initialize the asso_values[] */
1426 k = step->changing_count;
1427 i = 0;
1428 loop {
1429 u32 c;
1431 if (i >= k)
1432 break;
1433 c = step->changing[i];
1434 t->asso_values[c] = (t->initial_asso_value < 0 ? rand()
1435 : t->initial_asso_value) & (step->asso_value_max - 1);
1436 ++i;
1438 iterations = 0;
1439 iter = calloc(k, sizeof(*iter));
1440 ii = (t->jump != 0 ? k - 1 : 0);
1441 loop {
1442 bool has_collision;
1443 struct EquivalenceClass *cls;
1445 * test whether these asso_values[] lead to collisions among the equivalence
1446 * classes that should be collision-free
1448 has_collision = false;
1449 cls = step->partition;
1450 loop {
1451 struct Keyword_List *ptr;
1453 if (cls == 0)
1454 break;
1455 /* Iteration Number array is a win, O(1) initialization time! */
1456 ba_clear(t->collision_detector);
1457 ptr = cls->keywords;
1458 loop {
1459 struct Keyword *kw;
1460 s32 hashcode;
1462 if (ptr == 0)
1463 break;
1464 kw = ptr->kw;
1466 * compute the new hash code for the keyword, leaving apart
1467 * the yet undetermined asso_values[].
1470 s32 sum;
1471 u32 *p;
1472 s32 i;
1474 sum = t->hash_includes_len ? kw->allchars_length
1475 : 0;
1476 p = kw->selchars;
1477 i = kw->selchars_length;
1478 loop {
1479 if (i <= 0)
1480 break;
1481 if (!step->undetermined[*p])
1482 sum += t->asso_values[*p];
1483 ++p;
1484 i--;
1486 hashcode = sum;
1489 * See whether it collides with another keyword's hash code,
1490 * from the same equivalence class
1492 if (ba_set_bit(t->collision_detector, hashcode)) {
1493 has_collision = true;
1494 break;
1496 ptr = ptr->next;
1499 * don't need to continue looking at the other equivalence classes
1500 * if we already have found a collision
1502 if (has_collision)
1503 break;
1504 cls = cls->next;
1506 ++iterations;
1507 if (!has_collision)
1508 break;
1509 /* try other asso_values[] */
1510 if (t->jump != 0) {
1512 * The way we try various values for
1513 * asso_values[step->_changing[0],...step->_changing[k-1]]
1514 * is like this:
1515 * for (bound = 0,1,...)
1516 * for (ii = 0,...,k-1)
1517 * iter[ii] := bound
1518 * iter[0..ii-1] := values <= bound
1519 * iter[ii+1..k-1] := values < bound
1520 * and
1521 * asso_values[step->_changing[i]] =
1522 * _initial_asso_value + iter[i] * _jump.
1523 * This makes it more likely to find small asso_values[].
1525 u32 bound;
1526 s32 i;
1528 bound = iter[ii];
1529 i = 0;
1530 loop {
1531 u32 c;
1533 if (i >= ii)
1534 break;
1535 c = step->changing[i];
1536 ++(iter[i]);
1537 t->asso_values[c] = (t->asso_values[c] + t->jump)
1538 & (step->asso_value_max - 1);
1539 if (iter[i] <= bound)
1540 goto found_next;
1541 t->asso_values[c] = (t->asso_values[c] - iter[i] * t->jump)
1542 & (step->asso_value_max - 1);
1543 iter[i] = 0;
1544 ++i;
1546 i = ii + 1;
1547 loop {
1548 u32 c;
1550 if (i >= k)
1551 break;
1552 c = step->changing[i];
1553 ++(iter[i]);
1554 t->asso_values[c] = (t->asso_values[c] + t->jump)
1555 & (step->asso_value_max - 1);
1556 if (iter[i] < bound)
1557 goto found_next;
1558 t->asso_values[c] = (t->asso_values[c] - iter[i] * t->jump)
1559 & (step->asso_value_max - 1);
1560 iter[i] = 0;
1561 ++i;
1563 /* switch from one ii to the next */
1565 u32 c;
1567 c = step->changing[ii];
1568 t->asso_values[c] = (t->asso_values[c] - bound * t->jump)
1569 & (step->asso_value_max - 1);
1570 iter[ii] = 0;
1572 /* here all iter[i] == 0 */
1573 ++ii;
1574 if (ii == k) {
1575 ii = 0;
1576 ++bound;
1577 if (bound == step->asso_value_max) {
1579 * Out of search space! We can either backtrack, or
1580 * increase the available search space of this step.
1581 * It seems simpler to choose the latter solution.
1583 step->asso_value_max = 2 * step->asso_value_max;
1584 if (step->asso_value_max > t->asso_value_max) {
1585 t->asso_value_max = step->asso_value_max;
1586 /* reinitialize _max_hash_value */
1587 t->max_hash_value = (t->hash_includes_len ? t->max_key_len : 0)
1588 + (t->asso_value_max - 1) * t->max_selchars_length;
1589 /* reinitialize _collision_detector */
1590 free(t->collision_detector);
1591 t->collision_detector = ba_new(
1592 t->max_hash_value + 1);
1597 u32 c;
1599 c = step->changing[ii];
1600 iter[ii] = bound;
1601 t->asso_values[c] = (t->asso_values[c] + bound * t->jump)
1602 & (step->asso_value_max - 1);
1604 found_next: ;
1605 } else {
1606 /* random */
1607 u32 c;
1609 c = step->changing[ii];
1610 t->asso_values[c] = (t->asso_values[c] + rand())
1611 & (step->asso_value_max - 1);
1612 /* next time, change the next c */
1613 ++ii;
1614 if (ii == k)
1615 ii = 0;
1618 free(iter);
1619 if (OPTS(DEBUG)) {
1620 u32 i;
1622 fprintf(stderr, "Step %u chose _asso_values[", stepno);
1623 i = 0;
1624 loop {
1625 if (i >= step->changing_count)
1626 break;
1627 if (i > 0)
1628 fprintf(stderr, ",");
1629 fprintf(stderr, "'%c'", step->changing[i]);
1630 ++i;
1632 fprintf (stderr, "] in %u iterations.\n", iterations);
1634 step = step->next;
1636 /* free allocated memory */
1637 loop {
1638 struct Step *step;
1640 if (steps == 0)
1641 break;
1642 step = steps;
1643 steps = step->next;
1644 free(step->changing);
1645 free(step->undetermined);
1646 delete_partition(step->partition);
1647 free(step);
1649 }/*}}}*/
1650 /*{{{ chr_count_possible_collisions */
1652 * compute the possible number of collisions when asso_values[c] is chosen, leading to the given
1653 * partition
1655 static u32 schr_count_possible_collisions(struct Search *t,
1656 struct EquivalenceClass *partition, u32 c)
1659 * Every equivalence class p is split according to the frequency of occurrence of c, leading
1660 * to equivalence classes p1, p2, ...
1661 * This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions.
1662 * Return the sum of this expression over all equivalence classes.
1664 u32 sum;
1665 u32 m;
1666 u32 *split_cardinalities;
1667 struct EquivalenceClass *cls;
1669 sum = 0;
1670 m = t->max_selchars_length;
1671 split_cardinalities = calloc(m + 1, sizeof(*split_cardinalities));
1672 cls = partition;
1673 loop {
1674 u32 i;
1675 struct Keyword_List *tmp;
1677 if (cls == 0)
1678 break;
1679 memset(split_cardinalities, 0, (m + 1) * sizeof(*split_cardinalities));
1680 tmp = cls->keywords;
1681 loop {
1682 struct Keyword *kw;
1683 u32 count;
1684 s32 i;
1686 if (tmp == 0)
1687 break;
1688 kw = tmp->kw;
1689 count = 0;
1690 i = 0;
1691 loop {
1692 if (i >= kw->selchars_length)
1693 break;
1694 if (kw->selchars[i] == c)
1695 ++count;
1696 ++i;
1698 ++(split_cardinalities[count]);
1699 tmp = tmp->next;
1701 sum += cls->cardinality * cls->cardinality;
1702 i = 0;
1703 loop {
1704 if (i > m)
1705 break;
1706 sum -= split_cardinalities[i] * split_cardinalities[i];
1707 ++i;
1709 cls = cls->next;
1711 free(split_cardinalities);
1712 return sum;
1714 /*}}}*/
1715 /*{{{ schr_compute_partition */
1716 static struct EquivalenceClass *schr_compute_partition(struct Search *t, bool *undetermined)
1718 struct EquivalenceClass *partition;
1719 struct EquivalenceClass *partition_last;
1720 struct Keyword_List *tmp;
1721 struct EquivalenceClass *cls;
1723 partition = 0;
1724 partition_last = 0;
1725 tmp = t->head;
1726 loop {
1727 struct Keyword *kw;
1728 u32 *undetermined_chars;
1729 u32 undetermined_chars_length;
1730 s32 i;
1731 struct EquivalenceClass *equclass;
1732 struct Keyword_List *cons;
1734 if (tmp == 0)
1735 break;
1736 kw = tmp->kw;
1737 undetermined_chars = calloc(kw->selchars_length, sizeof(*undetermined_chars));
1738 undetermined_chars_length = 0;
1739 i = 0;
1740 loop {
1741 if (i >= kw->selchars_length)
1742 break;
1743 if (undetermined[kw->selchars[i]] != 0) {
1744 undetermined_chars[undetermined_chars_length] = kw->selchars[i];
1745 ++undetermined_chars_length;
1747 ++i;
1749 /* look up the equivalence class to which this keyword belongs */
1750 equclass = partition;
1751 loop {
1752 if (equclass == 0)
1753 break;
1754 if (equclass->undetermined_chars_length == undetermined_chars_length
1755 && equals(equclass->undetermined_chars, undetermined_chars,
1756 undetermined_chars_length))
1757 break;
1758 equclass = equclass->next;
1760 if (equclass == 0) {
1761 equclass = calloc(1, sizeof(*equclass));
1762 equclass->undetermined_chars = undetermined_chars;
1763 equclass->undetermined_chars_length = undetermined_chars_length;
1764 if (partition != 0)
1765 partition_last->next = equclass;
1766 else
1767 partition = equclass;
1768 partition_last = equclass;
1769 } else
1770 free(undetermined_chars);
1771 /* add the keyword to the equivalence class */
1772 cons = kwl_new(kw);
1773 if (equclass->keywords != 0)
1774 equclass->keywords_last->next = cons;
1775 else
1776 equclass->keywords = cons;
1777 equclass->keywords_last = cons;
1778 ++(equclass->cardinality);
1779 tmp = tmp->next;
1781 /* Free some of the allocated memory. The caller doesn't need it. */
1782 cls = partition;
1783 loop {
1784 if (cls == 0)
1785 break;
1786 free(cls->undetermined_chars);
1787 cls = cls->next;
1789 return partition;
1790 }/*}}}*/
1791 /*{{{ schr_prepare_asso_values */
1792 /* initializes the asso_values[] related parameters */
1793 static void schr_prepare_asso_values(struct Search *t)
1795 struct Keyword_List *tmp;
1796 struct PositionIterator *iter;
1797 s32 non_linked_length;
1798 u32 asso_value_max;
1800 /* initialize each keyword's _selchars array */
1801 schr_init_selchars_multiset(t, t->key_positions, t->alpha_unify, t->alpha_inc);
1802 /* compute the maximum _selchars_length over all keywords */
1803 iter = pos_iterator(t->key_positions, t->max_key_len);
1804 t->max_selchars_length = positer_remaining(iter);
1805 positer_del(iter);
1807 * Check for duplicates, i.e. keywords with the same _selchars array (and - if
1808 * _hash_includes_len - also the same length). We deal with these by building an
1809 * equivalence class, so that only 1 keyword is representative of the entire collection.
1810 * Only this representative remains in the keyword list; the others are accessible through
1811 * the _duplicate_link chain, starting at the representative. This *greatly* simplifies
1812 * processing during later stages of the program. Set _total_duplicates and _list_len =
1813 * _total_keys - _total_duplicates.
1816 struct Hash_Table *representatives;
1817 struct Keyword_List *prev;
1819 t->list_len = t->total_keys;
1820 t->total_duplicates = 0;
1822 representatives = ht_new(t->list_len, !t->hash_includes_len);
1823 prev = 0; /* list node before temp */
1824 tmp = t->head;
1825 loop {
1826 struct Keyword *kw;
1827 struct Keyword *other_kw;
1828 struct Keyword_List *garbage;
1830 if (tmp == 0)
1831 break;
1832 kw = tmp->kw;
1833 other_kw = ht_insert(representatives, kw);
1834 garbage = 0;
1835 if (other_kw != 0) {
1836 ++(t->total_duplicates);
1837 (t->list_len)--;
1838 /* remove keyword from the main list */
1839 prev->next = tmp->next;
1840 garbage = tmp;
1841 /* and insert it on other_keyword's duplicate list */
1842 kw->duplicate_link = other_kw->duplicate_link;
1843 other_kw->duplicate_link = kw;
1844 /* complain if user hasn't enabled the duplicate option */
1845 if (!OPTS(DUP) || OPTS(DEBUG)) {
1846 s32 j;
1848 fprintf(stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"", kw->allchars_length, kw->allchars, other_kw->allchars_length, other_kw->allchars);
1849 j = 0;
1850 loop {
1851 if (j >= kw->selchars_length)
1852 break;
1853 putc(kw->selchars[j], stderr);
1854 ++j;
1856 fprintf(stderr, "\".\n");
1858 } else {
1859 kw->duplicate_link = 0;
1860 prev = tmp;
1862 tmp = tmp->next;
1863 if (garbage != 0)
1864 kwl_del(garbage);
1866 if (OPTS(DEBUG))
1867 ht_dump(representatives);
1868 ht_del(representatives);
1871 * Exit program if duplicates exists and option[DUP] not set, since we don't want to
1872 * continue in this case. (We don't want to turn on option[DUP] implicitly, because the
1873 * generated code is usually much slower.
1875 if (t->total_duplicates != 0) {
1876 if (OPTS(DUP))
1877 fprintf(stderr, "%d input keys have identical hash values, examine output carefully...\n", t->total_duplicates);
1878 else {
1879 fprintf(stderr, "%d input keys have identical hash values,\n", t->total_duplicates);
1880 if (OPTS(POSITIONS))
1881 fprintf(stderr, "try different key positions or use option -D.\n");
1882 else
1883 fprintf(stderr, "use option -D.\n");
1884 exit(1);
1887 /* compute the occurrences of each character in the alphabet */
1888 t->occurrences = calloc(t->alpha_size, sizeof(*(t->occurrences)));
1889 tmp = t->head;
1890 loop {
1891 struct Keyword *kw;
1892 u32 *ptr;
1893 s32 count;
1895 if (tmp == 0)
1896 break;
1897 kw = tmp->kw;
1898 ptr = kw->selchars;
1899 count = kw->selchars_length;
1900 loop {
1901 if (count <= 0)
1902 break;
1903 ++(t->occurrences[*ptr]);
1904 ++ptr;
1905 count--;
1907 tmp = tmp->next;
1909 /* memory allocation */
1910 t->asso_values = calloc(t->alpha_size, sizeof(*(t->asso_values)));
1911 non_linked_length = t->list_len;
1912 asso_value_max = (u32)(non_linked_length * options->size_multiple);
1914 * Round up to the next power of two. This makes it easy to ensure an _asso_value[c] is >= 0
1915 * and < asso_value_max. Also, the jump value being odd, it guarantees that
1916 * Search::try_asso_value() will iterate through different values for _asso_value[c].
1918 if (asso_value_max == 0)
1919 asso_value_max = 1;
1920 asso_value_max |= asso_value_max >> 1;
1921 asso_value_max |= asso_value_max >> 2;
1922 asso_value_max |= asso_value_max >> 4;
1923 asso_value_max |= asso_value_max >> 8;
1924 asso_value_max |= asso_value_max >> 16;
1925 ++asso_value_max;
1926 t->asso_value_max = asso_value_max;
1928 * given the bound for _asso_values[c], we have a bound for the possible hash values, as
1929 * computed in compute_hash()
1931 t->max_hash_value = (t->hash_includes_len ? t->max_key_len : 0) + (t->asso_value_max - 1)
1932 * t->max_selchars_length;
1933 /* allocate a sparse bit vector for detection of collisions of hash values */
1934 t->collision_detector = ba_new(t->max_hash_value + 1);
1935 if (OPTS(DEBUG)) {
1936 s32 field_width;
1937 s32 i;
1939 fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d\nmaximum size of generated hash table is %d\n", non_linked_length, asso_value_max, t->max_hash_value);
1940 field_width = 0;
1942 struct Keyword_List *tmp;
1944 tmp = t->head;
1945 loop {
1946 struct Keyword *kw;
1948 if (tmp == 0)
1949 break;
1950 kw = tmp->kw;
1951 if (field_width < kw->selchars_length)
1952 field_width = kw->selchars_length;
1953 tmp = tmp->next;
1956 fprintf (stderr, "\ndumping the keyword list without duplicates\n");
1957 fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
1958 i = 0;
1959 tmp = t->head;
1960 loop {
1961 struct Keyword *kw;
1962 s32 j;
1964 if (tmp == 0)
1965 break;
1966 kw = tmp->kw;
1967 ++i;
1968 fprintf(stderr, "%9d, ", i);
1969 if (field_width > kw->selchars_length)
1970 fprintf(stderr, "%*s", field_width - kw->selchars_length, "");
1971 j = 0;
1972 loop {
1973 if (j >= kw->selchars_length)
1974 break;
1975 putc(kw->selchars[j], stderr);
1976 ++j;
1978 fprintf(stderr, ", %.*s\n", kw->allchars_length, kw->allchars);
1979 tmp = tmp->next;
1981 fprintf(stderr, "\nend of keyword list\n\n");
1983 if (OPTS(RANDOM) || options->jump == 0)
1984 srand((long)time(0));
1985 t->initial_asso_value = (OPTS(RANDOM) ? -1 : options->initial_asso_value);
1986 t->jump = options->jump;
1987 }/*}}}*/
1988 /*{{{ schr_unchanged_partition */
1989 /* Test whether adding c to the undetermined characters changes the given partition. */
1990 static bool schr_unchanged_partition(struct Search *t, struct EquivalenceClass *partition, u32 c)
1992 struct EquivalenceClass *cls;
1994 cls = partition;
1995 loop {
1996 u32 first_count;
1997 struct Keyword_List *tmp;
1999 if (cls == 0)
2000 break;
2001 first_count = U32_MAX;
2002 tmp = cls->keywords;
2003 loop {
2004 struct Keyword *kw;
2005 u32 count;
2006 s32 i;
2008 if (tmp == 0)
2009 break;
2010 kw = tmp->kw;
2011 count = 0;
2012 i = 0;
2013 loop {
2014 if (i >= kw->selchars_length)
2015 break;
2016 if (kw->selchars[i] == c)
2017 ++count;
2018 ++i;
2020 if (tmp == cls->keywords)
2021 first_count = count;
2022 else if (count != first_count)
2023 /* c would split this equivalence class */
2024 return false;
2025 tmp = tmp->next;
2027 cls = cls->next;
2029 return true;
2031 /*}}}*/
2032 /*{{{ schr_compute_hash */
2034 * computes a keyword's hash value, relative to the current _asso_values[], and stores it in
2035 * keyword->_hash_value
2037 static s32 schr_compute_hash(struct Search *t, struct Keyword *kw)
2039 s32 sum;
2040 u32 *p;
2041 s32 i;
2043 sum = t->hash_includes_len ? kw->allchars_length : 0;
2044 p = kw->selchars;
2045 i = kw->selchars_length;
2046 loop {
2047 if (i <= 0)
2048 break;
2049 sum += t->asso_values[*p];
2050 ++p;
2051 i--;
2053 kw->hash_value = sum;
2054 return sum;
2055 }/*}}}*/
2056 /*{{{ schr_sort */
2057 static void schr_sort(struct Search *t)
2059 t->head = mergesort_list(t->head, less_by_hash_value);
2061 /*}}}*/
2062 /*------------------------------------------------------------------------------------------------*/
2063 #define EPILOG
2064 #include "namespace/globals.h"
2065 #include "namespace/search.h"
2066 #include "namespace/keyword.h"
2067 #include "namespace/keyword_list.h"
2068 #include "namespace/options.h"
2069 #include "namespace/positions.h"
2070 #include "namespace/hash-table.h"
2071 #include "namespace/bool-array.h"
2072 #include "namespace/search.c"
2073 #undef EPILOG
2074 /*------------------------------------------------------------------------------------------------*/
2075 #endif