search.c

   1 #ifndef CGPERF_SEARCH_C
   2 #define CGPERF_SEARCH_C
   3 #include <stdlib.h>
   4 #include <string.h>
   5 #include <time.h>
   6 #include <math.h>
   7 #include "c_fixing.h"
   8 #include "globals.h"
   9 #include "search.h"
  10 #include "keyword.h"
  11 #include "keyword_list.h"
  12 #include "options.h"
  13 #include "positions.h"
  14 #include "hash-table.h"
  15 #include "bool-array.h"
  16 /*------------------------------------------------------------------------------------------------*/
  17 #include "namespace/globals.h"
  18 #include "namespace/search.h"
  19 #include "namespace/keyword.h"
  20 #include "namespace/keyword_list.h"
  21 #include "namespace/options.h"
  22 #include "namespace/positions.h"
  23 #include "namespace/hash-table.h"
  24 #include "namespace/bool-array.h"
  25 #include "namespace/search.c"
  26 /*------------------------------------------------------------------------------------------------*/
  27 /*{{{ THEORY */
  28 /* The general form of the hash function is
  29
  30       hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
  31                        + len (keyword)
  32
  33    where Pos is a set of byte positions,
  34    each alpha_inc[i] is a nonnegative integer,
  35    each asso_values[c] is a nonnegative integer,
  36    len (keyword) is the keyword's length if _hash_includes_len, or 0 otherwise.
  37
  38    Theorem 1: If all keywords are different, there is a set Pos such that
  39    all tuples (keyword[i] : i in Pos) are different.
  40
  41    Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
  42    are nonnegative integers alpha_inc[i] such that all multisets
  43    {keyword[i] + alpha_inc[i] : i in Pos} are different.
  44
  45    Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
  46
  47    Theorem 3: If all multisets selchars[keyword] are different, there are
  48    nonnegative integers asso_values[c] such that all hash values
  49    sum (asso_values[c] : c in selchars[keyword]) are different.
  50
  51    Based on these three facts, we find the hash function in three steps:
  52
  53    Step 1 (Finding good byte positions):
  54    Find a set Pos, as small as possible, such that all tuples
  55    (keyword[i] : i in Pos) are different.
  56
  57    Step 2 (Finding good alpha increments):
  58    Find nonnegative integers alpha_inc[i], as many of them as possible being
  59    zero, and the others being as small as possible, such that all multisets
  60    {keyword[i] + alpha_inc[i] : i in Pos} are different.
  61
  62    Step 3 (Finding good asso_values):
  63    Find asso_values[c] such that all hash (keyword) are different.
  64
  65    In other words, each step finds a projection that is injective on the
  66    given finite set:
  67      proj1 : String --> Map (Pos --> N)
  68      proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
  69      proj3 : Map (Pos --> N) / S(Pos) --> N
  70    where
  71      N denotes the set of nonnegative integers,
  72      Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
  73      S(Pos) is the symmetric group over Pos.
  74
  75    This was the theory for !_hash_includes_len; if _hash_includes_len, slight
  76    modifications apply:
  77      proj1 : String --> Map (Pos --> N) x N
  78      proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
  79      proj3 : Map (Pos --> N) / S(Pos) x N --> N
  80
  81    For a case-insensitive hash function, the general form is
  82
  83       hash (keyword) =
  84         sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
  85         + len (keyword)
  86
  87    where alpha_unify[c] is chosen so that an upper/lower case change in
  88    keyword[i] doesn't change  alpha_unify[keyword[i] + alpha_inc[i]].
  89  *//*}}} THEORY -- END */
  90 /*{{{ finding asso_values[] that fit
  91    The idea is to choose the _asso_values[] one by one, in a way that
  92    a choice that has been made never needs to be undone later.  This
  93    means that we split the work into several steps.  Each step chooses
  94    one or more _asso_values[c].  The result of choosing one or more
  95    _asso_values[c] is that the partitioning of the keyword set gets
  96    broader.
  97    Look at this partitioning:  After every step, the _asso_values[] of a
  98    certain set C of characters are undetermined.  (At the beginning, C
  99    is the set of characters c with _occurrences[c] > 0.  At the end, C
 100    is empty.)  To each keyword K, we associate the multiset of _selchars
 101    for which the _asso_values[] are undetermined:
 102                     K  -->  K->_selchars intersect C.
 103    Consider two keywords equivalent if their value under this mapping is
 104    the same.  This introduces an equivalence relation on the set of
 105    keywords.  The equivalence classes partition the keyword set.  (At the
 106    beginning, the partition is the finest possible: each K is an equivalence
 107    class by itself, because all K have a different _selchars.  At the end,
 108    all K have been merged into a single equivalence class.)
 109    The partition before a step is always a refinement of the partition
 110    after the step.
 111    We choose the steps in such a way that the partition really becomes
 112    broader at each step.  (A step that only chooses an _asso_values[c]
 113    without changing the partition is better merged with the previous step,
 114    to avoid useless backtracking.) }}}*/
 115 /*------------------------------------------------------------------------------------------------*/
 116 /*{{{ local */
 117 /*{{{ types */
 118 struct EquivalenceClass
 119 {
 120         /* the keywords in this equivalence class */
 121         struct Keyword_List *keywords;
 122         struct Keyword_List  *keywords_last;
 123         /* the number of keywords in this equivalence class */
 124         u32 cardinality;
 125         /*
 126          * the undetermined selected characters for the keywords in this equivalence class, as a
 127          * canonically reordered multiset
 128          */
 129         u32 *undetermined_chars;
 130         u32 undetermined_chars_length;
 131
 132         struct EquivalenceClass *next;
 133 };
 134
 135 struct Step
 136 {
 137         /* the characters whose values are being determined in this step */
 138         u32 changing_count;
 139         u32 *changing;
 140         /*
 141          * Exclusive upper bound for the _asso_values[c] of this step. A power
 142          * of 2.
 143          */
 144         u32 asso_value_max;
 145         /* the characters whose values will be determined after this step */
 146         bool *undetermined;
 147         /* the keyword set partition after this step */
 148         struct EquivalenceClass *partition;
 149         /* the expected number of iterations in this step */
 150         f64 expected_lower;
 151         f64 expected_upper;
 152
 153         struct Step *next;
 154 };
 155 /*}}} types -- END */
 156 /*{{{ code */
 157 /*{{{ equals */
 158 static bool equals(u32 *ptr1, u32 *ptr2, u32 len)
 159 {
 160         loop {
 161                 if (len == 0)
 162                         break;
 163                 if (*ptr1 != *ptr2)
 164                         return false;
 165                 ++ptr1;
 166                 ++ptr2;
 167                 len--;
 168         }
 169         return true;
 170 }/*}}}*/
 171 static void delete_partition(struct EquivalenceClass *partition)
 172 {
 173         loop {
 174                 struct EquivalenceClass *equclass;
 175
 176                 if (partition == 0)
 177                         break;
 178                 equclass = partition;
 179                 partition = equclass->next;
 180                 delete_list(equclass->keywords);
 181                 free(equclass);
 182         }
 183 }
 184 static bool less_by_hash_value(struct Keyword *kw1, struct Keyword *kw2)
 185 {
 186         return kw1->hash_value < kw2->hash_value;
 187 }
 188 /*}}} code -- END */
 189 /*}}} local -- END */
 190 /*------------------------------------------------------------------------------------------------*/
 191 /*{{{ schr_new */
 192 static struct Search *schr_new(struct Keyword_List *list)
 193 {
 194         struct Search *t;
 195
 196         t = calloc(1, sizeof(*t));
 197         t->head = list;
 198         t->key_positions = pos_new();
 199         return t;
 200 }/*}}}*/
 201 /*{{{ schr_del */
 202 static void schr_del(struct Search *t)
 203 {
 204         ba_del(t->collision_detector);
 205         if (OPTS(DEBUG)) {
 206                 u32 i;
 207                 s32 field_width;
 208                 struct Keyword_List *ptr;
 209
 210                 fprintf(stderr, "\ndumping occurrence and associated values tables\n");
 211                 i = 0;
 212                 loop {
 213                         if (i >= t->alpha_size)
 214                                 break;
 215                         if (t->occurrences[i])
 216                                 fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n", i, t->asso_values[i], i, t->occurrences[i]);
 217                         ++i;
 218                 }
 219                 fprintf(stderr, "end table dumping\n");
 220                 fprintf(stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n", t->list_len, t->total_keys, t->total_duplicates, t->max_key_len);
 221                 field_width = t->max_selchars_length;
 222                 fprintf(stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n", field_width, "selchars");
 223                 ptr = t->head;
 224                 loop {
 225                         s32 j;
 226
 227                         if (ptr == 0)
 228                                 break;
 229                         fprintf(stderr, "%11d,%11d,%6d, ", ptr->kw->hash_value, ptr->kw->allchars_length, ptr->kw->final_index);
 230                         if (field_width > ptr->kw->selchars_length)
 231                                 fprintf(stderr, "%*s", field_width - ptr->kw->selchars_length, "");
 232                         j = 0;
 233                         loop {
 234                                 if (j >= ptr->kw->selchars_length)
 235                                         break;
 236                                 putc(ptr->kw->selchars[j], stderr);
 237                                 ++j;
 238                         }
 239                         fprintf(stderr, ", %.*s\n", ptr->kw->allchars_length, ptr->kw->allchars);
 240                         ptr = ptr->next;
 241                 }
 242                 fprintf(stderr, "End dumping list.\n\n");
 243         }
 244         pos_del(t->key_positions);
 245         free(t->asso_values);
 246         free(t->occurrences);
 247         free(t->alpha_unify);
 248         free(t->alpha_inc);
 249         free(t);
 250 }/*}}}*/
 251 /*{{{ schr_optimize */
 252 static void schr_optimize(struct Search *t)
 253 {
 254         struct Keyword_List *curr_ptr;
 255         s32 max_hash_value;
 256         u32 c;
 257
 258         /* preparations */
 259         schr_prepare(t);
 260
 261         /* Step 1: Finding good byte positions. */
 262         schr_find_positions(t);
 263
 264         /* Step 2: Finding good alpha increments. */
 265         schr_find_alpha_inc(t);
 266
 267         /* Step 3: Finding good asso_values. */
 268         schr_find_good_asso_values(t);
 269         /* Make one final check, just to make sure nothing weird happened.... */
 270         ba_clear(t->collision_detector);
 271         curr_ptr = t->head;
 272         loop {
 273                 struct Keyword *curr;
 274                 u32 hashcode;
 275
 276                 if (curr_ptr == 0)
 277                         break;
 278                 curr = curr_ptr->kw;
 279                 hashcode = schr_compute_hash(t, curr);
 280                 if (ba_set_bit(t->collision_detector, hashcode)) {
 281                         /*
 282                          * This shouldn't happen. proj1, proj2, proj3 must have been computed to be
 283                          * injective on the given keyword set.
 284                          */
 285                         fprintf(stderr, "\nInternal error, unexpected duplicate hash code\n");
 286                         if (OPTS(POSITIONS))
 287                                 fprintf(stderr, "try options -m or -r, or use new key positions.\n\n");
 288                         else
 289                                 fprintf(stderr, "try options -m or -r.\n\n");
 290                         exit(1);
 291                 }
 292                 curr_ptr = curr_ptr->next;
 293         }
 294         /* sorts the keyword list by hash value */
 295         schr_sort(t);
 296         /*
 297          * Set unused asso_values[c] to max_hash_value + 1.  This is not absolutely necessary, but
 298          * speeds up the lookup function in many cases of lookup failure: no string comparison is
 299          * needed once the hash value of a string is larger than the hash value of any keyword.
 300          */
 301         {
 302                 struct Keyword_List *tmp;
 303
 304                 tmp = t->head;
 305                 loop {
 306                         if (tmp->next == 0)
 307                                 break;
 308                         tmp = tmp->next;
 309                 }
 310                 max_hash_value = tmp->kw->hash_value;
 311         }
 312         c = 0;
 313         loop {
 314                 if (c >= t->alpha_size)
 315                         break;
 316                 if (t->occurrences[c] == 0)
 317                         t->asso_values[c] = max_hash_value + 1;
 318                 ++c;
 319         }
 320         /* propagate unified asso_values */
 321         if (t->alpha_unify) {
 322                 u32 c;
 323
 324                 c = 0;
 325                 loop {
 326                         if (c >= t->alpha_size)
 327                                 break;
 328                         if (t->alpha_unify[c] != c)
 329                                 t->asso_values[c] = t->asso_values[t->alpha_unify[c]];
 330                         ++c;
 331                 }
 332         }
 333 }/*}}}*/
 334 /*{{{ schr_prepare */
 335 static void schr_prepare(struct Search *t)
 336 {
 337         struct Keyword_List *tmp;
 338
 339         t->total_keys = 0;
 340         tmp = t->head;
 341         loop {
 342                 if (tmp == 0)
 343                         break;
 344                 ++(t->total_keys);
 345                 tmp = tmp->next;
 346         }
 347         /* compute the minimum and maximum keyword length */
 348         t->max_key_len = S32_MIN;
 349         t->min_key_len = S32_MAX;
 350         tmp = t->head;
 351         loop {
 352                 struct Keyword *kw;
 353
 354                 if (tmp == 0)
 355                         break;
 356                 kw = tmp->kw;
 357                 if (t->max_key_len < kw->allchars_length)
 358                         t->max_key_len = kw->allchars_length;
 359                 if (t->min_key_len > kw->allchars_length)
 360                         t->min_key_len = kw->allchars_length;
 361                 tmp = tmp->next;
 362         }
 363         /*
 364          * exit program if an empty string is used as keyword, since the comparison expressions
 365          * don't work correctly for looking up an empty string
 366          */
 367         if (t->min_key_len == 0) {
 368                 fprintf (stderr, "Empty input keyword is not allowed.\nTo recognize an empty input keyword, your code should check for\nlen == 0 before calling the gperf generated lookup function.\n");
 369                 exit(1);
 370         }
 371         /* exit program if the characters in the keywords are not in the required range */
 372         if (OPTS(SEVENBIT)) {
 373                 tmp = t->head;
 374                 loop {
 375                         struct Keyword *kw;
 376                         u8 *k;
 377                         s32 i;
 378
 379                         if (tmp == 0)
 380                                 break;
 381                         kw = tmp->kw;
 382                         k = kw->allchars;
 383                         i = kw->allchars_length;
 384                         loop {
 385                                 if (i <= 0)
 386                                         break;
 387                                 if (!(*k < 128)) {
 388                                         fprintf(stderr, "Option --seven-bit has been specified,\nbut keyword \"%.*s\" contains non-ASCII characters.\nTry removing option --seven-bit.\n", kw->allchars_length, kw->allchars);
 389                                         exit(1);
 390                                 }
 391                                 i--;
 392                                 ++k;
 393                         }
 394                         tmp = tmp->next;
 395                 }
 396         }
 397         /* determine whether the hash function shall include the length */
 398         t->hash_includes_len = !(OPTS(NOLENGTH) || (t->min_key_len == t->max_key_len));
 399 }/*}}}*/
 400 /*{{{ schr_find_positions */
 401 /* find good key positions */
 402 static void schr_find_positions(struct Search *t)
 403 {
 404         u32 *alpha_unify;
 405         s32 imax;
 406         struct Positions *mandatory;
 407         struct Positions *current;
 408         u32 current_duplicates_count;
 409         /* if the user gave the key positions, we use them */
 410         if (OPTS(POSITIONS)) {
 411                 pos_cpy(t->key_positions, options->key_positions);
 412                 return;
 413         }
 414         /* compute preliminary alpha_unify table */
 415         alpha_unify = schr_compute_alpha_unify(t);
 416
 417         /* 1. find positions that must occur in order to distinguish duplicates */
 418         mandatory = pos_new();
 419         if (!OPTS(DUP)) {
 420                 struct Keyword_List *l1;
 421
 422                 l1 = t->head;
 423                 loop {
 424                         struct Keyword *kw1;
 425                         struct Keyword_List *l2;
 426
 427                         if (l1 == 0 || l1->next == 0)
 428                                 break;
 429                         kw1 = l1->kw;
 430                         l2 = l1->next;
 431                         loop {
 432                                 struct Keyword *kw2;
 433
 434                                 if (l2 == 0)
 435                                         break;
 436                                 kw2 = l2->kw;
 437                                 /*
 438                                  * if keyword1 and keyword2 have the same length and differ
 439                                  * in just one position, and it is not the last character,
 440                                  * this position is mandatory
 441                                  */
 442                                 if (kw1->allchars_length == kw2->allchars_length) {
 443                                         s32 n;
 444                                         s32 i;
 445
 446                                         n = kw1->allchars_length;
 447                                         i = 0;
 448                                         loop {
 449                                                 u32 c1;
 450                                                 u32 c2;
 451
 452                                                 if (i >= (n - 1))
 453                                                         break;
 454                                                 c1 = kw1->allchars[i];
 455                                                 c2 = kw2->allchars[i];
 456                                                 if (OPTS(UPPERLOWER)) {
 457                                                         if (c1 >= 'A' && c1 <= 'Z')
 458                                                                 c1 += 'a' - 'A';
 459                                                         if (c2 >= 'A' && c2 <= 'Z')
 460                                                                 c2 += 'a' - 'A';
 461                                                 }
 462                                                 if (c1 != c2)
 463                                                         break;
 464                                                 ++i;
 465                                         }
 466                                         if (i < (n - 1)) {
 467                                                 s32 j;
 468
 469                                                 j = i + 1;
 470                                                 loop {
 471                                                         u32 c1;
 472                                                         u32 c2;
 473
 474                                                         if (j >= n)
 475                                                                 break;
 476                                                         c1 = kw1->allchars[j];
 477                                                         c2 = kw2->allchars[j];
 478                                                         if (OPTS(UPPERLOWER)) {
 479                                                                 if (c1 >= 'A' && c1 <= 'Z')
 480                                                                         c1 += 'a' - 'A';
 481                                                                 if (c2 >= 'A' && c2 <= 'Z')
 482                                                                         c2 += 'a' - 'A';
 483                                                         }
 484                                                         if (c1 != c2)
 485                                                                 break;
 486                                                         ++j;
 487                                                 }
 488                                                 if (j >= n) {
 489                                                         /* position i is mandatory */
 490                                                         if (!pos_contains(mandatory, i))
 491                                                                 pos_add(mandatory, i);
 492                                                 }
 493                                         }
 494                                 }
 495                                 l2 = l2->next;
 496                         }
 497                         l1 = l1->next;
 498                 }
 499         }
 500         /* 2. add positions, as long as this decreases the duplicates count */
 501         imax = (t->max_key_len - 1 < (s32)POS_MAX_KEY_POS - 1 ?  t->max_key_len - 1
 502                                                                         : (s32)POS_MAX_KEY_POS - 1);
 503         current = pos_new();
 504         pos_cpy(current, mandatory);
 505         current_duplicates_count = schr_count_duplicates_tuple_do(t, current, alpha_unify);
 506         loop {
 507                 struct Positions *best;
 508                 u32 best_duplicates_count;
 509                 s32 i;
 510
 511                 best = pos_new();
 512                 best_duplicates_count = U32_MAX;
 513                 i = imax;
 514                 loop {
 515                         if (i < -1)
 516                                 break;
 517                         if (!pos_contains(current, i)) {
 518                                 struct Positions *tryal;
 519                                 u32 try_duplicates_count;
 520
 521                                 tryal = pos_new();
 522                                 pos_cpy(tryal, current);
 523                                 pos_add(tryal, i);
 524                                 try_duplicates_count = schr_count_duplicates_tuple_do(t, tryal,
 525                                                                                 alpha_unify);
 526                                 /*
 527                                  * We prefer 'try' to 'best' if it produces less duplicates, or if
 528                                  * it produces the same number of duplicates but with a more
 529                                  * efficient hash function.
 530                                  */
 531                                 if (try_duplicates_count < best_duplicates_count
 532                                                 || (try_duplicates_count == best_duplicates_count
 533                                                                                         && i >=0)) {
 534                                         pos_cpy(best, tryal);
 535                                         best_duplicates_count = try_duplicates_count;
 536                                 }
 537                                 pos_del(tryal);
 538                         }
 539                         i--;
 540                 }
 541                 /* stop adding positions when it gives no improvement */
 542                 if (best_duplicates_count >= current_duplicates_count)
 543                         break;
 544                 pos_cpy(current, best);
 545                 pos_del(best);
 546                 current_duplicates_count = best_duplicates_count;
 547         }
 548         /* 3. remove positions, as long as this doesn't increase the duplicates count */
 549         loop {
 550                 struct Positions *best;
 551                 u32 best_duplicates_count;
 552                 s32 i;
 553
 554                 best = pos_new();
 555                 best_duplicates_count = U32_MAX;
 556                 i = imax;
 557                 loop {
 558                         if (i < -1)
 559                                 break;
 560                         if (pos_contains(current, i) && !pos_contains(mandatory, i)) {
 561                                 struct Positions *tryal;
 562                                 u32 try_duplicates_count;
 563
 564                                 tryal = pos_new();
 565                                 pos_cpy(tryal, current);
 566                                 pos_remove(tryal, i);
 567                                 try_duplicates_count = schr_count_duplicates_tuple_do(t, tryal,
 568                                                                                 alpha_unify);
 569                                 /*
 570                                  * We prefer 'try' to 'best' if it produces less duplicates, or if
 571                                  * it produces the same number of duplicates but with a more
 572                                  * efficient hash function.
 573                                  */
 574                                 if (try_duplicates_count < best_duplicates_count
 575                                                 || (try_duplicates_count == best_duplicates_count
 576                                                                                 && i == -1)) {
 577                                         pos_cpy(best, tryal);
 578                                         best_duplicates_count = try_duplicates_count;
 579                                 }
 580                         }
 581                         i--;
 582                 }
 583                 /* stop removing positions when it gives no improvement */
 584                 if (best_duplicates_count > current_duplicates_count)
 585                         break;
 586                 pos_cpy(current, best);
 587                 pos_del(best);
 588                 current_duplicates_count = best_duplicates_count;
 589         }
 590         /* 4. replace two positions by one, as long as this doesn't increase the duplicates count */
 591         loop {
 592                 struct Positions *best;
 593                 u32 best_duplicates_count;
 594                 s32 i1;
 595
 596                 best = pos_new();
 597                 best_duplicates_count = U32_MAX;
 598                 /*
 599                  * Loop over all pairs { i1, i2 } of currently selected positions. W.l.o.g. we can
 600                  * assume i1 > i2.
 601                  */
 602                 i1 = imax;
 603                 loop {
 604                         if (i1 < -1)
 605                                 break;
 606                         if (pos_contains(current, i1) && !pos_contains(mandatory, i1)) {
 607                                 s32 i2;
 608
 609                                 i2 = i1 - 1;
 610                                 loop {
 611                                         if (i2 < -1)
 612                                                 break;
 613                                         if (pos_contains(current, i2) && !pos_contains(mandatory, i2)) {
 614                                                 s32 i3;
 615
 616                                                 i3 = imax;
 617                                                 loop {
 618                                                         if (i3 < -1)
 619                                                                 break;
 620                                                         if (!pos_contains(current, i3)) {
 621                                                                 struct Positions *tryal;
 622                                                                 u32 try_duplicates_count;
 623
 624                                                                 tryal = pos_new();
 625                                                                 pos_cpy(tryal, current);
 626                                                                 pos_remove(tryal, i1);
 627                                                                 pos_remove(tryal, i2);
 628                                                                 pos_add(tryal, i3);
 629                                                                 try_duplicates_count = schr_count_duplicates_tuple_do(t, tryal, alpha_unify);
 630                                                                 /*
 631                                                                  * We prefer 'try' to 'best' if it produces less
 632                                                                  * duplicates, or if it produces the same number
 633                                                                  * of duplicates but with a more efficient hash
 634                                                                  * function.
 635                                                                  */
 636                                                                 if (try_duplicates_count < best_duplicates_count
 637                                                                         || (try_duplicates_count == best_duplicates_count
 638                                                                                 && (i1 == -1 || i2 == -1) && i3 >= 0)) {
 639                                                                         pos_cpy(best, tryal);
 640                                                                         best_duplicates_count = try_duplicates_count;
 641                                                                 }
 642                                                                 pos_del(tryal);
 643                                                         }
 644                                                         i3--;
 645                                                 }
 646                                         }
 647                                         i2--;
 648                                 }
 649                         }
 650                         i1--;
 651                 }
 652                 /* stop removing positions when it gives no improvement */
 653                 if (best_duplicates_count > current_duplicates_count)
 654                         break;
 655                 pos_cpy(current, best);
 656                 current_duplicates_count = best_duplicates_count;
 657                 pos_del(best);
 658         }
 659         /* That's it. Hope it's good enough. */
 660         pos_cpy(t->key_positions, current);
 661         if (OPTS(DEBUG)) {
 662                 struct PositionReverseIterator *iter;
 663                 bool seen_lastchar;
 664                 bool first;
 665                 s32 i;
 666                 /* Print the result. */
 667                 fprintf(stderr, "\nComputed positions: ");
 668                 iter = pos_reviterator(t->key_positions);
 669                 seen_lastchar = false;
 670                 first = true;
 671                 loop {
 672                         i = posrevit_next(iter);
 673                         if (i == POSREVIT_EOS)
 674                                 break;
 675                         if (!first)
 676                                 fprintf(stderr, ", ");
 677                         if (i == POS_LASTCHAR)
 678                                 seen_lastchar = true;
 679                         else {
 680                                 fprintf(stderr, "%d", i + 1);
 681                                 first = false;
 682                         }
 683                 }
 684                 if (seen_lastchar) {
 685                         if (!first)
 686                                 fprintf(stderr, ", ");
 687                         fprintf(stderr, "$");
 688                 }
 689                 fprintf(stderr, "\n");
 690                 posrevit_del(iter);
 691         }
 692         pos_del(current);
 693         pos_del(mandatory);
 694         free(alpha_unify);
 695 }/*}}}*/
 696 /*{{{ schr_compute_alpha_size */
 697 /* computes the upper bound on the indices passed to asso_values[], assuming no alpha_increments */
 698 static u32 schr_compute_alpha_size(void)
 699 {
 700         return (OPTS(SEVENBIT) ? 128 : 256);
 701 }/*}}}*/
 702 /*{{{ schr_compute_alpha_unify */
 703 static u32 *schr_compute_alpha_unify(struct Search *t)
 704 {
 705         if (OPTS(UPPERLOWER)) {
 706                 /* uppercase to lowercase mapping */
 707                 u32 alpha_size;
 708                 u32 *alpha_unify;
 709                 u32 c;
 710
 711                 alpha_size = schr_compute_alpha_size();
 712                 alpha_unify = calloc(alpha_size, sizeof(*alpha_unify));
 713                 c = 0;
 714                 loop {
 715                         if (c >= alpha_size)
 716                                 break;
 717                         alpha_unify[c] = c;
 718                         ++c;
 719                 }
 720                 c = 'A';
 721                 loop {
 722                         if (c > 'Z')
 723                                 break;
 724                         alpha_unify[c] = c + ('a' - 'A');
 725                         ++c;
 726                 }
 727                 return alpha_unify;
 728         } else
 729                 /* identity mapping */
 730                 return 0;
 731 }/*}}}*/
 732 /*{{{ schr_find_alpha_inc */
 733 /* find good alpha_inc[] */
 734 static void schr_find_alpha_inc(struct Search *t)
 735 {
 736         /*
 737          * The goal is to choose _alpha_inc[] such that it doesn't introduce artificial duplicates.
 738          * In other words, the goal is: # proj2 (proj1 (K)) = # proj1 (K).
 739          */
 740         u32 duplicates_goal;
 741         u32 *current;
 742         s32 i;
 743         u32 current_duplicates_count;
 744
 745         duplicates_goal = schr_count_duplicates_tuple(t);
 746         current = calloc(t->max_key_len, sizeof(*current));
 747         i = 0;
 748         loop {
 749                 if (i >= t->max_key_len)
 750                         break;
 751                 current[i] = 0;
 752                 ++i;
 753         }
 754         current_duplicates_count = schr_count_duplicates_multiset(t, current);
 755         if (current_duplicates_count > duplicates_goal) {
 756                 /* look which _alpha_inc[i] we are free to increment */
 757                 u32 nindices;
 758                 u32 *indices;
 759                 u32 *best;
 760                 u32 *tryal;
 761                 {
 762                         struct PositionIterator *iter;
 763
 764                         nindices = 0;
 765                         iter = pos_iterator(t->key_positions, t->max_key_len);
 766                         loop {
 767                                 s32 key_pos;
 768
 769                                 key_pos = positer_next(iter);
 770                                 if (key_pos == POSITER_EOS)
 771                                         break;
 772                                 if (key_pos != POS_LASTCHAR)
 773                                         ++nindices;
 774                         }
 775                         positer_del(iter);
 776                 }
 777                 indices = calloc(nindices, sizeof(*indices));
 778                 {
 779                         u32 j;
 780                         struct PositionIterator *iter;
 781
 782                         j = 0;
 783                         iter = pos_iterator(t->key_positions, t->max_key_len);
 784                         loop {
 785                                 s32 key_pos;
 786
 787                                 key_pos = positer_next(iter);
 788                                 if (key_pos == POSITER_EOS)
 789                                         break;
 790                                 if (key_pos != POS_LASTCHAR) {
 791                                         indices[j] = key_pos;
 792                                         ++j;
 793                                 }
 794                         }
 795                         if (!(j == nindices))
 796                                 abort();
 797                         positer_del(iter);
 798                 }
 799                 /*
 800                  * Perform several rounds of searching for a good alpha increment. Each round
 801                  * reduces the number of artificial collisions by adding an increment in a single
 802                  * key position.
 803                  */
 804                 best = calloc(t->max_key_len, sizeof(*best));
 805                 tryal = calloc(t->max_key_len, sizeof(*best));
 806                 loop {
 807                         u32 inc;
 808                         /* An increment of 1 is not always enough. Try higher increments also. */
 809                         inc = 1;
 810                         loop {
 811                                 u32 best_duplicates_count;
 812                                 u32 j;
 813
 814                                 best_duplicates_count = U32_MAX;
 815                                 j = 0;
 816                                 loop {
 817                                         u32 try_duplicates_count;
 818
 819                                         if (j >= nindices)
 820                                                 break;
 821                                         memcpy(tryal, current, t->max_key_len * sizeof(u32));
 822                                         tryal[indices[j]] += inc;
 823                                         try_duplicates_count = schr_count_duplicates_multiset(t,
 824                                                                                         tryal);
 825                                         /* we prefer 'try' to 'best' if it produces less duplicates */
 826                                         if (try_duplicates_count < best_duplicates_count) {
 827                                                 memcpy(best, tryal, t->max_key_len * sizeof(u32));
 828                                                 best_duplicates_count = try_duplicates_count;
 829                                         }
 830                                         ++j;
 831                                 }
 832                                 /* stop this round when we got an improvement */
 833                                 if (best_duplicates_count < current_duplicates_count) {
 834                                         memcpy(current, best, t->max_key_len * sizeof(u32));
 835                                         current_duplicates_count = best_duplicates_count;
 836                                         break;
 837                                 }
 838                                 ++inc;
 839                         }
 840                         if (current_duplicates_count <= duplicates_goal)
 841                                 break;
 842                 }
 843                 free(tryal);
 844                 free(best);
 845
 846                 if (OPTS(DEBUG)) {
 847                         bool first;
 848                         u32 j;
 849                         /* print the result */
 850                         fprintf(stderr, "\nComputed alpha increments: ");
 851                         first = true;
 852                         j = nindices;
 853                         loop {
 854                                 if (j <= 0)
 855                                         break;
 856                                 j--;
 857                                 if (current[indices[j]] != 0) {
 858                                         if (!first)
 859                                                 fprintf(stderr, ", ");
 860                                         fprintf(stderr, "%u:+%u", indices[j] + 1, current[indices[j]]);
 861                                         first = false;
 862                                 }
 863                         }
 864                         fprintf(stderr, "\n");
 865                 }
 866                 free(indices);
 867         }
 868         t->alpha_inc = current;
 869         t->alpha_size = schr_compute_alpha_size_with_inc(t, t->alpha_inc);
 870         t->alpha_unify = schr_compute_alpha_unify_with_inc(t, t->key_positions, t->alpha_inc);
 871 }/*}}}*/
 872 /*{{{ schr_count_duplicates_tuple_do */
 873 /*
 874  * Count the duplicate keywords that occur with a given set of positions. In other words, it
 875  * returns the difference # K - # proj1 (K) where K is the multiset of given keywords.
 876  */
 877 static u32 schr_count_duplicates_tuple_do(struct Search *t, struct Positions *positions,
 878                                                                                 u32 *alpha_unify)
 879 {
 880         u32 count;
 881         /*
 882          * Run through the keyword list and count the duplicates incrementally.  The result does not
 883          * depend on the order of the keyword list, thanks to the formula above.
 884          */
 885         schr_init_selchars_tuple(t, positions, alpha_unify);
 886         count = 0;
 887         {
 888                 struct Hash_Table *representatives;
 889                 struct Keyword_List *tmp;
 890
 891                 representatives = ht_new(t->total_keys, !t->hash_includes_len);
 892                 tmp = t->head;
 893                 loop {
 894                         struct Keyword *kw;
 895
 896                         if (tmp == 0)
 897                                 break;
 898                         kw = tmp->kw;
 899                         if (ht_insert(representatives, kw))
 900                                 ++count;
 901                         tmp = tmp->next;
 902                 }
 903                 ht_del(representatives);
 904         }
 905         schr_delete_selchars(t);
 906         return count;
 907 }/*}}}*/
 908 /*{{{ schr_init_selchars_tuple */
 909 static void schr_init_selchars_tuple(struct Search *t, struct Positions *positions,
 910                                                                                 u32 *alpha_unify)
 911 {
 912         struct Keyword_List *tmp;
 913
 914         tmp = t->head;
 915         loop {
 916                 if (tmp == 0)
 917                         break;
 918                 kw_init_selchars_tuple(tmp->kw, positions, alpha_unify);
 919                 tmp = tmp->next;
 920         }
 921 }/*}}}*/
 922 /*{{{ schr_delete_selchars */
 923 static void schr_delete_selchars(struct Search *t)
 924 {
 925         struct Keyword_List *tmp;
 926
 927         tmp = t->head;
 928         loop {
 929                 if (tmp == 0)
 930                         break;
 931                 kw_delete_selchars(tmp->kw);
 932                 tmp = tmp->next;
 933         }
 934 }/*}}}*/
 935 /*{{{ schr_count_duplicates_tuple */
 936 /*
 937  * Count the duplicate keywords that occur with the found set of positions.  In other words, it
 938  * returns the difference # K - # proj1 (K) where K is the multiset of given keywords.
 939  */
 940 static u32 schr_count_duplicates_tuple(struct Search *t)
 941 {
 942         u32 *alpha_unify;
 943         u32 count;
 944
 945         alpha_unify = schr_compute_alpha_unify(t);
 946         count = schr_count_duplicates_tuple_do(t, t->key_positions, alpha_unify);
 947         free(alpha_unify);
 948         return count;
 949 }/*}}}*/
 950 /*{{{ schr_count_duplicates_multiset */
 951 /*
 952  * Count the duplicate keywords that occur with the given set of positions and a given alpha_inc[]
 953  * array.
 954  * In other words, it returns the difference
 955  * # K - # proj2 (proj1 (K))
 956  * where K is the multiset of given keywords.
 957  */
 958 static u32 schr_count_duplicates_multiset(struct Search *t, u32 *alpha_inc)
 959 {
 960         /*
 961          * Run through the keyword list and count the duplicates incrementally. The result does not
 962          * depend on the order of the keyword list, thanks to the formula above.
 963          */
 964         u32 *alpha_unify;
 965         u32 count;
 966
 967         alpha_unify = schr_compute_alpha_unify_with_inc(t, t->key_positions, alpha_inc);
 968         schr_init_selchars_multiset(t, t->key_positions, alpha_unify, alpha_inc);
 969         count = 0;
 970         {
 971                 struct Hash_Table *representatives;
 972                 struct Keyword_List *tmp;
 973
 974                 representatives = ht_new(t->total_keys, !t->hash_includes_len);
 975                 tmp = t->head;
 976                 loop {
 977                         struct Keyword *kw;
 978
 979                         if (tmp == 0)
 980                                 break;
 981                         kw = tmp->kw;
 982                         if (ht_insert(representatives, kw))
 983                                 ++count;
 984                         tmp = tmp->next;
 985                 }
 986                 ht_del(representatives);
 987         }
 988         schr_delete_selchars(t);
 989         free(alpha_unify);
 990         return count;
 991 }/*}}}*/
 992 /*{{{ schr_compute_alpha_unify_with_inc */
 993 /* computes the unification rules between different asso_values[c] */
 994 static u32 *schr_compute_alpha_unify_with_inc(struct Search *t, struct Positions *positions,
 995                                                                                 u32 *alpha_inc)
 996 {
 997         if (OPTS(UPPERLOWER)) {
 998                 /*
 999                  * Without alpha increments, we would simply unify
1000                  * 'A' -> 'a', ..., 'Z' -> 'z'.
1001                  * But when a keyword contains at position i a character c,
1002                  * we have the constraint
1003                  * asso_values[tolower(c) + alpha_inc[i]] ==
1004                  * asso_values[toupper(c) + alpha_inc[i]].
1005                  * This introduces a unification
1006                  * toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
1007                  * Note that this unification can extend outside the range of
1008                  * ASCII letters! But still every unified character pair is at
1009                  * a distance of 'a'-'A' = 32, or (after chained unification)
1010                  * at a multiple of 32.  So in the end the alpha_unify vector
1011                  * has the form    c -> c + 32 * f(c)   where f(c) is a
1012                  * nonnegative integer.
1013                  */
1014                 u32 alpha_size;
1015                 u32 *alpha_unify;
1016                 u32 c;
1017                 struct Keyword_List *tmp;
1018
1019                 alpha_size = schr_compute_alpha_size_with_inc(t, alpha_inc);
1020                 alpha_unify = calloc(alpha_size, sizeof(*alpha_unify));
1021                 c = 0;
1022                 loop {
1023                         if (c >= alpha_size)
1024                                 break;
1025                         alpha_unify[c] = c;
1026                         ++c;
1027                 }
1028                 tmp = t->head;
1029                 loop {
1030                         struct Keyword *kw;
1031                         struct PositionIterator *iter;
1032                         s32 i;
1033
1034                         if (tmp == 0)
1035                                 break;
1036                         kw = tmp->kw;
1037                         iter = pos_iterator(positions, kw->allchars_length);
1038                         loop {
1039                                 u32 c;
1040
1041                                 i = positer_next(iter);
1042                                 if (i == POSITER_EOS)
1043                                         break;
1044                                 if (i == POS_LASTCHAR)
1045                                         c = (u8)(kw->allchars[kw->allchars_length - 1]);
1046                                 else if (i < kw->allchars_length)
1047                                         c = (u8)(kw->allchars[i]);
1048                                 else
1049                                         abort();
1050                                 if (c >= 'A' && c <= 'Z')
1051                                         c += 'a' - 'A';
1052                                 if (c >= 'a' && c <= 'z') {
1053                                         u32 d;
1054                                         u32 b;
1055                                         s32 a;
1056
1057                                         if (i != POS_LASTCHAR)
1058                                                 c += alpha_inc[i];
1059                                         /* unify c with c - ('a'-'A') */
1060                                         d = alpha_unify[c];
1061                                         b = c - ('a' - 'A');
1062                                         a = b;
1063                                         loop {
1064                                                 if (a < 0 || alpha_unify[a] != b)
1065                                                         break;
1066                                                 alpha_unify[a] = d;
1067                                                 a -= ('a' - 'A');
1068                                         }
1069                                 }
1070                         }
1071                         positer_del(iter);
1072                         tmp = tmp->next;
1073                 }
1074                 return alpha_unify;
1075         } else
1076                 /* identity mapping */
1077                 return 0;
1078 }/*}}}*/
1079 /*{{{ schr_compute_alpha_size_with_inc */
1080 /* computes the upper bound on the indices passed to asso_values[] */
1081 static u32 schr_compute_alpha_size_with_inc(struct Search *t, u32 *alpha_inc)
1082 {
1083         u32 max_alpha_inc;
1084         s32 i;
1085
1086         max_alpha_inc = 0;
1087         i = 0;
1088         loop {
1089                 if (i >= t->max_key_len)
1090                         break;
1091                 if (max_alpha_inc < alpha_inc[i])
1092                         max_alpha_inc = alpha_inc[i];
1093                 ++i;
1094         }
1095         return (OPTS(SEVENBIT) ? 128 : 256) + max_alpha_inc;
1096 }/*}}}*/
1097 /*{{{ schr_init_selchars_multiset */
1098 static void schr_init_selchars_multiset(struct Search *t, struct Positions *positions,
1099                                                                 u32 *alpha_unify, u32 *alpha_inc)
1100 {
1101         struct Keyword_List *tmp;
1102
1103         tmp = t->head;
1104         loop {
1105                 if (tmp == 0)
1106                         break;
1107                 kw_init_selchars_multiset(tmp->kw, positions, alpha_unify, alpha_inc);
1108                 tmp = tmp->next;
1109         }
1110 }/*}}}*/
1111 /*{{{ schr_find_good_asso_values */
1112 /* finds good _asso_values[] */
1113 static void schr_find_good_asso_values(struct Search *t)
1114 {
1115         s32 asso_iterations;
1116         struct Keyword_List *saved_head;
1117         s32 best_initial_asso_value;
1118         s32 best_jump;
1119         s32 *best_asso_values;
1120         s32 best_collisions;
1121         s32 best_max_hash_value;
1122
1123         schr_prepare_asso_values(t);
1124
1125         /* search for good _asso_values[] */
1126         asso_iterations = options->asso_iterations;
1127         if (asso_iterations == 0) {
1128                 schr_find_asso_values(t);
1129                 return;
1130         }
1131         /*
1132          * Try different pairs of _initial_asso_value and _jump, in the
1133          * following order:
1134          * (0, 1)
1135          * (1, 1)
1136          * (2, 1) (0, 3)
1137          * (3, 1) (1, 3)
1138          * (4, 1) (2, 3) (0, 5)
1139          * (5, 1) (3, 3) (1, 5)
1140          *.....
1141          */
1142         saved_head = t->head;
1143         best_initial_asso_value = 0;
1144         best_jump = 1;
1145         best_asso_values = calloc(t->alpha_size, sizeof(*best_asso_values));
1146         best_collisions = S32_MAX;
1147         best_max_hash_value = S32_MAX;
1148
1149         t->initial_asso_value = 0;
1150         t->jump = 1;
1151         loop {
1152                 s32 collisions;
1153                 s32 max_hash_value;
1154                 struct Keyword_List *ptr;
1155                 /* restore the keyword list in its original order */
1156                 t->head = copy_list(saved_head);
1157                 /* find good _asso_values[] */
1158                 schr_find_asso_values(t);
1159                 /* test whether it is the best solution so far */
1160                 collisions = 0;
1161                 max_hash_value = S32_MIN;
1162                 ba_clear(t->collision_detector);
1163                 ptr = t->head;
1164                 loop {
1165                         struct Keyword *kw;
1166                         s32 hashcode;
1167
1168                         if (ptr == 0)
1169                                 break;
1170                         kw = ptr->kw;
1171                         hashcode = schr_compute_hash(t, kw);
1172                         if (max_hash_value < hashcode)
1173                                 max_hash_value = hashcode;
1174                         if (ba_set_bit(t->collision_detector, hashcode))
1175                                 ++collisions;
1176                         ptr = ptr->next;
1177                 }
1178                 if (collisions < best_collisions || (collisions == best_collisions
1179                                                          && max_hash_value < best_max_hash_value)) {
1180                         memcpy(best_asso_values, t->asso_values, t->alpha_size
1181                                                                 * sizeof(*(t->asso_values)));
1182                         best_collisions = collisions;
1183                         best_max_hash_value = max_hash_value;
1184                 }
1185                 /* delete the copied keyword list */
1186                 delete_list(t->head);
1187
1188                 asso_iterations--;
1189                 if (asso_iterations == 0)
1190                         break;
1191                 /* prepare for next iteration */
1192                 if (t->initial_asso_value >= 2) {
1193                         t->initial_asso_value -= 2;
1194                         t->jump += 2;
1195                 } else {
1196                         t->initial_asso_value += t->jump;
1197                         t->jump = 1;
1198                 }
1199         }
1200         t->head = saved_head;
1201         /* install the best found asso_values */
1202         t->initial_asso_value = best_initial_asso_value;
1203         t->jump = best_jump;
1204         memcpy(t->asso_values, best_asso_values, t->alpha_size * sizeof(*(t->asso_values)));
1205         free(best_asso_values);
1206 }/*}}}*/
1207 /*{{{ schr_find_asso_values */
1208 static void schr_find_asso_values(struct Search *t)
1209 {
1210         struct Step *steps;
1211         struct Step *step;
1212         u32 stepno;
1213
1214         /* determine the steps, starting with the last one */
1215         {
1216                 bool *undetermined;
1217                 bool *determined;
1218                 s32 c;
1219
1220                 steps = 0;
1221
1222                 undetermined = calloc(t->alpha_size, sizeof(*undetermined));
1223                 c = 0;
1224                 loop {
1225                         if (c >= t->alpha_size)
1226                                 break;
1227                         undetermined[c] = false;
1228                         ++c;
1229                 }
1230                 determined = calloc(t->alpha_size, sizeof(*determined));
1231                 c = 0;
1232                 loop {
1233                         if (c >= t->alpha_size)
1234                                 break;
1235                         determined[c] = true;
1236                         ++c;
1237                 }
1238                 loop {
1239                         /* compute the partition that needs to be refined */
1240                         struct EquivalenceClass *partition;
1241                         u32 chosen_c;
1242                         u32 chosen_possible_collisions;
1243                         struct Step *step;
1244                         u32 c;
1245                         u32 *changing;
1246                         u32 changing_count;
1247
1248                         partition = schr_compute_partition(t, undetermined);
1249                         /*
1250                          * Determine the main character to be chosen in this step. Choosing such a
1251                          * character c has the effect of splitting every equivalence class
1252                          * (according the the frequency of occurrence of c). We choose the c with
1253                          * the minimum number of possible collisions, so that characters which lead
1254                          * to a large number of collisions get handled early during the search.
1255                          */
1256                         {
1257                                 u32 best_c;
1258                                 u32 best_possible_collisions;
1259                                 u32 c;
1260
1261                                 best_c = 0;
1262                                 best_possible_collisions = U32_MAX;
1263                                 c = 0;
1264                                 loop {
1265                                         if (c >= t->alpha_size)
1266                                                 break;
1267                                         if (t->occurrences[c] > 0 && determined[c]) {
1268                                                 u32 possible_collisions;
1269
1270                                                 possible_collisions =
1271                                                                 schr_count_possible_collisions(t,
1272                                                                                 partition, c);
1273                                                 if (possible_collisions
1274                                                                 < best_possible_collisions) {
1275                                                         best_c = c;
1276                                                         best_possible_collisions =
1277                                                                                 possible_collisions;
1278                                                 }
1279                                         }
1280                                         ++c;
1281                                 }
1282                                 if (best_possible_collisions == U32_MAX) {
1283                                         /*
1284                                          * All c with occurrences[c] > 0 are undetermined. We are
1285                                          * are the starting situation and don't need any more step.
1286                                          */
1287                                         delete_partition(partition);
1288                                         break;
1289                                 }
1290                                 chosen_c = best_c;
1291                                 chosen_possible_collisions = best_possible_collisions;
1292                         }
1293                         /* we need one more step */
1294                         step = calloc(1, sizeof(*step));
1295
1296                         step->undetermined = calloc(t->alpha_size,
1297                                                                 sizeof(*(step->undetermined)));
1298                         memcpy(step->undetermined, undetermined, t->alpha_size
1299                                                                 * sizeof(*(step->undetermined)));
1300                         step->partition = partition;
1301                         /* now determine how the equivalence classes will be before this step */
1302                         undetermined[chosen_c] = true;
1303                         partition = schr_compute_partition(t, undetermined);
1304                         /*
1305                          * Now determine which other characters should be determined in this step,
1306                          * because they will not change the equivalence classes at this point.  It
1307                          * is the set of all c which, for all equivalence classes, have the same
1308                          * frequency of occurrence in every keyword of the equivalence class.
1309                          */
1310                         c = 0;
1311                         loop {
1312                                 if (c >= t->alpha_size)
1313                                         break;
1314                                 if (t->occurrences[c] > 0 && determined[c]
1315                                                 && schr_unchanged_partition(t, partition, c)) {
1316                                         undetermined[c] = true;
1317                                         determined[c] = false;
1318                                 }
1319                                 ++c;
1320                         }
1321                         /* main_c must be one of these */
1322                         if (determined[chosen_c])
1323                                 abort();
1324                         /* now the set of changing characters of this step */
1325                         changing_count = 0;
1326                         c = 0;
1327                         loop {
1328                                 if (c >= t->alpha_size)
1329                                         break;
1330                                 if (undetermined[c] && !step->undetermined[c])
1331                                         ++changing_count;
1332                                 ++c;
1333                         }
1334                         changing = calloc(changing_count, sizeof(*changing));
1335                         changing_count = 0;
1336                         c = 0;
1337                         loop {
1338                                 if (c >= t->alpha_size)
1339                                         break;
1340                                 if (undetermined[c] && !step->undetermined[c]) {
1341                                         changing[changing_count] = c;
1342                                         ++changing_count;
1343                                 }
1344                                 ++c;
1345                         }
1346                         step->changing = changing;
1347                         step->changing_count = changing_count;
1348                         step->asso_value_max = t->asso_value_max;
1349                         step->expected_lower = exp((f64)(chosen_possible_collisions)
1350                                                                 / (f64)(t->max_hash_value));
1351                         step->expected_upper = exp((f64)(chosen_possible_collisions)
1352                                                                 / (f64)(t->asso_value_max));
1353                         delete_partition(partition);
1354                         step->next = steps;
1355                         steps = step;
1356                 }
1357                 free(determined);
1358                 free(undetermined);
1359         }
1360         if (OPTS(DEBUG)) {
1361                 u32 stepno;
1362                 struct Step *step;
1363
1364                 stepno = 0;
1365                 step = steps;
1366                 loop {
1367                         u32 i;
1368                         struct EquivalenceClass *cls;
1369
1370                         if (step == 0)
1371                                 break;
1372                         ++stepno;
1373                         fprintf(stderr, "Step %u chooses _asso_values[", stepno);
1374                         i = 0;
1375                         loop {
1376                                 if (i >= step->changing_count)
1377                                         break;
1378                                 if (i > 0)
1379                                         fprintf(stderr, ",");
1380                                 fprintf(stderr, "'%c'", step->changing[i]);
1381                                 ++i;
1382                         }
1383                         fprintf(stderr, "], expected number of iterations between %g and %g.\n", step->expected_lower, step->expected_upper);
1384                         fprintf(stderr, "Keyword equivalence classes:\n");
1385                         cls = step->partition;
1386                         loop {
1387                                 struct Keyword_List *tmp;
1388
1389                                 if (cls == 0)
1390                                         break;
1391                                 fprintf (stderr, "\n");
1392                                 tmp = cls->keywords;
1393                                 loop {
1394                                         struct Keyword *kw;
1395
1396                                         if (tmp == 0)
1397                                                 break;
1398                                         kw = tmp->kw;
1399                                         fprintf(stderr, "  %.*s\n", kw->allchars_length, kw->allchars);
1400                                         tmp = tmp->next;
1401                                 }
1402                                 cls = cls->next;
1403                         }
1404                         fprintf(stderr, "\n");
1405                         step = step->next;
1406                 }
1407         }
1408         /*
1409          * Initialize _asso_values[]. (The value given here matters only for those c which occur in
1410          * all keywords with equal multiplicity.)
1411          */
1412         memset(t->asso_values, 0, t->alpha_size * sizeof(*t->asso_values));
1413         stepno = 0;
1414         step = steps;
1415         loop {
1416                 u32 k;
1417                 u32 i;
1418                 u32 iterations;
1419                 u32 *iter;
1420                 u32 ii;
1421
1422                 if (step == 0)
1423                         break;
1424                 ++stepno;
1425                 /* initialize the asso_values[] */
1426                 k = step->changing_count;
1427                 i = 0;
1428                 loop {
1429                         u32 c;
1430
1431                         if (i >= k)
1432                                 break;
1433                         c = step->changing[i];
1434                         t->asso_values[c] = (t->initial_asso_value < 0 ?  rand()
1435                                         : t->initial_asso_value) & (step->asso_value_max - 1);
1436                         ++i;
1437                 }
1438                 iterations = 0;
1439                 iter = calloc(k, sizeof(*iter));
1440                 ii = (t->jump != 0 ? k - 1 : 0);
1441                 loop {
1442                         bool has_collision;
1443                         struct EquivalenceClass *cls;
1444                         /*
1445                          * test whether these asso_values[] lead to collisions among the equivalence
1446                          * classes that should be collision-free
1447                          */
1448                         has_collision = false;
1449                         cls = step->partition;
1450                         loop {
1451                                 struct Keyword_List *ptr;
1452
1453                                 if (cls == 0)
1454                                         break;
1455                                 /* Iteration Number array is a win, O(1) initialization time! */
1456                                 ba_clear(t->collision_detector);
1457                                 ptr = cls->keywords;
1458                                 loop {
1459                                         struct Keyword *kw;
1460                                         s32 hashcode;
1461
1462                                         if (ptr == 0)
1463                                                 break;
1464                                         kw = ptr->kw;
1465                                         /*
1466                                          * compute the new hash code for the keyword, leaving apart
1467                                          * the yet undetermined asso_values[].
1468                                          */
1469                                         {
1470                                                 s32 sum;
1471                                                 u32 *p;
1472                                                 s32 i;
1473
1474                                                 sum = t->hash_includes_len ? kw->allchars_length
1475                                                                                                 : 0;
1476                                                 p = kw->selchars;
1477                                                 i = kw->selchars_length;
1478                                                 loop {
1479                                                         if (i <= 0)
1480                                                                 break;
1481                                                         if (!step->undetermined[*p])
1482                                                                 sum += t->asso_values[*p];
1483                                                         ++p;
1484                                                         i--;
1485                                                 }
1486                                                 hashcode = sum;
1487                                         }
1488                                         /*
1489                                          * See whether it collides with another keyword's hash code,
1490                                          * from the same equivalence class
1491                                          */
1492                                         if (ba_set_bit(t->collision_detector, hashcode)) {
1493                                                 has_collision = true;
1494                                                 break;
1495                                         }
1496                                         ptr = ptr->next;
1497                                 }
1498                                 /*
1499                                  * don't need to continue looking at the other equivalence classes
1500                                  * if we already have found a collision
1501                                  */
1502                                 if (has_collision)
1503                                         break;
1504                                 cls = cls->next;
1505                         }
1506                         ++iterations;
1507                         if (!has_collision)
1508                                 break;
1509                         /* try other asso_values[] */
1510                         if (t->jump != 0) {
1511                                 /*
1512                                  * The way we try various values for
1513                                  *   asso_values[step->_changing[0],...step->_changing[k-1]]
1514                                  * is like this:
1515                                  * for (bound = 0,1,...)
1516                                  *   for (ii = 0,...,k-1)
1517                                  *     iter[ii] := bound
1518                                  *     iter[0..ii-1] := values <= bound
1519                                  *     iter[ii+1..k-1] := values < bound
1520                                  * and
1521                                  *   asso_values[step->_changing[i]] =
1522                                  *     _initial_asso_value + iter[i] * _jump.
1523                                  * This makes it more likely to find small asso_values[].
1524                                  */
1525                                 u32 bound;
1526                                 s32 i;
1527
1528                                 bound = iter[ii];
1529                                 i = 0;
1530                                 loop {
1531                                         u32 c;
1532
1533                                         if (i >= ii)
1534                                                 break;
1535                                         c = step->changing[i];
1536                                         ++(iter[i]);
1537                                         t->asso_values[c] = (t->asso_values[c] + t->jump)
1538                                                                 & (step->asso_value_max - 1);
1539                                         if (iter[i] <= bound)
1540                                                 goto found_next;
1541                                         t->asso_values[c] = (t->asso_values[c] - iter[i] * t->jump)
1542                                                                 & (step->asso_value_max - 1);
1543                                         iter[i] = 0;
1544                                         ++i;
1545                                 }
1546                                 i = ii + 1;
1547                                 loop {
1548                                         u32 c;
1549
1550                                         if (i >= k)
1551                                                 break;
1552                                         c = step->changing[i];
1553                                         ++(iter[i]);
1554                                         t->asso_values[c] = (t->asso_values[c] + t->jump)
1555                                                                 & (step->asso_value_max - 1);
1556                                         if (iter[i] < bound)
1557                                                 goto found_next;
1558                                         t->asso_values[c] = (t->asso_values[c] - iter[i] * t->jump)
1559                                                                 & (step->asso_value_max - 1);
1560                                         iter[i] = 0;
1561                                         ++i;
1562                                 }
1563                                 /* switch from one ii to the next */
1564                                 {
1565                                         u32 c;
1566
1567                                         c = step->changing[ii];
1568                                         t->asso_values[c] = (t->asso_values[c] - bound * t->jump)
1569                                                                 & (step->asso_value_max - 1);
1570                                         iter[ii] = 0;
1571                                 }
1572                                 /* here all iter[i] == 0 */
1573                                 ++ii;
1574                                 if (ii == k) {
1575                                         ii = 0;
1576                                         ++bound;
1577                                         if (bound == step->asso_value_max) {
1578                                                 /*
1579                                                  * Out of search space!  We can either backtrack, or
1580                                                  * increase the available search space of this step.
1581                                                  * It seems simpler to choose the latter solution.
1582                                                  */
1583                                                 step->asso_value_max = 2 * step->asso_value_max;
1584                                                 if (step->asso_value_max > t->asso_value_max) {
1585                                                         t->asso_value_max = step->asso_value_max;
1586                                                         /* reinitialize _max_hash_value */
1587                                                         t->max_hash_value = (t->hash_includes_len ? t->max_key_len : 0)
1588                                                                         + (t->asso_value_max - 1) * t->max_selchars_length;
1589                                                         /* reinitialize _collision_detector */
1590                                                         free(t->collision_detector);
1591                                                         t->collision_detector = ba_new(
1592                                                                         t->max_hash_value + 1);
1593                                                 }
1594                                         }
1595                                 }
1596                                 {
1597                                         u32 c;
1598
1599                                         c = step->changing[ii];
1600                                         iter[ii] = bound;
1601                                         t->asso_values[c] = (t->asso_values[c] + bound * t->jump)
1602                                                                 & (step->asso_value_max - 1);
1603                                 }
1604                                 found_next: ;
1605                         } else {
1606                                 /* random */
1607                                 u32 c;
1608
1609                                 c = step->changing[ii];
1610                                 t->asso_values[c] = (t->asso_values[c] + rand())
1611                                                                 & (step->asso_value_max - 1);
1612                                 /* next time, change the next c */
1613                                 ++ii;
1614                                 if (ii == k)
1615                                         ii = 0;
1616                         }
1617                 }
1618                 free(iter);
1619                 if (OPTS(DEBUG)) {
1620                         u32 i;
1621
1622                         fprintf(stderr, "Step %u chose _asso_values[", stepno);
1623                         i = 0;
1624                         loop {
1625                                 if (i >= step->changing_count)
1626                                         break;
1627                                 if (i > 0)
1628                                         fprintf(stderr, ",");
1629                                 fprintf(stderr, "'%c'", step->changing[i]);
1630                                 ++i;
1631                         }
1632                         fprintf (stderr, "] in %u iterations.\n", iterations);
1633                 }
1634                 step = step->next;
1635         }
1636         /* free allocated memory */
1637         loop {
1638                 struct Step *step;
1639
1640                 if (steps == 0)
1641                         break;
1642                 step = steps;
1643                 steps = step->next;
1644                 free(step->changing);
1645                 free(step->undetermined);
1646                 delete_partition(step->partition);
1647                 free(step);
1648         }
1649 }/*}}}*/
1650 /*{{{ chr_count_possible_collisions */
1651 /*
1652  * compute the possible number of collisions when asso_values[c] is chosen, leading to the given
1653  * partition
1654  */
1655 static u32 schr_count_possible_collisions(struct Search *t,
1656                                                         struct EquivalenceClass *partition, u32 c)
1657 {
1658         /*
1659          * Every equivalence class p is split according to the frequency of occurrence of c, leading
1660          * to equivalence classes p1, p2, ...
1661          * This leads to   (|p|^2 - |p1|^2 - |p2|^2 - ...)/2  possible collisions.
1662          * Return the sum of this expression over all equivalence classes.
1663          */
1664         u32 sum;
1665         u32 m;
1666         u32 *split_cardinalities;
1667         struct EquivalenceClass *cls;
1668
1669         sum = 0;
1670         m = t->max_selchars_length;
1671         split_cardinalities = calloc(m + 1, sizeof(*split_cardinalities));
1672         cls = partition;
1673         loop {
1674                 u32 i;
1675                 struct Keyword_List *tmp;
1676
1677                 if (cls == 0)
1678                         break;
1679                 memset(split_cardinalities, 0, (m + 1) * sizeof(*split_cardinalities));
1680                 tmp = cls->keywords;
1681                 loop {
1682                         struct Keyword *kw;
1683                         u32 count;
1684                         s32 i;
1685
1686                         if (tmp == 0)
1687                                 break;
1688                         kw = tmp->kw;
1689                         count = 0;
1690                         i = 0;
1691                         loop {
1692                                 if (i >= kw->selchars_length)
1693                                         break;
1694                                 if (kw->selchars[i] == c)
1695                                         ++count;
1696                                 ++i;
1697                         }
1698                         ++(split_cardinalities[count]);
1699                         tmp = tmp->next;
1700                 }
1701                 sum += cls->cardinality * cls->cardinality;
1702                 i = 0;
1703                 loop {
1704                         if (i > m)
1705                                 break;
1706                         sum -= split_cardinalities[i] * split_cardinalities[i];
1707                         ++i;
1708                 }
1709                 cls = cls->next;
1710         }
1711         free(split_cardinalities);
1712         return sum;
1713 }
1714 /*}}}*/
1715 /*{{{ schr_compute_partition */
1716 static struct EquivalenceClass *schr_compute_partition(struct Search *t, bool *undetermined)
1717 {
1718         struct EquivalenceClass *partition;
1719         struct EquivalenceClass *partition_last;
1720         struct Keyword_List *tmp;
1721         struct EquivalenceClass *cls;
1722
1723         partition = 0;
1724         partition_last = 0;
1725         tmp = t->head;
1726         loop {
1727                 struct Keyword *kw;
1728                 u32 *undetermined_chars;
1729                 u32 undetermined_chars_length;
1730                 s32 i;
1731                 struct EquivalenceClass *equclass;
1732                 struct Keyword_List *cons;
1733
1734                 if (tmp == 0)
1735                         break;
1736                 kw = tmp->kw;
1737                 undetermined_chars = calloc(kw->selchars_length, sizeof(*undetermined_chars));
1738                 undetermined_chars_length = 0;
1739                 i = 0;
1740                 loop {
1741                         if (i >= kw->selchars_length)
1742                                 break;
1743                         if (undetermined[kw->selchars[i]] != 0) {
1744                                 undetermined_chars[undetermined_chars_length] = kw->selchars[i];
1745                                 ++undetermined_chars_length;
1746                         }
1747                         ++i;
1748                 }
1749                 /* look up the equivalence class to which this keyword belongs */
1750                 equclass = partition;
1751                 loop {
1752                         if (equclass == 0)
1753                                 break;
1754                         if (equclass->undetermined_chars_length == undetermined_chars_length
1755                                         && equals(equclass->undetermined_chars, undetermined_chars,
1756                                                                         undetermined_chars_length))
1757                                 break;
1758                         equclass = equclass->next;
1759                 }
1760                 if (equclass == 0) {
1761                         equclass = calloc(1, sizeof(*equclass));
1762                         equclass->undetermined_chars = undetermined_chars;
1763                         equclass->undetermined_chars_length = undetermined_chars_length;
1764                         if (partition != 0)
1765                                 partition_last->next = equclass;
1766                         else
1767                                 partition = equclass;
1768                         partition_last = equclass;
1769                 } else
1770                         free(undetermined_chars);
1771                 /* add the keyword to the equivalence class */
1772                 cons = kwl_new(kw);
1773                 if (equclass->keywords != 0)
1774                         equclass->keywords_last->next = cons;
1775                 else
1776                         equclass->keywords = cons;
1777                 equclass->keywords_last = cons;
1778                 ++(equclass->cardinality);
1779                 tmp = tmp->next;
1780         }
1781         /* Free some of the allocated memory. The caller doesn't need it. */
1782         cls = partition;
1783         loop {
1784                 if (cls == 0)
1785                         break;
1786                 free(cls->undetermined_chars);
1787                 cls = cls->next;
1788         }
1789         return partition;
1790 }/*}}}*/
1791 /*{{{ schr_prepare_asso_values */
1792 /* initializes the asso_values[] related parameters */
1793 static void schr_prepare_asso_values(struct Search *t)
1794 {
1795         struct Keyword_List *tmp;
1796         struct PositionIterator *iter;
1797         s32 non_linked_length;
1798         u32 asso_value_max;
1799
1800         /* initialize each keyword's _selchars array */
1801         schr_init_selchars_multiset(t, t->key_positions, t->alpha_unify, t->alpha_inc);
1802         /* compute the maximum _selchars_length over all keywords */
1803         iter = pos_iterator(t->key_positions, t->max_key_len);
1804         t->max_selchars_length = positer_remaining(iter);
1805         positer_del(iter);
1806         /*
1807          * Check for duplicates, i.e. keywords with the same _selchars array (and - if
1808          * _hash_includes_len - also the same length).  We deal with these by building an
1809          * equivalence class, so that only 1 keyword is representative of the entire collection.
1810          * Only this representative remains in the keyword list; the others are accessible through
1811          * the _duplicate_link chain, starting at the representative.  This *greatly* simplifies
1812          * processing during later stages of the program.  Set _total_duplicates and _list_len =
1813          * _total_keys - _total_duplicates.
1814          */
1815         {
1816                 struct Hash_Table *representatives;
1817                 struct Keyword_List *prev;
1818
1819                 t->list_len = t->total_keys;
1820                 t->total_duplicates = 0;
1821
1822                 representatives = ht_new(t->list_len, !t->hash_includes_len);
1823                 prev = 0; /* list node before temp */
1824                 tmp = t->head;
1825                 loop {
1826                         struct Keyword *kw;
1827                         struct Keyword *other_kw;
1828                         struct Keyword_List *garbage;
1829
1830                         if (tmp == 0)
1831                                 break;
1832                         kw = tmp->kw;
1833                         other_kw = ht_insert(representatives, kw);
1834                         garbage = 0;
1835                         if (other_kw != 0) {
1836                                 ++(t->total_duplicates);
1837                                 (t->list_len)--;
1838                                 /* remove keyword from the main list */
1839                                 prev->next = tmp->next;
1840                                 garbage = tmp;
1841                                 /* and insert it on other_keyword's duplicate list */
1842                                 kw->duplicate_link = other_kw->duplicate_link;
1843                                 other_kw->duplicate_link = kw;
1844                                 /* complain if user hasn't enabled the duplicate option */
1845                                 if (!OPTS(DUP) || OPTS(DEBUG)) {
1846                                         s32 j;
1847
1848                                         fprintf(stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"", kw->allchars_length, kw->allchars, other_kw->allchars_length, other_kw->allchars);
1849                                         j = 0;
1850                                         loop {
1851                                                 if (j >= kw->selchars_length)
1852                                                         break;
1853                                                 putc(kw->selchars[j], stderr);
1854                                                 ++j;
1855                                         }
1856                                         fprintf(stderr, "\".\n");
1857                                 }
1858                         } else {
1859                                 kw->duplicate_link = 0;
1860                                 prev = tmp;
1861                         }
1862                         tmp = tmp->next;
1863                         if (garbage != 0)
1864                                 kwl_del(garbage);
1865                 }
1866                 if (OPTS(DEBUG))
1867                         ht_dump(representatives);
1868                 ht_del(representatives);
1869         }
1870         /*
1871          * Exit program if duplicates exists and option[DUP] not set, since we don't want to
1872          * continue in this case. (We don't want to turn on option[DUP] implicitly, because the
1873          * generated code is usually much slower.
1874          */
1875         if (t->total_duplicates != 0) {
1876                 if (OPTS(DUP))
1877                         fprintf(stderr, "%d input keys have identical hash values, examine output carefully...\n", t->total_duplicates);
1878                 else {
1879                         fprintf(stderr, "%d input keys have identical hash values,\n", t->total_duplicates);
1880                         if (OPTS(POSITIONS))
1881                                 fprintf(stderr, "try different key positions or use option -D.\n");
1882                         else
1883                                 fprintf(stderr, "use option -D.\n");
1884                         exit(1);
1885                 }
1886         }
1887         /* compute the occurrences of each character in the alphabet */
1888         t->occurrences = calloc(t->alpha_size, sizeof(*(t->occurrences)));
1889         tmp = t->head;
1890         loop {
1891                 struct Keyword *kw;
1892                 u32 *ptr;
1893                 s32 count;
1894
1895                 if (tmp == 0)
1896                         break;
1897                 kw = tmp->kw;
1898                 ptr = kw->selchars;
1899                 count = kw->selchars_length;
1900                 loop {
1901                         if (count <= 0)
1902                                 break;
1903                         ++(t->occurrences[*ptr]);
1904                         ++ptr;
1905                         count--;
1906                 }
1907                 tmp = tmp->next;
1908         }
1909         /* memory allocation */
1910         t->asso_values = calloc(t->alpha_size, sizeof(*(t->asso_values)));
1911         non_linked_length = t->list_len;
1912         asso_value_max = (u32)(non_linked_length * options->size_multiple);
1913         /*
1914          * Round up to the next power of two. This makes it easy to ensure an _asso_value[c] is >= 0
1915          * and < asso_value_max. Also, the jump value being odd, it guarantees that
1916          * Search::try_asso_value() will iterate through different values for _asso_value[c].
1917          */
1918         if (asso_value_max == 0)
1919                 asso_value_max = 1;
1920         asso_value_max |= asso_value_max >> 1;
1921         asso_value_max |= asso_value_max >> 2;
1922         asso_value_max |= asso_value_max >> 4;
1923         asso_value_max |= asso_value_max >> 8;
1924         asso_value_max |= asso_value_max >> 16;
1925         ++asso_value_max;
1926         t->asso_value_max = asso_value_max;
1927         /*
1928          * given the bound for _asso_values[c], we have a bound for the possible hash values, as
1929          * computed in compute_hash()
1930          */
1931         t->max_hash_value = (t->hash_includes_len ? t->max_key_len : 0) + (t->asso_value_max - 1)
1932                                                                         * t->max_selchars_length;
1933         /* allocate a sparse bit vector for detection of collisions of hash values */
1934         t->collision_detector = ba_new(t->max_hash_value + 1);
1935         if (OPTS(DEBUG)) {
1936                 s32 field_width;
1937                 s32 i;
1938
1939                 fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d\nmaximum size of generated hash table is %d\n", non_linked_length, asso_value_max, t->max_hash_value);
1940                 field_width = 0;
1941                 {
1942                         struct Keyword_List *tmp;
1943
1944                         tmp = t->head;
1945                         loop {
1946                                 struct Keyword *kw;
1947
1948                                 if (tmp == 0)
1949                                         break;
1950                                 kw = tmp->kw;
1951                                 if (field_width < kw->selchars_length)
1952                                         field_width = kw->selchars_length;
1953                                 tmp = tmp->next;
1954                         }
1955                 }
1956                 fprintf (stderr, "\ndumping the keyword list without duplicates\n");
1957                 fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
1958                 i = 0;
1959                 tmp = t->head;
1960                 loop {
1961                         struct Keyword *kw;
1962                         s32 j;
1963
1964                         if (tmp == 0)
1965                                 break;
1966                         kw = tmp->kw;
1967                         ++i;
1968                         fprintf(stderr, "%9d, ", i);
1969                         if (field_width > kw->selchars_length)
1970                                 fprintf(stderr, "%*s", field_width - kw->selchars_length, "");
1971                         j = 0;
1972                         loop {
1973                                 if (j >= kw->selchars_length)
1974                                         break;
1975                                 putc(kw->selchars[j], stderr);
1976                                 ++j;
1977                         }
1978                         fprintf(stderr, ", %.*s\n", kw->allchars_length, kw->allchars);
1979                         tmp = tmp->next;
1980                 }
1981                 fprintf(stderr, "\nend of keyword list\n\n");
1982         }
1983         if (OPTS(RANDOM) || options->jump == 0)
1984                 srand((long)time(0));
1985         t->initial_asso_value = (OPTS(RANDOM) ?  -1 : options->initial_asso_value);
1986         t->jump = options->jump;
1987 }/*}}}*/
1988 /*{{{ schr_unchanged_partition */
1989 /* Test whether adding c to the undetermined characters changes the given partition. */
1990 static bool schr_unchanged_partition(struct Search *t, struct EquivalenceClass *partition, u32 c)
1991 {
1992         struct EquivalenceClass *cls;
1993
1994         cls = partition;
1995         loop {
1996                 u32 first_count;
1997                 struct Keyword_List *tmp;
1998
1999                 if (cls == 0)
2000                         break;
2001                 first_count = U32_MAX;
2002                 tmp = cls->keywords;
2003                 loop {
2004                         struct Keyword *kw;
2005                         u32 count;
2006                         s32 i;
2007
2008                         if (tmp == 0)
2009                                 break;
2010                         kw = tmp->kw;
2011                         count = 0;
2012                         i = 0;
2013                         loop {
2014                                 if (i >= kw->selchars_length)
2015                                         break;
2016                                 if (kw->selchars[i] == c)
2017                                         ++count;
2018                                 ++i;
2019                         }
2020                         if (tmp == cls->keywords)
2021                                 first_count = count;
2022                         else if (count != first_count)
2023                                 /* c would split this equivalence class */
2024                                 return false;
2025                         tmp = tmp->next;
2026                 }
2027                 cls = cls->next;
2028         }
2029         return true;
2030 }
2031 /*}}}*/
2032 /*{{{ schr_compute_hash */
2033 /*
2034  * computes a keyword's hash value, relative to the current _asso_values[], and stores it in
2035  * keyword->_hash_value
2036  */
2037 static s32 schr_compute_hash(struct Search *t, struct Keyword *kw)
2038 {
2039         s32 sum;
2040         u32 *p;
2041         s32 i;
2042
2043         sum = t->hash_includes_len ? kw->allchars_length : 0;
2044         p = kw->selchars;
2045         i = kw->selchars_length;
2046         loop {
2047                 if (i <= 0)
2048                         break;
2049                 sum += t->asso_values[*p];
2050                 ++p;
2051                 i--;
2052         }
2053         kw->hash_value = sum;
2054         return sum;
2055 }/*}}}*/
2056 /*{{{ schr_sort */
2057 static void schr_sort(struct Search *t)
2058 {
2059         t->head = mergesort_list(t->head, less_by_hash_value);
2060 }
2061 /*}}}*/
2062 /*------------------------------------------------------------------------------------------------*/
2063 #define EPILOG
2064 #include "namespace/globals.h"
2065 #include "namespace/search.h"
2066 #include "namespace/keyword.h"
2067 #include "namespace/keyword_list.h"
2068 #include "namespace/options.h"
2069 #include "namespace/positions.h"
2070 #include "namespace/hash-table.h"
2071 #include "namespace/bool-array.h"
2072 #include "namespace/search.c"
2073 #undef EPILOG
2074 /*------------------------------------------------------------------------------------------------*/
2075 #endif