1 /* hash - hashing table processing.
2 Copyright (C) 1998 Free Software Foundation, Inc.
3 Written by Jim Meyering, 1992.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 /* A generic hash table package. */
21 /* Define USE_OBSTACK to 1 if you want the allocator to use obstacks instead
22 of malloc. If you change USE_OBSTACK, you have to recompile! */
33 typedef enum {false = 0, true = 1} bool;
47 # ifndef obstack_chunk_alloc
48 # define obstack_chunk_alloc malloc
50 # ifndef obstack_chunk_free
51 # define obstack_chunk_free free
57 /* An hash table contains many internal entries, each holding a pointer to
58 some user provided data (also called a user entry). An entry indistinctly
59 refers to both the internal entry and its associated user entry. A user
60 entry contents may be hashed by a randomisation function (the hashing
61 function, or just `hasher' for short) into a number (or `slot') between 0
62 and the current table size. At each slot position in the hash table,
63 starts a linked chain of entries for which the user data all hash to this
64 slot. A bucket is the collection of all entries hashing to the same slot.
66 A good `hasher' function will distribute entries rather evenly in buckets.
67 In the ideal case, the length of each bucket is roughly the number of
68 entries divided by the table size. Finding the slot for a data is usually
69 done at constant speed by the `hasher', and the later finding of a precise
70 entry is linear in time with the size of the bucket. Consequently, a
71 bigger hash table size (that is, a bigger number of buckets) is prone to
72 yielding shorter buckets, *given* the `hasher' function behaves properly.
74 Long buckets slow down the lookup algorithm. One might use big hash table
75 sizes in hope to reduce the average length of buckets, but this might
76 become inordinate, as unused slots in the hash table take some space. The
77 best bet is to make sure you are using a good `hasher' function (beware
78 that those are not that easy to write! :-), and to use a table size at
79 least bigger than the actual number of entries.
81 Currently, whenever the addition of an entry gets 80% of buckets to be
82 non-empty, this package automatically doubles the number of buckets. */
84 /* Information and lookup. */
86 /* The following few functions provide information about the overall hash
87 table organisation: the number of entries, number of buckets and maximum
90 /* Return the number of buckets in the hash table. The table size, the total
91 number of buckets (used plus unused), or the maximum number of slots, are
95 hash_get_n_buckets (const Hash_table
*table
)
97 return table
->n_buckets
;
100 /* Return the number of slots in use (non-empty buckets). */
103 hash_get_n_buckets_used (const Hash_table
*table
)
105 return table
->n_buckets_used
;
108 /* Return the number of active entries. */
111 hash_get_n_entries (const Hash_table
*table
)
113 return table
->n_entries
;
116 /* Return the length of the most lenghty chain (bucket). */
119 hash_get_max_bucket_length (const Hash_table
*table
)
121 struct hash_entry
*bucket
;
122 unsigned int max_bucket_length
= 0;
124 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
128 struct hash_entry
*cursor
= bucket
;
129 unsigned int bucket_length
= 1;
131 while (cursor
= cursor
->next
, cursor
)
134 if (bucket_length
> max_bucket_length
)
135 max_bucket_length
= bucket_length
;
139 return max_bucket_length
;
142 /* Do a mild validation of an hash table, by traversing it and checking two
146 hash_table_ok (const Hash_table
*table
)
148 struct hash_entry
*bucket
;
149 unsigned int n_buckets_used
= 0;
150 unsigned int n_entries
= 0;
152 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
156 struct hash_entry
*cursor
= bucket
;
158 /* Count bucket head. */
162 /* Count bucket overflow. */
163 while (cursor
= cursor
->next
, cursor
)
168 if (n_buckets_used
== table
->n_buckets_used
&& n_entries
== table
->n_entries
)
175 hash_print_statistics (const Hash_table
*table
, FILE *stream
)
177 unsigned int n_entries
= hash_get_n_entries (table
);
178 unsigned int n_buckets
= hash_get_n_buckets (table
);
179 unsigned int n_buckets_used
= hash_get_n_buckets_used (table
);
180 unsigned int max_bucket_length
= hash_get_max_bucket_length (table
);
182 fprintf (stream
, "# entries: %u\n", n_entries
);
183 fprintf (stream
, "# buckets: %u\n", n_buckets
);
184 fprintf (stream
, "# buckets used: %u (%.2f%%)\n", n_buckets_used
,
185 (100.0 * n_buckets_used
) / n_buckets
);
186 fprintf (stream
, "max bucket length: %u\n", max_bucket_length
);
189 /* Return the user entry from the hash table, if some entry in the hash table
190 compares equally with ENTRY, or NULL otherwise. */
193 hash_lookup (const Hash_table
*table
, const void *entry
)
195 struct hash_entry
*bucket
196 = table
->bucket
+ table
->hasher (entry
, table
->n_buckets
);
197 struct hash_entry
*cursor
;
199 assert (bucket
< table
->bucket_limit
);
201 if (bucket
->data
== NULL
)
204 for (cursor
= bucket
; cursor
; cursor
= cursor
->next
)
205 if (table
->comparator (entry
, cursor
->data
))
213 /* The functions in this page traverse the hash table and process the
214 contained entries. For the traversal to work properly, the hash table
215 should not be resized nor modified while any particular entry is being
216 processed. In particular, entries should not be added or removed. */
218 /* Return the first data in the table, or NULL if the table is empty. */
221 hash_get_first (const Hash_table
*table
)
223 struct hash_entry
*bucket
;
225 if (table
->n_entries
== 0)
228 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
235 /* Return the user data for the entry following ENTRY, where ENTRY has been
236 returned by a previous call to either `hash_get_first' or `hash_get_next'.
237 Return NULL if there is no more entries. */
240 hash_get_next (const Hash_table
*table
, const void *entry
)
242 struct hash_entry
*bucket
243 = table
->bucket
+ table
->hasher (entry
, table
->n_buckets
);
244 struct hash_entry
*cursor
;
246 assert (bucket
< table
->bucket_limit
);
248 /* Find next entry in the same bucket. */
249 for (cursor
= bucket
; cursor
; cursor
= cursor
->next
)
250 if (cursor
->data
== entry
&& cursor
->next
)
251 return cursor
->next
->data
;
253 /* Find first entry in any subsequent bucket. */
254 for (; bucket
< table
->bucket_limit
; bucket
++)
262 /* Fill BUFFER with pointers to active user entries in the hash table, then
263 return the number of pointers copied. Do not copy more than BUFFER_SIZE
267 hash_get_entries (const Hash_table
*table
, void **buffer
,
268 unsigned int buffer_size
)
270 unsigned int counter
= 0;
271 struct hash_entry
*bucket
;
272 struct hash_entry
*cursor
;
274 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
278 for (cursor
= bucket
; cursor
; cursor
= cursor
->next
)
280 if (counter
>= buffer_size
)
282 buffer
[counter
++] = cursor
->data
;
290 /* Call a PROCESSOR function for each entry of an hash table, and return the
291 number of entries for which the processor function returned success. A
292 pointer to some PROCESSOR_DATA which will be made available to each call to
293 the processor function. The PROCESSOR accepts two arguments: the first is
294 the user entry being walked into, the second is the value of PROCESSOR_DATA
295 as received. The walking continue for as long as the PROCESSOR function
296 returns nonzero. When it returns zero, the walking is interrupted. */
299 hash_do_for_each (const Hash_table
*table
, Hash_processor processor
,
300 void *processor_data
)
302 unsigned int counter
= 0;
303 struct hash_entry
*bucket
;
304 struct hash_entry
*cursor
;
306 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
310 for (cursor
= bucket
; cursor
; cursor
= cursor
->next
)
312 if (!(*processor
) (cursor
->data
, processor_data
))
322 /* Allocation and clean-up. */
324 /* Return a hash index for a NUL-terminated STRING between 0 and N_BUCKETS-1.
325 This is a convenience routine for constructing other hashing functions. */
329 /* About hashings, Paul Eggert writes to me (FP), on 1994-01-01: "Please see
330 B. J. McKenzie, R. Harries & T. Bell, Selecting a hashing algorithm,
331 Software--practice & experience 20, 2 (Feb 1990), 209-224. Good hash
332 algorithms tend to be domain-specific, so what's good for [diffutils'] io.c
333 may not be good for your application." */
336 hash_string (const char *string
, unsigned int n_buckets
)
341 # define ROTATE_LEFT(Value, Shift) \
342 ((Value) << (Shift) | (Value) >> ((sizeof (unsigned) * CHAR_BIT) - (Shift)))
343 # define HASH_ONE_CHAR(Value, Byte) \
344 ((Byte) + ROTATE_LEFT (Value, 7))
348 for (; *string
; string
++)
349 value
= HASH_ONE_CHAR (value
, *(const unsigned char *) string
);
350 return value
% n_buckets
;
353 # undef HASH_ONE_CHAR
356 #else /* not USE_DIFF_HASH */
358 /* This one comes from `recode', and performs a bit better than the above as
359 per a few experiments. It is inspired from a hashing routine found in the
360 very old Cyber `snoop', itself written in typical Greg Mansfield style.
361 (By the way, what happened to this excellent man? Is he still alive?) */
364 hash_string (const char *string
, unsigned int n_buckets
)
369 value
= ((value
* 31 + (int) *(const unsigned char *) string
++)
374 #endif /* not USE_DIFF_HASH */
376 /* Return true if CANDIDATE is a prime number. CANDIDATE should be an odd
377 number at least equal to 11. */
380 is_prime (unsigned long candidate
)
382 unsigned long divisor
= 3;
383 unsigned long square
= divisor
* divisor
;
385 while (square
< candidate
&& (candidate
% divisor
))
388 square
+= 4 * divisor
;
392 return candidate
% divisor
!= 0;
395 /* Round a given CANDIDATE number up to the nearest prime, and return that
396 prime. CANDIDATE should be at least equal to 10. */
399 next_prime (unsigned long candidate
)
401 assert (candidate
>= 10);
403 /* Make it definitely odd. */
406 while (!is_prime (candidate
))
412 /* Allocate and return a new hash table, or NULL if an error is met. The
413 initial number of buckets would be at least CANDIDATE (which need not be
416 If DATA_FREER is not NULL, this function may be later called with the data
417 as an argument, just before they entry containing the data gets freed. The
418 HASHER function should be supplied, and FIXME. The COMPARATOR function
419 should also be supplied, and FIXME. */
421 /* User-supplied function for freeing datas. It is specified in
422 hash_initialize. If non-null, it is used by hash_free and hash_clear.
423 You should specify `free' here only if you want these functions to free
424 all of your `data' data. This is typically the case when your data is
425 simply an auxilliary struct that you have malloc'd to aggregate several
428 /* User-supplied hash function that hashes entry ENTRY to an integer in
429 the range 0..TABLE_SIZE-1. */
431 /* User-supplied function that determines whether a new entry is unique by
432 comparing the new entry to entries that hashed to the same bucket
433 index. It should return zero for a pair of entries that compare equal,
434 non-zero otherwise. */
437 hash_initialize (unsigned int candidate
, Hash_hasher hasher
,
438 Hash_comparator comparator
, Hash_data_freer data_freer
)
441 struct hash_entry
*bucket
;
443 if (hasher
== NULL
|| comparator
== NULL
)
446 table
= (Hash_table
*) malloc (sizeof (Hash_table
));
450 table
->n_buckets
= next_prime (candidate
< 10 ? 10 : candidate
);
451 table
->bucket
= (struct hash_entry
*)
452 malloc (table
->n_buckets
* sizeof (struct hash_entry
));
453 if (table
->bucket
== NULL
)
458 table
->bucket_limit
= table
->bucket
+ table
->n_buckets
;
460 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
465 table
->n_buckets_used
= 0;
466 table
->n_entries
= 0;
468 table
->hasher
= hasher
;
469 table
->comparator
= comparator
;
470 table
->data_freer
= data_freer
;
472 table
->free_entry_list
= NULL
;
474 obstack_init (&table
->entry_stack
);
479 /* Make all buckets empty, placing any chained entries on the free list.
480 Apply the user-specified function data_freer (if any) to the datas of any
484 hash_clear (Hash_table
*table
)
486 struct hash_entry
*bucket
;
487 struct hash_entry
*cursor
;
489 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
493 /* Free the bucket overflow. */
494 for (cursor
= bucket
->next
; cursor
; cursor
= cursor
->next
)
496 if (table
->data_freer
)
497 (*table
->data_freer
) (cursor
->data
);
500 /* Relinking is done one entry at a time, as it is to be expected
501 that overflows are either rare or short. */
502 cursor
->next
= table
->free_entry_list
;
503 table
->free_entry_list
= cursor
;
506 /* Free the bucket head. */
507 if (table
->data_freer
)
508 (*table
->data_freer
) (bucket
->data
);
514 table
->n_buckets_used
= 0;
515 table
->n_entries
= 0;
518 /* Reclaim all storage associated with an hash table. If a data_freer
519 function has been supplied by the user when the hash table was created,
520 this function applies it to the data of each entry before freeing that
524 hash_free (Hash_table
*table
)
526 struct hash_entry
*bucket
;
527 struct hash_entry
*cursor
;
528 struct hash_entry
*next
;
530 /* Call the user data_freer function. */
531 if (table
->data_freer
&& table
->n_entries
)
533 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
537 for (cursor
= bucket
; cursor
; cursor
= cursor
->next
)
539 (*table
->data_freer
) (cursor
->data
);
547 obstack_free (&table
->entry_stack
, NULL
);
551 /* Free all bucket overflowed entries. */
552 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
554 for (cursor
= bucket
->next
; cursor
; cursor
= next
)
561 /* Also reclaim the internal list of previously freed entries. */
562 for (cursor
= table
->free_entry_list
; cursor
; cursor
= next
)
570 /* Free the remainder of the hash table structure. */
571 free (table
->bucket
);
575 /* Insertion and deletion. */
577 /* Get a new hash entry for a bucket overflow, possibly by reclying a
578 previously freed one. If this is not possible, allocate a new one. */
580 static struct hash_entry
*
581 allocate_entry (Hash_table
*table
)
583 struct hash_entry
*new;
585 if (table
->free_entry_list
)
587 new = table
->free_entry_list
;
588 table
->free_entry_list
= new->next
;
593 new = (struct hash_entry
*)
594 obstack_alloc (&table
->entry_stack
, sizeof (struct hash_entry
));
596 new = (struct hash_entry
*) malloc (sizeof (struct hash_entry
));
603 /* Free a hash entry which was part of some bucket overflow,
604 saving it for later recycling. */
607 free_entry (Hash_table
*table
, struct hash_entry
*entry
)
610 entry
->next
= table
->free_entry_list
;
611 table
->free_entry_list
= entry
;
614 /* This private function is used to help with insertion and deletion. When
615 ENTRY matches an entry in the table, return a pointer to the corresponding
616 user data and set *BUCKET_HEAD to the head of the selected bucket.
617 Otherwise, return NULL. When DELETE is true and ENTRY matches an entry in
618 the table, unlink the matching entry. */
621 hash_find_entry (Hash_table
*table
, const void *entry
,
622 struct hash_entry
**bucket_head
, bool delete)
624 struct hash_entry
*bucket
625 = table
->bucket
+ table
->hasher (entry
, table
->n_buckets
);
626 struct hash_entry
*cursor
;
628 assert (bucket
< table
->bucket_limit
);
629 *bucket_head
= bucket
;
631 /* Test for empty bucket. */
632 if (bucket
->data
== NULL
)
635 /* Check if then entry is found as the bucket head. */
636 if ((*table
->comparator
) (entry
, bucket
->data
))
638 void *data
= bucket
->data
;
644 struct hash_entry
*next
= bucket
->next
;
646 /* Bump the first overflow entry into the bucket head, then save
647 the previous first overflow entry for later recycling. */
649 free_entry (table
, next
);
660 /* Scan the bucket overflow. */
661 for (cursor
= bucket
; cursor
->next
; cursor
= cursor
->next
)
663 if ((*table
->comparator
) (entry
, cursor
->next
->data
))
665 void *data
= cursor
->next
->data
;
669 struct hash_entry
*next
= cursor
->next
;
671 /* Unlink the entry to delete, then save the freed entry for later
673 cursor
->next
= next
->next
;
674 free_entry (table
, next
);
681 /* No entry found. */
685 /* For an already existing hash table, change the number of buckets and make
686 it NEW_TABLE_SIZE. The contents of the hash table are preserved. */
689 hash_rehash (Hash_table
*table
, unsigned int new_n_buckets
)
691 Hash_table
*new_table
;
692 struct hash_entry
*bucket
;
693 struct hash_entry
*cursor
;
694 struct hash_entry
*next
;
696 if (table
->n_buckets
<= 0 || new_n_buckets
== 0)
699 new_table
= hash_initialize (new_n_buckets
, table
->hasher
,
700 table
->comparator
, table
->data_freer
);
701 if (new_table
== NULL
)
704 /* Merely reuse the extra old space into the new table. */
706 obstack_free (&new_table
->entry_stack
, NULL
);
707 new_table
->entry_stack
= table
->entry_stack
;
709 new_table
->free_entry_list
= table
->free_entry_list
;
711 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
715 for (cursor
= bucket
; cursor
; cursor
= next
)
717 void *data
= cursor
->data
;
718 struct hash_entry
*new_bucket
719 = new_table
->bucket
+ new_table
->hasher (data
, new_n_buckets
);
721 assert (new_bucket
< new_table
->bucket_limit
);
723 /* Free overflow entries as soon as possible, moving them from the
724 old hash table into the new one, as they may be needed now. */
726 if (cursor
!= bucket
)
727 free_entry (new_table
, cursor
);
729 /* Insert the entry into the new hash table. */
730 if (new_bucket
->data
)
732 struct hash_entry
*new_entry
= allocate_entry (new_table
);
734 if (new_entry
== NULL
)
737 new_entry
->data
= data
;
738 new_entry
->next
= new_bucket
->next
;
739 new_bucket
->next
= new_entry
;
743 new_bucket
->data
= data
;
744 new_table
->n_buckets_used
++;
750 free (table
->bucket
);
751 table
->bucket
= new_table
->bucket
;
752 table
->bucket_limit
= new_table
->bucket_limit
;
753 table
->n_buckets
= new_table
->n_buckets
;
754 table
->n_buckets_used
= new_table
->n_buckets_used
;
755 /* table->n_entries already holds its value. */
757 table
->entry_stack
= new_table
->entry_stack
;
764 /* If ENTRY matches an entry already in the hash table, don't modify the table
765 and return the matched entry. Otherwise, insert ENTRY and return NULL.
766 *DONE is set to true in all cases, unless the storage required for
767 insertion cannot be allocated. */
770 hash_insert (Hash_table
*table
, const void *entry
, bool *done
)
773 struct hash_entry
*bucket
;
775 assert (entry
); /* cannot insert a NULL data */
777 if (data
= hash_find_entry (table
, entry
, &bucket
, false), data
)
783 /* ENTRY is not matched, it should be inserted. */
789 struct hash_entry
*new_entry
= allocate_entry (table
);
791 if (new_entry
== NULL
)
797 /* Add ENTRY in the overflow of the bucket. */
799 new_entry
->data
= (void *) entry
;
800 new_entry
->next
= bucket
->next
;
801 bucket
->next
= new_entry
;
806 /* Add ENTRY right in the bucket head. */
808 bucket
->data
= (void *) entry
;
809 table
->n_buckets_used
++;
811 /* If more than 80% of the buckets are in use, rehash the table two
812 times bigger. It's no real use checking the number of entries, as if
813 the hashing function is ill-conditioned, rehashing is not likely to
816 if (5 * table
->n_buckets_used
> 4 * table
->n_buckets
)
818 unsigned int new_n_buckets
= next_prime (2 * table
->n_buckets
+ 1);
820 *done
= hash_rehash (table
, new_n_buckets
);
828 /* If ENTRY is already in the table, remove it and return the just-deleted
829 data (the user may want to deallocate its storage). If ENTRY is not in the
830 table, don't modify the table and return NULL. */
833 hash_delete (Hash_table
*table
, const void *entry
)
836 struct hash_entry
*bucket
;
838 if (data
= hash_find_entry (table
, entry
, &bucket
, true), !data
)
842 table
->n_buckets_used
--;
853 hash_print (const Hash_table
*table
)
855 struct hash_entry
*bucket
;
857 for (bucket
= table
->bucket
; bucket
< table
->bucket_limit
; bucket
++)
859 struct hash_entry
*cursor
;
862 printf ("%d:\n", slot
);
864 for (cursor
= bucket
; cursor
; cursor
= cursor
->next
)
866 char *s
= (char *) cursor
->data
;