Add tests with filenames containing newline and backslash characters.
[coreutils.git] / lib / hash.c
blob5899f57c3d552bd6dec65533bba016a1307e3437
1 /* hash - hashing table processing.
2 Copyright (C) 1998 Free Software Foundation, Inc.
3 Written by Jim Meyering, 1992.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 /* A generic hash table package. */
21 /* Define USE_OBSTACK to 1 if you want the allocator to use obstacks instead
22 of malloc. If you change USE_OBSTACK, you have to recompile! */
24 #if HAVE_CONFIG_H
25 # include <config.h>
26 #endif
27 #if HAVE_STDLIB_H
28 # include <stdlib.h>
29 #endif
30 #if HAVE_STDBOOL_H
31 # include <stdbool.h>
32 #else
33 typedef enum {false = 0, true = 1} bool;
34 #endif
35 #include <stdio.h>
36 #include <assert.h>
38 #if !HAVE_DECL_FREE
39 void free ();
40 #endif
41 #if !HAVE_DECL_MALLOC
42 char *malloc ();
43 #endif
45 #if USE_OBSTACK
46 # include "obstack.h"
47 # ifndef obstack_chunk_alloc
48 # define obstack_chunk_alloc malloc
49 # endif
50 # ifndef obstack_chunk_free
51 # define obstack_chunk_free free
52 # endif
53 #endif
55 #include "hash.h"
57 /* An hash table contains many internal entries, each holding a pointer to
58 some user provided data (also called a user entry). An entry indistinctly
59 refers to both the internal entry and its associated user entry. A user
60 entry contents may be hashed by a randomisation function (the hashing
61 function, or just `hasher' for short) into a number (or `slot') between 0
62 and the current table size. At each slot position in the hash table,
63 starts a linked chain of entries for which the user data all hash to this
64 slot. A bucket is the collection of all entries hashing to the same slot.
66 A good `hasher' function will distribute entries rather evenly in buckets.
67 In the ideal case, the length of each bucket is roughly the number of
68 entries divided by the table size. Finding the slot for a data is usually
69 done at constant speed by the `hasher', and the later finding of a precise
70 entry is linear in time with the size of the bucket. Consequently, a
71 bigger hash table size (that is, a bigger number of buckets) is prone to
72 yielding shorter buckets, *given* the `hasher' function behaves properly.
74 Long buckets slow down the lookup algorithm. One might use big hash table
75 sizes in hope to reduce the average length of buckets, but this might
76 become inordinate, as unused slots in the hash table take some space. The
77 best bet is to make sure you are using a good `hasher' function (beware
78 that those are not that easy to write! :-), and to use a table size at
79 least bigger than the actual number of entries.
81 Currently, whenever the addition of an entry gets 80% of buckets to be
82 non-empty, this package automatically doubles the number of buckets. */
84 /* Information and lookup. */
86 /* The following few functions provide information about the overall hash
87 table organisation: the number of entries, number of buckets and maximum
88 length of buckets. */
90 /* Return the number of buckets in the hash table. The table size, the total
91 number of buckets (used plus unused), or the maximum number of slots, are
92 the same quantity. */
94 unsigned int
95 hash_get_n_buckets (const Hash_table *table)
97 return table->n_buckets;
100 /* Return the number of slots in use (non-empty buckets). */
102 unsigned int
103 hash_get_n_buckets_used (const Hash_table *table)
105 return table->n_buckets_used;
108 /* Return the number of active entries. */
110 unsigned int
111 hash_get_n_entries (const Hash_table *table)
113 return table->n_entries;
116 /* Return the length of the most lenghty chain (bucket). */
118 unsigned int
119 hash_get_max_bucket_length (const Hash_table *table)
121 struct hash_entry *bucket;
122 unsigned int max_bucket_length = 0;
124 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
126 if (bucket->data)
128 struct hash_entry *cursor = bucket;
129 unsigned int bucket_length = 1;
131 while (cursor = cursor->next, cursor)
132 bucket_length++;
134 if (bucket_length > max_bucket_length)
135 max_bucket_length = bucket_length;
139 return max_bucket_length;
142 /* Do a mild validation of an hash table, by traversing it and checking two
143 statistics. */
145 bool
146 hash_table_ok (const Hash_table *table)
148 struct hash_entry *bucket;
149 unsigned int n_buckets_used = 0;
150 unsigned int n_entries = 0;
152 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
154 if (bucket->data)
156 struct hash_entry *cursor = bucket;
158 /* Count bucket head. */
159 n_buckets_used++;
160 n_entries++;
162 /* Count bucket overflow. */
163 while (cursor = cursor->next, cursor)
164 n_entries++;
168 if (n_buckets_used == table->n_buckets_used && n_entries == table->n_entries)
169 return true;
171 return false;
174 void
175 hash_print_statistics (const Hash_table *table, FILE *stream)
177 unsigned int n_entries = hash_get_n_entries (table);
178 unsigned int n_buckets = hash_get_n_buckets (table);
179 unsigned int n_buckets_used = hash_get_n_buckets_used (table);
180 unsigned int max_bucket_length = hash_get_max_bucket_length (table);
182 fprintf (stream, "# entries: %u\n", n_entries);
183 fprintf (stream, "# buckets: %u\n", n_buckets);
184 fprintf (stream, "# buckets used: %u (%.2f%%)\n", n_buckets_used,
185 (100.0 * n_buckets_used) / n_buckets);
186 fprintf (stream, "max bucket length: %u\n", max_bucket_length);
189 /* Return the user entry from the hash table, if some entry in the hash table
190 compares equally with ENTRY, or NULL otherwise. */
192 void *
193 hash_lookup (const Hash_table *table, const void *entry)
195 struct hash_entry *bucket
196 = table->bucket + table->hasher (entry, table->n_buckets);
197 struct hash_entry *cursor;
199 assert (bucket < table->bucket_limit);
201 if (bucket->data == NULL)
202 return NULL;
204 for (cursor = bucket; cursor; cursor = cursor->next)
205 if (table->comparator (entry, cursor->data))
206 return cursor->data;
208 return NULL;
211 /* Walking. */
213 /* The functions in this page traverse the hash table and process the
214 contained entries. For the traversal to work properly, the hash table
215 should not be resized nor modified while any particular entry is being
216 processed. In particular, entries should not be added or removed. */
218 /* Return the first data in the table, or NULL if the table is empty. */
220 void *
221 hash_get_first (const Hash_table *table)
223 struct hash_entry *bucket;
225 if (table->n_entries == 0)
226 return NULL;
228 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
229 if (bucket->data)
230 return bucket->data;
232 abort ();
235 /* Return the user data for the entry following ENTRY, where ENTRY has been
236 returned by a previous call to either `hash_get_first' or `hash_get_next'.
237 Return NULL if there is no more entries. */
239 void *
240 hash_get_next (const Hash_table *table, const void *entry)
242 struct hash_entry *bucket
243 = table->bucket + table->hasher (entry, table->n_buckets);
244 struct hash_entry *cursor;
246 assert (bucket < table->bucket_limit);
248 /* Find next entry in the same bucket. */
249 for (cursor = bucket; cursor; cursor = cursor->next)
250 if (cursor->data == entry && cursor->next)
251 return cursor->next->data;
253 /* Find first entry in any subsequent bucket. */
254 for (; bucket < table->bucket_limit; bucket++)
255 if (bucket->data)
256 return bucket->data;
258 /* None found. */
259 return NULL;
262 /* Fill BUFFER with pointers to active user entries in the hash table, then
263 return the number of pointers copied. Do not copy more than BUFFER_SIZE
264 pointers. */
266 unsigned int
267 hash_get_entries (const Hash_table *table, void **buffer,
268 unsigned int buffer_size)
270 unsigned int counter = 0;
271 struct hash_entry *bucket;
272 struct hash_entry *cursor;
274 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
276 if (bucket->data)
278 for (cursor = bucket; cursor; cursor = cursor->next)
280 if (counter >= buffer_size)
281 return counter;
282 buffer[counter++] = cursor->data;
287 return counter;
290 /* Call a PROCESSOR function for each entry of an hash table, and return the
291 number of entries for which the processor function returned success. A
292 pointer to some PROCESSOR_DATA which will be made available to each call to
293 the processor function. The PROCESSOR accepts two arguments: the first is
294 the user entry being walked into, the second is the value of PROCESSOR_DATA
295 as received. The walking continue for as long as the PROCESSOR function
296 returns nonzero. When it returns zero, the walking is interrupted. */
298 unsigned int
299 hash_do_for_each (const Hash_table *table, Hash_processor processor,
300 void *processor_data)
302 unsigned int counter = 0;
303 struct hash_entry *bucket;
304 struct hash_entry *cursor;
306 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
308 if (bucket->data)
310 for (cursor = bucket; cursor; cursor = cursor->next)
312 if (!(*processor) (cursor->data, processor_data))
313 return counter;
314 counter++;
319 return counter;
322 /* Allocation and clean-up. */
324 /* Return a hash index for a NUL-terminated STRING between 0 and N_BUCKETS-1.
325 This is a convenience routine for constructing other hashing functions. */
327 #if USE_DIFF_HASH
329 /* About hashings, Paul Eggert writes to me (FP), on 1994-01-01: "Please see
330 B. J. McKenzie, R. Harries & T. Bell, Selecting a hashing algorithm,
331 Software--practice & experience 20, 2 (Feb 1990), 209-224. Good hash
332 algorithms tend to be domain-specific, so what's good for [diffutils'] io.c
333 may not be good for your application." */
335 unsigned int
336 hash_string (const char *string, unsigned int n_buckets)
338 # ifndef CHAR_BIT
339 # define CHAR_BIT 8
340 # endif
341 # define ROTATE_LEFT(Value, Shift) \
342 ((Value) << (Shift) | (Value) >> ((sizeof (unsigned) * CHAR_BIT) - (Shift)))
343 # define HASH_ONE_CHAR(Value, Byte) \
344 ((Byte) + ROTATE_LEFT (Value, 7))
346 unsigned value = 0;
348 for (; *string; string++)
349 value = HASH_ONE_CHAR (value, *(const unsigned char *) string);
350 return value % n_buckets;
352 # undef ROTATE_LEFT
353 # undef HASH_ONE_CHAR
356 #else /* not USE_DIFF_HASH */
358 /* This one comes from `recode', and performs a bit better than the above as
359 per a few experiments. It is inspired from a hashing routine found in the
360 very old Cyber `snoop', itself written in typical Greg Mansfield style.
361 (By the way, what happened to this excellent man? Is he still alive?) */
363 unsigned int
364 hash_string (const char *string, unsigned int n_buckets)
366 unsigned value = 0;
368 while (*string)
369 value = ((value * 31 + (int) *(const unsigned char *) string++)
370 % n_buckets);
371 return value;
374 #endif /* not USE_DIFF_HASH */
376 /* Return true if CANDIDATE is a prime number. CANDIDATE should be an odd
377 number at least equal to 11. */
379 static int
380 is_prime (unsigned long candidate)
382 unsigned long divisor = 3;
383 unsigned long square = divisor * divisor;
385 while (square < candidate && (candidate % divisor))
387 divisor++;
388 square += 4 * divisor;
389 divisor++;
392 return candidate % divisor != 0;
395 /* Round a given CANDIDATE number up to the nearest prime, and return that
396 prime. CANDIDATE should be at least equal to 10. */
398 static unsigned long
399 next_prime (unsigned long candidate)
401 assert (candidate >= 10);
403 /* Make it definitely odd. */
404 candidate |= 1;
406 while (!is_prime (candidate))
407 candidate += 2;
409 return candidate;
412 /* Allocate and return a new hash table, or NULL if an error is met. The
413 initial number of buckets would be at least CANDIDATE (which need not be
414 prime).
416 If DATA_FREER is not NULL, this function may be later called with the data
417 as an argument, just before they entry containing the data gets freed. The
418 HASHER function should be supplied, and FIXME. The COMPARATOR function
419 should also be supplied, and FIXME. */
421 /* User-supplied function for freeing datas. It is specified in
422 hash_initialize. If non-null, it is used by hash_free and hash_clear.
423 You should specify `free' here only if you want these functions to free
424 all of your `data' data. This is typically the case when your data is
425 simply an auxilliary struct that you have malloc'd to aggregate several
426 values. */
428 /* User-supplied hash function that hashes entry ENTRY to an integer in
429 the range 0..TABLE_SIZE-1. */
431 /* User-supplied function that determines whether a new entry is unique by
432 comparing the new entry to entries that hashed to the same bucket
433 index. It should return zero for a pair of entries that compare equal,
434 non-zero otherwise. */
436 Hash_table *
437 hash_initialize (unsigned int candidate, Hash_hasher hasher,
438 Hash_comparator comparator, Hash_data_freer data_freer)
440 Hash_table *table;
441 struct hash_entry *bucket;
443 if (hasher == NULL || comparator == NULL)
444 return NULL;
446 table = (Hash_table *) malloc (sizeof (Hash_table));
447 if (table == NULL)
448 return NULL;
450 table->n_buckets = next_prime (candidate < 10 ? 10 : candidate);
451 table->bucket = (struct hash_entry *)
452 malloc (table->n_buckets * sizeof (struct hash_entry));
453 if (table->bucket == NULL)
455 free (table);
456 return NULL;
458 table->bucket_limit = table->bucket + table->n_buckets;
460 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
462 bucket->data = NULL;
463 bucket->next = NULL;
465 table->n_buckets_used = 0;
466 table->n_entries = 0;
468 table->hasher = hasher;
469 table->comparator = comparator;
470 table->data_freer = data_freer;
472 table->free_entry_list = NULL;
473 #if USE_OBSTACK
474 obstack_init (&table->entry_stack);
475 #endif
476 return table;
479 /* Make all buckets empty, placing any chained entries on the free list.
480 Apply the user-specified function data_freer (if any) to the datas of any
481 affected entries. */
483 void
484 hash_clear (Hash_table *table)
486 struct hash_entry *bucket;
487 struct hash_entry *cursor;
489 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
491 if (bucket->data)
493 /* Free the bucket overflow. */
494 for (cursor = bucket->next; cursor; cursor = cursor->next)
496 if (table->data_freer)
497 (*table->data_freer) (cursor->data);
498 cursor->data = NULL;
500 /* Relinking is done one entry at a time, as it is to be expected
501 that overflows are either rare or short. */
502 cursor->next = table->free_entry_list;
503 table->free_entry_list = cursor;
506 /* Free the bucket head. */
507 if (table->data_freer)
508 (*table->data_freer) (bucket->data);
509 bucket->data = NULL;
510 bucket->next = NULL;
514 table->n_buckets_used = 0;
515 table->n_entries = 0;
518 /* Reclaim all storage associated with an hash table. If a data_freer
519 function has been supplied by the user when the hash table was created,
520 this function applies it to the data of each entry before freeing that
521 entry. */
523 void
524 hash_free (Hash_table *table)
526 struct hash_entry *bucket;
527 struct hash_entry *cursor;
528 struct hash_entry *next;
530 /* Call the user data_freer function. */
531 if (table->data_freer && table->n_entries)
533 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
535 if (bucket->data)
537 for (cursor = bucket; cursor; cursor = cursor->next)
539 (*table->data_freer) (cursor->data);
545 #if USE_OBSTACK
547 obstack_free (&table->entry_stack, NULL);
549 #else
551 /* Free all bucket overflowed entries. */
552 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
554 for (cursor = bucket->next; cursor; cursor = next)
556 next = cursor->next;
557 free (cursor);
561 /* Also reclaim the internal list of previously freed entries. */
562 for (cursor = table->free_entry_list; cursor; cursor = next)
564 next = cursor->next;
565 free (cursor);
568 #endif
570 /* Free the remainder of the hash table structure. */
571 free (table->bucket);
572 free (table);
575 /* Insertion and deletion. */
577 /* Get a new hash entry for a bucket overflow, possibly by reclying a
578 previously freed one. If this is not possible, allocate a new one. */
580 static struct hash_entry *
581 allocate_entry (Hash_table *table)
583 struct hash_entry *new;
585 if (table->free_entry_list)
587 new = table->free_entry_list;
588 table->free_entry_list = new->next;
590 else
592 #if USE_OBSTACK
593 new = (struct hash_entry *)
594 obstack_alloc (&table->entry_stack, sizeof (struct hash_entry));
595 #else
596 new = (struct hash_entry *) malloc (sizeof (struct hash_entry));
597 #endif
600 return new;
603 /* Free a hash entry which was part of some bucket overflow,
604 saving it for later recycling. */
606 static void
607 free_entry (Hash_table *table, struct hash_entry *entry)
609 entry->data = NULL;
610 entry->next = table->free_entry_list;
611 table->free_entry_list = entry;
614 /* This private function is used to help with insertion and deletion. When
615 ENTRY matches an entry in the table, return a pointer to the corresponding
616 user data and set *BUCKET_HEAD to the head of the selected bucket.
617 Otherwise, return NULL. When DELETE is true and ENTRY matches an entry in
618 the table, unlink the matching entry. */
620 static void *
621 hash_find_entry (Hash_table *table, const void *entry,
622 struct hash_entry **bucket_head, bool delete)
624 struct hash_entry *bucket
625 = table->bucket + table->hasher (entry, table->n_buckets);
626 struct hash_entry *cursor;
628 assert (bucket < table->bucket_limit);
629 *bucket_head = bucket;
631 /* Test for empty bucket. */
632 if (bucket->data == NULL)
633 return NULL;
635 /* Check if then entry is found as the bucket head. */
636 if ((*table->comparator) (entry, bucket->data))
638 void *data = bucket->data;
640 if (delete)
642 if (bucket->next)
644 struct hash_entry *next = bucket->next;
646 /* Bump the first overflow entry into the bucket head, then save
647 the previous first overflow entry for later recycling. */
648 *bucket = *next;
649 free_entry (table, next);
651 else
653 bucket->data = NULL;
657 return data;
660 /* Scan the bucket overflow. */
661 for (cursor = bucket; cursor->next; cursor = cursor->next)
663 if ((*table->comparator) (entry, cursor->next->data))
665 void *data = cursor->next->data;
667 if (delete)
669 struct hash_entry *next = cursor->next;
671 /* Unlink the entry to delete, then save the freed entry for later
672 recycling. */
673 cursor->next = next->next;
674 free_entry (table, next);
677 return data;
681 /* No entry found. */
682 return NULL;
685 /* For an already existing hash table, change the number of buckets and make
686 it NEW_TABLE_SIZE. The contents of the hash table are preserved. */
688 bool
689 hash_rehash (Hash_table *table, unsigned int new_n_buckets)
691 Hash_table *new_table;
692 struct hash_entry *bucket;
693 struct hash_entry *cursor;
694 struct hash_entry *next;
696 if (table->n_buckets <= 0 || new_n_buckets == 0)
697 return false;
699 new_table = hash_initialize (new_n_buckets, table->hasher,
700 table->comparator, table->data_freer);
701 if (new_table == NULL)
702 return false;
704 /* Merely reuse the extra old space into the new table. */
705 #if USE_OBSTACK
706 obstack_free (&new_table->entry_stack, NULL);
707 new_table->entry_stack = table->entry_stack;
708 #endif
709 new_table->free_entry_list = table->free_entry_list;
711 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
713 if (bucket->data)
715 for (cursor = bucket; cursor; cursor = next)
717 void *data = cursor->data;
718 struct hash_entry *new_bucket
719 = new_table->bucket + new_table->hasher (data, new_n_buckets);
721 assert (new_bucket < new_table->bucket_limit);
723 /* Free overflow entries as soon as possible, moving them from the
724 old hash table into the new one, as they may be needed now. */
725 next = cursor->next;
726 if (cursor != bucket)
727 free_entry (new_table, cursor);
729 /* Insert the entry into the new hash table. */
730 if (new_bucket->data)
732 struct hash_entry *new_entry = allocate_entry (new_table);
734 if (new_entry == NULL)
735 return false;
737 new_entry->data = data;
738 new_entry->next = new_bucket->next;
739 new_bucket->next = new_entry;
741 else
743 new_bucket->data = data;
744 new_table->n_buckets_used++;
750 free (table->bucket);
751 table->bucket = new_table->bucket;
752 table->bucket_limit = new_table->bucket_limit;
753 table->n_buckets = new_table->n_buckets;
754 table->n_buckets_used = new_table->n_buckets_used;
755 /* table->n_entries already holds its value. */
756 #if USE_OBSTACK
757 table->entry_stack = new_table->entry_stack;
758 #endif
759 free (new_table);
761 return true;
764 /* If ENTRY matches an entry already in the hash table, don't modify the table
765 and return the matched entry. Otherwise, insert ENTRY and return NULL.
766 *DONE is set to true in all cases, unless the storage required for
767 insertion cannot be allocated. */
769 void *
770 hash_insert (Hash_table *table, const void *entry, bool *done)
772 void *data;
773 struct hash_entry *bucket;
775 assert (entry); /* cannot insert a NULL data */
777 if (data = hash_find_entry (table, entry, &bucket, false), data)
779 *done = true;
780 return data;
783 /* ENTRY is not matched, it should be inserted. */
785 table->n_entries++;
787 if (bucket->data)
789 struct hash_entry *new_entry = allocate_entry (table);
791 if (new_entry == NULL)
793 *done = false;
794 return NULL;
797 /* Add ENTRY in the overflow of the bucket. */
799 new_entry->data = (void *) entry;
800 new_entry->next = bucket->next;
801 bucket->next = new_entry;
802 *done = true;
803 return NULL;
806 /* Add ENTRY right in the bucket head. */
808 bucket->data = (void *) entry;
809 table->n_buckets_used++;
811 /* If more than 80% of the buckets are in use, rehash the table two
812 times bigger. It's no real use checking the number of entries, as if
813 the hashing function is ill-conditioned, rehashing is not likely to
814 improve it. */
816 if (5 * table->n_buckets_used > 4 * table->n_buckets)
818 unsigned int new_n_buckets = next_prime (2 * table->n_buckets + 1);
820 *done = hash_rehash (table, new_n_buckets);
821 return NULL;
824 *done = true;
825 return NULL;
828 /* If ENTRY is already in the table, remove it and return the just-deleted
829 data (the user may want to deallocate its storage). If ENTRY is not in the
830 table, don't modify the table and return NULL. */
832 void *
833 hash_delete (Hash_table *table, const void *entry)
835 void *data;
836 struct hash_entry *bucket;
838 if (data = hash_find_entry (table, entry, &bucket, true), !data)
839 return NULL;
841 if (!bucket->data)
842 table->n_buckets_used--;
843 table->n_entries--;
845 return data;
848 /* Testing. */
850 #if TESTING
852 void
853 hash_print (const Hash_table *table)
855 struct hash_entry *bucket;
857 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
859 struct hash_entry *cursor;
861 if (bucket)
862 printf ("%d:\n", slot);
864 for (cursor = bucket; cursor; cursor = cursor->next)
866 char *s = (char *) cursor->data;
867 /* FIXME */
868 printf (" %s\n", s);
873 #endif /* TESTING */