/*-------------------------------------------------------------------------
 *
 * hashfn.c
 *		Generic hashing functions, and hash functions for use in dynahash.c
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES
 *	  It is expected that every bit of a hash function's 32-bit result is
 *	  as random as every other; failure to ensure this is likely to lead
 *	  to poor performance of hash tables.  In most cases a hash
 *	  function should use hash_bytes() or its variant hash_bytes_uint32(),
 *	  or the wrappers hash_any() and hash_uint32() defined in hashfn.h.
 *
 *-------------------------------------------------------------------------
 */
#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#include "common/hashfn.h"
#include "port/pg_bitutils.h"
/*
 * This hash function was written by Bob Jenkins
 * (bob_jenkins@burtleburtle.net), and superficially adapted
 * for PostgreSQL by Neil Conway.  For more information on this
 * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
 * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
 *
 * In the current code, we have adopted Bob's 2006 update of his hash
 * function to fetch the data a word at a time when it is suitably aligned.
 * This makes for a useful speedup, at the cost of having to maintain
 * four code paths (aligned vs unaligned, and little-endian vs big-endian).
 * It also uses two separate mixing functions mix() and final(), instead
 * of a slower multi-purpose function.
 */
/* Get a bit mask of the bits set in non-uint32 aligned addresses */
#define UINT32_ALIGN_MASK (sizeof(uint32) - 1)

#define rot(x,k) pg_rotate_left32(x, k)
/*
 * mix -- mix 3 32-bit values reversibly.
 *
 * This is reversible, so any information in (a,b,c) before mix() is
 * still in (a,b,c) after mix().
 *
 * If four pairs of (a,b,c) inputs are run through mix(), or through
 * mix() in reverse, there are at least 32 bits of the output that
 * are sometimes the same for one pair and different for another pair.
 * This was tested for:
 * * pairs that differed by one bit, by two bits, in any combination
 *	 of top bits of (a,b,c), or in any combination of bottom bits of
 *	 (a,b,c).
 * * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 *	 the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 *	 is commonly produced by subtraction) look like a single 1-bit
 *	 difference.
 * * the base values were pseudorandom, all zero but one bit set, or
 *	 all zero plus a counter that starts at zero.
 *
 * This does not achieve avalanche.  There are input bits of (a,b,c)
 * that fail to affect some output bits of (a,b,c), especially of a.  The
 * most thoroughly mixed value is c, but it doesn't really even achieve
 * avalanche in c.
 *
 * This allows some parallelism.  Read-after-writes are good at doubling
 * the number of bits affected, so the goal of mixing pulls in the opposite
 * direction from the goal of parallelism.  I did what I could.  Rotates
 * seem to cost as much as shifts on every machine I could lay my hands on,
 * and rotates are much kinder to the top and bottom bits, so I used rotates.
 */
#define mix(a,b,c) \
{ \
  a -= c;  a ^= rot(c, 4);	c += b; \
  b -= a;  b ^= rot(a, 6);	a += c; \
  c -= b;  c ^= rot(b, 8);	b += a; \
  a -= c;  a ^= rot(c,16);	c += b; \
  b -= a;  b ^= rot(a,19);	a += c; \
  c -= b;  c ^= rot(b, 4);	b += a; \
}
/*
 * final -- final mixing of 3 32-bit values (a,b,c) into c
 *
 * Pairs of (a,b,c) values differing in only a few bits will usually
 * produce values of c that look totally different.  This was tested for
 * * pairs that differed by one bit, by two bits, in any combination
 *	 of top bits of (a,b,c), or in any combination of bottom bits of
 *	 (a,b,c).
 * * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 *	 the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 *	 is commonly produced by subtraction) look like a single 1-bit
 *	 difference.
 * * the base values were pseudorandom, all zero but one bit set, or
 *	 all zero plus a counter that starts at zero.
 *
 * The use of separate functions for mix() and final() allows for a
 * substantial performance increase since final() does not need to
 * do well in reverse, but it does need to affect all output bits.
 * mix(), on the other hand, does not need to affect all output
 * bits (affecting 32 bits is enough).  The original hash function had
 * a single mixing operation that had to satisfy both sets of requirements
 * and was slower as a result.
 */
#define final(a,b,c) \
{ \
  c ^= b; c -= rot(b,14); \
  a ^= c; a -= rot(c,11); \
  b ^= a; b -= rot(a,25); \
  c ^= b; c -= rot(b,16); \
  a ^= c; a -= rot(c, 4); \
  b ^= a; b -= rot(a,14); \
  c ^= b; c -= rot(b,24); \
}
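/*
 * Illustrative sketch only (not compiled): the hash functions below all
 * follow the same pattern, consuming the key twelve bytes at a time with
 * mix() and folding the 0-11 byte tail into (a,b,c) before final():
 *
 *		while (len >= 12)
 *		{
 *			a += <next 4 key bytes>;
 *			b += <next 4 key bytes>;
 *			c += <next 4 key bytes>;
 *			mix(a, b, c);
 *			len -= 12;
 *		}
 *		<add the remaining 0-11 bytes into a, b, c>
 *		final(a, b, c);
 *		return c;
 */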
/*
 * hash_bytes() -- hash a variable-length key into a 32-bit value
 *		k		: the key (the unaligned variable-length array of bytes)
 *		len		: the length of the key, counting by bytes
 *
 * Returns a uint32 value.  Every bit of the key affects every bit of
 * the return value.  Every 1-bit and 2-bit delta achieves avalanche.
 * About 6*len+35 instructions.  The best hash table sizes are powers
 * of 2.  There is no need to do mod a prime (mod is sooo slow!).
 * If you need less than 32 bits, use a bitmask.
 *
 * This procedure must never throw elog(ERROR); the ResourceOwner code
 * relies on this not to fail.
 *
 * Note: we could easily change this function to return a 64-bit hash value
 * by using the final values of both b and c.  b is perhaps a little less
 * well mixed than c, however.
 */
uint32
hash_bytes(const unsigned char *k, int keylen)
{
	uint32		a,
				b,
				c,
				len;

	/* Set up the internal state */
	len = keylen;
	a = b = c = 0x9e3779b9 + len + 3923095;
	/* If the source pointer is word-aligned, we use word-wide fetches */
	if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
	{
		/* Code path for aligned source data */
		const uint32 *ka = (const uint32 *) k;

		/* handle most of the key */
		while (len >= 12)
		{
			a += ka[0];
			b += ka[1];
			c += ka[2];
			mix(a, b, c);
			ka += 3;
			len -= 12;
		}
		/* handle the last 11 bytes */
		k = (const unsigned char *) ka;
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}
	else
	{
		/* Code path for non-aligned source data */

		/* handle most of the key */
		while (len >= 12)
		{
#ifdef WORDS_BIGENDIAN
			a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
			b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
			c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
#else							/* !WORDS_BIGENDIAN */
			a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
			b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
			c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
#endif							/* WORDS_BIGENDIAN */
			mix(a, b, c);
			k += 12;
			len -= 12;
		}
		/* handle the last 11 bytes */
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += k[7];
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += k[3];
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ((uint32) k[7] << 24);
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ((uint32) k[3] << 24);
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}
	final(a, b, c);

	/* report the result */
	return c;
}
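/*
 * Example usage (a minimal sketch; "nbuckets" is a hypothetical power-of-2
 * table size, following the bitmask advice in the comment above):
 *
 *		uint32	h = hash_bytes((const unsigned char *) key, keylen);
 *		uint32	bucket = h & (nbuckets - 1);
 */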
/*
 * hash_bytes_extended() -- hash into a 64-bit value, using an optional seed
 *		k		: the key (the unaligned variable-length array of bytes)
 *		len		: the length of the key, counting by bytes
 *		seed	: a 64-bit seed (0 means no seed)
 *
 * Returns a uint64 value.  Otherwise similar to hash_bytes.
 */
uint64
hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed)
{
	uint32		a,
				b,
				c,
				len;

	/* Set up the internal state */
	len = keylen;
	a = b = c = 0x9e3779b9 + len + 3923095;
	/* If the seed is non-zero, use it to perturb the internal state. */
	if (seed != 0)
	{
		/*
		 * In essence, the seed is treated as part of the data being hashed,
		 * but for simplicity, we pretend that it's padded with four bytes of
		 * zeroes so that the seed constitutes a 12-byte chunk.
		 */
		a += (uint32) (seed >> 32);
		b += (uint32) seed;
		mix(a, b, c);
	}
	/* If the source pointer is word-aligned, we use word-wide fetches */
	if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
	{
		/* Code path for aligned source data */
		const uint32 *ka = (const uint32 *) k;

		/* handle most of the key */
		while (len >= 12)
		{
			a += ka[0];
			b += ka[1];
			c += ka[2];
			mix(a, b, c);
			ka += 3;
			len -= 12;
		}
		/* handle the last 11 bytes */
		k = (const unsigned char *) ka;
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}
	else
	{
		/* Code path for non-aligned source data */

		/* handle most of the key */
		while (len >= 12)
		{
#ifdef WORDS_BIGENDIAN
			a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
			b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
			c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
#else							/* !WORDS_BIGENDIAN */
			a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
			b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
			c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
#endif							/* WORDS_BIGENDIAN */
			mix(a, b, c);
			k += 12;
			len -= 12;
		}
		/* handle the last 11 bytes */
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += k[7];
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += k[3];
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ((uint32) k[7] << 24);
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ((uint32) k[3] << 24);
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}
	final(a, b, c);

	/* report the result */
	return ((uint64) b << 32) | c;
}
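/*
 * Example usage (a minimal sketch; "my_seed" is a hypothetical caller-chosen
 * value, e.g. for per-batch or partitioned hashing):
 *
 *		uint64	h = hash_bytes_extended((const unsigned char *) key,
 *										keylen, my_seed);
 *
 * Passing seed = 0 skips the perturbation step above, so the low 32 bits
 * of the result then match hash_bytes() for the same key.
 */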
/*
 * hash_bytes_uint32() -- hash a 32-bit value to a 32-bit value
 *
 * This has the same result as
 *		hash_bytes(&k, sizeof(uint32))
 * but is faster and doesn't force the caller to store k into memory.
 */
uint32
hash_bytes_uint32(uint32 k)
{
	uint32		a,
				b,
				c;

	a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
	a += k;

	final(a, b, c);

	/* report the result */
	return c;
}
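/*
 * Illustrative equivalence (per the comment above; a sketch, not compiled):
 *
 *		uint32	v = 42;
 *
 *		Assert(hash_bytes_uint32(v) ==
 *			   hash_bytes((const unsigned char *) &v, sizeof(uint32)));
 */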
/*
 * hash_bytes_uint32_extended() -- hash 32-bit value to 64-bit value, with seed
 *
 * Like hash_bytes_uint32, this is a convenience function.
 */
uint64
hash_bytes_uint32_extended(uint32 k, uint64 seed)
{
	uint32		a,
				b,
				c;

	a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;

	/* If the seed is non-zero, use it to perturb the internal state. */
	if (seed != 0)
	{
		a += (uint32) (seed >> 32);
		b += (uint32) seed;
		mix(a, b, c);
	}

	a += k;

	final(a, b, c);

	/* report the result */
	return ((uint64) b << 32) | c;
}
/*
 * string_hash: hash function for keys that are NUL-terminated strings.
 *
 * NOTE: this is the default hash function if none is specified.
 */
uint32
string_hash(const void *key, Size keysize)
{
	/*
	 * If the string exceeds keysize-1 bytes, we want to hash only that many,
	 * because when it is copied into the hash table it will be truncated at
	 * that length.
	 */
	Size		s_len = strlen((const char *) key);

	s_len = Min(s_len, keysize - 1);
	return hash_bytes((const unsigned char *) key, (int) s_len);
}
/*
 * tag_hash: hash function for fixed-size tag values
 */
uint32
tag_hash(const void *key, Size keysize)
{
	return hash_bytes((const unsigned char *) key, (int) keysize);
}
/*
 * uint32_hash: hash function for keys that are uint32 or int32
 *
 * (tag_hash works for this case too, but is slower)
 */
uint32
uint32_hash(const void *key, Size keysize)
{
	Assert(keysize == sizeof(uint32));
	return hash_bytes_uint32(*((const uint32 *) key));
}
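/*
 * Example (a sketch, assuming the backend dynahash API from hsearch.h;
 * "MyEntry" is a hypothetical entry struct whose first field is a uint32
 * key): string_hash, tag_hash, and uint32_hash all match dynahash's
 * HashValueFunc signature, so a caller can install one via HASHCTL.hash:
 *
 *		HASHCTL		ctl;
 *
 *		ctl.keysize = sizeof(uint32);
 *		ctl.entrysize = sizeof(MyEntry);
 *		ctl.hash = uint32_hash;
 *		htab = hash_create("my table", 128, &ctl,
 *						   HASH_ELEM | HASH_FUNCTION);
 */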