/*-------------------------------------------------------------------------
 *
 * hashfn.c
 *      Generic hashing functions, and hash functions for use in dynahash.c
 *      hashtables
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *      src/common/hashfn.c
 *
 * NOTES
 *      It is expected that every bit of a hash function's 32-bit result is
 *      as random as every other; failure to ensure this is likely to lead
 *      to poor performance of hash tables.  In most cases a hash
 *      function should use hash_bytes() or its variant hash_bytes_uint32(),
 *      or the wrappers hash_any() and hash_uint32 defined in hashfn.h.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "common/hashfn.h"
#include "port/pg_bitutils.h"

/*
 * This hash function was written by Bob Jenkins
 * (bob_jenkins@burtleburtle.net), and superficially adapted
 * for PostgreSQL by Neil Conway.  For more information on this
 * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
 * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
 *
 * In the current code, we have adopted Bob's 2006 update of his hash
 * function to fetch the data a word at a time when it is suitably aligned.
 * This makes for a useful speedup, at the cost of having to maintain
 * four code paths (aligned vs unaligned, and little-endian vs big-endian).
 * It also uses two separate mixing functions mix() and final(), instead
 * of a slower multi-purpose function.
 */

/* Get a bit mask of the bits set in non-uint32 aligned addresses */
#define UINT32_ALIGN_MASK (sizeof(uint32) - 1)

#define rot(x,k) pg_rotate_left32(x, k)
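
/*
 * Illustrative sketch (not part of this file): rot() is a plain 32-bit
 * left rotation, so bits shifted off the top re-enter at the bottom.
 * The guard macro and function name below are hypothetical, used only
 * to show the identity; pg_rotate_left32() is defined in pg_bitutils.h.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static void
rot_example(void)
{
    uint32      x = 0x80000001;

    /* the high bit wraps around to join the low bit */
    Assert(rot(x, 1) == 0x00000003);
    /* equivalent to the usual shift-and-or form for 0 < k < 32 */
    Assert(rot(x, 5) == ((x << 5) | (x >> (32 - 5))));
}
#endif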

/*----------
 * mix -- mix 3 32-bit values reversibly.
 *
 * This is reversible, so any information in (a,b,c) before mix() is
 * still in (a,b,c) after mix().
 *
 * If four pairs of (a,b,c) inputs are run through mix(), or through
 * mix() in reverse, there are at least 32 bits of the output that
 * are sometimes the same for one pair and different for another pair.
 * This was tested for:
 * * pairs that differed by one bit, by two bits, in any combination
 *   of top bits of (a,b,c), or in any combination of bottom bits of
 *   (a,b,c).
 * * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 *   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 *   is commonly produced by subtraction) looks like a single 1-bit
 *   difference.
 * * the base values were pseudorandom, all zero but one bit set, or
 *   all zero plus a counter that starts at zero.
 *
 * This does not achieve avalanche.  There are input bits of (a,b,c)
 * that fail to affect some output bits of (a,b,c), especially of a.  The
 * most thoroughly mixed value is c, but it doesn't really even achieve
 * avalanche in c.
 *
 * This allows some parallelism.  Read-after-writes are good at doubling
 * the number of bits affected, so the goal of mixing pulls in the opposite
 * direction from the goal of parallelism.  I did what I could.  Rotates
 * seem to cost as much as shifts on every machine I could lay my hands on,
 * and rotates are much kinder to the top and bottom bits, so I used rotates.
 *----------
 */
#define mix(a,b,c) \
{ \
    a -= c;  a ^= rot(c, 4);  c += b; \
    b -= a;  b ^= rot(a, 6);  a += c; \
    c -= b;  c ^= rot(b, 8);  b += a; \
    a -= c;  a ^= rot(c,16);  c += b; \
    b -= a;  b ^= rot(a,19);  a += c; \
    c -= b;  c ^= rot(b, 4);  b += a; \
}

/*----------
 * final -- final mixing of 3 32-bit values (a,b,c) into c
 *
 * Pairs of (a,b,c) values differing in only a few bits will usually
 * produce values of c that look totally different.  This was tested for
 * * pairs that differed by one bit, by two bits, in any combination
 *   of top bits of (a,b,c), or in any combination of bottom bits of
 *   (a,b,c).
 * * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 *   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 *   is commonly produced by subtraction) looks like a single 1-bit
 *   difference.
 * * the base values were pseudorandom, all zero but one bit set, or
 *   all zero plus a counter that starts at zero.
 *
 * The use of separate functions for mix() and final() allows for a
 * substantial performance increase since final() does not need to
 * do well in reverse, but it does need to affect all output bits.
 * mix(), on the other hand, does not need to affect all output
 * bits (affecting 32 bits is enough).  The original hash function had
 * a single mixing operation that had to satisfy both sets of requirements
 * and was slower as a result.
 *----------
 */
#define final(a,b,c) \
{ \
    c ^= b; c -= rot(b,14); \
    a ^= c; a -= rot(c,11); \
    b ^= a; b -= rot(a,25); \
    c ^= b; c -= rot(b,16); \
    a ^= c; a -= rot(c, 4); \
    b ^= a; b -= rot(a,14); \
    c ^= b; c -= rot(b,24); \
}

/*
 * hash_bytes() -- hash a variable-length key into a 32-bit value
 *      k       : the key (the unaligned variable-length array of bytes)
 *      len     : the length of the key, counting by bytes
 *
 * Returns a uint32 value.  Every bit of the key affects every bit of
 * the return value.  Every 1-bit and 2-bit delta achieves avalanche.
 * About 6*len+35 instructions.  The best hash table sizes are powers
 * of 2.  There is no need to do mod a prime (mod is sooo slow!).
 * If you need less than 32 bits, use a bitmask.
 *
 * This procedure must never throw elog(ERROR); the ResourceOwner code
 * relies on this not to fail.
 *
 * Note: we could easily change this function to return a 64-bit hash value
 * by using the final values of both b and c.  b is perhaps a little less
 * well mixed than c, however.
 */
uint32
hash_bytes(const unsigned char *k, int keylen)
{
    uint32      a,
                b,
                c,
                len;

    /* Set up the internal state */
    len = keylen;
    a = b = c = 0x9e3779b9 + len + 3923095;

    /* If the source pointer is word-aligned, we use word-wide fetches */
    if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
    {
        /* Code path for aligned source data */
        const uint32 *ka = (const uint32 *) k;

        /* handle most of the key */
        while (len >= 12)
        {
            a += ka[0];
            b += ka[1];
            c += ka[2];
            mix(a, b, c);
            ka += 3;
            len -= 12;
        }

        /* handle the last 11 bytes */
        k = (const unsigned char *) ka;
#ifdef WORDS_BIGENDIAN
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 8);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 24);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ka[1];
                a += ka[0];
                break;
            case 7:
                b += ((uint32) k[6] << 8);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 16);
                /* fall through */
            case 5:
                b += ((uint32) k[4] << 24);
                /* fall through */
            case 4:
                a += ka[0];
                break;
            case 3:
                a += ((uint32) k[2] << 8);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 16);
                /* fall through */
            case 1:
                a += ((uint32) k[0] << 24);
                /* case 0: nothing left to add */
        }
#else                           /* !WORDS_BIGENDIAN */
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 24);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 8);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ka[1];
                a += ka[0];
                break;
            case 7:
                b += ((uint32) k[6] << 16);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 8);
                /* fall through */
            case 5:
                b += k[4];
                /* fall through */
            case 4:
                a += ka[0];
                break;
            case 3:
                a += ((uint32) k[2] << 16);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 8);
                /* fall through */
            case 1:
                a += k[0];
                /* case 0: nothing left to add */
        }
#endif                          /* WORDS_BIGENDIAN */
    }
    else
    {
        /* Code path for non-aligned source data */

        /* handle most of the key */
        while (len >= 12)
        {
#ifdef WORDS_BIGENDIAN
            a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
            b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
            c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
#else                           /* !WORDS_BIGENDIAN */
            a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
            b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
            c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
#endif                          /* WORDS_BIGENDIAN */
            mix(a, b, c);
            k += 12;
            len -= 12;
        }

        /* handle the last 11 bytes */
#ifdef WORDS_BIGENDIAN
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 8);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 24);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += k[7];
                /* fall through */
            case 7:
                b += ((uint32) k[6] << 8);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 16);
                /* fall through */
            case 5:
                b += ((uint32) k[4] << 24);
                /* fall through */
            case 4:
                a += k[3];
                /* fall through */
            case 3:
                a += ((uint32) k[2] << 8);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 16);
                /* fall through */
            case 1:
                a += ((uint32) k[0] << 24);
                /* case 0: nothing left to add */
        }
#else                           /* !WORDS_BIGENDIAN */
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 24);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 8);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ((uint32) k[7] << 24);
                /* fall through */
            case 7:
                b += ((uint32) k[6] << 16);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 8);
                /* fall through */
            case 5:
                b += k[4];
                /* fall through */
            case 4:
                a += ((uint32) k[3] << 24);
                /* fall through */
            case 3:
                a += ((uint32) k[2] << 16);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 8);
                /* fall through */
            case 1:
                a += k[0];
                /* case 0: nothing left to add */
        }
#endif                          /* WORDS_BIGENDIAN */
    }

    final(a, b, c);

    /* report the result */
    return c;
}
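
/*
 * Usage sketch (illustrative only; the guard macro and function name are
 * hypothetical): per the comment on hash_bytes() above, the best table
 * sizes are powers of 2, so a caller can reduce the 32-bit result with a
 * bitmask rather than a modulo.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static uint32
example_bucket_for_key(const unsigned char *key, int keylen)
{
    uint32      nbuckets = 1024; /* must be a power of 2 */

    /* mask keeps the low 10 bits; no "mod a prime" needed */
    return hash_bytes(key, keylen) & (nbuckets - 1);
}
#endif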

/*
 * hash_bytes_extended() -- hash into a 64-bit value, using an optional seed
 *      k       : the key (the unaligned variable-length array of bytes)
 *      len     : the length of the key, counting by bytes
 *      seed    : a 64-bit seed (0 means no seed)
 *
 * Returns a uint64 value.  Otherwise similar to hash_bytes.
 */
uint64
hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed)
{
    uint32      a,
                b,
                c,
                len;

    /* Set up the internal state */
    len = keylen;
    a = b = c = 0x9e3779b9 + len + 3923095;

    /* If the seed is non-zero, use it to perturb the internal state. */
    if (seed != 0)
    {
        /*
         * In essence, the seed is treated as part of the data being hashed,
         * but for simplicity, we pretend that it's padded with four bytes of
         * zeroes so that the seed constitutes a 12-byte chunk.
         */
        a += (uint32) (seed >> 32);
        b += (uint32) seed;
        mix(a, b, c);
    }

    /* If the source pointer is word-aligned, we use word-wide fetches */
    if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
    {
        /* Code path for aligned source data */
        const uint32 *ka = (const uint32 *) k;

        /* handle most of the key */
        while (len >= 12)
        {
            a += ka[0];
            b += ka[1];
            c += ka[2];
            mix(a, b, c);
            ka += 3;
            len -= 12;
        }

        /* handle the last 11 bytes */
        k = (const unsigned char *) ka;
#ifdef WORDS_BIGENDIAN
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 8);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 24);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ka[1];
                a += ka[0];
                break;
            case 7:
                b += ((uint32) k[6] << 8);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 16);
                /* fall through */
            case 5:
                b += ((uint32) k[4] << 24);
                /* fall through */
            case 4:
                a += ka[0];
                break;
            case 3:
                a += ((uint32) k[2] << 8);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 16);
                /* fall through */
            case 1:
                a += ((uint32) k[0] << 24);
                /* case 0: nothing left to add */
        }
#else                           /* !WORDS_BIGENDIAN */
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 24);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 8);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ka[1];
                a += ka[0];
                break;
            case 7:
                b += ((uint32) k[6] << 16);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 8);
                /* fall through */
            case 5:
                b += k[4];
                /* fall through */
            case 4:
                a += ka[0];
                break;
            case 3:
                a += ((uint32) k[2] << 16);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 8);
                /* fall through */
            case 1:
                a += k[0];
                /* case 0: nothing left to add */
        }
#endif                          /* WORDS_BIGENDIAN */
    }
    else
    {
        /* Code path for non-aligned source data */

        /* handle most of the key */
        while (len >= 12)
        {
#ifdef WORDS_BIGENDIAN
            a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
            b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
            c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
#else                           /* !WORDS_BIGENDIAN */
            a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
            b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
            c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
#endif                          /* WORDS_BIGENDIAN */
            mix(a, b, c);
            k += 12;
            len -= 12;
        }

        /* handle the last 11 bytes */
#ifdef WORDS_BIGENDIAN
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 8);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 24);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += k[7];
                /* fall through */
            case 7:
                b += ((uint32) k[6] << 8);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 16);
                /* fall through */
            case 5:
                b += ((uint32) k[4] << 24);
                /* fall through */
            case 4:
                a += k[3];
                /* fall through */
            case 3:
                a += ((uint32) k[2] << 8);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 16);
                /* fall through */
            case 1:
                a += ((uint32) k[0] << 24);
                /* case 0: nothing left to add */
        }
#else                           /* !WORDS_BIGENDIAN */
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 24);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 8);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ((uint32) k[7] << 24);
                /* fall through */
            case 7:
                b += ((uint32) k[6] << 16);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 8);
                /* fall through */
            case 5:
                b += k[4];
                /* fall through */
            case 4:
                a += ((uint32) k[3] << 24);
                /* fall through */
            case 3:
                a += ((uint32) k[2] << 16);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 8);
                /* fall through */
            case 1:
                a += k[0];
                /* case 0: nothing left to add */
        }
#endif                          /* WORDS_BIGENDIAN */
    }

    final(a, b, c);

    /* report the result */
    return ((uint64) b << 32) | c;
}
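
/*
 * Usage sketch (illustrative only; the guard macro and function name are
 * hypothetical): the seed lets a caller derive multiple hash values from
 * the same key, e.g. for a Bloom-filter-style scheme that needs several
 * hash functions.  Distinct non-zero seeds perturb the internal state
 * differently, as described above.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static uint64
example_seeded_hash(const unsigned char *key, int keylen, int which)
{
    /* "which + 1" keeps the seed non-zero, so it is always applied */
    return hash_bytes_extended(key, keylen, (uint64) which + 1);
}
#endif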

/*
 * hash_bytes_uint32() -- hash a 32-bit value to a 32-bit value
 *
 * This has the same result as
 *      hash_bytes(&k, sizeof(uint32))
 * but is faster and doesn't force the caller to store k into memory.
 */
uint32
hash_bytes_uint32(uint32 k)
{
    uint32      a,
                b,
                c;

    a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
    a += k;

    final(a, b, c);

    /* report the result */
    return c;
}
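
/*
 * Sketch of the equivalence claimed in the comment above (the guard macro
 * and function name are hypothetical): hash_bytes_uint32(k) must match
 * hashing the in-memory bytes of k.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static void
example_check_uint32_hash(uint32 k)
{
    Assert(hash_bytes_uint32(k) ==
           hash_bytes((const unsigned char *) &k, sizeof(uint32)));
}
#endif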

/*
 * hash_bytes_uint32_extended() -- hash 32-bit value to 64-bit value, with seed
 *
 * Like hash_bytes_uint32, this is a convenience function.
 */
uint64
hash_bytes_uint32_extended(uint32 k, uint64 seed)
{
    uint32      a,
                b,
                c;

    a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;

    if (seed != 0)
    {
        a += (uint32) (seed >> 32);
        b += (uint32) seed;
        mix(a, b, c);
    }

    a += k;

    final(a, b, c);

    /* report the result */
    return ((uint64) b << 32) | c;
}

/*
 * string_hash: hash function for keys that are NUL-terminated strings.
 *
 * NOTE: this is the default hash function if none is specified.
 */
uint32
string_hash(const void *key, Size keysize)
{
    /*
     * If the string exceeds keysize-1 bytes, we want to hash only that many,
     * because when it is copied into the hash table it will be truncated at
     * that length.
     */
    Size        s_len = strlen((const char *) key);

    s_len = Min(s_len, keysize - 1);
    return hash_bytes((const unsigned char *) key, (int) s_len);
}
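
/*
 * Illustrative consequence of the truncation above (the guard macro and
 * function name are hypothetical): two strings that agree on their first
 * keysize-1 bytes hash identically, matching the truncation the hash table
 * applies when it stores the key.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static void
example_string_hash_truncation(void)
{
    Size        keysize = 8;

    /* only the first keysize-1 = 7 bytes are hashed, so these collide */
    Assert(string_hash("abcdefgh", keysize) ==
           string_hash("abcdefgX", keysize));
}
#endif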

/*
 * tag_hash: hash function for fixed-size tag values
 */
uint32
tag_hash(const void *key, Size keysize)
{
    return hash_bytes((const unsigned char *) key, (int) keysize);
}

/*
 * uint32_hash: hash function for keys that are uint32 or int32
 *
 * (tag_hash works for this case too, but is slower)
 */
uint32
uint32_hash(const void *key, Size keysize)
{
    Assert(keysize == sizeof(uint32));
    return hash_bytes_uint32(*((const uint32 *) key));
}