1 /*-------------------------------------------------------------------------
4 * Support functions for hash access method.
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
14 * These functions are stored in pg_amproc. For each operator class
15 * defined for hash indexes, they compute the hash value of the argument.
17 * Additional hash functions appear in /utils/adt/ files for various
18 * specialized datatypes.
20 * It is expected that every bit of a hash function's 32-bit result is
21 * as random as every other; failure to ensure this is likely to lead
22 * to poor performance of hash joins, for example. In most cases a hash
23 * function should use hash_any() or its variant hash_uint32().
24 *-------------------------------------------------------------------------
29 #include "access/hash.h"
32 /* Note: this is used for both "char" and boolean datatypes */
34 hashchar(PG_FUNCTION_ARGS
)
36 return hash_uint32((int32
) PG_GETARG_CHAR(0));
40 hashint2(PG_FUNCTION_ARGS
)
42 return hash_uint32((int32
) PG_GETARG_INT16(0));
46 hashint4(PG_FUNCTION_ARGS
)
48 return hash_uint32(PG_GETARG_INT32(0));
52 hashint8(PG_FUNCTION_ARGS
)
55 * The idea here is to produce a hash value compatible with the values
56 * produced by hashint4 and hashint2 for logically equal inputs; this is
57 * necessary to support cross-type hash joins across these input types.
58 * Since all three types are signed, we can xor the high half of the int8
59 * value if the sign is positive, or the complement of the high half when
60 * the sign is negative.
62 #ifndef INT64_IS_BUSTED
63 int64 val
= PG_GETARG_INT64(0);
64 uint32 lohalf
= (uint32
) val
;
65 uint32 hihalf
= (uint32
) (val
>> 32);
67 lohalf
^= (val
>= 0) ? hihalf
: ~hihalf
;
69 return hash_uint32(lohalf
);
71 /* here if we can't count on "x >> 32" to work sanely */
72 return hash_uint32((int32
) PG_GETARG_INT64(0));
77 hashoid(PG_FUNCTION_ARGS
)
79 return hash_uint32((uint32
) PG_GETARG_OID(0));
83 hashenum(PG_FUNCTION_ARGS
)
85 return hash_uint32((uint32
) PG_GETARG_OID(0));
89 hashfloat4(PG_FUNCTION_ARGS
)
91 float4 key
= PG_GETARG_FLOAT4(0);
95 * On IEEE-float machines, minus zero and zero have different bit patterns
96 * but should compare as equal. We must ensure that they have the same
97 * hash value, which is most reliably done this way:
99 if (key
== (float4
) 0)
103 * To support cross-type hashing of float8 and float4, we want to return
104 * the same hash value hashfloat8 would produce for an equal float8 value.
105 * So, widen the value to float8 and hash that. (We must do this rather
106 * than have hashfloat8 try to narrow its value to float4; that could fail
111 return hash_any((unsigned char *) &key8
, sizeof(key8
));
115 hashfloat8(PG_FUNCTION_ARGS
)
117 float8 key
= PG_GETARG_FLOAT8(0);
120 * On IEEE-float machines, minus zero and zero have different bit patterns
121 * but should compare as equal. We must ensure that they have the same
122 * hash value, which is most reliably done this way:
124 if (key
== (float8
) 0)
127 return hash_any((unsigned char *) &key
, sizeof(key
));
131 hashoidvector(PG_FUNCTION_ARGS
)
133 oidvector
*key
= (oidvector
*) PG_GETARG_POINTER(0);
135 return hash_any((unsigned char *) key
->values
, key
->dim1
* sizeof(Oid
));
139 hashint2vector(PG_FUNCTION_ARGS
)
141 int2vector
*key
= (int2vector
*) PG_GETARG_POINTER(0);
143 return hash_any((unsigned char *) key
->values
, key
->dim1
* sizeof(int2
));
147 hashname(PG_FUNCTION_ARGS
)
149 char *key
= NameStr(*PG_GETARG_NAME(0));
150 int keylen
= strlen(key
);
152 Assert(keylen
< NAMEDATALEN
); /* else it's not truncated correctly */
154 return hash_any((unsigned char *) key
, keylen
);
158 hashtext(PG_FUNCTION_ARGS
)
160 text
*key
= PG_GETARG_TEXT_PP(0);
164 * Note: this is currently identical in behavior to hashvarlena, but keep
165 * it as a separate function in case we someday want to do something
166 * different in non-C locales. (See also hashbpchar, if so.)
168 result
= hash_any((unsigned char *) VARDATA_ANY(key
),
169 VARSIZE_ANY_EXHDR(key
));
171 /* Avoid leaking memory for toasted inputs */
172 PG_FREE_IF_COPY(key
, 0);
178 * hashvarlena() can be used for any varlena datatype in which there are
179 * no non-significant bits, ie, distinct bitpatterns never compare as equal.
182 hashvarlena(PG_FUNCTION_ARGS
)
184 struct varlena
*key
= PG_GETARG_VARLENA_PP(0);
187 result
= hash_any((unsigned char *) VARDATA_ANY(key
),
188 VARSIZE_ANY_EXHDR(key
));
190 /* Avoid leaking memory for toasted inputs */
191 PG_FREE_IF_COPY(key
, 0);
197 * This hash function was written by Bob Jenkins
198 * (bob_jenkins@burtleburtle.net), and superficially adapted
199 * for PostgreSQL by Neil Conway. For more information on this
200 * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
201 * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
203 * In the current code, we have adopted Bob's 2006 update of his hash
204 * function to fetch the data a word at a time when it is suitably aligned.
205 * This makes for a useful speedup, at the cost of having to maintain
206 * four code paths (aligned vs unaligned, and little-endian vs big-endian).
207 * It also uses two separate mixing functions mix() and final(), instead
208 * of a slower multi-purpose function.
211 /* Get a bit mask of the bits set in non-uint32 aligned addresses */
212 #define UINT32_ALIGN_MASK (sizeof(uint32) - 1)
214 /* Rotate a uint32 value left by k bits - note multiple evaluation! */
215 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
218 * mix -- mix 3 32-bit values reversibly.
220 * This is reversible, so any information in (a,b,c) before mix() is
221 * still in (a,b,c) after mix().
223 * If four pairs of (a,b,c) inputs are run through mix(), or through
224 * mix() in reverse, there are at least 32 bits of the output that
225 * are sometimes the same for one pair and different for another pair.
226 * This was tested for:
227 * * pairs that differed by one bit, by two bits, in any combination
228 * of top bits of (a,b,c), or in any combination of bottom bits of
230 * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
231 * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
232 * is commonly produced by subtraction) look like a single 1-bit
234 * * the base values were pseudorandom, all zero but one bit set, or
235 * all zero plus a counter that starts at zero.
237 * This does not achieve avalanche. There are input bits of (a,b,c)
238 * that fail to affect some output bits of (a,b,c), especially of a. The
239 * most thoroughly mixed value is c, but it doesn't really even achieve
242 * This allows some parallelism. Read-after-writes are good at doubling
243 * the number of bits affected, so the goal of mixing pulls in the opposite
244 * direction from the goal of parallelism. I did what I could. Rotates
245 * seem to cost as much as shifts on every machine I could lay my hands on,
246 * and rotates are much kinder to the top and bottom bits, so I used rotates.
251 a -= c; a ^= rot(c, 4); c += b; \
252 b -= a; b ^= rot(a, 6); a += c; \
253 c -= b; c ^= rot(b, 8); b += a; \
254 a -= c; a ^= rot(c,16); c += b; \
255 b -= a; b ^= rot(a,19); a += c; \
256 c -= b; c ^= rot(b, 4); b += a; \
260 * final -- final mixing of 3 32-bit values (a,b,c) into c
262 * Pairs of (a,b,c) values differing in only a few bits will usually
263 * produce values of c that look totally different. This was tested for
264 * * pairs that differed by one bit, by two bits, in any combination
265 * of top bits of (a,b,c), or in any combination of bottom bits of
267 * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
268 * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
269 * is commonly produced by subtraction) look like a single 1-bit
271 * * the base values were pseudorandom, all zero but one bit set, or
272 * all zero plus a counter that starts at zero.
274 * The use of separate functions for mix() and final() allow for a
275 * substantial performance increase since final() does not need to
276 * do well in reverse, but is does need to affect all output bits.
277 * mix(), on the other hand, does not need to affect all output
278 * bits (affecting 32 bits is enough). The original hash function had
279 * a single mixing operation that had to satisfy both sets of requirements
280 * and was slower as a result.
283 #define final(a,b,c) \
285 c ^= b; c -= rot(b,14); \
286 a ^= c; a -= rot(c,11); \
287 b ^= a; b -= rot(a,25); \
288 c ^= b; c -= rot(b,16); \
289 a ^= c; a -= rot(c, 4); \
290 b ^= a; b -= rot(a,14); \
291 c ^= b; c -= rot(b,24); \
295 * hash_any() -- hash a variable-length key into a 32-bit value
296 * k : the key (the unaligned variable-length array of bytes)
297 * len : the length of the key, counting by bytes
299 * Returns a uint32 value. Every bit of the key affects every bit of
300 * the return value. Every 1-bit and 2-bit delta achieves avalanche.
301 * About 6*len+35 instructions. The best hash table sizes are powers
302 * of 2. There is no need to do mod a prime (mod is sooo slow!).
303 * If you need less than 32 bits, use a bitmask.
305 * Note: we could easily change this function to return a 64-bit hash value
306 * by using the final values of both b and c. b is perhaps a little less
307 * well mixed than c, however.
310 hash_any(register const unsigned char *k
, register int keylen
)
317 /* Set up the internal state */
319 a
= b
= c
= 0x9e3779b9 + len
+ 3923095;
321 /* If the source pointer is word-aligned, we use word-wide fetches */
322 if (((long) k
& UINT32_ALIGN_MASK
) == 0)
324 /* Code path for aligned source data */
325 register const uint32
*ka
= (const uint32
*) k
;
327 /* handle most of the key */
338 /* handle the last 11 bytes */
339 k
= (const unsigned char *) ka
;
340 #ifdef WORDS_BIGENDIAN
344 c
+= ((uint32
) k
[10] << 8);
347 c
+= ((uint32
) k
[9] << 16);
350 c
+= ((uint32
) k
[8] << 24);
351 /* the lowest byte of c is reserved for the length */
358 b
+= ((uint32
) k
[6] << 8);
361 b
+= ((uint32
) k
[5] << 16);
364 b
+= ((uint32
) k
[4] << 24);
370 a
+= ((uint32
) k
[2] << 8);
373 a
+= ((uint32
) k
[1] << 16);
376 a
+= ((uint32
) k
[0] << 24);
377 /* case 0: nothing left to add */
379 #else /* !WORDS_BIGENDIAN */
383 c
+= ((uint32
) k
[10] << 24);
386 c
+= ((uint32
) k
[9] << 16);
389 c
+= ((uint32
) k
[8] << 8);
390 /* the lowest byte of c is reserved for the length */
397 b
+= ((uint32
) k
[6] << 16);
400 b
+= ((uint32
) k
[5] << 8);
409 a
+= ((uint32
) k
[2] << 16);
412 a
+= ((uint32
) k
[1] << 8);
416 /* case 0: nothing left to add */
418 #endif /* WORDS_BIGENDIAN */
422 /* Code path for non-aligned source data */
424 /* handle most of the key */
427 #ifdef WORDS_BIGENDIAN
428 a
+= (k
[3] + ((uint32
) k
[2] << 8) + ((uint32
) k
[1] << 16) + ((uint32
) k
[0] << 24));
429 b
+= (k
[7] + ((uint32
) k
[6] << 8) + ((uint32
) k
[5] << 16) + ((uint32
) k
[4] << 24));
430 c
+= (k
[11] + ((uint32
) k
[10] << 8) + ((uint32
) k
[9] << 16) + ((uint32
) k
[8] << 24));
431 #else /* !WORDS_BIGENDIAN */
432 a
+= (k
[0] + ((uint32
) k
[1] << 8) + ((uint32
) k
[2] << 16) + ((uint32
) k
[3] << 24));
433 b
+= (k
[4] + ((uint32
) k
[5] << 8) + ((uint32
) k
[6] << 16) + ((uint32
) k
[7] << 24));
434 c
+= (k
[8] + ((uint32
) k
[9] << 8) + ((uint32
) k
[10] << 16) + ((uint32
) k
[11] << 24));
435 #endif /* WORDS_BIGENDIAN */
441 /* handle the last 11 bytes */
442 #ifdef WORDS_BIGENDIAN
443 switch (len
) /* all the case statements fall through */
446 c
+= ((uint32
) k
[10] << 8);
448 c
+= ((uint32
) k
[9] << 16);
450 c
+= ((uint32
) k
[8] << 24);
451 /* the lowest byte of c is reserved for the length */
455 b
+= ((uint32
) k
[6] << 8);
457 b
+= ((uint32
) k
[5] << 16);
459 b
+= ((uint32
) k
[4] << 24);
463 a
+= ((uint32
) k
[2] << 8);
465 a
+= ((uint32
) k
[1] << 16);
467 a
+= ((uint32
) k
[0] << 24);
468 /* case 0: nothing left to add */
470 #else /* !WORDS_BIGENDIAN */
471 switch (len
) /* all the case statements fall through */
474 c
+= ((uint32
) k
[10] << 24);
476 c
+= ((uint32
) k
[9] << 16);
478 c
+= ((uint32
) k
[8] << 8);
479 /* the lowest byte of c is reserved for the length */
481 b
+= ((uint32
) k
[7] << 24);
483 b
+= ((uint32
) k
[6] << 16);
485 b
+= ((uint32
) k
[5] << 8);
489 a
+= ((uint32
) k
[3] << 24);
491 a
+= ((uint32
) k
[2] << 16);
493 a
+= ((uint32
) k
[1] << 8);
496 /* case 0: nothing left to add */
498 #endif /* WORDS_BIGENDIAN */
503 /* report the result */
504 return UInt32GetDatum(c
);
508 * hash_uint32() -- hash a 32-bit value
510 * This has the same result as
511 * hash_any(&k, sizeof(uint32))
512 * but is faster and doesn't force the caller to store k into memory.
515 hash_uint32(uint32 k
)
521 a
= b
= c
= 0x9e3779b9 + (uint32
) sizeof(uint32
) + 3923095;
526 /* report the result */
527 return UInt32GetDatum(c
);