/*
 * Source: PostgreSQL repository, src/backend/access/hash/hashfunc.c
 * (gitweb page header removed)
 */
1 /*-------------------------------------------------------------------------
3 * hashfunc.c
4 * Support functions for hash access method.
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * $PostgreSQL$
13 * NOTES
14 * These functions are stored in pg_amproc. For each operator class
15 * defined for hash indexes, they compute the hash value of the argument.
17 * Additional hash functions appear in /utils/adt/ files for various
18 * specialized datatypes.
20 * It is expected that every bit of a hash function's 32-bit result is
21 * as random as every other; failure to ensure this is likely to lead
22 * to poor performance of hash joins, for example. In most cases a hash
23 * function should use hash_any() or its variant hash_uint32().
 *-------------------------------------------------------------------------
 */
27 #include "postgres.h"
29 #include "access/hash.h"
32 /* Note: this is used for both "char" and boolean datatypes */
33 Datum
34 hashchar(PG_FUNCTION_ARGS)
36 return hash_uint32((int32) PG_GETARG_CHAR(0));
39 Datum
40 hashint2(PG_FUNCTION_ARGS)
42 return hash_uint32((int32) PG_GETARG_INT16(0));
45 Datum
46 hashint4(PG_FUNCTION_ARGS)
48 return hash_uint32(PG_GETARG_INT32(0));
51 Datum
52 hashint8(PG_FUNCTION_ARGS)
55 * The idea here is to produce a hash value compatible with the values
56 * produced by hashint4 and hashint2 for logically equal inputs; this is
57 * necessary to support cross-type hash joins across these input types.
58 * Since all three types are signed, we can xor the high half of the int8
59 * value if the sign is positive, or the complement of the high half when
60 * the sign is negative.
62 #ifndef INT64_IS_BUSTED
63 int64 val = PG_GETARG_INT64(0);
64 uint32 lohalf = (uint32) val;
65 uint32 hihalf = (uint32) (val >> 32);
67 lohalf ^= (val >= 0) ? hihalf : ~hihalf;
69 return hash_uint32(lohalf);
70 #else
71 /* here if we can't count on "x >> 32" to work sanely */
72 return hash_uint32((int32) PG_GETARG_INT64(0));
73 #endif
76 Datum
77 hashoid(PG_FUNCTION_ARGS)
79 return hash_uint32((uint32) PG_GETARG_OID(0));
82 Datum
83 hashenum(PG_FUNCTION_ARGS)
85 return hash_uint32((uint32) PG_GETARG_OID(0));
88 Datum
89 hashfloat4(PG_FUNCTION_ARGS)
91 float4 key = PG_GETARG_FLOAT4(0);
92 float8 key8;
95 * On IEEE-float machines, minus zero and zero have different bit patterns
96 * but should compare as equal. We must ensure that they have the same
97 * hash value, which is most reliably done this way:
99 if (key == (float4) 0)
100 PG_RETURN_UINT32(0);
103 * To support cross-type hashing of float8 and float4, we want to return
104 * the same hash value hashfloat8 would produce for an equal float8 value.
105 * So, widen the value to float8 and hash that. (We must do this rather
106 * than have hashfloat8 try to narrow its value to float4; that could fail
107 * on overflow.)
109 key8 = key;
111 return hash_any((unsigned char *) &key8, sizeof(key8));
114 Datum
115 hashfloat8(PG_FUNCTION_ARGS)
117 float8 key = PG_GETARG_FLOAT8(0);
120 * On IEEE-float machines, minus zero and zero have different bit patterns
121 * but should compare as equal. We must ensure that they have the same
122 * hash value, which is most reliably done this way:
124 if (key == (float8) 0)
125 PG_RETURN_UINT32(0);
127 return hash_any((unsigned char *) &key, sizeof(key));
130 Datum
131 hashoidvector(PG_FUNCTION_ARGS)
133 oidvector *key = (oidvector *) PG_GETARG_POINTER(0);
135 return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid));
138 Datum
139 hashint2vector(PG_FUNCTION_ARGS)
141 int2vector *key = (int2vector *) PG_GETARG_POINTER(0);
143 return hash_any((unsigned char *) key->values, key->dim1 * sizeof(int2));
146 Datum
147 hashname(PG_FUNCTION_ARGS)
149 char *key = NameStr(*PG_GETARG_NAME(0));
150 int keylen = strlen(key);
152 Assert(keylen < NAMEDATALEN); /* else it's not truncated correctly */
154 return hash_any((unsigned char *) key, keylen);
157 Datum
158 hashtext(PG_FUNCTION_ARGS)
160 text *key = PG_GETARG_TEXT_PP(0);
161 Datum result;
164 * Note: this is currently identical in behavior to hashvarlena, but keep
165 * it as a separate function in case we someday want to do something
166 * different in non-C locales. (See also hashbpchar, if so.)
168 result = hash_any((unsigned char *) VARDATA_ANY(key),
169 VARSIZE_ANY_EXHDR(key));
171 /* Avoid leaking memory for toasted inputs */
172 PG_FREE_IF_COPY(key, 0);
174 return result;
178 * hashvarlena() can be used for any varlena datatype in which there are
179 * no non-significant bits, ie, distinct bitpatterns never compare as equal.
181 Datum
182 hashvarlena(PG_FUNCTION_ARGS)
184 struct varlena *key = PG_GETARG_VARLENA_PP(0);
185 Datum result;
187 result = hash_any((unsigned char *) VARDATA_ANY(key),
188 VARSIZE_ANY_EXHDR(key));
190 /* Avoid leaking memory for toasted inputs */
191 PG_FREE_IF_COPY(key, 0);
193 return result;
197 * This hash function was written by Bob Jenkins
198 * (bob_jenkins@burtleburtle.net), and superficially adapted
199 * for PostgreSQL by Neil Conway. For more information on this
200 * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
201 * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
203 * In the current code, we have adopted Bob's 2006 update of his hash
204 * function to fetch the data a word at a time when it is suitably aligned.
205 * This makes for a useful speedup, at the cost of having to maintain
206 * four code paths (aligned vs unaligned, and little-endian vs big-endian).
207 * It also uses two separate mixing functions mix() and final(), instead
208 * of a slower multi-purpose function.
211 /* Get a bit mask of the bits set in non-uint32 aligned addresses */
212 #define UINT32_ALIGN_MASK (sizeof(uint32) - 1)
214 /* Rotate a uint32 value left by k bits - note multiple evaluation! */
215 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
217 /*----------
218 * mix -- mix 3 32-bit values reversibly.
220 * This is reversible, so any information in (a,b,c) before mix() is
221 * still in (a,b,c) after mix().
223 * If four pairs of (a,b,c) inputs are run through mix(), or through
224 * mix() in reverse, there are at least 32 bits of the output that
225 * are sometimes the same for one pair and different for another pair.
226 * This was tested for:
227 * * pairs that differed by one bit, by two bits, in any combination
228 * of top bits of (a,b,c), or in any combination of bottom bits of
229 * (a,b,c).
230 * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
231 * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
232 * is commonly produced by subtraction) look like a single 1-bit
233 * difference.
234 * * the base values were pseudorandom, all zero but one bit set, or
235 * all zero plus a counter that starts at zero.
237 * This does not achieve avalanche. There are input bits of (a,b,c)
238 * that fail to affect some output bits of (a,b,c), especially of a. The
239 * most thoroughly mixed value is c, but it doesn't really even achieve
240 * avalanche in c.
242 * This allows some parallelism. Read-after-writes are good at doubling
243 * the number of bits affected, so the goal of mixing pulls in the opposite
244 * direction from the goal of parallelism. I did what I could. Rotates
245 * seem to cost as much as shifts on every machine I could lay my hands on,
246 * and rotates are much kinder to the top and bottom bits, so I used rotates.
247 *----------
249 #define mix(a,b,c) \
251 a -= c; a ^= rot(c, 4); c += b; \
252 b -= a; b ^= rot(a, 6); a += c; \
253 c -= b; c ^= rot(b, 8); b += a; \
254 a -= c; a ^= rot(c,16); c += b; \
255 b -= a; b ^= rot(a,19); a += c; \
256 c -= b; c ^= rot(b, 4); b += a; \
259 /*----------
260 * final -- final mixing of 3 32-bit values (a,b,c) into c
262 * Pairs of (a,b,c) values differing in only a few bits will usually
263 * produce values of c that look totally different. This was tested for
264 * * pairs that differed by one bit, by two bits, in any combination
265 * of top bits of (a,b,c), or in any combination of bottom bits of
266 * (a,b,c).
267 * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
268 * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
269 * is commonly produced by subtraction) look like a single 1-bit
270 * difference.
271 * * the base values were pseudorandom, all zero but one bit set, or
272 * all zero plus a counter that starts at zero.
274 * The use of separate functions for mix() and final() allow for a
275 * substantial performance increase since final() does not need to
276 * do well in reverse, but is does need to affect all output bits.
277 * mix(), on the other hand, does not need to affect all output
278 * bits (affecting 32 bits is enough). The original hash function had
279 * a single mixing operation that had to satisfy both sets of requirements
280 * and was slower as a result.
281 *----------
283 #define final(a,b,c) \
285 c ^= b; c -= rot(b,14); \
286 a ^= c; a -= rot(c,11); \
287 b ^= a; b -= rot(a,25); \
288 c ^= b; c -= rot(b,16); \
289 a ^= c; a -= rot(c, 4); \
290 b ^= a; b -= rot(a,14); \
291 c ^= b; c -= rot(b,24); \
295 * hash_any() -- hash a variable-length key into a 32-bit value
296 * k : the key (the unaligned variable-length array of bytes)
297 * len : the length of the key, counting by bytes
299 * Returns a uint32 value. Every bit of the key affects every bit of
300 * the return value. Every 1-bit and 2-bit delta achieves avalanche.
301 * About 6*len+35 instructions. The best hash table sizes are powers
302 * of 2. There is no need to do mod a prime (mod is sooo slow!).
303 * If you need less than 32 bits, use a bitmask.
305 * Note: we could easily change this function to return a 64-bit hash value
306 * by using the final values of both b and c. b is perhaps a little less
307 * well mixed than c, however.
309 Datum
310 hash_any(register const unsigned char *k, register int keylen)
312 register uint32 a,
315 len;
317 /* Set up the internal state */
318 len = keylen;
319 a = b = c = 0x9e3779b9 + len + 3923095;
321 /* If the source pointer is word-aligned, we use word-wide fetches */
322 if (((long) k & UINT32_ALIGN_MASK) == 0)
324 /* Code path for aligned source data */
325 register const uint32 *ka = (const uint32 *) k;
327 /* handle most of the key */
328 while (len >= 12)
330 a += ka[0];
331 b += ka[1];
332 c += ka[2];
333 mix(a, b, c);
334 ka += 3;
335 len -= 12;
338 /* handle the last 11 bytes */
339 k = (const unsigned char *) ka;
340 #ifdef WORDS_BIGENDIAN
341 switch (len)
343 case 11:
344 c += ((uint32) k[10] << 8);
345 /* fall through */
346 case 10:
347 c += ((uint32) k[9] << 16);
348 /* fall through */
349 case 9:
350 c += ((uint32) k[8] << 24);
351 /* the lowest byte of c is reserved for the length */
352 /* fall through */
353 case 8:
354 b += ka[1];
355 a += ka[0];
356 break;
357 case 7:
358 b += ((uint32) k[6] << 8);
359 /* fall through */
360 case 6:
361 b += ((uint32) k[5] << 16);
362 /* fall through */
363 case 5:
364 b += ((uint32) k[4] << 24);
365 /* fall through */
366 case 4:
367 a += ka[0];
368 break;
369 case 3:
370 a += ((uint32) k[2] << 8);
371 /* fall through */
372 case 2:
373 a += ((uint32) k[1] << 16);
374 /* fall through */
375 case 1:
376 a += ((uint32) k[0] << 24);
377 /* case 0: nothing left to add */
379 #else /* !WORDS_BIGENDIAN */
380 switch (len)
382 case 11:
383 c += ((uint32) k[10] << 24);
384 /* fall through */
385 case 10:
386 c += ((uint32) k[9] << 16);
387 /* fall through */
388 case 9:
389 c += ((uint32) k[8] << 8);
390 /* the lowest byte of c is reserved for the length */
391 /* fall through */
392 case 8:
393 b += ka[1];
394 a += ka[0];
395 break;
396 case 7:
397 b += ((uint32) k[6] << 16);
398 /* fall through */
399 case 6:
400 b += ((uint32) k[5] << 8);
401 /* fall through */
402 case 5:
403 b += k[4];
404 /* fall through */
405 case 4:
406 a += ka[0];
407 break;
408 case 3:
409 a += ((uint32) k[2] << 16);
410 /* fall through */
411 case 2:
412 a += ((uint32) k[1] << 8);
413 /* fall through */
414 case 1:
415 a += k[0];
416 /* case 0: nothing left to add */
418 #endif /* WORDS_BIGENDIAN */
420 else
422 /* Code path for non-aligned source data */
424 /* handle most of the key */
425 while (len >= 12)
427 #ifdef WORDS_BIGENDIAN
428 a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
429 b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
430 c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
431 #else /* !WORDS_BIGENDIAN */
432 a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
433 b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
434 c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
435 #endif /* WORDS_BIGENDIAN */
436 mix(a, b, c);
437 k += 12;
438 len -= 12;
441 /* handle the last 11 bytes */
442 #ifdef WORDS_BIGENDIAN
443 switch (len) /* all the case statements fall through */
445 case 11:
446 c += ((uint32) k[10] << 8);
447 case 10:
448 c += ((uint32) k[9] << 16);
449 case 9:
450 c += ((uint32) k[8] << 24);
451 /* the lowest byte of c is reserved for the length */
452 case 8:
453 b += k[7];
454 case 7:
455 b += ((uint32) k[6] << 8);
456 case 6:
457 b += ((uint32) k[5] << 16);
458 case 5:
459 b += ((uint32) k[4] << 24);
460 case 4:
461 a += k[3];
462 case 3:
463 a += ((uint32) k[2] << 8);
464 case 2:
465 a += ((uint32) k[1] << 16);
466 case 1:
467 a += ((uint32) k[0] << 24);
468 /* case 0: nothing left to add */
470 #else /* !WORDS_BIGENDIAN */
471 switch (len) /* all the case statements fall through */
473 case 11:
474 c += ((uint32) k[10] << 24);
475 case 10:
476 c += ((uint32) k[9] << 16);
477 case 9:
478 c += ((uint32) k[8] << 8);
479 /* the lowest byte of c is reserved for the length */
480 case 8:
481 b += ((uint32) k[7] << 24);
482 case 7:
483 b += ((uint32) k[6] << 16);
484 case 6:
485 b += ((uint32) k[5] << 8);
486 case 5:
487 b += k[4];
488 case 4:
489 a += ((uint32) k[3] << 24);
490 case 3:
491 a += ((uint32) k[2] << 16);
492 case 2:
493 a += ((uint32) k[1] << 8);
494 case 1:
495 a += k[0];
496 /* case 0: nothing left to add */
498 #endif /* WORDS_BIGENDIAN */
501 final(a, b, c);
503 /* report the result */
504 return UInt32GetDatum(c);
508 * hash_uint32() -- hash a 32-bit value
510 * This has the same result as
511 * hash_any(&k, sizeof(uint32))
512 * but is faster and doesn't force the caller to store k into memory.
514 Datum
515 hash_uint32(uint32 k)
517 register uint32 a,
521 a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
522 a += k;
524 final(a, b, c);
526 /* report the result */
527 return UInt32GetDatum(c);