/*
 * Source: PostgreSQL repository, src/backend/access/hash/hashfunc.c
 * (gitweb page header removed)
 */
1 /*-------------------------------------------------------------------------
3 * hashfunc.c
4 * Support functions for hash access method.
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * $PostgreSQL$
13 * NOTES
14 * These functions are stored in pg_amproc. For each operator class
15 * defined for hash indexes, they compute the hash value of the argument.
17 * Additional hash functions appear in /utils/adt/ files for various
18 * specialized datatypes.
20 * It is expected that every bit of a hash function's 32-bit result is
21 * as random as every other; failure to ensure this is likely to lead
22 * to poor performance of hash joins, for example. In most cases a hash
23 * function should use hash_any() or its variant hash_uint32().
 *-------------------------------------------------------------------------
 */
27 #include "postgres.h"
29 #include "access/hash.h"
32 /* Note: this is used for both "char" and boolean datatypes */
33 Datum
34 hashchar(PG_FUNCTION_ARGS)
36 return hash_uint32((int32) PG_GETARG_CHAR(0));
39 Datum
40 hashint2(PG_FUNCTION_ARGS)
42 return hash_uint32((int32) PG_GETARG_INT16(0));
45 Datum
46 hashint4(PG_FUNCTION_ARGS)
48 return hash_uint32(PG_GETARG_INT32(0));
51 Datum
52 hashint8(PG_FUNCTION_ARGS)
55 * The idea here is to produce a hash value compatible with the values
56 * produced by hashint4 and hashint2 for logically equal inputs; this is
57 * necessary to support cross-type hash joins across these input types.
58 * Since all three types are signed, we can xor the high half of the int8
59 * value if the sign is positive, or the complement of the high half when
60 * the sign is negative.
62 #ifndef INT64_IS_BUSTED
63 int64 val = PG_GETARG_INT64(0);
64 uint32 lohalf = (uint32) val;
65 uint32 hihalf = (uint32) (val >> 32);
67 lohalf ^= (val >= 0) ? hihalf : ~hihalf;
69 return hash_uint32(lohalf);
70 #else
71 /* here if we can't count on "x >> 32" to work sanely */
72 return hash_uint32((int32) PG_GETARG_INT64(0));
73 #endif
76 Datum
77 hashoid(PG_FUNCTION_ARGS)
79 return hash_uint32((uint32) PG_GETARG_OID(0));
82 Datum
83 hashenum(PG_FUNCTION_ARGS)
85 return hash_uint32((uint32) PG_GETARG_OID(0));
88 Datum
89 hashfloat4(PG_FUNCTION_ARGS)
91 float4 key = PG_GETARG_FLOAT4(0);
92 float8 key8;
95 * On IEEE-float machines, minus zero and zero have different bit patterns
96 * but should compare as equal. We must ensure that they have the same
97 * hash value, which is most reliably done this way:
99 if (key == (float4) 0)
100 PG_RETURN_UINT32(0);
103 * To support cross-type hashing of float8 and float4, we want to return
104 * the same hash value hashfloat8 would produce for an equal float8 value.
105 * So, widen the value to float8 and hash that. (We must do this rather
106 * than have hashfloat8 try to narrow its value to float4; that could fail
107 * on overflow.)
109 key8 = key;
111 return hash_any((unsigned char *) &key8, sizeof(key8));
114 Datum
115 hashfloat8(PG_FUNCTION_ARGS)
117 float8 key = PG_GETARG_FLOAT8(0);
120 * On IEEE-float machines, minus zero and zero have different bit patterns
121 * but should compare as equal. We must ensure that they have the same
122 * hash value, which is most reliably done this way:
124 if (key == (float8) 0)
125 PG_RETURN_UINT32(0);
127 return hash_any((unsigned char *) &key, sizeof(key));
130 Datum
131 hashoidvector(PG_FUNCTION_ARGS)
133 oidvector *key = (oidvector *) PG_GETARG_POINTER(0);
135 return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid));
138 Datum
139 hashint2vector(PG_FUNCTION_ARGS)
141 int2vector *key = (int2vector *) PG_GETARG_POINTER(0);
143 return hash_any((unsigned char *) key->values, key->dim1 * sizeof(int2));
146 Datum
147 hashname(PG_FUNCTION_ARGS)
149 char *key = NameStr(*PG_GETARG_NAME(0));
150 int keylen = strlen(key);
152 Assert(keylen < NAMEDATALEN); /* else it's not truncated correctly */
154 return hash_any((unsigned char *) key, keylen);
157 Datum
158 hashtext(PG_FUNCTION_ARGS)
160 text *key = PG_GETARG_TEXT_PP(0);
161 Datum result;
164 * Note: this is currently identical in behavior to hashvarlena, but keep
165 * it as a separate function in case we someday want to do something
166 * different in non-C locales. (See also hashbpchar, if so.)
168 result = hash_any((unsigned char *) VARDATA_ANY(key),
169 VARSIZE_ANY_EXHDR(key));
171 /* Avoid leaking memory for toasted inputs */
172 PG_FREE_IF_COPY(key, 0);
174 return result;
178 * hashvarlena() can be used for any varlena datatype in which there are
179 * no non-significant bits, ie, distinct bitpatterns never compare as equal.
181 Datum
182 hashvarlena(PG_FUNCTION_ARGS)
184 struct varlena *key = PG_GETARG_VARLENA_PP(0);
185 Datum result;
187 result = hash_any((unsigned char *) VARDATA_ANY(key),
188 VARSIZE_ANY_EXHDR(key));
190 /* Avoid leaking memory for toasted inputs */
191 PG_FREE_IF_COPY(key, 0);
193 return result;
197 * This hash function was written by Bob Jenkins
198 * (bob_jenkins@burtleburtle.net), and superficially adapted
199 * for PostgreSQL by Neil Conway. For more information on this
200 * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
201 * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
203 * In the current code, we have adopted Bob's 2006 update of his hash
204 * function to fetch the data a word at a time when it is suitably aligned.
205 * This makes for a useful speedup, at the cost of having to maintain
206 * four code paths (aligned vs unaligned, and little-endian vs big-endian).
207 * It also uses two separate mixing functions mix() and final(), instead
208 * of a slower multi-purpose function.
211 /* Get a bit mask of the bits set in non-uint32 aligned addresses */
212 #define UINT32_ALIGN_MASK (sizeof(uint32) - 1)
214 /* Rotate a uint32 value left by k bits - note multiple evaluation! */
215 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
217 /*----------
218 * mix -- mix 3 32-bit values reversibly.
220 * This is reversible, so any information in (a,b,c) before mix() is
221 * still in (a,b,c) after mix().
223 * If four pairs of (a,b,c) inputs are run through mix(), or through
224 * mix() in reverse, there are at least 32 bits of the output that
225 * are sometimes the same for one pair and different for another pair.
226 * This was tested for:
227 * * pairs that differed by one bit, by two bits, in any combination
228 * of top bits of (a,b,c), or in any combination of bottom bits of
229 * (a,b,c).
230 * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
231 * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
232 * is commonly produced by subtraction) look like a single 1-bit
233 * difference.
234 * * the base values were pseudorandom, all zero but one bit set, or
235 * all zero plus a counter that starts at zero.
237 * This does not achieve avalanche. There are input bits of (a,b,c)
238 * that fail to affect some output bits of (a,b,c), especially of a. The
239 * most thoroughly mixed value is c, but it doesn't really even achieve
240 * avalanche in c.
242 * This allows some parallelism. Read-after-writes are good at doubling
243 * the number of bits affected, so the goal of mixing pulls in the opposite
244 * direction from the goal of parallelism. I did what I could. Rotates
245 * seem to cost as much as shifts on every machine I could lay my hands on,
246 * and rotates are much kinder to the top and bottom bits, so I used rotates.
247 *----------
249 #define mix(a,b,c) \
251 a -= c; a ^= rot(c, 4); c += b; \
252 b -= a; b ^= rot(a, 6); a += c; \
253 c -= b; c ^= rot(b, 8); b += a; \
254 a -= c; a ^= rot(c,16); c += b; \
255 b -= a; b ^= rot(a,19); a += c; \
256 c -= b; c ^= rot(b, 4); b += a; \
259 /*----------
260 * final -- final mixing of 3 32-bit values (a,b,c) into c
262 * Pairs of (a,b,c) values differing in only a few bits will usually
263 * produce values of c that look totally different. This was tested for
264 * * pairs that differed by one bit, by two bits, in any combination
265 * of top bits of (a,b,c), or in any combination of bottom bits of
266 * (a,b,c).
267 * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
268 * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
269 * is commonly produced by subtraction) look like a single 1-bit
270 * difference.
271 * * the base values were pseudorandom, all zero but one bit set, or
272 * all zero plus a counter that starts at zero.
274 * The use of separate functions for mix() and final() allow for a
275 * substantial performance increase since final() does not need to
276 * do well in reverse, but is does need to affect all output bits.
277 * mix(), on the other hand, does not need to affect all output
278 * bits (affecting 32 bits is enough). The original hash function had
279 * a single mixing operation that had to satisfy both sets of requirements
280 * and was slower as a result.
281 *----------
283 #define final(a,b,c) \
285 c ^= b; c -= rot(b,14); \
286 a ^= c; a -= rot(c,11); \
287 b ^= a; b -= rot(a,25); \
288 c ^= b; c -= rot(b,16); \
289 a ^= c; a -= rot(c, 4); \
290 b ^= a; b -= rot(a,14); \
291 c ^= b; c -= rot(b,24); \
295 * hash_any() -- hash a variable-length key into a 32-bit value
296 * k : the key (the unaligned variable-length array of bytes)
297 * len : the length of the key, counting by bytes
299 * Returns a uint32 value. Every bit of the key affects every bit of
300 * the return value. Every 1-bit and 2-bit delta achieves avalanche.
301 * About 6*len+35 instructions. The best hash table sizes are powers
302 * of 2. There is no need to do mod a prime (mod is sooo slow!).
303 * If you need less than 32 bits, use a bitmask.
305 * Note: we could easily change this function to return a 64-bit hash value
306 * by using the final values of both b and c. b is perhaps a little less
307 * well mixed than c, however.
309 Datum
310 hash_any(register const unsigned char *k, register int keylen)
312 register uint32 a,
315 len;
317 /* Set up the internal state */
318 len = keylen;
319 a = b = c = 0x9e3779b9 + len + 3923095;
321 /* If the source pointer is word-aligned, we use word-wide fetches */
322 if (((long) k & UINT32_ALIGN_MASK) == 0)
324 /* Code path for aligned source data */
325 register const uint32 *ka = (const uint32 *) k;
327 /* handle most of the key */
328 while (len >= 12)
330 a += ka[0];
331 b += ka[1];
332 c += ka[2];
333 mix(a, b, c);
334 ka += 3;
335 len -= 12;
338 /* handle the last 11 bytes */
339 k = (const unsigned char *) ka;
340 #ifdef WORDS_BIGENDIAN
341 switch (len)
343 case 11:
344 c += ((uint32) k[10] << 8);
345 /* fall through */
346 case 10:
347 c += ((uint32) k[9] << 16);
348 /* fall through */
349 case 9:
350 c += ((uint32) k[8] << 24);
351 /* the lowest byte of c is reserved for the length */
352 /* fall through */
353 case 8:
354 b += ka[1];
355 a += ka[0];
356 break;
357 case 7:
358 b += ((uint32) k[6] << 8);
359 /* fall through */
360 case 6:
361 b += ((uint32) k[5] << 16);
362 /* fall through */
363 case 5:
364 b += ((uint32) k[4] << 24);
365 /* fall through */
366 case 4:
367 a += ka[0];
368 break;
369 case 3:
370 a += ((uint32) k[2] << 8);
371 /* fall through */
372 case 2:
373 a += ((uint32) k[1] << 16);
374 /* fall through */
375 case 1:
376 a += ((uint32) k[0] << 24);
377 /* case 0: nothing left to add */
379 #else /* !WORDS_BIGENDIAN */
380 switch (len)
382 case 11:
383 c += ((uint32) k[10] << 24);
384 /* fall through */
385 case 10:
386 c += ((uint32) k[9] << 16);
387 /* fall through */
388 case 9:
389 c += ((uint32) k[8] << 8);
390 /* the lowest byte of c is reserved for the length */
391 /* fall through */
392 case 8:
393 b += ka[1];
394 a += ka[0];
395 break;
396 case 7:
397 b += ((uint32) k[6] << 16);
398 /* fall through */
399 case 6:
400 b += ((uint32) k[5] << 8);
401 /* fall through */
402 case 5:
403 b += k[4];
404 /* fall through */
405 case 4:
406 a += ka[0];
407 break;
408 case 3:
409 a += ((uint32) k[2] << 16);
410 /* fall through */
411 case 2:
412 a += ((uint32) k[1] << 8);
413 /* fall through */
414 case 1:
415 a += k[0];
416 /* case 0: nothing left to add */
418 #endif /* WORDS_BIGENDIAN */
420 else
422 /* Code path for non-aligned source data */
424 /* handle most of the key */
425 while (len >= 12)
427 #ifdef WORDS_BIGENDIAN
428 a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
429 b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
430 c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
431 #else /* !WORDS_BIGENDIAN */
432 a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
433 b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
434 c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
435 #endif /* WORDS_BIGENDIAN */
436 mix(a, b, c);
437 k += 12;
438 len -= 12;
441 /* handle the last 11 bytes */
442 #ifdef WORDS_BIGENDIAN
443 switch (len) /* all the case statements fall through */
445 case 11:
446 c += ((uint32) k[10] << 8);
447 case 10:
448 c += ((uint32) k[9] << 16);
449 case 9:
450 c += ((uint32) k[8] << 24);
451 /* the lowest byte of c is reserved for the length */
452 case 8:
453 b += k[7];
454 case 7:
455 b += ((uint32) k[6] << 8);
456 case 6:
457 b += ((uint32) k[5] << 16);
458 case 5:
459 b += ((uint32) k[4] << 24);
460 case 4:
461 a += k[3];
462 case 3:
463 a += ((uint32) k[2] << 8);
464 case 2:
465 a += ((uint32) k[1] << 16);
466 case 1:
467 a += ((uint32) k[0] << 24);
468 /* case 0: nothing left to add */
470 #else /* !WORDS_BIGENDIAN */
471 switch (len) /* all the case statements fall through */
473 case 11:
474 c += ((uint32) k[10] << 24);
475 case 10:
476 c += ((uint32) k[9] << 16);
477 case 9:
478 c += ((uint32) k[8] << 8);
479 /* the lowest byte of c is reserved for the length */
480 case 8:
481 b += ((uint32) k[7] << 24);
482 case 7:
483 b += ((uint32) k[6] << 16);
484 case 6:
485 b += ((uint32) k[5] << 8);
486 case 5:
487 b += k[4];
488 case 4:
489 a += ((uint32) k[3] << 24);
490 case 3:
491 a += ((uint32) k[2] << 16);
492 case 2:
493 a += ((uint32) k[1] << 8);
494 case 1:
495 a += k[0];
496 /* case 0: nothing left to add */
498 #endif /* WORDS_BIGENDIAN */
501 final(a, b, c);
503 /* report the result */
504 return UInt32GetDatum(c);
508 * hash_uint32() -- hash a 32-bit value
510 * This has the same result as
511 * hash_any(&k, sizeof(uint32))
512 * but is faster and doesn't force the caller to store k into memory.
514 Datum
515 hash_uint32(uint32 k)
517 register uint32 a,
521 a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
522 a += k;
524 final(a, b, c);
526 /* report the result */
527 return UInt32GetDatum(c);