src/backend/executor/nodeHash.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nodeHash.c
   4  *        Routines to hash relations for hashjoin
   5  *
   6  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL$
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 /*
  16  * INTERFACE ROUTINES
  17  *              MultiExecHash   - generate an in-memory hash table of the relation
  18  *              ExecInitHash    - initialize node and subnodes
  19  *              ExecEndHash             - shutdown node and subnodes
  20  */
  21
  22 #include "postgres.h"
  23
  24 #include <math.h>
  25 #include <limits.h>
  26
  27 #include "catalog/pg_statistic.h"
  28 #include "commands/tablespace.h"
  29 #include "executor/execdebug.h"
  30 #include "executor/hashjoin.h"
  31 #include "executor/instrument.h"
  32 #include "executor/nodeHash.h"
  33 #include "executor/nodeHashjoin.h"
  34 #include "miscadmin.h"
  35 #include "parser/parse_expr.h"
  36 #include "utils/dynahash.h"
  37 #include "utils/memutils.h"
  38 #include "utils/lsyscache.h"
  39 #include "utils/syscache.h"
  40
  41
  42 static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
  43 static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
  44                                           int mcvsToUse);
  45 static void ExecHashSkewTableInsert(HashJoinTable hashtable,
  46                                                 TupleTableSlot *slot,
  47                                                 uint32 hashvalue,
  48                                                 int bucketNumber);
  49 static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);
  50
  51
  52 /* ----------------------------------------------------------------
  53  *              ExecHash
  54  *
  55  *              stub for pro forma compliance
  56  * ----------------------------------------------------------------
  57  */
  58 TupleTableSlot *
  59 ExecHash(HashState *node)
  60 {
  61         elog(ERROR, "Hash node does not support ExecProcNode call convention");
  62         return NULL;
  63 }
  64
  65 /* ----------------------------------------------------------------
  66  *              MultiExecHash
  67  *
  68  *              build hash table for hashjoin, doing partitioning if more
  69  *              than one batch is required.
  70  * ----------------------------------------------------------------
  71  */
  72 Node *
  73 MultiExecHash(HashState *node)
  74 {
  75         PlanState  *outerNode;
  76         List       *hashkeys;
  77         HashJoinTable hashtable;
  78         TupleTableSlot *slot;
  79         ExprContext *econtext;
  80         uint32          hashvalue;
  81
  82         /* must provide our own instrumentation support */
  83         if (node->ps.instrument)
  84                 InstrStartNode(node->ps.instrument);
  85
  86         /*
  87          * get state info from node
  88          */
  89         outerNode = outerPlanState(node);
  90         hashtable = node->hashtable;
  91
  92         /*
  93          * set expression context
  94          */
  95         hashkeys = node->hashkeys;
  96         econtext = node->ps.ps_ExprContext;
  97
  98         /*
  99          * get all inner tuples and insert into the hash table (or temp files)
 100          */
 101         for (;;)
 102         {
 103                 slot = ExecProcNode(outerNode);
 104                 if (TupIsNull(slot))
 105                         break;
 106                 /* We have to compute the hash value */
 107                 econtext->ecxt_innertuple = slot;
 108                 if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, false,
 109                                                                  &hashvalue))
 110                 {
 111                         int                     bucketNumber;
 112
 113                         bucketNumber = ExecHashGetSkewBucket(hashtable, hashvalue);
 114                         if (bucketNumber != INVALID_SKEW_BUCKET_NO)
 115                         {
 116                                 /* It's a skew tuple, so put it into that hash table */
 117                                 ExecHashSkewTableInsert(hashtable, slot, hashvalue,
 118                                                                                 bucketNumber);
 119                         }
 120                         else
 121                         {
 122                                 /* Not subject to skew optimization, so insert normally */
 123                                 ExecHashTableInsert(hashtable, slot, hashvalue);
 124                         }
 125                         hashtable->totalTuples += 1;
 126                 }
 127         }
 128
 129         /* must provide our own instrumentation support */
 130         if (node->ps.instrument)
 131                 InstrStopNode(node->ps.instrument, hashtable->totalTuples);
 132
 133         /*
 134          * We do not return the hash table directly because it's not a subtype of
 135          * Node, and so would violate the MultiExecProcNode API.  Instead, our
 136          * parent Hashjoin node is expected to know how to fish it out of our node
 137          * state.  Ugly but not really worth cleaning up, since Hashjoin knows
 138          * quite a bit more about Hash besides that.
 139          */
 140         return NULL;
 141 }
 142
 143 /* ----------------------------------------------------------------
 144  *              ExecInitHash
 145  *
 146  *              Init routine for Hash node
 147  * ----------------------------------------------------------------
 148  */
 149 HashState *
 150 ExecInitHash(Hash *node, EState *estate, int eflags)
 151 {
 152         HashState  *hashstate;
 153
 154         /* check for unsupported flags */
 155         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
 156
 157         /*
 158          * create state structure
 159          */
 160         hashstate = makeNode(HashState);
 161         hashstate->ps.plan = (Plan *) node;
 162         hashstate->ps.state = estate;
 163         hashstate->hashtable = NULL;
 164         hashstate->hashkeys = NIL;      /* will be set by parent HashJoin */
 165
 166         /*
 167          * Miscellaneous initialization
 168          *
 169          * create expression context for node
 170          */
 171         ExecAssignExprContext(estate, &hashstate->ps);
 172
 173 #define HASH_NSLOTS 1
 174
 175         /*
 176          * initialize our result slot
 177          */
 178         ExecInitResultTupleSlot(estate, &hashstate->ps);
 179
 180         /*
 181          * initialize child expressions
 182          */
 183         hashstate->ps.targetlist = (List *)
 184                 ExecInitExpr((Expr *) node->plan.targetlist,
 185                                          (PlanState *) hashstate);
 186         hashstate->ps.qual = (List *)
 187                 ExecInitExpr((Expr *) node->plan.qual,
 188                                          (PlanState *) hashstate);
 189
 190         /*
 191          * initialize child nodes
 192          */
 193         outerPlanState(hashstate) = ExecInitNode(outerPlan(node), estate, eflags);
 194
 195         /*
 196          * initialize tuple type. no need to initialize projection info because
 197          * this node doesn't do projections
 198          */
 199         ExecAssignResultTypeFromTL(&hashstate->ps);
 200         hashstate->ps.ps_ProjInfo = NULL;
 201
 202         return hashstate;
 203 }
 204
 205 int
 206 ExecCountSlotsHash(Hash *node)
 207 {
 208         return ExecCountSlotsNode(outerPlan(node)) +
 209                 ExecCountSlotsNode(innerPlan(node)) +
 210                 HASH_NSLOTS;
 211 }
 212
 213 /* ---------------------------------------------------------------
 214  *              ExecEndHash
 215  *
 216  *              clean up routine for Hash node
 217  * ----------------------------------------------------------------
 218  */
 219 void
 220 ExecEndHash(HashState *node)
 221 {
 222         PlanState  *outerPlan;
 223
 224         /*
 225          * free exprcontext
 226          */
 227         ExecFreeExprContext(&node->ps);
 228
 229         /*
 230          * shut down the subplan
 231          */
 232         outerPlan = outerPlanState(node);
 233         ExecEndNode(outerPlan);
 234 }
 235
 236
 237 /* ----------------------------------------------------------------
 238  *              ExecHashTableCreate
 239  *
 240  *              create an empty hashtable data structure for hashjoin.
 241  * ----------------------------------------------------------------
 242  */
 243 HashJoinTable
 244 ExecHashTableCreate(Hash *node, List *hashOperators)
 245 {
 246         HashJoinTable hashtable;
 247         Plan       *outerNode;
 248         int                     nbuckets;
 249         int                     nbatch;
 250         int                     num_skew_mcvs;
 251         int                     log2_nbuckets;
 252         int                     nkeys;
 253         int                     i;
 254         ListCell   *ho;
 255         MemoryContext oldcxt;
 256
 257         /*
 258          * Get information about the size of the relation to be hashed (it's the
 259          * "outer" subtree of this node, but the inner relation of the hashjoin).
 260          * Compute the appropriate size of the hash table.
 261          */
 262         outerNode = outerPlan(node);
 263
 264         ExecChooseHashTableSize(outerNode->plan_rows, outerNode->plan_width,
 265                                                         OidIsValid(node->skewTable),
 266                                                         &nbuckets, &nbatch, &num_skew_mcvs);
 267
 268 #ifdef HJDEBUG
 269         printf("nbatch = %d, nbuckets = %d\n", nbatch, nbuckets);
 270 #endif
 271
 272         /* nbuckets must be a power of 2 */
 273         log2_nbuckets = my_log2(nbuckets);
 274         Assert(nbuckets == (1 << log2_nbuckets));
 275
 276         /*
 277          * Initialize the hash table control block.
 278          *
 279          * The hashtable control block is just palloc'd from the executor's
 280          * per-query memory context.
 281          */
 282         hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
 283         hashtable->nbuckets = nbuckets;
 284         hashtable->log2_nbuckets = log2_nbuckets;
 285         hashtable->buckets = NULL;
 286         hashtable->skewEnabled = false;
 287         hashtable->skewBucket = NULL;
 288         hashtable->skewBucketLen = 0;
 289         hashtable->nSkewBuckets = 0;
 290         hashtable->skewBucketNums = NULL;
 291         hashtable->nbatch = nbatch;
 292         hashtable->curbatch = 0;
 293         hashtable->nbatch_original = nbatch;
 294         hashtable->nbatch_outstart = nbatch;
 295         hashtable->growEnabled = true;
 296         hashtable->totalTuples = 0;
 297         hashtable->innerBatchFile = NULL;
 298         hashtable->outerBatchFile = NULL;
 299         hashtable->spaceUsed = 0;
 300         hashtable->spaceAllowed = work_mem * 1024L;
 301         hashtable->spaceUsedSkew = 0;
 302         hashtable->spaceAllowedSkew =
 303                 hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
 304
 305         /*
 306          * Get info about the hash functions to be used for each hash key. Also
 307          * remember whether the join operators are strict.
 308          */
 309         nkeys = list_length(hashOperators);
 310         hashtable->outer_hashfunctions =
 311                 (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
 312         hashtable->inner_hashfunctions =
 313                 (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
 314         hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool));
 315         i = 0;
 316         foreach(ho, hashOperators)
 317         {
 318                 Oid                     hashop = lfirst_oid(ho);
 319                 Oid                     left_hashfn;
 320                 Oid                     right_hashfn;
 321
 322                 if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))
 323                         elog(ERROR, "could not find hash function for hash operator %u",
 324                                  hashop);
 325                 fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]);
 326                 fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]);
 327                 hashtable->hashStrict[i] = op_strict(hashop);
 328                 i++;
 329         }
 330
 331         /*
 332          * Create temporary memory contexts in which to keep the hashtable working
 333          * storage.  See notes in executor/hashjoin.h.
 334          */
 335         hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
 336                                                                                            "HashTableContext",
 337                                                                                            ALLOCSET_DEFAULT_MINSIZE,
 338                                                                                            ALLOCSET_DEFAULT_INITSIZE,
 339                                                                                            ALLOCSET_DEFAULT_MAXSIZE);
 340
 341         hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
 342                                                                                                 "HashBatchContext",
 343                                                                                                 ALLOCSET_DEFAULT_MINSIZE,
 344                                                                                                 ALLOCSET_DEFAULT_INITSIZE,
 345                                                                                                 ALLOCSET_DEFAULT_MAXSIZE);
 346
 347         /* Allocate data that will live for the life of the hashjoin */
 348
 349         oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
 350
 351         if (nbatch > 1)
 352         {
 353                 /*
 354                  * allocate and initialize the file arrays in hashCxt
 355                  */
 356                 hashtable->innerBatchFile = (BufFile **)
 357                         palloc0(nbatch * sizeof(BufFile *));
 358                 hashtable->outerBatchFile = (BufFile **)
 359                         palloc0(nbatch * sizeof(BufFile *));
 360                 /* The files will not be opened until needed... */
 361                 /* ... but make sure we have temp tablespaces established for them */
 362                 PrepareTempTablespaces();
 363         }
 364
 365         /*
 366          * Prepare context for the first-scan space allocations; allocate the
 367          * hashbucket array therein, and set each bucket "empty".
 368          */
 369         MemoryContextSwitchTo(hashtable->batchCxt);
 370
 371         hashtable->buckets = (HashJoinTuple *)
 372                 palloc0(nbuckets * sizeof(HashJoinTuple));
 373
 374         /*
 375          * Set up for skew optimization, if possible and there's a need for more
 376          * than one batch.      (In a one-batch join, there's no point in it.)
 377          */
 378         if (nbatch > 1)
 379                 ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs);
 380
 381         MemoryContextSwitchTo(oldcxt);
 382
 383         return hashtable;
 384 }
 385
 386
 387 /*
 388  * Compute appropriate size for hashtable given the estimated size of the
 389  * relation to be hashed (number of rows and average row width).
 390  *
 391  * This is exported so that the planner's costsize.c can use it.
 392  */
 393
 394 /* Target bucket loading (tuples per bucket) */
 395 #define NTUP_PER_BUCKET                 10
 396
 397 void
 398 ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 399                                                 int *numbuckets,
 400                                                 int *numbatches,
 401                                                 int *num_skew_mcvs)
 402 {
 403         int                     tupsize;
 404         double          inner_rel_bytes;
 405         long            hash_table_bytes;
 406         long            skew_table_bytes;
 407         int                     nbatch;
 408         int                     nbuckets;
 409         int                     i;
 410
 411         /* Force a plausible relation size if no info */
 412         if (ntuples <= 0.0)
 413                 ntuples = 1000.0;
 414
 415         /*
 416          * Estimate tupsize based on footprint of tuple in hashtable... note this
 417          * does not allow for any palloc overhead.      The manipulations of spaceUsed
 418          * don't count palloc overhead either.
 419          */
 420         tupsize = HJTUPLE_OVERHEAD +
 421                 MAXALIGN(sizeof(MinimalTupleData)) +
 422                 MAXALIGN(tupwidth);
 423         inner_rel_bytes = ntuples * tupsize;
 424
 425         /*
 426          * Target in-memory hashtable size is work_mem kilobytes.
 427          */
 428         hash_table_bytes = work_mem * 1024L;
 429
 430         /*
 431          * If skew optimization is possible, estimate the number of skew buckets
 432          * that will fit in the memory allowed, and decrement the assumed space
 433          * available for the main hash table accordingly.
 434          *
 435          * We make the optimistic assumption that each skew bucket will contain
 436          * one inner-relation tuple.  If that turns out to be low, we will recover
 437          * at runtime by reducing the number of skew buckets.
 438          *
 439          * hashtable->skewBucket will have up to 8 times as many HashSkewBucket
 440          * pointers as the number of MCVs we allow, since ExecHashBuildSkewHash
 441          * will round up to the next power of 2 and then multiply by 4 to reduce
 442          * collisions.
 443          */
 444         if (useskew)
 445         {
 446                 skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;
 447
 448                 *num_skew_mcvs = skew_table_bytes / (
 449                 /* size of a hash tuple */
 450                                                                                          tupsize +
 451                 /* worst-case size of skewBucket[] per MCV */
 452                                                                                          (8 * sizeof(HashSkewBucket *)) +
 453                 /* size of skewBucketNums[] entry */
 454                                                                                          sizeof(int) +
 455                 /* size of skew bucket struct itself */
 456                                                                                          SKEW_BUCKET_OVERHEAD
 457                         );
 458
 459                 if (*num_skew_mcvs > 0)
 460                         hash_table_bytes -= skew_table_bytes;
 461         }
 462         else
 463                 *num_skew_mcvs = 0;
 464
 465         /*
 466          * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
 467          * memory is filled.  Set nbatch to the smallest power of 2 that appears
 468          * sufficient.
 469          */
 470         if (inner_rel_bytes > hash_table_bytes)
 471         {
 472                 /* We'll need multiple batches */
 473                 long            lbuckets;
 474                 double          dbatch;
 475                 int                     minbatch;
 476
 477                 lbuckets = (hash_table_bytes / tupsize) / NTUP_PER_BUCKET;
 478                 lbuckets = Min(lbuckets, INT_MAX / 2);
 479                 nbuckets = (int) lbuckets;
 480
 481                 dbatch = ceil(inner_rel_bytes / hash_table_bytes);
 482                 dbatch = Min(dbatch, INT_MAX / 2);
 483                 minbatch = (int) dbatch;
 484                 nbatch = 2;
 485                 while (nbatch < minbatch)
 486                         nbatch <<= 1;
 487         }
 488         else
 489         {
 490                 /* We expect the hashtable to fit in memory */
 491                 double          dbuckets;
 492
 493                 dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
 494                 dbuckets = Min(dbuckets, INT_MAX / 2);
 495                 nbuckets = (int) dbuckets;
 496
 497                 nbatch = 1;
 498         }
 499
 500         /*
 501          * Both nbuckets and nbatch must be powers of 2 to make
 502          * ExecHashGetBucketAndBatch fast.      We already fixed nbatch; now inflate
 503          * nbuckets to the next larger power of 2.      We also force nbuckets to not
 504          * be real small, by starting the search at 2^10.
 505          */
 506         i = 10;
 507         while ((1 << i) < nbuckets)
 508                 i++;
 509         nbuckets = (1 << i);
 510
 511         *numbuckets = nbuckets;
 512         *numbatches = nbatch;
 513 }
 514
 515
 516 /* ----------------------------------------------------------------
 517  *              ExecHashTableDestroy
 518  *
 519  *              destroy a hash table
 520  * ----------------------------------------------------------------
 521  */
 522 void
 523 ExecHashTableDestroy(HashJoinTable hashtable)
 524 {
 525         int                     i;
 526
 527         /*
 528          * Make sure all the temp files are closed.  We skip batch 0, since it
 529          * can't have any temp files (and the arrays might not even exist if
 530          * nbatch is only 1).
 531          */
 532         for (i = 1; i < hashtable->nbatch; i++)
 533         {
 534                 if (hashtable->innerBatchFile[i])
 535                         BufFileClose(hashtable->innerBatchFile[i]);
 536                 if (hashtable->outerBatchFile[i])
 537                         BufFileClose(hashtable->outerBatchFile[i]);
 538         }
 539
 540         /* Release working memory (batchCxt is a child, so it goes away too) */
 541         MemoryContextDelete(hashtable->hashCxt);
 542
 543         /* And drop the control block */
 544         pfree(hashtable);
 545 }
 546
 547 /*
 548  * ExecHashIncreaseNumBatches
 549  *              increase the original number of batches in order to reduce
 550  *              current memory consumption
 551  */
 552 static void
 553 ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 554 {
 555         int                     oldnbatch = hashtable->nbatch;
 556         int                     curbatch = hashtable->curbatch;
 557         int                     nbatch;
 558         int                     i;
 559         MemoryContext oldcxt;
 560         long            ninmemory;
 561         long            nfreed;
 562
 563         /* do nothing if we've decided to shut off growth */
 564         if (!hashtable->growEnabled)
 565                 return;
 566
 567         /* safety check to avoid overflow */
 568         if (oldnbatch > INT_MAX / 2)
 569                 return;
 570
 571         nbatch = oldnbatch * 2;
 572         Assert(nbatch > 1);
 573
 574 #ifdef HJDEBUG
 575         printf("Increasing nbatch to %d because space = %lu\n",
 576                    nbatch, (unsigned long) hashtable->spaceUsed);
 577 #endif
 578
 579         oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
 580
 581         if (hashtable->innerBatchFile == NULL)
 582         {
 583                 /* we had no file arrays before */
 584                 hashtable->innerBatchFile = (BufFile **)
 585                         palloc0(nbatch * sizeof(BufFile *));
 586                 hashtable->outerBatchFile = (BufFile **)
 587                         palloc0(nbatch * sizeof(BufFile *));
 588                 /* time to establish the temp tablespaces, too */
 589                 PrepareTempTablespaces();
 590         }
 591         else
 592         {
 593                 /* enlarge arrays and zero out added entries */
 594                 hashtable->innerBatchFile = (BufFile **)
 595                         repalloc(hashtable->innerBatchFile, nbatch * sizeof(BufFile *));
 596                 hashtable->outerBatchFile = (BufFile **)
 597                         repalloc(hashtable->outerBatchFile, nbatch * sizeof(BufFile *));
 598                 MemSet(hashtable->innerBatchFile + oldnbatch, 0,
 599                            (nbatch - oldnbatch) * sizeof(BufFile *));
 600                 MemSet(hashtable->outerBatchFile + oldnbatch, 0,
 601                            (nbatch - oldnbatch) * sizeof(BufFile *));
 602         }
 603
 604         MemoryContextSwitchTo(oldcxt);
 605
 606         hashtable->nbatch = nbatch;
 607
 608         /*
 609          * Scan through the existing hash table entries and dump out any that are
 610          * no longer of the current batch.
 611          */
 612         ninmemory = nfreed = 0;
 613
 614         for (i = 0; i < hashtable->nbuckets; i++)
 615         {
 616                 HashJoinTuple prevtuple;
 617                 HashJoinTuple tuple;
 618
 619                 prevtuple = NULL;
 620                 tuple = hashtable->buckets[i];
 621
 622                 while (tuple != NULL)
 623                 {
 624                         /* save link in case we delete */
 625                         HashJoinTuple nexttuple = tuple->next;
 626                         int                     bucketno;
 627                         int                     batchno;
 628
 629                         ninmemory++;
 630                         ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue,
 631                                                                           &bucketno, &batchno);
 632                         Assert(bucketno == i);
 633                         if (batchno == curbatch)
 634                         {
 635                                 /* keep tuple */
 636                                 prevtuple = tuple;
 637                         }
 638                         else
 639                         {
 640                                 /* dump it out */
 641                                 Assert(batchno > curbatch);
 642                                 ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(tuple),
 643                                                                           tuple->hashvalue,
 644                                                                           &hashtable->innerBatchFile[batchno]);
 645                                 /* and remove from hash table */
 646                                 if (prevtuple)
 647                                         prevtuple->next = nexttuple;
 648                                 else
 649                                         hashtable->buckets[i] = nexttuple;
 650                                 /* prevtuple doesn't change */
 651                                 hashtable->spaceUsed -=
 652                                         HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(tuple)->t_len;
 653                                 pfree(tuple);
 654                                 nfreed++;
 655                         }
 656
 657                         tuple = nexttuple;
 658                 }
 659         }
 660
 661 #ifdef HJDEBUG
 662         printf("Freed %ld of %ld tuples, space now %lu\n",
 663                    nfreed, ninmemory, (unsigned long) hashtable->spaceUsed);
 664 #endif
 665
 666         /*
 667          * If we dumped out either all or none of the tuples in the table, disable
 668          * further expansion of nbatch.  This situation implies that we have
 669          * enough tuples of identical hashvalues to overflow spaceAllowed.
 670          * Increasing nbatch will not fix it since there's no way to subdivide the
 671          * group any more finely. We have to just gut it out and hope the server
 672          * has enough RAM.
 673          */
 674         if (nfreed == 0 || nfreed == ninmemory)
 675         {
 676                 hashtable->growEnabled = false;
 677 #ifdef HJDEBUG
 678                 printf("Disabling further increase of nbatch\n");
 679 #endif
 680         }
 681 }
 682
 683 /*
 684  * ExecHashTableInsert
 685  *              insert a tuple into the hash table depending on the hash value
 686  *              it may just go to a temp file for later batches
 687  *
 688  * Note: the passed TupleTableSlot may contain a regular, minimal, or virtual
 689  * tuple; the minimal case in particular is certain to happen while reloading
 690  * tuples from batch files.  We could save some cycles in the regular-tuple
 691  * case by not forcing the slot contents into minimal form; not clear if it's
 692  * worth the messiness required.
 693  */
 694 void
 695 ExecHashTableInsert(HashJoinTable hashtable,
 696                                         TupleTableSlot *slot,
 697                                         uint32 hashvalue)
 698 {
 699         MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot);
 700         int                     bucketno;
 701         int                     batchno;
 702
 703         ExecHashGetBucketAndBatch(hashtable, hashvalue,
 704                                                           &bucketno, &batchno);
 705
 706         /*
 707          * decide whether to put the tuple in the hash table or a temp file
 708          */
 709         if (batchno == hashtable->curbatch)
 710         {
 711                 /*
 712                  * put the tuple in hash table
 713                  */
 714                 HashJoinTuple hashTuple;
 715                 int                     hashTupleSize;
 716
 717                 hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
 718                 hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
 719                                                                                                            hashTupleSize);
 720                 hashTuple->hashvalue = hashvalue;
 721                 memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
 722                 hashTuple->next = hashtable->buckets[bucketno];
 723                 hashtable->buckets[bucketno] = hashTuple;
 724                 hashtable->spaceUsed += hashTupleSize;
 725                 if (hashtable->spaceUsed > hashtable->spaceAllowed)
 726                         ExecHashIncreaseNumBatches(hashtable);
 727         }
 728         else
 729         {
 730                 /*
 731                  * put the tuple into a temp file for later batches
 732                  */
 733                 Assert(batchno > hashtable->curbatch);
 734                 ExecHashJoinSaveTuple(tuple,
 735                                                           hashvalue,
 736                                                           &hashtable->innerBatchFile[batchno]);
 737         }
 738 }
 739
 740 /*
 741  * ExecHashGetHashValue
 742  *              Compute the hash value for a tuple
 743  *
 744  * The tuple to be tested must be in either econtext->ecxt_outertuple or
 745  * econtext->ecxt_innertuple.  Vars in the hashkeys expressions reference
 746  * either OUTER or INNER.
 747  *
 748  * A TRUE result means the tuple's hash value has been successfully computed
 749  * and stored at *hashvalue.  A FALSE result means the tuple cannot match
 750  * because it contains a null attribute, and hence it should be discarded
 751  * immediately.  (If keep_nulls is true then FALSE is never returned.)
 752  */
 753 bool
 754 ExecHashGetHashValue(HashJoinTable hashtable,
 755                                          ExprContext *econtext,
 756                                          List *hashkeys,
 757                                          bool outer_tuple,
 758                                          bool keep_nulls,
 759                                          uint32 *hashvalue)
 760 {
 761         uint32          hashkey = 0;
 762         FmgrInfo   *hashfunctions;
 763         ListCell   *hk;
 764         int                     i = 0;
 765         MemoryContext oldContext;
 766
 767         /*
 768          * We reset the eval context each time to reclaim any memory leaked in the
 769          * hashkey expressions.
 770          */
 771         ResetExprContext(econtext);
 772
 773         oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
 774
 775         if (outer_tuple)
 776                 hashfunctions = hashtable->outer_hashfunctions;
 777         else
 778                 hashfunctions = hashtable->inner_hashfunctions;
 779
 780         foreach(hk, hashkeys)
 781         {
 782                 ExprState  *keyexpr = (ExprState *) lfirst(hk);
 783                 Datum           keyval;
 784                 bool            isNull;
 785
 786                 /* rotate hashkey left 1 bit at each step */
 787                 hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
 788
 789                 /*
 790                  * Get the join attribute value of the tuple
 791                  */
 792                 keyval = ExecEvalExpr(keyexpr, econtext, &isNull, NULL);
 793
 794                 /*
 795                  * If the attribute is NULL, and the join operator is strict, then
 796                  * this tuple cannot pass the join qual so we can reject it
 797                  * immediately (unless we're scanning the outside of an outer join, in
 798                  * which case we must not reject it).  Otherwise we act like the
 799                  * hashcode of NULL is zero (this will support operators that act like
 800                  * IS NOT DISTINCT, though not any more-random behavior).  We treat
 801                  * the hash support function as strict even if the operator is not.
 802                  *
 803                  * Note: currently, all hashjoinable operators must be strict since
 804                  * the hash index AM assumes that.      However, it takes so little extra
 805                  * code here to allow non-strict that we may as well do it.
 806                  */
 807                 if (isNull)
 808                 {
 809                         if (hashtable->hashStrict[i] && !keep_nulls)
 810                         {
 811                                 MemoryContextSwitchTo(oldContext);
 812                                 return false;   /* cannot match */
 813                         }
 814                         /* else, leave hashkey unmodified, equivalent to hashcode 0 */
 815                 }
 816                 else
 817                 {
 818                         /* Compute the hash function */
 819                         uint32          hkey;
 820
 821                         hkey = DatumGetUInt32(FunctionCall1(&hashfunctions[i], keyval));
 822                         hashkey ^= hkey;
 823                 }
 824
 825                 i++;
 826         }
 827
 828         MemoryContextSwitchTo(oldContext);
 829
 830         *hashvalue = hashkey;
 831         return true;
 832 }
 833
 834 /*
 835  * ExecHashGetBucketAndBatch
 836  *              Determine the bucket number and batch number for a hash value
 837  *
 838  * Note: on-the-fly increases of nbatch must not change the bucket number
 839  * for a given hash code (since we don't move tuples to different hash
 840  * chains), and must only cause the batch number to remain the same or
 841  * increase.  Our algorithm is
 842  *              bucketno = hashvalue MOD nbuckets
 843  *              batchno = (hashvalue DIV nbuckets) MOD nbatch
 844  * where nbuckets and nbatch are both expected to be powers of 2, so we can
 845  * do the computations by shifting and masking.  (This assumes that all hash
 846  * functions are good about randomizing all their output bits, else we are
 847  * likely to have very skewed bucket or batch occupancy.)
 848  *
 849  * nbuckets doesn't change over the course of the join.
 850  *
 851  * nbatch is always a power of 2; we increase it only by doubling it.  This
 852  * effectively adds one more bit to the top of the batchno.
 853  */
 854 void
 855 ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 856                                                   uint32 hashvalue,
 857                                                   int *bucketno,
 858                                                   int *batchno)
 859 {
 860         uint32          nbuckets = (uint32) hashtable->nbuckets;
 861         uint32          nbatch = (uint32) hashtable->nbatch;
 862
 863         if (nbatch > 1)
 864         {
 865                 /* we can do MOD by masking, DIV by shifting */
 866                 *bucketno = hashvalue & (nbuckets - 1);
 867                 *batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1);
 868         }
 869         else
 870         {
 871                 *bucketno = hashvalue & (nbuckets - 1);
 872                 *batchno = 0;
 873         }
 874 }
 875
 876 /*
 877  * ExecScanHashBucket
 878  *              scan a hash bucket for matches to the current outer tuple
 879  *
 880  * The current outer tuple must be stored in econtext->ecxt_outertuple.
 881  */
 882 HashJoinTuple
 883 ExecScanHashBucket(HashJoinState *hjstate,
 884                                    ExprContext *econtext)
 885 {
 886         List       *hjclauses = hjstate->hashclauses;
 887         HashJoinTable hashtable = hjstate->hj_HashTable;
 888         HashJoinTuple hashTuple = hjstate->hj_CurTuple;
 889         uint32          hashvalue = hjstate->hj_CurHashValue;
 890
 891         /*
 892          * hj_CurTuple is the address of the tuple last returned from the current
 893          * bucket, or NULL if it's time to start scanning a new bucket.
 894          *
 895          * If the tuple hashed to a skew bucket then scan the skew bucket
 896          * otherwise scan the standard hashtable bucket.
 897          */
 898         if (hashTuple != NULL)
 899                 hashTuple = hashTuple->next;
 900         else if (hjstate->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO)
 901                 hashTuple = hashtable->skewBucket[hjstate->hj_CurSkewBucketNo]->tuples;
 902         else
 903                 hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
 904
 905         while (hashTuple != NULL)
 906         {
 907                 if (hashTuple->hashvalue == hashvalue)
 908                 {
 909                         TupleTableSlot *inntuple;
 910
 911                         /* insert hashtable's tuple into exec slot so ExecQual sees it */
 912                         inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
 913                                                                                          hjstate->hj_HashTupleSlot,
 914                                                                                          false);        /* do not pfree */
 915                         econtext->ecxt_innertuple = inntuple;
 916
 917                         /* reset temp memory each time to avoid leaks from qual expr */
 918                         ResetExprContext(econtext);
 919
 920                         if (ExecQual(hjclauses, econtext, false))
 921                         {
 922                                 hjstate->hj_CurTuple = hashTuple;
 923                                 return hashTuple;
 924                         }
 925                 }
 926
 927                 hashTuple = hashTuple->next;
 928         }
 929
 930         /*
 931          * no match
 932          */
 933         return NULL;
 934 }
 935
 936 /*
 937  * ExecHashTableReset
 938  *
 939  *              reset hash table header for new batch
 940  */
 941 void
 942 ExecHashTableReset(HashJoinTable hashtable)
 943 {
 944         MemoryContext oldcxt;
 945         int                     nbuckets = hashtable->nbuckets;
 946
 947         /*
 948          * Release all the hash buckets and tuples acquired in the prior pass, and
 949          * reinitialize the context for a new pass.
 950          */
 951         MemoryContextReset(hashtable->batchCxt);
 952         oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);
 953
 954         /* Reallocate and reinitialize the hash bucket headers. */
 955         hashtable->buckets = (HashJoinTuple *)
 956                 palloc0(nbuckets * sizeof(HashJoinTuple));
 957
 958         hashtable->spaceUsed = 0;
 959
 960         MemoryContextSwitchTo(oldcxt);
 961 }
 962
 963 void
 964 ExecReScanHash(HashState *node, ExprContext *exprCtxt)
 965 {
 966         /*
 967          * if chgParam of subnode is not null then plan will be re-scanned by
 968          * first ExecProcNode.
 969          */
 970         if (((PlanState *) node)->lefttree->chgParam == NULL)
 971                 ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
 972 }
 973
 974
 975 /*
 976  * ExecHashBuildSkewHash
 977  *
 978  *              Set up for skew optimization if we can identify the most common values
 979  *              (MCVs) of the outer relation's join key.  We make a skew hash bucket
 980  *              for the hash value of each MCV, up to the number of slots allowed
 981  *              based on available memory.
 982  */
 983 static void
 984 ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node, int mcvsToUse)
 985 {
 986         HeapTupleData *statsTuple;
 987         Datum      *values;
 988         int                     nvalues;
 989         float4     *numbers;
 990         int                     nnumbers;
 991
 992         /* Do nothing if planner didn't identify the outer relation's join key */
 993         if (!OidIsValid(node->skewTable))
 994                 return;
 995         /* Also, do nothing if we don't have room for at least one skew bucket */
 996         if (mcvsToUse <= 0)
 997                 return;
 998
 999         /*
1000          * Try to find the MCV statistics for the outer relation's join key.
1001          */
1002         statsTuple = SearchSysCache(STATRELATT,
1003                                                                 ObjectIdGetDatum(node->skewTable),
1004                                                                 Int16GetDatum(node->skewColumn),
1005                                                                 0, 0);
1006         if (!HeapTupleIsValid(statsTuple))
1007                 return;
1008
1009         if (get_attstatsslot(statsTuple, node->skewColType, node->skewColTypmod,
1010                                                  STATISTIC_KIND_MCV, InvalidOid,
1011                                                  &values, &nvalues,
1012                                                  &numbers, &nnumbers))
1013         {
1014                 double          frac;
1015                 int                     nbuckets;
1016                 FmgrInfo   *hashfunctions;
1017                 int                     i;
1018
1019                 if (mcvsToUse > nvalues)
1020                         mcvsToUse = nvalues;
1021
1022                 /*
1023                  * Calculate the expected fraction of outer relation that will
1024                  * participate in the skew optimization.  If this isn't at least
1025                  * SKEW_MIN_OUTER_FRACTION, don't use skew optimization.
1026                  */
1027                 frac = 0;
1028                 for (i = 0; i < mcvsToUse; i++)
1029                         frac += numbers[i];
1030                 if (frac < SKEW_MIN_OUTER_FRACTION)
1031                 {
1032                         free_attstatsslot(node->skewColType,
1033                                                           values, nvalues, numbers, nnumbers);
1034                         ReleaseSysCache(statsTuple);
1035                         return;
1036                 }
1037
1038                 /*
1039                  * Okay, set up the skew hashtable.
1040                  *
1041                  * skewBucket[] is an open addressing hashtable with a power of 2 size
1042                  * that is greater than the number of MCV values.  (This ensures there
1043                  * will be at least one null entry, so searches will always
1044                  * terminate.)
1045                  *
1046                  * Note: this code could fail if mcvsToUse exceeds INT_MAX/8, but that
1047                  * is not currently possible since we limit pg_statistic entries to
1048                  * much less than that.
1049                  */
1050                 nbuckets = 2;
1051                 while (nbuckets <= mcvsToUse)
1052                         nbuckets <<= 1;
1053                 /* use two more bits just to help avoid collisions */
1054                 nbuckets <<= 2;
1055
1056                 hashtable->skewEnabled = true;
1057                 hashtable->skewBucketLen = nbuckets;
1058
1059                 /*
1060                  * We allocate the bucket memory in the hashtable's batch context. It
1061                  * is only needed during the first batch, and this ensures it will be
1062                  * automatically removed once the first batch is done.
1063                  */
1064                 hashtable->skewBucket = (HashSkewBucket **)
1065                         MemoryContextAllocZero(hashtable->batchCxt,
1066                                                                    nbuckets * sizeof(HashSkewBucket *));
1067                 hashtable->skewBucketNums = (int *)
1068                         MemoryContextAllocZero(hashtable->batchCxt,
1069                                                                    mcvsToUse * sizeof(int));
1070
1071                 hashtable->spaceUsed += nbuckets * sizeof(HashSkewBucket *)
1072                         + mcvsToUse * sizeof(int);
1073                 hashtable->spaceUsedSkew += nbuckets * sizeof(HashSkewBucket *)
1074                         + mcvsToUse * sizeof(int);
1075
1076                 /*
1077                  * Create a skew bucket for each MCV hash value.
1078                  *
1079                  * Note: it is very important that we create the buckets in order of
1080                  * decreasing MCV frequency.  If we have to remove some buckets, they
1081                  * must be removed in reverse order of creation (see notes in
1082                  * ExecHashRemoveNextSkewBucket) and we want the least common MCVs to
1083                  * be removed first.
1084                  */
1085                 hashfunctions = hashtable->outer_hashfunctions;
1086
1087                 for (i = 0; i < mcvsToUse; i++)
1088                 {
1089                         uint32          hashvalue;
1090                         int                     bucket;
1091
1092                         hashvalue = DatumGetUInt32(FunctionCall1(&hashfunctions[0],
1093                                                                                                          values[i]));
1094
1095                         /*
1096                          * While we have not hit a hole in the hashtable and have not hit
1097                          * the desired bucket, we have collided with some previous hash
1098                          * value, so try the next bucket location.      NB: this code must
1099                          * match ExecHashGetSkewBucket.
1100                          */
1101                         bucket = hashvalue & (nbuckets - 1);
1102                         while (hashtable->skewBucket[bucket] != NULL &&
1103                                    hashtable->skewBucket[bucket]->hashvalue != hashvalue)
1104                                 bucket = (bucket + 1) & (nbuckets - 1);
1105
1106                         /*
1107                          * If we found an existing bucket with the same hashvalue, leave
1108                          * it alone.  It's okay for two MCVs to share a hashvalue.
1109                          */
1110                         if (hashtable->skewBucket[bucket] != NULL)
1111                                 continue;
1112
1113                         /* Okay, create a new skew bucket for this hashvalue. */
1114                         hashtable->skewBucket[bucket] = (HashSkewBucket *)
1115                                 MemoryContextAlloc(hashtable->batchCxt,
1116                                                                    sizeof(HashSkewBucket));
1117                         hashtable->skewBucket[bucket]->hashvalue = hashvalue;
1118                         hashtable->skewBucket[bucket]->tuples = NULL;
1119                         hashtable->skewBucketNums[hashtable->nSkewBuckets] = bucket;
1120                         hashtable->nSkewBuckets++;
1121                         hashtable->spaceUsed += SKEW_BUCKET_OVERHEAD;
1122                         hashtable->spaceUsedSkew += SKEW_BUCKET_OVERHEAD;
1123                 }
1124
1125                 free_attstatsslot(node->skewColType,
1126                                                   values, nvalues, numbers, nnumbers);
1127         }
1128
1129         ReleaseSysCache(statsTuple);
1130 }
1131
1132 /*
1133  * ExecHashGetSkewBucket
1134  *
1135  *              Returns the index of the skew bucket for this hashvalue,
1136  *              or INVALID_SKEW_BUCKET_NO if the hashvalue is not
1137  *              associated with any active skew bucket.
1138  */
1139 int
1140 ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue)
1141 {
1142         int                     bucket;
1143
1144         /*
1145          * Always return INVALID_SKEW_BUCKET_NO if not doing skew optimization (in
1146          * particular, this happens after the initial batch is done).
1147          */
1148         if (!hashtable->skewEnabled)
1149                 return INVALID_SKEW_BUCKET_NO;
1150
1151         /*
1152          * Since skewBucketLen is a power of 2, we can do a modulo by ANDing.
1153          */
1154         bucket = hashvalue & (hashtable->skewBucketLen - 1);
1155
1156         /*
1157          * While we have not hit a hole in the hashtable and have not hit the
1158          * desired bucket, we have collided with some other hash value, so try the
1159          * next bucket location.
1160          */
1161         while (hashtable->skewBucket[bucket] != NULL &&
1162                    hashtable->skewBucket[bucket]->hashvalue != hashvalue)
1163                 bucket = (bucket + 1) & (hashtable->skewBucketLen - 1);
1164
1165         /*
1166          * Found the desired bucket?
1167          */
1168         if (hashtable->skewBucket[bucket] != NULL)
1169                 return bucket;
1170
1171         /*
1172          * There must not be any hashtable entry for this hash value.
1173          */
1174         return INVALID_SKEW_BUCKET_NO;
1175 }
1176
1177 /*
1178  * ExecHashSkewTableInsert
1179  *
1180  *              Insert a tuple into the skew hashtable.
1181  *
1182  * This should generally match up with the current-batch case in
1183  * ExecHashTableInsert.
1184  */
1185 static void
1186 ExecHashSkewTableInsert(HashJoinTable hashtable,
1187                                                 TupleTableSlot *slot,
1188                                                 uint32 hashvalue,
1189                                                 int bucketNumber)
1190 {
1191         MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot);
1192         HashJoinTuple hashTuple;
1193         int                     hashTupleSize;
1194
1195         /* Create the HashJoinTuple */
1196         hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
1197         hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
1198                                                                                                    hashTupleSize);
1199         hashTuple->hashvalue = hashvalue;
1200         memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
1201
1202         /* Push it onto the front of the skew bucket's list */
1203         hashTuple->next = hashtable->skewBucket[bucketNumber]->tuples;
1204         hashtable->skewBucket[bucketNumber]->tuples = hashTuple;
1205
1206         /* Account for space used, and back off if we've used too much */
1207         hashtable->spaceUsed += hashTupleSize;
1208         hashtable->spaceUsedSkew += hashTupleSize;
1209         while (hashtable->spaceUsedSkew > hashtable->spaceAllowedSkew)
1210                 ExecHashRemoveNextSkewBucket(hashtable);
1211
1212         /* Check we are not over the total spaceAllowed, either */
1213         if (hashtable->spaceUsed > hashtable->spaceAllowed)
1214                 ExecHashIncreaseNumBatches(hashtable);
1215 }
1216
1217 /*
1218  *              ExecHashRemoveNextSkewBucket
1219  *
1220  *              Remove the least valuable skew bucket by pushing its tuples into
1221  *              the main hash table.
1222  */
1223 static void
1224 ExecHashRemoveNextSkewBucket(HashJoinTable hashtable)
1225 {
1226         int                     bucketToRemove;
1227         HashSkewBucket *bucket;
1228         uint32          hashvalue;
1229         int                     bucketno;
1230         int                     batchno;
1231         HashJoinTuple hashTuple;
1232
1233         /* Locate the bucket to remove */
1234         bucketToRemove = hashtable->skewBucketNums[hashtable->nSkewBuckets - 1];
1235         bucket = hashtable->skewBucket[bucketToRemove];
1236
1237         /*
1238          * Calculate which bucket and batch the tuples belong to in the main
1239          * hashtable.  They all have the same hash value, so it's the same for all
1240          * of them.  Also note that it's not possible for nbatch to increase while
1241          * we are processing the tuples.
1242          */
1243         hashvalue = bucket->hashvalue;
1244         ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
1245
1246         /* Process all tuples in the bucket */
1247         hashTuple = bucket->tuples;
1248         while (hashTuple != NULL)
1249         {
1250                 HashJoinTuple nextHashTuple = hashTuple->next;
1251                 MinimalTuple tuple;
1252                 Size            tupleSize;
1253
1254                 /*
1255                  * This code must agree with ExecHashTableInsert.  We do not use
1256                  * ExecHashTableInsert directly as ExecHashTableInsert expects a
1257                  * TupleTableSlot while we already have HashJoinTuples.
1258                  */
1259                 tuple = HJTUPLE_MINTUPLE(hashTuple);
1260                 tupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
1261
1262                 /* Decide whether to put the tuple in the hash table or a temp file */
1263                 if (batchno == hashtable->curbatch)
1264                 {
1265                         /* Move the tuple to the main hash table */
1266                         hashTuple->next = hashtable->buckets[bucketno];
1267                         hashtable->buckets[bucketno] = hashTuple;
1268                         /* We have reduced skew space, but overall space doesn't change */
1269                         hashtable->spaceUsedSkew -= tupleSize;
1270                 }
1271                 else
1272                 {
1273                         /* Put the tuple into a temp file for later batches */
1274                         Assert(batchno > hashtable->curbatch);
1275                         ExecHashJoinSaveTuple(tuple, hashvalue,
1276                                                                   &hashtable->innerBatchFile[batchno]);
1277                         pfree(hashTuple);
1278                         hashtable->spaceUsed -= tupleSize;
1279                         hashtable->spaceUsedSkew -= tupleSize;
1280                 }
1281
1282                 hashTuple = nextHashTuple;
1283         }
1284
1285         /*
1286          * Free the bucket struct itself and reset the hashtable entry to NULL.
1287          *
1288          * NOTE: this is not nearly as simple as it looks on the surface, because
1289          * of the possibility of collisions in the hashtable.  Suppose that hash
1290          * values A and B collide at a particular hashtable entry, and that A was
1291          * entered first so B gets shifted to a different table entry.  If we were
1292          * to remove A first then ExecHashGetSkewBucket would mistakenly start
1293          * reporting that B is not in the hashtable, because it would hit the NULL
1294          * before finding B.  However, we always remove entries in the reverse
1295          * order of creation, so this failure cannot happen.
1296          */
1297         hashtable->skewBucket[bucketToRemove] = NULL;
1298         hashtable->nSkewBuckets--;
1299         pfree(bucket);
1300         hashtable->spaceUsed -= SKEW_BUCKET_OVERHEAD;
1301         hashtable->spaceUsedSkew -= SKEW_BUCKET_OVERHEAD;
1302
1303         /*
1304          * If we have removed all skew buckets then give up on skew optimization.
1305          * Release the arrays since they aren't useful any more.
1306          */
1307         if (hashtable->nSkewBuckets == 0)
1308         {
1309                 hashtable->skewEnabled = false;
1310                 pfree(hashtable->skewBucket);
1311                 pfree(hashtable->skewBucketNums);
1312                 hashtable->skewBucket = NULL;
1313                 hashtable->skewBucketNums = NULL;
1314                 hashtable->spaceUsed -= hashtable->spaceUsedSkew;
1315                 hashtable->spaceUsedSkew = 0;
1316         }
1317 }