/*-------------------------------------------------------------------------
 *
 * nodeHash.c
 *	  Routines to hash relations for hashjoin
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *		MultiExecHash	- generate an in-memory hash table of the relation
 *		ExecInitHash	- initialize node and subnodes
 *		ExecEndHash		- shutdown node and subnodes
 */

#include "postgres.h"

#include <math.h>
#include <limits.h>

#include "catalog/pg_statistic.h"
#include "commands/tablespace.h"
#include "executor/execdebug.h"
#include "executor/hashjoin.h"
#include "executor/instrument.h"
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
#include "miscadmin.h"
#include "parser/parse_expr.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"

static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
                                  int mcvsToUse);
static void ExecHashSkewTableInsert(HashJoinTable hashtable,
                                    TupleTableSlot *slot,
                                    uint32 hashvalue,
                                    int bucketNumber);
static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);

/* ----------------------------------------------------------------
 *		ExecHash
 *
 *		stub for pro forma compliance
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecHash(HashState *node)
{
    elog(ERROR, "Hash node does not support ExecProcNode call convention");
    return NULL;
}

/* ----------------------------------------------------------------
 *		MultiExecHash
 *
 *		build hash table for hashjoin, doing partitioning if more
 *		than one batch is required.
 * ----------------------------------------------------------------
 */
Node *
MultiExecHash(HashState *node)
{
    PlanState  *outerNode;
    List       *hashkeys;
    HashJoinTable hashtable;
    TupleTableSlot *slot;
    ExprContext *econtext;
    uint32      hashvalue;

    /* must provide our own instrumentation support */
    if (node->ps.instrument)
        InstrStartNode(node->ps.instrument);

    /*
     * get state info from node
     */
    outerNode = outerPlanState(node);
    hashtable = node->hashtable;

    /*
     * set expression context
     */
    hashkeys = node->hashkeys;
    econtext = node->ps.ps_ExprContext;

    /*
     * get all inner tuples and insert into the hash table (or temp files)
     */
    for (;;)
    {
        slot = ExecProcNode(outerNode);
        if (TupIsNull(slot))
            break;
        /* We have to compute the hash value */
        econtext->ecxt_innertuple = slot;
        if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, false,
                                 &hashvalue))
        {
            int         bucketNumber;

            bucketNumber = ExecHashGetSkewBucket(hashtable, hashvalue);
            if (bucketNumber != INVALID_SKEW_BUCKET_NO)
            {
                /* It's a skew tuple, so put it into that hash table */
                ExecHashSkewTableInsert(hashtable, slot, hashvalue,
                                        bucketNumber);
            }
            else
            {
                /* Not subject to skew optimization, so insert normally */
                ExecHashTableInsert(hashtable, slot, hashvalue);
            }
            hashtable->totalTuples += 1;
        }
    }

    /* must provide our own instrumentation support */
    if (node->ps.instrument)
        InstrStopNode(node->ps.instrument, hashtable->totalTuples);

    /*
     * We do not return the hash table directly because it's not a subtype of
     * Node, and so would violate the MultiExecProcNode API.  Instead, our
     * parent Hashjoin node is expected to know how to fish it out of our node
     * state.  Ugly but not really worth cleaning up, since Hashjoin knows
     * quite a bit more about Hash besides that.
     */
    return NULL;
}

/* ----------------------------------------------------------------
 *		ExecInitHash
 *
 *		Init routine for Hash node
 * ----------------------------------------------------------------
 */
HashState *
ExecInitHash(Hash *node, EState *estate, int eflags)
{
    HashState  *hashstate;

    /* check for unsupported flags */
    Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

    /*
     * create state structure
     */
    hashstate = makeNode(HashState);
    hashstate->ps.plan = (Plan *) node;
    hashstate->ps.state = estate;
    hashstate->hashtable = NULL;
    hashstate->hashkeys = NIL;  /* will be set by parent HashJoin */

    /*
     * Miscellaneous initialization
     *
     * create expression context for node
     */
    ExecAssignExprContext(estate, &hashstate->ps);

#define HASH_NSLOTS 1

    /*
     * initialize our result slot
     */
    ExecInitResultTupleSlot(estate, &hashstate->ps);

    /*
     * initialize child expressions
     */
    hashstate->ps.targetlist = (List *)
        ExecInitExpr((Expr *) node->plan.targetlist,
                     (PlanState *) hashstate);
    hashstate->ps.qual = (List *)
        ExecInitExpr((Expr *) node->plan.qual,
                     (PlanState *) hashstate);

    /*
     * initialize child nodes
     */
    outerPlanState(hashstate) = ExecInitNode(outerPlan(node), estate, eflags);

    /*
     * initialize tuple type.  no need to initialize projection info because
     * this node doesn't do projections
     */
    ExecAssignResultTypeFromTL(&hashstate->ps);
    hashstate->ps.ps_ProjInfo = NULL;

    return hashstate;
}

int
ExecCountSlotsHash(Hash *node)
{
    return ExecCountSlotsNode(outerPlan(node)) +
        ExecCountSlotsNode(innerPlan(node)) +
        HASH_NSLOTS;
}

/* ----------------------------------------------------------------
 *		ExecEndHash
 *
 *		clean up routine for Hash node
 * ----------------------------------------------------------------
 */
void
ExecEndHash(HashState *node)
{
    PlanState  *outerPlan;

    /*
     * free exprcontext
     */
    ExecFreeExprContext(&node->ps);

    /*
     * shut down the subplan
     */
    outerPlan = outerPlanState(node);
    ExecEndNode(outerPlan);
}

/* ----------------------------------------------------------------
 *		ExecHashTableCreate
 *
 *		create an empty hashtable data structure for hashjoin.
 * ----------------------------------------------------------------
 */
HashJoinTable
ExecHashTableCreate(Hash *node, List *hashOperators)
{
    HashJoinTable hashtable;
    Plan       *outerNode;
    int         nbuckets;
    int         nbatch;
    int         num_skew_mcvs;
    int         log2_nbuckets;
    int         nkeys;
    int         i;
    ListCell   *ho;
    MemoryContext oldcxt;

    /*
     * Get information about the size of the relation to be hashed (it's the
     * "outer" subtree of this node, but the inner relation of the hashjoin).
     * Compute the appropriate size of the hash table.
     */
    outerNode = outerPlan(node);

    ExecChooseHashTableSize(outerNode->plan_rows, outerNode->plan_width,
                            OidIsValid(node->skewTable),
                            &nbuckets, &nbatch, &num_skew_mcvs);

#ifdef HJDEBUG
    printf("nbatch = %d, nbuckets = %d\n", nbatch, nbuckets);
#endif

    /* nbuckets must be a power of 2 */
    log2_nbuckets = my_log2(nbuckets);
    Assert(nbuckets == (1 << log2_nbuckets));

    /*
     * Initialize the hash table control block.
     *
     * The hashtable control block is just palloc'd from the executor's
     * per-query memory context.
     */
    hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
    hashtable->nbuckets = nbuckets;
    hashtable->log2_nbuckets = log2_nbuckets;
    hashtable->buckets = NULL;
    hashtable->skewEnabled = false;
    hashtable->skewBucket = NULL;
    hashtable->skewBucketLen = 0;
    hashtable->nSkewBuckets = 0;
    hashtable->skewBucketNums = NULL;
    hashtable->nbatch = nbatch;
    hashtable->curbatch = 0;
    hashtable->nbatch_original = nbatch;
    hashtable->nbatch_outstart = nbatch;
    hashtable->growEnabled = true;
    hashtable->totalTuples = 0;
    hashtable->innerBatchFile = NULL;
    hashtable->outerBatchFile = NULL;
    hashtable->spaceUsed = 0;
    hashtable->spaceAllowed = work_mem * 1024L;
    hashtable->spaceUsedSkew = 0;
    hashtable->spaceAllowedSkew =
        hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
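
    /*
     * (Added note: SKEW_WORK_MEM_PERCENT, defined with the other skew
     * constants in executor/hashjoin.h, limits spaceAllowedSkew to a small
     * fraction of spaceAllowed, so enabling the skew optimization costs the
     * main hash table very little of its work_mem budget.)
     */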

    /*
     * Get info about the hash functions to be used for each hash key. Also
     * remember whether the join operators are strict.
     */
    nkeys = list_length(hashOperators);
    hashtable->outer_hashfunctions =
        (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
    hashtable->inner_hashfunctions =
        (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
    hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool));
    i = 0;
    foreach(ho, hashOperators)
    {
        Oid         hashop = lfirst_oid(ho);
        Oid         left_hashfn;
        Oid         right_hashfn;

        if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))
            elog(ERROR, "could not find hash function for hash operator %u",
                 hashop);
        fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]);
        fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]);
        hashtable->hashStrict[i] = op_strict(hashop);
        i++;
    }

    /*
     * Create temporary memory contexts in which to keep the hashtable working
     * storage.  See notes in executor/hashjoin.h.
     */
    hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
                                               "HashTableContext",
                                               ALLOCSET_DEFAULT_MINSIZE,
                                               ALLOCSET_DEFAULT_INITSIZE,
                                               ALLOCSET_DEFAULT_MAXSIZE);

    hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
                                                "HashBatchContext",
                                                ALLOCSET_DEFAULT_MINSIZE,
                                                ALLOCSET_DEFAULT_INITSIZE,
                                                ALLOCSET_DEFAULT_MAXSIZE);

    /* Allocate data that will live for the life of the hashjoin */

    oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);

    if (nbatch > 1)
    {
        /*
         * allocate and initialize the file arrays in hashCxt
         */
        hashtable->innerBatchFile = (BufFile **)
            palloc0(nbatch * sizeof(BufFile *));
        hashtable->outerBatchFile = (BufFile **)
            palloc0(nbatch * sizeof(BufFile *));
        /* The files will not be opened until needed... */
        /* ... but make sure we have temp tablespaces established for them */
        PrepareTempTablespaces();
    }

    /*
     * Prepare context for the first-scan space allocations; allocate the
     * hashbucket array therein, and set each bucket "empty".
     */
    MemoryContextSwitchTo(hashtable->batchCxt);

    hashtable->buckets = (HashJoinTuple *)
        palloc0(nbuckets * sizeof(HashJoinTuple));

    /*
     * Set up for skew optimization, if possible and there's a need for more
     * than one batch.  (In a one-batch join, there's no point in it.)
     */
    if (nbatch > 1)
        ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs);

    MemoryContextSwitchTo(oldcxt);

    return hashtable;
}

/*
 * Compute appropriate size for hashtable given the estimated size of the
 * relation to be hashed (number of rows and average row width).
 *
 * This is exported so that the planner's costsize.c can use it.
 */

/* Target bucket loading (tuples per bucket) */
#define NTUP_PER_BUCKET			10
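
/*
 * (Added illustration, with assumed numbers: suppose work_mem = 1024 kB and
 * tupsize works out to 64 bytes, so hash_table_bytes = 1048576 and about
 * 16384 tuples fit in memory.  Hashing 100000 such tuples (inner_rel_bytes =
 * 6400000) forces multiple batches: lbuckets = 16384 / NTUP_PER_BUCKET =
 * 1638, minbatch = ceil(6400000 / 1048576) = 7, and nbatch is rounded up to
 * the next power of 2, 8.  The final nbuckets is then inflated from 1638 to
 * the next power of 2 at or above 2^10, namely 2048.)
 */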
void
ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
                        int *numbuckets,
                        int *numbatches,
                        int *num_skew_mcvs)
{
    int         tupsize;
    double      inner_rel_bytes;
    long        hash_table_bytes;
    long        skew_table_bytes;
    int         nbatch;
    int         nbuckets;
    int         i;

    /* Force a plausible relation size if no info */
    if (ntuples <= 0.0)
        ntuples = 1000.0;

    /*
     * Estimate tupsize based on footprint of tuple in hashtable... note this
     * does not allow for any palloc overhead.  The manipulations of spaceUsed
     * don't count palloc overhead either.
     */
    tupsize = HJTUPLE_OVERHEAD +
        MAXALIGN(sizeof(MinimalTupleData)) +
        MAXALIGN(tupwidth);
    inner_rel_bytes = ntuples * tupsize;

    /*
     * Target in-memory hashtable size is work_mem kilobytes.
     */
    hash_table_bytes = work_mem * 1024L;

    /*
     * If skew optimization is possible, estimate the number of skew buckets
     * that will fit in the memory allowed, and decrement the assumed space
     * available for the main hash table accordingly.
     *
     * We make the optimistic assumption that each skew bucket will contain
     * one inner-relation tuple.  If that turns out to be low, we will recover
     * at runtime by reducing the number of skew buckets.
     *
     * hashtable->skewBucket will have up to 8 times as many HashSkewBucket
     * pointers as the number of MCVs we allow, since ExecHashBuildSkewHash
     * will round up to the next power of 2 and then multiply by 4 to reduce
     * collisions.
     */
    if (useskew)
    {
        skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;

        *num_skew_mcvs = skew_table_bytes / (
                                             /* size of a hash tuple */
                                             tupsize +
                                             /* worst-case size of skewBucket[] per MCV */
                                             (8 * sizeof(HashSkewBucket *)) +
                                             /* size of skewBucketNums[] entry */
                                             sizeof(int) +
                                             /* size of skew bucket struct itself */
                                             SKEW_BUCKET_OVERHEAD
            );

        if (*num_skew_mcvs > 0)
            hash_table_bytes -= skew_table_bytes;
    }
    else
        *num_skew_mcvs = 0;

    /*
     * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
     * memory is filled.  Set nbatch to the smallest power of 2 that appears
     * sufficient.
     */
    if (inner_rel_bytes > hash_table_bytes)
    {
        /* We'll need multiple batches */
        long        lbuckets;
        double      dbatch;
        int         minbatch;

        lbuckets = (hash_table_bytes / tupsize) / NTUP_PER_BUCKET;
        lbuckets = Min(lbuckets, INT_MAX / 2);
        nbuckets = (int) lbuckets;

        dbatch = ceil(inner_rel_bytes / hash_table_bytes);
        dbatch = Min(dbatch, INT_MAX / 2);
        minbatch = (int) dbatch;
        nbatch = 2;
        while (nbatch < minbatch)
            nbatch <<= 1;
    }
    else
    {
        /* We expect the hashtable to fit in memory */
        double      dbuckets;

        dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
        dbuckets = Min(dbuckets, INT_MAX / 2);
        nbuckets = (int) dbuckets;

        nbatch = 1;
    }

    /*
     * Both nbuckets and nbatch must be powers of 2 to make
     * ExecHashGetBucketAndBatch fast.  We already fixed nbatch; now inflate
     * nbuckets to the next larger power of 2.  We also force nbuckets to not
     * be real small, by starting the search at 2^10.
     */
    i = 10;
    while ((1 << i) < nbuckets)
        i++;
    nbuckets = (1 << i);

    *numbuckets = nbuckets;
    *numbatches = nbatch;
}

/* ----------------------------------------------------------------
 *		ExecHashTableDestroy
 *
 *		destroy a hash table
 * ----------------------------------------------------------------
 */
void
ExecHashTableDestroy(HashJoinTable hashtable)
{
    int         i;

    /*
     * Make sure all the temp files are closed.  We skip batch 0, since it
     * can't have any temp files (and the arrays might not even exist if
     * nbatch is only 1).
     */
    for (i = 1; i < hashtable->nbatch; i++)
    {
        if (hashtable->innerBatchFile[i])
            BufFileClose(hashtable->innerBatchFile[i]);
        if (hashtable->outerBatchFile[i])
            BufFileClose(hashtable->outerBatchFile[i]);
    }

    /* Release working memory (batchCxt is a child, so it goes away too) */
    MemoryContextDelete(hashtable->hashCxt);

    /* And drop the control block */
    pfree(hashtable);
}

/*
 * ExecHashIncreaseNumBatches
 *		increase the original number of batches in order to reduce
 *		current memory consumption
 */
static void
ExecHashIncreaseNumBatches(HashJoinTable hashtable)
{
    int         oldnbatch = hashtable->nbatch;
    int         curbatch = hashtable->curbatch;
    int         nbatch;
    int         i;
    MemoryContext oldcxt;
    long        ninmemory;
    long        nfreed;

    /* do nothing if we've decided to shut off growth */
    if (!hashtable->growEnabled)
        return;

    /* safety check to avoid overflow */
    if (oldnbatch > INT_MAX / 2)
        return;

    nbatch = oldnbatch * 2;
    Assert(nbatch > 1);

#ifdef HJDEBUG
    printf("Increasing nbatch to %d because space = %lu\n",
           nbatch, (unsigned long) hashtable->spaceUsed);
#endif

    oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);

    if (hashtable->innerBatchFile == NULL)
    {
        /* we had no file arrays before */
        hashtable->innerBatchFile = (BufFile **)
            palloc0(nbatch * sizeof(BufFile *));
        hashtable->outerBatchFile = (BufFile **)
            palloc0(nbatch * sizeof(BufFile *));
        /* time to establish the temp tablespaces, too */
        PrepareTempTablespaces();
    }
    else
    {
        /* enlarge arrays and zero out added entries */
        hashtable->innerBatchFile = (BufFile **)
            repalloc(hashtable->innerBatchFile, nbatch * sizeof(BufFile *));
        hashtable->outerBatchFile = (BufFile **)
            repalloc(hashtable->outerBatchFile, nbatch * sizeof(BufFile *));
        MemSet(hashtable->innerBatchFile + oldnbatch, 0,
               (nbatch - oldnbatch) * sizeof(BufFile *));
        MemSet(hashtable->outerBatchFile + oldnbatch, 0,
               (nbatch - oldnbatch) * sizeof(BufFile *));
    }

    MemoryContextSwitchTo(oldcxt);

    hashtable->nbatch = nbatch;

    /*
     * Scan through the existing hash table entries and dump out any that are
     * no longer of the current batch.
     */
    ninmemory = nfreed = 0;

    for (i = 0; i < hashtable->nbuckets; i++)
    {
        HashJoinTuple prevtuple;
        HashJoinTuple tuple;

        prevtuple = NULL;
        tuple = hashtable->buckets[i];

        while (tuple != NULL)
        {
            /* save link in case we delete */
            HashJoinTuple nexttuple = tuple->next;
            int         bucketno;
            int         batchno;

            ninmemory++;
            ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue,
                                      &bucketno, &batchno);
            Assert(bucketno == i);
            if (batchno == curbatch)
            {
                /* keep tuple */
                prevtuple = tuple;
            }
            else
            {
                /* dump it out */
                Assert(batchno > curbatch);
                ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(tuple),
                                      tuple->hashvalue,
                                      &hashtable->innerBatchFile[batchno]);
                /* and remove from hash table */
                if (prevtuple)
                    prevtuple->next = nexttuple;
                else
                    hashtable->buckets[i] = nexttuple;
                /* prevtuple doesn't change */
                hashtable->spaceUsed -=
                    HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(tuple)->t_len;
                pfree(tuple);
                nfreed++;
            }

            tuple = nexttuple;
        }
    }

#ifdef HJDEBUG
    printf("Freed %ld of %ld tuples, space now %lu\n",
           nfreed, ninmemory, (unsigned long) hashtable->spaceUsed);
#endif

    /*
     * If we dumped out either all or none of the tuples in the table, disable
     * further expansion of nbatch.  This situation implies that we have
     * enough tuples of identical hashvalues to overflow spaceAllowed.
     * Increasing nbatch will not fix it since there's no way to subdivide the
     * group any more finely.  We have to just gut it out and hope the server
     * has enough RAM.
     */
    if (nfreed == 0 || nfreed == ninmemory)
    {
        hashtable->growEnabled = false;
#ifdef HJDEBUG
        printf("Disabling further increase of nbatch\n");
#endif
    }
}

/*
 * ExecHashTableInsert
 *		insert a tuple into the hash table depending on the hash value
 *		it may just go to a temp file for later batches
 *
 * Note: the passed TupleTableSlot may contain a regular, minimal, or virtual
 * tuple; the minimal case in particular is certain to happen while reloading
 * tuples from batch files.  We could save some cycles in the regular-tuple
 * case by not forcing the slot contents into minimal form; not clear if it's
 * worth the messiness required.
 */
void
ExecHashTableInsert(HashJoinTable hashtable,
                    TupleTableSlot *slot,
                    uint32 hashvalue)
{
    MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot);
    int         bucketno;
    int         batchno;

    ExecHashGetBucketAndBatch(hashtable, hashvalue,
                              &bucketno, &batchno);

    /*
     * decide whether to put the tuple in the hash table or a temp file
     */
    if (batchno == hashtable->curbatch)
    {
        /*
         * put the tuple in hash table
         */
        HashJoinTuple hashTuple;
        int         hashTupleSize;

        hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
        hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
                                                       hashTupleSize);
        hashTuple->hashvalue = hashvalue;
        memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
        hashTuple->next = hashtable->buckets[bucketno];
        hashtable->buckets[bucketno] = hashTuple;
        hashtable->spaceUsed += hashTupleSize;
        if (hashtable->spaceUsed > hashtable->spaceAllowed)
            ExecHashIncreaseNumBatches(hashtable);
    }
    else
    {
        /*
         * put the tuple into a temp file for later batches
         */
        Assert(batchno > hashtable->curbatch);
        ExecHashJoinSaveTuple(tuple,
                              hashvalue,
                              &hashtable->innerBatchFile[batchno]);
    }
}

/*
 * ExecHashGetHashValue
 *		Compute the hash value for a tuple
 *
 * The tuple to be tested must be in either econtext->ecxt_outertuple or
 * econtext->ecxt_innertuple.  Vars in the hashkeys expressions reference
 * either OUTER or INNER.
 *
 * A TRUE result means the tuple's hash value has been successfully computed
 * and stored at *hashvalue.  A FALSE result means the tuple cannot match
 * because it contains a null attribute, and hence it should be discarded
 * immediately.  (If keep_nulls is true then FALSE is never returned.)
 */
bool
ExecHashGetHashValue(HashJoinTable hashtable,
                     ExprContext *econtext,
                     List *hashkeys,
                     bool outer_tuple,
                     bool keep_nulls,
                     uint32 *hashvalue)
{
    uint32      hashkey = 0;
    FmgrInfo   *hashfunctions;
    ListCell   *hk;
    int         i = 0;
    MemoryContext oldContext;

    /*
     * We reset the eval context each time to reclaim any memory leaked in the
     * hashkey expressions.
     */
    ResetExprContext(econtext);

    oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);

    if (outer_tuple)
        hashfunctions = hashtable->outer_hashfunctions;
    else
        hashfunctions = hashtable->inner_hashfunctions;

    foreach(hk, hashkeys)
    {
        ExprState  *keyexpr = (ExprState *) lfirst(hk);
        Datum       keyval;
        bool        isNull;

        /* rotate hashkey left 1 bit at each step */
        hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
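
        /*
         * (Added note: the rotate-then-XOR combination makes the result
         * depend on key order.  With a plain XOR of the per-key hashes, the
         * key lists (a, b) and (b, a) would always hash identically.)
         */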

        /*
         * Get the join attribute value of the tuple
         */
        keyval = ExecEvalExpr(keyexpr, econtext, &isNull, NULL);

        /*
         * If the attribute is NULL, and the join operator is strict, then
         * this tuple cannot pass the join qual so we can reject it
         * immediately (unless we're scanning the outside of an outer join, in
         * which case we must not reject it).  Otherwise we act like the
         * hashcode of NULL is zero (this will support operators that act like
         * IS NOT DISTINCT, though not any more-random behavior).  We treat
         * the hash support function as strict even if the operator is not.
         *
         * Note: currently, all hashjoinable operators must be strict since
         * the hash index AM assumes that.  However, it takes so little extra
         * code here to allow non-strict that we may as well do it.
         */
        if (isNull)
        {
            if (hashtable->hashStrict[i] && !keep_nulls)
            {
                MemoryContextSwitchTo(oldContext);
                return false;   /* cannot match */
            }
            /* else, leave hashkey unmodified, equivalent to hashcode 0 */
        }
        else
        {
            /* Compute the hash function */
            uint32      hkey;

            hkey = DatumGetUInt32(FunctionCall1(&hashfunctions[i], keyval));
            hashkey ^= hkey;
        }

        i++;
    }

    MemoryContextSwitchTo(oldContext);

    *hashvalue = hashkey;
    return true;
}

/*
 * ExecHashGetBucketAndBatch
 *		Determine the bucket number and batch number for a hash value
 *
 * Note: on-the-fly increases of nbatch must not change the bucket number
 * for a given hash code (since we don't move tuples to different hash
 * chains), and must only cause the batch number to remain the same or
 * increase.  Our algorithm is
 *		bucketno = hashvalue MOD nbuckets
 *		batchno = (hashvalue DIV nbuckets) MOD nbatch
 * where nbuckets and nbatch are both expected to be powers of 2, so we can
 * do the computations by shifting and masking.  (This assumes that all hash
 * functions are good about randomizing all their output bits, else we are
 * likely to have very skewed bucket or batch occupancy.)
 *
 * nbuckets doesn't change over the course of the join.
 *
 * nbatch is always a power of 2; we increase it only by doubling it.  This
 * effectively adds one more bit to the top of the batchno.
 */
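
/*
 * (Added worked example, with illustrative numbers: take nbuckets = 1024,
 * so log2_nbuckets = 10, and nbatch = 4.  A hashvalue of 0x2F05 gives
 * bucketno = 0x2F05 & 0x3FF = 0x305 and, since 0x2F05 >> 10 = 0xB,
 * batchno = 0xB & 3 = 3.  Doubling nbatch to 8 leaves batchno at
 * 0xB & 7 = 3; doubling again to 16 exposes one more high bit, moving it
 * to 0xB & 15 = 11.  The bucket number never changes, and the batch
 * number can only stay the same or increase, as required above.)
 */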
void
ExecHashGetBucketAndBatch(HashJoinTable hashtable,
                          uint32 hashvalue,
                          int *bucketno,
                          int *batchno)
{
    uint32      nbuckets = (uint32) hashtable->nbuckets;
    uint32      nbatch = (uint32) hashtable->nbatch;

    if (nbatch > 1)
    {
        /* we can do MOD by masking, DIV by shifting */
        *bucketno = hashvalue & (nbuckets - 1);
        *batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1);
    }
    else
    {
        *bucketno = hashvalue & (nbuckets - 1);
        *batchno = 0;
    }
}

/*
 * ExecScanHashBucket
 *		scan a hash bucket for matches to the current outer tuple
 *
 * The current outer tuple must be stored in econtext->ecxt_outertuple.
 */
HashJoinTuple
ExecScanHashBucket(HashJoinState *hjstate,
                   ExprContext *econtext)
{
    List       *hjclauses = hjstate->hashclauses;
    HashJoinTable hashtable = hjstate->hj_HashTable;
    HashJoinTuple hashTuple = hjstate->hj_CurTuple;
    uint32      hashvalue = hjstate->hj_CurHashValue;

    /*
     * hj_CurTuple is the address of the tuple last returned from the current
     * bucket, or NULL if it's time to start scanning a new bucket.
     *
     * If the tuple hashed to a skew bucket then scan the skew bucket
     * otherwise scan the standard hashtable bucket.
     */
    if (hashTuple != NULL)
        hashTuple = hashTuple->next;
    else if (hjstate->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO)
        hashTuple = hashtable->skewBucket[hjstate->hj_CurSkewBucketNo]->tuples;
    else
        hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];

    while (hashTuple != NULL)
    {
        if (hashTuple->hashvalue == hashvalue)
        {
            TupleTableSlot *inntuple;

            /* insert hashtable's tuple into exec slot so ExecQual sees it */
            inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
                                             hjstate->hj_HashTupleSlot,
                                             false);    /* do not pfree */
            econtext->ecxt_innertuple = inntuple;

            /* reset temp memory each time to avoid leaks from qual expr */
            ResetExprContext(econtext);

            if (ExecQual(hjclauses, econtext, false))
            {
                hjstate->hj_CurTuple = hashTuple;
                return hashTuple;
            }
        }

        hashTuple = hashTuple->next;
    }

    /*
     * no match
     */
    return NULL;
}

/*
 * ExecHashTableReset
 *
 *		reset hash table header for new batch
 */
void
ExecHashTableReset(HashJoinTable hashtable)
{
    MemoryContext oldcxt;
    int         nbuckets = hashtable->nbuckets;

    /*
     * Release all the hash buckets and tuples acquired in the prior pass, and
     * reinitialize the context for a new pass.
     */
    MemoryContextReset(hashtable->batchCxt);
    oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);

    /* Reallocate and reinitialize the hash bucket headers. */
    hashtable->buckets = (HashJoinTuple *)
        palloc0(nbuckets * sizeof(HashJoinTuple));

    hashtable->spaceUsed = 0;

    MemoryContextSwitchTo(oldcxt);
}

void
ExecReScanHash(HashState *node, ExprContext *exprCtxt)
{
    /*
     * if chgParam of subnode is not null then plan will be re-scanned by
     * first ExecProcNode.
     */
    if (((PlanState *) node)->lefttree->chgParam == NULL)
        ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
}

/*
 * ExecHashBuildSkewHash
 *
 *		Set up for skew optimization if we can identify the most common values
 *		(MCVs) of the outer relation's join key.  We make a skew hash bucket
 *		for the hash value of each MCV, up to the number of slots allowed
 *		based on available memory.
 */
static void
ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node, int mcvsToUse)
{
    HeapTupleData *statsTuple;
    Datum      *values;
    int         nvalues;
    float4     *numbers;
    int         nnumbers;

    /* Do nothing if planner didn't identify the outer relation's join key */
    if (!OidIsValid(node->skewTable))
        return;
    /* Also, do nothing if we don't have room for at least one skew bucket */
    if (mcvsToUse <= 0)
        return;

    /*
     * Try to find the MCV statistics for the outer relation's join key.
     */
    statsTuple = SearchSysCache(STATRELATT,
                                ObjectIdGetDatum(node->skewTable),
                                Int16GetDatum(node->skewColumn),
                                0, 0);
    if (!HeapTupleIsValid(statsTuple))
        return;

    if (get_attstatsslot(statsTuple, node->skewColType, node->skewColTypmod,
                         STATISTIC_KIND_MCV, InvalidOid,
                         &values, &nvalues,
                         &numbers, &nnumbers))
    {
        double      frac;
        int         nbuckets;
        FmgrInfo   *hashfunctions;
        int         i;

        if (mcvsToUse > nvalues)
            mcvsToUse = nvalues;

        /*
         * Calculate the expected fraction of outer relation that will
         * participate in the skew optimization.  If this isn't at least
         * SKEW_MIN_OUTER_FRACTION, don't use skew optimization.
         */
        frac = 0;
        for (i = 0; i < mcvsToUse; i++)
            frac += numbers[i];
        if (frac < SKEW_MIN_OUTER_FRACTION)
        {
            free_attstatsslot(node->skewColType,
                              values, nvalues, numbers, nnumbers);
            ReleaseSysCache(statsTuple);
            return;
        }

        /*
         * Okay, set up the skew hashtable.
         *
         * skewBucket[] is an open addressing hashtable with a power of 2 size
         * that is greater than the number of MCV values.  (This ensures there
         * will be at least one null entry, so searches will always
         * terminate.)
         *
         * Note: this code could fail if mcvsToUse exceeds INT_MAX/8, but that
         * is not currently possible since we limit pg_statistic entries to
         * much less than that.
         */
        nbuckets = 2;
        while (nbuckets <= mcvsToUse)
            nbuckets <<= 1;
        /* use two more bits just to help avoid collisions */
        nbuckets <<= 2;
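
        /*
         * (Added example: mcvsToUse = 100 rounds up to 128 and then becomes
         * 128 << 2 = 512 slots, within the 8 * mcvsToUse worst case assumed
         * in ExecChooseHashTableSize.)
         */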

        hashtable->skewEnabled = true;
        hashtable->skewBucketLen = nbuckets;

        /*
         * We allocate the bucket memory in the hashtable's batch context.  It
         * is only needed during the first batch, and this ensures it will be
         * automatically removed once the first batch is done.
         */
        hashtable->skewBucket = (HashSkewBucket **)
            MemoryContextAllocZero(hashtable->batchCxt,
                                   nbuckets * sizeof(HashSkewBucket *));
        hashtable->skewBucketNums = (int *)
            MemoryContextAllocZero(hashtable->batchCxt,
                                   mcvsToUse * sizeof(int));

        hashtable->spaceUsed += nbuckets * sizeof(HashSkewBucket *)
            + mcvsToUse * sizeof(int);
        hashtable->spaceUsedSkew += nbuckets * sizeof(HashSkewBucket *)
            + mcvsToUse * sizeof(int);

        /*
         * Create a skew bucket for each MCV hash value.
         *
         * Note: it is very important that we create the buckets in order of
         * decreasing MCV frequency.  If we have to remove some buckets, they
         * must be removed in reverse order of creation (see notes in
         * ExecHashRemoveNextSkewBucket) and we want the least common MCVs to
         * be removed first.
         */
        hashfunctions = hashtable->outer_hashfunctions;

        for (i = 0; i < mcvsToUse; i++)
        {
            uint32      hashvalue;
            int         bucket;

            hashvalue = DatumGetUInt32(FunctionCall1(&hashfunctions[0],
                                                     values[i]));

            /*
             * While we have not hit a hole in the hashtable and have not hit
             * the desired bucket, we have collided with some previous hash
             * value, so try the next bucket location.  NB: this code must
             * match ExecHashGetSkewBucket.
             */
            bucket = hashvalue & (nbuckets - 1);
            while (hashtable->skewBucket[bucket] != NULL &&
                   hashtable->skewBucket[bucket]->hashvalue != hashvalue)
                bucket = (bucket + 1) & (nbuckets - 1);

            /*
             * If we found an existing bucket with the same hashvalue, leave
             * it alone.  It's okay for two MCVs to share a hashvalue.
             */
            if (hashtable->skewBucket[bucket] != NULL)
                continue;

            /* Okay, create a new skew bucket for this hashvalue. */
            hashtable->skewBucket[bucket] = (HashSkewBucket *)
                MemoryContextAlloc(hashtable->batchCxt,
                                   sizeof(HashSkewBucket));
            hashtable->skewBucket[bucket]->hashvalue = hashvalue;
            hashtable->skewBucket[bucket]->tuples = NULL;
            hashtable->skewBucketNums[hashtable->nSkewBuckets] = bucket;
            hashtable->nSkewBuckets++;
            hashtable->spaceUsed += SKEW_BUCKET_OVERHEAD;
            hashtable->spaceUsedSkew += SKEW_BUCKET_OVERHEAD;
        }

        free_attstatsslot(node->skewColType,
                          values, nvalues, numbers, nnumbers);
    }

    ReleaseSysCache(statsTuple);
}

/*
 * ExecHashGetSkewBucket
 *
 *		Returns the index of the skew bucket for this hashvalue,
 *		or INVALID_SKEW_BUCKET_NO if the hashvalue is not
 *		associated with any active skew bucket.
 */
int
ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue)
{
    int         bucket;

    /*
     * Always return INVALID_SKEW_BUCKET_NO if not doing skew optimization (in
     * particular, this happens after the initial batch is done).
     */
    if (!hashtable->skewEnabled)
        return INVALID_SKEW_BUCKET_NO;

    /*
     * Since skewBucketLen is a power of 2, we can do a modulo by ANDing.
     */
    bucket = hashvalue & (hashtable->skewBucketLen - 1);

    /*
     * While we have not hit a hole in the hashtable and have not hit the
     * desired bucket, we have collided with some other hash value, so try the
     * next bucket location.
     */
    while (hashtable->skewBucket[bucket] != NULL &&
           hashtable->skewBucket[bucket]->hashvalue != hashvalue)
        bucket = (bucket + 1) & (hashtable->skewBucketLen - 1);

    /*
     * Found the desired bucket?
     */
    if (hashtable->skewBucket[bucket] != NULL)
        return bucket;

    /*
     * There must not be any hashtable entry for this hash value.
     */
    return INVALID_SKEW_BUCKET_NO;
}

/*
 * ExecHashSkewTableInsert
 *
 *		Insert a tuple into the skew hashtable.
 *
 * This should generally match up with the current-batch case in
 * ExecHashTableInsert.
 */
static void
ExecHashSkewTableInsert(HashJoinTable hashtable,
                        TupleTableSlot *slot,
                        uint32 hashvalue,
                        int bucketNumber)
{
    MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot);
    HashJoinTuple hashTuple;
    int         hashTupleSize;

    /* Create the HashJoinTuple */
    hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
    hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
                                                   hashTupleSize);
    hashTuple->hashvalue = hashvalue;
    memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);

    /* Push it onto the front of the skew bucket's list */
    hashTuple->next = hashtable->skewBucket[bucketNumber]->tuples;
    hashtable->skewBucket[bucketNumber]->tuples = hashTuple;

    /* Account for space used, and back off if we've used too much */
    hashtable->spaceUsed += hashTupleSize;
    hashtable->spaceUsedSkew += hashTupleSize;
    while (hashtable->spaceUsedSkew > hashtable->spaceAllowedSkew)
        ExecHashRemoveNextSkewBucket(hashtable);

    /* Check we are not over the total spaceAllowed, either */
    if (hashtable->spaceUsed > hashtable->spaceAllowed)
        ExecHashIncreaseNumBatches(hashtable);
}

/*
 * ExecHashRemoveNextSkewBucket
 *
 *		Remove the least valuable skew bucket by pushing its tuples into
 *		the main hash table.
 */
static void
ExecHashRemoveNextSkewBucket(HashJoinTable hashtable)
{
    int         bucketToRemove;
    HashSkewBucket *bucket;
    uint32      hashvalue;
    int         bucketno;
    int         batchno;
    HashJoinTuple hashTuple;

    /* Locate the bucket to remove */
    bucketToRemove = hashtable->skewBucketNums[hashtable->nSkewBuckets - 1];
    bucket = hashtable->skewBucket[bucketToRemove];

    /*
     * Calculate which bucket and batch the tuples belong to in the main
     * hashtable.  They all have the same hash value, so it's the same for all
     * of them.  Also note that it's not possible for nbatch to increase while
     * we are processing the tuples.
     */
    hashvalue = bucket->hashvalue;
    ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);

    /* Process all tuples in the bucket */
    hashTuple = bucket->tuples;
    while (hashTuple != NULL)
    {
        HashJoinTuple nextHashTuple = hashTuple->next;
        MinimalTuple tuple;
        Size        tupleSize;

        /*
         * This code must agree with ExecHashTableInsert.  We do not use
         * ExecHashTableInsert directly as ExecHashTableInsert expects a
         * TupleTableSlot while we already have HashJoinTuples.
         */
        tuple = HJTUPLE_MINTUPLE(hashTuple);
        tupleSize = HJTUPLE_OVERHEAD + tuple->t_len;

        /* Decide whether to put the tuple in the hash table or a temp file */
        if (batchno == hashtable->curbatch)
        {
            /* Move the tuple to the main hash table */
            hashTuple->next = hashtable->buckets[bucketno];
            hashtable->buckets[bucketno] = hashTuple;
            /* We have reduced skew space, but overall space doesn't change */
            hashtable->spaceUsedSkew -= tupleSize;
        }
        else
        {
            /* Put the tuple into a temp file for later batches */
            Assert(batchno > hashtable->curbatch);
            ExecHashJoinSaveTuple(tuple, hashvalue,
                                  &hashtable->innerBatchFile[batchno]);
            pfree(hashTuple);
            hashtable->spaceUsed -= tupleSize;
            hashtable->spaceUsedSkew -= tupleSize;
        }

        hashTuple = nextHashTuple;
    }

    /*
     * Free the bucket struct itself and reset the hashtable entry to NULL.
     *
     * NOTE: this is not nearly as simple as it looks on the surface, because
     * of the possibility of collisions in the hashtable.  Suppose that hash
     * values A and B collide at a particular hashtable entry, and that A was
     * entered first so B gets shifted to a different table entry.  If we were
     * to remove A first then ExecHashGetSkewBucket would mistakenly start
     * reporting that B is not in the hashtable, because it would hit the NULL
     * before finding B.  However, we always remove entries in the reverse
     * order of creation, so this failure cannot happen.
     */
    hashtable->skewBucket[bucketToRemove] = NULL;
    hashtable->nSkewBuckets--;
    pfree(bucket);
    hashtable->spaceUsed -= SKEW_BUCKET_OVERHEAD;
    hashtable->spaceUsedSkew -= SKEW_BUCKET_OVERHEAD;

    /*
     * If we have removed all skew buckets then give up on skew optimization.
     * Release the arrays since they aren't useful any more.
     */
    if (hashtable->nSkewBuckets == 0)
    {
        hashtable->skewEnabled = false;
        pfree(hashtable->skewBucket);
        pfree(hashtable->skewBucketNums);
        hashtable->skewBucket = NULL;
        hashtable->skewBucketNums = NULL;
        hashtable->spaceUsed -= hashtable->spaceUsedSkew;
        hashtable->spaceUsedSkew = 0;
    }
}