1 /*-------------------------------------------------------------------------
4 * Routines to hash relations for hashjoin
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
13 *-------------------------------------------------------------------------
17 * MultiExecHash - generate an in-memory hash table of the relation
18 * ExecInitHash - initialize node and subnodes
19 * ExecEndHash - shutdown node and subnodes
27 #include "catalog/pg_statistic.h"
28 #include "commands/tablespace.h"
29 #include "executor/execdebug.h"
30 #include "executor/hashjoin.h"
31 #include "executor/instrument.h"
32 #include "executor/nodeHash.h"
33 #include "executor/nodeHashjoin.h"
34 #include "miscadmin.h"
35 #include "parser/parse_expr.h"
36 #include "utils/dynahash.h"
37 #include "utils/memutils.h"
38 #include "utils/lsyscache.h"
39 #include "utils/syscache.h"
42 static void ExecHashIncreaseNumBatches(HashJoinTable hashtable
);
43 static void ExecHashBuildSkewHash(HashJoinTable hashtable
, Hash
*node
,
45 static void ExecHashSkewTableInsert(HashJoinTable hashtable
,
49 static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable
);
52 /* ----------------------------------------------------------------
55 * stub for pro forma compliance
56 * ----------------------------------------------------------------
59 ExecHash(HashState
*node
)
61 elog(ERROR
, "Hash node does not support ExecProcNode call convention");
65 /* ----------------------------------------------------------------
68 * build hash table for hashjoin, doing partitioning if more
69 * than one batch is required.
70 * ----------------------------------------------------------------
73 MultiExecHash(HashState
*node
)
77 HashJoinTable hashtable
;
79 ExprContext
*econtext
;
82 /* must provide our own instrumentation support */
83 if (node
->ps
.instrument
)
84 InstrStartNode(node
->ps
.instrument
);
87 * get state info from node
89 outerNode
= outerPlanState(node
);
90 hashtable
= node
->hashtable
;
93 * set expression context
95 hashkeys
= node
->hashkeys
;
96 econtext
= node
->ps
.ps_ExprContext
;
99 * get all inner tuples and insert into the hash table (or temp files)
103 slot
= ExecProcNode(outerNode
);
106 /* We have to compute the hash value */
107 econtext
->ecxt_innertuple
= slot
;
108 if (ExecHashGetHashValue(hashtable
, econtext
, hashkeys
, false, false,
113 bucketNumber
= ExecHashGetSkewBucket(hashtable
, hashvalue
);
114 if (bucketNumber
!= INVALID_SKEW_BUCKET_NO
)
116 /* It's a skew tuple, so put it into that hash table */
117 ExecHashSkewTableInsert(hashtable
, slot
, hashvalue
,
122 /* Not subject to skew optimization, so insert normally */
123 ExecHashTableInsert(hashtable
, slot
, hashvalue
);
125 hashtable
->totalTuples
+= 1;
129 /* must provide our own instrumentation support */
130 if (node
->ps
.instrument
)
131 InstrStopNode(node
->ps
.instrument
, hashtable
->totalTuples
);
134 * We do not return the hash table directly because it's not a subtype of
135 * Node, and so would violate the MultiExecProcNode API. Instead, our
136 * parent Hashjoin node is expected to know how to fish it out of our node
137 * state. Ugly but not really worth cleaning up, since Hashjoin knows
138 * quite a bit more about Hash besides that.
143 /* ----------------------------------------------------------------
146 * Init routine for Hash node
147 * ----------------------------------------------------------------
150 ExecInitHash(Hash
*node
, EState
*estate
, int eflags
)
152 HashState
*hashstate
;
154 /* check for unsupported flags */
155 Assert(!(eflags
& (EXEC_FLAG_BACKWARD
| EXEC_FLAG_MARK
)));
158 * create state structure
160 hashstate
= makeNode(HashState
);
161 hashstate
->ps
.plan
= (Plan
*) node
;
162 hashstate
->ps
.state
= estate
;
163 hashstate
->hashtable
= NULL
;
164 hashstate
->hashkeys
= NIL
; /* will be set by parent HashJoin */
167 * Miscellaneous initialization
169 * create expression context for node
171 ExecAssignExprContext(estate
, &hashstate
->ps
);
173 #define HASH_NSLOTS 1
176 * initialize our result slot
178 ExecInitResultTupleSlot(estate
, &hashstate
->ps
);
181 * initialize child expressions
183 hashstate
->ps
.targetlist
= (List
*)
184 ExecInitExpr((Expr
*) node
->plan
.targetlist
,
185 (PlanState
*) hashstate
);
186 hashstate
->ps
.qual
= (List
*)
187 ExecInitExpr((Expr
*) node
->plan
.qual
,
188 (PlanState
*) hashstate
);
191 * initialize child nodes
193 outerPlanState(hashstate
) = ExecInitNode(outerPlan(node
), estate
, eflags
);
196 * initialize tuple type. no need to initialize projection info because
197 * this node doesn't do projections
199 ExecAssignResultTypeFromTL(&hashstate
->ps
);
200 hashstate
->ps
.ps_ProjInfo
= NULL
;
206 ExecCountSlotsHash(Hash
*node
)
208 return ExecCountSlotsNode(outerPlan(node
)) +
209 ExecCountSlotsNode(innerPlan(node
)) +
213 /* ---------------------------------------------------------------
216 * clean up routine for Hash node
217 * ----------------------------------------------------------------
220 ExecEndHash(HashState
*node
)
222 PlanState
*outerPlan
;
227 ExecFreeExprContext(&node
->ps
);
230 * shut down the subplan
232 outerPlan
= outerPlanState(node
);
233 ExecEndNode(outerPlan
);
237 /* ----------------------------------------------------------------
238 * ExecHashTableCreate
240 * create an empty hashtable data structure for hashjoin.
241 * ----------------------------------------------------------------
244 ExecHashTableCreate(Hash
*node
, List
*hashOperators
)
246 HashJoinTable hashtable
;
255 MemoryContext oldcxt
;
258 * Get information about the size of the relation to be hashed (it's the
259 * "outer" subtree of this node, but the inner relation of the hashjoin).
260 * Compute the appropriate size of the hash table.
262 outerNode
= outerPlan(node
);
264 ExecChooseHashTableSize(outerNode
->plan_rows
, outerNode
->plan_width
,
265 OidIsValid(node
->skewTable
),
266 &nbuckets
, &nbatch
, &num_skew_mcvs
);
269 printf("nbatch = %d, nbuckets = %d\n", nbatch
, nbuckets
);
272 /* nbuckets must be a power of 2 */
273 log2_nbuckets
= my_log2(nbuckets
);
274 Assert(nbuckets
== (1 << log2_nbuckets
));
277 * Initialize the hash table control block.
279 * The hashtable control block is just palloc'd from the executor's
280 * per-query memory context.
282 hashtable
= (HashJoinTable
) palloc(sizeof(HashJoinTableData
));
283 hashtable
->nbuckets
= nbuckets
;
284 hashtable
->log2_nbuckets
= log2_nbuckets
;
285 hashtable
->buckets
= NULL
;
286 hashtable
->skewEnabled
= false;
287 hashtable
->skewBucket
= NULL
;
288 hashtable
->skewBucketLen
= 0;
289 hashtable
->nSkewBuckets
= 0;
290 hashtable
->skewBucketNums
= NULL
;
291 hashtable
->nbatch
= nbatch
;
292 hashtable
->curbatch
= 0;
293 hashtable
->nbatch_original
= nbatch
;
294 hashtable
->nbatch_outstart
= nbatch
;
295 hashtable
->growEnabled
= true;
296 hashtable
->totalTuples
= 0;
297 hashtable
->innerBatchFile
= NULL
;
298 hashtable
->outerBatchFile
= NULL
;
299 hashtable
->spaceUsed
= 0;
300 hashtable
->spaceAllowed
= work_mem
* 1024L;
301 hashtable
->spaceUsedSkew
= 0;
302 hashtable
->spaceAllowedSkew
=
303 hashtable
->spaceAllowed
* SKEW_WORK_MEM_PERCENT
/ 100;
306 * Get info about the hash functions to be used for each hash key. Also
307 * remember whether the join operators are strict.
309 nkeys
= list_length(hashOperators
);
310 hashtable
->outer_hashfunctions
=
311 (FmgrInfo
*) palloc(nkeys
* sizeof(FmgrInfo
));
312 hashtable
->inner_hashfunctions
=
313 (FmgrInfo
*) palloc(nkeys
* sizeof(FmgrInfo
));
314 hashtable
->hashStrict
= (bool *) palloc(nkeys
* sizeof(bool));
316 foreach(ho
, hashOperators
)
318 Oid hashop
= lfirst_oid(ho
);
322 if (!get_op_hash_functions(hashop
, &left_hashfn
, &right_hashfn
))
323 elog(ERROR
, "could not find hash function for hash operator %u",
325 fmgr_info(left_hashfn
, &hashtable
->outer_hashfunctions
[i
]);
326 fmgr_info(right_hashfn
, &hashtable
->inner_hashfunctions
[i
]);
327 hashtable
->hashStrict
[i
] = op_strict(hashop
);
332 * Create temporary memory contexts in which to keep the hashtable working
333 * storage. See notes in executor/hashjoin.h.
335 hashtable
->hashCxt
= AllocSetContextCreate(CurrentMemoryContext
,
337 ALLOCSET_DEFAULT_MINSIZE
,
338 ALLOCSET_DEFAULT_INITSIZE
,
339 ALLOCSET_DEFAULT_MAXSIZE
);
341 hashtable
->batchCxt
= AllocSetContextCreate(hashtable
->hashCxt
,
343 ALLOCSET_DEFAULT_MINSIZE
,
344 ALLOCSET_DEFAULT_INITSIZE
,
345 ALLOCSET_DEFAULT_MAXSIZE
);
347 /* Allocate data that will live for the life of the hashjoin */
349 oldcxt
= MemoryContextSwitchTo(hashtable
->hashCxt
);
354 * allocate and initialize the file arrays in hashCxt
356 hashtable
->innerBatchFile
= (BufFile
**)
357 palloc0(nbatch
* sizeof(BufFile
*));
358 hashtable
->outerBatchFile
= (BufFile
**)
359 palloc0(nbatch
* sizeof(BufFile
*));
360 /* The files will not be opened until needed... */
361 /* ... but make sure we have temp tablespaces established for them */
362 PrepareTempTablespaces();
366 * Prepare context for the first-scan space allocations; allocate the
367 * hashbucket array therein, and set each bucket "empty".
369 MemoryContextSwitchTo(hashtable
->batchCxt
);
371 hashtable
->buckets
= (HashJoinTuple
*)
372 palloc0(nbuckets
* sizeof(HashJoinTuple
));
375 * Set up for skew optimization, if possible and there's a need for more
376 * than one batch. (In a one-batch join, there's no point in it.)
379 ExecHashBuildSkewHash(hashtable
, node
, num_skew_mcvs
);
381 MemoryContextSwitchTo(oldcxt
);
388 * Compute appropriate size for hashtable given the estimated size of the
389 * relation to be hashed (number of rows and average row width).
391 * This is exported so that the planner's costsize.c can use it.
394 /* Target bucket loading (tuples per bucket) */
395 #define NTUP_PER_BUCKET 10
398 ExecChooseHashTableSize(double ntuples
, int tupwidth
, bool useskew
,
404 double inner_rel_bytes
;
405 long hash_table_bytes
;
406 long skew_table_bytes
;
411 /* Force a plausible relation size if no info */
416 * Estimate tupsize based on footprint of tuple in hashtable... note this
417 * does not allow for any palloc overhead. The manipulations of spaceUsed
418 * don't count palloc overhead either.
420 tupsize
= HJTUPLE_OVERHEAD
+
421 MAXALIGN(sizeof(MinimalTupleData
)) +
423 inner_rel_bytes
= ntuples
* tupsize
;
426 * Target in-memory hashtable size is work_mem kilobytes.
428 hash_table_bytes
= work_mem
* 1024L;
431 * If skew optimization is possible, estimate the number of skew buckets
432 * that will fit in the memory allowed, and decrement the assumed space
433 * available for the main hash table accordingly.
435 * We make the optimistic assumption that each skew bucket will contain
436 * one inner-relation tuple. If that turns out to be low, we will recover
437 * at runtime by reducing the number of skew buckets.
439 * hashtable->skewBucket will have up to 8 times as many HashSkewBucket
440 * pointers as the number of MCVs we allow, since ExecHashBuildSkewHash
441 * will round up to the next power of 2 and then multiply by 4 to reduce
446 skew_table_bytes
= hash_table_bytes
* SKEW_WORK_MEM_PERCENT
/ 100;
448 *num_skew_mcvs
= skew_table_bytes
/ (
449 /* size of a hash tuple */
451 /* worst-case size of skewBucket[] per MCV */
452 (8 * sizeof(HashSkewBucket
*)) +
453 /* size of skewBucketNums[] entry */
455 /* size of skew bucket struct itself */
459 if (*num_skew_mcvs
> 0)
460 hash_table_bytes
-= skew_table_bytes
;
466 * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
467 * memory is filled. Set nbatch to the smallest power of 2 that appears
470 if (inner_rel_bytes
> hash_table_bytes
)
472 /* We'll need multiple batches */
477 lbuckets
= (hash_table_bytes
/ tupsize
) / NTUP_PER_BUCKET
;
478 lbuckets
= Min(lbuckets
, INT_MAX
/ 2);
479 nbuckets
= (int) lbuckets
;
481 dbatch
= ceil(inner_rel_bytes
/ hash_table_bytes
);
482 dbatch
= Min(dbatch
, INT_MAX
/ 2);
483 minbatch
= (int) dbatch
;
485 while (nbatch
< minbatch
)
490 /* We expect the hashtable to fit in memory */
493 dbuckets
= ceil(ntuples
/ NTUP_PER_BUCKET
);
494 dbuckets
= Min(dbuckets
, INT_MAX
/ 2);
495 nbuckets
= (int) dbuckets
;
501 * Both nbuckets and nbatch must be powers of 2 to make
502 * ExecHashGetBucketAndBatch fast. We already fixed nbatch; now inflate
503 * nbuckets to the next larger power of 2. We also force nbuckets to not
504 * be real small, by starting the search at 2^10.
507 while ((1 << i
) < nbuckets
)
511 *numbuckets
= nbuckets
;
512 *numbatches
= nbatch
;
516 /* ----------------------------------------------------------------
517 * ExecHashTableDestroy
519 * destroy a hash table
520 * ----------------------------------------------------------------
523 ExecHashTableDestroy(HashJoinTable hashtable
)
528 * Make sure all the temp files are closed. We skip batch 0, since it
529 * can't have any temp files (and the arrays might not even exist if
532 for (i
= 1; i
< hashtable
->nbatch
; i
++)
534 if (hashtable
->innerBatchFile
[i
])
535 BufFileClose(hashtable
->innerBatchFile
[i
]);
536 if (hashtable
->outerBatchFile
[i
])
537 BufFileClose(hashtable
->outerBatchFile
[i
]);
540 /* Release working memory (batchCxt is a child, so it goes away too) */
541 MemoryContextDelete(hashtable
->hashCxt
);
543 /* And drop the control block */
548 * ExecHashIncreaseNumBatches
549 * increase the original number of batches in order to reduce
550 * current memory consumption
553 ExecHashIncreaseNumBatches(HashJoinTable hashtable
)
555 int oldnbatch
= hashtable
->nbatch
;
556 int curbatch
= hashtable
->curbatch
;
559 MemoryContext oldcxt
;
563 /* do nothing if we've decided to shut off growth */
564 if (!hashtable
->growEnabled
)
567 /* safety check to avoid overflow */
568 if (oldnbatch
> INT_MAX
/ 2)
571 nbatch
= oldnbatch
* 2;
575 printf("Increasing nbatch to %d because space = %lu\n",
576 nbatch
, (unsigned long) hashtable
->spaceUsed
);
579 oldcxt
= MemoryContextSwitchTo(hashtable
->hashCxt
);
581 if (hashtable
->innerBatchFile
== NULL
)
583 /* we had no file arrays before */
584 hashtable
->innerBatchFile
= (BufFile
**)
585 palloc0(nbatch
* sizeof(BufFile
*));
586 hashtable
->outerBatchFile
= (BufFile
**)
587 palloc0(nbatch
* sizeof(BufFile
*));
588 /* time to establish the temp tablespaces, too */
589 PrepareTempTablespaces();
593 /* enlarge arrays and zero out added entries */
594 hashtable
->innerBatchFile
= (BufFile
**)
595 repalloc(hashtable
->innerBatchFile
, nbatch
* sizeof(BufFile
*));
596 hashtable
->outerBatchFile
= (BufFile
**)
597 repalloc(hashtable
->outerBatchFile
, nbatch
* sizeof(BufFile
*));
598 MemSet(hashtable
->innerBatchFile
+ oldnbatch
, 0,
599 (nbatch
- oldnbatch
) * sizeof(BufFile
*));
600 MemSet(hashtable
->outerBatchFile
+ oldnbatch
, 0,
601 (nbatch
- oldnbatch
) * sizeof(BufFile
*));
604 MemoryContextSwitchTo(oldcxt
);
606 hashtable
->nbatch
= nbatch
;
609 * Scan through the existing hash table entries and dump out any that are
610 * no longer of the current batch.
612 ninmemory
= nfreed
= 0;
614 for (i
= 0; i
< hashtable
->nbuckets
; i
++)
616 HashJoinTuple prevtuple
;
620 tuple
= hashtable
->buckets
[i
];
622 while (tuple
!= NULL
)
624 /* save link in case we delete */
625 HashJoinTuple nexttuple
= tuple
->next
;
630 ExecHashGetBucketAndBatch(hashtable
, tuple
->hashvalue
,
631 &bucketno
, &batchno
);
632 Assert(bucketno
== i
);
633 if (batchno
== curbatch
)
641 Assert(batchno
> curbatch
);
642 ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(tuple
),
644 &hashtable
->innerBatchFile
[batchno
]);
645 /* and remove from hash table */
647 prevtuple
->next
= nexttuple
;
649 hashtable
->buckets
[i
] = nexttuple
;
650 /* prevtuple doesn't change */
651 hashtable
->spaceUsed
-=
652 HJTUPLE_OVERHEAD
+ HJTUPLE_MINTUPLE(tuple
)->t_len
;
662 printf("Freed %ld of %ld tuples, space now %lu\n",
663 nfreed
, ninmemory
, (unsigned long) hashtable
->spaceUsed
);
667 * If we dumped out either all or none of the tuples in the table, disable
668 * further expansion of nbatch. This situation implies that we have
669 * enough tuples of identical hashvalues to overflow spaceAllowed.
670 * Increasing nbatch will not fix it since there's no way to subdivide the
671 * group any more finely. We have to just gut it out and hope the server
674 if (nfreed
== 0 || nfreed
== ninmemory
)
676 hashtable
->growEnabled
= false;
678 printf("Disabling further increase of nbatch\n");
684 * ExecHashTableInsert
685 * insert a tuple into the hash table depending on the hash value
686 * it may just go to a temp file for later batches
688 * Note: the passed TupleTableSlot may contain a regular, minimal, or virtual
689 * tuple; the minimal case in particular is certain to happen while reloading
690 * tuples from batch files. We could save some cycles in the regular-tuple
691 * case by not forcing the slot contents into minimal form; not clear if it's
692 * worth the messiness required.
695 ExecHashTableInsert(HashJoinTable hashtable
,
696 TupleTableSlot
*slot
,
699 MinimalTuple tuple
= ExecFetchSlotMinimalTuple(slot
);
703 ExecHashGetBucketAndBatch(hashtable
, hashvalue
,
704 &bucketno
, &batchno
);
707 * decide whether to put the tuple in the hash table or a temp file
709 if (batchno
== hashtable
->curbatch
)
712 * put the tuple in hash table
714 HashJoinTuple hashTuple
;
717 hashTupleSize
= HJTUPLE_OVERHEAD
+ tuple
->t_len
;
718 hashTuple
= (HashJoinTuple
) MemoryContextAlloc(hashtable
->batchCxt
,
720 hashTuple
->hashvalue
= hashvalue
;
721 memcpy(HJTUPLE_MINTUPLE(hashTuple
), tuple
, tuple
->t_len
);
722 hashTuple
->next
= hashtable
->buckets
[bucketno
];
723 hashtable
->buckets
[bucketno
] = hashTuple
;
724 hashtable
->spaceUsed
+= hashTupleSize
;
725 if (hashtable
->spaceUsed
> hashtable
->spaceAllowed
)
726 ExecHashIncreaseNumBatches(hashtable
);
731 * put the tuple into a temp file for later batches
733 Assert(batchno
> hashtable
->curbatch
);
734 ExecHashJoinSaveTuple(tuple
,
736 &hashtable
->innerBatchFile
[batchno
]);
741 * ExecHashGetHashValue
742 * Compute the hash value for a tuple
744 * The tuple to be tested must be in either econtext->ecxt_outertuple or
745 * econtext->ecxt_innertuple. Vars in the hashkeys expressions reference
746 * either OUTER or INNER.
748 * A TRUE result means the tuple's hash value has been successfully computed
749 * and stored at *hashvalue. A FALSE result means the tuple cannot match
750 * because it contains a null attribute, and hence it should be discarded
751 * immediately. (If keep_nulls is true then FALSE is never returned.)
754 ExecHashGetHashValue(HashJoinTable hashtable
,
755 ExprContext
*econtext
,
762 FmgrInfo
*hashfunctions
;
765 MemoryContext oldContext
;
768 * We reset the eval context each time to reclaim any memory leaked in the
769 * hashkey expressions.
771 ResetExprContext(econtext
);
773 oldContext
= MemoryContextSwitchTo(econtext
->ecxt_per_tuple_memory
);
776 hashfunctions
= hashtable
->outer_hashfunctions
;
778 hashfunctions
= hashtable
->inner_hashfunctions
;
780 foreach(hk
, hashkeys
)
782 ExprState
*keyexpr
= (ExprState
*) lfirst(hk
);
786 /* rotate hashkey left 1 bit at each step */
787 hashkey
= (hashkey
<< 1) | ((hashkey
& 0x80000000) ? 1 : 0);
790 * Get the join attribute value of the tuple
792 keyval
= ExecEvalExpr(keyexpr
, econtext
, &isNull
, NULL
);
795 * If the attribute is NULL, and the join operator is strict, then
796 * this tuple cannot pass the join qual so we can reject it
797 * immediately (unless we're scanning the outside of an outer join, in
798 * which case we must not reject it). Otherwise we act like the
799 * hashcode of NULL is zero (this will support operators that act like
800 * IS NOT DISTINCT, though not any more-random behavior). We treat
801 * the hash support function as strict even if the operator is not.
803 * Note: currently, all hashjoinable operators must be strict since
804 * the hash index AM assumes that. However, it takes so little extra
805 * code here to allow non-strict that we may as well do it.
809 if (hashtable
->hashStrict
[i
] && !keep_nulls
)
811 MemoryContextSwitchTo(oldContext
);
812 return false; /* cannot match */
814 /* else, leave hashkey unmodified, equivalent to hashcode 0 */
818 /* Compute the hash function */
821 hkey
= DatumGetUInt32(FunctionCall1(&hashfunctions
[i
], keyval
));
828 MemoryContextSwitchTo(oldContext
);
830 *hashvalue
= hashkey
;
835 * ExecHashGetBucketAndBatch
836 * Determine the bucket number and batch number for a hash value
838 * Note: on-the-fly increases of nbatch must not change the bucket number
839 * for a given hash code (since we don't move tuples to different hash
840 * chains), and must only cause the batch number to remain the same or
841 * increase. Our algorithm is
842 * bucketno = hashvalue MOD nbuckets
843 * batchno = (hashvalue DIV nbuckets) MOD nbatch
844 * where nbuckets and nbatch are both expected to be powers of 2, so we can
845 * do the computations by shifting and masking. (This assumes that all hash
846 * functions are good about randomizing all their output bits, else we are
847 * likely to have very skewed bucket or batch occupancy.)
849 * nbuckets doesn't change over the course of the join.
851 * nbatch is always a power of 2; we increase it only by doubling it. This
852 * effectively adds one more bit to the top of the batchno.
855 ExecHashGetBucketAndBatch(HashJoinTable hashtable
,
860 uint32 nbuckets
= (uint32
) hashtable
->nbuckets
;
861 uint32 nbatch
= (uint32
) hashtable
->nbatch
;
865 /* we can do MOD by masking, DIV by shifting */
866 *bucketno
= hashvalue
& (nbuckets
- 1);
867 *batchno
= (hashvalue
>> hashtable
->log2_nbuckets
) & (nbatch
- 1);
871 *bucketno
= hashvalue
& (nbuckets
- 1);
878 * scan a hash bucket for matches to the current outer tuple
880 * The current outer tuple must be stored in econtext->ecxt_outertuple.
883 ExecScanHashBucket(HashJoinState
*hjstate
,
884 ExprContext
*econtext
)
886 List
*hjclauses
= hjstate
->hashclauses
;
887 HashJoinTable hashtable
= hjstate
->hj_HashTable
;
888 HashJoinTuple hashTuple
= hjstate
->hj_CurTuple
;
889 uint32 hashvalue
= hjstate
->hj_CurHashValue
;
892 * hj_CurTuple is the address of the tuple last returned from the current
893 * bucket, or NULL if it's time to start scanning a new bucket.
895 * If the tuple hashed to a skew bucket then scan the skew bucket
896 * otherwise scan the standard hashtable bucket.
898 if (hashTuple
!= NULL
)
899 hashTuple
= hashTuple
->next
;
900 else if (hjstate
->hj_CurSkewBucketNo
!= INVALID_SKEW_BUCKET_NO
)
901 hashTuple
= hashtable
->skewBucket
[hjstate
->hj_CurSkewBucketNo
]->tuples
;
903 hashTuple
= hashtable
->buckets
[hjstate
->hj_CurBucketNo
];
905 while (hashTuple
!= NULL
)
907 if (hashTuple
->hashvalue
== hashvalue
)
909 TupleTableSlot
*inntuple
;
911 /* insert hashtable's tuple into exec slot so ExecQual sees it */
912 inntuple
= ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple
),
913 hjstate
->hj_HashTupleSlot
,
914 false); /* do not pfree */
915 econtext
->ecxt_innertuple
= inntuple
;
917 /* reset temp memory each time to avoid leaks from qual expr */
918 ResetExprContext(econtext
);
920 if (ExecQual(hjclauses
, econtext
, false))
922 hjstate
->hj_CurTuple
= hashTuple
;
927 hashTuple
= hashTuple
->next
;
939 * reset hash table header for new batch
942 ExecHashTableReset(HashJoinTable hashtable
)
944 MemoryContext oldcxt
;
945 int nbuckets
= hashtable
->nbuckets
;
948 * Release all the hash buckets and tuples acquired in the prior pass, and
949 * reinitialize the context for a new pass.
951 MemoryContextReset(hashtable
->batchCxt
);
952 oldcxt
= MemoryContextSwitchTo(hashtable
->batchCxt
);
954 /* Reallocate and reinitialize the hash bucket headers. */
955 hashtable
->buckets
= (HashJoinTuple
*)
956 palloc0(nbuckets
* sizeof(HashJoinTuple
));
958 hashtable
->spaceUsed
= 0;
960 MemoryContextSwitchTo(oldcxt
);
964 ExecReScanHash(HashState
*node
, ExprContext
*exprCtxt
)
967 * if chgParam of subnode is not null then plan will be re-scanned by
968 * first ExecProcNode.
970 if (((PlanState
*) node
)->lefttree
->chgParam
== NULL
)
971 ExecReScan(((PlanState
*) node
)->lefttree
, exprCtxt
);
976 * ExecHashBuildSkewHash
978 * Set up for skew optimization if we can identify the most common values
979 * (MCVs) of the outer relation's join key. We make a skew hash bucket
980 * for the hash value of each MCV, up to the number of slots allowed
981 * based on available memory.
984 ExecHashBuildSkewHash(HashJoinTable hashtable
, Hash
*node
, int mcvsToUse
)
986 HeapTupleData
*statsTuple
;
992 /* Do nothing if planner didn't identify the outer relation's join key */
993 if (!OidIsValid(node
->skewTable
))
995 /* Also, do nothing if we don't have room for at least one skew bucket */
1000 * Try to find the MCV statistics for the outer relation's join key.
1002 statsTuple
= SearchSysCache(STATRELATT
,
1003 ObjectIdGetDatum(node
->skewTable
),
1004 Int16GetDatum(node
->skewColumn
),
1006 if (!HeapTupleIsValid(statsTuple
))
1009 if (get_attstatsslot(statsTuple
, node
->skewColType
, node
->skewColTypmod
,
1010 STATISTIC_KIND_MCV
, InvalidOid
,
1012 &numbers
, &nnumbers
))
1016 FmgrInfo
*hashfunctions
;
1019 if (mcvsToUse
> nvalues
)
1020 mcvsToUse
= nvalues
;
1023 * Calculate the expected fraction of outer relation that will
1024 * participate in the skew optimization. If this isn't at least
1025 * SKEW_MIN_OUTER_FRACTION, don't use skew optimization.
1028 for (i
= 0; i
< mcvsToUse
; i
++)
1030 if (frac
< SKEW_MIN_OUTER_FRACTION
)
1032 free_attstatsslot(node
->skewColType
,
1033 values
, nvalues
, numbers
, nnumbers
);
1034 ReleaseSysCache(statsTuple
);
1039 * Okay, set up the skew hashtable.
1041 * skewBucket[] is an open addressing hashtable with a power of 2 size
1042 * that is greater than the number of MCV values. (This ensures there
1043 * will be at least one null entry, so searches will always
1046 * Note: this code could fail if mcvsToUse exceeds INT_MAX/8, but that
1047 * is not currently possible since we limit pg_statistic entries to
1048 * much less than that.
1051 while (nbuckets
<= mcvsToUse
)
1053 /* use two more bits just to help avoid collisions */
1056 hashtable
->skewEnabled
= true;
1057 hashtable
->skewBucketLen
= nbuckets
;
1060 * We allocate the bucket memory in the hashtable's batch context. It
1061 * is only needed during the first batch, and this ensures it will be
1062 * automatically removed once the first batch is done.
1064 hashtable
->skewBucket
= (HashSkewBucket
**)
1065 MemoryContextAllocZero(hashtable
->batchCxt
,
1066 nbuckets
* sizeof(HashSkewBucket
*));
1067 hashtable
->skewBucketNums
= (int *)
1068 MemoryContextAllocZero(hashtable
->batchCxt
,
1069 mcvsToUse
* sizeof(int));
1071 hashtable
->spaceUsed
+= nbuckets
* sizeof(HashSkewBucket
*)
1072 + mcvsToUse
* sizeof(int);
1073 hashtable
->spaceUsedSkew
+= nbuckets
* sizeof(HashSkewBucket
*)
1074 + mcvsToUse
* sizeof(int);
1077 * Create a skew bucket for each MCV hash value.
1079 * Note: it is very important that we create the buckets in order of
1080 * decreasing MCV frequency. If we have to remove some buckets, they
1081 * must be removed in reverse order of creation (see notes in
1082 * ExecHashRemoveNextSkewBucket) and we want the least common MCVs to
1085 hashfunctions
= hashtable
->outer_hashfunctions
;
1087 for (i
= 0; i
< mcvsToUse
; i
++)
1092 hashvalue
= DatumGetUInt32(FunctionCall1(&hashfunctions
[0],
1096 * While we have not hit a hole in the hashtable and have not hit
1097 * the desired bucket, we have collided with some previous hash
1098 * value, so try the next bucket location. NB: this code must
1099 * match ExecHashGetSkewBucket.
1101 bucket
= hashvalue
& (nbuckets
- 1);
1102 while (hashtable
->skewBucket
[bucket
] != NULL
&&
1103 hashtable
->skewBucket
[bucket
]->hashvalue
!= hashvalue
)
1104 bucket
= (bucket
+ 1) & (nbuckets
- 1);
1107 * If we found an existing bucket with the same hashvalue, leave
1108 * it alone. It's okay for two MCVs to share a hashvalue.
1110 if (hashtable
->skewBucket
[bucket
] != NULL
)
1113 /* Okay, create a new skew bucket for this hashvalue. */
1114 hashtable
->skewBucket
[bucket
] = (HashSkewBucket
*)
1115 MemoryContextAlloc(hashtable
->batchCxt
,
1116 sizeof(HashSkewBucket
));
1117 hashtable
->skewBucket
[bucket
]->hashvalue
= hashvalue
;
1118 hashtable
->skewBucket
[bucket
]->tuples
= NULL
;
1119 hashtable
->skewBucketNums
[hashtable
->nSkewBuckets
] = bucket
;
1120 hashtable
->nSkewBuckets
++;
1121 hashtable
->spaceUsed
+= SKEW_BUCKET_OVERHEAD
;
1122 hashtable
->spaceUsedSkew
+= SKEW_BUCKET_OVERHEAD
;
1125 free_attstatsslot(node
->skewColType
,
1126 values
, nvalues
, numbers
, nnumbers
);
1129 ReleaseSysCache(statsTuple
);
1133 * ExecHashGetSkewBucket
1135 * Returns the index of the skew bucket for this hashvalue,
1136 * or INVALID_SKEW_BUCKET_NO if the hashvalue is not
1137 * associated with any active skew bucket.
1140 ExecHashGetSkewBucket(HashJoinTable hashtable
, uint32 hashvalue
)
1145 * Always return INVALID_SKEW_BUCKET_NO if not doing skew optimization (in
1146 * particular, this happens after the initial batch is done).
1148 if (!hashtable
->skewEnabled
)
1149 return INVALID_SKEW_BUCKET_NO
;
1152 * Since skewBucketLen is a power of 2, we can do a modulo by ANDing.
1154 bucket
= hashvalue
& (hashtable
->skewBucketLen
- 1);
1157 * While we have not hit a hole in the hashtable and have not hit the
1158 * desired bucket, we have collided with some other hash value, so try the
1159 * next bucket location.
1161 while (hashtable
->skewBucket
[bucket
] != NULL
&&
1162 hashtable
->skewBucket
[bucket
]->hashvalue
!= hashvalue
)
1163 bucket
= (bucket
+ 1) & (hashtable
->skewBucketLen
- 1);
1166 * Found the desired bucket?
1168 if (hashtable
->skewBucket
[bucket
] != NULL
)
1172 * There must not be any hashtable entry for this hash value.
1174 return INVALID_SKEW_BUCKET_NO
;
1178 * ExecHashSkewTableInsert
1180 * Insert a tuple into the skew hashtable.
1182 * This should generally match up with the current-batch case in
1183 * ExecHashTableInsert.
1186 ExecHashSkewTableInsert(HashJoinTable hashtable
,
1187 TupleTableSlot
*slot
,
1191 MinimalTuple tuple
= ExecFetchSlotMinimalTuple(slot
);
1192 HashJoinTuple hashTuple
;
1195 /* Create the HashJoinTuple */
1196 hashTupleSize
= HJTUPLE_OVERHEAD
+ tuple
->t_len
;
1197 hashTuple
= (HashJoinTuple
) MemoryContextAlloc(hashtable
->batchCxt
,
1199 hashTuple
->hashvalue
= hashvalue
;
1200 memcpy(HJTUPLE_MINTUPLE(hashTuple
), tuple
, tuple
->t_len
);
1202 /* Push it onto the front of the skew bucket's list */
1203 hashTuple
->next
= hashtable
->skewBucket
[bucketNumber
]->tuples
;
1204 hashtable
->skewBucket
[bucketNumber
]->tuples
= hashTuple
;
1206 /* Account for space used, and back off if we've used too much */
1207 hashtable
->spaceUsed
+= hashTupleSize
;
1208 hashtable
->spaceUsedSkew
+= hashTupleSize
;
1209 while (hashtable
->spaceUsedSkew
> hashtable
->spaceAllowedSkew
)
1210 ExecHashRemoveNextSkewBucket(hashtable
);
1212 /* Check we are not over the total spaceAllowed, either */
1213 if (hashtable
->spaceUsed
> hashtable
->spaceAllowed
)
1214 ExecHashIncreaseNumBatches(hashtable
);
1218 * ExecHashRemoveNextSkewBucket
1220 * Remove the least valuable skew bucket by pushing its tuples into
1221 * the main hash table.
1224 ExecHashRemoveNextSkewBucket(HashJoinTable hashtable
)
1227 HashSkewBucket
*bucket
;
1231 HashJoinTuple hashTuple
;
1233 /* Locate the bucket to remove */
1234 bucketToRemove
= hashtable
->skewBucketNums
[hashtable
->nSkewBuckets
- 1];
1235 bucket
= hashtable
->skewBucket
[bucketToRemove
];
1238 * Calculate which bucket and batch the tuples belong to in the main
1239 * hashtable. They all have the same hash value, so it's the same for all
1240 * of them. Also note that it's not possible for nbatch to increase while
1241 * we are processing the tuples.
1243 hashvalue
= bucket
->hashvalue
;
1244 ExecHashGetBucketAndBatch(hashtable
, hashvalue
, &bucketno
, &batchno
);
1246 /* Process all tuples in the bucket */
1247 hashTuple
= bucket
->tuples
;
1248 while (hashTuple
!= NULL
)
1250 HashJoinTuple nextHashTuple
= hashTuple
->next
;
1255 * This code must agree with ExecHashTableInsert. We do not use
1256 * ExecHashTableInsert directly as ExecHashTableInsert expects a
1257 * TupleTableSlot while we already have HashJoinTuples.
1259 tuple
= HJTUPLE_MINTUPLE(hashTuple
);
1260 tupleSize
= HJTUPLE_OVERHEAD
+ tuple
->t_len
;
1262 /* Decide whether to put the tuple in the hash table or a temp file */
1263 if (batchno
== hashtable
->curbatch
)
1265 /* Move the tuple to the main hash table */
1266 hashTuple
->next
= hashtable
->buckets
[bucketno
];
1267 hashtable
->buckets
[bucketno
] = hashTuple
;
1268 /* We have reduced skew space, but overall space doesn't change */
1269 hashtable
->spaceUsedSkew
-= tupleSize
;
1273 /* Put the tuple into a temp file for later batches */
1274 Assert(batchno
> hashtable
->curbatch
);
1275 ExecHashJoinSaveTuple(tuple
, hashvalue
,
1276 &hashtable
->innerBatchFile
[batchno
]);
1278 hashtable
->spaceUsed
-= tupleSize
;
1279 hashtable
->spaceUsedSkew
-= tupleSize
;
1282 hashTuple
= nextHashTuple
;
1286 * Free the bucket struct itself and reset the hashtable entry to NULL.
1288 * NOTE: this is not nearly as simple as it looks on the surface, because
1289 * of the possibility of collisions in the hashtable. Suppose that hash
1290 * values A and B collide at a particular hashtable entry, and that A was
1291 * entered first so B gets shifted to a different table entry. If we were
1292 * to remove A first then ExecHashGetSkewBucket would mistakenly start
1293 * reporting that B is not in the hashtable, because it would hit the NULL
1294 * before finding B. However, we always remove entries in the reverse
1295 * order of creation, so this failure cannot happen.
1297 hashtable
->skewBucket
[bucketToRemove
] = NULL
;
1298 hashtable
->nSkewBuckets
--;
1300 hashtable
->spaceUsed
-= SKEW_BUCKET_OVERHEAD
;
1301 hashtable
->spaceUsedSkew
-= SKEW_BUCKET_OVERHEAD
;
1304 * If we have removed all skew buckets then give up on skew optimization.
1305 * Release the arrays since they aren't useful any more.
1307 if (hashtable
->nSkewBuckets
== 0)
1309 hashtable
->skewEnabled
= false;
1310 pfree(hashtable
->skewBucket
);
1311 pfree(hashtable
->skewBucketNums
);
1312 hashtable
->skewBucket
= NULL
;
1313 hashtable
->skewBucketNums
= NULL
;
1314 hashtable
->spaceUsed
-= hashtable
->spaceUsedSkew
;
1315 hashtable
->spaceUsedSkew
= 0;