1 /*-------------------------------------------------------------------------
4 * executor utility routines for grouping, hashing, and aggregation
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/executor/execGrouping.c
13 *-------------------------------------------------------------------------
17 #include "access/parallel.h"
18 #include "common/hashfn.h"
19 #include "executor/executor.h"
20 #include "miscadmin.h"
21 #include "utils/lsyscache.h"
23 struct TupleHashEntryData
25 MinimalTuple firstTuple
; /* copy of first tuple in this group */
26 uint32 status
; /* hash status */
27 uint32 hash
; /* hash value (cached) */
30 static int TupleHashTableMatch(struct tuplehash_hash
*tb
, const MinimalTuple tuple1
, const MinimalTuple tuple2
);
31 static inline uint32
TupleHashTableHash_internal(struct tuplehash_hash
*tb
,
32 const MinimalTuple tuple
);
33 static inline TupleHashEntry
LookupTupleHashEntry_internal(TupleHashTable hashtable
,
35 bool *isnew
, uint32 hash
);
38 * Define parameters for tuple hash table code generation. The interface is
39 * *also* declared in execnodes.h (to generate the types, which are externally
42 #define SH_PREFIX tuplehash
43 #define SH_ELEMENT_TYPE TupleHashEntryData
44 #define SH_KEY_TYPE MinimalTuple
45 #define SH_KEY firstTuple
46 #define SH_HASH_KEY(tb, key) TupleHashTableHash_internal(tb, key)
47 #define SH_EQUAL(tb, a, b) TupleHashTableMatch(tb, a, b) == 0
48 #define SH_SCOPE extern
50 #define SH_GET_HASH(tb, a) a->hash
52 #include "lib/simplehash.h"
55 /*****************************************************************************
56 * Utility routines for grouping tuples together
57 *****************************************************************************/
60 * execTuplesMatchPrepare
61 * Build expression that can be evaluated using ExecQual(), returning
62 * whether an ExprContext's inner/outer tuples are NOT DISTINCT
65 execTuplesMatchPrepare(TupleDesc desc
,
67 const AttrNumber
*keyColIdx
,
68 const Oid
*eqOperators
,
69 const Oid
*collations
,
79 eqFunctions
= (Oid
*) palloc(numCols
* sizeof(Oid
));
81 /* lookup equality functions */
82 for (i
= 0; i
< numCols
; i
++)
83 eqFunctions
[i
] = get_opcode(eqOperators
[i
]);
85 /* build actual expression */
86 expr
= ExecBuildGroupingEqual(desc
, desc
, NULL
, NULL
,
87 numCols
, keyColIdx
, eqFunctions
, collations
,
94 * execTuplesHashPrepare
95 * Look up the equality and hashing functions needed for a TupleHashTable.
97 * This is similar to execTuplesMatchPrepare, but we also need to find the
98 * hash functions associated with the equality operators. *eqFunctions and
99 * *hashFunctions receive the palloc'd result arrays.
101 * Note: we expect that the given operators are not cross-type comparisons.
104 execTuplesHashPrepare(int numCols
,
105 const Oid
*eqOperators
,
107 FmgrInfo
**hashFunctions
)
111 *eqFuncOids
= (Oid
*) palloc(numCols
* sizeof(Oid
));
112 *hashFunctions
= (FmgrInfo
*) palloc(numCols
* sizeof(FmgrInfo
));
114 for (i
= 0; i
< numCols
; i
++)
116 Oid eq_opr
= eqOperators
[i
];
118 Oid left_hash_function
;
119 Oid right_hash_function
;
121 eq_function
= get_opcode(eq_opr
);
122 if (!get_op_hash_functions(eq_opr
,
123 &left_hash_function
, &right_hash_function
))
124 elog(ERROR
, "could not find hash function for hash operator %u",
126 /* We're not supporting cross-type cases here */
127 Assert(left_hash_function
== right_hash_function
);
128 (*eqFuncOids
)[i
] = eq_function
;
129 fmgr_info(right_hash_function
, &(*hashFunctions
)[i
]);
134 /*****************************************************************************
135 * Utility routines for all-in-memory hash tables
137 * These routines build hash tables for grouping tuples together (eg, for
138 * hash aggregation). There is one entry for each not-distinct set of tuples
140 *****************************************************************************/
143 * Construct an empty TupleHashTable
145 * parent: PlanState node that will own this hash table
146 * inputDesc: tuple descriptor for input tuples
147 * inputOps: slot ops for input tuples, or NULL if unknown or not fixed
148 * numCols: number of columns to be compared (length of next 4 arrays)
149 * keyColIdx: indexes of tuple columns to compare
150 * eqfuncoids: OIDs of equality comparison functions to use
151 * hashfunctions: FmgrInfos of datatype-specific hashing functions to use
152 * collations: collations to use in comparisons
153 * nbuckets: initial estimate of hashtable size
154 * additionalsize: size of data stored in ->additional
155 * metacxt: memory context for long-lived allocation, but not per-entry data
156 * tablecxt: memory context in which to store table entries
157 * tempcxt: short-lived context for evaluation hash and comparison functions
158 * use_variable_hash_iv: if true, adjust hash IV per-parallel-worker
160 * The hashfunctions array may be made with execTuplesHashPrepare(). Note they
161 * are not cross-type functions, but expect to see the table datatype(s)
164 * Note that the keyColIdx, hashfunctions, and collations arrays must be
165 * allocated in storage that will live as long as the hashtable does.
168 BuildTupleHashTable(PlanState
*parent
,
170 const TupleTableSlotOps
*inputOps
,
172 AttrNumber
*keyColIdx
,
173 const Oid
*eqfuncoids
,
174 FmgrInfo
*hashfunctions
,
178 MemoryContext metacxt
,
179 MemoryContext tablecxt
,
180 MemoryContext tempcxt
,
181 bool use_variable_hash_iv
)
183 TupleHashTable hashtable
;
184 Size entrysize
= sizeof(TupleHashEntryData
) + additionalsize
;
186 MemoryContext oldcontext
;
190 Assert(nbuckets
> 0);
192 /* Limit initial table size request to not more than hash_mem */
193 hash_mem_limit
= get_hash_memory_limit() / entrysize
;
194 if (nbuckets
> hash_mem_limit
)
195 nbuckets
= hash_mem_limit
;
197 oldcontext
= MemoryContextSwitchTo(metacxt
);
199 hashtable
= (TupleHashTable
) palloc(sizeof(TupleHashTableData
));
201 hashtable
->numCols
= numCols
;
202 hashtable
->keyColIdx
= keyColIdx
;
203 hashtable
->tab_collations
= collations
;
204 hashtable
->tablecxt
= tablecxt
;
205 hashtable
->tempcxt
= tempcxt
;
206 hashtable
->additionalsize
= additionalsize
;
207 hashtable
->tableslot
= NULL
; /* will be made on first lookup */
208 hashtable
->inputslot
= NULL
;
209 hashtable
->in_hash_expr
= NULL
;
210 hashtable
->cur_eq_func
= NULL
;
213 * If parallelism is in use, even if the leader backend is performing the
214 * scan itself, we don't want to create the hashtable exactly the same way
215 * in all workers. As hashtables are iterated over in keyspace-order,
216 * doing so in all processes in the same way is likely to lead to
217 * "unbalanced" hashtables when the table size initially is
220 if (use_variable_hash_iv
)
221 hash_iv
= murmurhash32(ParallelWorkerNumber
);
223 hashtable
->hashtab
= tuplehash_create(metacxt
, nbuckets
, hashtable
);
226 * We copy the input tuple descriptor just for safety --- we assume all
227 * input tuples will have equivalent descriptors.
229 hashtable
->tableslot
= MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc
),
230 &TTSOpsMinimalTuple
);
233 * If the caller fails to make the metacxt different from the tablecxt,
234 * allowing JIT would lead to the generated functions to a) live longer
235 * than the query or b) be re-generated each time the table is being
236 * reset. Therefore prevent JIT from being used in that case, by not
237 * providing a parent node (which prevents accessing the JitContext in the
240 allow_jit
= (metacxt
!= tablecxt
);
242 /* build hash ExprState for all columns */
243 hashtable
->tab_hash_expr
= ExecBuildHash32FromAttrs(inputDesc
,
249 allow_jit
? parent
: NULL
,
252 /* build comparator for all columns */
253 hashtable
->tab_eq_func
= ExecBuildGroupingEqual(inputDesc
, inputDesc
,
257 keyColIdx
, eqfuncoids
, collations
,
258 allow_jit
? parent
: NULL
);
261 * While not pretty, it's ok to not shut down this context, but instead
262 * rely on the containing memory context being reset, as
263 * ExecBuildGroupingEqual() only builds a very simple expression calling
264 * functions (i.e. nothing that'd employ RegisterExprContextCallback()).
266 hashtable
->exprcontext
= CreateStandaloneExprContext();
268 MemoryContextSwitchTo(oldcontext
);
274 * Reset contents of the hashtable to be empty, preserving all the non-content
275 * state. Note that the tablecxt passed to BuildTupleHashTable() should
276 * also be reset, otherwise there will be leaks.
279 ResetTupleHashTable(TupleHashTable hashtable
)
281 tuplehash_reset(hashtable
->hashtab
);
285 * Return size of the hash bucket. Useful for estimating memory usage.
288 TupleHashEntrySize(void)
290 return sizeof(TupleHashEntryData
);
294 * Find or create a hashtable entry for the tuple group containing the
295 * given tuple. The tuple must be the same type as the hashtable entries.
297 * If isnew is NULL, we do not create new entries; we return NULL if no
300 * If hash is not NULL, we set it to the calculated hash value. This allows
301 * callers access to the hash value even if no entry is returned.
303 * If isnew isn't NULL, then a new entry is created if no existing entry
304 * matches. On return, *isnew is true if the entry is newly created,
305 * false if it existed already. ->additional_data in the new entry has
309 LookupTupleHashEntry(TupleHashTable hashtable
, TupleTableSlot
*slot
,
310 bool *isnew
, uint32
*hash
)
312 TupleHashEntry entry
;
313 MemoryContext oldContext
;
316 /* Need to run the hash functions in short-lived context */
317 oldContext
= MemoryContextSwitchTo(hashtable
->tempcxt
);
319 /* set up data needed by hash and match functions */
320 hashtable
->inputslot
= slot
;
321 hashtable
->in_hash_expr
= hashtable
->tab_hash_expr
;
322 hashtable
->cur_eq_func
= hashtable
->tab_eq_func
;
324 local_hash
= TupleHashTableHash_internal(hashtable
->hashtab
, NULL
);
325 entry
= LookupTupleHashEntry_internal(hashtable
, slot
, isnew
, local_hash
);
330 Assert(entry
== NULL
|| entry
->hash
== local_hash
);
332 MemoryContextSwitchTo(oldContext
);
338 * Compute the hash value for a tuple
341 TupleHashTableHash(TupleHashTable hashtable
, TupleTableSlot
*slot
)
343 MemoryContext oldContext
;
346 hashtable
->inputslot
= slot
;
347 hashtable
->in_hash_expr
= hashtable
->tab_hash_expr
;
349 /* Need to run the hash functions in short-lived context */
350 oldContext
= MemoryContextSwitchTo(hashtable
->tempcxt
);
352 hash
= TupleHashTableHash_internal(hashtable
->hashtab
, NULL
);
354 MemoryContextSwitchTo(oldContext
);
360 TupleHashEntryGetTuple(TupleHashEntry entry
)
362 return entry
->firstTuple
;
366 * Get a pointer into the additional space allocated for this entry. The
367 * amount of space available is the additionalsize specified to
368 * BuildTupleHashTable(). If additionalsize was specified as zero, no
369 * additional space is available and this function should not be called.
372 TupleHashEntryGetAdditional(TupleHashEntry entry
)
374 return (char *) entry
->firstTuple
+ MAXALIGN(entry
->firstTuple
->t_len
);
378 * A variant of LookupTupleHashEntry for callers that have already computed
382 LookupTupleHashEntryHash(TupleHashTable hashtable
, TupleTableSlot
*slot
,
383 bool *isnew
, uint32 hash
)
385 TupleHashEntry entry
;
386 MemoryContext oldContext
;
388 /* Need to run the hash functions in short-lived context */
389 oldContext
= MemoryContextSwitchTo(hashtable
->tempcxt
);
391 /* set up data needed by hash and match functions */
392 hashtable
->inputslot
= slot
;
393 hashtable
->in_hash_expr
= hashtable
->tab_hash_expr
;
394 hashtable
->cur_eq_func
= hashtable
->tab_eq_func
;
396 entry
= LookupTupleHashEntry_internal(hashtable
, slot
, isnew
, hash
);
397 Assert(entry
== NULL
|| entry
->hash
== hash
);
399 MemoryContextSwitchTo(oldContext
);
405 * Search for a hashtable entry matching the given tuple. No entry is
406 * created if there's not a match. This is similar to the non-creating
407 * case of LookupTupleHashEntry, except that it supports cross-type
408 * comparisons, in which the given tuple is not of the same type as the
409 * table entries. The caller must provide the hash ExprState to use for
410 * the input tuple, as well as the equality ExprState, since these may be
411 * different from the table's internal functions.
414 FindTupleHashEntry(TupleHashTable hashtable
, TupleTableSlot
*slot
,
418 TupleHashEntry entry
;
419 MemoryContext oldContext
;
422 /* Need to run the hash functions in short-lived context */
423 oldContext
= MemoryContextSwitchTo(hashtable
->tempcxt
);
425 /* Set up data needed by hash and match functions */
426 hashtable
->inputslot
= slot
;
427 hashtable
->in_hash_expr
= hashexpr
;
428 hashtable
->cur_eq_func
= eqcomp
;
430 /* Search the hash table */
431 key
= NULL
; /* flag to reference inputslot */
432 entry
= tuplehash_lookup(hashtable
->hashtab
, key
);
433 MemoryContextSwitchTo(oldContext
);
439 * If tuple is NULL, use the input slot instead. This convention avoids the
440 * need to materialize virtual input tuples unless they actually need to get
441 * copied into the table.
443 * Also, the caller must select an appropriate memory context for running
444 * the hash functions.
447 TupleHashTableHash_internal(struct tuplehash_hash
*tb
,
448 const MinimalTuple tuple
)
450 TupleHashTable hashtable
= (TupleHashTable
) tb
->private_data
;
452 TupleTableSlot
*slot
;
457 /* Process the current input tuple for the table */
458 hashtable
->exprcontext
->ecxt_innertuple
= hashtable
->inputslot
;
459 hashkey
= DatumGetUInt32(ExecEvalExpr(hashtable
->in_hash_expr
,
460 hashtable
->exprcontext
,
466 * Process a tuple already stored in the table.
468 * (this case never actually occurs due to the way simplehash.h is
469 * used, as the hash-value is stored in the entries)
471 slot
= hashtable
->exprcontext
->ecxt_innertuple
= hashtable
->tableslot
;
472 ExecStoreMinimalTuple(tuple
, slot
, false);
473 hashkey
= DatumGetUInt32(ExecEvalExpr(hashtable
->tab_hash_expr
,
474 hashtable
->exprcontext
,
479 * The hashing done above, even with an initial value, doesn't tend to
480 * result in good hash perturbation. Running the value produced above
481 * through murmurhash32 leads to near perfect hash perturbation.
483 return murmurhash32(hashkey
);
487 * Does the work of LookupTupleHashEntry and LookupTupleHashEntryHash. Useful
488 * so that we can avoid switching the memory context multiple times for
489 * LookupTupleHashEntry.
491 * NB: This function may or may not change the memory context. Caller is
492 * expected to change it back.
494 static inline TupleHashEntry
495 LookupTupleHashEntry_internal(TupleHashTable hashtable
, TupleTableSlot
*slot
,
496 bool *isnew
, uint32 hash
)
498 TupleHashEntryData
*entry
;
502 key
= NULL
; /* flag to reference inputslot */
506 entry
= tuplehash_insert_hash(hashtable
->hashtab
, key
, hash
, &found
);
510 /* found pre-existing entry */
515 MinimalTuple firstTuple
;
516 size_t totalsize
; /* including alignment and additionalsize */
518 /* created new entry */
520 /* zero caller data */
521 MemoryContextSwitchTo(hashtable
->tablecxt
);
523 /* Copy the first tuple into the table context */
524 firstTuple
= ExecCopySlotMinimalTuple(slot
);
527 * Allocate additional space right after the MinimalTuple of size
528 * additionalsize. The caller can get a pointer to this data with
529 * TupleHashEntryGetAdditional(), and store arbitrary data there.
531 * This avoids the need to store an extra pointer or allocate an
532 * additional chunk, which would waste memory.
534 totalsize
= MAXALIGN(firstTuple
->t_len
) + hashtable
->additionalsize
;
535 firstTuple
= repalloc(firstTuple
, totalsize
);
536 memset((char *) firstTuple
+ firstTuple
->t_len
, 0,
537 totalsize
- firstTuple
->t_len
);
539 entry
->firstTuple
= firstTuple
;
544 entry
= tuplehash_lookup_hash(hashtable
->hashtab
, key
, hash
);
551 * See whether two tuples (presumably of the same hash value) match
554 TupleHashTableMatch(struct tuplehash_hash
*tb
, const MinimalTuple tuple1
, const MinimalTuple tuple2
)
556 TupleTableSlot
*slot1
;
557 TupleTableSlot
*slot2
;
558 TupleHashTable hashtable
= (TupleHashTable
) tb
->private_data
;
559 ExprContext
*econtext
= hashtable
->exprcontext
;
562 * We assume that simplehash.h will only ever call us with the first
563 * argument being an actual table entry, and the second argument being
564 * LookupTupleHashEntry's dummy TupleHashEntryData. The other direction
565 * could be supported too, but is not currently required.
567 Assert(tuple1
!= NULL
);
568 slot1
= hashtable
->tableslot
;
569 ExecStoreMinimalTuple(tuple1
, slot1
, false);
570 Assert(tuple2
== NULL
);
571 slot2
= hashtable
->inputslot
;
573 /* For crosstype comparisons, the inputslot must be first */
574 econtext
->ecxt_innertuple
= slot2
;
575 econtext
->ecxt_outertuple
= slot1
;
576 return !ExecQualAndReset(hashtable
->cur_eq_func
, econtext
);