src/backend/executor/execGrouping.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * execGrouping.c
   4  *        executor utility routines for grouping, hashing, and aggregation
   5  *
   6  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/executor/execGrouping.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include "access/parallel.h"
  18 #include "common/hashfn.h"
  19 #include "executor/executor.h"
  20 #include "miscadmin.h"
  21 #include "utils/lsyscache.h"
  22
  23 struct TupleHashEntryData
  24 {
  25         MinimalTuple firstTuple;        /* copy of first tuple in this group */
  26         uint32          status;                 /* hash status */
  27         uint32          hash;                   /* hash value (cached) */
  28 };
  29
  30 static int      TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
  31 static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
  32                                                                                                  const MinimalTuple tuple);
  33 static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable,
  34                                                                                                                    TupleTableSlot *slot,
  35                                                                                                                    bool *isnew, uint32 hash);
  36
  37 /*
  38  * Define parameters for tuple hash table code generation. The interface is
  39  * *also* declared in execnodes.h (to generate the types, which are externally
  40  * visible).
  41  */
  42 #define SH_PREFIX tuplehash
  43 #define SH_ELEMENT_TYPE TupleHashEntryData
  44 #define SH_KEY_TYPE MinimalTuple
  45 #define SH_KEY firstTuple
  46 #define SH_HASH_KEY(tb, key) TupleHashTableHash_internal(tb, key)
  47 #define SH_EQUAL(tb, a, b) TupleHashTableMatch(tb, a, b) == 0
  48 #define SH_SCOPE extern
  49 #define SH_STORE_HASH
  50 #define SH_GET_HASH(tb, a) a->hash
  51 #define SH_DEFINE
  52 #include "lib/simplehash.h"
  53
  54
  55 /*****************************************************************************
  56  *              Utility routines for grouping tuples together
  57  *****************************************************************************/
  58
  59 /*
  60  * execTuplesMatchPrepare
  61  *              Build expression that can be evaluated using ExecQual(), returning
  62  *              whether an ExprContext's inner/outer tuples are NOT DISTINCT
  63  */
  64 ExprState *
  65 execTuplesMatchPrepare(TupleDesc desc,
  66                                            int numCols,
  67                                            const AttrNumber *keyColIdx,
  68                                            const Oid *eqOperators,
  69                                            const Oid *collations,
  70                                            PlanState *parent)
  71 {
  72         Oid                *eqFunctions;
  73         int                     i;
  74         ExprState  *expr;
  75
  76         if (numCols == 0)
  77                 return NULL;
  78
  79         eqFunctions = (Oid *) palloc(numCols * sizeof(Oid));
  80
  81         /* lookup equality functions */
  82         for (i = 0; i < numCols; i++)
  83                 eqFunctions[i] = get_opcode(eqOperators[i]);
  84
  85         /* build actual expression */
  86         expr = ExecBuildGroupingEqual(desc, desc, NULL, NULL,
  87                                                                   numCols, keyColIdx, eqFunctions, collations,
  88                                                                   parent);
  89
  90         return expr;
  91 }
  92
  93 /*
  94  * execTuplesHashPrepare
  95  *              Look up the equality and hashing functions needed for a TupleHashTable.
  96  *
  97  * This is similar to execTuplesMatchPrepare, but we also need to find the
  98  * hash functions associated with the equality operators.  *eqFunctions and
  99  * *hashFunctions receive the palloc'd result arrays.
 100  *
 101  * Note: we expect that the given operators are not cross-type comparisons.
 102  */
 103 void
 104 execTuplesHashPrepare(int numCols,
 105                                           const Oid *eqOperators,
 106                                           Oid **eqFuncOids,
 107                                           FmgrInfo **hashFunctions)
 108 {
 109         int                     i;
 110
 111         *eqFuncOids = (Oid *) palloc(numCols * sizeof(Oid));
 112         *hashFunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo));
 113
 114         for (i = 0; i < numCols; i++)
 115         {
 116                 Oid                     eq_opr = eqOperators[i];
 117                 Oid                     eq_function;
 118                 Oid                     left_hash_function;
 119                 Oid                     right_hash_function;
 120
 121                 eq_function = get_opcode(eq_opr);
 122                 if (!get_op_hash_functions(eq_opr,
 123                                                                    &left_hash_function, &right_hash_function))
 124                         elog(ERROR, "could not find hash function for hash operator %u",
 125                                  eq_opr);
 126                 /* We're not supporting cross-type cases here */
 127                 Assert(left_hash_function == right_hash_function);
 128                 (*eqFuncOids)[i] = eq_function;
 129                 fmgr_info(right_hash_function, &(*hashFunctions)[i]);
 130         }
 131 }
 132
 133
 134 /*****************************************************************************
 135  *              Utility routines for all-in-memory hash tables
 136  *
 137  * These routines build hash tables for grouping tuples together (eg, for
 138  * hash aggregation).  There is one entry for each not-distinct set of tuples
 139  * presented.
 140  *****************************************************************************/
 141
 142 /*
 143  * Construct an empty TupleHashTable
 144  *
 145  *      parent: PlanState node that will own this hash table
 146  *      inputDesc: tuple descriptor for input tuples
 147  *      inputOps: slot ops for input tuples, or NULL if unknown or not fixed
 148  *      numCols: number of columns to be compared (length of next 4 arrays)
 149  *      keyColIdx: indexes of tuple columns to compare
 150  *      eqfuncoids: OIDs of equality comparison functions to use
 151  *      hashfunctions: FmgrInfos of datatype-specific hashing functions to use
 152  *      collations: collations to use in comparisons
 153  *      nbuckets: initial estimate of hashtable size
 154  *      additionalsize: size of data stored in ->additional
 155  *      metacxt: memory context for long-lived allocation, but not per-entry data
 156  *      tablecxt: memory context in which to store table entries
 157  *      tempcxt: short-lived context for evaluation hash and comparison functions
 158  *      use_variable_hash_iv: if true, adjust hash IV per-parallel-worker
 159  *
 160  * The hashfunctions array may be made with execTuplesHashPrepare().  Note they
 161  * are not cross-type functions, but expect to see the table datatype(s)
 162  * on both sides.
 163  *
 164  * Note that the keyColIdx, hashfunctions, and collations arrays must be
 165  * allocated in storage that will live as long as the hashtable does.
 166  */
 167 TupleHashTable
 168 BuildTupleHashTable(PlanState *parent,
 169                                         TupleDesc inputDesc,
 170                                         const TupleTableSlotOps *inputOps,
 171                                         int numCols,
 172                                         AttrNumber *keyColIdx,
 173                                         const Oid *eqfuncoids,
 174                                         FmgrInfo *hashfunctions,
 175                                         Oid *collations,
 176                                         long nbuckets,
 177                                         Size additionalsize,
 178                                         MemoryContext metacxt,
 179                                         MemoryContext tablecxt,
 180                                         MemoryContext tempcxt,
 181                                         bool use_variable_hash_iv)
 182 {
 183         TupleHashTable hashtable;
 184         Size            entrysize = sizeof(TupleHashEntryData) + additionalsize;
 185         Size            hash_mem_limit;
 186         MemoryContext oldcontext;
 187         bool            allow_jit;
 188         uint32          hash_iv = 0;
 189
 190         Assert(nbuckets > 0);
 191
 192         /* Limit initial table size request to not more than hash_mem */
 193         hash_mem_limit = get_hash_memory_limit() / entrysize;
 194         if (nbuckets > hash_mem_limit)
 195                 nbuckets = hash_mem_limit;
 196
 197         oldcontext = MemoryContextSwitchTo(metacxt);
 198
 199         hashtable = (TupleHashTable) palloc(sizeof(TupleHashTableData));
 200
 201         hashtable->numCols = numCols;
 202         hashtable->keyColIdx = keyColIdx;
 203         hashtable->tab_collations = collations;
 204         hashtable->tablecxt = tablecxt;
 205         hashtable->tempcxt = tempcxt;
 206         hashtable->additionalsize = additionalsize;
 207         hashtable->tableslot = NULL;    /* will be made on first lookup */
 208         hashtable->inputslot = NULL;
 209         hashtable->in_hash_expr = NULL;
 210         hashtable->cur_eq_func = NULL;
 211
 212         /*
 213          * If parallelism is in use, even if the leader backend is performing the
 214          * scan itself, we don't want to create the hashtable exactly the same way
 215          * in all workers. As hashtables are iterated over in keyspace-order,
 216          * doing so in all processes in the same way is likely to lead to
 217          * "unbalanced" hashtables when the table size initially is
 218          * underestimated.
 219          */
 220         if (use_variable_hash_iv)
 221                 hash_iv = murmurhash32(ParallelWorkerNumber);
 222
 223         hashtable->hashtab = tuplehash_create(metacxt, nbuckets, hashtable);
 224
 225         /*
 226          * We copy the input tuple descriptor just for safety --- we assume all
 227          * input tuples will have equivalent descriptors.
 228          */
 229         hashtable->tableslot = MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc),
 230                                                                                                         &TTSOpsMinimalTuple);
 231
 232         /*
 233          * If the caller fails to make the metacxt different from the tablecxt,
 234          * allowing JIT would lead to the generated functions to a) live longer
 235          * than the query or b) be re-generated each time the table is being
 236          * reset. Therefore prevent JIT from being used in that case, by not
 237          * providing a parent node (which prevents accessing the JitContext in the
 238          * EState).
 239          */
 240         allow_jit = (metacxt != tablecxt);
 241
 242         /* build hash ExprState for all columns */
 243         hashtable->tab_hash_expr = ExecBuildHash32FromAttrs(inputDesc,
 244                                                                                                                 inputOps,
 245                                                                                                                 hashfunctions,
 246                                                                                                                 collations,
 247                                                                                                                 numCols,
 248                                                                                                                 keyColIdx,
 249                                                                                                                 allow_jit ? parent : NULL,
 250                                                                                                                 hash_iv);
 251
 252         /* build comparator for all columns */
 253         hashtable->tab_eq_func = ExecBuildGroupingEqual(inputDesc, inputDesc,
 254                                                                                                         inputOps,
 255                                                                                                         &TTSOpsMinimalTuple,
 256                                                                                                         numCols,
 257                                                                                                         keyColIdx, eqfuncoids, collations,
 258                                                                                                         allow_jit ? parent : NULL);
 259
 260         /*
 261          * While not pretty, it's ok to not shut down this context, but instead
 262          * rely on the containing memory context being reset, as
 263          * ExecBuildGroupingEqual() only builds a very simple expression calling
 264          * functions (i.e. nothing that'd employ RegisterExprContextCallback()).
 265          */
 266         hashtable->exprcontext = CreateStandaloneExprContext();
 267
 268         MemoryContextSwitchTo(oldcontext);
 269
 270         return hashtable;
 271 }
 272
 273 /*
 274  * Reset contents of the hashtable to be empty, preserving all the non-content
 275  * state. Note that the tablecxt passed to BuildTupleHashTable() should
 276  * also be reset, otherwise there will be leaks.
 277  */
 278 void
 279 ResetTupleHashTable(TupleHashTable hashtable)
 280 {
 281         tuplehash_reset(hashtable->hashtab);
 282 }
 283
 284 /*
 285  * Return size of the hash bucket. Useful for estimating memory usage.
 286  */
 287 size_t
 288 TupleHashEntrySize(void)
 289 {
 290         return sizeof(TupleHashEntryData);
 291 }
 292
 293 /*
 294  * Find or create a hashtable entry for the tuple group containing the
 295  * given tuple.  The tuple must be the same type as the hashtable entries.
 296  *
 297  * If isnew is NULL, we do not create new entries; we return NULL if no
 298  * match is found.
 299  *
 300  * If hash is not NULL, we set it to the calculated hash value. This allows
 301  * callers access to the hash value even if no entry is returned.
 302  *
 303  * If isnew isn't NULL, then a new entry is created if no existing entry
 304  * matches.  On return, *isnew is true if the entry is newly created,
 305  * false if it existed already.  ->additional_data in the new entry has
 306  * been zeroed.
 307  */
 308 TupleHashEntry
 309 LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 310                                          bool *isnew, uint32 *hash)
 311 {
 312         TupleHashEntry entry;
 313         MemoryContext oldContext;
 314         uint32          local_hash;
 315
 316         /* Need to run the hash functions in short-lived context */
 317         oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
 318
 319         /* set up data needed by hash and match functions */
 320         hashtable->inputslot = slot;
 321         hashtable->in_hash_expr = hashtable->tab_hash_expr;
 322         hashtable->cur_eq_func = hashtable->tab_eq_func;
 323
 324         local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
 325         entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, local_hash);
 326
 327         if (hash != NULL)
 328                 *hash = local_hash;
 329
 330         Assert(entry == NULL || entry->hash == local_hash);
 331
 332         MemoryContextSwitchTo(oldContext);
 333
 334         return entry;
 335 }
 336
 337 /*
 338  * Compute the hash value for a tuple
 339  */
 340 uint32
 341 TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot)
 342 {
 343         MemoryContext oldContext;
 344         uint32          hash;
 345
 346         hashtable->inputslot = slot;
 347         hashtable->in_hash_expr = hashtable->tab_hash_expr;
 348
 349         /* Need to run the hash functions in short-lived context */
 350         oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
 351
 352         hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
 353
 354         MemoryContextSwitchTo(oldContext);
 355
 356         return hash;
 357 }
 358
 359 MinimalTuple
 360 TupleHashEntryGetTuple(TupleHashEntry entry)
 361 {
 362         return entry->firstTuple;
 363 }
 364
 365 /*
 366  * Get a pointer into the additional space allocated for this entry. The
 367  * amount of space available is the additionalsize specified to
 368  * BuildTupleHashTable(). If additionalsize was specified as zero, no
 369  * additional space is available and this function should not be called.
 370  */
 371 void *
 372 TupleHashEntryGetAdditional(TupleHashEntry entry)
 373 {
 374         return (char *) entry->firstTuple + MAXALIGN(entry->firstTuple->t_len);
 375 }
 376
 377 /*
 378  * A variant of LookupTupleHashEntry for callers that have already computed
 379  * the hash value.
 380  */
 381 TupleHashEntry
 382 LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot,
 383                                                  bool *isnew, uint32 hash)
 384 {
 385         TupleHashEntry entry;
 386         MemoryContext oldContext;
 387
 388         /* Need to run the hash functions in short-lived context */
 389         oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
 390
 391         /* set up data needed by hash and match functions */
 392         hashtable->inputslot = slot;
 393         hashtable->in_hash_expr = hashtable->tab_hash_expr;
 394         hashtable->cur_eq_func = hashtable->tab_eq_func;
 395
 396         entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash);
 397         Assert(entry == NULL || entry->hash == hash);
 398
 399         MemoryContextSwitchTo(oldContext);
 400
 401         return entry;
 402 }
 403
 404 /*
 405  * Search for a hashtable entry matching the given tuple.  No entry is
 406  * created if there's not a match.  This is similar to the non-creating
 407  * case of LookupTupleHashEntry, except that it supports cross-type
 408  * comparisons, in which the given tuple is not of the same type as the
 409  * table entries.  The caller must provide the hash ExprState to use for
 410  * the input tuple, as well as the equality ExprState, since these may be
 411  * different from the table's internal functions.
 412  */
 413 TupleHashEntry
 414 FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 415                                    ExprState *eqcomp,
 416                                    ExprState *hashexpr)
 417 {
 418         TupleHashEntry entry;
 419         MemoryContext oldContext;
 420         MinimalTuple key;
 421
 422         /* Need to run the hash functions in short-lived context */
 423         oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
 424
 425         /* Set up data needed by hash and match functions */
 426         hashtable->inputslot = slot;
 427         hashtable->in_hash_expr = hashexpr;
 428         hashtable->cur_eq_func = eqcomp;
 429
 430         /* Search the hash table */
 431         key = NULL;                                     /* flag to reference inputslot */
 432         entry = tuplehash_lookup(hashtable->hashtab, key);
 433         MemoryContextSwitchTo(oldContext);
 434
 435         return entry;
 436 }
 437
 438 /*
 439  * If tuple is NULL, use the input slot instead. This convention avoids the
 440  * need to materialize virtual input tuples unless they actually need to get
 441  * copied into the table.
 442  *
 443  * Also, the caller must select an appropriate memory context for running
 444  * the hash functions.
 445  */
 446 static uint32
 447 TupleHashTableHash_internal(struct tuplehash_hash *tb,
 448                                                         const MinimalTuple tuple)
 449 {
 450         TupleHashTable hashtable = (TupleHashTable) tb->private_data;
 451         uint32          hashkey;
 452         TupleTableSlot *slot;
 453         bool            isnull;
 454
 455         if (tuple == NULL)
 456         {
 457                 /* Process the current input tuple for the table */
 458                 hashtable->exprcontext->ecxt_innertuple = hashtable->inputslot;
 459                 hashkey = DatumGetUInt32(ExecEvalExpr(hashtable->in_hash_expr,
 460                                                                                           hashtable->exprcontext,
 461                                                                                           &isnull));
 462         }
 463         else
 464         {
 465                 /*
 466                  * Process a tuple already stored in the table.
 467                  *
 468                  * (this case never actually occurs due to the way simplehash.h is
 469                  * used, as the hash-value is stored in the entries)
 470                  */
 471                 slot = hashtable->exprcontext->ecxt_innertuple = hashtable->tableslot;
 472                 ExecStoreMinimalTuple(tuple, slot, false);
 473                 hashkey = DatumGetUInt32(ExecEvalExpr(hashtable->tab_hash_expr,
 474                                                                                           hashtable->exprcontext,
 475                                                                                           &isnull));
 476         }
 477
 478         /*
 479          * The hashing done above, even with an initial value, doesn't tend to
 480          * result in good hash perturbation.  Running the value produced above
 481          * through murmurhash32 leads to near perfect hash perturbation.
 482          */
 483         return murmurhash32(hashkey);
 484 }
 485
 486 /*
 487  * Does the work of LookupTupleHashEntry and LookupTupleHashEntryHash. Useful
 488  * so that we can avoid switching the memory context multiple times for
 489  * LookupTupleHashEntry.
 490  *
 491  * NB: This function may or may not change the memory context. Caller is
 492  * expected to change it back.
 493  */
 494 static inline TupleHashEntry
 495 LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 496                                                           bool *isnew, uint32 hash)
 497 {
 498         TupleHashEntryData *entry;
 499         bool            found;
 500         MinimalTuple key;
 501
 502         key = NULL;                                     /* flag to reference inputslot */
 503
 504         if (isnew)
 505         {
 506                 entry = tuplehash_insert_hash(hashtable->hashtab, key, hash, &found);
 507
 508                 if (found)
 509                 {
 510                         /* found pre-existing entry */
 511                         *isnew = false;
 512                 }
 513                 else
 514                 {
 515                         MinimalTuple firstTuple;
 516                         size_t totalsize; /* including alignment and additionalsize */
 517
 518                         /* created new entry */
 519                         *isnew = true;
 520                         /* zero caller data */
 521                         MemoryContextSwitchTo(hashtable->tablecxt);
 522
 523                         /* Copy the first tuple into the table context */
 524                         firstTuple = ExecCopySlotMinimalTuple(slot);
 525
 526                         /*
 527                          * Allocate additional space right after the MinimalTuple of size
 528                          * additionalsize. The caller can get a pointer to this data with
 529                          * TupleHashEntryGetAdditional(), and store arbitrary data there.
 530                          *
 531                          * This avoids the need to store an extra pointer or allocate an
 532                          * additional chunk, which would waste memory.
 533                          */
 534                         totalsize = MAXALIGN(firstTuple->t_len) + hashtable->additionalsize;
 535                         firstTuple = repalloc(firstTuple, totalsize);
 536                         memset((char *) firstTuple + firstTuple->t_len, 0,
 537                                    totalsize - firstTuple->t_len);
 538
 539                         entry->firstTuple = firstTuple;
 540                 }
 541         }
 542         else
 543         {
 544                 entry = tuplehash_lookup_hash(hashtable->hashtab, key, hash);
 545         }
 546
 547         return entry;
 548 }
 549
 550 /*
 551  * See whether two tuples (presumably of the same hash value) match
 552  */
 553 static int
 554 TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2)
 555 {
 556         TupleTableSlot *slot1;
 557         TupleTableSlot *slot2;
 558         TupleHashTable hashtable = (TupleHashTable) tb->private_data;
 559         ExprContext *econtext = hashtable->exprcontext;
 560
 561         /*
 562          * We assume that simplehash.h will only ever call us with the first
 563          * argument being an actual table entry, and the second argument being
 564          * LookupTupleHashEntry's dummy TupleHashEntryData.  The other direction
 565          * could be supported too, but is not currently required.
 566          */
 567         Assert(tuple1 != NULL);
 568         slot1 = hashtable->tableslot;
 569         ExecStoreMinimalTuple(tuple1, slot1, false);
 570         Assert(tuple2 == NULL);
 571         slot2 = hashtable->inputslot;
 572
 573         /* For crosstype comparisons, the inputslot must be first */
 574         econtext->ecxt_innertuple = slot2;
 575         econtext->ecxt_outertuple = slot1;
 576         return !ExecQualAndReset(hashtable->cur_eq_func, econtext);
 577 }