src/backend/executor/nodeSetOp.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nodeSetOp.c
   4  *        Routines to handle INTERSECT and EXCEPT selection
   5  *
   6  * The input of a SetOp node consists of tuples from two relations,
   7  * which have been combined into one dataset, with a junk attribute added
   8  * that shows which relation each tuple came from.  In SETOP_SORTED mode,
   9  * the input has furthermore been sorted according to all the grouping
  10  * columns (ie, all the non-junk attributes).  The SetOp node scans each
  11  * group of identical tuples to determine how many came from each input
  12  * relation.  Then it is a simple matter to emit the output demanded by the
  13  * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
  14  *
  15  * In SETOP_HASHED mode, the input is delivered in no particular order,
  16  * except that we know all the tuples from one input relation will come before
  17  * all the tuples of the other.  The planner guarantees that the first input
  18  * relation is the left-hand one for EXCEPT, and tries to make the smaller
  19  * input relation come first for INTERSECT.  We build a hash table in memory
  20  * with one entry for each group of identical tuples, and count the number of
  21  * tuples in the group from each relation.  After seeing all the input, we
  22  * scan the hashtable and generate the correct output using those counts.
  23  * We can avoid making hashtable entries for any tuples appearing only in the
  24  * second input relation, since they cannot result in any output.
  25  *
  26  * This node type is not used for UNION or UNION ALL, since those can be
  27  * implemented more cheaply (there's no need for the junk attribute to
  28  * identify the source relation).
  29  *
  30  * Note that SetOp does no qual checking nor projection.  The delivered
  31  * output tuples are just copies of the first-to-arrive tuple in each
  32  * input group.
  33  *
  34  *
  35  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
  36  * Portions Copyright (c) 1994, Regents of the University of California
  37  *
  38  *
  39  * IDENTIFICATION
  40  *        src/backend/executor/nodeSetOp.c
  41  *
  42  *-------------------------------------------------------------------------
  43  */
  44
  45 #include "postgres.h"
  46
  47 #include "access/htup_details.h"
  48 #include "executor/executor.h"
  49 #include "executor/nodeSetOp.h"
  50 #include "miscadmin.h"
  51 #include "utils/memutils.h"
  52
  53
  54 /*
  55  * SetOpStatePerGroupData - per-group working state
  56  *
  57  * These values are working state that is initialized at the start of
  58  * an input tuple group and updated for each input tuple.
  59  *
  60  * In SETOP_SORTED mode, we need only one of these structs, and it's kept in
  61  * the plan state node.  In SETOP_HASHED mode, the hash table contains one
  62  * of these for each tuple group.
  63  */
  64 typedef struct SetOpStatePerGroupData
  65 {
  66         long            numLeft;                /* number of left-input dups in group */
  67         long            numRight;               /* number of right-input dups in group */
  68 }                       SetOpStatePerGroupData;
  69
  70
  71 static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate);
  72 static void setop_fill_hash_table(SetOpState *setopstate);
  73 static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate);
  74
  75
  76 /*
  77  * Initialize state for a new group of input values.
  78  */
  79 static inline void
  80 initialize_counts(SetOpStatePerGroup pergroup)
  81 {
  82         pergroup->numLeft = pergroup->numRight = 0;
  83 }
  84
  85 /*
  86  * Advance the appropriate counter for one input tuple.
  87  */
  88 static inline void
  89 advance_counts(SetOpStatePerGroup pergroup, int flag)
  90 {
  91         if (flag)
  92                 pergroup->numRight++;
  93         else
  94                 pergroup->numLeft++;
  95 }
  96
  97 /*
  98  * Fetch the "flag" column from an input tuple.
  99  * This is an integer column with value 0 for left side, 1 for right side.
 100  */
 101 static int
 102 fetch_tuple_flag(SetOpState *setopstate, TupleTableSlot *inputslot)
 103 {
 104         SetOp      *node = (SetOp *) setopstate->ps.plan;
 105         int                     flag;
 106         bool            isNull;
 107
 108         flag = DatumGetInt32(slot_getattr(inputslot,
 109                                                                           node->flagColIdx,
 110                                                                           &isNull));
 111         Assert(!isNull);
 112         Assert(flag == 0 || flag == 1);
 113         return flag;
 114 }
 115
 116 /*
 117  * Initialize the hash table to empty.
 118  */
 119 static void
 120 build_hash_table(SetOpState *setopstate)
 121 {
 122         SetOp      *node = (SetOp *) setopstate->ps.plan;
 123         ExprContext *econtext = setopstate->ps.ps_ExprContext;
 124         TupleDesc       desc = ExecGetResultType(outerPlanState(setopstate));
 125
 126         Assert(node->strategy == SETOP_HASHED);
 127         Assert(node->numGroups > 0);
 128
 129         setopstate->hashtable = BuildTupleHashTableExt(&setopstate->ps,
 130                                                                                                    desc,
 131                                                                                                    node->numCols,
 132                                                                                                    node->dupColIdx,
 133                                                                                                    setopstate->eqfuncoids,
 134                                                                                                    setopstate->hashfunctions,
 135                                                                                                    node->dupCollations,
 136                                                                                                    node->numGroups,
 137                                                                                                    0,
 138                                                                                                    setopstate->ps.state->es_query_cxt,
 139                                                                                                    setopstate->tableContext,
 140                                                                                                    econtext->ecxt_per_tuple_memory,
 141                                                                                                    false);
 142 }
 143
 144 /*
 145  * We've completed processing a tuple group.  Decide how many copies (if any)
 146  * of its representative row to emit, and store the count into numOutput.
 147  * This logic is straight from the SQL92 specification.
 148  */
 149 static void
 150 set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup)
 151 {
 152         SetOp      *plannode = (SetOp *) setopstate->ps.plan;
 153
 154         switch (plannode->cmd)
 155         {
 156                 case SETOPCMD_INTERSECT:
 157                         if (pergroup->numLeft > 0 && pergroup->numRight > 0)
 158                                 setopstate->numOutput = 1;
 159                         else
 160                                 setopstate->numOutput = 0;
 161                         break;
 162                 case SETOPCMD_INTERSECT_ALL:
 163                         setopstate->numOutput =
 164                                 (pergroup->numLeft < pergroup->numRight) ?
 165                                 pergroup->numLeft : pergroup->numRight;
 166                         break;
 167                 case SETOPCMD_EXCEPT:
 168                         if (pergroup->numLeft > 0 && pergroup->numRight == 0)
 169                                 setopstate->numOutput = 1;
 170                         else
 171                                 setopstate->numOutput = 0;
 172                         break;
 173                 case SETOPCMD_EXCEPT_ALL:
 174                         setopstate->numOutput =
 175                                 (pergroup->numLeft < pergroup->numRight) ?
 176                                 0 : (pergroup->numLeft - pergroup->numRight);
 177                         break;
 178                 default:
 179                         elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd);
 180                         break;
 181         }
 182 }
 183
 184
 185 /* ----------------------------------------------------------------
 186  *              ExecSetOp
 187  * ----------------------------------------------------------------
 188  */
 189 static TupleTableSlot *                 /* return: a tuple or NULL */
 190 ExecSetOp(PlanState *pstate)
 191 {
 192         SetOpState *node = castNode(SetOpState, pstate);
 193         SetOp      *plannode = (SetOp *) node->ps.plan;
 194         TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;
 195
 196         CHECK_FOR_INTERRUPTS();
 197
 198         /*
 199          * If the previously-returned tuple needs to be returned more than once,
 200          * keep returning it.
 201          */
 202         if (node->numOutput > 0)
 203         {
 204                 node->numOutput--;
 205                 return resultTupleSlot;
 206         }
 207
 208         /* Otherwise, we're done if we are out of groups */
 209         if (node->setop_done)
 210                 return NULL;
 211
 212         /* Fetch the next tuple group according to the correct strategy */
 213         if (plannode->strategy == SETOP_HASHED)
 214         {
 215                 if (!node->table_filled)
 216                         setop_fill_hash_table(node);
 217                 return setop_retrieve_hash_table(node);
 218         }
 219         else
 220                 return setop_retrieve_direct(node);
 221 }
 222
 223 /*
 224  * ExecSetOp for non-hashed case
 225  */
 226 static TupleTableSlot *
 227 setop_retrieve_direct(SetOpState *setopstate)
 228 {
 229         PlanState  *outerPlan;
 230         SetOpStatePerGroup pergroup;
 231         TupleTableSlot *outerslot;
 232         TupleTableSlot *resultTupleSlot;
 233         ExprContext *econtext = setopstate->ps.ps_ExprContext;
 234
 235         /*
 236          * get state info from node
 237          */
 238         outerPlan = outerPlanState(setopstate);
 239         pergroup = (SetOpStatePerGroup) setopstate->pergroup;
 240         resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
 241
 242         /*
 243          * We loop retrieving groups until we find one we should return
 244          */
 245         while (!setopstate->setop_done)
 246         {
 247                 /*
 248                  * If we don't already have the first tuple of the new group, fetch it
 249                  * from the outer plan.
 250                  */
 251                 if (setopstate->grp_firstTuple == NULL)
 252                 {
 253                         outerslot = ExecProcNode(outerPlan);
 254                         if (!TupIsNull(outerslot))
 255                         {
 256                                 /* Make a copy of the first input tuple */
 257                                 setopstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
 258                         }
 259                         else
 260                         {
 261                                 /* outer plan produced no tuples at all */
 262                                 setopstate->setop_done = true;
 263                                 return NULL;
 264                         }
 265                 }
 266
 267                 /*
 268                  * Store the copied first input tuple in the tuple table slot reserved
 269                  * for it.  The tuple will be deleted when it is cleared from the
 270                  * slot.
 271                  */
 272                 ExecStoreHeapTuple(setopstate->grp_firstTuple,
 273                                                    resultTupleSlot,
 274                                                    true);
 275                 setopstate->grp_firstTuple = NULL;      /* don't keep two pointers */
 276
 277                 /* Initialize working state for a new input tuple group */
 278                 initialize_counts(pergroup);
 279
 280                 /* Count the first input tuple */
 281                 advance_counts(pergroup,
 282                                            fetch_tuple_flag(setopstate, resultTupleSlot));
 283
 284                 /*
 285                  * Scan the outer plan until we exhaust it or cross a group boundary.
 286                  */
 287                 for (;;)
 288                 {
 289                         outerslot = ExecProcNode(outerPlan);
 290                         if (TupIsNull(outerslot))
 291                         {
 292                                 /* no more outer-plan tuples available */
 293                                 setopstate->setop_done = true;
 294                                 break;
 295                         }
 296
 297                         /*
 298                          * Check whether we've crossed a group boundary.
 299                          */
 300                         econtext->ecxt_outertuple = resultTupleSlot;
 301                         econtext->ecxt_innertuple = outerslot;
 302
 303                         if (!ExecQualAndReset(setopstate->eqfunction, econtext))
 304                         {
 305                                 /*
 306                                  * Save the first input tuple of the next group.
 307                                  */
 308                                 setopstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
 309                                 break;
 310                         }
 311
 312                         /* Still in same group, so count this tuple */
 313                         advance_counts(pergroup,
 314                                                    fetch_tuple_flag(setopstate, outerslot));
 315                 }
 316
 317                 /*
 318                  * Done scanning input tuple group.  See if we should emit any copies
 319                  * of result tuple, and if so return the first copy.
 320                  */
 321                 set_output_count(setopstate, pergroup);
 322
 323                 if (setopstate->numOutput > 0)
 324                 {
 325                         setopstate->numOutput--;
 326                         return resultTupleSlot;
 327                 }
 328         }
 329
 330         /* No more groups */
 331         ExecClearTuple(resultTupleSlot);
 332         return NULL;
 333 }
 334
 335 /*
 336  * ExecSetOp for hashed case: phase 1, read input and build hash table
 337  */
 338 static void
 339 setop_fill_hash_table(SetOpState *setopstate)
 340 {
 341         SetOp      *node = (SetOp *) setopstate->ps.plan;
 342         PlanState  *outerPlan;
 343         int                     firstFlag;
 344         bool            in_first_rel PG_USED_FOR_ASSERTS_ONLY;
 345         ExprContext *econtext = setopstate->ps.ps_ExprContext;
 346
 347         /*
 348          * get state info from node
 349          */
 350         outerPlan = outerPlanState(setopstate);
 351         firstFlag = node->firstFlag;
 352         /* verify planner didn't mess up */
 353         Assert(firstFlag == 0 ||
 354                    (firstFlag == 1 &&
 355                         (node->cmd == SETOPCMD_INTERSECT ||
 356                          node->cmd == SETOPCMD_INTERSECT_ALL)));
 357
 358         /*
 359          * Process each outer-plan tuple, and then fetch the next one, until we
 360          * exhaust the outer plan.
 361          */
 362         in_first_rel = true;
 363         for (;;)
 364         {
 365                 TupleTableSlot *outerslot;
 366                 int                     flag;
 367                 TupleHashEntryData *entry;
 368                 bool            isnew;
 369
 370                 outerslot = ExecProcNode(outerPlan);
 371                 if (TupIsNull(outerslot))
 372                         break;
 373
 374                 /* Identify whether it's left or right input */
 375                 flag = fetch_tuple_flag(setopstate, outerslot);
 376
 377                 if (flag == firstFlag)
 378                 {
 379                         /* (still) in first input relation */
 380                         Assert(in_first_rel);
 381
 382                         /* Find or build hashtable entry for this tuple's group */
 383                         entry = LookupTupleHashEntry(setopstate->hashtable, outerslot,
 384                                                                                  &isnew, NULL);
 385
 386                         /* If new tuple group, initialize counts */
 387                         if (isnew)
 388                         {
 389                                 entry->additional = (SetOpStatePerGroup)
 390                                         MemoryContextAlloc(setopstate->hashtable->tablecxt,
 391                                                                            sizeof(SetOpStatePerGroupData));
 392                                 initialize_counts((SetOpStatePerGroup) entry->additional);
 393                         }
 394
 395                         /* Advance the counts */
 396                         advance_counts((SetOpStatePerGroup) entry->additional, flag);
 397                 }
 398                 else
 399                 {
 400                         /* reached second relation */
 401                         in_first_rel = false;
 402
 403                         /* For tuples not seen previously, do not make hashtable entry */
 404                         entry = LookupTupleHashEntry(setopstate->hashtable, outerslot,
 405                                                                                  NULL, NULL);
 406
 407                         /* Advance the counts if entry is already present */
 408                         if (entry)
 409                                 advance_counts((SetOpStatePerGroup) entry->additional, flag);
 410                 }
 411
 412                 /* Must reset expression context after each hashtable lookup */
 413                 ResetExprContext(econtext);
 414         }
 415
 416         setopstate->table_filled = true;
 417         /* Initialize to walk the hash table */
 418         ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter);
 419 }
 420
 421 /*
 422  * ExecSetOp for hashed case: phase 2, retrieving groups from hash table
 423  */
 424 static TupleTableSlot *
 425 setop_retrieve_hash_table(SetOpState *setopstate)
 426 {
 427         TupleHashEntryData *entry;
 428         TupleTableSlot *resultTupleSlot;
 429
 430         /*
 431          * get state info from node
 432          */
 433         resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
 434
 435         /*
 436          * We loop retrieving groups until we find one we should return
 437          */
 438         while (!setopstate->setop_done)
 439         {
 440                 CHECK_FOR_INTERRUPTS();
 441
 442                 /*
 443                  * Find the next entry in the hash table
 444                  */
 445                 entry = ScanTupleHashTable(setopstate->hashtable, &setopstate->hashiter);
 446                 if (entry == NULL)
 447                 {
 448                         /* No more entries in hashtable, so done */
 449                         setopstate->setop_done = true;
 450                         return NULL;
 451                 }
 452
 453                 /*
 454                  * See if we should emit any copies of this tuple, and if so return
 455                  * the first copy.
 456                  */
 457                 set_output_count(setopstate, (SetOpStatePerGroup) entry->additional);
 458
 459                 if (setopstate->numOutput > 0)
 460                 {
 461                         setopstate->numOutput--;
 462                         return ExecStoreMinimalTuple(entry->firstTuple,
 463                                                                                  resultTupleSlot,
 464                                                                                  false);
 465                 }
 466         }
 467
 468         /* No more groups */
 469         ExecClearTuple(resultTupleSlot);
 470         return NULL;
 471 }
 472
 473 /* ----------------------------------------------------------------
 474  *              ExecInitSetOp
 475  *
 476  *              This initializes the setop node state structures and
 477  *              the node's subplan.
 478  * ----------------------------------------------------------------
 479  */
 480 SetOpState *
 481 ExecInitSetOp(SetOp *node, EState *estate, int eflags)
 482 {
 483         SetOpState *setopstate;
 484         TupleDesc       outerDesc;
 485
 486         /* check for unsupported flags */
 487         Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
 488
 489         /*
 490          * create state structure
 491          */
 492         setopstate = makeNode(SetOpState);
 493         setopstate->ps.plan = (Plan *) node;
 494         setopstate->ps.state = estate;
 495         setopstate->ps.ExecProcNode = ExecSetOp;
 496
 497         setopstate->eqfuncoids = NULL;
 498         setopstate->hashfunctions = NULL;
 499         setopstate->setop_done = false;
 500         setopstate->numOutput = 0;
 501         setopstate->pergroup = NULL;
 502         setopstate->grp_firstTuple = NULL;
 503         setopstate->hashtable = NULL;
 504         setopstate->tableContext = NULL;
 505
 506         /*
 507          * create expression context
 508          */
 509         ExecAssignExprContext(estate, &setopstate->ps);
 510
 511         /*
 512          * If hashing, we also need a longer-lived context to store the hash
 513          * table.  The table can't just be kept in the per-query context because
 514          * we want to be able to throw it away in ExecReScanSetOp.
 515          */
 516         if (node->strategy == SETOP_HASHED)
 517                 setopstate->tableContext =
 518                         AllocSetContextCreate(CurrentMemoryContext,
 519                                                                   "SetOp hash table",
 520                                                                   ALLOCSET_DEFAULT_SIZES);
 521
 522         /*
 523          * initialize child nodes
 524          *
 525          * If we are hashing then the child plan does not need to handle REWIND
 526          * efficiently; see ExecReScanSetOp.
 527          */
 528         if (node->strategy == SETOP_HASHED)
 529                 eflags &= ~EXEC_FLAG_REWIND;
 530         outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags);
 531         outerDesc = ExecGetResultType(outerPlanState(setopstate));
 532
 533         /*
 534          * Initialize result slot and type. Setop nodes do no projections, so
 535          * initialize projection info for this node appropriately.
 536          */
 537         ExecInitResultTupleSlotTL(&setopstate->ps,
 538                                                           node->strategy == SETOP_HASHED ?
 539                                                           &TTSOpsMinimalTuple : &TTSOpsHeapTuple);
 540         setopstate->ps.ps_ProjInfo = NULL;
 541
 542         /*
 543          * Precompute fmgr lookup data for inner loop. We need both equality and
 544          * hashing functions to do it by hashing, but only equality if not
 545          * hashing.
 546          */
 547         if (node->strategy == SETOP_HASHED)
 548                 execTuplesHashPrepare(node->numCols,
 549                                                           node->dupOperators,
 550                                                           &setopstate->eqfuncoids,
 551                                                           &setopstate->hashfunctions);
 552         else
 553                 setopstate->eqfunction =
 554                         execTuplesMatchPrepare(outerDesc,
 555                                                                    node->numCols,
 556                                                                    node->dupColIdx,
 557                                                                    node->dupOperators,
 558                                                                    node->dupCollations,
 559                                                                    &setopstate->ps);
 560
 561         if (node->strategy == SETOP_HASHED)
 562         {
 563                 build_hash_table(setopstate);
 564                 setopstate->table_filled = false;
 565         }
 566         else
 567         {
 568                 setopstate->pergroup =
 569                         (SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData));
 570         }
 571
 572         return setopstate;
 573 }
 574
 575 /* ----------------------------------------------------------------
 576  *              ExecEndSetOp
 577  *
 578  *              This shuts down the subplan and frees resources allocated
 579  *              to this node.
 580  * ----------------------------------------------------------------
 581  */
 582 void
 583 ExecEndSetOp(SetOpState *node)
 584 {
 585         /* free subsidiary stuff including hashtable */
 586         if (node->tableContext)
 587                 MemoryContextDelete(node->tableContext);
 588
 589         ExecEndNode(outerPlanState(node));
 590 }
 591
 592
 593 void
 594 ExecReScanSetOp(SetOpState *node)
 595 {
 596         PlanState  *outerPlan = outerPlanState(node);
 597
 598         ExecClearTuple(node->ps.ps_ResultTupleSlot);
 599         node->setop_done = false;
 600         node->numOutput = 0;
 601
 602         if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
 603         {
 604                 /*
 605                  * In the hashed case, if we haven't yet built the hash table then we
 606                  * can just return; nothing done yet, so nothing to undo. If subnode's
 607                  * chgParam is not NULL then it will be re-scanned by ExecProcNode,
 608                  * else no reason to re-scan it at all.
 609                  */
 610                 if (!node->table_filled)
 611                         return;
 612
 613                 /*
 614                  * If we do have the hash table and the subplan does not have any
 615                  * parameter changes, then we can just rescan the existing hash table;
 616                  * no need to build it again.
 617                  */
 618                 if (outerPlan->chgParam == NULL)
 619                 {
 620                         ResetTupleHashIterator(node->hashtable, &node->hashiter);
 621                         return;
 622                 }
 623         }
 624
 625         /* Release first tuple of group, if we have made a copy */
 626         if (node->grp_firstTuple != NULL)
 627         {
 628                 heap_freetuple(node->grp_firstTuple);
 629                 node->grp_firstTuple = NULL;
 630         }
 631
 632         /* Release any hashtable storage */
 633         if (node->tableContext)
 634                 MemoryContextReset(node->tableContext);
 635
 636         /* And rebuild empty hashtable if needed */
 637         if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
 638         {
 639                 ResetTupleHashTable(node->hashtable);
 640                 node->table_filled = false;
 641         }
 642
 643         /*
 644          * if chgParam of subnode is not null then plan will be re-scanned by
 645          * first ExecProcNode.
 646          */
 647         if (outerPlan->chgParam == NULL)
 648                 ExecReScan(outerPlan);
 649 }