/*-------------------------------------------------------------------------
 *
 * execPartition.c
 *	  Support routines for partitioning.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/executor/execPartition.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/table.h"
#include "access/tableam.h"
#include "catalog/partition.h"
#include "executor/execPartition.h"
#include "executor/executor.h"
#include "executor/nodeModifyTable.h"
#include "foreign/fdwapi.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "partitioning/partbounds.h"
#include "partitioning/partdesc.h"
#include "partitioning/partprune.h"
#include "rewrite/rewriteManip.h"
#include "utils/acl.h"
#include "utils/lsyscache.h"
#include "utils/partcache.h"
#include "utils/rls.h"
#include "utils/ruleutils.h"

/*-----------------------
 * PartitionTupleRouting - Encapsulates all information required to
 * route a tuple inserted into a partitioned table to one of its leaf
 * partitions.
 *
 * partition_root
 *		The partitioned table that's the target of the command.
 *
 * partition_dispatch_info
 *		Array of 'max_dispatch' elements containing a pointer to a
 *		PartitionDispatch object for every partitioned table touched by tuple
 *		routing.  The entry for the target partitioned table is *always*
 *		present in the 0th element of this array.  See comment for
 *		PartitionDispatchData->indexes for details on how this array is
 *		maintained.
 *
 * nonleaf_partitions
 *		Array of 'max_dispatch' elements containing pointers to fake
 *		ResultRelInfo objects for nonleaf partitions, useful for checking
 *		the partition constraint.
 *
 * num_dispatch
 *		The current number of items stored in the 'partition_dispatch_info'
 *		array.  Also serves as the index of the next free array element for
 *		new PartitionDispatch objects that need to be stored.
 *
 * max_dispatch
 *		The current allocated size of the 'partition_dispatch_info' array.
 *
 * partitions
 *		Array of 'max_partitions' elements containing a pointer to a
 *		ResultRelInfo for every leaf partition touched by tuple routing.
 *		Some of these are pointers to ResultRelInfos which are borrowed out
 *		of the owning ModifyTableState node.  The remainder have been built
 *		especially for tuple routing.  See comment for
 *		PartitionDispatchData->indexes for details on how this array is
 *		maintained.
 *
 * is_borrowed_rel
 *		Array of 'max_partitions' booleans recording whether a given entry
 *		in 'partitions' is a ResultRelInfo pointer borrowed from the owning
 *		ModifyTableState node, rather than being built here.
 *
 * num_partitions
 *		The current number of items stored in the 'partitions' array.  Also
 *		serves as the index of the next free array element for new
 *		ResultRelInfo objects that need to be stored.
 *
 * max_partitions
 *		The current allocated size of the 'partitions' array.
 *
 * memcxt
 *		Memory context used to allocate subsidiary structs.
 *-----------------------
 */
struct PartitionTupleRouting
{
	Relation	partition_root;
	PartitionDispatch *partition_dispatch_info;
	ResultRelInfo **nonleaf_partitions;
	int			num_dispatch;
	int			max_dispatch;
	ResultRelInfo **partitions;
	bool	   *is_borrowed_rel;
	int			num_partitions;
	int			max_partitions;
	MemoryContext memcxt;
};
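
/*
 * Illustrative sketch (not part of the executor proper): a caller such as
 * ExecInsert() or COPY FROM is assumed to drive tuple routing roughly as
 * below -- build the PartitionTupleRouting once per command, route each
 * incoming tuple with ExecFindPartition(), and tear everything down at the
 * end.  The function and variable names here are hypothetical, and the
 * guard macro merely keeps this sketch out of the build.
 */
#ifdef EXEC_PARTITION_USAGE_SKETCH
static void
route_tuples_sketch(ModifyTableState *mtstate,
					ResultRelInfo *rootResultRelInfo,
					TupleTableSlot **slots, int nslots,
					EState *estate)
{
	PartitionTupleRouting *proute;
	int			i;

	/* Built once, in the current (typically per-query) memory context. */
	proute = ExecSetupPartitionTupleRouting(estate,
											rootResultRelInfo->ri_RelationDesc);

	for (i = 0; i < nslots; i++)
	{
		ResultRelInfo *partRelInfo;

		/* Finds (and lazily initializes) the leaf partition for this tuple. */
		partRelInfo = ExecFindPartition(mtstate, rootResultRelInfo,
										proute, slots[i], estate);

		/* ... insert slots[i] into partRelInfo->ri_RelationDesc ... */
	}

	ExecCleanupTupleRouting(mtstate, proute);
}
#endif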

/*-----------------------
 * PartitionDispatch - information about one partitioned table in a partition
 * hierarchy required to route a tuple to any of its partitions.  A
 * PartitionDispatch is always encapsulated inside a PartitionTupleRouting
 * struct and stored inside its 'partition_dispatch_info' array.
 *
 * reldesc
 *		Relation descriptor of the table
 *
 * key
 *		Partition key information of the table
 *
 * keystate
 *		Execution state required for expressions in the partition key
 *
 * partdesc
 *		Partition descriptor of the table
 *
 * tupslot
 *		A standalone TupleTableSlot initialized with this table's tuple
 *		descriptor, or NULL if no tuple conversion between the parent is
 *		required.
 *
 * tupmap
 *		TupleConversionMap to convert from the parent's rowtype to this
 *		table's rowtype (when extracting the partition key of a tuple just
 *		before routing it through this table).  A NULL value is stored if no
 *		tuple conversion is required.
 *
 * indexes
 *		Array of partdesc->nparts elements.  For leaf partitions the index
 *		corresponds to the partition's ResultRelInfo in the encapsulating
 *		PartitionTupleRouting's partitions array.  For partitioned partitions,
 *		the index corresponds to the PartitionDispatch for it in its
 *		partition_dispatch_info array.  -1 indicates we've not yet allocated
 *		anything in PartitionTupleRouting for the partition.
 *-----------------------
 */
typedef struct PartitionDispatchData
{
	Relation	reldesc;
	PartitionKey key;
	List	   *keystate;		/* list of ExprState */
	PartitionDesc partdesc;
	TupleTableSlot *tupslot;
	AttrMap    *tupmap;
	int			indexes[FLEXIBLE_ARRAY_MEMBER];
} PartitionDispatchData;

static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
											EState *estate, PartitionTupleRouting *proute,
											PartitionDispatch dispatch,
											ResultRelInfo *rootResultRelInfo,
											int partidx);
static void ExecInitRoutingInfo(ModifyTableState *mtstate,
								EState *estate,
								PartitionTupleRouting *proute,
								PartitionDispatch dispatch,
								ResultRelInfo *partRelInfo,
								int partidx,
								bool is_borrowed_rel);
static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate,
														PartitionTupleRouting *proute,
														Oid partoid, PartitionDispatch parent_pd,
														int partidx, ResultRelInfo *rootResultRelInfo);
static void FormPartitionKeyDatum(PartitionDispatch pd,
								  TupleTableSlot *slot,
								  EState *estate,
								  Datum *values,
								  bool *isnull);
static int	get_partition_for_tuple(PartitionDispatch pd, Datum *values,
									bool *isnull);
static char *ExecBuildSlotPartitionKeyDescription(Relation rel,
												  Datum *values,
												  bool *isnull,
												  int maxfieldlen);
static List *adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri);
static List *adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap);
static PartitionPruneState *CreatePartitionPruneState(PlanState *planstate,
													  PartitionPruneInfo *pruneinfo);
static void InitPartitionPruneContext(PartitionPruneContext *context,
									  List *pruning_steps,
									  PartitionDesc partdesc,
									  PartitionKey partkey,
									  PlanState *planstate,
									  ExprContext *econtext);
static void PartitionPruneFixSubPlanMap(PartitionPruneState *prunestate,
										Bitmapset *initially_valid_subplans,
										int n_total_subplans);
static void find_matching_subplans_recurse(PartitionPruningData *prunedata,
										   PartitionedRelPruningData *pprune,
										   bool initial_prune,
										   Bitmapset **validsubplans);

/*
 * ExecSetupPartitionTupleRouting - sets up information needed during
 * tuple routing for partitioned tables, encapsulates it in
 * PartitionTupleRouting, and returns it.
 *
 * Callers must use the returned PartitionTupleRouting during calls to
 * ExecFindPartition().  The actual ResultRelInfo for a partition is only
 * allocated when the partition is found for the first time.
 *
 * The current memory context is used to allocate this struct and all
 * subsidiary structs that will be allocated from it later on.  Typically
 * it should be estate->es_query_cxt.
 */
PartitionTupleRouting *
ExecSetupPartitionTupleRouting(EState *estate, Relation rel)
{
	PartitionTupleRouting *proute;

	/*
	 * Here we attempt to expend as little effort as possible in setting up
	 * the PartitionTupleRouting.  Each partition's ResultRelInfo is built on
	 * demand, only when we actually need to route a tuple to that partition.
	 * The reason for this is that a common case is for INSERT to insert a
	 * single tuple into a partitioned table and this must be fast.
	 */
	proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting));
	proute->partition_root = rel;
	proute->memcxt = CurrentMemoryContext;
	/* Rest of members initialized by zeroing */

	/*
	 * Initialize this table's PartitionDispatch object.  Here we pass in the
	 * parent as NULL as we don't need to care about any parent of the target
	 * partitioned table.
	 */
	ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel),
								  NULL, 0, NULL);

	return proute;
}

/*
 * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that
 * the tuple contained in *slot should belong to.
 *
 * If the partition's ResultRelInfo does not yet exist in 'proute' then we set
 * one up or reuse one from mtstate's resultRelInfo array.  When reusing a
 * ResultRelInfo from the mtstate we verify that the relation is a valid
 * target for INSERTs and initialize tuple routing information.
 *
 * rootResultRelInfo is the relation named in the query.
 *
 * estate must be non-NULL; we'll need it to compute any expressions in the
 * partition keys.  Also, its per-tuple contexts are used as evaluation
 * scratch space.
 *
 * If no leaf partition is found, this routine errors out with the appropriate
 * error message.  An error may also be raised if the found target partition
 * is not a valid target for an INSERT.
 */
ResultRelInfo *
ExecFindPartition(ModifyTableState *mtstate,
				  ResultRelInfo *rootResultRelInfo,
				  PartitionTupleRouting *proute,
				  TupleTableSlot *slot, EState *estate)
{
	PartitionDispatch *pd = proute->partition_dispatch_info;
	Datum		values[PARTITION_MAX_KEYS];
	bool		isnull[PARTITION_MAX_KEYS];
	Relation	rel;
	PartitionDispatch dispatch;
	PartitionDesc partdesc;
	ExprContext *ecxt = GetPerTupleExprContext(estate);
	TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple;
	TupleTableSlot *rootslot = slot;
	TupleTableSlot *myslot = NULL;
	MemoryContext oldcxt;
	ResultRelInfo *rri = NULL;

	/* use per-tuple context here to avoid leaking memory */
	oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));

	/*
	 * First check the root table's partition constraint, if any.  No point in
	 * routing the tuple if it doesn't belong in the root table itself.
	 */
	if (rootResultRelInfo->ri_RelationDesc->rd_rel->relispartition)
		ExecPartitionCheck(rootResultRelInfo, slot, estate, true);

	/* start with the root partitioned table */
	dispatch = pd[0];
	while (dispatch != NULL)
	{
		int			partidx = -1;
		bool		is_leaf;

		CHECK_FOR_INTERRUPTS();

		rel = dispatch->reldesc;
		partdesc = dispatch->partdesc;

		/*
		 * Extract partition key from tuple.  Expression evaluation machinery
		 * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
		 * point to the correct tuple slot.  The slot might have changed from
		 * what was used for the parent table if the table of the current
		 * partitioning level has different tuple descriptor from the parent.
		 * So update ecxt_scantuple accordingly.
		 */
		ecxt->ecxt_scantuple = slot;
		FormPartitionKeyDatum(dispatch, slot, estate, values, isnull);

		/*
		 * If this partitioned table has no partitions or no partition for
		 * these values, error out.
		 */
		if (partdesc->nparts == 0 ||
			(partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0)
		{
			char	   *val_desc;

			val_desc = ExecBuildSlotPartitionKeyDescription(rel,
															values, isnull, 64);
			Assert(OidIsValid(RelationGetRelid(rel)));
			ereport(ERROR,
					(errcode(ERRCODE_CHECK_VIOLATION),
					 errmsg("no partition of relation \"%s\" found for row",
							RelationGetRelationName(rel)),
					 val_desc ?
					 errdetail("Partition key of the failing row contains %s.",
							   val_desc) : 0,
					 errtable(rel)));
		}

		is_leaf = partdesc->is_leaf[partidx];
		if (is_leaf)
		{
			/*
			 * We've reached the leaf -- hurray, we're done.  Look to see if
			 * we've already got a ResultRelInfo for this partition.
			 */
			if (likely(dispatch->indexes[partidx] >= 0))
			{
				/* ResultRelInfo already built */
				Assert(dispatch->indexes[partidx] < proute->num_partitions);
				rri = proute->partitions[dispatch->indexes[partidx]];
			}
			else
			{
				/*
				 * If the partition is known in the owning ModifyTableState
				 * node, we can re-use that ResultRelInfo instead of creating
				 * a new one with ExecInitPartitionInfo().
				 */
				rri = ExecLookupResultRelByOid(mtstate,
											   partdesc->oids[partidx],
											   true, false);
				if (rri)
				{
					/* Verify this ResultRelInfo allows INSERTs */
					CheckValidResultRel(rri, CMD_INSERT, NIL);

					/*
					 * Initialize information needed to insert this and
					 * subsequent tuples routed to this partition.
					 */
					ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
										rri, partidx, true);
				}
				else
				{
					/* We need to create a new one. */
					rri = ExecInitPartitionInfo(mtstate, estate, proute,
												dispatch,
												rootResultRelInfo, partidx);
				}
			}
			Assert(rri != NULL);

			/* Signal to terminate the loop */
			dispatch = NULL;
		}
		else
		{
			/*
			 * Partition is a sub-partitioned table; get the PartitionDispatch
			 */
			if (likely(dispatch->indexes[partidx] >= 0))
			{
				/* Already built. */
				Assert(dispatch->indexes[partidx] < proute->num_dispatch);

				rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];

				/*
				 * Move down to the next partition level and search again
				 * until we find a leaf partition that matches this tuple
				 */
				dispatch = pd[dispatch->indexes[partidx]];
			}
			else
			{
				/* Not yet built. Do that now. */
				PartitionDispatch subdispatch;

				/*
				 * Create the new PartitionDispatch.  We pass the current one
				 * in as the parent PartitionDispatch
				 */
				subdispatch = ExecInitPartitionDispatchInfo(estate,
															proute,
															partdesc->oids[partidx],
															dispatch, partidx,
															mtstate->rootResultRelInfo);
				Assert(dispatch->indexes[partidx] >= 0 &&
					   dispatch->indexes[partidx] < proute->num_dispatch);

				rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
				dispatch = subdispatch;
			}

			/*
			 * Convert the tuple to the new parent's layout, if different from
			 * the previous parent.
			 */
			if (dispatch->tupslot)
			{
				AttrMap    *map = dispatch->tupmap;
				TupleTableSlot *tempslot = myslot;

				myslot = dispatch->tupslot;
				slot = execute_attr_map_slot(map, slot, myslot);

				if (tempslot != NULL)
					ExecClearTuple(tempslot);
			}
		}

		/*
		 * If this partition is the default one, we must check its partition
		 * constraint now, which may have changed concurrently due to
		 * partitions being added to the parent.
		 *
		 * (We do this here, and do not rely on ExecInsert doing it, because
		 * we don't want to miss doing it for non-leaf partitions.)
		 */
		if (partidx == partdesc->boundinfo->default_index)
		{
			/*
			 * The tuple must match the partition's layout for the constraint
			 * expression to be evaluated successfully.  If the partition is
			 * sub-partitioned, that would already be the case due to the code
			 * above, but for a leaf partition the tuple still matches the
			 * parent's layout.
			 *
			 * Note that we have a map to convert from root to current
			 * partition, but not from immediate parent to current partition.
			 * So if we have to convert, do it from the root slot; if not, use
			 * the root slot as-is.
			 */
			if (is_leaf)
			{
				TupleConversionMap *map = ExecGetRootToChildMap(rri, estate);

				if (map)
					slot = execute_attr_map_slot(map->attrMap, rootslot,
												 rri->ri_PartitionTupleSlot);
				else
					slot = rootslot;
			}

			ExecPartitionCheck(rri, slot, estate, true);
		}
	}

	/* Release the tuple in the lowest parent's dedicated slot. */
	if (myslot != NULL)
		ExecClearTuple(myslot);
	/* and restore ecxt's scantuple */
	ecxt->ecxt_scantuple = ecxt_scantuple_saved;
	MemoryContextSwitchTo(oldcxt);

	return rri;
}

/*
 * ExecInitPartitionInfo
 *		Lock the partition and initialize ResultRelInfo.  Also setup other
 *		information for the partition and store it in the next empty slot in
 *		the proute->partitions array.
 *
 * Returns the ResultRelInfo
 */
static ResultRelInfo *
ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
					  PartitionTupleRouting *proute,
					  PartitionDispatch dispatch,
					  ResultRelInfo *rootResultRelInfo,
					  int partidx)
{
	ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
	Oid			partOid = dispatch->partdesc->oids[partidx];
	Relation	partrel;
	int			firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
	Relation	firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
	ResultRelInfo *leaf_part_rri;
	MemoryContext oldcxt;
	AttrMap    *part_attmap = NULL;
	bool		found_whole_row;

	oldcxt = MemoryContextSwitchTo(proute->memcxt);

	partrel = table_open(partOid, RowExclusiveLock);

	leaf_part_rri = makeNode(ResultRelInfo);
	InitResultRelInfo(leaf_part_rri,
					  partrel,
					  0,
					  rootResultRelInfo,
					  estate->es_instrument);

	/*
	 * Verify result relation is a valid target for an INSERT.  An UPDATE of a
	 * partition-key becomes a DELETE+INSERT operation, so this check is still
	 * required when the operation is CMD_UPDATE.
	 */
	CheckValidResultRel(leaf_part_rri, CMD_INSERT, NIL);

	/*
	 * Open partition indices.  The user may have asked to check for conflicts
	 * within this leaf partition and do "nothing" instead of throwing an
	 * error.  Be prepared in that case by initializing the index information
	 * needed by ExecInsert() to perform speculative insertions.
	 */
	if (partrel->rd_rel->relhasindex &&
		leaf_part_rri->ri_IndexRelationDescs == NULL)
		ExecOpenIndices(leaf_part_rri,
						(node != NULL &&
						 node->onConflictAction != ONCONFLICT_NONE));

	/*
	 * Build WITH CHECK OPTION constraints for the partition.  Note that we
	 * didn't build the withCheckOptionList for partitions within the planner,
	 * but simple translation of varattnos will suffice.  This only occurs for
	 * the INSERT case or in the case of UPDATE/MERGE tuple routing where we
	 * didn't find a result rel to reuse.
	 */
	if (node && node->withCheckOptionLists != NIL)
	{
		List	   *wcoList;
		List	   *wcoExprs = NIL;
		ListCell   *ll;

		/*
		 * In the case of INSERT on a partitioned table, there is only one
		 * plan.  Likewise, there is only one WCO list, not one per partition.
		 * For UPDATE/MERGE, there are as many WCO lists as there are plans.
		 */
		Assert((node->operation == CMD_INSERT &&
				list_length(node->withCheckOptionLists) == 1 &&
				list_length(node->resultRelations) == 1) ||
			   (node->operation == CMD_UPDATE &&
				list_length(node->withCheckOptionLists) ==
				list_length(node->resultRelations)) ||
			   (node->operation == CMD_MERGE &&
				list_length(node->withCheckOptionLists) ==
				list_length(node->resultRelations)));

		/*
		 * Use the WCO list of the first plan as a reference to calculate
		 * attno's for the WCO list of this partition.  In the INSERT case,
		 * that refers to the root partitioned table, whereas in the UPDATE
		 * tuple routing case, that refers to the first partition in the
		 * mtstate->resultRelInfo array.  In any case, both that relation and
		 * this partition should have the same columns, so we should be able
		 * to map attributes successfully.
		 */
		wcoList = linitial(node->withCheckOptionLists);

		/*
		 * Convert Vars in it to contain this partition's attribute numbers.
		 */
		part_attmap =
			build_attrmap_by_name(RelationGetDescr(partrel),
								  RelationGetDescr(firstResultRel),
								  false);
		wcoList = (List *)
			map_variable_attnos((Node *) wcoList,
								firstVarno, 0,
								part_attmap,
								RelationGetForm(partrel)->reltype,
								&found_whole_row);
		/* We ignore the value of found_whole_row. */

		foreach(ll, wcoList)
		{
			WithCheckOption *wco = lfirst_node(WithCheckOption, ll);
			ExprState  *wcoExpr = ExecInitQual(castNode(List, wco->qual),
											   &mtstate->ps);

			wcoExprs = lappend(wcoExprs, wcoExpr);
		}

		leaf_part_rri->ri_WithCheckOptions = wcoList;
		leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs;
	}

	/*
	 * Build the RETURNING projection for the partition.  Note that we didn't
	 * build the returningList for partitions within the planner, but simple
	 * translation of varattnos will suffice.  This only occurs for the INSERT
	 * case or in the case of UPDATE/MERGE tuple routing where we didn't find
	 * a result rel to reuse.
	 */
	if (node && node->returningLists != NIL)
	{
		TupleTableSlot *slot;
		ExprContext *econtext;
		List	   *returningList;

		/* See the comment above for WCO lists. */
		Assert((node->operation == CMD_INSERT &&
				list_length(node->returningLists) == 1 &&
				list_length(node->resultRelations) == 1) ||
			   (node->operation == CMD_UPDATE &&
				list_length(node->returningLists) ==
				list_length(node->resultRelations)) ||
			   (node->operation == CMD_MERGE &&
				list_length(node->returningLists) ==
				list_length(node->resultRelations)));

		/*
		 * Use the RETURNING list of the first plan as a reference to
		 * calculate attno's for the RETURNING list of this partition.  See
		 * the comment above for WCO lists for more details on why this is
		 * okay.
		 */
		returningList = linitial(node->returningLists);

		/*
		 * Convert Vars in it to contain this partition's attribute numbers.
		 */
		if (part_attmap == NULL)
			part_attmap =
				build_attrmap_by_name(RelationGetDescr(partrel),
									  RelationGetDescr(firstResultRel),
									  false);
		returningList = (List *)
			map_variable_attnos((Node *) returningList,
								firstVarno, 0,
								part_attmap,
								RelationGetForm(partrel)->reltype,
								&found_whole_row);
		/* We ignore the value of found_whole_row. */

		leaf_part_rri->ri_returningList = returningList;

		/*
		 * Initialize the projection itself.
		 *
		 * Use the slot and the expression context that would have been set up
		 * in ExecInitModifyTable() for projection's output.
		 */
		Assert(mtstate->ps.ps_ResultTupleSlot != NULL);
		slot = mtstate->ps.ps_ResultTupleSlot;
		Assert(mtstate->ps.ps_ExprContext != NULL);
		econtext = mtstate->ps.ps_ExprContext;
		leaf_part_rri->ri_projectReturning =
			ExecBuildProjectionInfo(returningList, econtext, slot,
									&mtstate->ps, RelationGetDescr(partrel));
	}

	/* Set up information needed for routing tuples to the partition. */
	ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
						leaf_part_rri, partidx, false);

	/*
	 * If there is an ON CONFLICT clause, initialize state for it.
	 */
	if (node && node->onConflictAction != ONCONFLICT_NONE)
	{
		TupleDesc	partrelDesc = RelationGetDescr(partrel);
		ExprContext *econtext = mtstate->ps.ps_ExprContext;
		ListCell   *lc;
		List	   *arbiterIndexes = NIL;

		/*
		 * If there is a list of arbiter indexes, map it to a list of indexes
		 * in the partition.  We do that by scanning the partition's index
		 * list and searching for ancestry relationships to each index in the
		 * ancestor table.
		 */
		if (rootResultRelInfo->ri_onConflictArbiterIndexes != NIL)
		{
			List	   *childIdxs;

			childIdxs = RelationGetIndexList(leaf_part_rri->ri_RelationDesc);

			foreach(lc, childIdxs)
			{
				Oid			childIdx = lfirst_oid(lc);
				List	   *ancestors;
				ListCell   *lc2;

				ancestors = get_partition_ancestors(childIdx);
				foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes)
				{
					if (list_member_oid(ancestors, lfirst_oid(lc2)))
						arbiterIndexes = lappend_oid(arbiterIndexes, childIdx);
				}
				list_free(ancestors);
			}
		}

		/*
		 * If the resulting lists are of inequal length, something is wrong.
		 * (This shouldn't happen, since arbiter index selection should not
		 * pick up an invalid index.)
		 */
		if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) !=
			list_length(arbiterIndexes))
			elog(ERROR, "invalid arbiter index list");
		leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes;

		/*
		 * In the DO UPDATE case, we have some more state to initialize.
		 */
		if (node->onConflictAction == ONCONFLICT_UPDATE)
		{
			OnConflictSetState *onconfl = makeNode(OnConflictSetState);
			TupleConversionMap *map;

			map = ExecGetRootToChildMap(leaf_part_rri, estate);

			Assert(node->onConflictSet != NIL);
			Assert(rootResultRelInfo->ri_onConflict != NULL);

			leaf_part_rri->ri_onConflict = onconfl;

			/*
			 * Need a separate existing slot for each partition, as the
			 * partition could be of a different AM, even if the tuple
			 * descriptors match.
			 */
			onconfl->oc_Existing =
				table_slot_create(leaf_part_rri->ri_RelationDesc,
								  &mtstate->ps.state->es_tupleTable);

			/*
			 * If the partition's tuple descriptor matches exactly the root
			 * parent (the common case), we can re-use most of the parent's ON
			 * CONFLICT SET state, skipping a bunch of work.  Otherwise, we
			 * need to create state specific to this partition.
			 */
			if (map == NULL)
			{
				/*
				 * It's safe to reuse these from the partition root, as we
				 * only process one tuple at a time (therefore we won't
				 * overwrite needed data in slots), and the results of
				 * projections are independent of the underlying storage.
				 * Projections and where clauses themselves don't store state
				 * / are independent of the underlying storage.
				 */
				onconfl->oc_ProjSlot =
					rootResultRelInfo->ri_onConflict->oc_ProjSlot;
				onconfl->oc_ProjInfo =
					rootResultRelInfo->ri_onConflict->oc_ProjInfo;
				onconfl->oc_WhereClause =
					rootResultRelInfo->ri_onConflict->oc_WhereClause;
			}
			else
			{
				List	   *onconflset;
				List	   *onconflcols;

				/*
				 * Translate expressions in onConflictSet to account for
				 * different attribute numbers.  For that, map partition
				 * varattnos twice: first to catch the EXCLUDED
				 * pseudo-relation (INNER_VAR), and second to handle the main
				 * target relation (firstVarno).
				 */
				onconflset = copyObject(node->onConflictSet);
				if (part_attmap == NULL)
					part_attmap =
						build_attrmap_by_name(RelationGetDescr(partrel),
											  RelationGetDescr(firstResultRel),
											  false);
				onconflset = (List *)
					map_variable_attnos((Node *) onconflset,
										INNER_VAR, 0,
										part_attmap,
										RelationGetForm(partrel)->reltype,
										&found_whole_row);
				/* We ignore the value of found_whole_row. */
				onconflset = (List *)
					map_variable_attnos((Node *) onconflset,
										firstVarno, 0,
										part_attmap,
										RelationGetForm(partrel)->reltype,
										&found_whole_row);
				/* We ignore the value of found_whole_row. */

				/* Finally, adjust the target colnos to match the partition. */
				onconflcols = adjust_partition_colnos(node->onConflictCols,
													  leaf_part_rri);

				/* create the tuple slot for the UPDATE SET projection */
				onconfl->oc_ProjSlot =
					table_slot_create(partrel,
									  &mtstate->ps.state->es_tupleTable);

				/* build UPDATE SET projection state */
				onconfl->oc_ProjInfo =
					ExecBuildUpdateProjection(onconflset,
											  true,
											  onconflcols,
											  partrelDesc,
											  econtext,
											  onconfl->oc_ProjSlot,
											  &mtstate->ps);

				/*
				 * If there is a WHERE clause, initialize state where it will
				 * be evaluated, mapping the attribute numbers appropriately.
				 * As with onConflictSet, we need to map partition varattnos
				 * to the partition's tupdesc.
				 */
				if (node->onConflictWhere)
				{
					List	   *clause;

					clause = copyObject((List *) node->onConflictWhere);
					clause = (List *)
						map_variable_attnos((Node *) clause,
											INNER_VAR, 0,
											part_attmap,
											RelationGetForm(partrel)->reltype,
											&found_whole_row);
					/* We ignore the value of found_whole_row. */
					clause = (List *)
						map_variable_attnos((Node *) clause,
											firstVarno, 0,
											part_attmap,
											RelationGetForm(partrel)->reltype,
											&found_whole_row);
					/* We ignore the value of found_whole_row. */
					onconfl->oc_WhereClause =
						ExecInitQual((List *) clause, &mtstate->ps);
				}
			}
		}
	}

	/*
	 * Since we've just initialized this ResultRelInfo, it's not in any list
	 * attached to the estate as yet.  Add it, so that it can be found later.
	 *
	 * Note that the entries in this list appear in no predetermined order,
	 * because partition result rels are initialized as and when they're
	 * needed.
	 */
	MemoryContextSwitchTo(estate->es_query_cxt);
	estate->es_tuple_routing_result_relations =
		lappend(estate->es_tuple_routing_result_relations,
				leaf_part_rri);

	/*
	 * Initialize information about this partition that's needed to handle
	 * MERGE.  We take the "first" result relation's mergeActionList as
	 * reference and make copy for this relation, converting stuff that
	 * references attribute numbers to match this relation's.
	 *
	 * This duplicates much of the logic in ExecInitMerge(), so if something
	 * changes there, look here too.
	 */
	if (node && node->operation == CMD_MERGE)
	{
		List	   *firstMergeActionList = linitial(node->mergeActionLists);
		ListCell   *lc;
		ExprContext *econtext = mtstate->ps.ps_ExprContext;
		Node	   *joinCondition;

		if (part_attmap == NULL)
			part_attmap =
				build_attrmap_by_name(RelationGetDescr(partrel),
									  RelationGetDescr(firstResultRel),
									  false);

		if (unlikely(!leaf_part_rri->ri_projectNewInfoValid))
			ExecInitMergeTupleSlots(mtstate, leaf_part_rri);

		/* Initialize state for join condition checking. */
		joinCondition =
			map_variable_attnos(linitial(node->mergeJoinConditions),
								firstVarno, 0,
								part_attmap,
								RelationGetForm(partrel)->reltype,
								&found_whole_row);
		/* We ignore the value of found_whole_row. */
		leaf_part_rri->ri_MergeJoinCondition =
			ExecInitQual((List *) joinCondition, &mtstate->ps);

		foreach(lc, firstMergeActionList)
		{
			/* Make a copy for this relation to be safe.  */
			MergeAction *action = copyObject(lfirst(lc));
			MergeActionState *action_state;

			/* Generate the action's state for this relation */
			action_state = makeNode(MergeActionState);
			action_state->mas_action = action;

			/* And put the action in the appropriate list */
			leaf_part_rri->ri_MergeActions[action->matchKind] =
				lappend(leaf_part_rri->ri_MergeActions[action->matchKind],
						action_state);

			switch (action->commandType)
			{
				case CMD_INSERT:

					/*
					 * ExecCheckPlanOutput() already done on the targetlist
					 * when "first" result relation initialized and it is same
					 * for all result relations.
					 */
					action_state->mas_proj =
						ExecBuildProjectionInfo(action->targetList, econtext,
												leaf_part_rri->ri_newTupleSlot,
												&mtstate->ps,
												RelationGetDescr(partrel));
					break;
				case CMD_UPDATE:

					/*
					 * Convert updateColnos from "first" result relation
					 * attribute numbers to this result rel's.
					 */
					if (part_attmap)
						action->updateColnos =
							adjust_partition_colnos_using_map(action->updateColnos,
															  part_attmap);
					action_state->mas_proj =
						ExecBuildUpdateProjection(action->targetList,
												  true,
												  action->updateColnos,
												  RelationGetDescr(leaf_part_rri->ri_RelationDesc),
												  econtext,
												  leaf_part_rri->ri_newTupleSlot,
												  NULL);
					break;
				case CMD_DELETE:
					break;

				default:
					elog(ERROR, "unknown action in MERGE WHEN clause");
			}

			/* found_whole_row intentionally ignored. */
			action->qual =
				map_variable_attnos(action->qual,
									firstVarno, 0,
									part_attmap,
									RelationGetForm(partrel)->reltype,
									&found_whole_row);
			action_state->mas_whenqual =
				ExecInitQual((List *) action->qual, &mtstate->ps);
		}
	}
	MemoryContextSwitchTo(oldcxt);

	return leaf_part_rri;
}

/*
 * ExecInitRoutingInfo
 *		Set up information needed for translating tuples between root
 *		partitioned table format and partition format, and keep track of it
 *		in PartitionTupleRouting.
 */
static void
ExecInitRoutingInfo(ModifyTableState *mtstate,
					EState *estate,
					PartitionTupleRouting *proute,
					PartitionDispatch dispatch,
					ResultRelInfo *partRelInfo,
					int partidx,
					bool is_borrowed_rel)
{
	MemoryContext oldcxt;
	int			rri_index;

	oldcxt = MemoryContextSwitchTo(proute->memcxt);

	/*
	 * Set up tuple conversion between root parent and the partition if the
	 * two have different rowtypes.  If conversion is indeed required, also
	 * initialize a slot dedicated to storing this partition's converted
	 * tuples.  Various operations that are applied to tuples after routing,
	 * such as checking constraints, will refer to this slot.
	 */
	if (ExecGetRootToChildMap(partRelInfo, estate) != NULL)
	{
		Relation	partrel = partRelInfo->ri_RelationDesc;

		/*
		 * This pins the partition's TupleDesc, which will be released at the
		 * end of the command.
		 */
		partRelInfo->ri_PartitionTupleSlot =
			table_slot_create(partrel, &estate->es_tupleTable);
	}
	else
		partRelInfo->ri_PartitionTupleSlot = NULL;

	/*
	 * If the partition is a foreign table, let the FDW init itself for
	 * routing tuples to the partition.
	 */
	if (partRelInfo->ri_FdwRoutine != NULL &&
		partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
		partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);

	/*
	 * Determine if the FDW supports batch insert and determine the batch size
	 * (a FDW may support batching, but it may be disabled for the
	 * server/table or for this particular query).
	 *
	 * If the FDW does not support batching, we set the batch size to 1.
	 */
	if (partRelInfo->ri_FdwRoutine != NULL &&
		partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
		partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
		partRelInfo->ri_BatchSize =
			partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(partRelInfo);
	else
		partRelInfo->ri_BatchSize = 1;

	Assert(partRelInfo->ri_BatchSize >= 1);

	partRelInfo->ri_CopyMultiInsertBuffer = NULL;

	/*
	 * Keep track of it in the PartitionTupleRouting->partitions array.
	 */
	Assert(dispatch->indexes[partidx] == -1);

	rri_index = proute->num_partitions++;

	/* Allocate or enlarge the array, as needed */
	if (proute->num_partitions >= proute->max_partitions)
	{
		if (proute->max_partitions == 0)
		{
			proute->max_partitions = 8;
			proute->partitions = (ResultRelInfo **)
				palloc(sizeof(ResultRelInfo *) * proute->max_partitions);
			proute->is_borrowed_rel = (bool *)
				palloc(sizeof(bool) * proute->max_partitions);
		}
		else
		{
			proute->max_partitions *= 2;
			proute->partitions = (ResultRelInfo **)
				repalloc(proute->partitions, sizeof(ResultRelInfo *) *
						 proute->max_partitions);
			proute->is_borrowed_rel = (bool *)
				repalloc(proute->is_borrowed_rel, sizeof(bool) *
						 proute->max_partitions);
		}
	}

	proute->partitions[rri_index] = partRelInfo;
	proute->is_borrowed_rel[rri_index] = is_borrowed_rel;
	dispatch->indexes[partidx] = rri_index;

	MemoryContextSwitchTo(oldcxt);
}

/*
 * ExecInitPartitionDispatchInfo
 *		Lock the partitioned table (if not locked already) and initialize
 *		PartitionDispatch for a partitioned table and store it in the next
 *		available slot in the proute->partition_dispatch_info array.  Also,
 *		record the index into this array in the parent_pd->indexes[] array in
 *		the partidx element so that we can properly retrieve the newly created
 *		PartitionDispatch later.
 */
static PartitionDispatch
ExecInitPartitionDispatchInfo(EState *estate,
							  PartitionTupleRouting *proute, Oid partoid,
							  PartitionDispatch parent_pd, int partidx,
							  ResultRelInfo *rootResultRelInfo)
{
	Relation	rel;
	PartitionDesc partdesc;
	PartitionDispatch pd;
	int			dispatchidx;
	MemoryContext oldcxt;

	/*
	 * For data modification, it is better that executor does not include
	 * partitions being detached, except when running in snapshot-isolation
	 * mode.  This means that a read-committed transaction immediately gets a
	 * "no partition for tuple" error when a tuple is inserted into a
	 * partition that's being detached concurrently, but a transaction in
	 * repeatable-read mode can still use such a partition.
	 */
	if (estate->es_partition_directory == NULL)
		estate->es_partition_directory =
			CreatePartitionDirectory(estate->es_query_cxt,
									 !IsolationUsesXactSnapshot());

	oldcxt = MemoryContextSwitchTo(proute->memcxt);

	/*
	 * Only sub-partitioned tables need to be locked here.  The root
	 * partitioned table will already have been locked as it's referenced in
	 * the query's rtable.
	 */
	if (partoid != RelationGetRelid(proute->partition_root))
		rel = table_open(partoid, RowExclusiveLock);
	else
		rel = proute->partition_root;
	partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel);

	pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) +
									partdesc->nparts * sizeof(int));
	pd->reldesc = rel;
	pd->key = RelationGetPartitionKey(rel);
	pd->keystate = NIL;
	pd->partdesc = partdesc;
	if (parent_pd != NULL)
	{
		TupleDesc	tupdesc = RelationGetDescr(rel);

		/*
		 * For sub-partitioned tables where the column order differs from its
		 * direct parent partitioned table, we must store a tuple table slot
		 * initialized with its tuple descriptor and a tuple conversion map to
		 * convert a tuple from its parent's rowtype to its own.  This is to
		 * make sure that we are looking at the correct row using the correct
		 * tuple descriptor when computing its partition key for tuple
		 * routing.
		 */
		pd->tupmap = build_attrmap_by_name_if_req(RelationGetDescr(parent_pd->reldesc),
												  tupdesc,
												  false);
		pd->tupslot = pd->tupmap ?
			MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual) : NULL;
	}
	else
	{
		/* Not required for the root partitioned table */
		pd->tupmap = NULL;
		pd->tupslot = NULL;
	}

	/*
	 * Initialize with -1 to signify that the corresponding partition's
	 * ResultRelInfo or PartitionDispatch has not been created yet.
	 */
	memset(pd->indexes, -1, sizeof(int) * partdesc->nparts);

	/* Track in PartitionTupleRouting for later use */
	dispatchidx = proute->num_dispatch++;

	/* Allocate or enlarge the array, as needed */
	if (proute->num_dispatch >= proute->max_dispatch)
	{
		if (proute->max_dispatch == 0)
		{
			proute->max_dispatch = 4;
			proute->partition_dispatch_info = (PartitionDispatch *)
				palloc(sizeof(PartitionDispatch) * proute->max_dispatch);
			proute->nonleaf_partitions = (ResultRelInfo **)
				palloc(sizeof(ResultRelInfo *) * proute->max_dispatch);
		}
		else
		{
			proute->max_dispatch *= 2;
			proute->partition_dispatch_info = (PartitionDispatch *)
				repalloc(proute->partition_dispatch_info,
						 sizeof(PartitionDispatch) * proute->max_dispatch);
			proute->nonleaf_partitions = (ResultRelInfo **)
				repalloc(proute->nonleaf_partitions,
						 sizeof(ResultRelInfo *) * proute->max_dispatch);
		}
	}
	proute->partition_dispatch_info[dispatchidx] = pd;

	/*
	 * If setting up a PartitionDispatch for a sub-partitioned table, we may
	 * also need a minimally valid ResultRelInfo for checking the partition
	 * constraint later; set that up now.
	 */
	if (parent_pd)
	{
		ResultRelInfo *rri = makeNode(ResultRelInfo);

		InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0);
		proute->nonleaf_partitions[dispatchidx] = rri;
	}
	else
		proute->nonleaf_partitions[dispatchidx] = NULL;

	/*
	 * Finally, if setting up a PartitionDispatch for a sub-partitioned table,
	 * install a downlink in the parent to allow quick descent.
	 */
	if (parent_pd)
	{
		Assert(parent_pd->indexes[partidx] == -1);
		parent_pd->indexes[partidx] = dispatchidx;
	}

	MemoryContextSwitchTo(oldcxt);

	return pd;
}

/*
 * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple
 * routing.
 *
 * Close all the partitioned tables, leaf partitions, and their indices.
 */
void
ExecCleanupTupleRouting(ModifyTableState *mtstate,
						PartitionTupleRouting *proute)
{
	int			i;

	/*
	 * Remember, proute->partition_dispatch_info[0] corresponds to the root
	 * partitioned table, which we must not try to close, because it is the
	 * main target table of the query that will be closed by callers such as
	 * ExecEndPlan() or DoCopy().  Also, tupslot is NULL for the root
	 * partitioned table.
	 */
	for (i = 1; i < proute->num_dispatch; i++)
	{
		PartitionDispatch pd = proute->partition_dispatch_info[i];

		table_close(pd->reldesc, NoLock);

		if (pd->tupslot)
			ExecDropSingleTupleTableSlot(pd->tupslot);
	}

	for (i = 0; i < proute->num_partitions; i++)
	{
		ResultRelInfo *resultRelInfo = proute->partitions[i];

		/* Allow any FDWs to shut down */
		if (resultRelInfo->ri_FdwRoutine != NULL &&
			resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL)
			resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state,
														   resultRelInfo);

		/*
		 * Close it if it's not one of the result relations borrowed from the
		 * owning ModifyTableState; those will be closed by ExecEndPlan().
		 */
		if (proute->is_borrowed_rel[i])
			continue;

		ExecCloseIndices(resultRelInfo);
		table_close(resultRelInfo->ri_RelationDesc, NoLock);
	}
}

/* ----------------
 *		FormPartitionKeyDatum
 *			Construct values[] and isnull[] arrays for the partition key
 *			of a tuple.
 *
 *	pd				Partition dispatch object of the partitioned table
 *	slot			Heap tuple from which to extract partition key
 *	estate			executor state for evaluating any partition key
 *					expressions (must be non-NULL)
 *	values			Array of partition key Datums (output area)
 *	isnull			Array of is-null indicators (output area)
 *
 * the ecxt_scantuple slot of estate's per-tuple expr context must point to
 * the heap tuple passed in.
 * ----------------
 */
static void
FormPartitionKeyDatum(PartitionDispatch pd,
					  TupleTableSlot *slot,
					  EState *estate,
					  Datum *values,
					  bool *isnull)
{
	ListCell   *partexpr_item;
	int			i;

	if (pd->key->partexprs != NIL && pd->keystate == NIL)
	{
		/* Check caller has set up context correctly */
		Assert(estate != NULL &&
			   GetPerTupleExprContext(estate)->ecxt_scantuple == slot);

		/* First time through, set up expression evaluation state */
		pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
	}

	partexpr_item = list_head(pd->keystate);
	for (i = 0; i < pd->key->partnatts; i++)
	{
		AttrNumber	keycol = pd->key->partattrs[i];
		Datum		datum;
		bool		isNull;

		if (keycol != 0)
		{
			/* Plain column; get the value directly from the heap tuple */
			datum = slot_getattr(slot, keycol, &isNull);
		}
		else
		{
			/* Expression; need to evaluate it */
			if (partexpr_item == NULL)
				elog(ERROR, "wrong number of partition key expressions");
			datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
											  GetPerTupleExprContext(estate),
											  &isNull);
			partexpr_item = lnext(pd->keystate, partexpr_item);
		}
		values[i] = datum;
		isnull[i] = isNull;
	}

	if (partexpr_item != NULL)
		elog(ERROR, "wrong number of partition key expressions");
}

/*
 * The number of times the same partition must be found in a row before we
 * switch from a binary search for the given values to just checking if the
 * values belong to the last found partition.  This must be above 0.
 */
#define PARTITION_CACHED_FIND_THRESHOLD			16
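
/*
 * Illustrative sketch (not part of the original file): the caching policy
 * that get_partition_for_tuple() below applies to LIST and RANGE lookups,
 * reduced to its essentials.  A lookup uses the full (binary-search) path
 * until the same answer has been produced PARTITION_CACHED_FIND_THRESHOLD
 * times in a row; after that, the cheaper "recheck the last answer" path is
 * tried first.  The struct and function names here are hypothetical, and the
 * guard macro merely keeps this sketch out of the build.
 */
#ifdef EXEC_PARTITION_CACHE_SKETCH
typedef struct CachedLookupSketch
{
	int			last_result;	/* last value returned by full_search() */
	int			last_count;		/* how many consecutive times it matched */
} CachedLookupSketch;

static int
cached_lookup_sketch(CachedLookupSketch *cache,
					 int (*full_search) (void *arg),
					 bool (*still_matches) (void *arg, int result),
					 void *arg)
{
	int			result;

	/* Try the cheap recheck only once the threshold has been reached. */
	if (cache->last_count >= PARTITION_CACHED_FIND_THRESHOLD &&
		still_matches(arg, cache->last_result))
		return cache->last_result;

	/* Otherwise do the expensive search and maintain the counter. */
	result = full_search(arg);
	if (result == cache->last_result)
		cache->last_count++;
	else
	{
		cache->last_result = result;
		cache->last_count = 1;
	}
	return result;
}
#endif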

/*
 * get_partition_for_tuple
 *		Finds partition of relation which accepts the partition key specified
 *		in values and isnull.
 *
 * Calling this function can be quite expensive when LIST and RANGE
 * partitioned tables have many partitions.  This is due to the binary search
 * that's done to find the correct partition.  Many of the use cases for LIST
 * and RANGE partitioned tables make it likely that the same partition is
 * found in subsequent ExecFindPartition() calls.  This is especially true for
 * cases such as RANGE partitioned tables on a TIMESTAMP column where the
 * partition key is the current time.  When asked to find a partition for a
 * RANGE or LIST partitioned table, we record the partition index and datum
 * offset we've found for the given 'values' in the PartitionDesc (which is
 * stored in relcache), and if we keep finding the same partition
 * PARTITION_CACHED_FIND_THRESHOLD times in a row, then we'll enable caching
 * logic and instead of performing a binary search to find the correct
 * partition, we'll just double-check that 'values' still belong to the last
 * found partition, and if so, we'll return that partition index, thus
 * skipping the need for the binary search.  If we fail to match the last
 * partition when double checking, then we fall back on doing a binary search.
 * In this case, unless we find 'values' belong to the DEFAULT partition,
 * we'll reset the number of times we've hit the same partition so that we
 * don't attempt to use the cache again until we've found that partition at
 * least PARTITION_CACHED_FIND_THRESHOLD times in a row.
 *
 * For cases where the partition changes on each lookup, the amount of
 * additional work required just amounts to recording the last found partition
 * and bound offset then resetting the found counter.  This is cheap and does
 * not appear to cause any meaningful slowdowns for such cases.
 *
 * No caching of partitions is done when the last found partition is the
 * DEFAULT or NULL partition.  For the case of the DEFAULT partition, there
 * is no bound offset storing the matching datum, so we cannot confirm the
 * indexes match.  For the NULL partition, this is just so cheap, there's no
 * need.
 *
 * Return value is index of the partition (>= 0 and < partdesc->nparts) if one
 * found or -1 if none found.
 */
static int
get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
{
	int			bound_offset = -1;
	int			part_index = -1;
	PartitionKey key = pd->key;
	PartitionDesc partdesc = pd->partdesc;
	PartitionBoundInfo boundinfo = partdesc->boundinfo;

	/*
	 * In the switch statement below, when we perform a cached lookup for
	 * RANGE and LIST partitioned tables, if we find that the last found
	 * partition matches the 'values', we return the partition index right
	 * away.  We do this instead of breaking out of the switch as we don't
	 * want to execute the code about the DEFAULT partition or do any updates
	 * for any of the cache-related fields.  That would be a waste of effort
	 * as we already know it's not the DEFAULT partition and have no need to
	 * increment the number of times we found the same partition any higher
	 * than PARTITION_CACHED_FIND_THRESHOLD.
	 */

	/* Route as appropriate based on partitioning strategy. */
	switch (key->strategy)
	{
		case PARTITION_STRATEGY_HASH:
			{
				uint64		rowHash;

				/* hash partitioning is too cheap to bother caching */
				rowHash = compute_partition_hash_value(key->partnatts,
													   key->partsupfunc,
													   key->partcollation,
													   values, isnull);

				/*
				 * HASH partitions can't have a DEFAULT partition and we don't
				 * do any caching work for them, so just return the part index
				 */
				return boundinfo->indexes[rowHash % boundinfo->nindexes];
			}

		case PARTITION_STRATEGY_LIST:
			if (isnull[0])
			{
				/* this is far too cheap to bother doing any caching */
				if (partition_bound_accepts_nulls(boundinfo))
				{
					/*
					 * When there is a NULL partition we just return that
					 * directly.  We don't have a bound_offset so it's not
					 * valid to drop into the code after the switch which
					 * checks and updates the cache fields.  We perhaps should
					 * be invalidating the details of the last cached
					 * partition but there's no real need to.  Keeping those
					 * fields set gives a chance at matching to the cached
					 * partition on the next lookup.
					 */
					return boundinfo->null_index;
				}
			}
			else
			{
				bool		equal;

				if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
				{
					int			last_datum_offset = partdesc->last_found_datum_index;
					Datum		lastDatum = boundinfo->datums[last_datum_offset][0];
					int32		cmpval;

					/* does the last found datum index match this datum? */
					cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
															 key->partcollation[0],
															 lastDatum,
															 values[0]));

					if (cmpval == 0)
						return boundinfo->indexes[last_datum_offset];

					/* fall-through and do a manual lookup */
				}

				bound_offset = partition_list_bsearch(key->partsupfunc,
													  key->partcollation,
													  boundinfo,
													  values[0], &equal);
				if (bound_offset >= 0 && equal)
					part_index = boundinfo->indexes[bound_offset];
			}
			break;

		case PARTITION_STRATEGY_RANGE:
			{
				bool		equal = false,
							range_partkey_has_null = false;
				int			i;

				/*
				 * No range includes NULL, so this will be accepted by the
				 * default partition if there is one, and otherwise rejected.
				 */
				for (i = 0; i < key->partnatts; i++)
				{
					if (isnull[i])
					{
						range_partkey_has_null = true;
						break;
					}
				}

				/* NULLs belong in the DEFAULT partition */
				if (range_partkey_has_null)
					break;

				if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
				{
					int			last_datum_offset = partdesc->last_found_datum_index;
					Datum	   *lastDatums = boundinfo->datums[last_datum_offset];
					PartitionRangeDatumKind *kind = boundinfo->kind[last_datum_offset];
					int32		cmpval;

					/* check if the value is >= to the lower bound */
					cmpval = partition_rbound_datum_cmp(key->partsupfunc,
														key->partcollation,
														lastDatums,
														kind,
														values,
														key->partnatts);

					/*
					 * If it's equal to the lower bound then no need to check
					 * the upper bound.
					 */
					if (cmpval == 0)
						return boundinfo->indexes[last_datum_offset + 1];

					if (cmpval < 0 && last_datum_offset + 1 < boundinfo->ndatums)
					{
						/* check if the value is below the upper bound */
						lastDatums = boundinfo->datums[last_datum_offset + 1];
						kind = boundinfo->kind[last_datum_offset + 1];
						cmpval = partition_rbound_datum_cmp(key->partsupfunc,
															key->partcollation,
															lastDatums,
															kind,
															values,
															key->partnatts);

						if (cmpval > 0)
							return boundinfo->indexes[last_datum_offset + 1];
					}
					/* fall-through and do a manual lookup */
				}

				bound_offset = partition_range_datum_bsearch(key->partsupfunc,
															 key->partcollation,
															 boundinfo,
															 key->partnatts,
															 values,
															 &equal);

				/*
				 * The bound at bound_offset is less than or equal to the
				 * tuple value, so the bound at offset+1 is the upper bound of
				 * the partition we're looking for, if there actually exists
				 * one.
				 */
				part_index = boundinfo->indexes[bound_offset + 1];
			}
			break;

		default:
			elog(ERROR, "unexpected partition strategy: %d",
				 (int) key->strategy);
	}

	/*
	 * part_index < 0 means we failed to find a partition of this parent. Use
	 * the default partition, if there is one.
	 */
	if (part_index < 0)
	{
		/*
		 * No need to reset the cache fields here.  The next set of values
		 * might end up belonging to the cached partition, so leaving the
		 * cache alone improves the chances of a cache hit on the next lookup.
		 */
		return boundinfo->default_index;
	}

	/* we should only make it here when the code above set bound_offset */
	Assert(bound_offset >= 0);

	/*
	 * Attend to the cache fields.  If the bound_offset matches the last
	 * cached bound offset then we've found the same partition as last time,
	 * so bump the count by one.  If all goes well, we'll eventually reach
	 * PARTITION_CACHED_FIND_THRESHOLD and try the cache path next time
	 * around.  Otherwise, we'll reset the cache count back to 1 to mark that
	 * we've found this partition for the first time.
	 */
	if (bound_offset == partdesc->last_found_datum_index)
		partdesc->last_found_count++;
	else
	{
		partdesc->last_found_count = 1;
		partdesc->last_found_part_index = part_index;
		partdesc->last_found_datum_index = bound_offset;
	}

	return part_index;
}

/*
 * ExecBuildSlotPartitionKeyDescription
 *
 * This works very much like BuildIndexValueDescription() and is currently
 * used for building error messages when ExecFindPartition() fails to find
 * partition for a row.
 */
static char *
ExecBuildSlotPartitionKeyDescription(Relation rel,
									 Datum *values,
									 bool *isnull,
									 int maxfieldlen)
{
	StringInfoData buf;
	PartitionKey key = RelationGetPartitionKey(rel);
	int			partnatts = get_partition_natts(key);
	int			i;
	Oid			relid = RelationGetRelid(rel);
	AclResult	aclresult;

	if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED)
		return NULL;

	/* If the user has table-level access, just go build the description. */
	aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT);
	if (aclresult != ACLCHECK_OK)
	{
		/*
		 * Step through the columns of the partition key and make sure the
		 * user has SELECT rights on all of them.
		 */
		for (i = 0; i < partnatts; i++)
		{
			AttrNumber	attnum = get_partition_col_attnum(key, i);

			/*
			 * If this partition key column is an expression, we return no
			 * detail rather than try to figure out what column(s) the
			 * expression includes and if the user has SELECT rights on them.
			 */
			if (attnum == InvalidAttrNumber ||
				pg_attribute_aclcheck(relid, attnum, GetUserId(),
									  ACL_SELECT) != ACLCHECK_OK)
				return NULL;
		}
	}

	initStringInfo(&buf);
	appendStringInfo(&buf, "(%s) = (",
					 pg_get_partkeydef_columns(relid, true));

	for (i = 0; i < partnatts; i++)
	{
		char	   *val;
		int			vallen;

		if (isnull[i])
			val = "null";
		else
		{
			Oid			foutoid;
			bool		typisvarlena;

			getTypeOutputInfo(get_partition_col_typid(key, i),
							  &foutoid, &typisvarlena);
			val = OidOutputFunctionCall(foutoid, values[i]);
		}

		if (i > 0)
			appendStringInfoString(&buf, ", ");

		/* truncate if needed */
		vallen = strlen(val);
		if (vallen <= maxfieldlen)
			appendBinaryStringInfo(&buf, val, vallen);
		else
		{
			vallen = pg_mbcliplen(val, vallen, maxfieldlen);
			appendBinaryStringInfo(&buf, val, vallen);
			appendStringInfoString(&buf, "...");
		}
	}

	appendStringInfoChar(&buf, ')');

	return buf.data;
}

/*
 * adjust_partition_colnos
 *		Adjust the list of UPDATE target column numbers to account for
 *		attribute differences between the parent and the partition.
 *
 * Note: mustn't be called if no adjustment is required.
 */
static List *
adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri)
{
	TupleConversionMap *map = ExecGetChildToRootMap(leaf_part_rri);

	Assert(map != NULL);

	return adjust_partition_colnos_using_map(colnos, map->attrMap);
}

/*
 * adjust_partition_colnos_using_map
 *		Like adjust_partition_colnos, but uses a caller-supplied map instead
 *		of assuming to map from the "root" result relation.
 *
 * Note: mustn't be called if no adjustment is required.
 */
static List *
adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap)
{
	List	   *new_colnos = NIL;
	ListCell   *lc;

	Assert(attrMap != NULL);	/* else we shouldn't be here */

	foreach(lc, colnos)
	{
		AttrNumber	parentattrno = lfirst_int(lc);

		if (parentattrno <= 0 ||
			parentattrno > attrMap->maplen ||
			attrMap->attnums[parentattrno - 1] == 0)
			elog(ERROR, "unexpected attno %d in target column list",
				 parentattrno);
		new_colnos = lappend_int(new_colnos,
								 attrMap->attnums[parentattrno - 1]);
	}

	return new_colnos;
}

/*-------------------------------------------------------------------------
 * Run-Time Partition Pruning Support.
 *
 * The following series of functions exist to support the removal of unneeded
 * subplans for queries against partitioned tables.  The supporting functions
 * here are designed to work with any plan type which supports an arbitrary
 * number of subplans, e.g. Append, MergeAppend.
 *
 * When pruning involves comparison of a partition key to a constant, it's
 * done by the planner.  However, if we have a comparison to a non-constant
 * but not volatile expression, that presents an opportunity for run-time
 * pruning by the executor, allowing irrelevant partitions to be skipped
 * dynamically.
 *
 * We must distinguish expressions containing PARAM_EXEC Params from
 * expressions that don't contain those.  Even though a PARAM_EXEC Param is
 * considered to be a stable expression, it can change value from one plan
 * node scan to the next during query execution.  Stable comparison
 * expressions that don't involve such Params allow partition pruning to be
 * done once during executor startup.  Expressions that do involve such Params
 * require us to prune separately for each scan of the parent plan node.
 *
 * Note that pruning away unneeded subplans during executor startup has the
 * added benefit of not having to initialize the unneeded subplans at all.
 *
 * Functions:
 *
 * ExecInitPartitionPruning:
 *		Creates the PartitionPruneState required by ExecFindMatchingSubPlans.
 *		Details stored include how to map the partition index returned by the
 *		partition pruning code into subplan indexes.  Also determines the set
 *		of subplans to initialize considering the result of performing initial
 *		pruning steps if any.  Maps in PartitionPruneState are updated to
 *		account for initial pruning possibly having eliminated some of the
 *		subplans.
 *
 * ExecFindMatchingSubPlans:
 *		Returns indexes of matching subplans after evaluating the expressions
 *		that are safe to evaluate at a given point.  This function is first
 *		called during ExecInitPartitionPruning() to find the initially
 *		matching subplans based on performing the initial pruning steps and
 *		then must be called again each time the value of a Param listed in
 *		PartitionPruneState's 'execparamids' changes.
 *-------------------------------------------------------------------------
 */
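
/*
 * Illustrative sketch (not part of the original file): how a plan node such
 * as Append is assumed to drive the two entry points described above -- once
 * at node initialization to obtain the initially valid subplans, and again
 * during execution whenever a PARAM_EXEC Param used by the pruning
 * expressions changes.  The function names here are hypothetical, and the
 * guard macro merely keeps this sketch out of the build.
 */
#ifdef EXEC_PARTITION_PRUNE_SKETCH
static Bitmapset *
prune_sketch_init(PlanState *parent, PartitionPruneInfo *pruneinfo,
				  int nplans, PartitionPruneState **prunestate_out)
{
	Bitmapset  *validsubplans;

	/* Performs any initial pruning and reports the surviving subplans. */
	*prunestate_out = ExecInitPartitionPruning(parent, nplans, pruneinfo,
											   &validsubplans);
	return validsubplans;
}

static Bitmapset *
prune_sketch_rescan(PartitionPruneState *prunestate)
{
	/* Re-evaluates only the exec-time pruning steps. */
	return ExecFindMatchingSubPlans(prunestate, false);
}
#endif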

/*
 * ExecInitPartitionPruning
 *		Initialize data structure needed for run-time partition pruning and
 *		do initial pruning if needed
 *
 * On return, *initially_valid_subplans is assigned the set of indexes of
 * child subplans that must be initialized along with the parent plan node.
 * Initial pruning is performed here if needed and in that case only the
 * surviving subplans' indexes are added.
 *
 * If subplans are indeed pruned, subplan_map arrays contained in the returned
 * PartitionPruneState are re-sequenced to not count those, though only if the
 * maps will be needed for subsequent execution pruning passes.
 */
PartitionPruneState *
ExecInitPartitionPruning(PlanState *planstate,
						 int n_total_subplans,
						 PartitionPruneInfo *pruneinfo,
						 Bitmapset **initially_valid_subplans)
{
	PartitionPruneState *prunestate;
	EState	   *estate = planstate->state;

	/* We may need an expression context to evaluate partition exprs */
	ExecAssignExprContext(estate, planstate);

	/* Create the working data structure for pruning */
	prunestate = CreatePartitionPruneState(planstate, pruneinfo);

	/*
	 * Perform an initial partition prune pass, if required.
	 */
	if (prunestate->do_initial_prune)
		*initially_valid_subplans = ExecFindMatchingSubPlans(prunestate, true);
	else
	{
		/* No pruning, so we'll need to initialize all subplans */
		Assert(n_total_subplans > 0);
		*initially_valid_subplans = bms_add_range(NULL, 0,
												  n_total_subplans - 1);
	}

	/*
	 * Re-sequence subplan indexes contained in prunestate to account for any
	 * that were removed above due to initial pruning.  No need to do this if
	 * no steps were removed.
	 */
	if (bms_num_members(*initially_valid_subplans) < n_total_subplans)
	{
		/*
		 * We can safely skip this when !do_exec_prune, even though that
		 * leaves invalid data in prunestate, because that data won't be
		 * consulted again (cf initial Assert in ExecFindMatchingSubPlans).
		 */
		if (prunestate->do_exec_prune)
			PartitionPruneFixSubPlanMap(prunestate,
										*initially_valid_subplans,
										n_total_subplans);
	}

	return prunestate;
}
1848 * CreatePartitionPruneState
1849 * Build the data structure required for calling ExecFindMatchingSubPlans
1851 * 'planstate' is the parent plan node's execution state.
1853 * 'pruneinfo' is a PartitionPruneInfo as generated by
1854 * make_partition_pruneinfo. Here we build a PartitionPruneState containing a
1855 * PartitionPruningData for each partitioning hierarchy (i.e., each sublist of
1856 * pruneinfo->prune_infos), each of which contains a PartitionedRelPruningData
1857 * for each PartitionedRelPruneInfo appearing in that sublist. This two-level
1858 * system is needed to keep from confusing the different hierarchies when a
1859 * UNION ALL contains multiple partitioned tables as children. The data
1860 * stored in each PartitionedRelPruningData can be re-used each time we
1861 * re-evaluate which partitions match the pruning steps provided in each
1862 * PartitionedRelPruneInfo.
static PartitionPruneState *
CreatePartitionPruneState(PlanState *planstate, PartitionPruneInfo *pruneinfo)
{
    EState     *estate = planstate->state;
    PartitionPruneState *prunestate;
    int         n_part_hierarchies;
    ListCell   *lc;
    int         i;
    ExprContext *econtext = planstate->ps_ExprContext;

    /* For data reading, executor always includes detached partitions */
    if (estate->es_partition_directory == NULL)
        estate->es_partition_directory =
            CreatePartitionDirectory(estate->es_query_cxt, false);

    n_part_hierarchies = list_length(pruneinfo->prune_infos);
    Assert(n_part_hierarchies > 0);

    /*
     * Allocate the data structure
     */
    prunestate = (PartitionPruneState *)
        palloc(offsetof(PartitionPruneState, partprunedata) +
               sizeof(PartitionPruningData *) * n_part_hierarchies);

    prunestate->execparamids = NULL;
    /* other_subplans can change at runtime, so we need our own copy */
    prunestate->other_subplans = bms_copy(pruneinfo->other_subplans);
    prunestate->do_initial_prune = false;   /* may be set below */
    prunestate->do_exec_prune = false;      /* may be set below */
    prunestate->num_partprunedata = n_part_hierarchies;

    /*
     * Create a short-term memory context which we'll use when making calls to
     * the partition pruning functions.  This avoids possible memory leaks,
     * since the pruning functions call comparison functions that aren't under
     * our control.
     */
    prunestate->prune_context =
        AllocSetContextCreate(CurrentMemoryContext,
                              "Partition Prune",
                              ALLOCSET_DEFAULT_SIZES);

    i = 0;
    foreach(lc, pruneinfo->prune_infos)
    {
        List       *partrelpruneinfos = lfirst_node(List, lc);
        int         npartrelpruneinfos = list_length(partrelpruneinfos);
        PartitionPruningData *prunedata;
        ListCell   *lc2;
        int         j;

        prunedata = (PartitionPruningData *)
            palloc(offsetof(PartitionPruningData, partrelprunedata) +
                   npartrelpruneinfos * sizeof(PartitionedRelPruningData));
        prunestate->partprunedata[i] = prunedata;
        prunedata->num_partrelprunedata = npartrelpruneinfos;

        j = 0;
        foreach(lc2, partrelpruneinfos)
        {
            PartitionedRelPruneInfo *pinfo = lfirst_node(PartitionedRelPruneInfo, lc2);
            PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
            Relation    partrel;
            PartitionDesc partdesc;
            PartitionKey partkey;

            /*
             * We can rely on the copies of the partitioned table's partition
             * key and partition descriptor appearing in its relcache entry,
             * because that entry will be held open and locked for the
             * duration of this executor run.
             */
            partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex);
            partkey = RelationGetPartitionKey(partrel);
            partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
                                                partrel);

            /*
             * Initialize the subplan_map and subpart_map.
             *
             * The set of partitions that exist now might not be the same that
             * existed when the plan was made.  The normal case is that it is;
             * optimize for that case with a quick comparison, and just copy
             * the subplan_map and make subpart_map point to the one in the
             * PruneInfo.
             *
             * For the case where they aren't identical, we could have more
             * partitions on either side; or even exactly the same number of
             * them on both but the set of OIDs doesn't match fully.  Handle
             * this by creating new subplan_map and subpart_map arrays that
             * correspond to the ones in the PruneInfo where the new
             * partition descriptor's OIDs match.  Any that don't match can be
             * set to -1, as if they were pruned.  By construction, both
             * arrays are in partition bounds order.
             */
            pprune->nparts = partdesc->nparts;
            pprune->subplan_map = palloc(sizeof(int) * partdesc->nparts);

            if (partdesc->nparts == pinfo->nparts &&
                memcmp(partdesc->oids, pinfo->relid_map,
                       sizeof(int) * partdesc->nparts) == 0)
            {
                pprune->subpart_map = pinfo->subpart_map;
                memcpy(pprune->subplan_map, pinfo->subplan_map,
                       sizeof(int) * pinfo->nparts);
            }
            else
            {
                int         pd_idx = 0;
                int         pp_idx;

                /*
                 * When the partition arrays are not identical, there could be
                 * some new ones but it's also possible that one was removed;
                 * we cope with both situations by walking the arrays and
                 * discarding those that don't match.
                 *
                 * If the number of partitions on both sides match, it's still
                 * possible that one partition has been detached and another
                 * attached.  Cope with that by creating a map that skips any
                 * mismatches.
                 */
                pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts);

                for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++)
                {
                    /* Skip any InvalidOid relid_map entries */
                    while (pd_idx < pinfo->nparts &&
                           !OidIsValid(pinfo->relid_map[pd_idx]))
                        pd_idx++;

            recheck:
                    if (pd_idx < pinfo->nparts &&
                        pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx])
                    {
                        /* Exact match: copy both map entries */
                        pprune->subplan_map[pp_idx] =
                            pinfo->subplan_map[pd_idx];
                        pprune->subpart_map[pp_idx] =
                            pinfo->subpart_map[pd_idx];
                        pd_idx++;
                        continue;
                    }

                    /*
                     * There isn't an exact match in the corresponding
                     * positions of both arrays.  Peek ahead in
                     * pinfo->relid_map to see if we have a match for the
                     * current partition in partdesc.  Normally if a match
                     * exists it's just one element ahead, and it means the
                     * planner saw one extra partition that we no longer see
                     * now (its concurrent detach finished just in between);
                     * so we skip that one by updating pd_idx to the new
                     * location and jumping above.  We can then continue to
                     * match the rest of the elements after skipping the OID
                     * with no match; no future matches are tried for the
                     * element that was skipped, because we know the arrays to
                     * be in the same order.
                     *
                     * If we don't see a match anywhere in the rest of the
                     * pinfo->relid_map array, that means we see an element
                     * now that the planner didn't see, so mark that one as
                     * pruned and move on.
                     */
                    for (int pd_idx2 = pd_idx + 1; pd_idx2 < pinfo->nparts; pd_idx2++)
                    {
                        if (pd_idx2 >= pinfo->nparts)
                            break;
                        if (pinfo->relid_map[pd_idx2] == partdesc->oids[pp_idx])
                        {
                            /* Found it further ahead; resume matching there */
                            pd_idx = pd_idx2;
                            goto recheck;
                        }
                    }

                    /* No match anywhere: treat this partition as pruned */
                    pprune->subpart_map[pp_idx] = -1;
                    pprune->subplan_map[pp_idx] = -1;
                }
            }
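
            /*
             * Worked example (illustrative only, not part of the original
             * code): suppose the planner saw partitions with OIDs
             * {100, 200, 300} in pinfo->relid_map, but partition 200 has
             * since been detached, so partdesc->oids is {100, 300}.  The walk
             * above maps descriptor index 0 to relid_map index 0, skips OID
             * 200 by advancing pd_idx, and maps descriptor index 1 to
             * relid_map index 2.  A partition attached after planning shows
             * up in partdesc->oids with no relid_map match and gets -1 in
             * both maps, exactly as if it had been pruned.
             */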

            /* present_parts is also subject to later modification */
            pprune->present_parts = bms_copy(pinfo->present_parts);

            /*
             * Initialize pruning contexts as needed.  Note that we must skip
             * execution-time partition pruning in EXPLAIN (GENERIC_PLAN),
             * since parameter values may be missing.
             */
            pprune->initial_pruning_steps = pinfo->initial_pruning_steps;
            if (pinfo->initial_pruning_steps &&
                !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC))
            {
                InitPartitionPruneContext(&pprune->initial_context,
                                          pinfo->initial_pruning_steps,
                                          partdesc, partkey, planstate,
                                          econtext);
                /* Record whether initial pruning is needed at any level */
                prunestate->do_initial_prune = true;
            }
            pprune->exec_pruning_steps = pinfo->exec_pruning_steps;
            if (pinfo->exec_pruning_steps &&
                !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC))
            {
                InitPartitionPruneContext(&pprune->exec_context,
                                          pinfo->exec_pruning_steps,
                                          partdesc, partkey, planstate,
                                          econtext);
                /* Record whether exec pruning is needed at any level */
                prunestate->do_exec_prune = true;
            }

            /*
             * Accumulate the IDs of all PARAM_EXEC Params affecting the
             * partitioning decisions at this plan node.
             */
            prunestate->execparamids = bms_add_members(prunestate->execparamids,
                                                       pinfo->execparamids);

            j++;
        }
        i++;
    }

    return prunestate;
}
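
/*
 * Illustrative layout (not part of the original code): for a query such as
 *      SELECT ... FROM p1 UNION ALL SELECT ... FROM p2
 * where p1 and p2 are both partitioned, pruneinfo->prune_infos contains two
 * sublists, so prunestate->partprunedata[] ends up with two
 * PartitionPruningData entries, one per hierarchy.  Each of those holds one
 * PartitionedRelPruningData per partitioned table in its hierarchy, with the
 * hierarchy's root table in the zeroth slot and any sub-partitioned children
 * after it.
 */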

/*
 * Initialize a PartitionPruneContext for the given list of pruning steps.
 */
static void
InitPartitionPruneContext(PartitionPruneContext *context,
                          List *pruning_steps,
                          PartitionDesc partdesc,
                          PartitionKey partkey,
                          PlanState *planstate,
                          ExprContext *econtext)
{
    int         n_steps;
    int         partnatts;
    ListCell   *lc;

    n_steps = list_length(pruning_steps);

    context->strategy = partkey->strategy;
    context->partnatts = partnatts = partkey->partnatts;
    context->nparts = partdesc->nparts;
    context->boundinfo = partdesc->boundinfo;
    context->partcollation = partkey->partcollation;
    context->partsupfunc = partkey->partsupfunc;

    /* We'll look up type-specific support functions as needed */
    context->stepcmpfuncs = (FmgrInfo *)
        palloc0(sizeof(FmgrInfo) * n_steps * partnatts);

    context->ppccontext = CurrentMemoryContext;
    context->planstate = planstate;
    context->exprcontext = econtext;

    /* Initialize expression state for each expression we need */
    context->exprstates = (ExprState **)
        palloc0(sizeof(ExprState *) * n_steps * partnatts);
    foreach(lc, pruning_steps)
    {
        PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc);
        ListCell   *lc2 = list_head(step->exprs);
        int         keyno;

        /* not needed for other step kinds */
        if (!IsA(step, PartitionPruneStepOp))
            continue;

        Assert(list_length(step->exprs) <= partnatts);

        for (keyno = 0; keyno < partnatts; keyno++)
        {
            if (bms_is_member(keyno, step->nullkeys))
                continue;

            if (lc2 != NULL)
            {
                Expr       *expr = lfirst(lc2);

                /* not needed for Consts */
                if (!IsA(expr, Const))
                {
                    int         stateidx = PruneCxtStateIdx(partnatts,
                                                            step->step.step_id,
                                                            keyno);

                    /*
                     * When planstate is NULL, pruning_steps is known not to
                     * contain any expressions that depend on the parent plan.
                     * Information of any available EXTERN parameters must be
                     * passed explicitly in that case, which the caller must
                     * have made available via econtext.
                     */
                    if (planstate == NULL)
                        context->exprstates[stateidx] =
                            ExecInitExprWithParams(expr,
                                                   econtext->ecxt_param_list_info);
                    else
                        context->exprstates[stateidx] =
                            ExecInitExpr(expr, context->planstate);
                }
                lc2 = lnext(step->exprs, lc2);
            }
        }
    }
}
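
/*
 * Illustrative note (not part of the original code): 'exprstates' and
 * 'stepcmpfuncs' are flat arrays of n_steps * partnatts slots, addressed by
 * (pruning step, partition key column) pairs through PruneCxtStateIdx().  For
 * example, with 3 pruning steps over a 2-column partition key the arrays have
 * 6 slots, and the expression for step_id 2, key column 1 is stored at
 * PruneCxtStateIdx(2, 2, 1).
 */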

/*
 * PartitionPruneFixSubPlanMap
 *      Fix mapping of partition indexes to subplan indexes contained in
 *      prunestate by considering the new list of subplans that survived
 *      initial pruning
 *
 * Current values of the indexes present in PartitionPruneState count all the
 * subplans that would be present before initial pruning was done.  If initial
 * pruning got rid of some of the subplans, any subsequent pruning passes will
 * be looking at a different set of target subplans to choose from than those
 * in the pre-initial-pruning set, so the maps in PartitionPruneState
 * containing those indexes must be updated to reflect the new indexes of
 * subplans in the post-initial-pruning set.
 */
static void
PartitionPruneFixSubPlanMap(PartitionPruneState *prunestate,
                            Bitmapset *initially_valid_subplans,
                            int n_total_subplans)
{
    int        *new_subplan_indexes;
    Bitmapset  *new_other_subplans;
    int         i;
    int         newidx;

    /*
     * First we must build a temporary array which maps old subplan indexes to
     * new ones.  For convenience of initialization, we use 1-based indexes in
     * this array and leave pruned items as 0.
     */
    new_subplan_indexes = (int *) palloc0(sizeof(int) * n_total_subplans);
    newidx = 1;
    i = -1;
    while ((i = bms_next_member(initially_valid_subplans, i)) >= 0)
    {
        Assert(i < n_total_subplans);
        new_subplan_indexes[i] = newidx++;
    }

    /*
     * Now we can update each PartitionedRelPruneInfo's subplan_map with new
     * subplan indexes.  We must also recompute its present_parts bitmap.
     */
    for (i = 0; i < prunestate->num_partprunedata; i++)
    {
        PartitionPruningData *prunedata = prunestate->partprunedata[i];
        int         j;

        /*
         * Within each hierarchy, we perform this loop in back-to-front order
         * so that we determine present_parts for the lowest-level partitioned
         * tables first.  This way we can tell whether a sub-partitioned
         * table's partitions were entirely pruned so we can exclude it from
         * the current level's present_parts.
         */
        for (j = prunedata->num_partrelprunedata - 1; j >= 0; j--)
        {
            PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
            int         nparts = pprune->nparts;
            int         k;

            /* We just rebuild present_parts from scratch */
            bms_free(pprune->present_parts);
            pprune->present_parts = NULL;

            for (k = 0; k < nparts; k++)
            {
                int         oldidx = pprune->subplan_map[k];
                int         subidx;

                /*
                 * If this partition existed as a subplan then change the old
                 * subplan index to the new subplan index.  The new index may
                 * become -1 if the partition was pruned above, or it may just
                 * come earlier in the subplan list due to some subplans being
                 * removed earlier in the list.  If it's a subpartition, add
                 * it to present_parts unless it's entirely pruned.
                 */
                if (oldidx >= 0)
                {
                    Assert(oldidx < n_total_subplans);
                    pprune->subplan_map[k] = new_subplan_indexes[oldidx] - 1;

                    if (new_subplan_indexes[oldidx] > 0)
                        pprune->present_parts =
                            bms_add_member(pprune->present_parts, k);
                }
                else if ((subidx = pprune->subpart_map[k]) >= 0)
                {
                    PartitionedRelPruningData *subprune;

                    subprune = &prunedata->partrelprunedata[subidx];

                    if (!bms_is_empty(subprune->present_parts))
                        pprune->present_parts =
                            bms_add_member(pprune->present_parts, k);
                }
            }
        }
    }

    /*
     * We must also recompute the other_subplans set, since indexes in it may
     * change.
     */
    new_other_subplans = NULL;
    i = -1;
    while ((i = bms_next_member(prunestate->other_subplans, i)) >= 0)
        new_other_subplans = bms_add_member(new_other_subplans,
                                            new_subplan_indexes[i] - 1);

    bms_free(prunestate->other_subplans);
    prunestate->other_subplans = new_other_subplans;

    pfree(new_subplan_indexes);
}
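
/*
 * Worked example (illustrative only, not part of the original code): with
 * n_total_subplans = 4 and initially_valid_subplans = {0, 2, 3},
 * new_subplan_indexes becomes [1, 0, 2, 3] (1-based, 0 meaning "pruned").
 * Old subplan index 2 is then remapped to 2 - 1 = 1, while old index 1 maps
 * to 0 - 1 = -1 and is excluded from the rebuilt present_parts.
 */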

/*
 * ExecFindMatchingSubPlans
 *      Determine which subplans match the pruning steps detailed in
 *      'prunestate' for the current comparison expression values.
 *
 * Pass initial_prune if PARAM_EXEC Params cannot yet be evaluated.  This
 * differentiates the initial executor-time pruning step from later
 * runtime pruning.
 */
Bitmapset *
ExecFindMatchingSubPlans(PartitionPruneState *prunestate,
                         bool initial_prune)
{
    Bitmapset  *result = NULL;
    MemoryContext oldcontext;
    int         i;

    /*
     * Either we're here on the initial prune done during pruning
     * initialization, or we're at a point where PARAM_EXEC Params can be
     * evaluated *and* there are steps in which to do so.
     */
    Assert(initial_prune || prunestate->do_exec_prune);

    /*
     * Switch to a temp context to avoid leaking memory in the executor's
     * query-lifespan memory context.
     */
    oldcontext = MemoryContextSwitchTo(prunestate->prune_context);

    /*
     * For each hierarchy, do the pruning tests, and add nondeletable
     * subplans' indexes to "result".
     */
    for (i = 0; i < prunestate->num_partprunedata; i++)
    {
        PartitionPruningData *prunedata = prunestate->partprunedata[i];
        PartitionedRelPruningData *pprune;

        /*
         * We pass the zeroth item, belonging to the root table of the
         * hierarchy, and find_matching_subplans_recurse() takes care of
         * recursing to other (lower-level) parents as needed.
         */
        pprune = &prunedata->partrelprunedata[0];
        find_matching_subplans_recurse(prunedata, pprune, initial_prune,
                                       &result);

        /* Expression eval may have used space in ExprContext too */
        if (pprune->exec_pruning_steps)
            ResetExprContext(pprune->exec_context.exprcontext);
    }

    /* Add in any subplans that partition pruning didn't account for */
    result = bms_add_members(result, prunestate->other_subplans);

    MemoryContextSwitchTo(oldcontext);

    /* Copy result out of the temp context before we reset it */
    result = bms_copy(result);

    MemoryContextReset(prunestate->prune_context);

    return result;
}
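
/*
 * Illustrative caller sketch (hypothetical, not part of this file): a parent
 * node such as Append would typically re-prune when one of the Params in
 * 'execparamids' has changed, along the lines of
 *
 *      if (bms_overlap(changed_params, prunestate->execparamids))
 *          valid_subplans = ExecFindMatchingSubPlans(prunestate, false);
 *
 * where 'changed_params' and 'valid_subplans' stand in for the caller's own
 * bookkeeping.
 */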

/*
 * find_matching_subplans_recurse
 *      Recursive worker function for ExecFindMatchingSubPlans
 *
 * Adds valid (non-prunable) subplan IDs to *validsubplans
 */
static void
find_matching_subplans_recurse(PartitionPruningData *prunedata,
                               PartitionedRelPruningData *pprune,
                               bool initial_prune,
                               Bitmapset **validsubplans)
{
    Bitmapset  *partset;
    int         i;

    /* Guard against stack overflow due to overly deep partition hierarchy. */
    check_stack_depth();

    /*
     * Prune as appropriate, if we have pruning steps matching the current
     * execution context.  Otherwise just include all partitions at this
     * level.
     */
    if (initial_prune && pprune->initial_pruning_steps)
        partset = get_matching_partitions(&pprune->initial_context,
                                          pprune->initial_pruning_steps);
    else if (!initial_prune && pprune->exec_pruning_steps)
        partset = get_matching_partitions(&pprune->exec_context,
                                          pprune->exec_pruning_steps);
    else
        partset = pprune->present_parts;

    /* Translate partset into subplan indexes */
    i = -1;
    while ((i = bms_next_member(partset, i)) >= 0)
    {
        if (pprune->subplan_map[i] >= 0)
            *validsubplans = bms_add_member(*validsubplans,
                                            pprune->subplan_map[i]);
        else
        {
            int         partidx = pprune->subpart_map[i];

            if (partidx >= 0)
                find_matching_subplans_recurse(prunedata,
                                               &prunedata->partrelprunedata[partidx],
                                               initial_prune, validsubplans);
            else
            {
                /*
                 * We get here if the planner already pruned all the sub-
                 * partitions for this partition.  Silently ignore this
                 * partition in this case.  The end result is the same: we
                 * would have pruned all partitions just the same, but we
                 * don't have any pruning steps to execute to verify this.