/*-------------------------------------------------------------------------
 *
 * partdesc.c
 *		Support routines for manipulating partition descriptors
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *		  src/backend/partitioning/partdesc.c
 *
 *-------------------------------------------------------------------------
 */
17 #include "access/genam.h"
18 #include "access/htup_details.h"
19 #include "access/table.h"
20 #include "catalog/partition.h"
21 #include "catalog/pg_inherits.h"
22 #include "partitioning/partbounds.h"
23 #include "partitioning/partdesc.h"
24 #include "storage/bufmgr.h"
25 #include "storage/sinval.h"
26 #include "utils/builtins.h"
27 #include "utils/fmgroids.h"
28 #include "utils/hsearch.h"
29 #include "utils/inval.h"
30 #include "utils/lsyscache.h"
31 #include "utils/memutils.h"
32 #include "utils/partcache.h"
33 #include "utils/rel.h"
34 #include "utils/syscache.h"
36 typedef struct PartitionDirectoryData
38 MemoryContext pdir_mcxt
;
41 } PartitionDirectoryData
;
43 typedef struct PartitionDirectoryEntry
48 } PartitionDirectoryEntry
;
50 static PartitionDesc
RelationBuildPartitionDesc(Relation rel
,
55 * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
57 * We keep two partdescs in relcache: rd_partdesc includes all partitions
58 * (even those being concurrently marked detached), while rd_partdesc_nodetach
59 * omits (some of) those. We store the pg_inherits.xmin value for the latter,
60 * to determine whether it can be validly reused in each case, since that
61 * depends on the active snapshot.
63 * Note: we arrange for partition descriptors to not get freed until the
64 * relcache entry's refcount goes to zero (see hacks in RelationClose,
65 * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
66 * though we hand back a direct pointer into the relcache entry, it's safe
67 * for callers to continue to use that pointer as long as (a) they hold the
68 * relation open, and (b) they hold a relation lock strong enough to ensure
69 * that the data doesn't become stale.
72 RelationGetPartitionDesc(Relation rel
, bool omit_detached
)
74 Assert(rel
->rd_rel
->relkind
== RELKIND_PARTITIONED_TABLE
);
77 * If relcache has a partition descriptor, use that. However, we can only
78 * do so when we are asked to include all partitions including detached;
79 * and also when we know that there are no detached partitions.
81 * If there is no active snapshot, detached partitions aren't omitted
82 * either, so we can use the cached descriptor too in that case.
84 if (likely(rel
->rd_partdesc
&&
85 (!rel
->rd_partdesc
->detached_exist
|| !omit_detached
||
86 !ActiveSnapshotSet())))
87 return rel
->rd_partdesc
;
90 * If we're asked to omit detached partitions, we may be able to use a
91 * cached descriptor too. We determine that based on the pg_inherits.xmin
92 * that was saved alongside that descriptor: if the xmin that was not in
93 * progress for that active snapshot is also not in progress for the
94 * current active snapshot, then we can use it. Otherwise build one from
98 rel
->rd_partdesc_nodetached
&&
103 Assert(TransactionIdIsValid(rel
->rd_partdesc_nodetached_xmin
));
104 activesnap
= GetActiveSnapshot();
106 if (!XidInMVCCSnapshot(rel
->rd_partdesc_nodetached_xmin
, activesnap
))
107 return rel
->rd_partdesc_nodetached
;
110 return RelationBuildPartitionDesc(rel
, omit_detached
);
114 * RelationBuildPartitionDesc
115 * Form rel's partition descriptor, and store in relcache entry
117 * Partition descriptor is a complex structure; to avoid complicated logic to
118 * free individual elements whenever the relcache entry is flushed, we give it
119 * its own memory context, a child of CacheMemoryContext, which can easily be
120 * deleted on its own. To avoid leaking memory in that context in case of an
121 * error partway through this function, the context is initially created as a
122 * child of CurTransactionContext and only re-parented to CacheMemoryContext
123 * at the end, when no further errors are possible. Also, we don't make this
124 * context the current context except in very brief code sections, out of fear
125 * that some of our callees allocate memory on their own which would be leaked
128 * As a special case, partition descriptors that are requested to omit
129 * partitions being detached (and which contain such partitions) are transient
130 * and are not associated with the relcache entry. Such descriptors only last
131 * through the requesting Portal, so we use the corresponding memory context
135 RelationBuildPartitionDesc(Relation rel
, bool omit_detached
)
137 PartitionDesc partdesc
;
138 PartitionBoundInfo boundinfo
= NULL
;
140 PartitionBoundSpec
**boundspecs
= NULL
;
142 bool *is_leaf
= NULL
;
145 TransactionId detached_xmin
;
149 PartitionKey key
= RelationGetPartitionKey(rel
);
150 MemoryContext new_pdcxt
;
151 MemoryContext oldcxt
;
155 * Get partition oids from pg_inherits. This uses a single snapshot to
156 * fetch the list of children, so while more children may be getting added
157 * concurrently, whatever this function returns will be accurate as of
158 * some well-defined point in time.
160 detached_exist
= false;
161 detached_xmin
= InvalidTransactionId
;
162 inhoids
= find_inheritance_children_extended(RelationGetRelid(rel
),
163 omit_detached
, NoLock
,
167 nparts
= list_length(inhoids
);
169 /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
172 oids
= (Oid
*) palloc(nparts
* sizeof(Oid
));
173 is_leaf
= (bool *) palloc(nparts
* sizeof(bool));
174 boundspecs
= palloc(nparts
* sizeof(PartitionBoundSpec
*));
177 /* Collect bound spec nodes for each partition. */
179 foreach(cell
, inhoids
)
181 Oid inhrelid
= lfirst_oid(cell
);
183 PartitionBoundSpec
*boundspec
= NULL
;
185 /* Try fetching the tuple from the catcache, for speed. */
186 tuple
= SearchSysCache1(RELOID
, inhrelid
);
187 if (HeapTupleIsValid(tuple
))
192 datum
= SysCacheGetAttr(RELOID
, tuple
,
193 Anum_pg_class_relpartbound
,
196 boundspec
= stringToNode(TextDatumGetCString(datum
));
197 ReleaseSysCache(tuple
);
201 * The system cache may be out of date; if so, we may find no pg_class
202 * tuple or an old one where relpartbound is NULL. In that case, try
203 * the table directly. We can't just AcceptInvalidationMessages() and
204 * retry the system cache lookup because it's possible that a
205 * concurrent ATTACH PARTITION operation has removed itself from the
206 * ProcArray but not yet added invalidation messages to the shared
207 * queue; InvalidateSystemCaches() would work, but seems excessive.
209 * Note that this algorithm assumes that PartitionBoundSpec we manage
210 * to fetch is the right one -- so this is only good enough for
211 * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
212 * some hypothetical operation that changes the partition bounds.
214 if (boundspec
== NULL
)
222 pg_class
= table_open(RelationRelationId
, AccessShareLock
);
225 BTEqualStrategyNumber
, F_OIDEQ
,
226 ObjectIdGetDatum(inhrelid
));
227 scan
= systable_beginscan(pg_class
, ClassOidIndexId
, true,
229 tuple
= systable_getnext(scan
);
230 datum
= heap_getattr(tuple
, Anum_pg_class_relpartbound
,
231 RelationGetDescr(pg_class
), &isnull
);
233 boundspec
= stringToNode(TextDatumGetCString(datum
));
234 systable_endscan(scan
);
235 table_close(pg_class
, AccessShareLock
);
240 elog(ERROR
, "missing relpartbound for relation %u", inhrelid
);
241 if (!IsA(boundspec
, PartitionBoundSpec
))
242 elog(ERROR
, "invalid relpartbound for relation %u", inhrelid
);
245 * If the PartitionBoundSpec says this is the default partition, its
246 * OID should match pg_partitioned_table.partdefid; if not, the
247 * catalog is corrupt.
249 if (boundspec
->is_default
)
253 partdefid
= get_default_partition_oid(RelationGetRelid(rel
));
254 if (partdefid
!= inhrelid
)
255 elog(ERROR
, "expected partdefid %u, but got %u",
256 inhrelid
, partdefid
);
261 is_leaf
[i
] = (get_rel_relkind(inhrelid
) != RELKIND_PARTITIONED_TABLE
);
262 boundspecs
[i
] = boundspec
;
267 * Create PartitionBoundInfo and mapping, working in the caller's context.
268 * This could fail, but we haven't done any damage if so.
271 boundinfo
= partition_bounds_create(boundspecs
, nparts
, key
, &mapping
);
274 * Now build the actual relcache partition descriptor, copying all the
275 * data into a new, small context. As per above comment, we don't make
276 * this a long-lived context until it's finished.
278 new_pdcxt
= AllocSetContextCreate(CurTransactionContext
,
279 "partition descriptor",
280 ALLOCSET_SMALL_SIZES
);
281 MemoryContextCopyAndSetIdentifier(new_pdcxt
,
282 RelationGetRelationName(rel
));
284 partdesc
= (PartitionDescData
*)
285 MemoryContextAllocZero(new_pdcxt
, sizeof(PartitionDescData
));
286 partdesc
->nparts
= nparts
;
287 partdesc
->detached_exist
= detached_exist
;
288 /* If there are no partitions, the rest of the partdesc can stay zero */
291 oldcxt
= MemoryContextSwitchTo(new_pdcxt
);
292 partdesc
->boundinfo
= partition_bounds_copy(boundinfo
, key
);
294 /* Initialize caching fields for speeding up ExecFindPartition */
295 partdesc
->last_found_datum_index
= -1;
296 partdesc
->last_found_part_index
= -1;
297 partdesc
->last_found_count
= 0;
299 partdesc
->oids
= (Oid
*) palloc(nparts
* sizeof(Oid
));
300 partdesc
->is_leaf
= (bool *) palloc(nparts
* sizeof(bool));
303 * Assign OIDs from the original array into mapped indexes of the
304 * result array. The order of OIDs in the former is defined by the
305 * catalog scan that retrieved them, whereas that in the latter is
306 * defined by canonicalized representation of the partition bounds.
307 * Also save leaf-ness of each partition.
309 for (i
= 0; i
< nparts
; i
++)
311 int index
= mapping
[i
];
313 partdesc
->oids
[index
] = oids
[i
];
314 partdesc
->is_leaf
[index
] = is_leaf
[i
];
316 MemoryContextSwitchTo(oldcxt
);
320 * Are we working with the partdesc that omits the detached partition, or
321 * the one that includes it?
323 * Note that if a partition was found by the catalog's scan to have been
324 * detached, but the pg_inherit tuple saying so was not visible to the
325 * active snapshot (find_inheritance_children_extended will not have set
326 * detached_xmin in that case), we consider there to be no "omittable"
327 * detached partitions.
329 is_omit
= omit_detached
&& detached_exist
&& ActiveSnapshotSet() &&
330 TransactionIdIsValid(detached_xmin
);
333 * We have a fully valid partdesc. Reparent it so that it has the right
336 MemoryContextSetParent(new_pdcxt
, CacheMemoryContext
);
339 * Store it into relcache.
341 * But first, a kluge: if there's an old context for this type of
342 * descriptor, it contains an old partition descriptor that may still be
343 * referenced somewhere. Preserve it, while not leaking it, by
344 * reattaching it as a child context of the new one. Eventually it will
345 * get dropped by either RelationClose or RelationClearRelation. (We keep
346 * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
347 * detached-partitions in rd_pddcxt.)
351 if (rel
->rd_pddcxt
!= NULL
)
352 MemoryContextSetParent(rel
->rd_pddcxt
, new_pdcxt
);
353 rel
->rd_pddcxt
= new_pdcxt
;
354 rel
->rd_partdesc_nodetached
= partdesc
;
357 * For partdescs built excluding detached partitions, which we save
358 * separately, we also record the pg_inherits.xmin of the detached
359 * partition that was omitted; this informs a future potential user of
360 * such a cached partdesc to only use it after cross-checking that the
361 * xmin is indeed visible to the snapshot it is going to be working
364 Assert(TransactionIdIsValid(detached_xmin
));
365 rel
->rd_partdesc_nodetached_xmin
= detached_xmin
;
369 if (rel
->rd_pdcxt
!= NULL
)
370 MemoryContextSetParent(rel
->rd_pdcxt
, new_pdcxt
);
371 rel
->rd_pdcxt
= new_pdcxt
;
372 rel
->rd_partdesc
= partdesc
;
379 * CreatePartitionDirectory
380 * Create a new partition directory object.
383 CreatePartitionDirectory(MemoryContext mcxt
, bool omit_detached
)
385 MemoryContext oldcontext
= MemoryContextSwitchTo(mcxt
);
386 PartitionDirectory pdir
;
389 pdir
= palloc(sizeof(PartitionDirectoryData
));
390 pdir
->pdir_mcxt
= mcxt
;
392 ctl
.keysize
= sizeof(Oid
);
393 ctl
.entrysize
= sizeof(PartitionDirectoryEntry
);
396 pdir
->pdir_hash
= hash_create("partition directory", 256, &ctl
,
397 HASH_ELEM
| HASH_BLOBS
| HASH_CONTEXT
);
398 pdir
->omit_detached
= omit_detached
;
400 MemoryContextSwitchTo(oldcontext
);
405 * PartitionDirectoryLookup
406 * Look up the partition descriptor for a relation in the directory.
408 * The purpose of this function is to ensure that we get the same
409 * PartitionDesc for each relation every time we look it up. In the
410 * face of concurrent DDL, different PartitionDescs may be constructed with
411 * different views of the catalog state, but any single particular OID
412 * will always get the same PartitionDesc for as long as the same
413 * PartitionDirectory is used.
416 PartitionDirectoryLookup(PartitionDirectory pdir
, Relation rel
)
418 PartitionDirectoryEntry
*pde
;
419 Oid relid
= RelationGetRelid(rel
);
422 pde
= hash_search(pdir
->pdir_hash
, &relid
, HASH_ENTER
, &found
);
426 * We must keep a reference count on the relation so that the
427 * PartitionDesc to which we are pointing can't get destroyed.
429 RelationIncrementReferenceCount(rel
);
431 pde
->pd
= RelationGetPartitionDesc(rel
, pdir
->omit_detached
);
432 Assert(pde
->pd
!= NULL
);
438 * DestroyPartitionDirectory
439 * Destroy a partition directory.
441 * Release the reference counts we're holding.
444 DestroyPartitionDirectory(PartitionDirectory pdir
)
446 HASH_SEQ_STATUS status
;
447 PartitionDirectoryEntry
*pde
;
449 hash_seq_init(&status
, pdir
->pdir_hash
);
450 while ((pde
= hash_seq_search(&status
)) != NULL
)
451 RelationDecrementReferenceCount(pde
->rel
);
455 * get_default_oid_from_partdesc
457 * Given a partition descriptor, return the OID of the default partition, if
458 * one exists; else, return InvalidOid.
461 get_default_oid_from_partdesc(PartitionDesc partdesc
)
463 if (partdesc
&& partdesc
->boundinfo
&&
464 partition_bound_has_default(partdesc
->boundinfo
))
465 return partdesc
->oids
[partdesc
->boundinfo
->default_index
];