Repair ALTER EXTENSION ... SET SCHEMA.
[pgsql.git] / src / backend / commands / cluster.c
blob78f96789b0e84dc0a246c9e0334455e0201c997b
1 /*-------------------------------------------------------------------------
3 * cluster.c
4 * CLUSTER a table on an index. This is now also used for VACUUM FULL.
6 * There is hardly anything left of Paul Brown's original implementation...
9 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994-5, Regents of the University of California
13 * IDENTIFICATION
14 * src/backend/commands/cluster.c
16 *-------------------------------------------------------------------------
18 #include "postgres.h"
20 #include "access/amapi.h"
21 #include "access/heapam.h"
22 #include "access/multixact.h"
23 #include "access/relscan.h"
24 #include "access/tableam.h"
25 #include "access/toast_internals.h"
26 #include "access/transam.h"
27 #include "access/xact.h"
28 #include "catalog/catalog.h"
29 #include "catalog/dependency.h"
30 #include "catalog/heap.h"
31 #include "catalog/index.h"
32 #include "catalog/namespace.h"
33 #include "catalog/objectaccess.h"
34 #include "catalog/pg_am.h"
35 #include "catalog/pg_database.h"
36 #include "catalog/pg_inherits.h"
37 #include "catalog/toasting.h"
38 #include "commands/cluster.h"
39 #include "commands/defrem.h"
40 #include "commands/progress.h"
41 #include "commands/tablecmds.h"
42 #include "commands/vacuum.h"
43 #include "miscadmin.h"
44 #include "optimizer/optimizer.h"
45 #include "pgstat.h"
46 #include "storage/bufmgr.h"
47 #include "storage/lmgr.h"
48 #include "storage/predicate.h"
49 #include "utils/acl.h"
50 #include "utils/fmgroids.h"
51 #include "utils/guc.h"
52 #include "utils/inval.h"
53 #include "utils/lsyscache.h"
54 #include "utils/memutils.h"
55 #include "utils/pg_rusage.h"
56 #include "utils/relmapper.h"
57 #include "utils/snapmgr.h"
58 #include "utils/syscache.h"
61 * This struct is used to pass around the information on tables to be
62 * clustered. We need this so we can make a list of them when invoked without
63 * a specific table/index pair.
65 typedef struct
67 Oid tableOid;
68 Oid indexOid;
69 } RelToCluster;
72 static void cluster_multiple_rels(List *rtcs, ClusterParams *params);
73 static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
74 static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
75 bool verbose, bool *pSwapToastByContent,
76 TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
77 static List *get_tables_to_cluster(MemoryContext cluster_context);
78 static List *get_tables_to_cluster_partitioned(MemoryContext cluster_context,
79 Oid indexOid);
80 static bool cluster_is_permitted_for_relation(Oid relid, Oid userid);
83 /*---------------------------------------------------------------------------
84 * This cluster code allows for clustering multiple tables at once. Because
85 * of this, we cannot just run everything on a single transaction, or we
86 * would be forced to acquire exclusive locks on all the tables being
87 * clustered, simultaneously --- very likely leading to deadlock.
89 * To solve this we follow a similar strategy to VACUUM code,
90 * clustering each relation in a separate transaction. For this to work,
91 * we need to:
92 * - provide a separate memory context so that we can pass information in
93 * a way that survives across transactions
94 * - start a new transaction every time a new relation is clustered
95 * - check for validity of the information on to-be-clustered relations,
96 * as someone might have deleted a relation behind our back, or
97 * clustered one on a different index
98 * - end the transaction
100 * The single-relation case does not have any such overhead.
102 * We also allow a relation to be specified without index. In that case,
103 * the indisclustered bit will be looked up, and an ERROR will be thrown
104 * if there is no index with the bit set.
105 *---------------------------------------------------------------------------
107 void
108 cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
110 ListCell *lc;
111 ClusterParams params = {0};
112 bool verbose = false;
113 Relation rel = NULL;
114 Oid indexOid = InvalidOid;
115 MemoryContext cluster_context;
116 List *rtcs;
118 /* Parse option list */
119 foreach(lc, stmt->params)
121 DefElem *opt = (DefElem *) lfirst(lc);
123 if (strcmp(opt->defname, "verbose") == 0)
124 verbose = defGetBoolean(opt);
125 else
126 ereport(ERROR,
127 (errcode(ERRCODE_SYNTAX_ERROR),
128 errmsg("unrecognized CLUSTER option \"%s\"",
129 opt->defname),
130 parser_errposition(pstate, opt->location)));
133 params.options = (verbose ? CLUOPT_VERBOSE : 0);
135 if (stmt->relation != NULL)
137 /* This is the single-relation case. */
138 Oid tableOid;
141 * Find, lock, and check permissions on the table. We obtain
142 * AccessExclusiveLock right away to avoid lock-upgrade hazard in the
143 * single-transaction case.
145 tableOid = RangeVarGetRelidExtended(stmt->relation,
146 AccessExclusiveLock,
148 RangeVarCallbackMaintainsTable,
149 NULL);
150 rel = table_open(tableOid, NoLock);
153 * Reject clustering a remote temp table ... their local buffer
154 * manager is not going to cope.
156 if (RELATION_IS_OTHER_TEMP(rel))
157 ereport(ERROR,
158 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
159 errmsg("cannot cluster temporary tables of other sessions")));
161 if (stmt->indexname == NULL)
163 ListCell *index;
165 /* We need to find the index that has indisclustered set. */
166 foreach(index, RelationGetIndexList(rel))
168 indexOid = lfirst_oid(index);
169 if (get_index_isclustered(indexOid))
170 break;
171 indexOid = InvalidOid;
174 if (!OidIsValid(indexOid))
175 ereport(ERROR,
176 (errcode(ERRCODE_UNDEFINED_OBJECT),
177 errmsg("there is no previously clustered index for table \"%s\"",
178 stmt->relation->relname)));
180 else
183 * The index is expected to be in the same namespace as the
184 * relation.
186 indexOid = get_relname_relid(stmt->indexname,
187 rel->rd_rel->relnamespace);
188 if (!OidIsValid(indexOid))
189 ereport(ERROR,
190 (errcode(ERRCODE_UNDEFINED_OBJECT),
191 errmsg("index \"%s\" for table \"%s\" does not exist",
192 stmt->indexname, stmt->relation->relname)));
195 if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
197 /* close relation, keep lock till commit */
198 table_close(rel, NoLock);
200 /* Do the job. */
201 cluster_rel(tableOid, indexOid, &params);
203 return;
208 * By here, we know we are in a multi-table situation. In order to avoid
209 * holding locks for too long, we want to process each table in its own
210 * transaction. This forces us to disallow running inside a user
211 * transaction block.
213 PreventInTransactionBlock(isTopLevel, "CLUSTER");
215 /* Also, we need a memory context to hold our list of relations */
216 cluster_context = AllocSetContextCreate(PortalContext,
217 "Cluster",
218 ALLOCSET_DEFAULT_SIZES);
221 * Either we're processing a partitioned table, or we were not given any
222 * table name at all. In either case, obtain a list of relations to
223 * process.
225 * In the former case, an index name must have been given, so we don't
226 * need to recheck its "indisclustered" bit, but we have to check that it
227 * is an index that we can cluster on. In the latter case, we set the
228 * option bit to have indisclustered verified.
230 * Rechecking the relation itself is necessary here in all cases.
232 params.options |= CLUOPT_RECHECK;
233 if (rel != NULL)
235 Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
236 check_index_is_clusterable(rel, indexOid, AccessShareLock);
237 rtcs = get_tables_to_cluster_partitioned(cluster_context, indexOid);
239 /* close relation, releasing lock on parent table */
240 table_close(rel, AccessExclusiveLock);
242 else
244 rtcs = get_tables_to_cluster(cluster_context);
245 params.options |= CLUOPT_RECHECK_ISCLUSTERED;
248 /* Do the job. */
249 cluster_multiple_rels(rtcs, &params);
251 /* Start a new transaction for the cleanup work. */
252 StartTransactionCommand();
254 /* Clean up working storage */
255 MemoryContextDelete(cluster_context);
259 * Given a list of relations to cluster, process each of them in a separate
260 * transaction.
262 * We expect to be in a transaction at start, but there isn't one when we
263 * return.
265 static void
266 cluster_multiple_rels(List *rtcs, ClusterParams *params)
268 ListCell *lc;
270 /* Commit to get out of starting transaction */
271 PopActiveSnapshot();
272 CommitTransactionCommand();
274 /* Cluster the tables, each in a separate transaction */
275 foreach(lc, rtcs)
277 RelToCluster *rtc = (RelToCluster *) lfirst(lc);
279 /* Start a new transaction for each relation. */
280 StartTransactionCommand();
282 /* functions in indexes may want a snapshot set */
283 PushActiveSnapshot(GetTransactionSnapshot());
285 /* Do the job. */
286 cluster_rel(rtc->tableOid, rtc->indexOid, params);
288 PopActiveSnapshot();
289 CommitTransactionCommand();
294 * cluster_rel
296 * This clusters the table by creating a new, clustered table and
297 * swapping the relfilenumbers of the new table and the old table, so
298 * the OID of the original table is preserved. Thus we do not lose
299 * GRANT, inheritance nor references to this table (this was a bug
300 * in releases through 7.3).
302 * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
303 * the new table, it's better to create the indexes afterwards than to fill
304 * them incrementally while we load the table.
306 * If indexOid is InvalidOid, the table will be rewritten in physical order
307 * instead of index order. This is the new implementation of VACUUM FULL,
308 * and error messages should refer to the operation as VACUUM not CLUSTER.
310 void
311 cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params)
313 Relation OldHeap;
314 Oid save_userid;
315 int save_sec_context;
316 int save_nestlevel;
317 bool verbose = ((params->options & CLUOPT_VERBOSE) != 0);
318 bool recheck = ((params->options & CLUOPT_RECHECK) != 0);
320 /* Check for user-requested abort. */
321 CHECK_FOR_INTERRUPTS();
323 pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
324 if (OidIsValid(indexOid))
325 pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
326 PROGRESS_CLUSTER_COMMAND_CLUSTER);
327 else
328 pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
329 PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);
332 * We grab exclusive access to the target rel and index for the duration
333 * of the transaction. (This is redundant for the single-transaction
334 * case, since cluster() already did it.) The index lock is taken inside
335 * check_index_is_clusterable.
337 OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
339 /* If the table has gone away, we can skip processing it */
340 if (!OldHeap)
342 pgstat_progress_end_command();
343 return;
347 * Switch to the table owner's userid, so that any index functions are run
348 * as that user. Also lock down security-restricted operations and
349 * arrange to make GUC variable changes local to this command.
351 GetUserIdAndSecContext(&save_userid, &save_sec_context);
352 SetUserIdAndSecContext(OldHeap->rd_rel->relowner,
353 save_sec_context | SECURITY_RESTRICTED_OPERATION);
354 save_nestlevel = NewGUCNestLevel();
355 RestrictSearchPath();
358 * Since we may open a new transaction for each relation, we have to check
359 * that the relation still is what we think it is.
361 * If this is a single-transaction CLUSTER, we can skip these tests. We
362 * *must* skip the one on indisclustered since it would reject an attempt
363 * to cluster a not-previously-clustered index.
365 if (recheck)
367 /* Check that the user still has privileges for the relation */
368 if (!cluster_is_permitted_for_relation(tableOid, save_userid))
370 relation_close(OldHeap, AccessExclusiveLock);
371 goto out;
375 * Silently skip a temp table for a remote session. Only doing this
376 * check in the "recheck" case is appropriate (which currently means
377 * somebody is executing a database-wide CLUSTER or on a partitioned
378 * table), because there is another check in cluster() which will stop
379 * any attempt to cluster remote temp tables by name. There is
380 * another check in cluster_rel which is redundant, but we leave it
381 * for extra safety.
383 if (RELATION_IS_OTHER_TEMP(OldHeap))
385 relation_close(OldHeap, AccessExclusiveLock);
386 goto out;
389 if (OidIsValid(indexOid))
392 * Check that the index still exists
394 if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
396 relation_close(OldHeap, AccessExclusiveLock);
397 goto out;
401 * Check that the index is still the one with indisclustered set,
402 * if needed.
404 if ((params->options & CLUOPT_RECHECK_ISCLUSTERED) != 0 &&
405 !get_index_isclustered(indexOid))
407 relation_close(OldHeap, AccessExclusiveLock);
408 goto out;
414 * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER
415 * would work in most respects, but the index would only get marked as
416 * indisclustered in the current database, leading to unexpected behavior
417 * if CLUSTER were later invoked in another database.
419 if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
420 ereport(ERROR,
421 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
422 errmsg("cannot cluster a shared catalog")));
425 * Don't process temp tables of other backends ... their local buffer
426 * manager is not going to cope.
428 if (RELATION_IS_OTHER_TEMP(OldHeap))
430 if (OidIsValid(indexOid))
431 ereport(ERROR,
432 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
433 errmsg("cannot cluster temporary tables of other sessions")));
434 else
435 ereport(ERROR,
436 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
437 errmsg("cannot vacuum temporary tables of other sessions")));
441 * Also check for active uses of the relation in the current transaction,
442 * including open scans and pending AFTER trigger events.
444 CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
446 /* Check heap and index are valid to cluster on */
447 if (OidIsValid(indexOid))
448 check_index_is_clusterable(OldHeap, indexOid, AccessExclusiveLock);
451 * Quietly ignore the request if this is a materialized view which has not
452 * been populated from its query. No harm is done because there is no data
453 * to deal with, and we don't want to throw an error if this is part of a
454 * multi-relation request -- for example, CLUSTER was run on the entire
455 * database.
457 if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
458 !RelationIsPopulated(OldHeap))
460 relation_close(OldHeap, AccessExclusiveLock);
461 goto out;
464 Assert(OldHeap->rd_rel->relkind == RELKIND_RELATION ||
465 OldHeap->rd_rel->relkind == RELKIND_MATVIEW ||
466 OldHeap->rd_rel->relkind == RELKIND_TOASTVALUE);
469 * All predicate locks on the tuples or pages are about to be made
470 * invalid, because we move tuples around. Promote them to relation
471 * locks. Predicate locks on indexes will be promoted when they are
472 * reindexed.
474 TransferPredicateLocksToHeapRelation(OldHeap);
476 /* rebuild_relation does all the dirty work */
477 rebuild_relation(OldHeap, indexOid, verbose);
479 /* NB: rebuild_relation does table_close() on OldHeap */
481 out:
482 /* Roll back any GUC changes executed by index functions */
483 AtEOXact_GUC(false, save_nestlevel);
485 /* Restore userid and security context */
486 SetUserIdAndSecContext(save_userid, save_sec_context);
488 pgstat_progress_end_command();
492 * Verify that the specified heap and index are valid to cluster on
494 * Side effect: obtains lock on the index. The caller may
495 * in some cases already have AccessExclusiveLock on the table, but
496 * not in all cases so we can't rely on the table-level lock for
497 * protection here.
499 void
500 check_index_is_clusterable(Relation OldHeap, Oid indexOid, LOCKMODE lockmode)
502 Relation OldIndex;
504 OldIndex = index_open(indexOid, lockmode);
507 * Check that index is in fact an index on the given relation
509 if (OldIndex->rd_index == NULL ||
510 OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
511 ereport(ERROR,
512 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
513 errmsg("\"%s\" is not an index for table \"%s\"",
514 RelationGetRelationName(OldIndex),
515 RelationGetRelationName(OldHeap))));
517 /* Index AM must allow clustering */
518 if (!OldIndex->rd_indam->amclusterable)
519 ereport(ERROR,
520 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
521 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
522 RelationGetRelationName(OldIndex))));
525 * Disallow clustering on incomplete indexes (those that might not index
526 * every row of the relation). We could relax this by making a separate
527 * seqscan pass over the table to copy the missing rows, but that seems
528 * expensive and tedious.
530 if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
531 ereport(ERROR,
532 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
533 errmsg("cannot cluster on partial index \"%s\"",
534 RelationGetRelationName(OldIndex))));
537 * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
538 * it might well not contain entries for every heap row, or might not even
539 * be internally consistent. (But note that we don't check indcheckxmin;
540 * the worst consequence of following broken HOT chains would be that we
541 * might put recently-dead tuples out-of-order in the new table, and there
542 * is little harm in that.)
544 if (!OldIndex->rd_index->indisvalid)
545 ereport(ERROR,
546 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
547 errmsg("cannot cluster on invalid index \"%s\"",
548 RelationGetRelationName(OldIndex))));
550 /* Drop relcache refcnt on OldIndex, but keep lock */
551 index_close(OldIndex, NoLock);
555 * mark_index_clustered: mark the specified index as the one clustered on
557 * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
559 void
560 mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
562 HeapTuple indexTuple;
563 Form_pg_index indexForm;
564 Relation pg_index;
565 ListCell *index;
567 /* Disallow applying to a partitioned table */
568 if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
569 ereport(ERROR,
570 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
571 errmsg("cannot mark index clustered in partitioned table")));
574 * If the index is already marked clustered, no need to do anything.
576 if (OidIsValid(indexOid))
578 if (get_index_isclustered(indexOid))
579 return;
583 * Check each index of the relation and set/clear the bit as needed.
585 pg_index = table_open(IndexRelationId, RowExclusiveLock);
587 foreach(index, RelationGetIndexList(rel))
589 Oid thisIndexOid = lfirst_oid(index);
591 indexTuple = SearchSysCacheCopy1(INDEXRELID,
592 ObjectIdGetDatum(thisIndexOid));
593 if (!HeapTupleIsValid(indexTuple))
594 elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
595 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
598 * Unset the bit if set. We know it's wrong because we checked this
599 * earlier.
601 if (indexForm->indisclustered)
603 indexForm->indisclustered = false;
604 CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
606 else if (thisIndexOid == indexOid)
608 /* this was checked earlier, but let's be real sure */
609 if (!indexForm->indisvalid)
610 elog(ERROR, "cannot cluster on invalid index %u", indexOid);
611 indexForm->indisclustered = true;
612 CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
615 InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
616 InvalidOid, is_internal);
618 heap_freetuple(indexTuple);
621 table_close(pg_index, RowExclusiveLock);
625 * rebuild_relation: rebuild an existing relation in index or physical order
627 * OldHeap: table to rebuild --- must be opened and exclusive-locked!
628 * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
630 * NB: this routine closes OldHeap at the right time; caller should not.
632 static void
633 rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
635 Oid tableOid = RelationGetRelid(OldHeap);
636 Oid accessMethod = OldHeap->rd_rel->relam;
637 Oid tableSpace = OldHeap->rd_rel->reltablespace;
638 Oid OIDNewHeap;
639 char relpersistence;
640 bool is_system_catalog;
641 bool swap_toast_by_content;
642 TransactionId frozenXid;
643 MultiXactId cutoffMulti;
645 if (OidIsValid(indexOid))
646 /* Mark the correct index as clustered */
647 mark_index_clustered(OldHeap, indexOid, true);
649 /* Remember info about rel before closing OldHeap */
650 relpersistence = OldHeap->rd_rel->relpersistence;
651 is_system_catalog = IsSystemRelation(OldHeap);
653 /* Close relcache entry, but keep lock until transaction commit */
654 table_close(OldHeap, NoLock);
656 /* Create the transient table that will receive the re-ordered data */
657 OIDNewHeap = make_new_heap(tableOid, tableSpace,
658 accessMethod,
659 relpersistence,
660 AccessExclusiveLock);
662 /* Copy the heap data into the new table in the desired order */
663 copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
664 &swap_toast_by_content, &frozenXid, &cutoffMulti);
667 * Swap the physical files of the target and transient tables, then
668 * rebuild the target's indexes and throw away the transient table.
670 finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
671 swap_toast_by_content, false, true,
672 frozenXid, cutoffMulti,
673 relpersistence);
678 * Create the transient table that will be filled with new data during
679 * CLUSTER, ALTER TABLE, and similar operations. The transient table
680 * duplicates the logical structure of the OldHeap; but will have the
681 * specified physical storage properties NewTableSpace, NewAccessMethod, and
682 * relpersistence.
684 * After this, the caller should load the new heap with transferred/modified
685 * data, then call finish_heap_swap to complete the operation.
688 make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod,
689 char relpersistence, LOCKMODE lockmode)
691 TupleDesc OldHeapDesc;
692 char NewHeapName[NAMEDATALEN];
693 Oid OIDNewHeap;
694 Oid toastid;
695 Relation OldHeap;
696 HeapTuple tuple;
697 Datum reloptions;
698 bool isNull;
699 Oid namespaceid;
701 OldHeap = table_open(OIDOldHeap, lockmode);
702 OldHeapDesc = RelationGetDescr(OldHeap);
705 * Note that the NewHeap will not receive any of the defaults or
706 * constraints associated with the OldHeap; we don't need 'em, and there's
707 * no reason to spend cycles inserting them into the catalogs only to
708 * delete them.
712 * But we do want to use reloptions of the old heap for new heap.
714 tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
715 if (!HeapTupleIsValid(tuple))
716 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
717 reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
718 &isNull);
719 if (isNull)
720 reloptions = (Datum) 0;
722 if (relpersistence == RELPERSISTENCE_TEMP)
723 namespaceid = LookupCreationNamespace("pg_temp");
724 else
725 namespaceid = RelationGetNamespace(OldHeap);
728 * Create the new heap, using a temporary name in the same namespace as
729 * the existing table. NOTE: there is some risk of collision with user
730 * relnames. Working around this seems more trouble than it's worth; in
731 * particular, we can't create the new heap in a different namespace from
732 * the old, or we will have problems with the TEMP status of temp tables.
734 * Note: the new heap is not a shared relation, even if we are rebuilding
735 * a shared rel. However, we do make the new heap mapped if the source is
736 * mapped. This simplifies swap_relation_files, and is absolutely
737 * necessary for rebuilding pg_class, for reasons explained there.
739 snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
741 OIDNewHeap = heap_create_with_catalog(NewHeapName,
742 namespaceid,
743 NewTableSpace,
744 InvalidOid,
745 InvalidOid,
746 InvalidOid,
747 OldHeap->rd_rel->relowner,
748 NewAccessMethod,
749 OldHeapDesc,
750 NIL,
751 RELKIND_RELATION,
752 relpersistence,
753 false,
754 RelationIsMapped(OldHeap),
755 ONCOMMIT_NOOP,
756 reloptions,
757 false,
758 true,
759 true,
760 OIDOldHeap,
761 NULL);
762 Assert(OIDNewHeap != InvalidOid);
764 ReleaseSysCache(tuple);
767 * Advance command counter so that the newly-created relation's catalog
768 * tuples will be visible to table_open.
770 CommandCounterIncrement();
773 * If necessary, create a TOAST table for the new relation.
775 * If the relation doesn't have a TOAST table already, we can't need one
776 * for the new relation. The other way around is possible though: if some
777 * wide columns have been dropped, NewHeapCreateToastTable can decide that
778 * no TOAST table is needed for the new table.
780 * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
781 * that the TOAST table will be visible for insertion.
783 toastid = OldHeap->rd_rel->reltoastrelid;
784 if (OidIsValid(toastid))
786 /* keep the existing toast table's reloptions, if any */
787 tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
788 if (!HeapTupleIsValid(tuple))
789 elog(ERROR, "cache lookup failed for relation %u", toastid);
790 reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
791 &isNull);
792 if (isNull)
793 reloptions = (Datum) 0;
795 NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode, toastid);
797 ReleaseSysCache(tuple);
800 table_close(OldHeap, NoLock);
802 return OIDNewHeap;
806 * Do the physical copying of table data.
808 * There are three output parameters:
809 * *pSwapToastByContent is set true if toast tables must be swapped by content.
810 * *pFreezeXid receives the TransactionId used as freeze cutoff point.
811 * *pCutoffMulti receives the MultiXactId used as a cutoff point.
813 static void
814 copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
815 bool *pSwapToastByContent, TransactionId *pFreezeXid,
816 MultiXactId *pCutoffMulti)
818 Relation NewHeap,
819 OldHeap,
820 OldIndex;
821 Relation relRelation;
822 HeapTuple reltup;
823 Form_pg_class relform;
824 TupleDesc oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
825 TupleDesc newTupDesc PG_USED_FOR_ASSERTS_ONLY;
826 VacuumParams params;
827 struct VacuumCutoffs cutoffs;
828 bool use_sort;
829 double num_tuples = 0,
830 tups_vacuumed = 0,
831 tups_recently_dead = 0;
832 BlockNumber num_pages;
833 int elevel = verbose ? INFO : DEBUG2;
834 PGRUsage ru0;
835 char *nspname;
837 pg_rusage_init(&ru0);
840 * Open the relations we need.
842 NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
843 OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
844 if (OidIsValid(OIDOldIndex))
845 OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
846 else
847 OldIndex = NULL;
849 /* Store a copy of the namespace name for logging purposes */
850 nspname = get_namespace_name(RelationGetNamespace(OldHeap));
853 * Their tuple descriptors should be exactly alike, but here we only need
854 * assume that they have the same number of columns.
856 oldTupDesc = RelationGetDescr(OldHeap);
857 newTupDesc = RelationGetDescr(NewHeap);
858 Assert(newTupDesc->natts == oldTupDesc->natts);
861 * If the OldHeap has a toast table, get lock on the toast table to keep
862 * it from being vacuumed. This is needed because autovacuum processes
863 * toast tables independently of their main tables, with no lock on the
864 * latter. If an autovacuum were to start on the toast table after we
865 * compute our OldestXmin below, it would use a later OldestXmin, and then
866 * possibly remove as DEAD toast tuples belonging to main tuples we think
867 * are only RECENTLY_DEAD. Then we'd fail while trying to copy those
868 * tuples.
870 * We don't need to open the toast relation here, just lock it. The lock
871 * will be held till end of transaction.
873 if (OldHeap->rd_rel->reltoastrelid)
874 LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
877 * If both tables have TOAST tables, perform toast swap by content. It is
878 * possible that the old table has a toast table but the new one doesn't,
879 * if toastable columns have been dropped. In that case we have to do
880 * swap by links. This is okay because swap by content is only essential
881 * for system catalogs, and we don't support schema changes for them.
883 if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
885 *pSwapToastByContent = true;
888 * When doing swap by content, any toast pointers written into NewHeap
889 * must use the old toast table's OID, because that's where the toast
890 * data will eventually be found. Set this up by setting rd_toastoid.
891 * This also tells toast_save_datum() to preserve the toast value
892 * OIDs, which we want so as not to invalidate toast pointers in
893 * system catalog caches, and to avoid making multiple copies of a
894 * single toast value.
896 * Note that we must hold NewHeap open until we are done writing data,
897 * since the relcache will not guarantee to remember this setting once
898 * the relation is closed. Also, this technique depends on the fact
899 * that no one will try to read from the NewHeap until after we've
900 * finished writing it and swapping the rels --- otherwise they could
901 * follow the toast pointers to the wrong place. (It would actually
902 * work for values copied over from the old toast table, but not for
903 * any values that we toast which were previously not toasted.)
905 NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
907 else
908 *pSwapToastByContent = false;
911 * Compute xids used to freeze and weed out dead tuples and multixacts.
912 * Since we're going to rewrite the whole table anyway, there's no reason
913 * not to be aggressive about this.
915 memset(&params, 0, sizeof(VacuumParams));
916 vacuum_get_cutoffs(OldHeap, &params, &cutoffs);
919 * FreezeXid will become the table's new relfrozenxid, and that mustn't go
920 * backwards, so take the max.
923 TransactionId relfrozenxid = OldHeap->rd_rel->relfrozenxid;
925 if (TransactionIdIsValid(relfrozenxid) &&
926 TransactionIdPrecedes(cutoffs.FreezeLimit, relfrozenxid))
927 cutoffs.FreezeLimit = relfrozenxid;
931 * MultiXactCutoff, similarly, shouldn't go backwards either.
934 MultiXactId relminmxid = OldHeap->rd_rel->relminmxid;
936 if (MultiXactIdIsValid(relminmxid) &&
937 MultiXactIdPrecedes(cutoffs.MultiXactCutoff, relminmxid))
938 cutoffs.MultiXactCutoff = relminmxid;
942 * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
943 * the OldHeap. We know how to use a sort to duplicate the ordering of a
944 * btree index, and will use seqscan-and-sort for that case if the planner
945 * tells us it's cheaper. Otherwise, always indexscan if an index is
946 * provided, else plain seqscan.
948 if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
949 use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
950 else
951 use_sort = false;
953 /* Log what we're doing */
954 if (OldIndex != NULL && !use_sort)
955 ereport(elevel,
956 (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
957 nspname,
958 RelationGetRelationName(OldHeap),
959 RelationGetRelationName(OldIndex))));
960 else if (use_sort)
961 ereport(elevel,
962 (errmsg("clustering \"%s.%s\" using sequential scan and sort",
963 nspname,
964 RelationGetRelationName(OldHeap))));
965 else
966 ereport(elevel,
967 (errmsg("vacuuming \"%s.%s\"",
968 nspname,
969 RelationGetRelationName(OldHeap))));
972 * Hand off the actual copying to AM specific function, the generic code
973 * cannot know how to deal with visibility across AMs. Note that this
974 * routine is allowed to set FreezeXid / MultiXactCutoff to different
975 * values (e.g. because the AM doesn't use freezing).
977 table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
978 cutoffs.OldestXmin, &cutoffs.FreezeLimit,
979 &cutoffs.MultiXactCutoff,
980 &num_tuples, &tups_vacuumed,
981 &tups_recently_dead);
983 /* return selected values to caller, get set as relfrozenxid/minmxid */
984 *pFreezeXid = cutoffs.FreezeLimit;
985 *pCutoffMulti = cutoffs.MultiXactCutoff;
987 /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
988 NewHeap->rd_toastoid = InvalidOid;
990 num_pages = RelationGetNumberOfBlocks(NewHeap);
992 /* Log what we did */
993 ereport(elevel,
994 (errmsg("\"%s.%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
995 nspname,
996 RelationGetRelationName(OldHeap),
997 tups_vacuumed, num_tuples,
998 RelationGetNumberOfBlocks(OldHeap)),
999 errdetail("%.0f dead row versions cannot be removed yet.\n"
1000 "%s.",
1001 tups_recently_dead,
1002 pg_rusage_show(&ru0))));
1004 if (OldIndex != NULL)
1005 index_close(OldIndex, NoLock);
1006 table_close(OldHeap, NoLock);
1007 table_close(NewHeap, NoLock);
1009 /* Update pg_class to reflect the correct values of pages and tuples. */
1010 relRelation = table_open(RelationRelationId, RowExclusiveLock);
1012 reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
1013 if (!HeapTupleIsValid(reltup))
1014 elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
1015 relform = (Form_pg_class) GETSTRUCT(reltup);
1017 relform->relpages = num_pages;
1018 relform->reltuples = num_tuples;
1020 /* Don't update the stats for pg_class. See swap_relation_files. */
1021 if (OIDOldHeap != RelationRelationId)
1022 CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1023 else
1024 CacheInvalidateRelcacheByTuple(reltup);
1026 /* Clean up. */
1027 heap_freetuple(reltup);
1028 table_close(relRelation, RowExclusiveLock);
1030 /* Make the update visible */
1031 CommandCounterIncrement();
1035 * Swap the physical files of two given relations.
1037 * We swap the physical identity (reltablespace, relfilenumber) while keeping
1038 * the same logical identities of the two relations. relpersistence is also
1039 * swapped, which is critical since it determines where buffers live for each
1040 * relation.
1042 * We can swap associated TOAST data in either of two ways: recursively swap
1043 * the physical content of the toast tables (and their indexes), or swap the
1044 * TOAST links in the given relations' pg_class entries. The former is needed
1045 * to manage rewrites of shared catalogs (where we cannot change the pg_class
1046 * links) while the latter is the only way to handle cases in which a toast
1047 * table is added or removed altogether.
1049 * Additionally, the first relation is marked with relfrozenxid set to
1050 * frozenXid. It seems a bit ugly to have this here, but the caller would
1051 * have to do it anyway, so having it here saves a heap_update. Note: in
1052 * the swap-toast-links case, we assume we don't need to change the toast
1053 * table's relfrozenxid: the new version of the toast table should already
1054 * have relfrozenxid set to RecentXmin, which is good enough.
1056 * Lastly, if r2 and its toast table and toast index (if any) are mapped,
1057 * their OIDs are emitted into mapped_tables[]. This is hacky but beats
1058 * having to look the information up again later in finish_heap_swap.
1060 static void
1061 swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
1062 bool swap_toast_by_content,
1063 bool is_internal,
1064 TransactionId frozenXid,
1065 MultiXactId cutoffMulti,
1066 Oid *mapped_tables)
1068 Relation relRelation;
1069 HeapTuple reltup1,
1070 reltup2;
1071 Form_pg_class relform1,
1072 relform2;
1073 RelFileNumber relfilenumber1,
1074 relfilenumber2;
1075 RelFileNumber swaptemp;
1076 char swptmpchr;
1077 Oid relam1,
1078 relam2;
1080 /* We need writable copies of both pg_class tuples. */
1081 relRelation = table_open(RelationRelationId, RowExclusiveLock);
1083 reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
1084 if (!HeapTupleIsValid(reltup1))
1085 elog(ERROR, "cache lookup failed for relation %u", r1);
1086 relform1 = (Form_pg_class) GETSTRUCT(reltup1);
1088 reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
1089 if (!HeapTupleIsValid(reltup2))
1090 elog(ERROR, "cache lookup failed for relation %u", r2);
1091 relform2 = (Form_pg_class) GETSTRUCT(reltup2);
1093 relfilenumber1 = relform1->relfilenode;
1094 relfilenumber2 = relform2->relfilenode;
1095 relam1 = relform1->relam;
1096 relam2 = relform2->relam;
1098 if (RelFileNumberIsValid(relfilenumber1) &&
1099 RelFileNumberIsValid(relfilenumber2))
1102 * Normal non-mapped relations: swap relfilenumbers, reltablespaces,
1103 * relpersistence
1105 Assert(!target_is_pg_class);
1107 swaptemp = relform1->relfilenode;
1108 relform1->relfilenode = relform2->relfilenode;
1109 relform2->relfilenode = swaptemp;
1111 swaptemp = relform1->reltablespace;
1112 relform1->reltablespace = relform2->reltablespace;
1113 relform2->reltablespace = swaptemp;
1115 swaptemp = relform1->relam;
1116 relform1->relam = relform2->relam;
1117 relform2->relam = swaptemp;
1119 swptmpchr = relform1->relpersistence;
1120 relform1->relpersistence = relform2->relpersistence;
1121 relform2->relpersistence = swptmpchr;
1123 /* Also swap toast links, if we're swapping by links */
1124 if (!swap_toast_by_content)
1126 swaptemp = relform1->reltoastrelid;
1127 relform1->reltoastrelid = relform2->reltoastrelid;
1128 relform2->reltoastrelid = swaptemp;
1131 else
1134 * Mapped-relation case. Here we have to swap the relation mappings
1135 * instead of modifying the pg_class columns. Both must be mapped.
1137 if (RelFileNumberIsValid(relfilenumber1) ||
1138 RelFileNumberIsValid(relfilenumber2))
1139 elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
1140 NameStr(relform1->relname));
1143 * We can't change the tablespace nor persistence of a mapped rel, and
1144 * we can't handle toast link swapping for one either, because we must
1145 * not apply any critical changes to its pg_class row. These cases
1146 * should be prevented by upstream permissions tests, so these checks
1147 * are non-user-facing emergency backstop.
1149 if (relform1->reltablespace != relform2->reltablespace)
1150 elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
1151 NameStr(relform1->relname));
1152 if (relform1->relpersistence != relform2->relpersistence)
1153 elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
1154 NameStr(relform1->relname));
1155 if (relform1->relam != relform2->relam)
1156 elog(ERROR, "cannot change access method of mapped relation \"%s\"",
1157 NameStr(relform1->relname));
1158 if (!swap_toast_by_content &&
1159 (relform1->reltoastrelid || relform2->reltoastrelid))
1160 elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
1161 NameStr(relform1->relname));
1164 * Fetch the mappings --- shouldn't fail, but be paranoid
1166 relfilenumber1 = RelationMapOidToFilenumber(r1, relform1->relisshared);
1167 if (!RelFileNumberIsValid(relfilenumber1))
1168 elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1169 NameStr(relform1->relname), r1);
1170 relfilenumber2 = RelationMapOidToFilenumber(r2, relform2->relisshared);
1171 if (!RelFileNumberIsValid(relfilenumber2))
1172 elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1173 NameStr(relform2->relname), r2);
1176 * Send replacement mappings to relmapper. Note these won't actually
1177 * take effect until CommandCounterIncrement.
1179 RelationMapUpdateMap(r1, relfilenumber2, relform1->relisshared, false);
1180 RelationMapUpdateMap(r2, relfilenumber1, relform2->relisshared, false);
1182 /* Pass OIDs of mapped r2 tables back to caller */
1183 *mapped_tables++ = r2;
1187 * Recognize that rel1's relfilenumber (swapped from rel2) is new in this
1188 * subtransaction. The rel2 storage (swapped from rel1) may or may not be
1189 * new.
1192 Relation rel1,
1193 rel2;
1195 rel1 = relation_open(r1, NoLock);
1196 rel2 = relation_open(r2, NoLock);
1197 rel2->rd_createSubid = rel1->rd_createSubid;
1198 rel2->rd_newRelfilelocatorSubid = rel1->rd_newRelfilelocatorSubid;
1199 rel2->rd_firstRelfilelocatorSubid = rel1->rd_firstRelfilelocatorSubid;
1200 RelationAssumeNewRelfilelocator(rel1);
1201 relation_close(rel1, NoLock);
1202 relation_close(rel2, NoLock);
1206 * In the case of a shared catalog, these next few steps will only affect
1207 * our own database's pg_class row; but that's okay, because they are all
1208 * noncritical updates. That's also an important fact for the case of a
1209 * mapped catalog, because it's possible that we'll commit the map change
1210 * and then fail to commit the pg_class update.
1213 /* set rel1's frozen Xid and minimum MultiXid */
1214 if (relform1->relkind != RELKIND_INDEX)
1216 Assert(!TransactionIdIsValid(frozenXid) ||
1217 TransactionIdIsNormal(frozenXid));
1218 relform1->relfrozenxid = frozenXid;
1219 relform1->relminmxid = cutoffMulti;
1222 /* swap size statistics too, since new rel has freshly-updated stats */
1224 int32 swap_pages;
1225 float4 swap_tuples;
1226 int32 swap_allvisible;
1228 swap_pages = relform1->relpages;
1229 relform1->relpages = relform2->relpages;
1230 relform2->relpages = swap_pages;
1232 swap_tuples = relform1->reltuples;
1233 relform1->reltuples = relform2->reltuples;
1234 relform2->reltuples = swap_tuples;
1236 swap_allvisible = relform1->relallvisible;
1237 relform1->relallvisible = relform2->relallvisible;
1238 relform2->relallvisible = swap_allvisible;
1242 * Update the tuples in pg_class --- unless the target relation of the
1243 * swap is pg_class itself. In that case, there is zero point in making
1244 * changes because we'd be updating the old data that we're about to throw
1245 * away. Because the real work being done here for a mapped relation is
1246 * just to change the relation map settings, it's all right to not update
1247 * the pg_class rows in this case. The most important changes will instead
1248 * performed later, in finish_heap_swap() itself.
1250 if (!target_is_pg_class)
1252 CatalogIndexState indstate;
1254 indstate = CatalogOpenIndexes(relRelation);
1255 CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
1256 indstate);
1257 CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
1258 indstate);
1259 CatalogCloseIndexes(indstate);
1261 else
1263 /* no update ... but we do still need relcache inval */
1264 CacheInvalidateRelcacheByTuple(reltup1);
1265 CacheInvalidateRelcacheByTuple(reltup2);
1269 * Now that pg_class has been updated with its relevant information for
1270 * the swap, update the dependency of the relations to point to their new
1271 * table AM, if it has changed.
1273 if (relam1 != relam2)
1275 if (changeDependencyFor(RelationRelationId,
1277 AccessMethodRelationId,
1278 relam1,
1279 relam2) != 1)
1280 elog(ERROR, "could not change access method dependency for relation \"%s.%s\"",
1281 get_namespace_name(get_rel_namespace(r1)),
1282 get_rel_name(r1));
1283 if (changeDependencyFor(RelationRelationId,
1285 AccessMethodRelationId,
1286 relam2,
1287 relam1) != 1)
1288 elog(ERROR, "could not change access method dependency for relation \"%s.%s\"",
1289 get_namespace_name(get_rel_namespace(r2)),
1290 get_rel_name(r2));
1294 * Post alter hook for modified relations. The change to r2 is always
1295 * internal, but r1 depends on the invocation context.
1297 InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
1298 InvalidOid, is_internal);
1299 InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
1300 InvalidOid, true);
1303 * If we have toast tables associated with the relations being swapped,
1304 * deal with them too.
1306 if (relform1->reltoastrelid || relform2->reltoastrelid)
1308 if (swap_toast_by_content)
1310 if (relform1->reltoastrelid && relform2->reltoastrelid)
1312 /* Recursively swap the contents of the toast tables */
1313 swap_relation_files(relform1->reltoastrelid,
1314 relform2->reltoastrelid,
1315 target_is_pg_class,
1316 swap_toast_by_content,
1317 is_internal,
1318 frozenXid,
1319 cutoffMulti,
1320 mapped_tables);
1322 else
1324 /* caller messed up */
1325 elog(ERROR, "cannot swap toast files by content when there's only one");
1328 else
1331 * We swapped the ownership links, so we need to change dependency
1332 * data to match.
1334 * NOTE: it is possible that only one table has a toast table.
1336 * NOTE: at present, a TOAST table's only dependency is the one on
1337 * its owning table. If more are ever created, we'd need to use
1338 * something more selective than deleteDependencyRecordsFor() to
1339 * get rid of just the link we want.
1341 ObjectAddress baseobject,
1342 toastobject;
1343 long count;
1346 * We disallow this case for system catalogs, to avoid the
1347 * possibility that the catalog we're rebuilding is one of the
1348 * ones the dependency changes would change. It's too late to be
1349 * making any data changes to the target catalog.
1351 if (IsSystemClass(r1, relform1))
1352 elog(ERROR, "cannot swap toast files by links for system catalogs");
1354 /* Delete old dependencies */
1355 if (relform1->reltoastrelid)
1357 count = deleteDependencyRecordsFor(RelationRelationId,
1358 relform1->reltoastrelid,
1359 false);
1360 if (count != 1)
1361 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1362 count);
1364 if (relform2->reltoastrelid)
1366 count = deleteDependencyRecordsFor(RelationRelationId,
1367 relform2->reltoastrelid,
1368 false);
1369 if (count != 1)
1370 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1371 count);
1374 /* Register new dependencies */
1375 baseobject.classId = RelationRelationId;
1376 baseobject.objectSubId = 0;
1377 toastobject.classId = RelationRelationId;
1378 toastobject.objectSubId = 0;
1380 if (relform1->reltoastrelid)
1382 baseobject.objectId = r1;
1383 toastobject.objectId = relform1->reltoastrelid;
1384 recordDependencyOn(&toastobject, &baseobject,
1385 DEPENDENCY_INTERNAL);
1388 if (relform2->reltoastrelid)
1390 baseobject.objectId = r2;
1391 toastobject.objectId = relform2->reltoastrelid;
1392 recordDependencyOn(&toastobject, &baseobject,
1393 DEPENDENCY_INTERNAL);
1399 * If we're swapping two toast tables by content, do the same for their
1400 * valid index. The swap can actually be safely done only if the relations
1401 * have indexes.
1403 if (swap_toast_by_content &&
1404 relform1->relkind == RELKIND_TOASTVALUE &&
1405 relform2->relkind == RELKIND_TOASTVALUE)
1407 Oid toastIndex1,
1408 toastIndex2;
1410 /* Get valid index for each relation */
1411 toastIndex1 = toast_get_valid_index(r1,
1412 AccessExclusiveLock);
1413 toastIndex2 = toast_get_valid_index(r2,
1414 AccessExclusiveLock);
1416 swap_relation_files(toastIndex1,
1417 toastIndex2,
1418 target_is_pg_class,
1419 swap_toast_by_content,
1420 is_internal,
1421 InvalidTransactionId,
1422 InvalidMultiXactId,
1423 mapped_tables);
1426 /* Clean up. */
1427 heap_freetuple(reltup1);
1428 heap_freetuple(reltup2);
1430 table_close(relRelation, RowExclusiveLock);
1434 * Remove the transient table that was built by make_new_heap, and finish
1435 * cleaning up (including rebuilding all indexes on the old heap).
1437 void
1438 finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1439 bool is_system_catalog,
1440 bool swap_toast_by_content,
1441 bool check_constraints,
1442 bool is_internal,
1443 TransactionId frozenXid,
1444 MultiXactId cutoffMulti,
1445 char newrelpersistence)
1447 ObjectAddress object;
1448 Oid mapped_tables[4];
1449 int reindex_flags;
1450 ReindexParams reindex_params = {0};
1451 int i;
1453 /* Report that we are now swapping relation files */
1454 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1455 PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
1457 /* Zero out possible results from swapped_relation_files */
1458 memset(mapped_tables, 0, sizeof(mapped_tables));
1461 * Swap the contents of the heap relations (including any toast tables).
1462 * Also set old heap's relfrozenxid to frozenXid.
1464 swap_relation_files(OIDOldHeap, OIDNewHeap,
1465 (OIDOldHeap == RelationRelationId),
1466 swap_toast_by_content, is_internal,
1467 frozenXid, cutoffMulti, mapped_tables);
1470 * If it's a system catalog, queue a sinval message to flush all catcaches
1471 * on the catalog when we reach CommandCounterIncrement.
1473 if (is_system_catalog)
1474 CacheInvalidateCatalog(OIDOldHeap);
1477 * Rebuild each index on the relation (but not the toast table, which is
1478 * all-new at this point). It is important to do this before the DROP
1479 * step because if we are processing a system catalog that will be used
1480 * during DROP, we want to have its indexes available. There is no
1481 * advantage to the other order anyway because this is all transactional,
1482 * so no chance to reclaim disk space before commit. We do not need a
1483 * final CommandCounterIncrement() because reindex_relation does it.
1485 * Note: because index_build is called via reindex_relation, it will never
1486 * set indcheckxmin true for the indexes. This is OK even though in some
1487 * sense we are building new indexes rather than rebuilding existing ones,
1488 * because the new heap won't contain any HOT chains at all, let alone
1489 * broken ones, so it can't be necessary to set indcheckxmin.
1491 reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1492 if (check_constraints)
1493 reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1496 * Ensure that the indexes have the same persistence as the parent
1497 * relation.
1499 if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1500 reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1501 else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1502 reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1504 /* Report that we are now reindexing relations */
1505 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1506 PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
1508 reindex_relation(NULL, OIDOldHeap, reindex_flags, &reindex_params);
1510 /* Report that we are now doing clean up */
1511 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1512 PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
1515 * If the relation being rebuilt is pg_class, swap_relation_files()
1516 * couldn't update pg_class's own pg_class entry (check comments in
1517 * swap_relation_files()), thus relfrozenxid was not updated. That's
1518 * annoying because a potential reason for doing a VACUUM FULL is a
1519 * imminent or actual anti-wraparound shutdown. So, now that we can
1520 * access the new relation using its indices, update relfrozenxid.
1521 * pg_class doesn't have a toast relation, so we don't need to update the
1522 * corresponding toast relation. Not that there's little point moving all
1523 * relfrozenxid updates here since swap_relation_files() needs to write to
1524 * pg_class for non-mapped relations anyway.
1526 if (OIDOldHeap == RelationRelationId)
1528 Relation relRelation;
1529 HeapTuple reltup;
1530 Form_pg_class relform;
1532 relRelation = table_open(RelationRelationId, RowExclusiveLock);
1534 reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1535 if (!HeapTupleIsValid(reltup))
1536 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1537 relform = (Form_pg_class) GETSTRUCT(reltup);
1539 relform->relfrozenxid = frozenXid;
1540 relform->relminmxid = cutoffMulti;
1542 CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1544 table_close(relRelation, RowExclusiveLock);
1547 /* Destroy new heap with old filenumber */
1548 object.classId = RelationRelationId;
1549 object.objectId = OIDNewHeap;
1550 object.objectSubId = 0;
1553 * The new relation is local to our transaction and we know nothing
1554 * depends on it, so DROP_RESTRICT should be OK.
1556 performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1558 /* performDeletion does CommandCounterIncrement at end */
1561 * Now we must remove any relation mapping entries that we set up for the
1562 * transient table, as well as its toast table and toast index if any. If
1563 * we fail to do this before commit, the relmapper will complain about new
1564 * permanent map entries being added post-bootstrap.
1566 for (i = 0; OidIsValid(mapped_tables[i]); i++)
1567 RelationMapRemoveMapping(mapped_tables[i]);
1570 * At this point, everything is kosher except that, if we did toast swap
1571 * by links, the toast table's name corresponds to the transient table.
1572 * The name is irrelevant to the backend because it's referenced by OID,
1573 * but users looking at the catalogs could be confused. Rename it to
1574 * prevent this problem.
1576 * Note no lock required on the relation, because we already hold an
1577 * exclusive lock on it.
1579 if (!swap_toast_by_content)
1581 Relation newrel;
1583 newrel = table_open(OIDOldHeap, NoLock);
1584 if (OidIsValid(newrel->rd_rel->reltoastrelid))
1586 Oid toastidx;
1587 char NewToastName[NAMEDATALEN];
1589 /* Get the associated valid index to be renamed */
1590 toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1591 NoLock);
1593 /* rename the toast table ... */
1594 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1595 OIDOldHeap);
1596 RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1597 NewToastName, true, false);
1599 /* ... and its valid index too. */
1600 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1601 OIDOldHeap);
1603 RenameRelationInternal(toastidx,
1604 NewToastName, true, true);
1607 * Reset the relrewrite for the toast. The command-counter
1608 * increment is required here as we are about to update the tuple
1609 * that is updated as part of RenameRelationInternal.
1611 CommandCounterIncrement();
1612 ResetRelRewrite(newrel->rd_rel->reltoastrelid);
1614 relation_close(newrel, NoLock);
1617 /* if it's not a catalog table, clear any missing attribute settings */
1618 if (!is_system_catalog)
1620 Relation newrel;
1622 newrel = table_open(OIDOldHeap, NoLock);
1623 RelationClearMissing(newrel);
1624 relation_close(newrel, NoLock);
1630 * Get a list of tables that the current user has privileges on and
1631 * have indisclustered set. Return the list in a List * of RelToCluster
1632 * (stored in the specified memory context), each one giving the tableOid
1633 * and the indexOid on which the table is already clustered.
1635 static List *
1636 get_tables_to_cluster(MemoryContext cluster_context)
1638 Relation indRelation;
1639 TableScanDesc scan;
1640 ScanKeyData entry;
1641 HeapTuple indexTuple;
1642 Form_pg_index index;
1643 MemoryContext old_context;
1644 List *rtcs = NIL;
1647 * Get all indexes that have indisclustered set and that the current user
1648 * has the appropriate privileges for.
1650 indRelation = table_open(IndexRelationId, AccessShareLock);
1651 ScanKeyInit(&entry,
1652 Anum_pg_index_indisclustered,
1653 BTEqualStrategyNumber, F_BOOLEQ,
1654 BoolGetDatum(true));
1655 scan = table_beginscan_catalog(indRelation, 1, &entry);
1656 while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1658 RelToCluster *rtc;
1660 index = (Form_pg_index) GETSTRUCT(indexTuple);
1662 if (!cluster_is_permitted_for_relation(index->indrelid, GetUserId()))
1663 continue;
1665 /* Use a permanent memory context for the result list */
1666 old_context = MemoryContextSwitchTo(cluster_context);
1668 rtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1669 rtc->tableOid = index->indrelid;
1670 rtc->indexOid = index->indexrelid;
1671 rtcs = lappend(rtcs, rtc);
1673 MemoryContextSwitchTo(old_context);
1675 table_endscan(scan);
1677 relation_close(indRelation, AccessShareLock);
1679 return rtcs;
1683 * Given an index on a partitioned table, return a list of RelToCluster for
1684 * all the children leaves tables/indexes.
1686 * Like expand_vacuum_rel, but here caller must hold AccessExclusiveLock
1687 * on the table containing the index.
1689 static List *
1690 get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid)
1692 List *inhoids;
1693 ListCell *lc;
1694 List *rtcs = NIL;
1695 MemoryContext old_context;
1697 /* Do not lock the children until they're processed */
1698 inhoids = find_all_inheritors(indexOid, NoLock, NULL);
1700 foreach(lc, inhoids)
1702 Oid indexrelid = lfirst_oid(lc);
1703 Oid relid = IndexGetRelation(indexrelid, false);
1704 RelToCluster *rtc;
1706 /* consider only leaf indexes */
1707 if (get_rel_relkind(indexrelid) != RELKIND_INDEX)
1708 continue;
1711 * It's possible that the user does not have privileges to CLUSTER the
1712 * leaf partition despite having such privileges on the partitioned
1713 * table. We skip any partitions which the user is not permitted to
1714 * CLUSTER.
1716 if (!cluster_is_permitted_for_relation(relid, GetUserId()))
1717 continue;
1719 /* Use a permanent memory context for the result list */
1720 old_context = MemoryContextSwitchTo(cluster_context);
1722 rtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1723 rtc->tableOid = relid;
1724 rtc->indexOid = indexrelid;
1725 rtcs = lappend(rtcs, rtc);
1727 MemoryContextSwitchTo(old_context);
1730 return rtcs;
1734 * Return whether userid has privileges to CLUSTER relid. If not, this
1735 * function emits a WARNING.
1737 static bool
1738 cluster_is_permitted_for_relation(Oid relid, Oid userid)
1740 if (pg_class_aclcheck(relid, userid, ACL_MAINTAIN) == ACLCHECK_OK)
1741 return true;
1743 ereport(WARNING,
1744 (errmsg("permission denied to cluster \"%s\", skipping it",
1745 get_rel_name(relid))));
1746 return false;