src/backend/commands/cluster.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * cluster.c
   4  *        CLUSTER a table on an index.  This is now also used for VACUUM FULL.
   5  *
   6  * There is hardly anything left of Paul Brown's original implementation...
   7  *
   8  *
   9  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
  10  * Portions Copyright (c) 1994-5, Regents of the University of California
  11  *
  12  *
  13  * IDENTIFICATION
  14  *        src/backend/commands/cluster.c
  15  *
  16  *-------------------------------------------------------------------------
  17  */
  18 #include "postgres.h"
  19
  20 #include "access/amapi.h"
  21 #include "access/heapam.h"
  22 #include "access/multixact.h"
  23 #include "access/relscan.h"
  24 #include "access/tableam.h"
  25 #include "access/toast_internals.h"
  26 #include "access/transam.h"
  27 #include "access/xact.h"
  28 #include "catalog/catalog.h"
  29 #include "catalog/dependency.h"
  30 #include "catalog/heap.h"
  31 #include "catalog/index.h"
  32 #include "catalog/namespace.h"
  33 #include "catalog/objectaccess.h"
  34 #include "catalog/pg_am.h"
  35 #include "catalog/pg_database.h"
  36 #include "catalog/pg_inherits.h"
  37 #include "catalog/toasting.h"
  38 #include "commands/cluster.h"
  39 #include "commands/defrem.h"
  40 #include "commands/progress.h"
  41 #include "commands/tablecmds.h"
  42 #include "commands/vacuum.h"
  43 #include "miscadmin.h"
  44 #include "optimizer/optimizer.h"
  45 #include "pgstat.h"
  46 #include "storage/bufmgr.h"
  47 #include "storage/lmgr.h"
  48 #include "storage/predicate.h"
  49 #include "utils/acl.h"
  50 #include "utils/fmgroids.h"
  51 #include "utils/guc.h"
  52 #include "utils/inval.h"
  53 #include "utils/lsyscache.h"
  54 #include "utils/memutils.h"
  55 #include "utils/pg_rusage.h"
  56 #include "utils/relmapper.h"
  57 #include "utils/snapmgr.h"
  58 #include "utils/syscache.h"
  59
  60 /*
  61  * This struct is used to pass around the information on tables to be
  62  * clustered. We need this so we can make a list of them when invoked without
  63  * a specific table/index pair.
  64  */
  65 typedef struct
  66 {
  67         Oid                     tableOid;
  68         Oid                     indexOid;
  69 } RelToCluster;
  70
  71
  72 static void cluster_multiple_rels(List *rtcs, ClusterParams *params);
  73 static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
  74 static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
  75                                                         bool verbose, bool *pSwapToastByContent,
  76                                                         TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
  77 static List *get_tables_to_cluster(MemoryContext cluster_context);
  78 static List *get_tables_to_cluster_partitioned(MemoryContext cluster_context,
  79                                                                                            Oid indexOid);
  80 static bool cluster_is_permitted_for_relation(Oid relid, Oid userid);
  81
  82
  83 /*---------------------------------------------------------------------------
  84  * This cluster code allows for clustering multiple tables at once. Because
  85  * of this, we cannot just run everything on a single transaction, or we
  86  * would be forced to acquire exclusive locks on all the tables being
  87  * clustered, simultaneously --- very likely leading to deadlock.
  88  *
  89  * To solve this we follow a similar strategy to VACUUM code,
  90  * clustering each relation in a separate transaction. For this to work,
  91  * we need to:
  92  *      - provide a separate memory context so that we can pass information in
  93  *        a way that survives across transactions
  94  *      - start a new transaction every time a new relation is clustered
  95  *      - check for validity of the information on to-be-clustered relations,
  96  *        as someone might have deleted a relation behind our back, or
  97  *        clustered one on a different index
  98  *      - end the transaction
  99  *
 100  * The single-relation case does not have any such overhead.
 101  *
 102  * We also allow a relation to be specified without index.  In that case,
 103  * the indisclustered bit will be looked up, and an ERROR will be thrown
 104  * if there is no index with the bit set.
 105  *---------------------------------------------------------------------------
 106  */
 107 void
 108 cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 109 {
 110         ListCell   *lc;
 111         ClusterParams params = {0};
 112         bool            verbose = false;
 113         Relation        rel = NULL;
 114         Oid                     indexOid = InvalidOid;
 115         MemoryContext cluster_context;
 116         List       *rtcs;
 117
 118         /* Parse option list */
 119         foreach(lc, stmt->params)
 120         {
 121                 DefElem    *opt = (DefElem *) lfirst(lc);
 122
 123                 if (strcmp(opt->defname, "verbose") == 0)
 124                         verbose = defGetBoolean(opt);
 125                 else
 126                         ereport(ERROR,
 127                                         (errcode(ERRCODE_SYNTAX_ERROR),
 128                                          errmsg("unrecognized CLUSTER option \"%s\"",
 129                                                         opt->defname),
 130                                          parser_errposition(pstate, opt->location)));
 131         }
 132
 133         params.options = (verbose ? CLUOPT_VERBOSE : 0);
 134
 135         if (stmt->relation != NULL)
 136         {
 137                 /* This is the single-relation case. */
 138                 Oid                     tableOid;
 139
 140                 /*
 141                  * Find, lock, and check permissions on the table.  We obtain
 142                  * AccessExclusiveLock right away to avoid lock-upgrade hazard in the
 143                  * single-transaction case.
 144                  */
 145                 tableOid = RangeVarGetRelidExtended(stmt->relation,
 146                                                                                         AccessExclusiveLock,
 147                                                                                         0,
 148                                                                                         RangeVarCallbackMaintainsTable,
 149                                                                                         NULL);
 150                 rel = table_open(tableOid, NoLock);
 151
 152                 /*
 153                  * Reject clustering a remote temp table ... their local buffer
 154                  * manager is not going to cope.
 155                  */
 156                 if (RELATION_IS_OTHER_TEMP(rel))
 157                         ereport(ERROR,
 158                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 159                                          errmsg("cannot cluster temporary tables of other sessions")));
 160
 161                 if (stmt->indexname == NULL)
 162                 {
 163                         ListCell   *index;
 164
 165                         /* We need to find the index that has indisclustered set. */
 166                         foreach(index, RelationGetIndexList(rel))
 167                         {
 168                                 indexOid = lfirst_oid(index);
 169                                 if (get_index_isclustered(indexOid))
 170                                         break;
 171                                 indexOid = InvalidOid;
 172                         }
 173
 174                         if (!OidIsValid(indexOid))
 175                                 ereport(ERROR,
 176                                                 (errcode(ERRCODE_UNDEFINED_OBJECT),
 177                                                  errmsg("there is no previously clustered index for table \"%s\"",
 178                                                                 stmt->relation->relname)));
 179                 }
 180                 else
 181                 {
 182                         /*
 183                          * The index is expected to be in the same namespace as the
 184                          * relation.
 185                          */
 186                         indexOid = get_relname_relid(stmt->indexname,
 187                                                                                  rel->rd_rel->relnamespace);
 188                         if (!OidIsValid(indexOid))
 189                                 ereport(ERROR,
 190                                                 (errcode(ERRCODE_UNDEFINED_OBJECT),
 191                                                  errmsg("index \"%s\" for table \"%s\" does not exist",
 192                                                                 stmt->indexname, stmt->relation->relname)));
 193                 }
 194
 195                 if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
 196                 {
 197                         /* close relation, keep lock till commit */
 198                         table_close(rel, NoLock);
 199
 200                         /* Do the job. */
 201                         cluster_rel(tableOid, indexOid, &params);
 202
 203                         return;
 204                 }
 205         }
 206
 207         /*
 208          * By here, we know we are in a multi-table situation.  In order to avoid
 209          * holding locks for too long, we want to process each table in its own
 210          * transaction.  This forces us to disallow running inside a user
 211          * transaction block.
 212          */
 213         PreventInTransactionBlock(isTopLevel, "CLUSTER");
 214
 215         /* Also, we need a memory context to hold our list of relations */
 216         cluster_context = AllocSetContextCreate(PortalContext,
 217                                                                                         "Cluster",
 218                                                                                         ALLOCSET_DEFAULT_SIZES);
 219
 220         /*
 221          * Either we're processing a partitioned table, or we were not given any
 222          * table name at all.  In either case, obtain a list of relations to
 223          * process.
 224          *
 225          * In the former case, an index name must have been given, so we don't
 226          * need to recheck its "indisclustered" bit, but we have to check that it
 227          * is an index that we can cluster on.  In the latter case, we set the
 228          * option bit to have indisclustered verified.
 229          *
 230          * Rechecking the relation itself is necessary here in all cases.
 231          */
 232         params.options |= CLUOPT_RECHECK;
 233         if (rel != NULL)
 234         {
 235                 Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
 236                 check_index_is_clusterable(rel, indexOid, AccessShareLock);
 237                 rtcs = get_tables_to_cluster_partitioned(cluster_context, indexOid);
 238
 239                 /* close relation, releasing lock on parent table */
 240                 table_close(rel, AccessExclusiveLock);
 241         }
 242         else
 243         {
 244                 rtcs = get_tables_to_cluster(cluster_context);
 245                 params.options |= CLUOPT_RECHECK_ISCLUSTERED;
 246         }
 247
 248         /* Do the job. */
 249         cluster_multiple_rels(rtcs, &params);
 250
 251         /* Start a new transaction for the cleanup work. */
 252         StartTransactionCommand();
 253
 254         /* Clean up working storage */
 255         MemoryContextDelete(cluster_context);
 256 }
 257
 258 /*
 259  * Given a list of relations to cluster, process each of them in a separate
 260  * transaction.
 261  *
 262  * We expect to be in a transaction at start, but there isn't one when we
 263  * return.
 264  */
 265 static void
 266 cluster_multiple_rels(List *rtcs, ClusterParams *params)
 267 {
 268         ListCell   *lc;
 269
 270         /* Commit to get out of starting transaction */
 271         PopActiveSnapshot();
 272         CommitTransactionCommand();
 273
 274         /* Cluster the tables, each in a separate transaction */
 275         foreach(lc, rtcs)
 276         {
 277                 RelToCluster *rtc = (RelToCluster *) lfirst(lc);
 278
 279                 /* Start a new transaction for each relation. */
 280                 StartTransactionCommand();
 281
 282                 /* functions in indexes may want a snapshot set */
 283                 PushActiveSnapshot(GetTransactionSnapshot());
 284
 285                 /* Do the job. */
 286                 cluster_rel(rtc->tableOid, rtc->indexOid, params);
 287
 288                 PopActiveSnapshot();
 289                 CommitTransactionCommand();
 290         }
 291 }
 292
 293 /*
 294  * cluster_rel
 295  *
 296  * This clusters the table by creating a new, clustered table and
 297  * swapping the relfilenumbers of the new table and the old table, so
 298  * the OID of the original table is preserved.  Thus we do not lose
 299  * GRANT, inheritance nor references to this table (this was a bug
 300  * in releases through 7.3).
 301  *
 302  * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
 303  * the new table, it's better to create the indexes afterwards than to fill
 304  * them incrementally while we load the table.
 305  *
 306  * If indexOid is InvalidOid, the table will be rewritten in physical order
 307  * instead of index order.  This is the new implementation of VACUUM FULL,
 308  * and error messages should refer to the operation as VACUUM not CLUSTER.
 309  */
 310 void
 311 cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params)
 312 {
 313         Relation        OldHeap;
 314         Oid                     save_userid;
 315         int                     save_sec_context;
 316         int                     save_nestlevel;
 317         bool            verbose = ((params->options & CLUOPT_VERBOSE) != 0);
 318         bool            recheck = ((params->options & CLUOPT_RECHECK) != 0);
 319
 320         /* Check for user-requested abort. */
 321         CHECK_FOR_INTERRUPTS();
 322
 323         pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
 324         if (OidIsValid(indexOid))
 325                 pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
 326                                                                          PROGRESS_CLUSTER_COMMAND_CLUSTER);
 327         else
 328                 pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
 329                                                                          PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);
 330
 331         /*
 332          * We grab exclusive access to the target rel and index for the duration
 333          * of the transaction.  (This is redundant for the single-transaction
 334          * case, since cluster() already did it.)  The index lock is taken inside
 335          * check_index_is_clusterable.
 336          */
 337         OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
 338
 339         /* If the table has gone away, we can skip processing it */
 340         if (!OldHeap)
 341         {
 342                 pgstat_progress_end_command();
 343                 return;
 344         }
 345
 346         /*
 347          * Switch to the table owner's userid, so that any index functions are run
 348          * as that user.  Also lock down security-restricted operations and
 349          * arrange to make GUC variable changes local to this command.
 350          */
 351         GetUserIdAndSecContext(&save_userid, &save_sec_context);
 352         SetUserIdAndSecContext(OldHeap->rd_rel->relowner,
 353                                                    save_sec_context | SECURITY_RESTRICTED_OPERATION);
 354         save_nestlevel = NewGUCNestLevel();
 355         RestrictSearchPath();
 356
 357         /*
 358          * Since we may open a new transaction for each relation, we have to check
 359          * that the relation still is what we think it is.
 360          *
 361          * If this is a single-transaction CLUSTER, we can skip these tests. We
 362          * *must* skip the one on indisclustered since it would reject an attempt
 363          * to cluster a not-previously-clustered index.
 364          */
 365         if (recheck)
 366         {
 367                 /* Check that the user still has privileges for the relation */
 368                 if (!cluster_is_permitted_for_relation(tableOid, save_userid))
 369                 {
 370                         relation_close(OldHeap, AccessExclusiveLock);
 371                         goto out;
 372                 }
 373
 374                 /*
 375                  * Silently skip a temp table for a remote session.  Only doing this
 376                  * check in the "recheck" case is appropriate (which currently means
 377                  * somebody is executing a database-wide CLUSTER or on a partitioned
 378                  * table), because there is another check in cluster() which will stop
 379                  * any attempt to cluster remote temp tables by name.  There is
 380                  * another check in cluster_rel which is redundant, but we leave it
 381                  * for extra safety.
 382                  */
 383                 if (RELATION_IS_OTHER_TEMP(OldHeap))
 384                 {
 385                         relation_close(OldHeap, AccessExclusiveLock);
 386                         goto out;
 387                 }
 388
 389                 if (OidIsValid(indexOid))
 390                 {
 391                         /*
 392                          * Check that the index still exists
 393                          */
 394                         if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
 395                         {
 396                                 relation_close(OldHeap, AccessExclusiveLock);
 397                                 goto out;
 398                         }
 399
 400                         /*
 401                          * Check that the index is still the one with indisclustered set,
 402                          * if needed.
 403                          */
 404                         if ((params->options & CLUOPT_RECHECK_ISCLUSTERED) != 0 &&
 405                                 !get_index_isclustered(indexOid))
 406                         {
 407                                 relation_close(OldHeap, AccessExclusiveLock);
 408                                 goto out;
 409                         }
 410                 }
 411         }
 412
 413         /*
 414          * We allow VACUUM FULL, but not CLUSTER, on shared catalogs.  CLUSTER
 415          * would work in most respects, but the index would only get marked as
 416          * indisclustered in the current database, leading to unexpected behavior
 417          * if CLUSTER were later invoked in another database.
 418          */
 419         if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
 420                 ereport(ERROR,
 421                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 422                                  errmsg("cannot cluster a shared catalog")));
 423
 424         /*
 425          * Don't process temp tables of other backends ... their local buffer
 426          * manager is not going to cope.
 427          */
 428         if (RELATION_IS_OTHER_TEMP(OldHeap))
 429         {
 430                 if (OidIsValid(indexOid))
 431                         ereport(ERROR,
 432                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 433                                          errmsg("cannot cluster temporary tables of other sessions")));
 434                 else
 435                         ereport(ERROR,
 436                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 437                                          errmsg("cannot vacuum temporary tables of other sessions")));
 438         }
 439
 440         /*
 441          * Also check for active uses of the relation in the current transaction,
 442          * including open scans and pending AFTER trigger events.
 443          */
 444         CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
 445
 446         /* Check heap and index are valid to cluster on */
 447         if (OidIsValid(indexOid))
 448                 check_index_is_clusterable(OldHeap, indexOid, AccessExclusiveLock);
 449
 450         /*
 451          * Quietly ignore the request if this is a materialized view which has not
 452          * been populated from its query. No harm is done because there is no data
 453          * to deal with, and we don't want to throw an error if this is part of a
 454          * multi-relation request -- for example, CLUSTER was run on the entire
 455          * database.
 456          */
 457         if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
 458                 !RelationIsPopulated(OldHeap))
 459         {
 460                 relation_close(OldHeap, AccessExclusiveLock);
 461                 goto out;
 462         }
 463
 464         Assert(OldHeap->rd_rel->relkind == RELKIND_RELATION ||
 465                    OldHeap->rd_rel->relkind == RELKIND_MATVIEW ||
 466                    OldHeap->rd_rel->relkind == RELKIND_TOASTVALUE);
 467
 468         /*
 469          * All predicate locks on the tuples or pages are about to be made
 470          * invalid, because we move tuples around.  Promote them to relation
 471          * locks.  Predicate locks on indexes will be promoted when they are
 472          * reindexed.
 473          */
 474         TransferPredicateLocksToHeapRelation(OldHeap);
 475
 476         /* rebuild_relation does all the dirty work */
 477         rebuild_relation(OldHeap, indexOid, verbose);
 478
 479         /* NB: rebuild_relation does table_close() on OldHeap */
 480
 481 out:
 482         /* Roll back any GUC changes executed by index functions */
 483         AtEOXact_GUC(false, save_nestlevel);
 484
 485         /* Restore userid and security context */
 486         SetUserIdAndSecContext(save_userid, save_sec_context);
 487
 488         pgstat_progress_end_command();
 489 }
 490
 491 /*
 492  * Verify that the specified heap and index are valid to cluster on
 493  *
 494  * Side effect: obtains lock on the index.  The caller may
 495  * in some cases already have AccessExclusiveLock on the table, but
 496  * not in all cases so we can't rely on the table-level lock for
 497  * protection here.
 498  */
 499 void
 500 check_index_is_clusterable(Relation OldHeap, Oid indexOid, LOCKMODE lockmode)
 501 {
 502         Relation        OldIndex;
 503
 504         OldIndex = index_open(indexOid, lockmode);
 505
 506         /*
 507          * Check that index is in fact an index on the given relation
 508          */
 509         if (OldIndex->rd_index == NULL ||
 510                 OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
 511                 ereport(ERROR,
 512                                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
 513                                  errmsg("\"%s\" is not an index for table \"%s\"",
 514                                                 RelationGetRelationName(OldIndex),
 515                                                 RelationGetRelationName(OldHeap))));
 516
 517         /* Index AM must allow clustering */
 518         if (!OldIndex->rd_indam->amclusterable)
 519                 ereport(ERROR,
 520                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 521                                  errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
 522                                                 RelationGetRelationName(OldIndex))));
 523
 524         /*
 525          * Disallow clustering on incomplete indexes (those that might not index
 526          * every row of the relation).  We could relax this by making a separate
 527          * seqscan pass over the table to copy the missing rows, but that seems
 528          * expensive and tedious.
 529          */
 530         if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
 531                 ereport(ERROR,
 532                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 533                                  errmsg("cannot cluster on partial index \"%s\"",
 534                                                 RelationGetRelationName(OldIndex))));
 535
 536         /*
 537          * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
 538          * it might well not contain entries for every heap row, or might not even
 539          * be internally consistent.  (But note that we don't check indcheckxmin;
 540          * the worst consequence of following broken HOT chains would be that we
 541          * might put recently-dead tuples out-of-order in the new table, and there
 542          * is little harm in that.)
 543          */
 544         if (!OldIndex->rd_index->indisvalid)
 545                 ereport(ERROR,
 546                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 547                                  errmsg("cannot cluster on invalid index \"%s\"",
 548                                                 RelationGetRelationName(OldIndex))));
 549
 550         /* Drop relcache refcnt on OldIndex, but keep lock */
 551         index_close(OldIndex, NoLock);
 552 }
 553
 554 /*
 555  * mark_index_clustered: mark the specified index as the one clustered on
 556  *
 557  * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
 558  */
 559 void
 560 mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
 561 {
 562         HeapTuple       indexTuple;
 563         Form_pg_index indexForm;
 564         Relation        pg_index;
 565         ListCell   *index;
 566
 567         /* Disallow applying to a partitioned table */
 568         if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
 569                 ereport(ERROR,
 570                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 571                                  errmsg("cannot mark index clustered in partitioned table")));
 572
 573         /*
 574          * If the index is already marked clustered, no need to do anything.
 575          */
 576         if (OidIsValid(indexOid))
 577         {
 578                 if (get_index_isclustered(indexOid))
 579                         return;
 580         }
 581
 582         /*
 583          * Check each index of the relation and set/clear the bit as needed.
 584          */
 585         pg_index = table_open(IndexRelationId, RowExclusiveLock);
 586
 587         foreach(index, RelationGetIndexList(rel))
 588         {
 589                 Oid                     thisIndexOid = lfirst_oid(index);
 590
 591                 indexTuple = SearchSysCacheCopy1(INDEXRELID,
 592                                                                                  ObjectIdGetDatum(thisIndexOid));
 593                 if (!HeapTupleIsValid(indexTuple))
 594                         elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
 595                 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
 596
 597                 /*
 598                  * Unset the bit if set.  We know it's wrong because we checked this
 599                  * earlier.
 600                  */
 601                 if (indexForm->indisclustered)
 602                 {
 603                         indexForm->indisclustered = false;
 604                         CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
 605                 }
 606                 else if (thisIndexOid == indexOid)
 607                 {
 608                         /* this was checked earlier, but let's be real sure */
 609                         if (!indexForm->indisvalid)
 610                                 elog(ERROR, "cannot cluster on invalid index %u", indexOid);
 611                         indexForm->indisclustered = true;
 612                         CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
 613                 }
 614
 615                 InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
 616                                                                          InvalidOid, is_internal);
 617
 618                 heap_freetuple(indexTuple);
 619         }
 620
 621         table_close(pg_index, RowExclusiveLock);
 622 }
 623
 624 /*
 625  * rebuild_relation: rebuild an existing relation in index or physical order
 626  *
 627  * OldHeap: table to rebuild --- must be opened and exclusive-locked!
 628  * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
 629  *
 630  * NB: this routine closes OldHeap at the right time; caller should not.
 631  */
 632 static void
 633 rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
 634 {
 635         Oid                     tableOid = RelationGetRelid(OldHeap);
 636         Oid                     accessMethod = OldHeap->rd_rel->relam;
 637         Oid                     tableSpace = OldHeap->rd_rel->reltablespace;
 638         Oid                     OIDNewHeap;
 639         char            relpersistence;
 640         bool            is_system_catalog;
 641         bool            swap_toast_by_content;
 642         TransactionId frozenXid;
 643         MultiXactId cutoffMulti;
 644
 645         if (OidIsValid(indexOid))
 646                 /* Mark the correct index as clustered */
 647                 mark_index_clustered(OldHeap, indexOid, true);
 648
 649         /* Remember info about rel before closing OldHeap */
 650         relpersistence = OldHeap->rd_rel->relpersistence;
 651         is_system_catalog = IsSystemRelation(OldHeap);
 652
 653         /* Close relcache entry, but keep lock until transaction commit */
 654         table_close(OldHeap, NoLock);
 655
 656         /* Create the transient table that will receive the re-ordered data */
 657         OIDNewHeap = make_new_heap(tableOid, tableSpace,
 658                                                            accessMethod,
 659                                                            relpersistence,
 660                                                            AccessExclusiveLock);
 661
 662         /* Copy the heap data into the new table in the desired order */
 663         copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
 664                                         &swap_toast_by_content, &frozenXid, &cutoffMulti);
 665
 666         /*
 667          * Swap the physical files of the target and transient tables, then
 668          * rebuild the target's indexes and throw away the transient table.
 669          */
 670         finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
 671                                          swap_toast_by_content, false, true,
 672                                          frozenXid, cutoffMulti,
 673                                          relpersistence);
 674 }
 675
 676
 677 /*
 678  * Create the transient table that will be filled with new data during
 679  * CLUSTER, ALTER TABLE, and similar operations.  The transient table
 680  * duplicates the logical structure of the OldHeap; but will have the
 681  * specified physical storage properties NewTableSpace, NewAccessMethod, and
 682  * relpersistence.
 683  *
 684  * After this, the caller should load the new heap with transferred/modified
 685  * data, then call finish_heap_swap to complete the operation.
 686  */
 687 Oid
 688 make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod,
 689                           char relpersistence, LOCKMODE lockmode)
 690 {
 691         TupleDesc       OldHeapDesc;
 692         char            NewHeapName[NAMEDATALEN];
 693         Oid                     OIDNewHeap;
 694         Oid                     toastid;
 695         Relation        OldHeap;
 696         HeapTuple       tuple;
 697         Datum           reloptions;
 698         bool            isNull;
 699         Oid                     namespaceid;
 700
 701         OldHeap = table_open(OIDOldHeap, lockmode);
 702         OldHeapDesc = RelationGetDescr(OldHeap);
 703
 704         /*
 705          * Note that the NewHeap will not receive any of the defaults or
 706          * constraints associated with the OldHeap; we don't need 'em, and there's
 707          * no reason to spend cycles inserting them into the catalogs only to
 708          * delete them.
 709          */
 710
 711         /*
 712          * But we do want to use reloptions of the old heap for new heap.
 713          */
 714         tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
 715         if (!HeapTupleIsValid(tuple))
 716                 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
 717         reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
 718                                                                  &isNull);
 719         if (isNull)
 720                 reloptions = (Datum) 0;
 721
 722         if (relpersistence == RELPERSISTENCE_TEMP)
 723                 namespaceid = LookupCreationNamespace("pg_temp");
 724         else
 725                 namespaceid = RelationGetNamespace(OldHeap);
 726
 727         /*
 728          * Create the new heap, using a temporary name in the same namespace as
 729          * the existing table.  NOTE: there is some risk of collision with user
 730          * relnames.  Working around this seems more trouble than it's worth; in
 731          * particular, we can't create the new heap in a different namespace from
 732          * the old, or we will have problems with the TEMP status of temp tables.
 733          *
 734          * Note: the new heap is not a shared relation, even if we are rebuilding
 735          * a shared rel.  However, we do make the new heap mapped if the source is
 736          * mapped.  This simplifies swap_relation_files, and is absolutely
 737          * necessary for rebuilding pg_class, for reasons explained there.
 738          */
 739         snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
 740
 741         OIDNewHeap = heap_create_with_catalog(NewHeapName,
 742                                                                                   namespaceid,
 743                                                                                   NewTableSpace,
 744                                                                                   InvalidOid,
 745                                                                                   InvalidOid,
 746                                                                                   InvalidOid,
 747                                                                                   OldHeap->rd_rel->relowner,
 748                                                                                   NewAccessMethod,
 749                                                                                   OldHeapDesc,
 750                                                                                   NIL,
 751                                                                                   RELKIND_RELATION,
 752                                                                                   relpersistence,
 753                                                                                   false,
 754                                                                                   RelationIsMapped(OldHeap),
 755                                                                                   ONCOMMIT_NOOP,
 756                                                                                   reloptions,
 757                                                                                   false,
 758                                                                                   true,
 759                                                                                   true,
 760                                                                                   OIDOldHeap,
 761                                                                                   NULL);
 762         Assert(OIDNewHeap != InvalidOid);
 763
 764         ReleaseSysCache(tuple);
 765
 766         /*
 767          * Advance command counter so that the newly-created relation's catalog
 768          * tuples will be visible to table_open.
 769          */
 770         CommandCounterIncrement();
 771
 772         /*
 773          * If necessary, create a TOAST table for the new relation.
 774          *
 775          * If the relation doesn't have a TOAST table already, we can't need one
 776          * for the new relation.  The other way around is possible though: if some
 777          * wide columns have been dropped, NewHeapCreateToastTable can decide that
 778          * no TOAST table is needed for the new table.
 779          *
 780          * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
 781          * that the TOAST table will be visible for insertion.
 782          */
 783         toastid = OldHeap->rd_rel->reltoastrelid;
 784         if (OidIsValid(toastid))
 785         {
 786                 /* keep the existing toast table's reloptions, if any */
 787                 tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
 788                 if (!HeapTupleIsValid(tuple))
 789                         elog(ERROR, "cache lookup failed for relation %u", toastid);
 790                 reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
 791                                                                          &isNull);
 792                 if (isNull)
 793                         reloptions = (Datum) 0;
 794
 795                 NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode, toastid);
 796
 797                 ReleaseSysCache(tuple);
 798         }
 799
 800         table_close(OldHeap, NoLock);
 801
 802         return OIDNewHeap;
 803 }
 804
 805 /*
 806  * Do the physical copying of table data.
 807  *
 808  * There are three output parameters:
 809  * *pSwapToastByContent is set true if toast tables must be swapped by content.
 810  * *pFreezeXid receives the TransactionId used as freeze cutoff point.
 811  * *pCutoffMulti receives the MultiXactId used as a cutoff point.
 812  */
 813 static void
 814 copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 815                                 bool *pSwapToastByContent, TransactionId *pFreezeXid,
 816                                 MultiXactId *pCutoffMulti)
 817 {
 818         Relation        NewHeap,
 819                                 OldHeap,
 820                                 OldIndex;
 821         Relation        relRelation;
 822         HeapTuple       reltup;
 823         Form_pg_class relform;
 824         TupleDesc       oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
 825         TupleDesc       newTupDesc PG_USED_FOR_ASSERTS_ONLY;
 826         VacuumParams params;
 827         struct VacuumCutoffs cutoffs;
 828         bool            use_sort;
 829         double          num_tuples = 0,
 830                                 tups_vacuumed = 0,
 831                                 tups_recently_dead = 0;
 832         BlockNumber num_pages;
 833         int                     elevel = verbose ? INFO : DEBUG2;
 834         PGRUsage        ru0;
 835         char       *nspname;
 836
 837         pg_rusage_init(&ru0);
 838
 839         /*
 840          * Open the relations we need.
 841          */
 842         NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
 843         OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
 844         if (OidIsValid(OIDOldIndex))
 845                 OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
 846         else
 847                 OldIndex = NULL;
 848
 849         /* Store a copy of the namespace name for logging purposes */
 850         nspname = get_namespace_name(RelationGetNamespace(OldHeap));
 851
 852         /*
 853          * Their tuple descriptors should be exactly alike, but here we only need
 854          * assume that they have the same number of columns.
 855          */
 856         oldTupDesc = RelationGetDescr(OldHeap);
 857         newTupDesc = RelationGetDescr(NewHeap);
 858         Assert(newTupDesc->natts == oldTupDesc->natts);
 859
 860         /*
 861          * If the OldHeap has a toast table, get lock on the toast table to keep
 862          * it from being vacuumed.  This is needed because autovacuum processes
 863          * toast tables independently of their main tables, with no lock on the
 864          * latter.  If an autovacuum were to start on the toast table after we
 865          * compute our OldestXmin below, it would use a later OldestXmin, and then
 866          * possibly remove as DEAD toast tuples belonging to main tuples we think
 867          * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
 868          * tuples.
 869          *
 870          * We don't need to open the toast relation here, just lock it.  The lock
 871          * will be held till end of transaction.
 872          */
 873         if (OldHeap->rd_rel->reltoastrelid)
 874                 LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
 875
 876         /*
 877          * If both tables have TOAST tables, perform toast swap by content.  It is
 878          * possible that the old table has a toast table but the new one doesn't,
 879          * if toastable columns have been dropped.  In that case we have to do
 880          * swap by links.  This is okay because swap by content is only essential
 881          * for system catalogs, and we don't support schema changes for them.
 882          */
 883         if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
 884         {
 885                 *pSwapToastByContent = true;
 886
 887                 /*
 888                  * When doing swap by content, any toast pointers written into NewHeap
 889                  * must use the old toast table's OID, because that's where the toast
 890                  * data will eventually be found.  Set this up by setting rd_toastoid.
 891                  * This also tells toast_save_datum() to preserve the toast value
 892                  * OIDs, which we want so as not to invalidate toast pointers in
 893                  * system catalog caches, and to avoid making multiple copies of a
 894                  * single toast value.
 895                  *
 896                  * Note that we must hold NewHeap open until we are done writing data,
 897                  * since the relcache will not guarantee to remember this setting once
 898                  * the relation is closed.  Also, this technique depends on the fact
 899                  * that no one will try to read from the NewHeap until after we've
 900                  * finished writing it and swapping the rels --- otherwise they could
 901                  * follow the toast pointers to the wrong place.  (It would actually
 902                  * work for values copied over from the old toast table, but not for
 903                  * any values that we toast which were previously not toasted.)
 904                  */
 905                 NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
 906         }
 907         else
 908                 *pSwapToastByContent = false;
 909
 910         /*
 911          * Compute xids used to freeze and weed out dead tuples and multixacts.
 912          * Since we're going to rewrite the whole table anyway, there's no reason
 913          * not to be aggressive about this.
 914          */
 915         memset(&params, 0, sizeof(VacuumParams));
 916         vacuum_get_cutoffs(OldHeap, &params, &cutoffs);
 917
 918         /*
 919          * FreezeXid will become the table's new relfrozenxid, and that mustn't go
 920          * backwards, so take the max.
 921          */
 922         {
 923                 TransactionId relfrozenxid = OldHeap->rd_rel->relfrozenxid;
 924
 925                 if (TransactionIdIsValid(relfrozenxid) &&
 926                         TransactionIdPrecedes(cutoffs.FreezeLimit, relfrozenxid))
 927                         cutoffs.FreezeLimit = relfrozenxid;
 928         }
 929
 930         /*
 931          * MultiXactCutoff, similarly, shouldn't go backwards either.
 932          */
 933         {
 934                 MultiXactId relminmxid = OldHeap->rd_rel->relminmxid;
 935
 936                 if (MultiXactIdIsValid(relminmxid) &&
 937                         MultiXactIdPrecedes(cutoffs.MultiXactCutoff, relminmxid))
 938                         cutoffs.MultiXactCutoff = relminmxid;
 939         }
 940
 941         /*
 942          * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
 943          * the OldHeap.  We know how to use a sort to duplicate the ordering of a
 944          * btree index, and will use seqscan-and-sort for that case if the planner
 945          * tells us it's cheaper.  Otherwise, always indexscan if an index is
 946          * provided, else plain seqscan.
 947          */
 948         if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
 949                 use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
 950         else
 951                 use_sort = false;
 952
 953         /* Log what we're doing */
 954         if (OldIndex != NULL && !use_sort)
 955                 ereport(elevel,
 956                                 (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
 957                                                 nspname,
 958                                                 RelationGetRelationName(OldHeap),
 959                                                 RelationGetRelationName(OldIndex))));
 960         else if (use_sort)
 961                 ereport(elevel,
 962                                 (errmsg("clustering \"%s.%s\" using sequential scan and sort",
 963                                                 nspname,
 964                                                 RelationGetRelationName(OldHeap))));
 965         else
 966                 ereport(elevel,
 967                                 (errmsg("vacuuming \"%s.%s\"",
 968                                                 nspname,
 969                                                 RelationGetRelationName(OldHeap))));
 970
 971         /*
 972          * Hand off the actual copying to AM specific function, the generic code
 973          * cannot know how to deal with visibility across AMs. Note that this
 974          * routine is allowed to set FreezeXid / MultiXactCutoff to different
 975          * values (e.g. because the AM doesn't use freezing).
 976          */
 977         table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
 978                                                                         cutoffs.OldestXmin, &cutoffs.FreezeLimit,
 979                                                                         &cutoffs.MultiXactCutoff,
 980                                                                         &num_tuples, &tups_vacuumed,
 981                                                                         &tups_recently_dead);
 982
 983         /* return selected values to caller, get set as relfrozenxid/minmxid */
 984         *pFreezeXid = cutoffs.FreezeLimit;
 985         *pCutoffMulti = cutoffs.MultiXactCutoff;
 986
 987         /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
 988         NewHeap->rd_toastoid = InvalidOid;
 989
 990         num_pages = RelationGetNumberOfBlocks(NewHeap);
 991
 992         /* Log what we did */
 993         ereport(elevel,
 994                         (errmsg("\"%s.%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
 995                                         nspname,
 996                                         RelationGetRelationName(OldHeap),
 997                                         tups_vacuumed, num_tuples,
 998                                         RelationGetNumberOfBlocks(OldHeap)),
 999                          errdetail("%.0f dead row versions cannot be removed yet.\n"
1000                                            "%s.",
1001                                            tups_recently_dead,
1002                                            pg_rusage_show(&ru0))));
1003
1004         if (OldIndex != NULL)
1005                 index_close(OldIndex, NoLock);
1006         table_close(OldHeap, NoLock);
1007         table_close(NewHeap, NoLock);
1008
1009         /* Update pg_class to reflect the correct values of pages and tuples. */
1010         relRelation = table_open(RelationRelationId, RowExclusiveLock);
1011
1012         reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
1013         if (!HeapTupleIsValid(reltup))
1014                 elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
1015         relform = (Form_pg_class) GETSTRUCT(reltup);
1016
1017         relform->relpages = num_pages;
1018         relform->reltuples = num_tuples;
1019
1020         /* Don't update the stats for pg_class.  See swap_relation_files. */
1021         if (OIDOldHeap != RelationRelationId)
1022                 CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1023         else
1024                 CacheInvalidateRelcacheByTuple(reltup);
1025
1026         /* Clean up. */
1027         heap_freetuple(reltup);
1028         table_close(relRelation, RowExclusiveLock);
1029
1030         /* Make the update visible */
1031         CommandCounterIncrement();
1032 }
1033
1034 /*
1035  * Swap the physical files of two given relations.
1036  *
1037  * We swap the physical identity (reltablespace, relfilenumber) while keeping
1038  * the same logical identities of the two relations.  relpersistence is also
1039  * swapped, which is critical since it determines where buffers live for each
1040  * relation.
1041  *
1042  * We can swap associated TOAST data in either of two ways: recursively swap
1043  * the physical content of the toast tables (and their indexes), or swap the
1044  * TOAST links in the given relations' pg_class entries.  The former is needed
1045  * to manage rewrites of shared catalogs (where we cannot change the pg_class
1046  * links) while the latter is the only way to handle cases in which a toast
1047  * table is added or removed altogether.
1048  *
1049  * Additionally, the first relation is marked with relfrozenxid set to
1050  * frozenXid.  It seems a bit ugly to have this here, but the caller would
1051  * have to do it anyway, so having it here saves a heap_update.  Note: in
1052  * the swap-toast-links case, we assume we don't need to change the toast
1053  * table's relfrozenxid: the new version of the toast table should already
1054  * have relfrozenxid set to RecentXmin, which is good enough.
1055  *
1056  * Lastly, if r2 and its toast table and toast index (if any) are mapped,
1057  * their OIDs are emitted into mapped_tables[].  This is hacky but beats
1058  * having to look the information up again later in finish_heap_swap.
1059  */
1060 static void
1061 swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
1062                                         bool swap_toast_by_content,
1063                                         bool is_internal,
1064                                         TransactionId frozenXid,
1065                                         MultiXactId cutoffMulti,
1066                                         Oid *mapped_tables)
1067 {
1068         Relation        relRelation;
1069         HeapTuple       reltup1,
1070                                 reltup2;
1071         Form_pg_class relform1,
1072                                 relform2;
1073         RelFileNumber relfilenumber1,
1074                                 relfilenumber2;
1075         RelFileNumber swaptemp;
1076         char            swptmpchr;
1077         Oid                     relam1,
1078                                 relam2;
1079
1080         /* We need writable copies of both pg_class tuples. */
1081         relRelation = table_open(RelationRelationId, RowExclusiveLock);
1082
1083         reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
1084         if (!HeapTupleIsValid(reltup1))
1085                 elog(ERROR, "cache lookup failed for relation %u", r1);
1086         relform1 = (Form_pg_class) GETSTRUCT(reltup1);
1087
1088         reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
1089         if (!HeapTupleIsValid(reltup2))
1090                 elog(ERROR, "cache lookup failed for relation %u", r2);
1091         relform2 = (Form_pg_class) GETSTRUCT(reltup2);
1092
1093         relfilenumber1 = relform1->relfilenode;
1094         relfilenumber2 = relform2->relfilenode;
1095         relam1 = relform1->relam;
1096         relam2 = relform2->relam;
1097
1098         if (RelFileNumberIsValid(relfilenumber1) &&
1099                 RelFileNumberIsValid(relfilenumber2))
1100         {
1101                 /*
1102                  * Normal non-mapped relations: swap relfilenumbers, reltablespaces,
1103                  * relpersistence
1104                  */
1105                 Assert(!target_is_pg_class);
1106
1107                 swaptemp = relform1->relfilenode;
1108                 relform1->relfilenode = relform2->relfilenode;
1109                 relform2->relfilenode = swaptemp;
1110
1111                 swaptemp = relform1->reltablespace;
1112                 relform1->reltablespace = relform2->reltablespace;
1113                 relform2->reltablespace = swaptemp;
1114
1115                 swaptemp = relform1->relam;
1116                 relform1->relam = relform2->relam;
1117                 relform2->relam = swaptemp;
1118
1119                 swptmpchr = relform1->relpersistence;
1120                 relform1->relpersistence = relform2->relpersistence;
1121                 relform2->relpersistence = swptmpchr;
1122
1123                 /* Also swap toast links, if we're swapping by links */
1124                 if (!swap_toast_by_content)
1125                 {
1126                         swaptemp = relform1->reltoastrelid;
1127                         relform1->reltoastrelid = relform2->reltoastrelid;
1128                         relform2->reltoastrelid = swaptemp;
1129                 }
1130         }
1131         else
1132         {
1133                 /*
1134                  * Mapped-relation case.  Here we have to swap the relation mappings
1135                  * instead of modifying the pg_class columns.  Both must be mapped.
1136                  */
1137                 if (RelFileNumberIsValid(relfilenumber1) ||
1138                         RelFileNumberIsValid(relfilenumber2))
1139                         elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
1140                                  NameStr(relform1->relname));
1141
1142                 /*
1143                  * We can't change the tablespace nor persistence of a mapped rel, and
1144                  * we can't handle toast link swapping for one either, because we must
1145                  * not apply any critical changes to its pg_class row.  These cases
1146                  * should be prevented by upstream permissions tests, so these checks
1147                  * are non-user-facing emergency backstop.
1148                  */
1149                 if (relform1->reltablespace != relform2->reltablespace)
1150                         elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
1151                                  NameStr(relform1->relname));
1152                 if (relform1->relpersistence != relform2->relpersistence)
1153                         elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
1154                                  NameStr(relform1->relname));
1155                 if (relform1->relam != relform2->relam)
1156                         elog(ERROR, "cannot change access method of mapped relation \"%s\"",
1157                                  NameStr(relform1->relname));
1158                 if (!swap_toast_by_content &&
1159                         (relform1->reltoastrelid || relform2->reltoastrelid))
1160                         elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
1161                                  NameStr(relform1->relname));
1162
1163                 /*
1164                  * Fetch the mappings --- shouldn't fail, but be paranoid
1165                  */
1166                 relfilenumber1 = RelationMapOidToFilenumber(r1, relform1->relisshared);
1167                 if (!RelFileNumberIsValid(relfilenumber1))
1168                         elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1169                                  NameStr(relform1->relname), r1);
1170                 relfilenumber2 = RelationMapOidToFilenumber(r2, relform2->relisshared);
1171                 if (!RelFileNumberIsValid(relfilenumber2))
1172                         elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1173                                  NameStr(relform2->relname), r2);
1174
1175                 /*
1176                  * Send replacement mappings to relmapper.  Note these won't actually
1177                  * take effect until CommandCounterIncrement.
1178                  */
1179                 RelationMapUpdateMap(r1, relfilenumber2, relform1->relisshared, false);
1180                 RelationMapUpdateMap(r2, relfilenumber1, relform2->relisshared, false);
1181
1182                 /* Pass OIDs of mapped r2 tables back to caller */
1183                 *mapped_tables++ = r2;
1184         }
1185
1186         /*
1187          * Recognize that rel1's relfilenumber (swapped from rel2) is new in this
1188          * subtransaction. The rel2 storage (swapped from rel1) may or may not be
1189          * new.
1190          */
1191         {
1192                 Relation        rel1,
1193                                         rel2;
1194
1195                 rel1 = relation_open(r1, NoLock);
1196                 rel2 = relation_open(r2, NoLock);
1197                 rel2->rd_createSubid = rel1->rd_createSubid;
1198                 rel2->rd_newRelfilelocatorSubid = rel1->rd_newRelfilelocatorSubid;
1199                 rel2->rd_firstRelfilelocatorSubid = rel1->rd_firstRelfilelocatorSubid;
1200                 RelationAssumeNewRelfilelocator(rel1);
1201                 relation_close(rel1, NoLock);
1202                 relation_close(rel2, NoLock);
1203         }
1204
1205         /*
1206          * In the case of a shared catalog, these next few steps will only affect
1207          * our own database's pg_class row; but that's okay, because they are all
1208          * noncritical updates.  That's also an important fact for the case of a
1209          * mapped catalog, because it's possible that we'll commit the map change
1210          * and then fail to commit the pg_class update.
1211          */
1212
1213         /* set rel1's frozen Xid and minimum MultiXid */
1214         if (relform1->relkind != RELKIND_INDEX)
1215         {
1216                 Assert(!TransactionIdIsValid(frozenXid) ||
1217                            TransactionIdIsNormal(frozenXid));
1218                 relform1->relfrozenxid = frozenXid;
1219                 relform1->relminmxid = cutoffMulti;
1220         }
1221
1222         /* swap size statistics too, since new rel has freshly-updated stats */
1223         {
1224                 int32           swap_pages;
1225                 float4          swap_tuples;
1226                 int32           swap_allvisible;
1227
1228                 swap_pages = relform1->relpages;
1229                 relform1->relpages = relform2->relpages;
1230                 relform2->relpages = swap_pages;
1231
1232                 swap_tuples = relform1->reltuples;
1233                 relform1->reltuples = relform2->reltuples;
1234                 relform2->reltuples = swap_tuples;
1235
1236                 swap_allvisible = relform1->relallvisible;
1237                 relform1->relallvisible = relform2->relallvisible;
1238                 relform2->relallvisible = swap_allvisible;
1239         }
1240
1241         /*
1242          * Update the tuples in pg_class --- unless the target relation of the
1243          * swap is pg_class itself.  In that case, there is zero point in making
1244          * changes because we'd be updating the old data that we're about to throw
1245          * away.  Because the real work being done here for a mapped relation is
1246          * just to change the relation map settings, it's all right to not update
1247          * the pg_class rows in this case. The most important changes will instead
 1248          * be performed later, in finish_heap_swap() itself.
1249          */
1250         if (!target_is_pg_class)
1251         {
1252                 CatalogIndexState indstate;
1253
1254                 indstate = CatalogOpenIndexes(relRelation);
1255                 CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
1256                                                                    indstate);
1257                 CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
1258                                                                    indstate);
1259                 CatalogCloseIndexes(indstate);
1260         }
1261         else
1262         {
1263                 /* no update ... but we do still need relcache inval */
1264                 CacheInvalidateRelcacheByTuple(reltup1);
1265                 CacheInvalidateRelcacheByTuple(reltup2);
1266         }
1267
1268         /*
1269          * Now that pg_class has been updated with its relevant information for
1270          * the swap, update the dependency of the relations to point to their new
1271          * table AM, if it has changed.
1272          */
1273         if (relam1 != relam2)
1274         {
1275                 if (changeDependencyFor(RelationRelationId,
1276                                                                 r1,
1277                                                                 AccessMethodRelationId,
1278                                                                 relam1,
1279                                                                 relam2) != 1)
1280                         elog(ERROR, "could not change access method dependency for relation \"%s.%s\"",
1281                                  get_namespace_name(get_rel_namespace(r1)),
1282                                  get_rel_name(r1));
1283                 if (changeDependencyFor(RelationRelationId,
1284                                                                 r2,
1285                                                                 AccessMethodRelationId,
1286                                                                 relam2,
1287                                                                 relam1) != 1)
1288                         elog(ERROR, "could not change access method dependency for relation \"%s.%s\"",
1289                                  get_namespace_name(get_rel_namespace(r2)),
1290                                  get_rel_name(r2));
1291         }
1292
1293         /*
1294          * Post alter hook for modified relations. The change to r2 is always
1295          * internal, but r1 depends on the invocation context.
1296          */
1297         InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
1298                                                                  InvalidOid, is_internal);
1299         InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
1300                                                                  InvalidOid, true);
1301
1302         /*
1303          * If we have toast tables associated with the relations being swapped,
1304          * deal with them too.
1305          */
1306         if (relform1->reltoastrelid || relform2->reltoastrelid)
1307         {
1308                 if (swap_toast_by_content)
1309                 {
1310                         if (relform1->reltoastrelid && relform2->reltoastrelid)
1311                         {
1312                                 /* Recursively swap the contents of the toast tables */
1313                                 swap_relation_files(relform1->reltoastrelid,
1314                                                                         relform2->reltoastrelid,
1315                                                                         target_is_pg_class,
1316                                                                         swap_toast_by_content,
1317                                                                         is_internal,
1318                                                                         frozenXid,
1319                                                                         cutoffMulti,
1320                                                                         mapped_tables);
1321                         }
1322                         else
1323                         {
1324                                 /* caller messed up */
1325                                 elog(ERROR, "cannot swap toast files by content when there's only one");
1326                         }
1327                 }
1328                 else
1329                 {
1330                         /*
1331                          * We swapped the ownership links, so we need to change dependency
1332                          * data to match.
1333                          *
1334                          * NOTE: it is possible that only one table has a toast table.
1335                          *
1336                          * NOTE: at present, a TOAST table's only dependency is the one on
1337                          * its owning table.  If more are ever created, we'd need to use
1338                          * something more selective than deleteDependencyRecordsFor() to
1339                          * get rid of just the link we want.
1340                          */
1341                         ObjectAddress baseobject,
1342                                                 toastobject;
1343                         long            count;
1344
1345                         /*
1346                          * We disallow this case for system catalogs, to avoid the
1347                          * possibility that the catalog we're rebuilding is one of the
1348                          * ones the dependency changes would change.  It's too late to be
1349                          * making any data changes to the target catalog.
1350                          */
1351                         if (IsSystemClass(r1, relform1))
1352                                 elog(ERROR, "cannot swap toast files by links for system catalogs");
1353
1354                         /* Delete old dependencies */
1355                         if (relform1->reltoastrelid)
1356                         {
1357                                 count = deleteDependencyRecordsFor(RelationRelationId,
1358                                                                                                    relform1->reltoastrelid,
1359                                                                                                    false);
1360                                 if (count != 1)
1361                                         elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1362                                                  count);
1363                         }
1364                         if (relform2->reltoastrelid)
1365                         {
1366                                 count = deleteDependencyRecordsFor(RelationRelationId,
1367                                                                                                    relform2->reltoastrelid,
1368                                                                                                    false);
1369                                 if (count != 1)
1370                                         elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1371                                                  count);
1372                         }
1373
1374                         /* Register new dependencies */
1375                         baseobject.classId = RelationRelationId;
1376                         baseobject.objectSubId = 0;
1377                         toastobject.classId = RelationRelationId;
1378                         toastobject.objectSubId = 0;
1379
1380                         if (relform1->reltoastrelid)
1381                         {
1382                                 baseobject.objectId = r1;
1383                                 toastobject.objectId = relform1->reltoastrelid;
1384                                 recordDependencyOn(&toastobject, &baseobject,
1385                                                                    DEPENDENCY_INTERNAL);
1386                         }
1387
1388                         if (relform2->reltoastrelid)
1389                         {
1390                                 baseobject.objectId = r2;
1391                                 toastobject.objectId = relform2->reltoastrelid;
1392                                 recordDependencyOn(&toastobject, &baseobject,
1393                                                                    DEPENDENCY_INTERNAL);
1394                         }
1395                 }
1396         }
1397
1398         /*
1399          * If we're swapping two toast tables by content, do the same for their
1400          * valid index. The swap can actually be safely done only if the relations
1401          * have indexes.
1402          */
1403         if (swap_toast_by_content &&
1404                 relform1->relkind == RELKIND_TOASTVALUE &&
1405                 relform2->relkind == RELKIND_TOASTVALUE)
1406         {
1407                 Oid                     toastIndex1,
1408                                         toastIndex2;
1409
1410                 /* Get valid index for each relation */
1411                 toastIndex1 = toast_get_valid_index(r1,
1412                                                                                         AccessExclusiveLock);
1413                 toastIndex2 = toast_get_valid_index(r2,
1414                                                                                         AccessExclusiveLock);
1415
1416                 swap_relation_files(toastIndex1,
1417                                                         toastIndex2,
1418                                                         target_is_pg_class,
1419                                                         swap_toast_by_content,
1420                                                         is_internal,
1421                                                         InvalidTransactionId,
1422                                                         InvalidMultiXactId,
1423                                                         mapped_tables);
1424         }
1425
1426         /* Clean up. */
1427         heap_freetuple(reltup1);
1428         heap_freetuple(reltup2);
1429
1430         table_close(relRelation, RowExclusiveLock);
1431 }
1432
1433 /*
1434  * Remove the transient table that was built by make_new_heap, and finish
1435  * cleaning up (including rebuilding all indexes on the old heap).
1436  */
1437 void
1438 finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1439                                  bool is_system_catalog,
1440                                  bool swap_toast_by_content,
1441                                  bool check_constraints,
1442                                  bool is_internal,
1443                                  TransactionId frozenXid,
1444                                  MultiXactId cutoffMulti,
1445                                  char newrelpersistence)
1446 {
1447         ObjectAddress object;
1448         Oid                     mapped_tables[4];
1449         int                     reindex_flags;
1450         ReindexParams reindex_params = {0};
1451         int                     i;
1452
1453         /* Report that we are now swapping relation files */
1454         pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1455                                                                  PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
1456
1457         /* Zero out possible results from swapped_relation_files */
1458         memset(mapped_tables, 0, sizeof(mapped_tables));
1459
1460         /*
1461          * Swap the contents of the heap relations (including any toast tables).
1462          * Also set old heap's relfrozenxid to frozenXid.
1463          */
1464         swap_relation_files(OIDOldHeap, OIDNewHeap,
1465                                                 (OIDOldHeap == RelationRelationId),
1466                                                 swap_toast_by_content, is_internal,
1467                                                 frozenXid, cutoffMulti, mapped_tables);
1468
1469         /*
1470          * If it's a system catalog, queue a sinval message to flush all catcaches
1471          * on the catalog when we reach CommandCounterIncrement.
1472          */
1473         if (is_system_catalog)
1474                 CacheInvalidateCatalog(OIDOldHeap);
1475
1476         /*
1477          * Rebuild each index on the relation (but not the toast table, which is
1478          * all-new at this point).  It is important to do this before the DROP
1479          * step because if we are processing a system catalog that will be used
1480          * during DROP, we want to have its indexes available.  There is no
1481          * advantage to the other order anyway because this is all transactional,
1482          * so no chance to reclaim disk space before commit.  We do not need a
1483          * final CommandCounterIncrement() because reindex_relation does it.
1484          *
1485          * Note: because index_build is called via reindex_relation, it will never
1486          * set indcheckxmin true for the indexes.  This is OK even though in some
1487          * sense we are building new indexes rather than rebuilding existing ones,
1488          * because the new heap won't contain any HOT chains at all, let alone
1489          * broken ones, so it can't be necessary to set indcheckxmin.
1490          */
1491         reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1492         if (check_constraints)
1493                 reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1494
1495         /*
1496          * Ensure that the indexes have the same persistence as the parent
1497          * relation.
1498          */
1499         if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1500                 reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1501         else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1502                 reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1503
1504         /* Report that we are now reindexing relations */
1505         pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1506                                                                  PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
1507
1508         reindex_relation(NULL, OIDOldHeap, reindex_flags, &reindex_params);
1509
1510         /* Report that we are now doing clean up */
1511         pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1512                                                                  PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
1513
1514         /*
1515          * If the relation being rebuilt is pg_class, swap_relation_files()
1516          * couldn't update pg_class's own pg_class entry (check comments in
1517          * swap_relation_files()), thus relfrozenxid was not updated. That's
1518          * annoying because a potential reason for doing a VACUUM FULL is a
1519          * imminent or actual anti-wraparound shutdown.  So, now that we can
1520          * access the new relation using its indices, update relfrozenxid.
1521          * pg_class doesn't have a toast relation, so we don't need to update the
1522          * corresponding toast relation. Not that there's little point moving all
1523          * relfrozenxid updates here since swap_relation_files() needs to write to
1524          * pg_class for non-mapped relations anyway.
1525          */
1526         if (OIDOldHeap == RelationRelationId)
1527         {
1528                 Relation        relRelation;
1529                 HeapTuple       reltup;
1530                 Form_pg_class relform;
1531
1532                 relRelation = table_open(RelationRelationId, RowExclusiveLock);
1533
1534                 reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1535                 if (!HeapTupleIsValid(reltup))
1536                         elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1537                 relform = (Form_pg_class) GETSTRUCT(reltup);
1538
1539                 relform->relfrozenxid = frozenXid;
1540                 relform->relminmxid = cutoffMulti;
1541
1542                 CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1543
1544                 table_close(relRelation, RowExclusiveLock);
1545         }
1546
1547         /* Destroy new heap with old filenumber */
1548         object.classId = RelationRelationId;
1549         object.objectId = OIDNewHeap;
1550         object.objectSubId = 0;
1551
1552         /*
1553          * The new relation is local to our transaction and we know nothing
1554          * depends on it, so DROP_RESTRICT should be OK.
1555          */
1556         performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1557
1558         /* performDeletion does CommandCounterIncrement at end */
1559
1560         /*
1561          * Now we must remove any relation mapping entries that we set up for the
1562          * transient table, as well as its toast table and toast index if any. If
1563          * we fail to do this before commit, the relmapper will complain about new
1564          * permanent map entries being added post-bootstrap.
1565          */
1566         for (i = 0; OidIsValid(mapped_tables[i]); i++)
1567                 RelationMapRemoveMapping(mapped_tables[i]);
1568
1569         /*
1570          * At this point, everything is kosher except that, if we did toast swap
1571          * by links, the toast table's name corresponds to the transient table.
1572          * The name is irrelevant to the backend because it's referenced by OID,
1573          * but users looking at the catalogs could be confused.  Rename it to
1574          * prevent this problem.
1575          *
1576          * Note no lock required on the relation, because we already hold an
1577          * exclusive lock on it.
1578          */
1579         if (!swap_toast_by_content)
1580         {
1581                 Relation        newrel;
1582
1583                 newrel = table_open(OIDOldHeap, NoLock);
1584                 if (OidIsValid(newrel->rd_rel->reltoastrelid))
1585                 {
1586                         Oid                     toastidx;
1587                         char            NewToastName[NAMEDATALEN];
1588
1589                         /* Get the associated valid index to be renamed */
1590                         toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1591                                                                                          NoLock);
1592
1593                         /* rename the toast table ... */
1594                         snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1595                                          OIDOldHeap);
1596                         RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1597                                                                    NewToastName, true, false);
1598
1599                         /* ... and its valid index too. */
1600                         snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1601                                          OIDOldHeap);
1602
1603                         RenameRelationInternal(toastidx,
1604                                                                    NewToastName, true, true);
1605
1606                         /*
1607                          * Reset the relrewrite for the toast. The command-counter
1608                          * increment is required here as we are about to update the tuple
1609                          * that is updated as part of RenameRelationInternal.
1610                          */
1611                         CommandCounterIncrement();
1612                         ResetRelRewrite(newrel->rd_rel->reltoastrelid);
1613                 }
1614                 relation_close(newrel, NoLock);
1615         }
1616
1617         /* if it's not a catalog table, clear any missing attribute settings */
1618         if (!is_system_catalog)
1619         {
1620                 Relation        newrel;
1621
1622                 newrel = table_open(OIDOldHeap, NoLock);
1623                 RelationClearMissing(newrel);
1624                 relation_close(newrel, NoLock);
1625         }
1626 }
1627
1628
1629 /*
1630  * Get a list of tables that the current user has privileges on and
1631  * have indisclustered set.  Return the list in a List * of RelToCluster
1632  * (stored in the specified memory context), each one giving the tableOid
1633  * and the indexOid on which the table is already clustered.
1634  */
1635 static List *
1636 get_tables_to_cluster(MemoryContext cluster_context)
1637 {
1638         Relation        indRelation;
1639         TableScanDesc scan;
1640         ScanKeyData entry;
1641         HeapTuple       indexTuple;
1642         Form_pg_index index;
1643         MemoryContext old_context;
1644         List       *rtcs = NIL;
1645
1646         /*
1647          * Get all indexes that have indisclustered set and that the current user
1648          * has the appropriate privileges for.
1649          */
1650         indRelation = table_open(IndexRelationId, AccessShareLock);
1651         ScanKeyInit(&entry,
1652                                 Anum_pg_index_indisclustered,
1653                                 BTEqualStrategyNumber, F_BOOLEQ,
1654                                 BoolGetDatum(true));
1655         scan = table_beginscan_catalog(indRelation, 1, &entry);
1656         while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1657         {
1658                 RelToCluster *rtc;
1659
1660                 index = (Form_pg_index) GETSTRUCT(indexTuple);
1661
1662                 if (!cluster_is_permitted_for_relation(index->indrelid, GetUserId()))
1663                         continue;
1664
1665                 /* Use a permanent memory context for the result list */
1666                 old_context = MemoryContextSwitchTo(cluster_context);
1667
1668                 rtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1669                 rtc->tableOid = index->indrelid;
1670                 rtc->indexOid = index->indexrelid;
1671                 rtcs = lappend(rtcs, rtc);
1672
1673                 MemoryContextSwitchTo(old_context);
1674         }
1675         table_endscan(scan);
1676
1677         relation_close(indRelation, AccessShareLock);
1678
1679         return rtcs;
1680 }
1681
1682 /*
1683  * Given an index on a partitioned table, return a list of RelToCluster for
1684  * all the children leaves tables/indexes.
1685  *
1686  * Like expand_vacuum_rel, but here caller must hold AccessExclusiveLock
1687  * on the table containing the index.
1688  */
1689 static List *
1690 get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid)
1691 {
1692         List       *inhoids;
1693         ListCell   *lc;
1694         List       *rtcs = NIL;
1695         MemoryContext old_context;
1696
1697         /* Do not lock the children until they're processed */
1698         inhoids = find_all_inheritors(indexOid, NoLock, NULL);
1699
1700         foreach(lc, inhoids)
1701         {
1702                 Oid                     indexrelid = lfirst_oid(lc);
1703                 Oid                     relid = IndexGetRelation(indexrelid, false);
1704                 RelToCluster *rtc;
1705
1706                 /* consider only leaf indexes */
1707                 if (get_rel_relkind(indexrelid) != RELKIND_INDEX)
1708                         continue;
1709
1710                 /*
1711                  * It's possible that the user does not have privileges to CLUSTER the
1712                  * leaf partition despite having such privileges on the partitioned
1713                  * table.  We skip any partitions which the user is not permitted to
1714                  * CLUSTER.
1715                  */
1716                 if (!cluster_is_permitted_for_relation(relid, GetUserId()))
1717                         continue;
1718
1719                 /* Use a permanent memory context for the result list */
1720                 old_context = MemoryContextSwitchTo(cluster_context);
1721
1722                 rtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1723                 rtc->tableOid = relid;
1724                 rtc->indexOid = indexrelid;
1725                 rtcs = lappend(rtcs, rtc);
1726
1727                 MemoryContextSwitchTo(old_context);
1728         }
1729
1730         return rtcs;
1731 }
1732
1733 /*
1734  * Return whether userid has privileges to CLUSTER relid.  If not, this
1735  * function emits a WARNING.
1736  */
1737 static bool
1738 cluster_is_permitted_for_relation(Oid relid, Oid userid)
1739 {
1740         if (pg_class_aclcheck(relid, userid, ACL_MAINTAIN) == ACLCHECK_OK)
1741                 return true;
1742
1743         ereport(WARNING,
1744                         (errmsg("permission denied to cluster \"%s\", skipping it",
1745                                         get_rel_name(relid))));
1746         return false;
1747 }