Force a checkpoint in CREATE DATABASE before starting to copy the files,
[PostgreSQL.git] / src / backend / commands / cluster.c
blobdf75c48dcea5eb711269632853d4934656fd0d96
1 /*-------------------------------------------------------------------------
3 * cluster.c
4 * CLUSTER a table on an index.
6 * There is hardly anything left of Paul Brown's original implementation...
9 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994-5, Regents of the University of California
13 * IDENTIFICATION
14 * $PostgreSQL$
16 *-------------------------------------------------------------------------
18 #include "postgres.h"
20 #include "access/genam.h"
21 #include "access/heapam.h"
22 #include "access/relscan.h"
23 #include "access/rewriteheap.h"
24 #include "access/transam.h"
25 #include "access/xact.h"
26 #include "catalog/catalog.h"
27 #include "catalog/dependency.h"
28 #include "catalog/heap.h"
29 #include "catalog/index.h"
30 #include "catalog/indexing.h"
31 #include "catalog/namespace.h"
32 #include "catalog/toasting.h"
33 #include "commands/cluster.h"
34 #include "commands/tablecmds.h"
35 #include "commands/trigger.h"
36 #include "commands/vacuum.h"
37 #include "miscadmin.h"
38 #include "storage/bufmgr.h"
39 #include "storage/procarray.h"
40 #include "utils/acl.h"
41 #include "utils/fmgroids.h"
42 #include "utils/inval.h"
43 #include "utils/lsyscache.h"
44 #include "utils/memutils.h"
45 #include "utils/relcache.h"
46 #include "utils/snapmgr.h"
47 #include "utils/syscache.h"
48 #include "utils/tqual.h"
52 * This struct is used to pass around the information on tables to be
53 * clustered. We need this so we can make a list of them when invoked without
54 * a specific table/index pair.
56 typedef struct
58 Oid tableOid;
59 Oid indexOid;
60 } RelToCluster;
63 static void cluster_rel(RelToCluster *rv, bool recheck);
64 static void rebuild_relation(Relation OldHeap, Oid indexOid);
65 static TransactionId copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex);
66 static List *get_tables_to_cluster(MemoryContext cluster_context);
70 /*---------------------------------------------------------------------------
71 * This cluster code allows for clustering multiple tables at once. Because
72 * of this, we cannot just run everything on a single transaction, or we
73 * would be forced to acquire exclusive locks on all the tables being
74 * clustered, simultaneously --- very likely leading to deadlock.
76 * To solve this we follow a similar strategy to VACUUM code,
77 * clustering each relation in a separate transaction. For this to work,
78 * we need to:
79 * - provide a separate memory context so that we can pass information in
80 * a way that survives across transactions
81 * - start a new transaction every time a new relation is clustered
82 * - check for validity of the information on to-be-clustered relations,
83 * as someone might have deleted a relation behind our back, or
84 * clustered one on a different index
85 * - end the transaction
87 * The single-relation case does not have any such overhead.
89 * We also allow a relation to be specified without index. In that case,
90 * the indisclustered bit will be looked up, and an ERROR will be thrown
91 * if there is no index with the bit set.
92 *---------------------------------------------------------------------------
94 void
95 cluster(ClusterStmt *stmt, bool isTopLevel)
97 if (stmt->relation != NULL)
99 /* This is the single-relation case. */
100 Oid tableOid,
101 indexOid = InvalidOid;
102 Relation rel;
103 RelToCluster rvtc;
105 /* Find and lock the table */
106 rel = heap_openrv(stmt->relation, AccessExclusiveLock);
108 tableOid = RelationGetRelid(rel);
110 /* Check permissions */
111 if (!pg_class_ownercheck(tableOid, GetUserId()))
112 aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
113 RelationGetRelationName(rel));
116 * Reject clustering a remote temp table ... their local buffer
117 * manager is not going to cope.
119 if (isOtherTempNamespace(RelationGetNamespace(rel)))
120 ereport(ERROR,
121 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
122 errmsg("cannot cluster temporary tables of other sessions")));
124 if (stmt->indexname == NULL)
126 ListCell *index;
128 /* We need to find the index that has indisclustered set. */
129 foreach(index, RelationGetIndexList(rel))
131 HeapTuple idxtuple;
132 Form_pg_index indexForm;
134 indexOid = lfirst_oid(index);
135 idxtuple = SearchSysCache(INDEXRELID,
136 ObjectIdGetDatum(indexOid),
137 0, 0, 0);
138 if (!HeapTupleIsValid(idxtuple))
139 elog(ERROR, "cache lookup failed for index %u", indexOid);
140 indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
141 if (indexForm->indisclustered)
143 ReleaseSysCache(idxtuple);
144 break;
146 ReleaseSysCache(idxtuple);
147 indexOid = InvalidOid;
150 if (!OidIsValid(indexOid))
151 ereport(ERROR,
152 (errcode(ERRCODE_UNDEFINED_OBJECT),
153 errmsg("there is no previously clustered index for table \"%s\"",
154 stmt->relation->relname)));
156 else
159 * The index is expected to be in the same namespace as the
160 * relation.
162 indexOid = get_relname_relid(stmt->indexname,
163 rel->rd_rel->relnamespace);
164 if (!OidIsValid(indexOid))
165 ereport(ERROR,
166 (errcode(ERRCODE_UNDEFINED_OBJECT),
167 errmsg("index \"%s\" for table \"%s\" does not exist",
168 stmt->indexname, stmt->relation->relname)));
171 /* All other checks are done in cluster_rel() */
172 rvtc.tableOid = tableOid;
173 rvtc.indexOid = indexOid;
175 /* close relation, keep lock till commit */
176 heap_close(rel, NoLock);
178 /* Do the job */
179 cluster_rel(&rvtc, false);
181 else
184 * This is the "multi relation" case. We need to cluster all tables
185 * that have some index with indisclustered set.
187 MemoryContext cluster_context;
188 List *rvs;
189 ListCell *rv;
192 * We cannot run this form of CLUSTER inside a user transaction block;
193 * we'd be holding locks way too long.
195 PreventTransactionChain(isTopLevel, "CLUSTER");
198 * Create special memory context for cross-transaction storage.
200 * Since it is a child of PortalContext, it will go away even in case
201 * of error.
203 cluster_context = AllocSetContextCreate(PortalContext,
204 "Cluster",
205 ALLOCSET_DEFAULT_MINSIZE,
206 ALLOCSET_DEFAULT_INITSIZE,
207 ALLOCSET_DEFAULT_MAXSIZE);
210 * Build the list of relations to cluster. Note that this lives in
211 * cluster_context.
213 rvs = get_tables_to_cluster(cluster_context);
215 /* Commit to get out of starting transaction */
216 PopActiveSnapshot();
217 CommitTransactionCommand();
219 /* Ok, now that we've got them all, cluster them one by one */
220 foreach(rv, rvs)
222 RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
224 /* Start a new transaction for each relation. */
225 StartTransactionCommand();
226 /* functions in indexes may want a snapshot set */
227 PushActiveSnapshot(GetTransactionSnapshot());
228 cluster_rel(rvtc, true);
229 PopActiveSnapshot();
230 CommitTransactionCommand();
233 /* Start a new transaction for the cleanup work. */
234 StartTransactionCommand();
236 /* Clean up working storage */
237 MemoryContextDelete(cluster_context);
242 * cluster_rel
244 * This clusters the table by creating a new, clustered table and
245 * swapping the relfilenodes of the new table and the old table, so
246 * the OID of the original table is preserved. Thus we do not lose
247 * GRANT, inheritance nor references to this table (this was a bug
248 * in releases thru 7.3).
250 * Also create new indexes and swap the filenodes with the old indexes the
251 * same way we do for the relation. Since we are effectively bulk-loading
252 * the new table, it's better to create the indexes afterwards than to fill
253 * them incrementally while we load the table.
255 static void
256 cluster_rel(RelToCluster *rvtc, bool recheck)
258 Relation OldHeap;
260 /* Check for user-requested abort. */
261 CHECK_FOR_INTERRUPTS();
264 * We grab exclusive access to the target rel and index for the duration
265 * of the transaction. (This is redundant for the single-transaction
266 * case, since cluster() already did it.) The index lock is taken inside
267 * check_index_is_clusterable.
269 OldHeap = try_relation_open(rvtc->tableOid, AccessExclusiveLock);
271 /* If the table has gone away, we can skip processing it */
272 if (!OldHeap)
273 return;
276 * Since we may open a new transaction for each relation, we have to check
277 * that the relation still is what we think it is.
279 * If this is a single-transaction CLUSTER, we can skip these tests. We
280 * *must* skip the one on indisclustered since it would reject an attempt
281 * to cluster a not-previously-clustered index.
283 if (recheck)
285 HeapTuple tuple;
286 Form_pg_index indexForm;
288 /* Check that the user still owns the relation */
289 if (!pg_class_ownercheck(rvtc->tableOid, GetUserId()))
291 relation_close(OldHeap, AccessExclusiveLock);
292 return;
296 * Silently skip a temp table for a remote session. Only doing this
297 * check in the "recheck" case is appropriate (which currently means
298 * somebody is executing a database-wide CLUSTER), because there is
299 * another check in cluster() which will stop any attempt to cluster
300 * remote temp tables by name. There is another check in
301 * check_index_is_clusterable which is redundant, but we leave it for
302 * extra safety.
304 if (isOtherTempNamespace(RelationGetNamespace(OldHeap)))
306 relation_close(OldHeap, AccessExclusiveLock);
307 return;
311 * Check that the index still exists
313 if (!SearchSysCacheExists(RELOID,
314 ObjectIdGetDatum(rvtc->indexOid),
315 0, 0, 0))
317 relation_close(OldHeap, AccessExclusiveLock);
318 return;
322 * Check that the index is still the one with indisclustered set.
324 tuple = SearchSysCache(INDEXRELID,
325 ObjectIdGetDatum(rvtc->indexOid),
326 0, 0, 0);
327 if (!HeapTupleIsValid(tuple)) /* probably can't happen */
329 relation_close(OldHeap, AccessExclusiveLock);
330 return;
332 indexForm = (Form_pg_index) GETSTRUCT(tuple);
333 if (!indexForm->indisclustered)
335 ReleaseSysCache(tuple);
336 relation_close(OldHeap, AccessExclusiveLock);
337 return;
339 ReleaseSysCache(tuple);
342 /* Check index is valid to cluster on */
343 check_index_is_clusterable(OldHeap, rvtc->indexOid, recheck);
345 /* rebuild_relation does all the dirty work */
346 rebuild_relation(OldHeap, rvtc->indexOid);
348 /* NB: rebuild_relation does heap_close() on OldHeap */
352 * Verify that the specified index is a legitimate index to cluster on
354 * Side effect: obtains exclusive lock on the index. The caller should
355 * already have exclusive lock on the table, so the index lock is likely
356 * redundant, but it seems best to grab it anyway to ensure the index
357 * definition can't change under us.
359 void
360 check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck)
362 Relation OldIndex;
364 OldIndex = index_open(indexOid, AccessExclusiveLock);
367 * Check that index is in fact an index on the given relation
369 if (OldIndex->rd_index == NULL ||
370 OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
371 ereport(ERROR,
372 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
373 errmsg("\"%s\" is not an index for table \"%s\"",
374 RelationGetRelationName(OldIndex),
375 RelationGetRelationName(OldHeap))));
378 * Disallow clustering on incomplete indexes (those that might not index
379 * every row of the relation). We could relax this by making a separate
380 * seqscan pass over the table to copy the missing rows, but that seems
381 * expensive and tedious.
383 if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred))
384 ereport(ERROR,
385 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
386 errmsg("cannot cluster on partial index \"%s\"",
387 RelationGetRelationName(OldIndex))));
389 if (!OldIndex->rd_am->amclusterable)
390 ereport(ERROR,
391 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
392 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
393 RelationGetRelationName(OldIndex))));
395 if (!OldIndex->rd_am->amindexnulls)
397 AttrNumber colno;
400 * If the AM doesn't index nulls, then it's a partial index unless we
401 * can prove all the rows are non-null. Note we only need look at the
402 * first column; multicolumn-capable AMs are *required* to index nulls
403 * in columns after the first.
405 colno = OldIndex->rd_index->indkey.values[0];
406 if (colno > 0)
408 /* ordinary user attribute */
409 if (!OldHeap->rd_att->attrs[colno - 1]->attnotnull)
410 ereport(ERROR,
411 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
412 errmsg("cannot cluster on index \"%s\" because access method does not handle null values",
413 RelationGetRelationName(OldIndex)),
414 recheck
415 ? errhint("You might be able to work around this by marking column \"%s\" NOT NULL, or use ALTER TABLE ... SET WITHOUT CLUSTER to remove the cluster specification from the table.",
416 NameStr(OldHeap->rd_att->attrs[colno - 1]->attname))
417 : errhint("You might be able to work around this by marking column \"%s\" NOT NULL.",
418 NameStr(OldHeap->rd_att->attrs[colno - 1]->attname))));
420 else if (colno < 0)
422 /* system column --- okay, always non-null */
424 else
425 /* index expression, lose... */
426 ereport(ERROR,
427 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
428 errmsg("cannot cluster on expressional index \"%s\" because its index access method does not handle null values",
429 RelationGetRelationName(OldIndex))));
433 * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
434 * it might well not contain entries for every heap row, or might not even
435 * be internally consistent. (But note that we don't check indcheckxmin;
436 * the worst consequence of following broken HOT chains would be that we
437 * might put recently-dead tuples out-of-order in the new table, and there
438 * is little harm in that.)
440 if (!OldIndex->rd_index->indisvalid)
441 ereport(ERROR,
442 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
443 errmsg("cannot cluster on invalid index \"%s\"",
444 RelationGetRelationName(OldIndex))));
447 * Disallow clustering system relations. This will definitely NOT work
448 * for shared relations (we have no way to update pg_class rows in other
449 * databases), nor for nailed-in-cache relations (the relfilenode values
450 * for those are hardwired, see relcache.c). It might work for other
451 * system relations, but I ain't gonna risk it.
453 if (IsSystemRelation(OldHeap))
454 ereport(ERROR,
455 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
456 errmsg("\"%s\" is a system catalog",
457 RelationGetRelationName(OldHeap))));
460 * Don't allow cluster on temp tables of other backends ... their local
461 * buffer manager is not going to cope.
463 if (isOtherTempNamespace(RelationGetNamespace(OldHeap)))
464 ereport(ERROR,
465 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
466 errmsg("cannot cluster temporary tables of other sessions")));
469 * Also check for active uses of the relation in the current transaction,
470 * including open scans and pending AFTER trigger events.
472 CheckTableNotInUse(OldHeap, "CLUSTER");
474 /* Drop relcache refcnt on OldIndex, but keep lock */
475 index_close(OldIndex, NoLock);
479 * mark_index_clustered: mark the specified index as the one clustered on
481 * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
483 void
484 mark_index_clustered(Relation rel, Oid indexOid)
486 HeapTuple indexTuple;
487 Form_pg_index indexForm;
488 Relation pg_index;
489 ListCell *index;
492 * If the index is already marked clustered, no need to do anything.
494 if (OidIsValid(indexOid))
496 indexTuple = SearchSysCache(INDEXRELID,
497 ObjectIdGetDatum(indexOid),
498 0, 0, 0);
499 if (!HeapTupleIsValid(indexTuple))
500 elog(ERROR, "cache lookup failed for index %u", indexOid);
501 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
503 if (indexForm->indisclustered)
505 ReleaseSysCache(indexTuple);
506 return;
509 ReleaseSysCache(indexTuple);
513 * Check each index of the relation and set/clear the bit as needed.
515 pg_index = heap_open(IndexRelationId, RowExclusiveLock);
517 foreach(index, RelationGetIndexList(rel))
519 Oid thisIndexOid = lfirst_oid(index);
521 indexTuple = SearchSysCacheCopy(INDEXRELID,
522 ObjectIdGetDatum(thisIndexOid),
523 0, 0, 0);
524 if (!HeapTupleIsValid(indexTuple))
525 elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
526 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
529 * Unset the bit if set. We know it's wrong because we checked this
530 * earlier.
532 if (indexForm->indisclustered)
534 indexForm->indisclustered = false;
535 simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
536 CatalogUpdateIndexes(pg_index, indexTuple);
537 /* Ensure we see the update in the index's relcache entry */
538 CacheInvalidateRelcacheByRelid(thisIndexOid);
540 else if (thisIndexOid == indexOid)
542 indexForm->indisclustered = true;
543 simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
544 CatalogUpdateIndexes(pg_index, indexTuple);
545 /* Ensure we see the update in the index's relcache entry */
546 CacheInvalidateRelcacheByRelid(thisIndexOid);
548 heap_freetuple(indexTuple);
551 heap_close(pg_index, RowExclusiveLock);
555 * rebuild_relation: rebuild an existing relation in index order
557 * OldHeap: table to rebuild --- must be opened and exclusive-locked!
558 * indexOid: index to cluster by
560 * NB: this routine closes OldHeap at the right time; caller should not.
562 static void
563 rebuild_relation(Relation OldHeap, Oid indexOid)
565 Oid tableOid = RelationGetRelid(OldHeap);
566 Oid tableSpace = OldHeap->rd_rel->reltablespace;
567 Oid OIDNewHeap;
568 char NewHeapName[NAMEDATALEN];
569 TransactionId frozenXid;
570 ObjectAddress object;
572 /* Mark the correct index as clustered */
573 mark_index_clustered(OldHeap, indexOid);
575 /* Close relcache entry, but keep lock until transaction commit */
576 heap_close(OldHeap, NoLock);
579 * Create the new heap, using a temporary name in the same namespace as
580 * the existing table. NOTE: there is some risk of collision with user
581 * relnames. Working around this seems more trouble than it's worth; in
582 * particular, we can't create the new heap in a different namespace from
583 * the old, or we will have problems with the TEMP status of temp tables.
585 snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", tableOid);
587 OIDNewHeap = make_new_heap(tableOid, NewHeapName, tableSpace);
590 * We don't need CommandCounterIncrement() because make_new_heap did it.
594 * Copy the heap data into the new table in the desired order.
596 frozenXid = copy_heap_data(OIDNewHeap, tableOid, indexOid);
598 /* To make the new heap's data visible (probably not needed?). */
599 CommandCounterIncrement();
601 /* Swap the physical files of the old and new heaps. */
602 swap_relation_files(tableOid, OIDNewHeap, frozenXid);
604 CommandCounterIncrement();
606 /* Destroy new heap with old filenode */
607 object.classId = RelationRelationId;
608 object.objectId = OIDNewHeap;
609 object.objectSubId = 0;
612 * The new relation is local to our transaction and we know nothing
613 * depends on it, so DROP_RESTRICT should be OK.
615 performDeletion(&object, DROP_RESTRICT);
617 /* performDeletion does CommandCounterIncrement at end */
620 * Rebuild each index on the relation (but not the toast table, which is
621 * all-new at this point). We do not need CommandCounterIncrement()
622 * because reindex_relation does it.
624 reindex_relation(tableOid, false);
628 * Create the new table that we will fill with correctly-ordered data.
631 make_new_heap(Oid OIDOldHeap, const char *NewName, Oid NewTableSpace)
633 TupleDesc OldHeapDesc,
634 tupdesc;
635 Oid OIDNewHeap;
636 Relation OldHeap;
637 HeapTuple tuple;
638 Datum reloptions;
639 bool isNull;
641 OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
642 OldHeapDesc = RelationGetDescr(OldHeap);
645 * Need to make a copy of the tuple descriptor, since
646 * heap_create_with_catalog modifies it. Note that the NewHeap will
647 * not receive any of the defaults or constraints associated with the
648 * OldHeap; we don't need 'em, and there's no reason to spend cycles
649 * inserting them into the catalogs only to delete them.
651 tupdesc = CreateTupleDescCopy(OldHeapDesc);
654 * Use options of the old heap for new heap.
656 tuple = SearchSysCache(RELOID,
657 ObjectIdGetDatum(OIDOldHeap),
658 0, 0, 0);
659 if (!HeapTupleIsValid(tuple))
660 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
661 reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
662 &isNull);
663 if (isNull)
664 reloptions = (Datum) 0;
666 OIDNewHeap = heap_create_with_catalog(NewName,
667 RelationGetNamespace(OldHeap),
668 NewTableSpace,
669 InvalidOid,
670 OldHeap->rd_rel->relowner,
671 tupdesc,
672 NIL,
673 OldHeap->rd_rel->relkind,
674 OldHeap->rd_rel->relisshared,
675 true,
677 ONCOMMIT_NOOP,
678 reloptions,
679 allowSystemTableMods);
681 ReleaseSysCache(tuple);
684 * Advance command counter so that the newly-created relation's catalog
685 * tuples will be visible to heap_open.
687 CommandCounterIncrement();
690 * If necessary, create a TOAST table for the new relation. Note that
691 * AlterTableCreateToastTable ends with CommandCounterIncrement(), so that
692 * the TOAST table will be visible for insertion.
694 AlterTableCreateToastTable(OIDNewHeap);
696 heap_close(OldHeap, NoLock);
698 return OIDNewHeap;
702 * Do the physical copying of heap data. Returns the TransactionId used as
703 * freeze cutoff point for the tuples.
705 static TransactionId
706 copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
708 Relation NewHeap,
709 OldHeap,
710 OldIndex;
711 TupleDesc oldTupDesc;
712 TupleDesc newTupDesc;
713 int natts;
714 Datum *values;
715 bool *isnull;
716 IndexScanDesc scan;
717 HeapTuple tuple;
718 bool use_wal;
719 TransactionId OldestXmin;
720 TransactionId FreezeXid;
721 RewriteState rwstate;
724 * Open the relations we need.
726 NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
727 OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
728 OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
731 * Their tuple descriptors should be exactly alike, but here we only need
732 * assume that they have the same number of columns.
734 oldTupDesc = RelationGetDescr(OldHeap);
735 newTupDesc = RelationGetDescr(NewHeap);
736 Assert(newTupDesc->natts == oldTupDesc->natts);
738 /* Preallocate values/isnull arrays */
739 natts = newTupDesc->natts;
740 values = (Datum *) palloc(natts * sizeof(Datum));
741 isnull = (bool *) palloc(natts * sizeof(bool));
744 * We need to log the copied data in WAL iff WAL archiving is enabled AND
745 * it's not a temp rel.
747 use_wal = XLogArchivingActive() && !NewHeap->rd_istemp;
749 /* use_wal off requires rd_targblock be initially invalid */
750 Assert(NewHeap->rd_targblock == InvalidBlockNumber);
753 * compute xids used to freeze and weed out dead tuples. We use -1
754 * freeze_min_age to avoid having CLUSTER freeze tuples earlier than a
755 * plain VACUUM would.
757 vacuum_set_xid_limits(-1, OldHeap->rd_rel->relisshared,
758 &OldestXmin, &FreezeXid);
761 * FreezeXid will become the table's new relfrozenxid, and that mustn't
762 * go backwards, so take the max.
764 if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
765 FreezeXid = OldHeap->rd_rel->relfrozenxid;
767 /* Initialize the rewrite operation */
768 rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal);
771 * Scan through the OldHeap in OldIndex order and copy each tuple into the
772 * NewHeap. To ensure we see recently-dead tuples that still need to be
773 * copied, we scan with SnapshotAny and use HeapTupleSatisfiesVacuum for
774 * the visibility test.
776 scan = index_beginscan(OldHeap, OldIndex,
777 SnapshotAny, 0, (ScanKey) NULL);
779 while ((tuple = index_getnext(scan, ForwardScanDirection)) != NULL)
781 HeapTuple copiedTuple;
782 bool isdead;
783 int i;
785 CHECK_FOR_INTERRUPTS();
787 /* Since we used no scan keys, should never need to recheck */
788 if (scan->xs_recheck)
789 elog(ERROR, "CLUSTER does not support lossy index conditions");
791 LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
793 switch (HeapTupleSatisfiesVacuum(tuple->t_data, OldestXmin,
794 scan->xs_cbuf))
796 case HEAPTUPLE_DEAD:
797 /* Definitely dead */
798 isdead = true;
799 break;
800 case HEAPTUPLE_LIVE:
801 case HEAPTUPLE_RECENTLY_DEAD:
802 /* Live or recently dead, must copy it */
803 isdead = false;
804 break;
805 case HEAPTUPLE_INSERT_IN_PROGRESS:
808 * We should not see this unless it's been inserted earlier in
809 * our own transaction.
811 if (!TransactionIdIsCurrentTransactionId(
812 HeapTupleHeaderGetXmin(tuple->t_data)))
813 elog(ERROR, "concurrent insert in progress");
814 /* treat as live */
815 isdead = false;
816 break;
817 case HEAPTUPLE_DELETE_IN_PROGRESS:
820 * We should not see this unless it's been deleted earlier in
821 * our own transaction.
823 Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
824 if (!TransactionIdIsCurrentTransactionId(
825 HeapTupleHeaderGetXmax(tuple->t_data)))
826 elog(ERROR, "concurrent delete in progress");
827 /* treat as recently dead */
828 isdead = false;
829 break;
830 default:
831 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
832 isdead = false; /* keep compiler quiet */
833 break;
836 LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
838 if (isdead)
840 /* heap rewrite module still needs to see it... */
841 rewrite_heap_dead_tuple(rwstate, tuple);
842 continue;
846 * We cannot simply copy the tuple as-is, for several reasons:
848 * 1. We'd like to squeeze out the values of any dropped columns, both
849 * to save space and to ensure we have no corner-case failures. (It's
850 * possible for example that the new table hasn't got a TOAST table
851 * and so is unable to store any large values of dropped cols.)
853 * 2. The tuple might not even be legal for the new table; this is
854 * currently only known to happen as an after-effect of ALTER TABLE
855 * SET WITHOUT OIDS.
857 * So, we must reconstruct the tuple from component Datums.
859 heap_deform_tuple(tuple, oldTupDesc, values, isnull);
861 /* Be sure to null out any dropped columns */
862 for (i = 0; i < natts; i++)
864 if (newTupDesc->attrs[i]->attisdropped)
865 isnull[i] = true;
868 copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
870 /* Preserve OID, if any */
871 if (NewHeap->rd_rel->relhasoids)
872 HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
874 /* The heap rewrite module does the rest */
875 rewrite_heap_tuple(rwstate, tuple, copiedTuple);
877 heap_freetuple(copiedTuple);
880 index_endscan(scan);
882 /* Write out any remaining tuples, and fsync if needed */
883 end_heap_rewrite(rwstate);
885 pfree(values);
886 pfree(isnull);
888 index_close(OldIndex, NoLock);
889 heap_close(OldHeap, NoLock);
890 heap_close(NewHeap, NoLock);
892 return FreezeXid;
896 * Swap the physical files of two given relations.
898 * We swap the physical identity (reltablespace and relfilenode) while
899 * keeping the same logical identities of the two relations.
901 * Also swap any TOAST links, so that the toast data moves along with
902 * the main-table data.
904 * Additionally, the first relation is marked with relfrozenxid set to
905 * frozenXid. It seems a bit ugly to have this here, but all callers would
906 * have to do it anyway, so having it here saves a heap_update. Note: the
907 * TOAST table needs no special handling, because since we swapped the links,
908 * the entry for the TOAST table will now contain RecentXmin in relfrozenxid,
909 * which is the correct value.
911 void
912 swap_relation_files(Oid r1, Oid r2, TransactionId frozenXid)
914 Relation relRelation;
915 HeapTuple reltup1,
916 reltup2;
917 Form_pg_class relform1,
918 relform2;
919 Oid swaptemp;
920 CatalogIndexState indstate;
922 /* We need writable copies of both pg_class tuples. */
923 relRelation = heap_open(RelationRelationId, RowExclusiveLock);
925 reltup1 = SearchSysCacheCopy(RELOID,
926 ObjectIdGetDatum(r1),
927 0, 0, 0);
928 if (!HeapTupleIsValid(reltup1))
929 elog(ERROR, "cache lookup failed for relation %u", r1);
930 relform1 = (Form_pg_class) GETSTRUCT(reltup1);
932 reltup2 = SearchSysCacheCopy(RELOID,
933 ObjectIdGetDatum(r2),
934 0, 0, 0);
935 if (!HeapTupleIsValid(reltup2))
936 elog(ERROR, "cache lookup failed for relation %u", r2);
937 relform2 = (Form_pg_class) GETSTRUCT(reltup2);
940 * Actually swap the fields in the two tuples
942 swaptemp = relform1->relfilenode;
943 relform1->relfilenode = relform2->relfilenode;
944 relform2->relfilenode = swaptemp;
946 swaptemp = relform1->reltablespace;
947 relform1->reltablespace = relform2->reltablespace;
948 relform2->reltablespace = swaptemp;
950 swaptemp = relform1->reltoastrelid;
951 relform1->reltoastrelid = relform2->reltoastrelid;
952 relform2->reltoastrelid = swaptemp;
954 /* we should not swap reltoastidxid */
956 /* set rel1's frozen Xid */
957 Assert(TransactionIdIsNormal(frozenXid));
958 relform1->relfrozenxid = frozenXid;
960 /* swap size statistics too, since new rel has freshly-updated stats */
962 int4 swap_pages;
963 float4 swap_tuples;
965 swap_pages = relform1->relpages;
966 relform1->relpages = relform2->relpages;
967 relform2->relpages = swap_pages;
969 swap_tuples = relform1->reltuples;
970 relform1->reltuples = relform2->reltuples;
971 relform2->reltuples = swap_tuples;
974 /* Update the tuples in pg_class */
975 simple_heap_update(relRelation, &reltup1->t_self, reltup1);
976 simple_heap_update(relRelation, &reltup2->t_self, reltup2);
978 /* Keep system catalogs current */
979 indstate = CatalogOpenIndexes(relRelation);
980 CatalogIndexInsert(indstate, reltup1);
981 CatalogIndexInsert(indstate, reltup2);
982 CatalogCloseIndexes(indstate);
985 * If we have toast tables associated with the relations being swapped,
986 * change their dependency links to re-associate them with their new
987 * owning relations. Otherwise the wrong one will get dropped ...
989 * NOTE: it is possible that only one table has a toast table; this can
990 * happen in CLUSTER if there were dropped columns in the old table, and
991 * in ALTER TABLE when adding or changing type of columns.
993 * NOTE: at present, a TOAST table's only dependency is the one on its
994 * owning table. If more are ever created, we'd need to use something
995 * more selective than deleteDependencyRecordsFor() to get rid of only the
996 * link we want.
998 if (relform1->reltoastrelid || relform2->reltoastrelid)
1000 ObjectAddress baseobject,
1001 toastobject;
1002 long count;
1004 /* Delete old dependencies */
1005 if (relform1->reltoastrelid)
1007 count = deleteDependencyRecordsFor(RelationRelationId,
1008 relform1->reltoastrelid);
1009 if (count != 1)
1010 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1011 count);
1013 if (relform2->reltoastrelid)
1015 count = deleteDependencyRecordsFor(RelationRelationId,
1016 relform2->reltoastrelid);
1017 if (count != 1)
1018 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1019 count);
1022 /* Register new dependencies */
1023 baseobject.classId = RelationRelationId;
1024 baseobject.objectSubId = 0;
1025 toastobject.classId = RelationRelationId;
1026 toastobject.objectSubId = 0;
1028 if (relform1->reltoastrelid)
1030 baseobject.objectId = r1;
1031 toastobject.objectId = relform1->reltoastrelid;
1032 recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
1035 if (relform2->reltoastrelid)
1037 baseobject.objectId = r2;
1038 toastobject.objectId = relform2->reltoastrelid;
1039 recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
1044 * Blow away the old relcache entries now. We need this kluge because
1045 * relcache.c keeps a link to the smgr relation for the physical file, and
1046 * that will be out of date as soon as we do CommandCounterIncrement.
1047 * Whichever of the rels is the second to be cleared during cache
1048 * invalidation will have a dangling reference to an already-deleted smgr
1049 * relation. Rather than trying to avoid this by ordering operations just
1050 * so, it's easiest to not have the relcache entries there at all.
1051 * (Fortunately, since one of the entries is local in our transaction,
1052 * it's sufficient to clear out our own relcache this way; the problem
1053 * cannot arise for other backends when they see our update on the
1054 * non-local relation.)
1056 RelationForgetRelation(r1);
1057 RelationForgetRelation(r2);
1059 /* Clean up. */
1060 heap_freetuple(reltup1);
1061 heap_freetuple(reltup2);
1063 heap_close(relRelation, RowExclusiveLock);
1067 * Get a list of tables that the current user owns and
1068 * have indisclustered set. Return the list in a List * of rvsToCluster
1069 * with the tableOid and the indexOid on which the table is already
1070 * clustered.
1072 static List *
1073 get_tables_to_cluster(MemoryContext cluster_context)
1075 Relation indRelation;
1076 HeapScanDesc scan;
1077 ScanKeyData entry;
1078 HeapTuple indexTuple;
1079 Form_pg_index index;
1080 MemoryContext old_context;
1081 RelToCluster *rvtc;
1082 List *rvs = NIL;
1085 * Get all indexes that have indisclustered set and are owned by
1086 * appropriate user. System relations or nailed-in relations cannot ever
1087 * have indisclustered set, because CLUSTER will refuse to set it when
1088 * called with one of them as argument.
1090 indRelation = heap_open(IndexRelationId, AccessShareLock);
1091 ScanKeyInit(&entry,
1092 Anum_pg_index_indisclustered,
1093 BTEqualStrategyNumber, F_BOOLEQ,
1094 BoolGetDatum(true));
1095 scan = heap_beginscan(indRelation, SnapshotNow, 1, &entry);
1096 while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1098 index = (Form_pg_index) GETSTRUCT(indexTuple);
1100 if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1101 continue;
1104 * We have to build the list in a different memory context so it will
1105 * survive the cross-transaction processing
1107 old_context = MemoryContextSwitchTo(cluster_context);
1109 rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1110 rvtc->tableOid = index->indrelid;
1111 rvtc->indexOid = index->indexrelid;
1112 rvs = lcons(rvtc, rvs);
1114 MemoryContextSwitchTo(old_context);
1116 heap_endscan(scan);
1118 relation_close(indRelation, AccessShareLock);
1120 return rvs;