src/backend/catalog/storage.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * storage.c
   4  *        code to create and destroy physical storage for relations
   5  *
   6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/catalog/storage.c
  12  *
  13  * NOTES
  14  *        Some of this code used to be in storage/smgr/smgr.c, and the
  15  *        function names still reflect that.
  16  *
  17  *-------------------------------------------------------------------------
  18  */
  19
  20 #include "postgres.h"
  21
  22 #include "access/visibilitymap.h"
  23 #include "access/xact.h"
  24 #include "access/xlog.h"
  25 #include "access/xloginsert.h"
  26 #include "access/xlogutils.h"
  27 #include "catalog/storage.h"
  28 #include "catalog/storage_xlog.h"
  29 #include "miscadmin.h"
  30 #include "storage/bulk_write.h"
  31 #include "storage/freespace.h"
  32 #include "storage/proc.h"
  33 #include "storage/smgr.h"
  34 #include "utils/hsearch.h"
  35 #include "utils/memutils.h"
  36 #include "utils/rel.h"
  37
/* GUC variables */

/*
 * wal_skip_threshold is expressed in kilobytes.  Per the comments in
 * smgrDoPendingSyncs(), it is the size threshold below which a relation's
 * contents are WAL-logged at commit rather than synced.
 */
int			wal_skip_threshold = 2048;
  40
  41 /*
  42  * We keep a list of all relations (represented as RelFileLocator values)
  43  * that have been created or deleted in the current transaction.  When
  44  * a relation is created, we create the physical file immediately, but
  45  * remember it so that we can delete the file again if the current
  46  * transaction is aborted.  Conversely, a deletion request is NOT
  47  * executed immediately, but is just entered in the list.  When and if
  48  * the transaction commits, we can delete the physical file.
  49  *
  50  * To handle subtransactions, every entry is marked with its transaction
  51  * nesting level.  At subtransaction commit, we reassign the subtransaction's
  52  * entries to the parent nesting level.  At subtransaction abort, we can
  53  * immediately execute the abort-time actions for all entries of the current
  54  * nesting level.
  55  *
  56  * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
  57  * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
  58  * but I'm being paranoid.
  59  */
  60
/*
 * PendingRelDelete
 *		One request to unlink a relation's storage at end of (sub)transaction.
 *
 * Entries are chained through "next" into the singly linked list headed by
 * pendingDeletes.  New entries are pushed at the head of that list and are
 * allocated in TopMemoryContext, so they must be released with pfree().
 */
typedef struct PendingRelDelete
{
	RelFileLocator rlocator;	/* relation that may need to be deleted */
	ProcNumber	procNumber;		/* INVALID_PROC_NUMBER if not a temp rel */
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
	int			nestLevel;		/* xact nesting level at which the request
								 * was made */
	struct PendingRelDelete *next;	/* next list entry, or NULL at the tail */
} PendingRelDelete;
  69
/*
 * PendingRelSync
 *		Entry of pendingSyncHash: a relation queued for at-commit sync.
 *
 * AddPendingSync() creates the hash with keysize sizeof(RelFileLocator), so
 * rlocator serves as the lookup key.
 */
typedef struct PendingRelSync
{
	RelFileLocator rlocator;	/* hash key: the relation to sync */
	bool		is_truncated;	/* Has the file experienced truncation? Set
								 * by RelationPreTruncate(). */
} PendingRelSync;
  75
/* Head of the linked list of pending deletes; NULL when the list is empty. */
static PendingRelDelete *pendingDeletes = NULL;

/*
 * Hash of PendingRelSync entries.  Stays NULL until AddPendingSync() creates
 * it on first use.
 */
static HTAB *pendingSyncHash = NULL;
  78
  79
  80 /*
  81  * AddPendingSync
  82  *              Queue an at-commit fsync.
  83  */
  84 static void
  85 AddPendingSync(const RelFileLocator *rlocator)
  86 {
  87         PendingRelSync *pending;
  88         bool            found;
  89
  90         /* create the hash if not yet */
  91         if (!pendingSyncHash)
  92         {
  93                 HASHCTL         ctl;
  94
  95                 ctl.keysize = sizeof(RelFileLocator);
  96                 ctl.entrysize = sizeof(PendingRelSync);
  97                 ctl.hcxt = TopTransactionContext;
  98                 pendingSyncHash = hash_create("pending sync hash", 16, &ctl,
  99                                                                           HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 100         }
 101
 102         pending = hash_search(pendingSyncHash, rlocator, HASH_ENTER, &found);
 103         Assert(!found);
 104         pending->is_truncated = false;
 105 }
 106
 107 /*
 108  * RelationCreateStorage
 109  *              Create physical storage for a relation.
 110  *
 111  * Create the underlying disk file storage for the relation. This only
 112  * creates the main fork; additional forks are created lazily by the
 113  * modules that need them.
 114  *
 115  * This function is transactional. The creation is WAL-logged, and if the
 116  * transaction aborts later on, the storage will be destroyed.  A caller
 117  * that does not want the storage to be destroyed in case of an abort may
 118  * pass register_delete = false.
 119  */
 120 SMgrRelation
 121 RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
 122                                           bool register_delete)
 123 {
 124         SMgrRelation srel;
 125         ProcNumber      procNumber;
 126         bool            needs_wal;
 127
 128         Assert(!IsInParallelMode());    /* couldn't update pendingSyncHash */
 129
 130         switch (relpersistence)
 131         {
 132                 case RELPERSISTENCE_TEMP:
 133                         procNumber = ProcNumberForTempRelations();
 134                         needs_wal = false;
 135                         break;
 136                 case RELPERSISTENCE_UNLOGGED:
 137                         procNumber = INVALID_PROC_NUMBER;
 138                         needs_wal = false;
 139                         break;
 140                 case RELPERSISTENCE_PERMANENT:
 141                         procNumber = INVALID_PROC_NUMBER;
 142                         needs_wal = true;
 143                         break;
 144                 default:
 145                         elog(ERROR, "invalid relpersistence: %c", relpersistence);
 146                         return NULL;            /* placate compiler */
 147         }
 148
 149         srel = smgropen(rlocator, procNumber);
 150         smgrcreate(srel, MAIN_FORKNUM, false);
 151
 152         if (needs_wal)
 153                 log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM);
 154
 155         /*
 156          * Add the relation to the list of stuff to delete at abort, if we are
 157          * asked to do so.
 158          */
 159         if (register_delete)
 160         {
 161                 PendingRelDelete *pending;
 162
 163                 pending = (PendingRelDelete *)
 164                         MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
 165                 pending->rlocator = rlocator;
 166                 pending->procNumber = procNumber;
 167                 pending->atCommit = false;      /* delete if abort */
 168                 pending->nestLevel = GetCurrentTransactionNestLevel();
 169                 pending->next = pendingDeletes;
 170                 pendingDeletes = pending;
 171         }
 172
 173         if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
 174         {
 175                 Assert(procNumber == INVALID_PROC_NUMBER);
 176                 AddPendingSync(&rlocator);
 177         }
 178
 179         return srel;
 180 }
 181
 182 /*
 183  * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
 184  */
 185 void
 186 log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
 187 {
 188         xl_smgr_create xlrec;
 189
 190         /*
 191          * Make an XLOG entry reporting the file creation.
 192          */
 193         xlrec.rlocator = *rlocator;
 194         xlrec.forkNum = forkNum;
 195
 196         XLogBeginInsert();
 197         XLogRegisterData((char *) &xlrec, sizeof(xlrec));
 198         XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
 199 }
 200
 201 /*
 202  * RelationDropStorage
 203  *              Schedule unlinking of physical storage at transaction commit.
 204  */
 205 void
 206 RelationDropStorage(Relation rel)
 207 {
 208         PendingRelDelete *pending;
 209
 210         /* Add the relation to the list of stuff to delete at commit */
 211         pending = (PendingRelDelete *)
 212                 MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
 213         pending->rlocator = rel->rd_locator;
 214         pending->procNumber = rel->rd_backend;
 215         pending->atCommit = true;       /* delete if commit */
 216         pending->nestLevel = GetCurrentTransactionNestLevel();
 217         pending->next = pendingDeletes;
 218         pendingDeletes = pending;
 219
 220         /*
 221          * NOTE: if the relation was created in this transaction, it will now be
 222          * present in the pending-delete list twice, once with atCommit true and
 223          * once with atCommit false.  Hence, it will be physically deleted at end
 224          * of xact in either case (and the other entry will be ignored by
 225          * smgrDoPendingDeletes, so no error will occur).  We could instead remove
 226          * the existing list entry and delete the physical file immediately, but
 227          * for now I'll keep the logic simple.
 228          */
 229
 230         RelationCloseSmgr(rel);
 231 }
 232
 233 /*
 234  * RelationPreserveStorage
 235  *              Mark a relation as not to be deleted after all.
 236  *
 237  * We need this function because relation mapping changes are committed
 238  * separately from commit of the whole transaction, so it's still possible
 239  * for the transaction to abort after the mapping update is done.
 240  * When a new physical relation is installed in the map, it would be
 241  * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
 242  * The relation mapper fixes this by telling us to not delete such relations
 243  * after all as part of its commit.
 244  *
 245  * We also use this to reuse an old build of an index during ALTER TABLE, this
 246  * time removing the delete-at-commit entry.
 247  *
 248  * No-op if the relation is not among those scheduled for deletion.
 249  */
 250 void
 251 RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
 252 {
 253         PendingRelDelete *pending;
 254         PendingRelDelete *prev;
 255         PendingRelDelete *next;
 256
 257         prev = NULL;
 258         for (pending = pendingDeletes; pending != NULL; pending = next)
 259         {
 260                 next = pending->next;
 261                 if (RelFileLocatorEquals(rlocator, pending->rlocator)
 262                         && pending->atCommit == atCommit)
 263                 {
 264                         /* unlink and delete list entry */
 265                         if (prev)
 266                                 prev->next = next;
 267                         else
 268                                 pendingDeletes = next;
 269                         pfree(pending);
 270                         /* prev does not change */
 271                 }
 272                 else
 273                 {
 274                         /* unrelated entry, don't touch it */
 275                         prev = pending;
 276                 }
 277         }
 278 }
 279
 280 /*
 281  * RelationTruncate
 282  *              Physically truncate a relation to the specified number of blocks.
 283  *
 284  * This includes getting rid of any buffers for the blocks that are to be
 285  * dropped.
 286  */
 287 void
 288 RelationTruncate(Relation rel, BlockNumber nblocks)
 289 {
 290         bool            fsm;
 291         bool            vm;
 292         bool            need_fsm_vacuum = false;
 293         ForkNumber      forks[MAX_FORKNUM];
 294         BlockNumber blocks[MAX_FORKNUM];
 295         int                     nforks = 0;
 296         SMgrRelation reln;
 297
 298         /*
 299          * Make sure smgr_targblock etc aren't pointing somewhere past new end.
 300          * (Note: don't rely on this reln pointer below this loop.)
 301          */
 302         reln = RelationGetSmgr(rel);
 303         reln->smgr_targblock = InvalidBlockNumber;
 304         for (int i = 0; i <= MAX_FORKNUM; ++i)
 305                 reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
 306
 307         /* Prepare for truncation of MAIN fork of the relation */
 308         forks[nforks] = MAIN_FORKNUM;
 309         blocks[nforks] = nblocks;
 310         nforks++;
 311
 312         /* Prepare for truncation of the FSM if it exists */
 313         fsm = smgrexists(RelationGetSmgr(rel), FSM_FORKNUM);
 314         if (fsm)
 315         {
 316                 blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
 317                 if (BlockNumberIsValid(blocks[nforks]))
 318                 {
 319                         forks[nforks] = FSM_FORKNUM;
 320                         nforks++;
 321                         need_fsm_vacuum = true;
 322                 }
 323         }
 324
 325         /* Prepare for truncation of the visibility map too if it exists */
 326         vm = smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM);
 327         if (vm)
 328         {
 329                 blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
 330                 if (BlockNumberIsValid(blocks[nforks]))
 331                 {
 332                         forks[nforks] = VISIBILITYMAP_FORKNUM;
 333                         nforks++;
 334                 }
 335         }
 336
 337         RelationPreTruncate(rel);
 338
 339         /*
 340          * The code which follows can interact with concurrent checkpoints in two
 341          * separate ways.
 342          *
 343          * First, the truncation operation might drop buffers that the checkpoint
 344          * otherwise would have flushed. If it does, then it's essential that the
 345          * files actually get truncated on disk before the checkpoint record is
 346          * written. Otherwise, if reply begins from that checkpoint, the
 347          * to-be-truncated blocks might still exist on disk but have older
 348          * contents than expected, which can cause replay to fail. It's OK for the
 349          * blocks to not exist on disk at all, but not for them to have the wrong
 350          * contents. For this reason, we need to set DELAY_CHKPT_COMPLETE while
 351          * this code executes.
 352          *
 353          * Second, the call to smgrtruncate() below will in turn call
 354          * RegisterSyncRequest(). We need the sync request created by that call to
 355          * be processed before the checkpoint completes. CheckPointGuts() will
 356          * call ProcessSyncRequests(), but if we register our sync request after
 357          * that happens, then the WAL record for the truncation could end up
 358          * preceding the checkpoint record, while the actual sync doesn't happen
 359          * until the next checkpoint. To prevent that, we need to set
 360          * DELAY_CHKPT_START here. That way, if the XLOG_SMGR_TRUNCATE precedes
 361          * the redo pointer of a concurrent checkpoint, we're guaranteed that the
 362          * corresponding sync request will be processed before the checkpoint
 363          * completes.
 364          */
 365         Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
 366         MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
 367
 368         /*
 369          * We WAL-log the truncation before actually truncating, which means
 370          * trouble if the truncation fails. If we then crash, the WAL replay
 371          * likely isn't going to succeed in the truncation either, and cause a
 372          * PANIC. It's tempting to put a critical section here, but that cure
 373          * would be worse than the disease. It would turn a usually harmless
 374          * failure to truncate, that might spell trouble at WAL replay, into a
 375          * certain PANIC.
 376          */
 377         if (RelationNeedsWAL(rel))
 378         {
 379                 /*
 380                  * Make an XLOG entry reporting the file truncation.
 381                  */
 382                 XLogRecPtr      lsn;
 383                 xl_smgr_truncate xlrec;
 384
 385                 xlrec.blkno = nblocks;
 386                 xlrec.rlocator = rel->rd_locator;
 387                 xlrec.flags = SMGR_TRUNCATE_ALL;
 388
 389                 XLogBeginInsert();
 390                 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
 391
 392                 lsn = XLogInsert(RM_SMGR_ID,
 393                                                  XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
 394
 395                 /*
 396                  * Flush, because otherwise the truncation of the main relation might
 397                  * hit the disk before the WAL record, and the truncation of the FSM
 398                  * or visibility map. If we crashed during that window, we'd be left
 399                  * with a truncated heap, but the FSM or visibility map would still
 400                  * contain entries for the non-existent heap pages.
 401                  */
 402                 if (fsm || vm)
 403                         XLogFlush(lsn);
 404         }
 405
 406         /*
 407          * This will first remove any buffers from the buffer pool that should no
 408          * longer exist after truncation is complete, and then truncate the
 409          * corresponding files on disk.
 410          */
 411         smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks);
 412
 413         /* We've done all the critical work, so checkpoints are OK now. */
 414         MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
 415
 416         /*
 417          * Update upper-level FSM pages to account for the truncation. This is
 418          * important because the just-truncated pages were likely marked as
 419          * all-free, and would be preferentially selected.
 420          *
 421          * NB: There's no point in delaying checkpoints until this is done.
 422          * Because the FSM is not WAL-logged, we have to be prepared for the
 423          * possibility of corruption after a crash anyway.
 424          */
 425         if (need_fsm_vacuum)
 426                 FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
 427 }
 428
 429 /*
 430  * RelationPreTruncate
 431  *              Perform AM-independent work before a physical truncation.
 432  *
 433  * If an access method's relation_nontransactional_truncate does not call
 434  * RelationTruncate(), it must call this before decreasing the table size.
 435  */
 436 void
 437 RelationPreTruncate(Relation rel)
 438 {
 439         PendingRelSync *pending;
 440
 441         if (!pendingSyncHash)
 442                 return;
 443
 444         pending = hash_search(pendingSyncHash,
 445                                                   &(RelationGetSmgr(rel)->smgr_rlocator.locator),
 446                                                   HASH_FIND, NULL);
 447         if (pending)
 448                 pending->is_truncated = true;
 449 }
 450
 451 /*
 452  * Copy a fork's data, block by block.
 453  *
 454  * Note that this requires that there is no dirty data in shared buffers. If
 455  * it's possible that there are, callers need to flush those using
 456  * e.g. FlushRelationBuffers(rel).
 457  *
 458  * Also note that this is frequently called via locutions such as
 459  *              RelationCopyStorage(RelationGetSmgr(rel), ...);
 460  * That's safe only because we perform only smgr and WAL operations here.
 461  * If we invoked anything else, a relcache flush could cause our SMgrRelation
 462  * argument to become a dangling pointer.
 463  */
 464 void
 465 RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 466                                         ForkNumber forkNum, char relpersistence)
 467 {
 468         bool            use_wal;
 469         bool            copying_initfork;
 470         BlockNumber nblocks;
 471         BlockNumber blkno;
 472         BulkWriteState *bulkstate;
 473
 474         /*
 475          * The init fork for an unlogged relation in many respects has to be
 476          * treated the same as normal relation, changes need to be WAL logged and
 477          * it needs to be synced to disk.
 478          */
 479         copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
 480                 forkNum == INIT_FORKNUM;
 481
 482         /*
 483          * We need to log the copied data in WAL iff WAL archiving/streaming is
 484          * enabled AND it's a permanent relation.  This gives the same answer as
 485          * "RelationNeedsWAL(rel) || copying_initfork", because we know the
 486          * current operation created new relation storage.
 487          */
 488         use_wal = XLogIsNeeded() &&
 489                 (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
 490
 491         bulkstate = smgr_bulk_start_smgr(dst, forkNum, use_wal);
 492
 493         nblocks = smgrnblocks(src, forkNum);
 494
 495         for (blkno = 0; blkno < nblocks; blkno++)
 496         {
 497                 BulkWriteBuffer buf;
 498
 499                 /* If we got a cancel signal during the copy of the data, quit */
 500                 CHECK_FOR_INTERRUPTS();
 501
 502                 buf = smgr_bulk_get_buf(bulkstate);
 503                 smgrread(src, forkNum, blkno, (Page) buf);
 504
 505                 if (!PageIsVerifiedExtended((Page) buf, blkno,
 506                                                                         PIV_LOG_WARNING | PIV_REPORT_STAT))
 507                 {
 508                         /*
 509                          * For paranoia's sake, capture the file path before invoking the
 510                          * ereport machinery.  This guards against the possibility of a
 511                          * relcache flush caused by, e.g., an errcontext callback.
 512                          * (errcontext callbacks shouldn't be risking any such thing, but
 513                          * people have been known to forget that rule.)
 514                          */
 515                         char       *relpath = relpathbackend(src->smgr_rlocator.locator,
 516                                                                                                  src->smgr_rlocator.backend,
 517                                                                                                  forkNum);
 518
 519                         ereport(ERROR,
 520                                         (errcode(ERRCODE_DATA_CORRUPTED),
 521                                          errmsg("invalid page in block %u of relation %s",
 522                                                         blkno, relpath)));
 523                 }
 524
 525                 /*
 526                  * Queue the page for WAL-logging and writing out.  Unfortunately we
 527                  * don't know what kind of a page this is, so we have to log the full
 528                  * page including any unused space.
 529                  */
 530                 smgr_bulk_write(bulkstate, blkno, buf, false);
 531         }
 532         smgr_bulk_finish(bulkstate);
 533 }
 534
 535 /*
 536  * RelFileLocatorSkippingWAL
 537  *              Check if a BM_PERMANENT relfilelocator is using WAL.
 538  *
 539  * Changes to certain relations must not write WAL; see "Skipping WAL for
 540  * New RelFileLocator" in src/backend/access/transam/README.  Though it is
 541  * known from Relation efficiently, this function is intended for the code
 542  * paths not having access to Relation.
 543  */
 544 bool
 545 RelFileLocatorSkippingWAL(RelFileLocator rlocator)
 546 {
 547         if (!pendingSyncHash ||
 548                 hash_search(pendingSyncHash, &rlocator, HASH_FIND, NULL) == NULL)
 549                 return false;
 550
 551         return true;
 552 }
 553
 554 /*
 555  * EstimatePendingSyncsSpace
 556  *              Estimate space needed to pass syncs to parallel workers.
 557  */
 558 Size
 559 EstimatePendingSyncsSpace(void)
 560 {
 561         long            entries;
 562
 563         entries = pendingSyncHash ? hash_get_num_entries(pendingSyncHash) : 0;
 564         return mul_size(1 + entries, sizeof(RelFileLocator));
 565 }
 566
 567 /*
 568  * SerializePendingSyncs
 569  *              Serialize syncs for parallel workers.
 570  */
 571 void
 572 SerializePendingSyncs(Size maxSize, char *startAddress)
 573 {
 574         HTAB       *tmphash;
 575         HASHCTL         ctl;
 576         HASH_SEQ_STATUS scan;
 577         PendingRelSync *sync;
 578         PendingRelDelete *delete;
 579         RelFileLocator *src;
 580         RelFileLocator *dest = (RelFileLocator *) startAddress;
 581
 582         if (!pendingSyncHash)
 583                 goto terminate;
 584
 585         /* Create temporary hash to collect active relfilelocators */
 586         ctl.keysize = sizeof(RelFileLocator);
 587         ctl.entrysize = sizeof(RelFileLocator);
 588         ctl.hcxt = CurrentMemoryContext;
 589         tmphash = hash_create("tmp relfilelocators",
 590                                                   hash_get_num_entries(pendingSyncHash), &ctl,
 591                                                   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 592
 593         /* collect all rlocator from pending syncs */
 594         hash_seq_init(&scan, pendingSyncHash);
 595         while ((sync = (PendingRelSync *) hash_seq_search(&scan)))
 596                 (void) hash_search(tmphash, &sync->rlocator, HASH_ENTER, NULL);
 597
 598         /* remove deleted rnodes */
 599         for (delete = pendingDeletes; delete != NULL; delete = delete->next)
 600                 if (delete->atCommit)
 601                         (void) hash_search(tmphash, &delete->rlocator,
 602                                                            HASH_REMOVE, NULL);
 603
 604         hash_seq_init(&scan, tmphash);
 605         while ((src = (RelFileLocator *) hash_seq_search(&scan)))
 606                 *dest++ = *src;
 607
 608         hash_destroy(tmphash);
 609
 610 terminate:
 611         MemSet(dest, 0, sizeof(RelFileLocator));
 612 }
 613
 614 /*
 615  * RestorePendingSyncs
 616  *              Restore syncs within a parallel worker.
 617  *
 618  * RelationNeedsWAL() and RelFileLocatorSkippingWAL() must offer the correct
 619  * answer to parallel workers.  Only smgrDoPendingSyncs() reads the
 620  * is_truncated field, at end of transaction.  Hence, don't restore it.
 621  */
 622 void
 623 RestorePendingSyncs(char *startAddress)
 624 {
 625         RelFileLocator *rlocator;
 626
 627         Assert(pendingSyncHash == NULL);
 628         for (rlocator = (RelFileLocator *) startAddress; rlocator->relNumber != 0;
 629                  rlocator++)
 630                 AddPendingSync(rlocator);
 631 }
 632
 633 /*
 634  *      smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
 635  *
 636  * This also runs when aborting a subxact; we want to clean up a failed
 637  * subxact immediately.
 638  *
 639  * Note: It's possible that we're being asked to remove a relation that has
 640  * no physical storage in any fork. In particular, it's possible that we're
 641  * cleaning up an old temporary relation for which RemovePgTempFiles has
 642  * already recovered the physical storage.
 643  */
 644 void
 645 smgrDoPendingDeletes(bool isCommit)
 646 {
 647         int                     nestLevel = GetCurrentTransactionNestLevel();
 648         PendingRelDelete *pending;
 649         PendingRelDelete *prev;
 650         PendingRelDelete *next;
 651         int                     nrels = 0,
 652                                 maxrels = 0;
 653         SMgrRelation *srels = NULL;
 654
 655         prev = NULL;
 656         for (pending = pendingDeletes; pending != NULL; pending = next)
 657         {
 658                 next = pending->next;
 659                 if (pending->nestLevel < nestLevel)
 660                 {
 661                         /* outer-level entries should not be processed yet */
 662                         prev = pending;
 663                 }
 664                 else
 665                 {
 666                         /* unlink list entry first, so we don't retry on failure */
 667                         if (prev)
 668                                 prev->next = next;
 669                         else
 670                                 pendingDeletes = next;
 671                         /* do deletion if called for */
 672                         if (pending->atCommit == isCommit)
 673                         {
 674                                 SMgrRelation srel;
 675
 676                                 srel = smgropen(pending->rlocator, pending->procNumber);
 677
 678                                 /* allocate the initial array, or extend it, if needed */
 679                                 if (maxrels == 0)
 680                                 {
 681                                         maxrels = 8;
 682                                         srels = palloc(sizeof(SMgrRelation) * maxrels);
 683                                 }
 684                                 else if (maxrels <= nrels)
 685                                 {
 686                                         maxrels *= 2;
 687                                         srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
 688                                 }
 689
 690                                 srels[nrels++] = srel;
 691                         }
 692                         /* must explicitly free the list entry */
 693                         pfree(pending);
 694                         /* prev does not change */
 695                 }
 696         }
 697
 698         if (nrels > 0)
 699         {
 700                 smgrdounlinkall(srels, nrels, false);
 701
 702                 for (int i = 0; i < nrels; i++)
 703                         smgrclose(srels[i]);
 704
 705                 pfree(srels);
 706         }
 707 }
 708
 709 /*
 710  *      smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
 711  */
 712 void
 713 smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
 714 {
 715         PendingRelDelete *pending;
 716         int                     nrels = 0,
 717                                 maxrels = 0;
 718         SMgrRelation *srels = NULL;
 719         HASH_SEQ_STATUS scan;
 720         PendingRelSync *pendingsync;
 721
 722         Assert(GetCurrentTransactionNestLevel() == 1);
 723
 724         if (!pendingSyncHash)
 725                 return;                                 /* no relation needs sync */
 726
 727         /* Abort -- just throw away all pending syncs */
 728         if (!isCommit)
 729         {
 730                 pendingSyncHash = NULL;
 731                 return;
 732         }
 733
 734         AssertPendingSyncs_RelationCache();
 735
 736         /* Parallel worker -- just throw away all pending syncs */
 737         if (isParallelWorker)
 738         {
 739                 pendingSyncHash = NULL;
 740                 return;
 741         }
 742
 743         /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
 744         for (pending = pendingDeletes; pending != NULL; pending = pending->next)
 745                 if (pending->atCommit)
 746                         (void) hash_search(pendingSyncHash, &pending->rlocator,
 747                                                            HASH_REMOVE, NULL);
 748
 749         hash_seq_init(&scan, pendingSyncHash);
 750         while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan)))
 751         {
 752                 ForkNumber      fork;
 753                 BlockNumber nblocks[MAX_FORKNUM + 1];
 754                 BlockNumber total_blocks = 0;
 755                 SMgrRelation srel;
 756
 757                 srel = smgropen(pendingsync->rlocator, INVALID_PROC_NUMBER);
 758
 759                 /*
 760                  * We emit newpage WAL records for smaller relations.
 761                  *
 762                  * Small WAL records have a chance to be flushed along with other
 763                  * backends' WAL records.  We emit WAL records instead of syncing for
 764                  * files that are smaller than a certain threshold, expecting faster
 765                  * commit.  The threshold is defined by the GUC wal_skip_threshold.
 766                  */
 767                 if (!pendingsync->is_truncated)
 768                 {
 769                         for (fork = 0; fork <= MAX_FORKNUM; fork++)
 770                         {
 771                                 if (smgrexists(srel, fork))
 772                                 {
 773                                         BlockNumber n = smgrnblocks(srel, fork);
 774
 775                                         /* we shouldn't come here for unlogged relations */
 776                                         Assert(fork != INIT_FORKNUM);
 777                                         nblocks[fork] = n;
 778                                         total_blocks += n;
 779                                 }
 780                                 else
 781                                         nblocks[fork] = InvalidBlockNumber;
 782                         }
 783                 }
 784
 785                 /*
 786                  * Sync file or emit WAL records for its contents.
 787                  *
 788                  * Although we emit WAL record if the file is small enough, do file
 789                  * sync regardless of the size if the file has experienced a
 790                  * truncation. It is because the file would be followed by trailing
 791                  * garbage blocks after a crash recovery if, while a past longer file
 792                  * had been flushed out, we omitted syncing-out of the file and
 793                  * emitted WAL instead.  You might think that we could choose WAL if
 794                  * the current main fork is longer than ever, but there's a case where
 795                  * main fork is longer than ever but FSM fork gets shorter.
 796                  */
 797                 if (pendingsync->is_truncated ||
 798                         total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
 799                 {
 800                         /* allocate the initial array, or extend it, if needed */
 801                         if (maxrels == 0)
 802                         {
 803                                 maxrels = 8;
 804                                 srels = palloc(sizeof(SMgrRelation) * maxrels);
 805                         }
 806                         else if (maxrels <= nrels)
 807                         {
 808                                 maxrels *= 2;
 809                                 srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
 810                         }
 811
 812                         srels[nrels++] = srel;
 813                 }
 814                 else
 815                 {
 816                         /* Emit WAL records for all blocks.  The file is small enough. */
 817                         for (fork = 0; fork <= MAX_FORKNUM; fork++)
 818                         {
 819                                 int                     n = nblocks[fork];
 820                                 Relation        rel;
 821
 822                                 if (!BlockNumberIsValid(n))
 823                                         continue;
 824
 825                                 /*
 826                                  * Emit WAL for the whole file.  Unfortunately we don't know
 827                                  * what kind of a page this is, so we have to log the full
 828                                  * page including any unused space.  ReadBufferExtended()
 829                                  * counts some pgstat events; unfortunately, we discard them.
 830                                  */
 831                                 rel = CreateFakeRelcacheEntry(srel->smgr_rlocator.locator);
 832                                 log_newpage_range(rel, fork, 0, n, false);
 833                                 FreeFakeRelcacheEntry(rel);
 834                         }
 835                 }
 836         }
 837
 838         pendingSyncHash = NULL;
 839
 840         if (nrels > 0)
 841         {
 842                 smgrdosyncall(srels, nrels);
 843                 pfree(srels);
 844         }
 845 }
 846
 847 /*
 848  * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
 849  *
 850  * The return value is the number of relations scheduled for termination.
 851  * *ptr is set to point to a freshly-palloc'd array of RelFileLocators.
 852  * If there are no relations to be deleted, *ptr is set to NULL.
 853  *
 854  * Only non-temporary relations are included in the returned list.  This is OK
 855  * because the list is used only in contexts where temporary relations don't
 856  * matter: we're either writing to the two-phase state file (and transactions
 857  * that have touched temp tables can't be prepared) or we're writing to xlog
 858  * (and all temporary files will be zapped if we restart anyway, so no need
 859  * for redo to do it also).
 860  *
 861  * Note that the list does not include anything scheduled for termination
 862  * by upper-level transactions.
 863  */
 864 int
 865 smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr)
 866 {
 867         int                     nestLevel = GetCurrentTransactionNestLevel();
 868         int                     nrels;
 869         RelFileLocator *rptr;
 870         PendingRelDelete *pending;
 871
 872         nrels = 0;
 873         for (pending = pendingDeletes; pending != NULL; pending = pending->next)
 874         {
 875                 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
 876                         && pending->procNumber == INVALID_PROC_NUMBER)
 877                         nrels++;
 878         }
 879         if (nrels == 0)
 880         {
 881                 *ptr = NULL;
 882                 return 0;
 883         }
 884         rptr = (RelFileLocator *) palloc(nrels * sizeof(RelFileLocator));
 885         *ptr = rptr;
 886         for (pending = pendingDeletes; pending != NULL; pending = pending->next)
 887         {
 888                 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
 889                         && pending->procNumber == INVALID_PROC_NUMBER)
 890                 {
 891                         *rptr = pending->rlocator;
 892                         rptr++;
 893                 }
 894         }
 895         return nrels;
 896 }
 897
 898 /*
 899  *      PostPrepare_smgr -- Clean up after a successful PREPARE
 900  *
 901  * What we have to do here is throw away the in-memory state about pending
 902  * relation deletes.  It's all been recorded in the 2PC state file and
 903  * it's no longer smgr's job to worry about it.
 904  */
 905 void
 906 PostPrepare_smgr(void)
 907 {
 908         PendingRelDelete *pending;
 909         PendingRelDelete *next;
 910
 911         for (pending = pendingDeletes; pending != NULL; pending = next)
 912         {
 913                 next = pending->next;
 914                 pendingDeletes = next;
 915                 /* must explicitly free the list entry */
 916                 pfree(pending);
 917         }
 918 }
 919
 920
 921 /*
 922  * AtSubCommit_smgr() --- Take care of subtransaction commit.
 923  *
 924  * Reassign all items in the pending-deletes list to the parent transaction.
 925  */
 926 void
 927 AtSubCommit_smgr(void)
 928 {
 929         int                     nestLevel = GetCurrentTransactionNestLevel();
 930         PendingRelDelete *pending;
 931
 932         for (pending = pendingDeletes; pending != NULL; pending = pending->next)
 933         {
 934                 if (pending->nestLevel >= nestLevel)
 935                         pending->nestLevel = nestLevel - 1;
 936         }
 937 }
 938
 939 /*
 940  * AtSubAbort_smgr() --- Take care of subtransaction abort.
 941  *
 942  * Delete created relations and forget about deleted relations.
 943  * We can execute these operations immediately because we know this
 944  * subtransaction will not commit.
 945  */
 946 void
 947 AtSubAbort_smgr(void)
 948 {
 949         smgrDoPendingDeletes(false);
 950 }
 951
 952 void
 953 smgr_redo(XLogReaderState *record)
 954 {
 955         XLogRecPtr      lsn = record->EndRecPtr;
 956         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
 957
 958         /* Backup blocks are not used in smgr records */
 959         Assert(!XLogRecHasAnyBlockRefs(record));
 960
 961         if (info == XLOG_SMGR_CREATE)
 962         {
 963                 xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
 964                 SMgrRelation reln;
 965
 966                 reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
 967                 smgrcreate(reln, xlrec->forkNum, true);
 968         }
 969         else if (info == XLOG_SMGR_TRUNCATE)
 970         {
 971                 xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
 972                 SMgrRelation reln;
 973                 Relation        rel;
 974                 ForkNumber      forks[MAX_FORKNUM];
 975                 BlockNumber blocks[MAX_FORKNUM];
 976                 int                     nforks = 0;
 977                 bool            need_fsm_vacuum = false;
 978
 979                 reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
 980
 981                 /*
 982                  * Forcibly create relation if it doesn't exist (which suggests that
 983                  * it was dropped somewhere later in the WAL sequence).  As in
 984                  * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
 985                  * log as best we can until the drop is seen.
 986                  */
 987                 smgrcreate(reln, MAIN_FORKNUM, true);
 988
 989                 /*
 990                  * Before we perform the truncation, update minimum recovery point to
 991                  * cover this WAL record. Once the relation is truncated, there's no
 992                  * going back. The buffer manager enforces the WAL-first rule for
 993                  * normal updates to relation files, so that the minimum recovery
 994                  * point is always updated before the corresponding change in the data
 995                  * file is flushed to disk. We have to do the same manually here.
 996                  *
 997                  * Doing this before the truncation means that if the truncation fails
 998                  * for some reason, you cannot start up the system even after restart,
 999                  * until you fix the underlying situation so that the truncation will
1000                  * succeed. Alternatively, we could update the minimum recovery point
1001                  * after truncation, but that would leave a small window where the
1002                  * WAL-first rule could be violated.
1003                  */
1004                 XLogFlush(lsn);
1005
1006                 /* Prepare for truncation of MAIN fork */
1007                 if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
1008                 {
1009                         forks[nforks] = MAIN_FORKNUM;
1010                         blocks[nforks] = xlrec->blkno;
1011                         nforks++;
1012
1013                         /* Also tell xlogutils.c about it */
1014                         XLogTruncateRelation(xlrec->rlocator, MAIN_FORKNUM, xlrec->blkno);
1015                 }
1016
1017                 /* Prepare for truncation of FSM and VM too */
1018                 rel = CreateFakeRelcacheEntry(xlrec->rlocator);
1019
1020                 if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
1021                         smgrexists(reln, FSM_FORKNUM))
1022                 {
1023                         blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
1024                         if (BlockNumberIsValid(blocks[nforks]))
1025                         {
1026                                 forks[nforks] = FSM_FORKNUM;
1027                                 nforks++;
1028                                 need_fsm_vacuum = true;
1029                         }
1030                 }
1031                 if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
1032                         smgrexists(reln, VISIBILITYMAP_FORKNUM))
1033                 {
1034                         blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
1035                         if (BlockNumberIsValid(blocks[nforks]))
1036                         {
1037                                 forks[nforks] = VISIBILITYMAP_FORKNUM;
1038                                 nforks++;
1039                         }
1040                 }
1041
1042                 /* Do the real work to truncate relation forks */
1043                 if (nforks > 0)
1044                         smgrtruncate(reln, forks, nforks, blocks);
1045
1046                 /*
1047                  * Update upper-level FSM pages to account for the truncation. This is
1048                  * important because the just-truncated pages were likely marked as
1049                  * all-free, and would be preferentially selected.
1050                  */
1051                 if (need_fsm_vacuum)
1052                         FreeSpaceMapVacuumRange(rel, xlrec->blkno,
1053                                                                         InvalidBlockNumber);
1054
1055                 FreeFakeRelcacheEntry(rel);
1056         }
1057         else
1058                 elog(PANIC, "smgr_redo: unknown op code %u", info);
1059 }