src/backend/replication/logical/snapbuild.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * snapbuild.c
   4  *
   5  *        Infrastructure for building historic catalog snapshots based on contents
   6  *        of the WAL, for the purpose of decoding heapam.c style values in the
   7  *        WAL.
   8  *
   9  * NOTES:
  10  *
  11  * We build snapshots which can *only* be used to read catalog contents and we
  12  * do so by reading and interpreting the WAL stream. The aim is to build a
  13  * snapshot that behaves the same as a freshly taken MVCC snapshot would have
  14  * at the time the XLogRecord was generated.
  15  *
  16  * To build the snapshots we reuse the infrastructure built for Hot
  17  * Standby. The in-memory snapshots we build look different than HS' because
  18  * we have different needs. To successfully decode data from the WAL we only
  19  * need to access catalog tables and (sys|rel|cat)cache, not the actual user
  20  * tables since the data we decode is wholly contained in the WAL
  21  * records. Also, our snapshots need to be different in comparison to normal
  22  * MVCC ones because in contrast to those we cannot fully rely on the clog and
  23  * pg_subtrans for information about committed transactions because they might
  24  * commit in the future from the POV of the WAL entry we're currently
  25  * decoding. This definition has the advantage that we only need to prevent
  26  * removal of catalog rows, while normal table's rows can still be
  27  * removed. This is achieved by using the replication slot mechanism.
  28  *
  29  * As the percentage of transactions modifying the catalog normally is fairly
   30  * small in comparison to ones only manipulating user data, we keep track of
  31  * the committed catalog modifying ones inside [xmin, xmax) instead of keeping
  32  * track of all running transactions like it's done in a normal snapshot. Note
  33  * that we're generally only looking at transactions that have acquired an
  34  * xid. That is we keep a list of transactions between snapshot->(xmin, xmax)
  35  * that we consider committed, everything else is considered aborted/in
  36  * progress. That also allows us not to care about subtransactions before they
  37  * have committed which means this module, in contrast to HS, doesn't have to
  38  * care about suboverflowed subtransactions and similar.
  39  *
  40  * One complexity of doing this is that to e.g. handle mixed DDL/DML
  41  * transactions we need Snapshots that see intermediate versions of the
  42  * catalog in a transaction. During normal operation this is achieved by using
  43  * CommandIds/cmin/cmax. The problem with that however is that for space
  44  * efficiency reasons, the cmin and cmax are not included in WAL records. We
  45  * cannot read the cmin/cmax from the tuple itself, either, because it is
  46  * reset on crash recovery. Even if we could, we could not decode combocids
  47  * which are only tracked in the original backend's memory. To work around
  48  * that, heapam writes an extra WAL record (XLOG_HEAP2_NEW_CID) every time a
  49  * catalog row is modified, which includes the cmin and cmax of the
  50  * tuple. During decoding, we insert the ctid->(cmin,cmax) mappings into the
  51  * reorder buffer, and use them at visibility checks instead of the cmin/cmax
  52  * on the tuple itself. Check the reorderbuffer.c's comment above
  53  * ResolveCminCmaxDuringDecoding() for details.
  54  *
  55  * To facilitate all this we need our own visibility routine, as the normal
  56  * ones are optimized for different usecases.
  57  *
  58  * To replace the normal catalog snapshots with decoding ones use the
  59  * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
  60  *
  61  *
  62  *
  63  * The snapbuild machinery is starting up in several stages, as illustrated
  64  * by the following graph describing the SnapBuild->state transitions:
  65  *
   66  *         +-------------------------+
   67  *    +----|         START           |-------------+
   68  *    |    +-------------------------+             |
   69  *    |                 |                          |
   70  *    |                 |                          |
   71  *    |        running_xacts #1                    |
   72  *    |                 |                          |
   73  *    |                 |                          |
   74  *    |                 v                          |
   75  *    |    +-------------------------+             v
   76  *    |    |   BUILDING_SNAPSHOT     |------------>|
   77  *    |    +-------------------------+             |
   78  *    |                 |                          |
   79  *    |                 |                          |
   80  *    | running_xacts #2, xacts from #1 finished   |
   81  *    |                 |                          |
   82  *    |                 |                          |
   83  *    |                 v                          |
   84  *    |    +-------------------------+             v
   85  *    |    |       FULL_SNAPSHOT     |------------>|
   86  *    |    +-------------------------+             |
   87  *    |                 |                          |
   88  * running_xacts        |                      saved snapshot
   89  * with zero xacts      |                 at running_xacts's lsn
   90  *    |                 |                          |
   91  *    | running_xacts with xacts from #2 finished  |
   92  *    |                 |                          |
   93  *    |                 v                          |
   94  *    |    +-------------------------+             |
   95  *    +--->|SNAPBUILD_CONSISTENT     |<------------+
   96  *         +-------------------------+
  97  *
  98  * Initially the machinery is in the START stage. When an xl_running_xacts
  99  * record is read that is sufficiently new (above the safe xmin horizon),
 100  * there's a state transition. If there were no running xacts when the
 101  * xl_running_xacts record was generated, we'll directly go into CONSISTENT
 102  * state, otherwise we'll switch to the BUILDING_SNAPSHOT state. Having a full
 103  * snapshot means that all transactions that start henceforth can be decoded
 104  * in their entirety, but transactions that started previously can't. In
 105  * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
 106  * running transactions have committed or aborted.
 107  *
 108  * Only transactions that commit after CONSISTENT state has been reached will
 109  * be replayed, even though they might have started while still in
 110  * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
  111  * changes have been exported, but all the following ones will be. That point
 112  * is a convenient point to initialize replication from, which is why we
 113  * export a snapshot at that point, which *can* be used to read normal data.
 114  *
 115  * Copyright (c) 2012-2024, PostgreSQL Global Development Group
 116  *
 117  * IDENTIFICATION
 118  *        src/backend/replication/logical/snapbuild.c
 119  *
 120  *-------------------------------------------------------------------------
 121  */
 122
 123 #include "postgres.h"
 124
 125 #include <sys/stat.h>
 126 #include <unistd.h>
 127
 128 #include "access/heapam_xlog.h"
 129 #include "access/transam.h"
 130 #include "access/xact.h"
 131 #include "common/file_utils.h"
 132 #include "miscadmin.h"
 133 #include "pgstat.h"
 134 #include "replication/logical.h"
 135 #include "replication/reorderbuffer.h"
 136 #include "replication/snapbuild.h"
 137 #include "replication/snapbuild_internal.h"
 138 #include "storage/fd.h"
 139 #include "storage/lmgr.h"
 140 #include "storage/proc.h"
 141 #include "storage/procarray.h"
 142 #include "storage/standby.h"
 143 #include "utils/builtins.h"
 144 #include "utils/memutils.h"
 145 #include "utils/snapmgr.h"
 146 #include "utils/snapshot.h"
 147 /*
 148  * Starting a transaction -- which we need to do while exporting a snapshot --
 149  * removes knowledge about the previously used resowner, so we save it here.
 150  */
 151 static ResourceOwner SavedResourceOwnerDuringExport = NULL;
 152 static bool ExportInProgress = false;
 153
 154 /* ->committed and ->catchange manipulation */
 155 static void SnapBuildPurgeOlderTxn(SnapBuild *builder);
 156
 157 /* snapshot building/manipulation/distribution functions */
 158 static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder);
 159
 160 static void SnapBuildFreeSnapshot(Snapshot snap);
 161
 162 static void SnapBuildSnapIncRefcount(Snapshot snap);
 163
 164 static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn);
 165
 166 static inline bool SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid,
 167                                                                                                  uint32 xinfo);
 168
 169 /* xlog reading helper functions for SnapBuildProcessRunningXacts */
 170 static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
 171 static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff);
 172
 173 /* serialization functions */
 174 static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
 175 static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
 176 static void SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path);
 177
 178 /*
 179  * Allocate a new snapshot builder.
 180  *
 181  * xmin_horizon is the xid >= which we can be sure no catalog rows have been
 182  * removed, start_lsn is the LSN >= we want to replay commits.
 183  */
 184 SnapBuild *
 185 AllocateSnapshotBuilder(ReorderBuffer *reorder,
 186                                                 TransactionId xmin_horizon,
 187                                                 XLogRecPtr start_lsn,
 188                                                 bool need_full_snapshot,
 189                                                 bool in_slot_creation,
 190                                                 XLogRecPtr two_phase_at)
 191 {
 192         MemoryContext context;
 193         MemoryContext oldcontext;
 194         SnapBuild  *builder;
 195
 196         /* allocate memory in own context, to have better accountability */
 197         context = AllocSetContextCreate(CurrentMemoryContext,
 198                                                                         "snapshot builder context",
 199                                                                         ALLOCSET_DEFAULT_SIZES);
 200         oldcontext = MemoryContextSwitchTo(context);
 201
 202         builder = palloc0(sizeof(SnapBuild));
 203
 204         builder->state = SNAPBUILD_START;
 205         builder->context = context;
 206         builder->reorder = reorder;
 207         /* Other struct members initialized by zeroing via palloc0 above */
 208
 209         builder->committed.xcnt = 0;
 210         builder->committed.xcnt_space = 128;    /* arbitrary number */
 211         builder->committed.xip =
 212                 palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
 213         builder->committed.includes_all_transactions = true;
 214
 215         builder->catchange.xcnt = 0;
 216         builder->catchange.xip = NULL;
 217
 218         builder->initial_xmin_horizon = xmin_horizon;
 219         builder->start_decoding_at = start_lsn;
 220         builder->in_slot_creation = in_slot_creation;
 221         builder->building_full_snapshot = need_full_snapshot;
 222         builder->two_phase_at = two_phase_at;
 223
 224         MemoryContextSwitchTo(oldcontext);
 225
 226         return builder;
 227 }
 228
 229 /*
 230  * Free a snapshot builder.
 231  */
 232 void
 233 FreeSnapshotBuilder(SnapBuild *builder)
 234 {
 235         MemoryContext context = builder->context;
 236
 237         /* free snapshot explicitly, that contains some error checking */
 238         if (builder->snapshot != NULL)
 239         {
 240                 SnapBuildSnapDecRefcount(builder->snapshot);
 241                 builder->snapshot = NULL;
 242         }
 243
 244         /* other resources are deallocated via memory context reset */
 245         MemoryContextDelete(context);
 246 }
 247
 248 /*
 249  * Free an unreferenced snapshot that has previously been built by us.
 250  */
 251 static void
 252 SnapBuildFreeSnapshot(Snapshot snap)
 253 {
 254         /* make sure we don't get passed an external snapshot */
 255         Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC);
 256
 257         /* make sure nobody modified our snapshot */
 258         Assert(snap->curcid == FirstCommandId);
 259         Assert(!snap->suboverflowed);
 260         Assert(!snap->takenDuringRecovery);
 261         Assert(snap->regd_count == 0);
 262
 263         /* slightly more likely, so it's checked even without c-asserts */
 264         if (snap->copied)
 265                 elog(ERROR, "cannot free a copied snapshot");
 266
 267         if (snap->active_count)
 268                 elog(ERROR, "cannot free an active snapshot");
 269
 270         pfree(snap);
 271 }
 272
 273 /*
 274  * In which state of snapshot building are we?
 275  */
 276 SnapBuildState
 277 SnapBuildCurrentState(SnapBuild *builder)
 278 {
 279         return builder->state;
 280 }
 281
 282 /*
 283  * Return the LSN at which the two-phase decoding was first enabled.
 284  */
 285 XLogRecPtr
 286 SnapBuildGetTwoPhaseAt(SnapBuild *builder)
 287 {
 288         return builder->two_phase_at;
 289 }
 290
 291 /*
 292  * Set the LSN at which two-phase decoding is enabled.
 293  */
 294 void
 295 SnapBuildSetTwoPhaseAt(SnapBuild *builder, XLogRecPtr ptr)
 296 {
 297         builder->two_phase_at = ptr;
 298 }
 299
 300 /*
 301  * Should the contents of transaction ending at 'ptr' be decoded?
 302  */
 303 bool
 304 SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
 305 {
 306         return ptr < builder->start_decoding_at;
 307 }
 308
 309 /*
 310  * Increase refcount of a snapshot.
 311  *
 312  * This is used when handing out a snapshot to some external resource or when
 313  * adding a Snapshot as builder->snapshot.
 314  */
 315 static void
 316 SnapBuildSnapIncRefcount(Snapshot snap)
 317 {
 318         snap->active_count++;
 319 }
 320
 321 /*
 322  * Decrease refcount of a snapshot and free if the refcount reaches zero.
 323  *
 324  * Externally visible, so that external resources that have been handed an
 325  * IncRef'ed Snapshot can adjust its refcount easily.
 326  */
 327 void
 328 SnapBuildSnapDecRefcount(Snapshot snap)
 329 {
 330         /* make sure we don't get passed an external snapshot */
 331         Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC);
 332
 333         /* make sure nobody modified our snapshot */
 334         Assert(snap->curcid == FirstCommandId);
 335         Assert(!snap->suboverflowed);
 336         Assert(!snap->takenDuringRecovery);
 337
 338         Assert(snap->regd_count == 0);
 339
 340         Assert(snap->active_count > 0);
 341
 342         /* slightly more likely, so it's checked even without casserts */
 343         if (snap->copied)
 344                 elog(ERROR, "cannot free a copied snapshot");
 345
 346         snap->active_count--;
 347         if (snap->active_count == 0)
 348                 SnapBuildFreeSnapshot(snap);
 349 }
 350
 351 /*
 352  * Build a new snapshot, based on currently committed catalog-modifying
 353  * transactions.
 354  *
 355  * In-progress transactions with catalog access are *not* allowed to modify
 356  * these snapshots; they have to copy them and fill in appropriate ->curcid
 357  * and ->subxip/subxcnt values.
 358  */
 359 static Snapshot
 360 SnapBuildBuildSnapshot(SnapBuild *builder)
 361 {
 362         Snapshot        snapshot;
 363         Size            ssize;
 364
 365         Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
 366
 367         ssize = sizeof(SnapshotData)
 368                 + sizeof(TransactionId) * builder->committed.xcnt
 369                 + sizeof(TransactionId) * 1 /* toplevel xid */ ;
 370
 371         snapshot = MemoryContextAllocZero(builder->context, ssize);
 372
 373         snapshot->snapshot_type = SNAPSHOT_HISTORIC_MVCC;
 374
 375         /*
 376          * We misuse the original meaning of SnapshotData's xip and subxip fields
 377          * to make the more fitting for our needs.
 378          *
 379          * In the 'xip' array we store transactions that have to be treated as
 380          * committed. Since we will only ever look at tuples from transactions
 381          * that have modified the catalog it's more efficient to store those few
 382          * that exist between xmin and xmax (frequently there are none).
 383          *
 384          * Snapshots that are used in transactions that have modified the catalog
 385          * also use the 'subxip' array to store their toplevel xid and all the
 386          * subtransaction xids so we can recognize when we need to treat rows as
 387          * visible that are not in xip but still need to be visible. Subxip only
 388          * gets filled when the transaction is copied into the context of a
 389          * catalog modifying transaction since we otherwise share a snapshot
 390          * between transactions. As long as a txn hasn't modified the catalog it
 391          * doesn't need to treat any uncommitted rows as visible, so there is no
 392          * need for those xids.
 393          *
 394          * Both arrays are qsort'ed so that we can use bsearch() on them.
 395          */
 396         Assert(TransactionIdIsNormal(builder->xmin));
 397         Assert(TransactionIdIsNormal(builder->xmax));
 398
 399         snapshot->xmin = builder->xmin;
 400         snapshot->xmax = builder->xmax;
 401
 402         /* store all transactions to be treated as committed by this snapshot */
 403         snapshot->xip =
 404                 (TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
 405         snapshot->xcnt = builder->committed.xcnt;
 406         memcpy(snapshot->xip,
 407                    builder->committed.xip,
 408                    builder->committed.xcnt * sizeof(TransactionId));
 409
 410         /* sort so we can bsearch() */
 411         qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
 412
 413         /*
 414          * Initially, subxip is empty, i.e. it's a snapshot to be used by
 415          * transactions that don't modify the catalog. Will be filled by
 416          * ReorderBufferCopySnap() if necessary.
 417          */
 418         snapshot->subxcnt = 0;
 419         snapshot->subxip = NULL;
 420
 421         snapshot->suboverflowed = false;
 422         snapshot->takenDuringRecovery = false;
 423         snapshot->copied = false;
 424         snapshot->curcid = FirstCommandId;
 425         snapshot->active_count = 0;
 426         snapshot->regd_count = 0;
 427         snapshot->snapXactCompletionCount = 0;
 428
 429         return snapshot;
 430 }
 431
 432 /*
 433  * Build the initial slot snapshot and convert it to a normal snapshot that
 434  * is understood by HeapTupleSatisfiesMVCC.
 435  *
 436  * The snapshot will be usable directly in current transaction or exported
 437  * for loading in different transaction.
 438  */
 439 Snapshot
 440 SnapBuildInitialSnapshot(SnapBuild *builder)
 441 {
 442         Snapshot        snap;
 443         TransactionId xid;
 444         TransactionId safeXid;
 445         TransactionId *newxip;
 446         int                     newxcnt = 0;
 447
 448         Assert(XactIsoLevel == XACT_REPEATABLE_READ);
 449         Assert(builder->building_full_snapshot);
 450
 451         /* don't allow older snapshots */
 452         InvalidateCatalogSnapshot();    /* about to overwrite MyProc->xmin */
 453         if (HaveRegisteredOrActiveSnapshot())
 454                 elog(ERROR, "cannot build an initial slot snapshot when snapshots exist");
 455         Assert(!HistoricSnapshotActive());
 456
 457         if (builder->state != SNAPBUILD_CONSISTENT)
 458                 elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state");
 459
 460         if (!builder->committed.includes_all_transactions)
 461                 elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore");
 462
 463         /* so we don't overwrite the existing value */
 464         if (TransactionIdIsValid(MyProc->xmin))
 465                 elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid");
 466
 467         snap = SnapBuildBuildSnapshot(builder);
 468
 469         /*
 470          * We know that snap->xmin is alive, enforced by the logical xmin
 471          * mechanism. Due to that we can do this without locks, we're only
 472          * changing our own value.
 473          *
 474          * Building an initial snapshot is expensive and an unenforced xmin
 475          * horizon would have bad consequences, therefore always double-check that
 476          * the horizon is enforced.
 477          */
 478         LWLockAcquire(ProcArrayLock, LW_SHARED);
 479         safeXid = GetOldestSafeDecodingTransactionId(false);
 480         LWLockRelease(ProcArrayLock);
 481
 482         if (TransactionIdFollows(safeXid, snap->xmin))
 483                 elog(ERROR, "cannot build an initial slot snapshot as oldest safe xid %u follows snapshot's xmin %u",
 484                          safeXid, snap->xmin);
 485
 486         MyProc->xmin = snap->xmin;
 487
 488         /* allocate in transaction context */
 489         newxip = (TransactionId *)
 490                 palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount());
 491
 492         /*
 493          * snapbuild.c builds transactions in an "inverted" manner, which means it
 494          * stores committed transactions in ->xip, not ones in progress. Build a
 495          * classical snapshot by marking all non-committed transactions as
 496          * in-progress. This can be expensive.
 497          */
 498         for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);)
 499         {
 500                 void       *test;
 501
 502                 /*
 503                  * Check whether transaction committed using the decoding snapshot
 504                  * meaning of ->xip.
 505                  */
 506                 test = bsearch(&xid, snap->xip, snap->xcnt,
 507                                            sizeof(TransactionId), xidComparator);
 508
 509                 if (test == NULL)
 510                 {
 511                         if (newxcnt >= GetMaxSnapshotXidCount())
 512                                 ereport(ERROR,
 513                                                 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
 514                                                  errmsg("initial slot snapshot too large")));
 515
 516                         newxip[newxcnt++] = xid;
 517                 }
 518
 519                 TransactionIdAdvance(xid);
 520         }
 521
 522         /* adjust remaining snapshot fields as needed */
 523         snap->snapshot_type = SNAPSHOT_MVCC;
 524         snap->xcnt = newxcnt;
 525         snap->xip = newxip;
 526
 527         return snap;
 528 }
 529
 530 /*
 531  * Export a snapshot so it can be set in another session with SET TRANSACTION
 532  * SNAPSHOT.
 533  *
 534  * For that we need to start a transaction in the current backend as the
 535  * importing side checks whether the source transaction is still open to make
 536  * sure the xmin horizon hasn't advanced since then.
 537  */
 538 const char *
 539 SnapBuildExportSnapshot(SnapBuild *builder)
 540 {
 541         Snapshot        snap;
 542         char       *snapname;
 543
 544         if (IsTransactionOrTransactionBlock())
 545                 elog(ERROR, "cannot export a snapshot from within a transaction");
 546
 547         if (SavedResourceOwnerDuringExport)
 548                 elog(ERROR, "can only export one snapshot at a time");
 549
 550         SavedResourceOwnerDuringExport = CurrentResourceOwner;
 551         ExportInProgress = true;
 552
 553         StartTransactionCommand();
 554
 555         /* There doesn't seem to a nice API to set these */
 556         XactIsoLevel = XACT_REPEATABLE_READ;
 557         XactReadOnly = true;
 558
 559         snap = SnapBuildInitialSnapshot(builder);
 560
 561         /*
 562          * now that we've built a plain snapshot, make it active and use the
 563          * normal mechanisms for exporting it
 564          */
 565         snapname = ExportSnapshot(snap);
 566
 567         ereport(LOG,
 568                         (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID",
 569                                                    "exported logical decoding snapshot: \"%s\" with %u transaction IDs",
 570                                                    snap->xcnt,
 571                                                    snapname, snap->xcnt)));
 572         return snapname;
 573 }
 574
 575 /*
 576  * Ensure there is a snapshot and if not build one for current transaction.
 577  */
 578 Snapshot
 579 SnapBuildGetOrBuildSnapshot(SnapBuild *builder)
 580 {
 581         Assert(builder->state == SNAPBUILD_CONSISTENT);
 582
 583         /* only build a new snapshot if we don't have a prebuilt one */
 584         if (builder->snapshot == NULL)
 585         {
 586                 builder->snapshot = SnapBuildBuildSnapshot(builder);
 587                 /* increase refcount for the snapshot builder */
 588                 SnapBuildSnapIncRefcount(builder->snapshot);
 589         }
 590
 591         return builder->snapshot;
 592 }
 593
 594 /*
 595  * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
 596  * any. Aborts the previously started transaction and resets the resource
 597  * owner back to its original value.
 598  */
 599 void
 600 SnapBuildClearExportedSnapshot(void)
 601 {
 602         ResourceOwner tmpResOwner;
 603
 604         /* nothing exported, that is the usual case */
 605         if (!ExportInProgress)
 606                 return;
 607
 608         if (!IsTransactionState())
 609                 elog(ERROR, "clearing exported snapshot in wrong transaction state");
 610
 611         /*
 612          * AbortCurrentTransaction() takes care of resetting the snapshot state,
 613          * so remember SavedResourceOwnerDuringExport.
 614          */
 615         tmpResOwner = SavedResourceOwnerDuringExport;
 616
 617         /* make sure nothing could have ever happened */
 618         AbortCurrentTransaction();
 619
 620         CurrentResourceOwner = tmpResOwner;
 621 }
 622
 623 /*
 624  * Clear snapshot export state during transaction abort.
 625  */
 626 void
 627 SnapBuildResetExportedSnapshotState(void)
 628 {
 629         SavedResourceOwnerDuringExport = NULL;
 630         ExportInProgress = false;
 631 }
 632
 633 /*
 634  * Handle the effects of a single heap change, appropriate to the current state
 635  * of the snapshot builder and returns whether changes made at (xid, lsn) can
 636  * be decoded.
 637  */
 638 bool
 639 SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
 640 {
 641         /*
 642          * We can't handle data in transactions if we haven't built a snapshot
 643          * yet, so don't store them.
 644          */
 645         if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
 646                 return false;
 647
 648         /*
 649          * No point in keeping track of changes in transactions that we don't have
 650          * enough information about to decode. This means that they started before
 651          * we got into the SNAPBUILD_FULL_SNAPSHOT state.
 652          */
 653         if (builder->state < SNAPBUILD_CONSISTENT &&
 654                 TransactionIdPrecedes(xid, builder->next_phase_at))
 655                 return false;
 656
 657         /*
 658          * If the reorderbuffer doesn't yet have a snapshot, add one now, it will
 659          * be needed to decode the change we're currently processing.
 660          */
 661         if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
 662         {
 663                 /* only build a new snapshot if we don't have a prebuilt one */
 664                 if (builder->snapshot == NULL)
 665                 {
 666                         builder->snapshot = SnapBuildBuildSnapshot(builder);
 667                         /* increase refcount for the snapshot builder */
 668                         SnapBuildSnapIncRefcount(builder->snapshot);
 669                 }
 670
 671                 /*
 672                  * Increase refcount for the transaction we're handing the snapshot
 673                  * out to.
 674                  */
 675                 SnapBuildSnapIncRefcount(builder->snapshot);
 676                 ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
 677                                                                          builder->snapshot);
 678         }
 679
 680         return true;
 681 }
 682
 683 /*
 684  * Do CommandId/combo CID handling after reading an xl_heap_new_cid record.
 685  * This implies that a transaction has done some form of write to system
 686  * catalogs.
 687  */
 688 void
 689 SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
 690                                            XLogRecPtr lsn, xl_heap_new_cid *xlrec)
 691 {
 692         CommandId       cid;
 693
 694         /*
 695          * we only log new_cid's if a catalog tuple was modified, so mark the
 696          * transaction as containing catalog modifications
 697          */
 698         ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn);
 699
 700         ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn,
 701                                                                  xlrec->target_locator, xlrec->target_tid,
 702                                                                  xlrec->cmin, xlrec->cmax,
 703                                                                  xlrec->combocid);
 704
 705         /* figure out new command id */
 706         if (xlrec->cmin != InvalidCommandId &&
 707                 xlrec->cmax != InvalidCommandId)
 708                 cid = Max(xlrec->cmin, xlrec->cmax);
 709         else if (xlrec->cmax != InvalidCommandId)
 710                 cid = xlrec->cmax;
 711         else if (xlrec->cmin != InvalidCommandId)
 712                 cid = xlrec->cmin;
 713         else
 714         {
 715                 cid = InvalidCommandId; /* silence compiler */
 716                 elog(ERROR, "xl_heap_new_cid record without a valid CommandId");
 717         }
 718
 719         ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
 720 }
 721
 722 /*
 723  * Add a new Snapshot to all transactions we're decoding that currently are
 724  * in-progress so they can see new catalog contents made by the transaction
 725  * that just committed. This is necessary because those in-progress
 726  * transactions will use the new catalog's contents from here on (at the very
 727  * least everything they do needs to be compatible with newer catalog
 728  * contents).
 729  */
 730 static void
 731 SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
 732 {
 733         dlist_iter      txn_i;
 734         ReorderBufferTXN *txn;
 735
 736         /*
 737          * Iterate through all toplevel transactions. This can include
 738          * subtransactions which we just don't yet know to be that, but that's
 739          * fine, they will just get an unnecessary snapshot queued.
 740          */
 741         dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn)
 742         {
 743                 txn = dlist_container(ReorderBufferTXN, node, txn_i.cur);
 744
 745                 Assert(TransactionIdIsValid(txn->xid));
 746
 747                 /*
 748                  * If we don't have a base snapshot yet, there are no changes in this
 749                  * transaction which in turn implies we don't yet need a snapshot at
 750                  * all. We'll add a snapshot when the first change gets queued.
 751                  *
 752                  * NB: This works correctly even for subtransactions because
 753                  * ReorderBufferAssignChild() takes care to transfer the base snapshot
 754                  * to the top-level transaction, and while iterating the changequeue
 755                  * we'll get the change from the subtxn.
 756                  */
 757                 if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
 758                         continue;
 759
 760                 /*
 761                  * We don't need to add snapshot to prepared transactions as they
 762                  * should not see the new catalog contents.
 763                  */
 764                 if (rbtxn_prepared(txn) || rbtxn_skip_prepared(txn))
 765                         continue;
 766
 767                 elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
 768                          txn->xid, LSN_FORMAT_ARGS(lsn));
 769
 770                 /*
 771                  * increase the snapshot's refcount for the transaction we are handing
 772                  * it out to
 773                  */
 774                 SnapBuildSnapIncRefcount(builder->snapshot);
 775                 ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
 776                                                                  builder->snapshot);
 777         }
 778 }
 779
 780 /*
 781  * Keep track of a new catalog changing transaction that has committed.
 782  */
 783 static void
 784 SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
 785 {
 786         Assert(TransactionIdIsValid(xid));
 787
 788         if (builder->committed.xcnt == builder->committed.xcnt_space)
 789         {
 790                 builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
 791
 792                 elog(DEBUG1, "increasing space for committed transactions to %u",
 793                          (uint32) builder->committed.xcnt_space);
 794
 795                 builder->committed.xip = repalloc(builder->committed.xip,
 796                                                                                   builder->committed.xcnt_space * sizeof(TransactionId));
 797         }
 798
 799         /*
 800          * TODO: It might make sense to keep the array sorted here instead of
 801          * doing it every time we build a new snapshot. On the other hand this
 802          * gets called repeatedly when a transaction with subtransactions commits.
 803          */
 804         builder->committed.xip[builder->committed.xcnt++] = xid;
 805 }
 806
 807 /*
 808  * Remove knowledge about transactions we treat as committed or containing catalog
 809  * changes that are smaller than ->xmin. Those won't ever get checked via
 810  * the ->committed or ->catchange array, respectively. The committed xids will
 811  * get checked via the clog machinery.
 812  *
 813  * We can ideally remove the transaction from catchange array once it is
 814  * finished (committed/aborted) but that could be costly as we need to maintain
 815  * the xids order in the array.
 816  */
 817 static void
 818 SnapBuildPurgeOlderTxn(SnapBuild *builder)
 819 {
 820         int                     off;
 821         TransactionId *workspace;
 822         int                     surviving_xids = 0;
 823
 824         /* not ready yet */
 825         if (!TransactionIdIsNormal(builder->xmin))
 826                 return;
 827
 828         /* TODO: Neater algorithm than just copying and iterating? */
 829         workspace =
 830                 MemoryContextAlloc(builder->context,
 831                                                    builder->committed.xcnt * sizeof(TransactionId));
 832
 833         /* copy xids that still are interesting to workspace */
 834         for (off = 0; off < builder->committed.xcnt; off++)
 835         {
 836                 if (NormalTransactionIdPrecedes(builder->committed.xip[off],
 837                                                                                 builder->xmin))
 838                         ;                                       /* remove */
 839                 else
 840                         workspace[surviving_xids++] = builder->committed.xip[off];
 841         }
 842
 843         /* copy workspace back to persistent state */
 844         memcpy(builder->committed.xip, workspace,
 845                    surviving_xids * sizeof(TransactionId));
 846
 847         elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
 848                  (uint32) builder->committed.xcnt, (uint32) surviving_xids,
 849                  builder->xmin, builder->xmax);
 850         builder->committed.xcnt = surviving_xids;
 851
 852         pfree(workspace);
 853
 854         /*
 855          * Purge xids in ->catchange as well. The purged array must also be sorted
 856          * in xidComparator order.
 857          */
 858         if (builder->catchange.xcnt > 0)
 859         {
 860                 /*
 861                  * Since catchange.xip is sorted, we find the lower bound of xids that
 862                  * are still interesting.
 863                  */
 864                 for (off = 0; off < builder->catchange.xcnt; off++)
 865                 {
 866                         if (TransactionIdFollowsOrEquals(builder->catchange.xip[off],
 867                                                                                          builder->xmin))
 868                                 break;
 869                 }
 870
 871                 surviving_xids = builder->catchange.xcnt - off;
 872
 873                 if (surviving_xids > 0)
 874                 {
 875                         memmove(builder->catchange.xip, &(builder->catchange.xip[off]),
 876                                         surviving_xids * sizeof(TransactionId));
 877                 }
 878                 else
 879                 {
 880                         pfree(builder->catchange.xip);
 881                         builder->catchange.xip = NULL;
 882                 }
 883
 884                 elog(DEBUG3, "purged catalog modifying transactions from %u to %u, xmin: %u, xmax: %u",
 885                          (uint32) builder->catchange.xcnt, (uint32) surviving_xids,
 886                          builder->xmin, builder->xmax);
 887                 builder->catchange.xcnt = surviving_xids;
 888         }
 889 }
 890
 891 /*
 892  * Handle everything that needs to be done when a transaction commits
 893  */
 894 void
 895 SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
 896                                    int nsubxacts, TransactionId *subxacts, uint32 xinfo)
 897 {
 898         int                     nxact;
 899
 900         bool            needs_snapshot = false;
 901         bool            needs_timetravel = false;
 902         bool            sub_needs_timetravel = false;
 903
 904         TransactionId xmax = xid;
 905
 906         /*
 907          * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor
 908          * will they be part of a snapshot.  So we don't need to record anything.
 909          */
 910         if (builder->state == SNAPBUILD_START ||
 911                 (builder->state == SNAPBUILD_BUILDING_SNAPSHOT &&
 912                  TransactionIdPrecedes(xid, builder->next_phase_at)))
 913         {
 914                 /* ensure that only commits after this are getting replayed */
 915                 if (builder->start_decoding_at <= lsn)
 916                         builder->start_decoding_at = lsn + 1;
 917                 return;
 918         }
 919
 920         if (builder->state < SNAPBUILD_CONSISTENT)
 921         {
 922                 /* ensure that only commits after this are getting replayed */
 923                 if (builder->start_decoding_at <= lsn)
 924                         builder->start_decoding_at = lsn + 1;
 925
 926                 /*
 927                  * If building an exportable snapshot, force xid to be tracked, even
 928                  * if the transaction didn't modify the catalog.
 929                  */
 930                 if (builder->building_full_snapshot)
 931                 {
 932                         needs_timetravel = true;
 933                 }
 934         }
 935
 936         for (nxact = 0; nxact < nsubxacts; nxact++)
 937         {
 938                 TransactionId subxid = subxacts[nxact];
 939
 940                 /*
 941                  * Add subtransaction to base snapshot if catalog modifying, we don't
 942                  * distinguish to toplevel transactions there.
 943                  */
 944                 if (SnapBuildXidHasCatalogChanges(builder, subxid, xinfo))
 945                 {
 946                         sub_needs_timetravel = true;
 947                         needs_snapshot = true;
 948
 949                         elog(DEBUG1, "found subtransaction %u:%u with catalog changes",
 950                                  xid, subxid);
 951
 952                         SnapBuildAddCommittedTxn(builder, subxid);
 953
 954                         if (NormalTransactionIdFollows(subxid, xmax))
 955                                 xmax = subxid;
 956                 }
 957
 958                 /*
 959                  * If we're forcing timetravel we also need visibility information
 960                  * about subtransaction, so keep track of subtransaction's state, even
 961                  * if not catalog modifying.  Don't need to distribute a snapshot in
 962                  * that case.
 963                  */
 964                 else if (needs_timetravel)
 965                 {
 966                         SnapBuildAddCommittedTxn(builder, subxid);
 967                         if (NormalTransactionIdFollows(subxid, xmax))
 968                                 xmax = subxid;
 969                 }
 970         }
 971
 972         /* if top-level modified catalog, it'll need a snapshot */
 973         if (SnapBuildXidHasCatalogChanges(builder, xid, xinfo))
 974         {
 975                 elog(DEBUG2, "found top level transaction %u, with catalog changes",
 976                          xid);
 977                 needs_snapshot = true;
 978                 needs_timetravel = true;
 979                 SnapBuildAddCommittedTxn(builder, xid);
 980         }
 981         else if (sub_needs_timetravel)
 982         {
 983                 /* track toplevel txn as well, subxact alone isn't meaningful */
 984                 elog(DEBUG2, "forced transaction %u to do timetravel due to one of its subtransactions",
 985                          xid);
 986                 needs_timetravel = true;
 987                 SnapBuildAddCommittedTxn(builder, xid);
 988         }
 989         else if (needs_timetravel)
 990         {
 991                 elog(DEBUG2, "forced transaction %u to do timetravel", xid);
 992
 993                 SnapBuildAddCommittedTxn(builder, xid);
 994         }
 995
 996         if (!needs_timetravel)
 997         {
 998                 /* record that we cannot export a general snapshot anymore */
 999                 builder->committed.includes_all_transactions = false;
1000         }
1001
1002         Assert(!needs_snapshot || needs_timetravel);
1003
1004         /*
1005          * Adjust xmax of the snapshot builder, we only do that for committed,
1006          * catalog modifying, transactions, everything else isn't interesting for
1007          * us since we'll never look at the respective rows.
1008          */
1009         if (needs_timetravel &&
1010                 (!TransactionIdIsValid(builder->xmax) ||
1011                  TransactionIdFollowsOrEquals(xmax, builder->xmax)))
1012         {
1013                 builder->xmax = xmax;
1014                 TransactionIdAdvance(builder->xmax);
1015         }
1016
1017         /* if there's any reason to build a historic snapshot, do so now */
1018         if (needs_snapshot)
1019         {
1020                 /*
1021                  * If we haven't built a complete snapshot yet there's no need to hand
1022                  * it out, it wouldn't (and couldn't) be used anyway.
1023                  */
1024                 if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
1025                         return;
1026
1027                 /*
1028                  * Decrease the snapshot builder's refcount of the old snapshot, note
1029                  * that it still will be used if it has been handed out to the
1030                  * reorderbuffer earlier.
1031                  */
1032                 if (builder->snapshot)
1033                         SnapBuildSnapDecRefcount(builder->snapshot);
1034
1035                 builder->snapshot = SnapBuildBuildSnapshot(builder);
1036
1037                 /* we might need to execute invalidations, add snapshot */
1038                 if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
1039                 {
1040                         SnapBuildSnapIncRefcount(builder->snapshot);
1041                         ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
1042                                                                                  builder->snapshot);
1043                 }
1044
1045                 /* refcount of the snapshot builder for the new snapshot */
1046                 SnapBuildSnapIncRefcount(builder->snapshot);
1047
1048                 /* add a new catalog snapshot to all currently running transactions */
1049                 SnapBuildDistributeNewCatalogSnapshot(builder, lsn);
1050         }
1051 }
1052
1053 /*
1054  * Check the reorder buffer and the snapshot to see if the given transaction has
1055  * modified catalogs.
1056  */
1057 static inline bool
1058 SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid,
1059                                                           uint32 xinfo)
1060 {
1061         if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
1062                 return true;
1063
1064         /*
1065          * The transactions that have changed catalogs must have invalidation
1066          * info.
1067          */
1068         if (!(xinfo & XACT_XINFO_HAS_INVALS))
1069                 return false;
1070
1071         /* Check the catchange XID array */
1072         return ((builder->catchange.xcnt > 0) &&
1073                         (bsearch(&xid, builder->catchange.xip, builder->catchange.xcnt,
1074                                          sizeof(TransactionId), xidComparator) != NULL));
1075 }
1076
1077 /* -----------------------------------
1078  * Snapshot building functions dealing with xlog records
1079  * -----------------------------------
1080  */
1081
1082 /*
1083  * Process a running xacts record, and use its information to first build a
1084  * historic snapshot and later to release resources that aren't needed
1085  * anymore.
1086  */
1087 void
1088 SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
1089 {
1090         ReorderBufferTXN *txn;
1091         TransactionId xmin;
1092
1093         /*
1094          * If we're not consistent yet, inspect the record to see whether it
1095          * allows to get closer to being consistent. If we are consistent, dump
1096          * our snapshot so others or we, after a restart, can use it.
1097          */
1098         if (builder->state < SNAPBUILD_CONSISTENT)
1099         {
1100                 /* returns false if there's no point in performing cleanup just yet */
1101                 if (!SnapBuildFindSnapshot(builder, lsn, running))
1102                         return;
1103         }
1104         else
1105                 SnapBuildSerialize(builder, lsn);
1106
1107         /*
1108          * Update range of interesting xids based on the running xacts
1109          * information. We don't increase ->xmax using it, because once we are in
1110          * a consistent state we can do that ourselves and much more efficiently
1111          * so, because we only need to do it for catalog transactions since we
1112          * only ever look at those.
1113          *
1114          * NB: We only increase xmax when a catalog modifying transaction commits
1115          * (see SnapBuildCommitTxn).  Because of this, xmax can be lower than
1116          * xmin, which looks odd but is correct and actually more efficient, since
1117          * we hit fast paths in heapam_visibility.c.
1118          */
1119         builder->xmin = running->oldestRunningXid;
1120
1121         /* Remove transactions we don't need to keep track off anymore */
1122         SnapBuildPurgeOlderTxn(builder);
1123
1124         /*
1125          * Advance the xmin limit for the current replication slot, to allow
1126          * vacuum to clean up the tuples this slot has been protecting.
1127          *
1128          * The reorderbuffer might have an xmin among the currently running
1129          * snapshots; use it if so.  If not, we need only consider the snapshots
1130          * we'll produce later, which can't be less than the oldest running xid in
1131          * the record we're reading now.
1132          */
1133         xmin = ReorderBufferGetOldestXmin(builder->reorder);
1134         if (xmin == InvalidTransactionId)
1135                 xmin = running->oldestRunningXid;
1136         elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
1137                  builder->xmin, builder->xmax, running->oldestRunningXid, xmin);
1138         LogicalIncreaseXminForSlot(lsn, xmin);
1139
1140         /*
1141          * Also tell the slot where we can restart decoding from. We don't want to
1142          * do that after every commit because changing that implies an fsync of
1143          * the logical slot's state file, so we only do it every time we see a
1144          * running xacts record.
1145          *
1146          * Do so by looking for the oldest in progress transaction (determined by
1147          * the first LSN of any of its relevant records). Every transaction
1148          * remembers the last location we stored the snapshot to disk before its
1149          * beginning. That point is where we can restart from.
1150          */
1151
1152         /*
1153          * Can't know about a serialized snapshot's location if we're not
1154          * consistent.
1155          */
1156         if (builder->state < SNAPBUILD_CONSISTENT)
1157                 return;
1158
1159         txn = ReorderBufferGetOldestTXN(builder->reorder);
1160
1161         /*
1162          * oldest ongoing txn might have started when we didn't yet serialize
1163          * anything because we hadn't reached a consistent state yet.
1164          */
1165         if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
1166                 LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn);
1167
1168         /*
1169          * No in-progress transaction, can reuse the last serialized snapshot if
1170          * we have one.
1171          */
1172         else if (txn == NULL &&
1173                          builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr &&
1174                          builder->last_serialized_snapshot != InvalidXLogRecPtr)
1175                 LogicalIncreaseRestartDecodingForSlot(lsn,
1176                                                                                           builder->last_serialized_snapshot);
1177 }
1178
1179
1180 /*
1181  * Build the start of a snapshot that's capable of decoding the catalog.
1182  *
1183  * Helper function for SnapBuildProcessRunningXacts() while we're not yet
1184  * consistent.
1185  *
1186  * Returns true if there is a point in performing internal maintenance/cleanup
1187  * using the xl_running_xacts record.
1188  */
1189 static bool
1190 SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
1191 {
1192         /* ---
1193          * Build catalog decoding snapshot incrementally using information about
1194          * the currently running transactions. There are several ways to do that:
1195          *
1196          * a) There were no running transactions when the xl_running_xacts record
1197          *        was inserted, jump to CONSISTENT immediately. We might find such a
1198          *        state while waiting on c)'s sub-states.
1199          *
1200          * b) This (in a previous run) or another decoding slot serialized a
1201          *        snapshot to disk that we can use. Can't use this method while finding
1202          *        the start point for decoding changes as the restart LSN would be an
1203          *        arbitrary LSN but we need to find the start point to extract changes
1204          *        where we won't see the data for partial transactions. Also, we cannot
1205          *        use this method when a slot needs a full snapshot for export or direct
1206          *        use, as that snapshot will only contain catalog modifying transactions.
1207          *
1208          * c) First incrementally build a snapshot for catalog tuples
1209          *        (BUILDING_SNAPSHOT), that requires all, already in-progress,
1210          *        transactions to finish.  Every transaction starting after that
1211          *        (FULL_SNAPSHOT state), has enough information to be decoded.  But
1212          *        for older running transactions no viable snapshot exists yet, so
1213          *        CONSISTENT will only be reached once all of those have finished.
1214          * ---
1215          */
1216
1217         /*
1218          * xl_running_xacts record is older than what we can use, we might not
1219          * have all necessary catalog rows anymore.
1220          */
1221         if (TransactionIdIsNormal(builder->initial_xmin_horizon) &&
1222                 NormalTransactionIdPrecedes(running->oldestRunningXid,
1223                                                                         builder->initial_xmin_horizon))
1224         {
1225                 ereport(DEBUG1,
1226                                 (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
1227                                                                  LSN_FORMAT_ARGS(lsn)),
1228                                  errdetail_internal("initial xmin horizon of %u vs the snapshot's %u",
1229                                                                         builder->initial_xmin_horizon, running->oldestRunningXid)));
1230
1231
1232                 SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon);
1233
1234                 return true;
1235         }
1236
1237         /*
1238          * a) No transaction were running, we can jump to consistent.
1239          *
1240          * This is not affected by races around xl_running_xacts, because we can
1241          * miss transaction commits, but currently not transactions starting.
1242          *
1243          * NB: We might have already started to incrementally assemble a snapshot,
1244          * so we need to be careful to deal with that.
1245          */
1246         if (running->oldestRunningXid == running->nextXid)
1247         {
1248                 if (builder->start_decoding_at == InvalidXLogRecPtr ||
1249                         builder->start_decoding_at <= lsn)
1250                         /* can decode everything after this */
1251                         builder->start_decoding_at = lsn + 1;
1252
1253                 /* As no transactions were running xmin/xmax can be trivially set. */
1254                 builder->xmin = running->nextXid;       /* < are finished */
1255                 builder->xmax = running->nextXid;       /* >= are running */
1256
1257                 /* so we can safely use the faster comparisons */
1258                 Assert(TransactionIdIsNormal(builder->xmin));
1259                 Assert(TransactionIdIsNormal(builder->xmax));
1260
1261                 builder->state = SNAPBUILD_CONSISTENT;
1262                 builder->next_phase_at = InvalidTransactionId;
1263
1264                 ereport(LOG,
1265                                 (errmsg("logical decoding found consistent point at %X/%X",
1266                                                 LSN_FORMAT_ARGS(lsn)),
1267                                  errdetail("There are no running transactions.")));
1268
1269                 return false;
1270         }
1271
1272         /*
1273          * b) valid on disk state and while neither building full snapshot nor
1274          * creating a slot.
1275          */
1276         else if (!builder->building_full_snapshot &&
1277                          !builder->in_slot_creation &&
1278                          SnapBuildRestore(builder, lsn))
1279         {
1280                 /* there won't be any state to cleanup */
1281                 return false;
1282         }
1283
1284         /*
1285          * c) transition from START to BUILDING_SNAPSHOT.
1286          *
1287          * In START state, and a xl_running_xacts record with running xacts is
1288          * encountered.  In that case, switch to BUILDING_SNAPSHOT state, and
1289          * record xl_running_xacts->nextXid.  Once all running xacts have finished
1290          * (i.e. they're all >= nextXid), we have a complete catalog snapshot.  It
1291          * might look that we could use xl_running_xacts's ->xids information to
1292          * get there quicker, but that is problematic because transactions marked
1293          * as running, might already have inserted their commit record - it's
1294          * infeasible to change that with locking.
1295          */
1296         else if (builder->state == SNAPBUILD_START)
1297         {
1298                 builder->state = SNAPBUILD_BUILDING_SNAPSHOT;
1299                 builder->next_phase_at = running->nextXid;
1300
1301                 /*
1302                  * Start with an xmin/xmax that's correct for future, when all the
1303                  * currently running transactions have finished. We'll update both
1304                  * while waiting for the pending transactions to finish.
1305                  */
1306                 builder->xmin = running->nextXid;       /* < are finished */
1307                 builder->xmax = running->nextXid;       /* >= are running */
1308
1309                 /* so we can safely use the faster comparisons */
1310                 Assert(TransactionIdIsNormal(builder->xmin));
1311                 Assert(TransactionIdIsNormal(builder->xmax));
1312
1313                 ereport(LOG,
1314                                 (errmsg("logical decoding found initial starting point at %X/%X",
1315                                                 LSN_FORMAT_ARGS(lsn)),
1316                                  errdetail("Waiting for transactions (approximately %d) older than %u to end.",
1317                                                    running->xcnt, running->nextXid)));
1318
1319                 SnapBuildWaitSnapshot(running, running->nextXid);
1320         }
1321
1322         /*
1323          * c) transition from BUILDING_SNAPSHOT to FULL_SNAPSHOT.
1324          *
1325          * In BUILDING_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid
1326          * is >= than nextXid from when we switched to BUILDING_SNAPSHOT.  This
1327          * means all transactions starting afterwards have enough information to
1328          * be decoded.  Switch to FULL_SNAPSHOT.
1329          */
1330         else if (builder->state == SNAPBUILD_BUILDING_SNAPSHOT &&
1331                          TransactionIdPrecedesOrEquals(builder->next_phase_at,
1332                                                                                    running->oldestRunningXid))
1333         {
1334                 builder->state = SNAPBUILD_FULL_SNAPSHOT;
1335                 builder->next_phase_at = running->nextXid;
1336
1337                 ereport(LOG,
1338                                 (errmsg("logical decoding found initial consistent point at %X/%X",
1339                                                 LSN_FORMAT_ARGS(lsn)),
1340                                  errdetail("Waiting for transactions (approximately %d) older than %u to end.",
1341                                                    running->xcnt, running->nextXid)));
1342
1343                 SnapBuildWaitSnapshot(running, running->nextXid);
1344         }
1345
1346         /*
1347          * c) transition from FULL_SNAPSHOT to CONSISTENT.
1348          *
1349          * In FULL_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid is
1350          * >= than nextXid from when we switched to FULL_SNAPSHOT.  This means all
1351          * transactions that are currently in progress have a catalog snapshot,
1352          * and all their changes have been collected.  Switch to CONSISTENT.
1353          */
1354         else if (builder->state == SNAPBUILD_FULL_SNAPSHOT &&
1355                          TransactionIdPrecedesOrEquals(builder->next_phase_at,
1356                                                                                    running->oldestRunningXid))
1357         {
1358                 builder->state = SNAPBUILD_CONSISTENT;
1359                 builder->next_phase_at = InvalidTransactionId;
1360
1361                 ereport(LOG,
1362                                 (errmsg("logical decoding found consistent point at %X/%X",
1363                                                 LSN_FORMAT_ARGS(lsn)),
1364                                  errdetail("There are no old transactions anymore.")));
1365         }
1366
1367         /*
1368          * We already started to track running xacts and need to wait for all
1369          * in-progress ones to finish. We fall through to the normal processing of
1370          * records so incremental cleanup can be performed.
1371          */
1372         return true;
1373 }
1374
1375 /* ---
1376  * Iterate through xids in record, wait for all older than the cutoff to
1377  * finish.  Then, if possible, log a new xl_running_xacts record.
1378  *
1379  * This isn't required for the correctness of decoding, but to:
1380  * a) allow isolationtester to notice that we're currently waiting for
1381  *        something.
1382  * b) log a new xl_running_xacts record where it'd be helpful, without having
1383  *        to wait for bgwriter or checkpointer.
1384  * ---
1385  */
1386 static void
1387 SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff)
1388 {
1389         int                     off;
1390
1391         for (off = 0; off < running->xcnt; off++)
1392         {
1393                 TransactionId xid = running->xids[off];
1394
1395                 /*
1396                  * Upper layers should prevent that we ever need to wait on ourselves.
1397                  * Check anyway, since failing to do so would either result in an
1398                  * endless wait or an Assert() failure.
1399                  */
1400                 if (TransactionIdIsCurrentTransactionId(xid))
1401                         elog(ERROR, "waiting for ourselves");
1402
1403                 if (TransactionIdFollows(xid, cutoff))
1404                         continue;
1405
1406                 XactLockTableWait(xid, NULL, NULL, XLTW_None);
1407         }
1408
1409         /*
1410          * All transactions we needed to finish finished - try to ensure there is
1411          * another xl_running_xacts record in a timely manner, without having to
1412          * wait for bgwriter or checkpointer to log one.  During recovery we can't
1413          * enforce that, so we'll have to wait.
1414          */
1415         if (!RecoveryInProgress())
1416         {
1417                 LogStandbySnapshot();
1418         }
1419 }
1420
/*
 * Layout helpers for a serialized snapshot (SnapBuildOnDisk).
 *
 * SnapBuildOnDiskNotChecksummedSize is the offset of the 'version' member;
 * checksum computation starts at that offset, so the bytes in front of it
 * are not covered by the checksum.  SnapBuildOnDiskConstantSize is the
 * offset of the 'builder' member, i.e. the size of the fixed-size header
 * that precedes the embedded SnapBuild.
 */
#define SnapBuildOnDiskNotChecksummedSize	offsetof(SnapBuildOnDisk, version)
#define SnapBuildOnDiskConstantSize			offsetof(SnapBuildOnDisk, builder)

/* magic number and format version stored in, and required of, each file */
#define SNAPBUILD_VERSION 6
#define SNAPBUILD_MAGIC 0x51A1E001
1428
1429 /*
1430  * Store/Load a snapshot from disk, depending on the snapshot builder's state.
1431  *
1432  * Supposed to be used by external (i.e. not snapbuild.c) code that just read
1433  * a record that's a potential location for a serialized snapshot.
1434  */
1435 void
1436 SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn)
1437 {
1438         if (builder->state < SNAPBUILD_CONSISTENT)
1439                 SnapBuildRestore(builder, lsn);
1440         else
1441                 SnapBuildSerialize(builder, lsn);
1442 }
1443
1444 /*
1445  * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
1446  * been done by another decoding process.
1447  */
1448 static void
1449 SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
1450 {
1451         Size            needed_length;
1452         SnapBuildOnDisk *ondisk = NULL;
1453         TransactionId *catchange_xip = NULL;
1454         MemoryContext old_ctx;
1455         size_t          catchange_xcnt;
1456         char       *ondisk_c;
1457         int                     fd;
1458         char            tmppath[MAXPGPATH];
1459         char            path[MAXPGPATH];
1460         int                     ret;
1461         struct stat stat_buf;
1462         Size            sz;
1463
1464         Assert(lsn != InvalidXLogRecPtr);
1465         Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr ||
1466                    builder->last_serialized_snapshot <= lsn);
1467
1468         /*
1469          * no point in serializing if we cannot continue to work immediately after
1470          * restoring the snapshot
1471          */
1472         if (builder->state < SNAPBUILD_CONSISTENT)
1473                 return;
1474
1475         /* consistent snapshots have no next phase */
1476         Assert(builder->next_phase_at == InvalidTransactionId);
1477
1478         /*
1479          * We identify snapshots by the LSN they are valid for. We don't need to
1480          * include timelines in the name as each LSN maps to exactly one timeline
1481          * unless the user used pg_resetwal or similar. If a user did so, there's
1482          * no hope continuing to decode anyway.
1483          */
1484         sprintf(path, "%s/%X-%X.snap",
1485                         PG_LOGICAL_SNAPSHOTS_DIR,
1486                         LSN_FORMAT_ARGS(lsn));
1487
1488         /*
1489          * first check whether some other backend already has written the snapshot
1490          * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
1491          * as a valid state. Everything else is an unexpected error.
1492          */
1493         ret = stat(path, &stat_buf);
1494
1495         if (ret != 0 && errno != ENOENT)
1496                 ereport(ERROR,
1497                                 (errcode_for_file_access(),
1498                                  errmsg("could not stat file \"%s\": %m", path)));
1499
1500         else if (ret == 0)
1501         {
1502                 /*
1503                  * somebody else has already serialized to this point, don't overwrite
1504                  * but remember location, so we don't need to read old data again.
1505                  *
1506                  * To be sure it has been synced to disk after the rename() from the
1507                  * tempfile filename to the real filename, we just repeat the fsync.
1508                  * That ought to be cheap because in most scenarios it should already
1509                  * be safely on disk.
1510                  */
1511                 fsync_fname(path, false);
1512                 fsync_fname(PG_LOGICAL_SNAPSHOTS_DIR, true);
1513
1514                 builder->last_serialized_snapshot = lsn;
1515                 goto out;
1516         }
1517
1518         /*
1519          * there is an obvious race condition here between the time we stat(2) the
1520          * file and us writing the file. But we rename the file into place
1521          * atomically and all files created need to contain the same data anyway,
1522          * so this is perfectly fine, although a bit of a resource waste. Locking
1523          * seems like pointless complication.
1524          */
1525         elog(DEBUG1, "serializing snapshot to %s", path);
1526
1527         /* to make sure only we will write to this tempfile, include pid */
1528         sprintf(tmppath, "%s/%X-%X.snap.%d.tmp",
1529                         PG_LOGICAL_SNAPSHOTS_DIR,
1530                         LSN_FORMAT_ARGS(lsn), MyProcPid);
1531
1532         /*
1533          * Unlink temporary file if it already exists, needs to have been before a
1534          * crash/error since we won't enter this function twice from within a
1535          * single decoding slot/backend and the temporary file contains the pid of
1536          * the current process.
1537          */
1538         if (unlink(tmppath) != 0 && errno != ENOENT)
1539                 ereport(ERROR,
1540                                 (errcode_for_file_access(),
1541                                  errmsg("could not remove file \"%s\": %m", tmppath)));
1542
1543         old_ctx = MemoryContextSwitchTo(builder->context);
1544
1545         /* Get the catalog modifying transactions that are yet not committed */
1546         catchange_xip = ReorderBufferGetCatalogChangesXacts(builder->reorder);
1547         catchange_xcnt = dclist_count(&builder->reorder->catchange_txns);
1548
1549         needed_length = sizeof(SnapBuildOnDisk) +
1550                 sizeof(TransactionId) * (builder->committed.xcnt + catchange_xcnt);
1551
1552         ondisk_c = palloc0(needed_length);
1553         ondisk = (SnapBuildOnDisk *) ondisk_c;
1554         ondisk->magic = SNAPBUILD_MAGIC;
1555         ondisk->version = SNAPBUILD_VERSION;
1556         ondisk->length = needed_length;
1557         INIT_CRC32C(ondisk->checksum);
1558         COMP_CRC32C(ondisk->checksum,
1559                                 ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
1560                                 SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
1561         ondisk_c += sizeof(SnapBuildOnDisk);
1562
1563         memcpy(&ondisk->builder, builder, sizeof(SnapBuild));
1564         /* NULL-ify memory-only data */
1565         ondisk->builder.context = NULL;
1566         ondisk->builder.snapshot = NULL;
1567         ondisk->builder.reorder = NULL;
1568         ondisk->builder.committed.xip = NULL;
1569         ondisk->builder.catchange.xip = NULL;
1570         /* update catchange only on disk data */
1571         ondisk->builder.catchange.xcnt = catchange_xcnt;
1572
1573         COMP_CRC32C(ondisk->checksum,
1574                                 &ondisk->builder,
1575                                 sizeof(SnapBuild));
1576
1577         /* copy committed xacts */
1578         if (builder->committed.xcnt > 0)
1579         {
1580                 sz = sizeof(TransactionId) * builder->committed.xcnt;
1581                 memcpy(ondisk_c, builder->committed.xip, sz);
1582                 COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
1583                 ondisk_c += sz;
1584         }
1585
1586         /* copy catalog modifying xacts */
1587         if (catchange_xcnt > 0)
1588         {
1589                 sz = sizeof(TransactionId) * catchange_xcnt;
1590                 memcpy(ondisk_c, catchange_xip, sz);
1591                 COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
1592                 ondisk_c += sz;
1593         }
1594
1595         FIN_CRC32C(ondisk->checksum);
1596
1597         /* we have valid data now, open tempfile and write it there */
1598         fd = OpenTransientFile(tmppath,
1599                                                    O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
1600         if (fd < 0)
1601                 ereport(ERROR,
1602                                 (errcode_for_file_access(),
1603                                  errmsg("could not open file \"%s\": %m", tmppath)));
1604
1605         errno = 0;
1606         pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE);
1607         if ((write(fd, ondisk, needed_length)) != needed_length)
1608         {
1609                 int                     save_errno = errno;
1610
1611                 CloseTransientFile(fd);
1612
1613                 /* if write didn't set errno, assume problem is no disk space */
1614                 errno = save_errno ? save_errno : ENOSPC;
1615                 ereport(ERROR,
1616                                 (errcode_for_file_access(),
1617                                  errmsg("could not write to file \"%s\": %m", tmppath)));
1618         }
1619         pgstat_report_wait_end();
1620
1621         /*
1622          * fsync the file before renaming so that even if we crash after this we
1623          * have either a fully valid file or nothing.
1624          *
1625          * It's safe to just ERROR on fsync() here because we'll retry the whole
1626          * operation including the writes.
1627          *
1628          * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
1629          * some noticeable overhead since it's performed synchronously during
1630          * decoding?
1631          */
1632         pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_SYNC);
1633         if (pg_fsync(fd) != 0)
1634         {
1635                 int                     save_errno = errno;
1636
1637                 CloseTransientFile(fd);
1638                 errno = save_errno;
1639                 ereport(ERROR,
1640                                 (errcode_for_file_access(),
1641                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
1642         }
1643         pgstat_report_wait_end();
1644
1645         if (CloseTransientFile(fd) != 0)
1646                 ereport(ERROR,
1647                                 (errcode_for_file_access(),
1648                                  errmsg("could not close file \"%s\": %m", tmppath)));
1649
1650         fsync_fname(PG_LOGICAL_SNAPSHOTS_DIR, true);
1651
1652         /*
1653          * We may overwrite the work from some other backend, but that's ok, our
1654          * snapshot is valid as well, we'll just have done some superfluous work.
1655          */
1656         if (rename(tmppath, path) != 0)
1657         {
1658                 ereport(ERROR,
1659                                 (errcode_for_file_access(),
1660                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
1661                                                 tmppath, path)));
1662         }
1663
1664         /* make sure we persist */
1665         fsync_fname(path, false);
1666         fsync_fname(PG_LOGICAL_SNAPSHOTS_DIR, true);
1667
1668         /*
1669          * Now there's no way we can lose the dumped state anymore, remember this
1670          * as a serialization point.
1671          */
1672         builder->last_serialized_snapshot = lsn;
1673
1674         MemoryContextSwitchTo(old_ctx);
1675
1676 out:
1677         ReorderBufferSetRestartPoint(builder->reorder,
1678                                                                  builder->last_serialized_snapshot);
1679         /* be tidy */
1680         if (ondisk)
1681                 pfree(ondisk);
1682         if (catchange_xip)
1683                 pfree(catchange_xip);
1684 }
1685
1686 /*
1687  * Restore the logical snapshot file contents to 'ondisk'.
1688  *
1689  * 'context' is the memory context where the catalog modifying/committed xid
1690  * will live.
1691  * If 'missing_ok' is true, will not throw an error if the file is not found.
1692  */
1693 bool
1694 SnapBuildRestoreSnapshot(SnapBuildOnDisk *ondisk, const char *path,
1695                                                  MemoryContext context, bool missing_ok)
1696 {
1697         int                     fd;
1698         pg_crc32c       checksum;
1699         Size            sz;
1700
1701         fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
1702
1703         if (fd < 0)
1704         {
1705                 if (missing_ok && errno == ENOENT)
1706                         return false;
1707
1708                 ereport(ERROR,
1709                                 (errcode_for_file_access(),
1710                                  errmsg("could not open file \"%s\": %m", path)));
1711         }
1712
1713         /* ----
1714          * Make sure the snapshot had been stored safely to disk, that's normally
1715          * cheap.
1716          * Note that we do not need PANIC here, nobody will be able to use the
1717          * slot without fsyncing, and saving it won't succeed without an fsync()
1718          * either...
1719          * ----
1720          */
1721         fsync_fname(path, false);
1722         fsync_fname(PG_LOGICAL_SNAPSHOTS_DIR, true);
1723
1724         /* read statically sized portion of snapshot */
1725         SnapBuildRestoreContents(fd, (char *) ondisk, SnapBuildOnDiskConstantSize, path);
1726
1727         if (ondisk->magic != SNAPBUILD_MAGIC)
1728                 ereport(ERROR,
1729                                 (errcode(ERRCODE_DATA_CORRUPTED),
1730                                  errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u",
1731                                                 path, ondisk->magic, SNAPBUILD_MAGIC)));
1732
1733         if (ondisk->version != SNAPBUILD_VERSION)
1734                 ereport(ERROR,
1735                                 (errcode(ERRCODE_DATA_CORRUPTED),
1736                                  errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u",
1737                                                 path, ondisk->version, SNAPBUILD_VERSION)));
1738
1739         INIT_CRC32C(checksum);
1740         COMP_CRC32C(checksum,
1741                                 ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
1742                                 SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
1743
1744         /* read SnapBuild */
1745         SnapBuildRestoreContents(fd, (char *) &ondisk->builder, sizeof(SnapBuild), path);
1746         COMP_CRC32C(checksum, &ondisk->builder, sizeof(SnapBuild));
1747
1748         /* restore committed xacts information */
1749         if (ondisk->builder.committed.xcnt > 0)
1750         {
1751                 sz = sizeof(TransactionId) * ondisk->builder.committed.xcnt;
1752                 ondisk->builder.committed.xip = MemoryContextAllocZero(context, sz);
1753                 SnapBuildRestoreContents(fd, (char *) ondisk->builder.committed.xip, sz, path);
1754                 COMP_CRC32C(checksum, ondisk->builder.committed.xip, sz);
1755         }
1756
1757         /* restore catalog modifying xacts information */
1758         if (ondisk->builder.catchange.xcnt > 0)
1759         {
1760                 sz = sizeof(TransactionId) * ondisk->builder.catchange.xcnt;
1761                 ondisk->builder.catchange.xip = MemoryContextAllocZero(context, sz);
1762                 SnapBuildRestoreContents(fd, (char *) ondisk->builder.catchange.xip, sz, path);
1763                 COMP_CRC32C(checksum, ondisk->builder.catchange.xip, sz);
1764         }
1765
1766         if (CloseTransientFile(fd) != 0)
1767                 ereport(ERROR,
1768                                 (errcode_for_file_access(),
1769                                  errmsg("could not close file \"%s\": %m", path)));
1770
1771         FIN_CRC32C(checksum);
1772
1773         /* verify checksum of what we've read */
1774         if (!EQ_CRC32C(checksum, ondisk->checksum))
1775                 ereport(ERROR,
1776                                 (errcode(ERRCODE_DATA_CORRUPTED),
1777                                  errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u",
1778                                                 path, checksum, ondisk->checksum)));
1779
1780         return true;
1781 }
1782
1783 /*
1784  * Restore a snapshot into 'builder' if previously one has been stored at the
1785  * location indicated by 'lsn'. Returns true if successful, false otherwise.
1786  */
1787 static bool
1788 SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
1789 {
1790         SnapBuildOnDisk ondisk;
1791         char            path[MAXPGPATH];
1792
1793         /* no point in loading a snapshot if we're already there */
1794         if (builder->state == SNAPBUILD_CONSISTENT)
1795                 return false;
1796
1797         sprintf(path, "%s/%X-%X.snap",
1798                         PG_LOGICAL_SNAPSHOTS_DIR,
1799                         LSN_FORMAT_ARGS(lsn));
1800
1801         /* validate and restore the snapshot to 'ondisk' */
1802         if (!SnapBuildRestoreSnapshot(&ondisk, path, builder->context, true))
1803                 return false;
1804
1805         /*
1806          * ok, we now have a sensible snapshot here, figure out if it has more
1807          * information than we have.
1808          */
1809
1810         /*
1811          * We are only interested in consistent snapshots for now, comparing
1812          * whether one incomplete snapshot is more "advanced" seems to be
1813          * unnecessarily complex.
1814          */
1815         if (ondisk.builder.state < SNAPBUILD_CONSISTENT)
1816                 goto snapshot_not_interesting;
1817
1818         /*
1819          * Don't use a snapshot that requires an xmin that we cannot guarantee to
1820          * be available.
1821          */
1822         if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon))
1823                 goto snapshot_not_interesting;
1824
1825         /*
1826          * Consistent snapshots have no next phase. Reset next_phase_at as it is
1827          * possible that an old value may remain.
1828          */
1829         Assert(ondisk.builder.next_phase_at == InvalidTransactionId);
1830         builder->next_phase_at = InvalidTransactionId;
1831
1832         /* ok, we think the snapshot is sensible, copy over everything important */
1833         builder->xmin = ondisk.builder.xmin;
1834         builder->xmax = ondisk.builder.xmax;
1835         builder->state = ondisk.builder.state;
1836
1837         builder->committed.xcnt = ondisk.builder.committed.xcnt;
1838         /* We only allocated/stored xcnt, not xcnt_space xids ! */
1839         /* don't overwrite preallocated xip, if we don't have anything here */
1840         if (builder->committed.xcnt > 0)
1841         {
1842                 pfree(builder->committed.xip);
1843                 builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
1844                 builder->committed.xip = ondisk.builder.committed.xip;
1845         }
1846         ondisk.builder.committed.xip = NULL;
1847
1848         /* set catalog modifying transactions */
1849         if (builder->catchange.xip)
1850                 pfree(builder->catchange.xip);
1851         builder->catchange.xcnt = ondisk.builder.catchange.xcnt;
1852         builder->catchange.xip = ondisk.builder.catchange.xip;
1853         ondisk.builder.catchange.xip = NULL;
1854
1855         /* our snapshot is not interesting anymore, build a new one */
1856         if (builder->snapshot != NULL)
1857         {
1858                 SnapBuildSnapDecRefcount(builder->snapshot);
1859         }
1860         builder->snapshot = SnapBuildBuildSnapshot(builder);
1861         SnapBuildSnapIncRefcount(builder->snapshot);
1862
1863         ReorderBufferSetRestartPoint(builder->reorder, lsn);
1864
1865         Assert(builder->state == SNAPBUILD_CONSISTENT);
1866
1867         ereport(LOG,
1868                         (errmsg("logical decoding found consistent point at %X/%X",
1869                                         LSN_FORMAT_ARGS(lsn)),
1870                          errdetail("Logical decoding will begin using saved snapshot.")));
1871         return true;
1872
1873 snapshot_not_interesting:
1874         if (ondisk.builder.committed.xip != NULL)
1875                 pfree(ondisk.builder.committed.xip);
1876         if (ondisk.builder.catchange.xip != NULL)
1877                 pfree(ondisk.builder.catchange.xip);
1878         return false;
1879 }
1880
1881 /*
1882  * Read the contents of the serialized snapshot to 'dest'.
1883  */
1884 static void
1885 SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path)
1886 {
1887         int                     readBytes;
1888
1889         pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ);
1890         readBytes = read(fd, dest, size);
1891         pgstat_report_wait_end();
1892         if (readBytes != size)
1893         {
1894                 int                     save_errno = errno;
1895
1896                 CloseTransientFile(fd);
1897
1898                 if (readBytes < 0)
1899                 {
1900                         errno = save_errno;
1901                         ereport(ERROR,
1902                                         (errcode_for_file_access(),
1903                                          errmsg("could not read file \"%s\": %m", path)));
1904                 }
1905                 else
1906                         ereport(ERROR,
1907                                         (errcode(ERRCODE_DATA_CORRUPTED),
1908                                          errmsg("could not read file \"%s\": read %d of %zu",
1909                                                         path, readBytes, size)));
1910         }
1911 }
1912
1913 /*
1914  * Remove all serialized snapshots that are not required anymore because no
1915  * slot can need them. This doesn't actually have to run during a checkpoint,
1916  * but it's a convenient point to schedule this.
1917  *
1918  * NB: We run this during checkpoints even if logical decoding is disabled so
1919  * we cleanup old slots at some point after it got disabled.
1920  */
1921 void
1922 CheckPointSnapBuild(void)
1923 {
1924         XLogRecPtr      cutoff;
1925         XLogRecPtr      redo;
1926         DIR                *snap_dir;
1927         struct dirent *snap_de;
1928         char            path[MAXPGPATH + sizeof(PG_LOGICAL_SNAPSHOTS_DIR)];
1929
1930         /*
1931          * We start off with a minimum of the last redo pointer. No new
1932          * replication slot will start before that, so that's a safe upper bound
1933          * for removal.
1934          */
1935         redo = GetRedoRecPtr();
1936
1937         /* now check for the restart ptrs from existing slots */
1938         cutoff = ReplicationSlotsComputeLogicalRestartLSN();
1939
1940         /* don't start earlier than the restart lsn */
1941         if (redo < cutoff)
1942                 cutoff = redo;
1943
1944         snap_dir = AllocateDir(PG_LOGICAL_SNAPSHOTS_DIR);
1945         while ((snap_de = ReadDir(snap_dir, PG_LOGICAL_SNAPSHOTS_DIR)) != NULL)
1946         {
1947                 uint32          hi;
1948                 uint32          lo;
1949                 XLogRecPtr      lsn;
1950                 PGFileType      de_type;
1951
1952                 if (strcmp(snap_de->d_name, ".") == 0 ||
1953                         strcmp(snap_de->d_name, "..") == 0)
1954                         continue;
1955
1956                 snprintf(path, sizeof(path), "%s/%s", PG_LOGICAL_SNAPSHOTS_DIR, snap_de->d_name);
1957                 de_type = get_dirent_type(path, snap_de, false, DEBUG1);
1958
1959                 if (de_type != PGFILETYPE_ERROR && de_type != PGFILETYPE_REG)
1960                 {
1961                         elog(DEBUG1, "only regular files expected: %s", path);
1962                         continue;
1963                 }
1964
1965                 /*
1966                  * temporary filenames from SnapBuildSerialize() include the LSN and
1967                  * everything but are postfixed by .$pid.tmp. We can just remove them
1968                  * the same as other files because there can be none that are
1969                  * currently being written that are older than cutoff.
1970                  *
1971                  * We just log a message if a file doesn't fit the pattern, it's
1972                  * probably some editors lock/state file or similar...
1973                  */
1974                 if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
1975                 {
1976                         ereport(LOG,
1977                                         (errmsg("could not parse file name \"%s\"", path)));
1978                         continue;
1979                 }
1980
1981                 lsn = ((uint64) hi) << 32 | lo;
1982
1983                 /* check whether we still need it */
1984                 if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
1985                 {
1986                         elog(DEBUG1, "removing snapbuild snapshot %s", path);
1987
1988                         /*
1989                          * It's not particularly harmful, though strange, if we can't
1990                          * remove the file here. Don't prevent the checkpoint from
1991                          * completing, that'd be a cure worse than the disease.
1992                          */
1993                         if (unlink(path) < 0)
1994                         {
1995                                 ereport(LOG,
1996                                                 (errcode_for_file_access(),
1997                                                  errmsg("could not remove file \"%s\": %m",
1998                                                                 path)));
1999                                 continue;
2000                         }
2001                 }
2002         }
2003         FreeDir(snap_dir);
2004 }
2005
2006 /*
2007  * Check if a logical snapshot at the specified point has been serialized.
2008  */
2009 bool
2010 SnapBuildSnapshotExists(XLogRecPtr lsn)
2011 {
2012         char            path[MAXPGPATH];
2013         int                     ret;
2014         struct stat stat_buf;
2015
2016         sprintf(path, "%s/%X-%X.snap",
2017                         PG_LOGICAL_SNAPSHOTS_DIR,
2018                         LSN_FORMAT_ARGS(lsn));
2019
2020         ret = stat(path, &stat_buf);
2021
2022         if (ret != 0 && errno != ENOENT)
2023                 ereport(ERROR,
2024                                 (errcode_for_file_access(),
2025                                  errmsg("could not stat file \"%s\": %m", path)));
2026
2027         return ret == 0;
2028 }