1 /*-------------------------------------------------------------------------
5 * Infrastructure for building historic catalog snapshots based on contents
6 * of the WAL, for the purpose of decoding heapam.c style values in the
11 * We build snapshots which can *only* be used to read catalog contents and we
12 * do so by reading and interpreting the WAL stream. The aim is to build a
13 * snapshot that behaves the same as a freshly taken MVCC snapshot would have
14 * at the time the XLogRecord was generated.
16 * To build the snapshots we reuse the infrastructure built for Hot
17 * Standby. The in-memory snapshots we build look different than HS' because
18 * we have different needs. To successfully decode data from the WAL we only
19 * need to access catalog tables and (sys|rel|cat)cache, not the actual user
20 * tables since the data we decode is wholly contained in the WAL
21 * records. Also, our snapshots need to be different in comparison to normal
22 * MVCC ones because in contrast to those we cannot fully rely on the clog and
23 * pg_subtrans for information about committed transactions because they might
24 * commit in the future from the POV of the WAL entry we're currently
25 * decoding. This definition has the advantage that we only need to prevent
26 * removal of catalog rows, while normal table's rows can still be
27 * removed. This is achieved by using the replication slot mechanism.
29 * As the percentage of transactions modifying the catalog normally is fairly
30 * small in comparison to ones only manipulating user data, we keep track of
31 * the committed catalog modifying ones inside [xmin, xmax) instead of keeping
32 * track of all running transactions like it's done in a normal snapshot. Note
33 * that we're generally only looking at transactions that have acquired an
34 * xid. That is, we keep a list of transactions between snapshot->(xmin, xmax)
35 * that we consider committed, everything else is considered aborted/in
36 * progress. That also allows us not to care about subtransactions before they
37 * have committed which means this module, in contrast to HS, doesn't have to
38 * care about suboverflowed subtransactions and similar.
40 * One complexity of doing this is that to e.g. handle mixed DDL/DML
41 * transactions we need Snapshots that see intermediate versions of the
42 * catalog in a transaction. During normal operation this is achieved by using
43 * CommandIds/cmin/cmax. The problem with that however is that for space
44 * efficiency reasons, the cmin and cmax are not included in WAL records. We
45 * cannot read the cmin/cmax from the tuple itself, either, because it is
46 * reset on crash recovery. Even if we could, we could not decode combocids
47 * which are only tracked in the original backend's memory. To work around
48 * that, heapam writes an extra WAL record (XLOG_HEAP2_NEW_CID) every time a
49 * catalog row is modified, which includes the cmin and cmax of the
50 * tuple. During decoding, we insert the ctid->(cmin,cmax) mappings into the
51 * reorder buffer, and use them at visibility checks instead of the cmin/cmax
52 * on the tuple itself. Check the reorderbuffer.c's comment above
53 * ResolveCminCmaxDuringDecoding() for details.
55 * To facilitate all this we need our own visibility routine, as the normal
56 * ones are optimized for different use cases.
58 * To replace the normal catalog snapshots with decoding ones use the
59 * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
63 * The snapbuild machinery is starting up in several stages, as illustrated
64 * by the following graph describing the SnapBuild->state transitions:
66 * +-------------------------+
67 * +----| START |-------------+
68 * | +-------------------------+ |
71 * | running_xacts #1 |
75 * | +-------------------------+ v
76 * | | BUILDING_SNAPSHOT |------------>|
77 * | +-------------------------+ |
80 * | running_xacts #2, xacts from #1 finished |
84 * | +-------------------------+ v
85 * | | FULL_SNAPSHOT |------------>|
86 * | +-------------------------+ |
88 * running_xacts | saved snapshot
89 * with zero xacts | at running_xacts's lsn
91 * | running_xacts with xacts from #2 finished |
94 * | +-------------------------+ |
95 * +--->|SNAPBUILD_CONSISTENT |<------------+
96 * +-------------------------+
98 * Initially the machinery is in the START stage. When an xl_running_xacts
99 * record is read that is sufficiently new (above the safe xmin horizon),
100 * there's a state transition. If there were no running xacts when the
101 * xl_running_xacts record was generated, we'll directly go into CONSISTENT
102 * state, otherwise we'll switch to the BUILDING_SNAPSHOT state. Having a full
103 * snapshot means that all transactions that start henceforth can be decoded
104 * in their entirety, but transactions that started previously can't. In
105 * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
106 * running transactions have committed or aborted.
108 * Only transactions that commit after CONSISTENT state has been reached will
109 * be replayed, even though they might have started while still in
110 * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
111 * changes have been exported, but all the following ones will be. That point
112 * is a convenient point to initialize replication from, which is why we
113 * export a snapshot at that point, which *can* be used to read normal data.
115 * Copyright (c) 2012-2024, PostgreSQL Global Development Group
118 * src/backend/replication/logical/snapbuild.c
120 *-------------------------------------------------------------------------
123 #include "postgres.h"
125 #include <sys/stat.h>
128 #include "access/heapam_xlog.h"
129 #include "access/transam.h"
130 #include "access/xact.h"
131 #include "common/file_utils.h"
132 #include "miscadmin.h"
134 #include "replication/logical.h"
135 #include "replication/reorderbuffer.h"
136 #include "replication/snapbuild.h"
137 #include "replication/snapbuild_internal.h"
138 #include "storage/fd.h"
139 #include "storage/lmgr.h"
140 #include "storage/proc.h"
141 #include "storage/procarray.h"
142 #include "storage/standby.h"
143 #include "utils/builtins.h"
144 #include "utils/memutils.h"
145 #include "utils/snapmgr.h"
146 #include "utils/snapshot.h"
148 * Starting a transaction -- which we need to do while exporting a snapshot --
149 * removes knowledge about the previously used resowner, so we save it here.
151 static ResourceOwner SavedResourceOwnerDuringExport
= NULL
;
152 static bool ExportInProgress
= false;
154 /* ->committed and ->catchange manipulation */
155 static void SnapBuildPurgeOlderTxn(SnapBuild
*builder
);
157 /* snapshot building/manipulation/distribution functions */
158 static Snapshot
SnapBuildBuildSnapshot(SnapBuild
*builder
);
160 static void SnapBuildFreeSnapshot(Snapshot snap
);
162 static void SnapBuildSnapIncRefcount(Snapshot snap
);
164 static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild
*builder
, XLogRecPtr lsn
);
166 static inline bool SnapBuildXidHasCatalogChanges(SnapBuild
*builder
, TransactionId xid
,
169 /* xlog reading helper functions for SnapBuildProcessRunningXacts */
170 static bool SnapBuildFindSnapshot(SnapBuild
*builder
, XLogRecPtr lsn
, xl_running_xacts
*running
);
171 static void SnapBuildWaitSnapshot(xl_running_xacts
*running
, TransactionId cutoff
);
173 /* serialization functions */
174 static void SnapBuildSerialize(SnapBuild
*builder
, XLogRecPtr lsn
);
175 static bool SnapBuildRestore(SnapBuild
*builder
, XLogRecPtr lsn
);
176 static void SnapBuildRestoreContents(int fd
, char *dest
, Size size
, const char *path
);
179 * Allocate a new snapshot builder.
181 * xmin_horizon is the xid at or after which we can be sure no catalog rows
182 * have been removed; start_lsn is the LSN at or after which we want to replay commits.
185 AllocateSnapshotBuilder(ReorderBuffer
*reorder
,
186 TransactionId xmin_horizon
,
187 XLogRecPtr start_lsn
,
188 bool need_full_snapshot
,
189 bool in_slot_creation
,
190 XLogRecPtr two_phase_at
)
192 MemoryContext context
;
193 MemoryContext oldcontext
;
196 /* allocate memory in own context, to have better accountability */
197 context
= AllocSetContextCreate(CurrentMemoryContext
,
198 "snapshot builder context",
199 ALLOCSET_DEFAULT_SIZES
);
200 oldcontext
= MemoryContextSwitchTo(context
);
202 builder
= palloc0(sizeof(SnapBuild
));
204 builder
->state
= SNAPBUILD_START
;
205 builder
->context
= context
;
206 builder
->reorder
= reorder
;
207 /* Other struct members initialized by zeroing via palloc0 above */
209 builder
->committed
.xcnt
= 0;
210 builder
->committed
.xcnt_space
= 128; /* arbitrary number */
211 builder
->committed
.xip
=
212 palloc0(builder
->committed
.xcnt_space
* sizeof(TransactionId
));
213 builder
->committed
.includes_all_transactions
= true;
215 builder
->catchange
.xcnt
= 0;
216 builder
->catchange
.xip
= NULL
;
218 builder
->initial_xmin_horizon
= xmin_horizon
;
219 builder
->start_decoding_at
= start_lsn
;
220 builder
->in_slot_creation
= in_slot_creation
;
221 builder
->building_full_snapshot
= need_full_snapshot
;
222 builder
->two_phase_at
= two_phase_at
;
224 MemoryContextSwitchTo(oldcontext
);
230 * Free a snapshot builder.
233 FreeSnapshotBuilder(SnapBuild
*builder
)
235 MemoryContext context
= builder
->context
;
237 /* free snapshot explicitly, that contains some error checking */
238 if (builder
->snapshot
!= NULL
)
240 SnapBuildSnapDecRefcount(builder
->snapshot
);
241 builder
->snapshot
= NULL
;
244 /* other resources are deallocated via memory context deletion */
245 MemoryContextDelete(context
);
249 * Free an unreferenced snapshot that has previously been built by us.
252 SnapBuildFreeSnapshot(Snapshot snap
)
254 /* make sure we don't get passed an external snapshot */
255 Assert(snap
->snapshot_type
== SNAPSHOT_HISTORIC_MVCC
);
257 /* make sure nobody modified our snapshot */
258 Assert(snap
->curcid
== FirstCommandId
);
259 Assert(!snap
->suboverflowed
);
260 Assert(!snap
->takenDuringRecovery
);
261 Assert(snap
->regd_count
== 0);
263 /* slightly more likely, so it's checked even without casserts */
265 elog(ERROR
, "cannot free a copied snapshot");
267 if (snap
->active_count
)
268 elog(ERROR
, "cannot free an active snapshot");
274 * In which state of snapshot building are we?
277 SnapBuildCurrentState(SnapBuild
*builder
)
279 return builder
->state
;
283 * Return the LSN at which the two-phase decoding was first enabled.
286 SnapBuildGetTwoPhaseAt(SnapBuild
*builder
)
288 return builder
->two_phase_at
;
292 * Set the LSN at which two-phase decoding is enabled.
295 SnapBuildSetTwoPhaseAt(SnapBuild
*builder
, XLogRecPtr ptr
)
297 builder
->two_phase_at
= ptr
;
301 * Should the contents of transaction ending at 'ptr' be decoded?
304 SnapBuildXactNeedsSkip(SnapBuild
*builder
, XLogRecPtr ptr
)
306 return ptr
< builder
->start_decoding_at
;
310 * Increase refcount of a snapshot.
312 * This is used when handing out a snapshot to some external resource or when
313 * adding a Snapshot as builder->snapshot.
316 SnapBuildSnapIncRefcount(Snapshot snap
)
318 snap
->active_count
++;
322 * Decrease refcount of a snapshot and free if the refcount reaches zero.
324 * Externally visible, so that external resources that have been handed an
325 * IncRef'ed Snapshot can adjust its refcount easily.
328 SnapBuildSnapDecRefcount(Snapshot snap
)
330 /* make sure we don't get passed an external snapshot */
331 Assert(snap
->snapshot_type
== SNAPSHOT_HISTORIC_MVCC
);
333 /* make sure nobody modified our snapshot */
334 Assert(snap
->curcid
== FirstCommandId
);
335 Assert(!snap
->suboverflowed
);
336 Assert(!snap
->takenDuringRecovery
);
338 Assert(snap
->regd_count
== 0);
340 Assert(snap
->active_count
> 0);
342 /* slightly more likely, so it's checked even without casserts */
344 elog(ERROR
, "cannot free a copied snapshot");
346 snap
->active_count
--;
347 if (snap
->active_count
== 0)
348 SnapBuildFreeSnapshot(snap
);
352 * Build a new snapshot, based on currently committed catalog-modifying
355 * In-progress transactions with catalog access are *not* allowed to modify
356 * these snapshots; they have to copy them and fill in appropriate ->curcid
357 * and ->subxip/subxcnt values.
360 SnapBuildBuildSnapshot(SnapBuild
*builder
)
365 Assert(builder
->state
>= SNAPBUILD_FULL_SNAPSHOT
);
367 ssize
= sizeof(SnapshotData
)
368 + sizeof(TransactionId
) * builder
->committed
.xcnt
369 + sizeof(TransactionId
) * 1 /* toplevel xid */ ;
371 snapshot
= MemoryContextAllocZero(builder
->context
, ssize
);
373 snapshot
->snapshot_type
= SNAPSHOT_HISTORIC_MVCC
;
376 * We misuse the original meaning of SnapshotData's xip and subxip fields
377 * to make them more fitting for our needs.
379 * In the 'xip' array we store transactions that have to be treated as
380 * committed. Since we will only ever look at tuples from transactions
381 * that have modified the catalog it's more efficient to store those few
382 * that exist between xmin and xmax (frequently there are none).
384 * Snapshots that are used in transactions that have modified the catalog
385 * also use the 'subxip' array to store their toplevel xid and all the
386 * subtransaction xids so we can recognize when we need to treat rows as
387 * visible that are not in xip but still need to be visible. Subxip only
388 * gets filled when the transaction is copied into the context of a
389 * catalog modifying transaction since we otherwise share a snapshot
390 * between transactions. As long as a txn hasn't modified the catalog it
391 * doesn't need to treat any uncommitted rows as visible, so there is no
392 * need for those xids.
394 * Both arrays are qsort'ed so that we can use bsearch() on them.
396 Assert(TransactionIdIsNormal(builder
->xmin
));
397 Assert(TransactionIdIsNormal(builder
->xmax
));
399 snapshot
->xmin
= builder
->xmin
;
400 snapshot
->xmax
= builder
->xmax
;
402 /* store all transactions to be treated as committed by this snapshot */
404 (TransactionId
*) ((char *) snapshot
+ sizeof(SnapshotData
));
405 snapshot
->xcnt
= builder
->committed
.xcnt
;
406 memcpy(snapshot
->xip
,
407 builder
->committed
.xip
,
408 builder
->committed
.xcnt
* sizeof(TransactionId
));
410 /* sort so we can bsearch() */
411 qsort(snapshot
->xip
, snapshot
->xcnt
, sizeof(TransactionId
), xidComparator
);
414 * Initially, subxip is empty, i.e. it's a snapshot to be used by
415 * transactions that don't modify the catalog. Will be filled by
416 * ReorderBufferCopySnap() if necessary.
418 snapshot
->subxcnt
= 0;
419 snapshot
->subxip
= NULL
;
421 snapshot
->suboverflowed
= false;
422 snapshot
->takenDuringRecovery
= false;
423 snapshot
->copied
= false;
424 snapshot
->curcid
= FirstCommandId
;
425 snapshot
->active_count
= 0;
426 snapshot
->regd_count
= 0;
427 snapshot
->snapXactCompletionCount
= 0;
433 * Build the initial slot snapshot and convert it to a normal snapshot that
434 * is understood by HeapTupleSatisfiesMVCC.
436 * The snapshot will be usable directly in current transaction or exported
437 * for loading in different transaction.
440 SnapBuildInitialSnapshot(SnapBuild
*builder
)
444 TransactionId safeXid
;
445 TransactionId
*newxip
;
448 Assert(XactIsoLevel
== XACT_REPEATABLE_READ
);
449 Assert(builder
->building_full_snapshot
);
451 /* don't allow older snapshots */
452 InvalidateCatalogSnapshot(); /* about to overwrite MyProc->xmin */
453 if (HaveRegisteredOrActiveSnapshot())
454 elog(ERROR
, "cannot build an initial slot snapshot when snapshots exist");
455 Assert(!HistoricSnapshotActive());
457 if (builder
->state
!= SNAPBUILD_CONSISTENT
)
458 elog(ERROR
, "cannot build an initial slot snapshot before reaching a consistent state");
460 if (!builder
->committed
.includes_all_transactions
)
461 elog(ERROR
, "cannot build an initial slot snapshot, not all transactions are monitored anymore");
463 /* so we don't overwrite the existing value */
464 if (TransactionIdIsValid(MyProc
->xmin
))
465 elog(ERROR
, "cannot build an initial slot snapshot when MyProc->xmin already is valid");
467 snap
= SnapBuildBuildSnapshot(builder
);
470 * We know that snap->xmin is alive, enforced by the logical xmin
471 * mechanism. Due to that we can do this without locks, we're only
472 * changing our own value.
474 * Building an initial snapshot is expensive and an unenforced xmin
475 * horizon would have bad consequences, therefore always double-check that
476 * the horizon is enforced.
478 LWLockAcquire(ProcArrayLock
, LW_SHARED
);
479 safeXid
= GetOldestSafeDecodingTransactionId(false);
480 LWLockRelease(ProcArrayLock
);
482 if (TransactionIdFollows(safeXid
, snap
->xmin
))
483 elog(ERROR
, "cannot build an initial slot snapshot as oldest safe xid %u follows snapshot's xmin %u",
484 safeXid
, snap
->xmin
);
486 MyProc
->xmin
= snap
->xmin
;
488 /* allocate in transaction context */
489 newxip
= (TransactionId
*)
490 palloc(sizeof(TransactionId
) * GetMaxSnapshotXidCount());
493 * snapbuild.c builds snapshots in an "inverted" manner, which means it
494 * stores committed transactions in ->xip, not ones in progress. Build a
495 * classical snapshot by marking all non-committed transactions as
496 * in-progress. This can be expensive.
498 for (xid
= snap
->xmin
; NormalTransactionIdPrecedes(xid
, snap
->xmax
);)
503 * Check whether transaction committed using the decoding snapshot
506 test
= bsearch(&xid
, snap
->xip
, snap
->xcnt
,
507 sizeof(TransactionId
), xidComparator
);
511 if (newxcnt
>= GetMaxSnapshotXidCount())
513 (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE
),
514 errmsg("initial slot snapshot too large")));
516 newxip
[newxcnt
++] = xid
;
519 TransactionIdAdvance(xid
);
522 /* adjust remaining snapshot fields as needed */
523 snap
->snapshot_type
= SNAPSHOT_MVCC
;
524 snap
->xcnt
= newxcnt
;
531 * Export a snapshot so it can be set in another session with SET TRANSACTION
534 * For that we need to start a transaction in the current backend as the
535 * importing side checks whether the source transaction is still open to make
536 * sure the xmin horizon hasn't advanced since then.
539 SnapBuildExportSnapshot(SnapBuild
*builder
)
544 if (IsTransactionOrTransactionBlock())
545 elog(ERROR
, "cannot export a snapshot from within a transaction");
547 if (SavedResourceOwnerDuringExport
)
548 elog(ERROR
, "can only export one snapshot at a time");
550 SavedResourceOwnerDuringExport
= CurrentResourceOwner
;
551 ExportInProgress
= true;
553 StartTransactionCommand();
555 /* There doesn't seem to be a nice API to set these */
556 XactIsoLevel
= XACT_REPEATABLE_READ
;
559 snap
= SnapBuildInitialSnapshot(builder
);
562 * now that we've built a plain snapshot, make it active and use the
563 * normal mechanisms for exporting it
565 snapname
= ExportSnapshot(snap
);
568 (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID",
569 "exported logical decoding snapshot: \"%s\" with %u transaction IDs",
571 snapname
, snap
->xcnt
)));
576 * Ensure there is a snapshot and if not build one for current transaction.
579 SnapBuildGetOrBuildSnapshot(SnapBuild
*builder
)
581 Assert(builder
->state
== SNAPBUILD_CONSISTENT
);
583 /* only build a new snapshot if we don't have a prebuilt one */
584 if (builder
->snapshot
== NULL
)
586 builder
->snapshot
= SnapBuildBuildSnapshot(builder
);
587 /* increase refcount for the snapshot builder */
588 SnapBuildSnapIncRefcount(builder
->snapshot
);
591 return builder
->snapshot
;
595 * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
596 * any. Aborts the previously started transaction and resets the resource
597 * owner back to its original value.
600 SnapBuildClearExportedSnapshot(void)
602 ResourceOwner tmpResOwner
;
604 /* nothing exported, that is the usual case */
605 if (!ExportInProgress
)
608 if (!IsTransactionState())
609 elog(ERROR
, "clearing exported snapshot in wrong transaction state");
612 * AbortCurrentTransaction() takes care of resetting the snapshot state,
613 * so remember SavedResourceOwnerDuringExport.
615 tmpResOwner
= SavedResourceOwnerDuringExport
;
617 /* make sure nothing could have ever happened */
618 AbortCurrentTransaction();
620 CurrentResourceOwner
= tmpResOwner
;
624 * Clear snapshot export state during transaction abort.
627 SnapBuildResetExportedSnapshotState(void)
629 SavedResourceOwnerDuringExport
= NULL
;
630 ExportInProgress
= false;
634 * Handle the effects of a single heap change, appropriate to the current state
635 * of the snapshot builder and returns whether changes made at (xid, lsn) can
639 SnapBuildProcessChange(SnapBuild
*builder
, TransactionId xid
, XLogRecPtr lsn
)
642 * We can't handle data in transactions if we haven't built a snapshot
643 * yet, so don't store them.
645 if (builder
->state
< SNAPBUILD_FULL_SNAPSHOT
)
649 * No point in keeping track of changes in transactions that we don't have
650 * enough information about to decode. This means that they started before
651 * we got into the SNAPBUILD_FULL_SNAPSHOT state.
653 if (builder
->state
< SNAPBUILD_CONSISTENT
&&
654 TransactionIdPrecedes(xid
, builder
->next_phase_at
))
658 * If the reorderbuffer doesn't yet have a snapshot, add one now, it will
659 * be needed to decode the change we're currently processing.
661 if (!ReorderBufferXidHasBaseSnapshot(builder
->reorder
, xid
))
663 /* only build a new snapshot if we don't have a prebuilt one */
664 if (builder
->snapshot
== NULL
)
666 builder
->snapshot
= SnapBuildBuildSnapshot(builder
);
667 /* increase refcount for the snapshot builder */
668 SnapBuildSnapIncRefcount(builder
->snapshot
);
672 * Increase refcount for the transaction we're handing the snapshot
675 SnapBuildSnapIncRefcount(builder
->snapshot
);
676 ReorderBufferSetBaseSnapshot(builder
->reorder
, xid
, lsn
,
684 * Do CommandId/combo CID handling after reading an xl_heap_new_cid record.
685 * This implies that a transaction has done some form of write to system
689 SnapBuildProcessNewCid(SnapBuild
*builder
, TransactionId xid
,
690 XLogRecPtr lsn
, xl_heap_new_cid
*xlrec
)
695 * we only log new_cid's if a catalog tuple was modified, so mark the
696 * transaction as containing catalog modifications
698 ReorderBufferXidSetCatalogChanges(builder
->reorder
, xid
, lsn
);
700 ReorderBufferAddNewTupleCids(builder
->reorder
, xlrec
->top_xid
, lsn
,
701 xlrec
->target_locator
, xlrec
->target_tid
,
702 xlrec
->cmin
, xlrec
->cmax
,
705 /* figure out new command id */
706 if (xlrec
->cmin
!= InvalidCommandId
&&
707 xlrec
->cmax
!= InvalidCommandId
)
708 cid
= Max(xlrec
->cmin
, xlrec
->cmax
);
709 else if (xlrec
->cmax
!= InvalidCommandId
)
711 else if (xlrec
->cmin
!= InvalidCommandId
)
715 cid
= InvalidCommandId
; /* silence compiler */
716 elog(ERROR
, "xl_heap_new_cid record without a valid CommandId");
719 ReorderBufferAddNewCommandId(builder
->reorder
, xid
, lsn
, cid
+ 1);
723 * Add a new Snapshot to all transactions we're decoding that currently are
724 * in-progress so they can see new catalog contents made by the transaction
725 * that just committed. This is necessary because those in-progress
726 * transactions will use the new catalog's contents from here on (at the very
727 * least everything they do needs to be compatible with newer catalog
731 SnapBuildDistributeNewCatalogSnapshot(SnapBuild
*builder
, XLogRecPtr lsn
)
734 ReorderBufferTXN
*txn
;
737 * Iterate through all toplevel transactions. This can include
738 * subtransactions which we just don't yet know to be that, but that's
739 * fine, they will just get an unnecessary snapshot queued.
741 dlist_foreach(txn_i
, &builder
->reorder
->toplevel_by_lsn
)
743 txn
= dlist_container(ReorderBufferTXN
, node
, txn_i
.cur
);
745 Assert(TransactionIdIsValid(txn
->xid
));
748 * If we don't have a base snapshot yet, there are no changes in this
749 * transaction which in turn implies we don't yet need a snapshot at
750 * all. We'll add a snapshot when the first change gets queued.
752 * NB: This works correctly even for subtransactions because
753 * ReorderBufferAssignChild() takes care to transfer the base snapshot
754 * to the top-level transaction, and while iterating the changequeue
755 * we'll get the change from the subtxn.
757 if (!ReorderBufferXidHasBaseSnapshot(builder
->reorder
, txn
->xid
))
761 * We don't need to add snapshot to prepared transactions as they
762 * should not see the new catalog contents.
764 if (rbtxn_prepared(txn
) || rbtxn_skip_prepared(txn
))
767 elog(DEBUG2
, "adding a new snapshot to %u at %X/%X",
768 txn
->xid
, LSN_FORMAT_ARGS(lsn
));
771 * increase the snapshot's refcount for the transaction we are handing
774 SnapBuildSnapIncRefcount(builder
->snapshot
);
775 ReorderBufferAddSnapshot(builder
->reorder
, txn
->xid
, lsn
,
781 * Keep track of a new catalog changing transaction that has committed.
784 SnapBuildAddCommittedTxn(SnapBuild
*builder
, TransactionId xid
)
786 Assert(TransactionIdIsValid(xid
));
788 if (builder
->committed
.xcnt
== builder
->committed
.xcnt_space
)
790 builder
->committed
.xcnt_space
= builder
->committed
.xcnt_space
* 2 + 1;
792 elog(DEBUG1
, "increasing space for committed transactions to %u",
793 (uint32
) builder
->committed
.xcnt_space
);
795 builder
->committed
.xip
= repalloc(builder
->committed
.xip
,
796 builder
->committed
.xcnt_space
* sizeof(TransactionId
));
800 * TODO: It might make sense to keep the array sorted here instead of
801 * doing it every time we build a new snapshot. On the other hand this
802 * gets called repeatedly when a transaction with subtransactions commits.
804 builder
->committed
.xip
[builder
->committed
.xcnt
++] = xid
;
808 * Remove knowledge about transactions we treat as committed or containing catalog
809 * changes that are smaller than ->xmin. Those won't ever get checked via
810 * the ->committed or ->catchange array, respectively. The committed xids will
811 * get checked via the clog machinery.
813 * We can ideally remove the transaction from catchange array once it is
814 * finished (committed/aborted) but that could be costly as we need to maintain
815 * the xids order in the array.
818 SnapBuildPurgeOlderTxn(SnapBuild
*builder
)
821 TransactionId
*workspace
;
822 int surviving_xids
= 0;
825 if (!TransactionIdIsNormal(builder
->xmin
))
828 /* TODO: Neater algorithm than just copying and iterating? */
830 MemoryContextAlloc(builder
->context
,
831 builder
->committed
.xcnt
* sizeof(TransactionId
));
833 /* copy xids that still are interesting to workspace */
834 for (off
= 0; off
< builder
->committed
.xcnt
; off
++)
836 if (NormalTransactionIdPrecedes(builder
->committed
.xip
[off
],
840 workspace
[surviving_xids
++] = builder
->committed
.xip
[off
];
843 /* copy workspace back to persistent state */
844 memcpy(builder
->committed
.xip
, workspace
,
845 surviving_xids
* sizeof(TransactionId
));
847 elog(DEBUG3
, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
848 (uint32
) builder
->committed
.xcnt
, (uint32
) surviving_xids
,
849 builder
->xmin
, builder
->xmax
);
850 builder
->committed
.xcnt
= surviving_xids
;
855 * Purge xids in ->catchange as well. The purged array must also be sorted
856 * in xidComparator order.
858 if (builder
->catchange
.xcnt
> 0)
861 * Since catchange.xip is sorted, we find the lower bound of xids that
862 * are still interesting.
864 for (off
= 0; off
< builder
->catchange
.xcnt
; off
++)
866 if (TransactionIdFollowsOrEquals(builder
->catchange
.xip
[off
],
871 surviving_xids
= builder
->catchange
.xcnt
- off
;
873 if (surviving_xids
> 0)
875 memmove(builder
->catchange
.xip
, &(builder
->catchange
.xip
[off
]),
876 surviving_xids
* sizeof(TransactionId
));
880 pfree(builder
->catchange
.xip
);
881 builder
->catchange
.xip
= NULL
;
884 elog(DEBUG3
, "purged catalog modifying transactions from %u to %u, xmin: %u, xmax: %u",
885 (uint32
) builder
->catchange
.xcnt
, (uint32
) surviving_xids
,
886 builder
->xmin
, builder
->xmax
);
887 builder
->catchange
.xcnt
= surviving_xids
;
892 * Handle everything that needs to be done when a transaction commits
895 SnapBuildCommitTxn(SnapBuild
*builder
, XLogRecPtr lsn
, TransactionId xid
,
896 int nsubxacts
, TransactionId
*subxacts
, uint32 xinfo
)
900 bool needs_snapshot
= false;
901 bool needs_timetravel
= false;
902 bool sub_needs_timetravel
= false;
904 TransactionId xmax
= xid
;
907 * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor
908 * will they be part of a snapshot. So we don't need to record anything.
910 if (builder
->state
== SNAPBUILD_START
||
911 (builder
->state
== SNAPBUILD_BUILDING_SNAPSHOT
&&
912 TransactionIdPrecedes(xid
, builder
->next_phase_at
)))
914 /* ensure that only commits after this are getting replayed */
915 if (builder
->start_decoding_at
<= lsn
)
916 builder
->start_decoding_at
= lsn
+ 1;
920 if (builder
->state
< SNAPBUILD_CONSISTENT
)
922 /* ensure that only commits after this are getting replayed */
923 if (builder
->start_decoding_at
<= lsn
)
924 builder
->start_decoding_at
= lsn
+ 1;
927 * If building an exportable snapshot, force xid to be tracked, even
928 * if the transaction didn't modify the catalog.
930 if (builder
->building_full_snapshot
)
932 needs_timetravel
= true;
936 for (nxact
= 0; nxact
< nsubxacts
; nxact
++)
938 TransactionId subxid
= subxacts
[nxact
];
941 * Add subtransaction to base snapshot if catalog modifying, we don't
942 * distinguish them from toplevel transactions there.
944 if (SnapBuildXidHasCatalogChanges(builder
, subxid
, xinfo
))
946 sub_needs_timetravel
= true;
947 needs_snapshot
= true;
949 elog(DEBUG1
, "found subtransaction %u:%u with catalog changes",
952 SnapBuildAddCommittedTxn(builder
, subxid
);
954 if (NormalTransactionIdFollows(subxid
, xmax
))
959 * If we're forcing timetravel we also need visibility information
960 * about subtransaction, so keep track of subtransaction's state, even
961 * if not catalog modifying. Don't need to distribute a snapshot in
964 else if (needs_timetravel
)
966 SnapBuildAddCommittedTxn(builder
, subxid
);
967 if (NormalTransactionIdFollows(subxid
, xmax
))
972 /* if top-level modified catalog, it'll need a snapshot */
973 if (SnapBuildXidHasCatalogChanges(builder
, xid
, xinfo
))
975 elog(DEBUG2
, "found top level transaction %u, with catalog changes",
977 needs_snapshot
= true;
978 needs_timetravel
= true;
979 SnapBuildAddCommittedTxn(builder
, xid
);
981 else if (sub_needs_timetravel
)
983 /* track toplevel txn as well, subxact alone isn't meaningful */
984 elog(DEBUG2
, "forced transaction %u to do timetravel due to one of its subtransactions",
986 needs_timetravel
= true;
987 SnapBuildAddCommittedTxn(builder
, xid
);
989 else if (needs_timetravel
)
991 elog(DEBUG2
, "forced transaction %u to do timetravel", xid
);
993 SnapBuildAddCommittedTxn(builder
, xid
);
996 if (!needs_timetravel
)
998 /* record that we cannot export a general snapshot anymore */
999 builder
->committed
.includes_all_transactions
= false;
1002 Assert(!needs_snapshot
|| needs_timetravel
);
1005 * Adjust xmax of the snapshot builder, we only do that for committed,
1006 * catalog modifying, transactions, everything else isn't interesting for
1007 * us since we'll never look at the respective rows.
1009 if (needs_timetravel
&&
1010 (!TransactionIdIsValid(builder
->xmax
) ||
1011 TransactionIdFollowsOrEquals(xmax
, builder
->xmax
)))
1013 builder
->xmax
= xmax
;
1014 TransactionIdAdvance(builder
->xmax
);
1017 /* if there's any reason to build a historic snapshot, do so now */
1021 * If we haven't built a complete snapshot yet there's no need to hand
1022 * it out, it wouldn't (and couldn't) be used anyway.
1024 if (builder
->state
< SNAPBUILD_FULL_SNAPSHOT
)
1028 * Decrease the snapshot builder's refcount of the old snapshot, note
1029 * that it still will be used if it has been handed out to the
1030 * reorderbuffer earlier.
1032 if (builder
->snapshot
)
1033 SnapBuildSnapDecRefcount(builder
->snapshot
);
1035 builder
->snapshot
= SnapBuildBuildSnapshot(builder
);
1037 /* we might need to execute invalidations, add snapshot */
1038 if (!ReorderBufferXidHasBaseSnapshot(builder
->reorder
, xid
))
1040 SnapBuildSnapIncRefcount(builder
->snapshot
);
1041 ReorderBufferSetBaseSnapshot(builder
->reorder
, xid
, lsn
,
1045 /* refcount of the snapshot builder for the new snapshot */
1046 SnapBuildSnapIncRefcount(builder
->snapshot
);
1048 /* add a new catalog snapshot to all currently running transactions */
1049 SnapBuildDistributeNewCatalogSnapshot(builder
, lsn
);
1054 * Check the reorder buffer and the snapshot to see if the given transaction has
1055 * modified catalogs.
1058 SnapBuildXidHasCatalogChanges(SnapBuild
*builder
, TransactionId xid
,
1061 if (ReorderBufferXidHasCatalogChanges(builder
->reorder
, xid
))
1065 * The transactions that have changed catalogs must have invalidation
1068 if (!(xinfo
& XACT_XINFO_HAS_INVALS
))
1071 /* Check the catchange XID array */
1072 return ((builder
->catchange
.xcnt
> 0) &&
1073 (bsearch(&xid
, builder
->catchange
.xip
, builder
->catchange
.xcnt
,
1074 sizeof(TransactionId
), xidComparator
) != NULL
));
1077 /* -----------------------------------
1078 * Snapshot building functions dealing with xlog records
1079 * -----------------------------------
1083 * Process a running xacts record, and use its information to first build a
1084 * historic snapshot and later to release resources that aren't needed
1088 SnapBuildProcessRunningXacts(SnapBuild
*builder
, XLogRecPtr lsn
, xl_running_xacts
*running
)
1090 ReorderBufferTXN
*txn
;
1094 * If we're not consistent yet, inspect the record to see whether it
1095 * allows to get closer to being consistent. If we are consistent, dump
1096 * our snapshot so others or we, after a restart, can use it.
1098 if (builder
->state
< SNAPBUILD_CONSISTENT
)
1100 /* returns false if there's no point in performing cleanup just yet */
1101 if (!SnapBuildFindSnapshot(builder
, lsn
, running
))
1105 SnapBuildSerialize(builder
, lsn
);
1108 * Update range of interesting xids based on the running xacts
1109 * information. We don't increase ->xmax using it, because once we are in
1110 * a consistent state we can do that ourselves and much more efficiently
1111 * so, because we only need to do it for catalog transactions since we
1112 * only ever look at those.
1114 * NB: We only increase xmax when a catalog modifying transaction commits
1115 * (see SnapBuildCommitTxn). Because of this, xmax can be lower than
1116 * xmin, which looks odd but is correct and actually more efficient, since
1117 * we hit fast paths in heapam_visibility.c.
1119 builder
->xmin
= running
->oldestRunningXid
;
1121 /* Remove transactions we don't need to keep track off anymore */
1122 SnapBuildPurgeOlderTxn(builder
);
1125 * Advance the xmin limit for the current replication slot, to allow
1126 * vacuum to clean up the tuples this slot has been protecting.
1128 * The reorderbuffer might have an xmin among the currently running
1129 * snapshots; use it if so. If not, we need only consider the snapshots
1130 * we'll produce later, which can't be less than the oldest running xid in
1131 * the record we're reading now.
1133 xmin
= ReorderBufferGetOldestXmin(builder
->reorder
);
1134 if (xmin
== InvalidTransactionId
)
1135 xmin
= running
->oldestRunningXid
;
1136 elog(DEBUG3
, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
1137 builder
->xmin
, builder
->xmax
, running
->oldestRunningXid
, xmin
);
1138 LogicalIncreaseXminForSlot(lsn
, xmin
);
1141 * Also tell the slot where we can restart decoding from. We don't want to
1142 * do that after every commit because changing that implies an fsync of
1143 * the logical slot's state file, so we only do it every time we see a
1144 * running xacts record.
1146 * Do so by looking for the oldest in progress transaction (determined by
1147 * the first LSN of any of its relevant records). Every transaction
1148 * remembers the last location we stored the snapshot to disk before its
1149 * beginning. That point is where we can restart from.
1153 * Can't know about a serialized snapshot's location if we're not
1156 if (builder
->state
< SNAPBUILD_CONSISTENT
)
1159 txn
= ReorderBufferGetOldestTXN(builder
->reorder
);
1162 * oldest ongoing txn might have started when we didn't yet serialize
1163 * anything because we hadn't reached a consistent state yet.
1165 if (txn
!= NULL
&& txn
->restart_decoding_lsn
!= InvalidXLogRecPtr
)
1166 LogicalIncreaseRestartDecodingForSlot(lsn
, txn
->restart_decoding_lsn
);
1169 * No in-progress transaction, can reuse the last serialized snapshot if
1172 else if (txn
== NULL
&&
1173 builder
->reorder
->current_restart_decoding_lsn
!= InvalidXLogRecPtr
&&
1174 builder
->last_serialized_snapshot
!= InvalidXLogRecPtr
)
1175 LogicalIncreaseRestartDecodingForSlot(lsn
,
1176 builder
->last_serialized_snapshot
);
1181 * Build the start of a snapshot that's capable of decoding the catalog.
1183 * Helper function for SnapBuildProcessRunningXacts() while we're not yet
1186 * Returns true if there is a point in performing internal maintenance/cleanup
1187 * using the xl_running_xacts record.
1190 SnapBuildFindSnapshot(SnapBuild
*builder
, XLogRecPtr lsn
, xl_running_xacts
*running
)
1193 * Build catalog decoding snapshot incrementally using information about
1194 * the currently running transactions. There are several ways to do that:
1196 * a) There were no running transactions when the xl_running_xacts record
1197 * was inserted, jump to CONSISTENT immediately. We might find such a
1198 * state while waiting on c)'s sub-states.
1200 * b) This (in a previous run) or another decoding slot serialized a
1201 * snapshot to disk that we can use. Can't use this method while finding
1202 * the start point for decoding changes as the restart LSN would be an
1203 * arbitrary LSN but we need to find the start point to extract changes
1204 * where we won't see the data for partial transactions. Also, we cannot
1205 * use this method when a slot needs a full snapshot for export or direct
1206 * use, as that snapshot will only contain catalog modifying transactions.
1208 * c) First incrementally build a snapshot for catalog tuples
1209 * (BUILDING_SNAPSHOT), that requires all, already in-progress,
1210 * transactions to finish. Every transaction starting after that
1211 * (FULL_SNAPSHOT state), has enough information to be decoded. But
1212 * for older running transactions no viable snapshot exists yet, so
1213 * CONSISTENT will only be reached once all of those have finished.
1218 * xl_running_xacts record is older than what we can use, we might not
1219 * have all necessary catalog rows anymore.
1221 if (TransactionIdIsNormal(builder
->initial_xmin_horizon
) &&
1222 NormalTransactionIdPrecedes(running
->oldestRunningXid
,
1223 builder
->initial_xmin_horizon
))
1226 (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
1227 LSN_FORMAT_ARGS(lsn
)),
1228 errdetail_internal("initial xmin horizon of %u vs the snapshot's %u",
1229 builder
->initial_xmin_horizon
, running
->oldestRunningXid
)));
1232 SnapBuildWaitSnapshot(running
, builder
->initial_xmin_horizon
);
1238 * a) No transaction were running, we can jump to consistent.
1240 * This is not affected by races around xl_running_xacts, because we can
1241 * miss transaction commits, but currently not transactions starting.
1243 * NB: We might have already started to incrementally assemble a snapshot,
1244 * so we need to be careful to deal with that.
1246 if (running
->oldestRunningXid
== running
->nextXid
)
1248 if (builder
->start_decoding_at
== InvalidXLogRecPtr
||
1249 builder
->start_decoding_at
<= lsn
)
1250 /* can decode everything after this */
1251 builder
->start_decoding_at
= lsn
+ 1;
1253 /* As no transactions were running xmin/xmax can be trivially set. */
1254 builder
->xmin
= running
->nextXid
; /* < are finished */
1255 builder
->xmax
= running
->nextXid
; /* >= are running */
1257 /* so we can safely use the faster comparisons */
1258 Assert(TransactionIdIsNormal(builder
->xmin
));
1259 Assert(TransactionIdIsNormal(builder
->xmax
));
1261 builder
->state
= SNAPBUILD_CONSISTENT
;
1262 builder
->next_phase_at
= InvalidTransactionId
;
1265 (errmsg("logical decoding found consistent point at %X/%X",
1266 LSN_FORMAT_ARGS(lsn
)),
1267 errdetail("There are no running transactions.")));
1273 * b) valid on disk state and while neither building full snapshot nor
1276 else if (!builder
->building_full_snapshot
&&
1277 !builder
->in_slot_creation
&&
1278 SnapBuildRestore(builder
, lsn
))
1280 /* there won't be any state to cleanup */
1285 * c) transition from START to BUILDING_SNAPSHOT.
1287 * In START state, and a xl_running_xacts record with running xacts is
1288 * encountered. In that case, switch to BUILDING_SNAPSHOT state, and
1289 * record xl_running_xacts->nextXid. Once all running xacts have finished
1290 * (i.e. they're all >= nextXid), we have a complete catalog snapshot. It
1291 * might look that we could use xl_running_xacts's ->xids information to
1292 * get there quicker, but that is problematic because transactions marked
1293 * as running, might already have inserted their commit record - it's
1294 * infeasible to change that with locking.
1296 else if (builder
->state
== SNAPBUILD_START
)
1298 builder
->state
= SNAPBUILD_BUILDING_SNAPSHOT
;
1299 builder
->next_phase_at
= running
->nextXid
;
1302 * Start with an xmin/xmax that's correct for future, when all the
1303 * currently running transactions have finished. We'll update both
1304 * while waiting for the pending transactions to finish.
1306 builder
->xmin
= running
->nextXid
; /* < are finished */
1307 builder
->xmax
= running
->nextXid
; /* >= are running */
1309 /* so we can safely use the faster comparisons */
1310 Assert(TransactionIdIsNormal(builder
->xmin
));
1311 Assert(TransactionIdIsNormal(builder
->xmax
));
1314 (errmsg("logical decoding found initial starting point at %X/%X",
1315 LSN_FORMAT_ARGS(lsn
)),
1316 errdetail("Waiting for transactions (approximately %d) older than %u to end.",
1317 running
->xcnt
, running
->nextXid
)));
1319 SnapBuildWaitSnapshot(running
, running
->nextXid
);
1323 * c) transition from BUILDING_SNAPSHOT to FULL_SNAPSHOT.
1325 * In BUILDING_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid
1326 * is >= than nextXid from when we switched to BUILDING_SNAPSHOT. This
1327 * means all transactions starting afterwards have enough information to
1328 * be decoded. Switch to FULL_SNAPSHOT.
1330 else if (builder
->state
== SNAPBUILD_BUILDING_SNAPSHOT
&&
1331 TransactionIdPrecedesOrEquals(builder
->next_phase_at
,
1332 running
->oldestRunningXid
))
1334 builder
->state
= SNAPBUILD_FULL_SNAPSHOT
;
1335 builder
->next_phase_at
= running
->nextXid
;
1338 (errmsg("logical decoding found initial consistent point at %X/%X",
1339 LSN_FORMAT_ARGS(lsn
)),
1340 errdetail("Waiting for transactions (approximately %d) older than %u to end.",
1341 running
->xcnt
, running
->nextXid
)));
1343 SnapBuildWaitSnapshot(running
, running
->nextXid
);
1347 * c) transition from FULL_SNAPSHOT to CONSISTENT.
1349 * In FULL_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid is
1350 * >= than nextXid from when we switched to FULL_SNAPSHOT. This means all
1351 * transactions that are currently in progress have a catalog snapshot,
1352 * and all their changes have been collected. Switch to CONSISTENT.
1354 else if (builder
->state
== SNAPBUILD_FULL_SNAPSHOT
&&
1355 TransactionIdPrecedesOrEquals(builder
->next_phase_at
,
1356 running
->oldestRunningXid
))
1358 builder
->state
= SNAPBUILD_CONSISTENT
;
1359 builder
->next_phase_at
= InvalidTransactionId
;
1362 (errmsg("logical decoding found consistent point at %X/%X",
1363 LSN_FORMAT_ARGS(lsn
)),
1364 errdetail("There are no old transactions anymore.")));
1368 * We already started to track running xacts and need to wait for all
1369 * in-progress ones to finish. We fall through to the normal processing of
1370 * records so incremental cleanup can be performed.
1376 * Iterate through xids in record, wait for all older than the cutoff to
1377 * finish. Then, if possible, log a new xl_running_xacts record.
1379 * This isn't required for the correctness of decoding, but to:
1380 * a) allow isolationtester to notice that we're currently waiting for
1382 * b) log a new xl_running_xacts record where it'd be helpful, without having
1383 * to wait for bgwriter or checkpointer.
1387 SnapBuildWaitSnapshot(xl_running_xacts
*running
, TransactionId cutoff
)
1391 for (off
= 0; off
< running
->xcnt
; off
++)
1393 TransactionId xid
= running
->xids
[off
];
1396 * Upper layers should prevent that we ever need to wait on ourselves.
1397 * Check anyway, since failing to do so would either result in an
1398 * endless wait or an Assert() failure.
1400 if (TransactionIdIsCurrentTransactionId(xid
))
1401 elog(ERROR
, "waiting for ourselves");
1403 if (TransactionIdFollows(xid
, cutoff
))
1406 XactLockTableWait(xid
, NULL
, NULL
, XLTW_None
);
1410 * All transactions we needed to finish finished - try to ensure there is
1411 * another xl_running_xacts record in a timely manner, without having to
1412 * wait for bgwriter or checkpointer to log one. During recovery we can't
1413 * enforce that, so we'll have to wait.
1415 if (!RecoveryInProgress())
1417 LogStandbySnapshot();
/*
 * Size of the fixed-size leading part of a serialized snapshot, i.e.
 * everything in SnapBuildOnDisk that precedes the embedded 'builder' member.
 * This is the portion read first when a snapshot file is restored.
 */
#define SnapBuildOnDiskConstantSize \
	(offsetof(SnapBuildOnDisk, builder))

/*
 * Number of leading bytes of SnapBuildOnDisk that are not covered by the
 * checksum: everything before the 'version' member.  CRC computation starts
 * at this offset.
 */
#define SnapBuildOnDiskNotChecksummedSize \
	(offsetof(SnapBuildOnDisk, version))

/* Magic number and format version stored in, and checked against, each file */
#define SNAPBUILD_MAGIC 0x51A1E001
#define SNAPBUILD_VERSION 6
1430 * Store/Load a snapshot from disk, depending on the snapshot builder's state.
1432 * Supposed to be used by external (i.e. not snapbuild.c) code that just read
1433 * a record that's a potential location for a serialized snapshot.
1436 SnapBuildSerializationPoint(SnapBuild
*builder
, XLogRecPtr lsn
)
1438 if (builder
->state
< SNAPBUILD_CONSISTENT
)
1439 SnapBuildRestore(builder
, lsn
);
1441 SnapBuildSerialize(builder
, lsn
);
1445 * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
1446 * been done by another decoding process.
1449 SnapBuildSerialize(SnapBuild
*builder
, XLogRecPtr lsn
)
1452 SnapBuildOnDisk
*ondisk
= NULL
;
1453 TransactionId
*catchange_xip
= NULL
;
1454 MemoryContext old_ctx
;
1455 size_t catchange_xcnt
;
1458 char tmppath
[MAXPGPATH
];
1459 char path
[MAXPGPATH
];
1461 struct stat stat_buf
;
1464 Assert(lsn
!= InvalidXLogRecPtr
);
1465 Assert(builder
->last_serialized_snapshot
== InvalidXLogRecPtr
||
1466 builder
->last_serialized_snapshot
<= lsn
);
1469 * no point in serializing if we cannot continue to work immediately after
1470 * restoring the snapshot
1472 if (builder
->state
< SNAPBUILD_CONSISTENT
)
1475 /* consistent snapshots have no next phase */
1476 Assert(builder
->next_phase_at
== InvalidTransactionId
);
1479 * We identify snapshots by the LSN they are valid for. We don't need to
1480 * include timelines in the name as each LSN maps to exactly one timeline
1481 * unless the user used pg_resetwal or similar. If a user did so, there's
1482 * no hope continuing to decode anyway.
1484 sprintf(path
, "%s/%X-%X.snap",
1485 PG_LOGICAL_SNAPSHOTS_DIR
,
1486 LSN_FORMAT_ARGS(lsn
));
1489 * first check whether some other backend already has written the snapshot
1490 * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
1491 * as a valid state. Everything else is an unexpected error.
1493 ret
= stat(path
, &stat_buf
);
1495 if (ret
!= 0 && errno
!= ENOENT
)
1497 (errcode_for_file_access(),
1498 errmsg("could not stat file \"%s\": %m", path
)));
1503 * somebody else has already serialized to this point, don't overwrite
1504 * but remember location, so we don't need to read old data again.
1506 * To be sure it has been synced to disk after the rename() from the
1507 * tempfile filename to the real filename, we just repeat the fsync.
1508 * That ought to be cheap because in most scenarios it should already
1509 * be safely on disk.
1511 fsync_fname(path
, false);
1512 fsync_fname(PG_LOGICAL_SNAPSHOTS_DIR
, true);
1514 builder
->last_serialized_snapshot
= lsn
;
1519 * there is an obvious race condition here between the time we stat(2) the
1520 * file and us writing the file. But we rename the file into place
1521 * atomically and all files created need to contain the same data anyway,
1522 * so this is perfectly fine, although a bit of a resource waste. Locking
1523 * seems like pointless complication.
1525 elog(DEBUG1
, "serializing snapshot to %s", path
);
1527 /* to make sure only we will write to this tempfile, include pid */
1528 sprintf(tmppath
, "%s/%X-%X.snap.%d.tmp",
1529 PG_LOGICAL_SNAPSHOTS_DIR
,
1530 LSN_FORMAT_ARGS(lsn
), MyProcPid
);
1533 * Unlink temporary file if it already exists, needs to have been before a
1534 * crash/error since we won't enter this function twice from within a
1535 * single decoding slot/backend and the temporary file contains the pid of
1536 * the current process.
1538 if (unlink(tmppath
) != 0 && errno
!= ENOENT
)
1540 (errcode_for_file_access(),
1541 errmsg("could not remove file \"%s\": %m", tmppath
)));
1543 old_ctx
= MemoryContextSwitchTo(builder
->context
);
1545 /* Get the catalog modifying transactions that are yet not committed */
1546 catchange_xip
= ReorderBufferGetCatalogChangesXacts(builder
->reorder
);
1547 catchange_xcnt
= dclist_count(&builder
->reorder
->catchange_txns
);
1549 needed_length
= sizeof(SnapBuildOnDisk
) +
1550 sizeof(TransactionId
) * (builder
->committed
.xcnt
+ catchange_xcnt
);
1552 ondisk_c
= palloc0(needed_length
);
1553 ondisk
= (SnapBuildOnDisk
*) ondisk_c
;
1554 ondisk
->magic
= SNAPBUILD_MAGIC
;
1555 ondisk
->version
= SNAPBUILD_VERSION
;
1556 ondisk
->length
= needed_length
;
1557 INIT_CRC32C(ondisk
->checksum
);
1558 COMP_CRC32C(ondisk
->checksum
,
1559 ((char *) ondisk
) + SnapBuildOnDiskNotChecksummedSize
,
1560 SnapBuildOnDiskConstantSize
- SnapBuildOnDiskNotChecksummedSize
);
1561 ondisk_c
+= sizeof(SnapBuildOnDisk
);
1563 memcpy(&ondisk
->builder
, builder
, sizeof(SnapBuild
));
1564 /* NULL-ify memory-only data */
1565 ondisk
->builder
.context
= NULL
;
1566 ondisk
->builder
.snapshot
= NULL
;
1567 ondisk
->builder
.reorder
= NULL
;
1568 ondisk
->builder
.committed
.xip
= NULL
;
1569 ondisk
->builder
.catchange
.xip
= NULL
;
1570 /* update catchange only on disk data */
1571 ondisk
->builder
.catchange
.xcnt
= catchange_xcnt
;
1573 COMP_CRC32C(ondisk
->checksum
,
1577 /* copy committed xacts */
1578 if (builder
->committed
.xcnt
> 0)
1580 sz
= sizeof(TransactionId
) * builder
->committed
.xcnt
;
1581 memcpy(ondisk_c
, builder
->committed
.xip
, sz
);
1582 COMP_CRC32C(ondisk
->checksum
, ondisk_c
, sz
);
1586 /* copy catalog modifying xacts */
1587 if (catchange_xcnt
> 0)
1589 sz
= sizeof(TransactionId
) * catchange_xcnt
;
1590 memcpy(ondisk_c
, catchange_xip
, sz
);
1591 COMP_CRC32C(ondisk
->checksum
, ondisk_c
, sz
);
1595 FIN_CRC32C(ondisk
->checksum
);
1597 /* we have valid data now, open tempfile and write it there */
1598 fd
= OpenTransientFile(tmppath
,
1599 O_CREAT
| O_EXCL
| O_WRONLY
| PG_BINARY
);
1602 (errcode_for_file_access(),
1603 errmsg("could not open file \"%s\": %m", tmppath
)));
1606 pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE
);
1607 if ((write(fd
, ondisk
, needed_length
)) != needed_length
)
1609 int save_errno
= errno
;
1611 CloseTransientFile(fd
);
1613 /* if write didn't set errno, assume problem is no disk space */
1614 errno
= save_errno
? save_errno
: ENOSPC
;
1616 (errcode_for_file_access(),
1617 errmsg("could not write to file \"%s\": %m", tmppath
)));
1619 pgstat_report_wait_end();
1622 * fsync the file before renaming so that even if we crash after this we
1623 * have either a fully valid file or nothing.
1625 * It's safe to just ERROR on fsync() here because we'll retry the whole
1626 * operation including the writes.
1628 * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
1629 * some noticeable overhead since it's performed synchronously during
1632 pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_SYNC
);
1633 if (pg_fsync(fd
) != 0)
1635 int save_errno
= errno
;
1637 CloseTransientFile(fd
);
1640 (errcode_for_file_access(),
1641 errmsg("could not fsync file \"%s\": %m", tmppath
)));
1643 pgstat_report_wait_end();
1645 if (CloseTransientFile(fd
) != 0)
1647 (errcode_for_file_access(),
1648 errmsg("could not close file \"%s\": %m", tmppath
)));
1650 fsync_fname(PG_LOGICAL_SNAPSHOTS_DIR
, true);
1653 * We may overwrite the work from some other backend, but that's ok, our
1654 * snapshot is valid as well, we'll just have done some superfluous work.
1656 if (rename(tmppath
, path
) != 0)
1659 (errcode_for_file_access(),
1660 errmsg("could not rename file \"%s\" to \"%s\": %m",
1664 /* make sure we persist */
1665 fsync_fname(path
, false);
1666 fsync_fname(PG_LOGICAL_SNAPSHOTS_DIR
, true);
1669 * Now there's no way we can lose the dumped state anymore, remember this
1670 * as a serialization point.
1672 builder
->last_serialized_snapshot
= lsn
;
1674 MemoryContextSwitchTo(old_ctx
);
1677 ReorderBufferSetRestartPoint(builder
->reorder
,
1678 builder
->last_serialized_snapshot
);
1683 pfree(catchange_xip
);
1687 * Restore the logical snapshot file contents to 'ondisk'.
1689 * 'context' is the memory context where the catalog modifying/committed xid
1691 * If 'missing_ok' is true, will not throw an error if the file is not found.
1694 SnapBuildRestoreSnapshot(SnapBuildOnDisk
*ondisk
, const char *path
,
1695 MemoryContext context
, bool missing_ok
)
1701 fd
= OpenTransientFile(path
, O_RDONLY
| PG_BINARY
);
1705 if (missing_ok
&& errno
== ENOENT
)
1709 (errcode_for_file_access(),
1710 errmsg("could not open file \"%s\": %m", path
)));
1714 * Make sure the snapshot had been stored safely to disk, that's normally
1716 * Note that we do not need PANIC here, nobody will be able to use the
1717 * slot without fsyncing, and saving it won't succeed without an fsync()
1721 fsync_fname(path
, false);
1722 fsync_fname(PG_LOGICAL_SNAPSHOTS_DIR
, true);
1724 /* read statically sized portion of snapshot */
1725 SnapBuildRestoreContents(fd
, (char *) ondisk
, SnapBuildOnDiskConstantSize
, path
);
1727 if (ondisk
->magic
!= SNAPBUILD_MAGIC
)
1729 (errcode(ERRCODE_DATA_CORRUPTED
),
1730 errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u",
1731 path
, ondisk
->magic
, SNAPBUILD_MAGIC
)));
1733 if (ondisk
->version
!= SNAPBUILD_VERSION
)
1735 (errcode(ERRCODE_DATA_CORRUPTED
),
1736 errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u",
1737 path
, ondisk
->version
, SNAPBUILD_VERSION
)));
1739 INIT_CRC32C(checksum
);
1740 COMP_CRC32C(checksum
,
1741 ((char *) ondisk
) + SnapBuildOnDiskNotChecksummedSize
,
1742 SnapBuildOnDiskConstantSize
- SnapBuildOnDiskNotChecksummedSize
);
1744 /* read SnapBuild */
1745 SnapBuildRestoreContents(fd
, (char *) &ondisk
->builder
, sizeof(SnapBuild
), path
);
1746 COMP_CRC32C(checksum
, &ondisk
->builder
, sizeof(SnapBuild
));
1748 /* restore committed xacts information */
1749 if (ondisk
->builder
.committed
.xcnt
> 0)
1751 sz
= sizeof(TransactionId
) * ondisk
->builder
.committed
.xcnt
;
1752 ondisk
->builder
.committed
.xip
= MemoryContextAllocZero(context
, sz
);
1753 SnapBuildRestoreContents(fd
, (char *) ondisk
->builder
.committed
.xip
, sz
, path
);
1754 COMP_CRC32C(checksum
, ondisk
->builder
.committed
.xip
, sz
);
1757 /* restore catalog modifying xacts information */
1758 if (ondisk
->builder
.catchange
.xcnt
> 0)
1760 sz
= sizeof(TransactionId
) * ondisk
->builder
.catchange
.xcnt
;
1761 ondisk
->builder
.catchange
.xip
= MemoryContextAllocZero(context
, sz
);
1762 SnapBuildRestoreContents(fd
, (char *) ondisk
->builder
.catchange
.xip
, sz
, path
);
1763 COMP_CRC32C(checksum
, ondisk
->builder
.catchange
.xip
, sz
);
1766 if (CloseTransientFile(fd
) != 0)
1768 (errcode_for_file_access(),
1769 errmsg("could not close file \"%s\": %m", path
)));
1771 FIN_CRC32C(checksum
);
1773 /* verify checksum of what we've read */
1774 if (!EQ_CRC32C(checksum
, ondisk
->checksum
))
1776 (errcode(ERRCODE_DATA_CORRUPTED
),
1777 errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u",
1778 path
, checksum
, ondisk
->checksum
)));
1784 * Restore a snapshot into 'builder' if previously one has been stored at the
1785 * location indicated by 'lsn'. Returns true if successful, false otherwise.
1788 SnapBuildRestore(SnapBuild
*builder
, XLogRecPtr lsn
)
1790 SnapBuildOnDisk ondisk
;
1791 char path
[MAXPGPATH
];
1793 /* no point in loading a snapshot if we're already there */
1794 if (builder
->state
== SNAPBUILD_CONSISTENT
)
1797 sprintf(path
, "%s/%X-%X.snap",
1798 PG_LOGICAL_SNAPSHOTS_DIR
,
1799 LSN_FORMAT_ARGS(lsn
));
1801 /* validate and restore the snapshot to 'ondisk' */
1802 if (!SnapBuildRestoreSnapshot(&ondisk
, path
, builder
->context
, true))
1806 * ok, we now have a sensible snapshot here, figure out if it has more
1807 * information than we have.
1811 * We are only interested in consistent snapshots for now, comparing
1812 * whether one incomplete snapshot is more "advanced" seems to be
1813 * unnecessarily complex.
1815 if (ondisk
.builder
.state
< SNAPBUILD_CONSISTENT
)
1816 goto snapshot_not_interesting
;
1819 * Don't use a snapshot that requires an xmin that we cannot guarantee to
1822 if (TransactionIdPrecedes(ondisk
.builder
.xmin
, builder
->initial_xmin_horizon
))
1823 goto snapshot_not_interesting
;
1826 * Consistent snapshots have no next phase. Reset next_phase_at as it is
1827 * possible that an old value may remain.
1829 Assert(ondisk
.builder
.next_phase_at
== InvalidTransactionId
);
1830 builder
->next_phase_at
= InvalidTransactionId
;
1832 /* ok, we think the snapshot is sensible, copy over everything important */
1833 builder
->xmin
= ondisk
.builder
.xmin
;
1834 builder
->xmax
= ondisk
.builder
.xmax
;
1835 builder
->state
= ondisk
.builder
.state
;
1837 builder
->committed
.xcnt
= ondisk
.builder
.committed
.xcnt
;
1838 /* We only allocated/stored xcnt, not xcnt_space xids ! */
1839 /* don't overwrite preallocated xip, if we don't have anything here */
1840 if (builder
->committed
.xcnt
> 0)
1842 pfree(builder
->committed
.xip
);
1843 builder
->committed
.xcnt_space
= ondisk
.builder
.committed
.xcnt
;
1844 builder
->committed
.xip
= ondisk
.builder
.committed
.xip
;
1846 ondisk
.builder
.committed
.xip
= NULL
;
1848 /* set catalog modifying transactions */
1849 if (builder
->catchange
.xip
)
1850 pfree(builder
->catchange
.xip
);
1851 builder
->catchange
.xcnt
= ondisk
.builder
.catchange
.xcnt
;
1852 builder
->catchange
.xip
= ondisk
.builder
.catchange
.xip
;
1853 ondisk
.builder
.catchange
.xip
= NULL
;
1855 /* our snapshot is not interesting anymore, build a new one */
1856 if (builder
->snapshot
!= NULL
)
1858 SnapBuildSnapDecRefcount(builder
->snapshot
);
1860 builder
->snapshot
= SnapBuildBuildSnapshot(builder
);
1861 SnapBuildSnapIncRefcount(builder
->snapshot
);
1863 ReorderBufferSetRestartPoint(builder
->reorder
, lsn
);
1865 Assert(builder
->state
== SNAPBUILD_CONSISTENT
);
1868 (errmsg("logical decoding found consistent point at %X/%X",
1869 LSN_FORMAT_ARGS(lsn
)),
1870 errdetail("Logical decoding will begin using saved snapshot.")));
1873 snapshot_not_interesting
:
1874 if (ondisk
.builder
.committed
.xip
!= NULL
)
1875 pfree(ondisk
.builder
.committed
.xip
);
1876 if (ondisk
.builder
.catchange
.xip
!= NULL
)
1877 pfree(ondisk
.builder
.catchange
.xip
);
1882 * Read the contents of the serialized snapshot to 'dest'.
1885 SnapBuildRestoreContents(int fd
, char *dest
, Size size
, const char *path
)
1889 pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ
);
1890 readBytes
= read(fd
, dest
, size
);
1891 pgstat_report_wait_end();
1892 if (readBytes
!= size
)
1894 int save_errno
= errno
;
1896 CloseTransientFile(fd
);
1902 (errcode_for_file_access(),
1903 errmsg("could not read file \"%s\": %m", path
)));
1907 (errcode(ERRCODE_DATA_CORRUPTED
),
1908 errmsg("could not read file \"%s\": read %d of %zu",
1909 path
, readBytes
, size
)));
1914 * Remove all serialized snapshots that are not required anymore because no
1915 * slot can need them. This doesn't actually have to run during a checkpoint,
1916 * but it's a convenient point to schedule this.
1918 * NB: We run this during checkpoints even if logical decoding is disabled so
1919 * we cleanup old slots at some point after it got disabled.
1922 CheckPointSnapBuild(void)
1927 struct dirent
*snap_de
;
1928 char path
[MAXPGPATH
+ sizeof(PG_LOGICAL_SNAPSHOTS_DIR
)];
1931 * We start off with a minimum of the last redo pointer. No new
1932 * replication slot will start before that, so that's a safe upper bound
1935 redo
= GetRedoRecPtr();
1937 /* now check for the restart ptrs from existing slots */
1938 cutoff
= ReplicationSlotsComputeLogicalRestartLSN();
1940 /* don't start earlier than the restart lsn */
1944 snap_dir
= AllocateDir(PG_LOGICAL_SNAPSHOTS_DIR
);
1945 while ((snap_de
= ReadDir(snap_dir
, PG_LOGICAL_SNAPSHOTS_DIR
)) != NULL
)
1952 if (strcmp(snap_de
->d_name
, ".") == 0 ||
1953 strcmp(snap_de
->d_name
, "..") == 0)
1956 snprintf(path
, sizeof(path
), "%s/%s", PG_LOGICAL_SNAPSHOTS_DIR
, snap_de
->d_name
);
1957 de_type
= get_dirent_type(path
, snap_de
, false, DEBUG1
);
1959 if (de_type
!= PGFILETYPE_ERROR
&& de_type
!= PGFILETYPE_REG
)
1961 elog(DEBUG1
, "only regular files expected: %s", path
);
1966 * temporary filenames from SnapBuildSerialize() include the LSN and
1967 * everything but are postfixed by .$pid.tmp. We can just remove them
1968 * the same as other files because there can be none that are
1969 * currently being written that are older than cutoff.
1971 * We just log a message if a file doesn't fit the pattern, it's
1972 * probably some editors lock/state file or similar...
1974 if (sscanf(snap_de
->d_name
, "%X-%X.snap", &hi
, &lo
) != 2)
1977 (errmsg("could not parse file name \"%s\"", path
)));
1981 lsn
= ((uint64
) hi
) << 32 | lo
;
1983 /* check whether we still need it */
1984 if (lsn
< cutoff
|| cutoff
== InvalidXLogRecPtr
)
1986 elog(DEBUG1
, "removing snapbuild snapshot %s", path
);
1989 * It's not particularly harmful, though strange, if we can't
1990 * remove the file here. Don't prevent the checkpoint from
1991 * completing, that'd be a cure worse than the disease.
1993 if (unlink(path
) < 0)
1996 (errcode_for_file_access(),
1997 errmsg("could not remove file \"%s\": %m",
2007 * Check if a logical snapshot at the specified point has been serialized.
2010 SnapBuildSnapshotExists(XLogRecPtr lsn
)
2012 char path
[MAXPGPATH
];
2014 struct stat stat_buf
;
2016 sprintf(path
, "%s/%X-%X.snap",
2017 PG_LOGICAL_SNAPSHOTS_DIR
,
2018 LSN_FORMAT_ARGS(lsn
));
2020 ret
= stat(path
, &stat_buf
);
2022 if (ret
!= 0 && errno
!= ENOENT
)
2024 (errcode_for_file_access(),
2025 errmsg("could not stat file \"%s\": %m", path
)));