/*-------------------------------------------------------------------------
 *
 * reorderbuffer.c
 *	  PostgreSQL logical replay/reorder buffer management
 *
 * Copyright (c) 2012-2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/replication/logical/reorderbuffer.c
 *
 * NOTES
 *	  This module gets handed individual pieces of transactions in the order
 *	  they are written to the WAL and is responsible for reassembling them
 *	  into toplevel-transaction-sized pieces. When a transaction is completely
 *	  reassembled - signaled by reading the transaction commit record - it
 *	  will then call the output plugin (cf. ReorderBufferCommit()) with the
 *	  individual changes. The output plugins rely on snapshots built by
 *	  snapbuild.c which hands them to us.
 *
 *	  Transactions and subtransactions/savepoints in postgres are not
 *	  immediately linked to each other from outside the performing
 *	  backend. They are linked together only at commit/abort (or by special
 *	  xact_assignment records). This means that we have to splice together a
 *	  toplevel transaction from its subtransactions. To do that efficiently we
 *	  build a binary heap indexed by the smallest current lsn of the individual
 *	  subtransactions' changestreams. As the individual streams are inherently
 *	  ordered by LSN - since that is where we build them from - the transaction
 *	  can easily be reassembled by always using the subtransaction with the
 *	  smallest current LSN from the heap.
 *
 *	  In order to cope with large transactions - which can be several times as
 *	  big as the available memory - this module supports spooling the contents
 *	  of large transactions to disk. When the transaction is replayed the
 *	  contents of individual (sub-)transactions will be read from disk in
 *	  chunks.
 *
 *	  This module also has to deal with reassembling toast records from the
 *	  individual chunks stored in WAL. When a new (or initial) version of a
 *	  tuple is stored in WAL it will always be preceded by the toast chunks
 *	  emitted for the columns stored out of line. Within a single toplevel
 *	  transaction there will be no other data carrying records between a row's
 *	  toast chunks and the row data itself. See ReorderBufferToast* for
 *	  details.
 *
 *	  ReorderBuffer uses two special memory context types - SlabContext for
 *	  allocations of fixed-length structures (changes and transactions), and
 *	  GenerationContext for the variable-length transaction data (allocated
 *	  and freed in groups with similar lifespans).
 *
 *	  To limit the amount of memory used by decoded changes, we track memory
 *	  used at the reorder buffer level (i.e. total amount of memory), and for
 *	  each transaction. When the total amount of used memory exceeds the
 *	  limit, the transaction consuming the most memory is then serialized to
 *	  disk.
 *
 *	  Only decoded changes are evicted from memory (spilled to disk), not the
 *	  transaction records. The number of toplevel transactions is limited,
 *	  but a transaction with many subtransactions may still consume significant
 *	  amounts of memory. However, the transaction records are fairly small and
 *	  are not included in the memory limit.
 *
 *	  The current eviction algorithm is very simple - the transaction is
 *	  picked merely by size, while it might be useful to also consider the
 *	  age (LSN) of the changes, for example. With the new Generational memory
 *	  allocator, evicting the oldest changes would make it more likely that
 *	  the memory actually gets freed.
 *
 *	  We use a max-heap with transaction size as the key to efficiently find
 *	  the largest transaction. We update the max-heap whenever the memory
 *	  counter is updated; however, transactions with size 0 are not stored in
 *	  the heap, because they have no changes to evict. (A short illustrative
 *	  sketch of this eviction loop follows this header comment.)
 *
 *	  We still rely on max_changes_in_memory when loading serialized changes
 *	  back into memory. At that point we can't use the memory limit directly
 *	  as we load the subxacts independently. One option to deal with this
 *	  would be to count the subxacts, and allow each to allocate 1/N of the
 *	  memory limit. That however does not seem very appealing, because with
 *	  many subtransactions it may easily cause thrashing (short cycles of
 *	  deserializing and applying very few changes). We should probably give
 *	  a bit more memory to the oldest subtransactions, because it's likely
 *	  they are the source for the next sequence of changes.
 *
 * -------------------------------------------------------------------------
 */
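/*
 * Illustrative sketch of the eviction loop referenced above (simplified; the
 * real logic lives in ReorderBufferCheckMemoryLimit() further down and also
 * handles streaming and the debug_logical_replication_streaming GUC):
 *
 *	while (rb->size >= logical_decoding_work_mem * 1024L)
 *	{
 *		// the pairing max-heap yields the transaction using the most memory
 *		ReorderBufferTXN *txn = ReorderBufferLargestTXN(rb);
 *
 *		// spill its decoded changes to disk, dropping them from memory
 *		ReorderBufferSerializeTXN(rb, txn);
 *	}
 */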
#include "postgres.h"

#include "access/detoast.h"
#include "access/heapam.h"
#include "access/rewriteheap.h"
#include "access/transam.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "catalog/catalog.h"
#include "common/int.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "replication/logical.h"
#include "replication/reorderbuffer.h"
#include "replication/slot.h"
#include "replication/snapbuild.h"	/* just for SnapBuildSnapDecRefcount */
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/sinval.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/relfilenumbermap.h"
/* entry for a hash table we use to map from xid to our transaction state */
typedef struct ReorderBufferTXNByIdEnt
{
	TransactionId xid;
	ReorderBufferTXN *txn;
} ReorderBufferTXNByIdEnt;

/* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
typedef struct ReorderBufferTupleCidKey
{
	RelFileLocator rlocator;
	ItemPointerData tid;
} ReorderBufferTupleCidKey;

typedef struct ReorderBufferTupleCidEnt
{
	ReorderBufferTupleCidKey key;
	CommandId	cmin;
	CommandId	cmax;
	CommandId	combocid;		/* just for debugging */
} ReorderBufferTupleCidEnt;

/* Virtual file descriptor with file offset tracking */
typedef struct TXNEntryFile
{
	File		vfd;			/* -1 when the file is closed */
	off_t		curOffset;		/* offset for next write or read. Reset to 0
								 * when vfd is opened. */
} TXNEntryFile;
/* k-way in-order change iteration support structures */
typedef struct ReorderBufferIterTXNEntry
{
	XLogRecPtr	lsn;
	ReorderBufferChange *change;
	ReorderBufferTXN *txn;
	TXNEntryFile file;
	XLogSegNo	segno;
} ReorderBufferIterTXNEntry;

typedef struct ReorderBufferIterTXNState
{
	binaryheap *heap;
	Size		nr_txns;
	dlist_head	old_change;
	ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
} ReorderBufferIterTXNState;

/* toast datastructures */
typedef struct ReorderBufferToastEnt
{
	Oid			chunk_id;		/* toast_table.chunk_id */
	int32		last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
								 * have seen */
	Size		num_chunks;		/* number of chunks we've already seen */
	Size		size;			/* combined size of chunks seen */
	dlist_head	chunks;			/* linked list of chunks */
	struct varlena *reconstructed;	/* reconstructed varlena now pointed to in
									 * main tup */
} ReorderBufferToastEnt;

/* Disk serialization support datastructures */
typedef struct ReorderBufferDiskChange
{
	Size		size;
	ReorderBufferChange change;
	/* data follows */
} ReorderBufferDiskChange;
#define IsSpecInsert(action) \
( \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
)
#define IsSpecConfirmOrAbort(action) \
( \
	(((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \
)
#define IsInsertOrUpdate(action) \
( \
	(((action) == REORDER_BUFFER_CHANGE_INSERT) || \
	((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
)
/*
 * Maximum number of changes kept in memory, per transaction. After that,
 * changes are spooled to disk.
 *
 * The current value should be sufficient to decode the entire transaction
 * without hitting disk in OLTP workloads, while starting to spool to disk in
 * other workloads reasonably fast.
 *
 * At some point in the future it probably makes sense to have a more elaborate
 * resource management here, but it's not entirely clear what that would look
 * like.
 */
int			logical_decoding_work_mem;
static const Size max_changes_in_memory = 4096; /* XXX for restore only */

/* GUC variable */
int			debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
/* ---------------------------------------
 * primary reorderbuffer support routines
 * ---------------------------------------
 */
static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb);
static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
											   TransactionId xid, bool create, bool *is_new,
											   XLogRecPtr lsn, bool create_as_top);
static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
											  ReorderBufferTXN *subtxn);

static void AssertTXNLsnOrder(ReorderBuffer *rb);

/* ---------------------------------------
 * support functions for lsn-order iterating over the ->changes of a
 * transaction and its subtransactions
 *
 * used for iteration over the k-way heap merge of a transaction and its
 * subtransactions
 * ---------------------------------------
 */
static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
									 ReorderBufferIterTXNState *volatile *iter_state);
static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
									   ReorderBufferIterTXNState *state);
static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);

/*
 * ---------------------------------------
 * Disk serialization support functions
 * ---------------------------------------
 */
static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
										 int fd, ReorderBufferChange *change);
static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
										TXNEntryFile *file, XLogSegNo *segno);
static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
									   char *data);
static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
									 bool txn_prepared);
static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
										TransactionId xid, XLogSegNo segno);
static int	ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);

static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
									  ReorderBufferTXN *txn, CommandId cid);

/*
 * ---------------------------------------
 * Streaming support functions
 * ---------------------------------------
 */
static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);

/* ---------------------------------------
 * toast reassembly support
 * ---------------------------------------
 */
static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
									  Relation relation, ReorderBufferChange *change);
static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
										  Relation relation, ReorderBufferChange *change);

/*
 * ---------------------------------------
 * memory accounting
 * ---------------------------------------
 */
static Size ReorderBufferChangeSize(ReorderBufferChange *change);
static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
											ReorderBufferChange *change,
											ReorderBufferTXN *txn,
											bool addition, Size sz);
/*
 * Allocate a new ReorderBuffer and clean out any old serialized state from
 * prior ReorderBuffer instances for the same slot.
 */
ReorderBuffer *
ReorderBufferAllocate(void)
{
	ReorderBuffer *buffer;
	HASHCTL		hash_ctl;
	MemoryContext new_ctx;

	Assert(MyReplicationSlot != NULL);

	/* allocate memory in own context, to have better accountability */
	new_ctx = AllocSetContextCreate(CurrentMemoryContext,
									"ReorderBuffer",
									ALLOCSET_DEFAULT_SIZES);

	buffer =
		(ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));

	memset(&hash_ctl, 0, sizeof(hash_ctl));

	buffer->context = new_ctx;

	buffer->change_context = SlabContextCreate(new_ctx,
											   "Change",
											   SLAB_DEFAULT_BLOCK_SIZE,
											   sizeof(ReorderBufferChange));

	buffer->txn_context = SlabContextCreate(new_ctx,
											"TXN",
											SLAB_DEFAULT_BLOCK_SIZE,
											sizeof(ReorderBufferTXN));

	/*
	 * To minimize memory fragmentation caused by long-running transactions
	 * with changes spanning multiple memory blocks, we use a single
	 * fixed-size memory block for decoded tuple storage. Performance testing
	 * showed that the default memory block size maintains logical decoding
	 * performance without causing fragmentation due to concurrent
	 * transactions. One might think that we could use the max size,
	 * SLAB_LARGE_BLOCK_SIZE, but the tests also showed it doesn't help
	 * resolve the memory fragmentation.
	 */
	buffer->tup_context = GenerationContextCreate(new_ctx,
												  "Tuples",
												  SLAB_DEFAULT_BLOCK_SIZE,
												  SLAB_DEFAULT_BLOCK_SIZE,
												  SLAB_DEFAULT_BLOCK_SIZE);

	hash_ctl.keysize = sizeof(TransactionId);
	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
	hash_ctl.hcxt = buffer->context;

	buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
								 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	buffer->by_txn_last_xid = InvalidTransactionId;
	buffer->by_txn_last_txn = NULL;

	buffer->outbuf = NULL;
	buffer->outbufsize = 0;
	buffer->size = 0;

	/* txn_heap is ordered by transaction size */
	buffer->txn_heap = pairingheap_allocate(ReorderBufferTXNSizeCompare, NULL);

	buffer->spillTxns = 0;
	buffer->spillCount = 0;
	buffer->spillBytes = 0;
	buffer->streamTxns = 0;
	buffer->streamCount = 0;
	buffer->streamBytes = 0;
	buffer->totalTxns = 0;
	buffer->totalBytes = 0;

	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;

	dlist_init(&buffer->toplevel_by_lsn);
	dlist_init(&buffer->txns_by_base_snapshot_lsn);
	dclist_init(&buffer->catchange_txns);

	/*
	 * Ensure there's no stale data from prior uses of this slot, in case some
	 * prior exit avoided calling ReorderBufferFree. Failure to do this can
	 * produce duplicated txns, and it's very cheap if there's nothing there.
	 */
	ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));

	return buffer;
}
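/*
 * Typical usage, as seen from the logical decoding machinery (a simplified
 * sketch; the authoritative caller is the decoding-context startup code in
 * logical.c):
 *
 *	ctx->reorder = ReorderBufferAllocate();
 *	ctx->reorder->private_data = ctx;
 *	...decode WAL, queueing changes into ctx->reorder...
 *	ReorderBufferFree(ctx->reorder);
 */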
/*
 * Free a ReorderBuffer
 */
void
ReorderBufferFree(ReorderBuffer *rb)
{
	MemoryContext context = rb->context;

	/*
	 * We free separately allocated data by entirely scrapping the
	 * reorderbuffer's memory context.
	 */
	MemoryContextDelete(context);

	/* Free disk space used by unconsumed reorder buffers */
	ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
}
/*
 * Get an unused, possibly preallocated, ReorderBufferTXN.
 */
static ReorderBufferTXN *
ReorderBufferGetTXN(ReorderBuffer *rb)
{
	ReorderBufferTXN *txn;

	txn = (ReorderBufferTXN *)
		MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));

	memset(txn, 0, sizeof(ReorderBufferTXN));

	dlist_init(&txn->changes);
	dlist_init(&txn->tuplecids);
	dlist_init(&txn->subtxns);

	/* InvalidCommandId is not zero, so set it explicitly */
	txn->command_id = InvalidCommandId;
	txn->output_plugin_private = NULL;

	return txn;
}
/*
 * Free a ReorderBufferTXN.
 */
static void
ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
	/* clean the lookup cache if we were cached (quite likely) */
	if (rb->by_txn_last_xid == txn->xid)
	{
		rb->by_txn_last_xid = InvalidTransactionId;
		rb->by_txn_last_txn = NULL;
	}

	/* free data that's contained */

	if (txn->gid != NULL)
	{
		pfree(txn->gid);
		txn->gid = NULL;
	}

	if (txn->tuplecid_hash != NULL)
	{
		hash_destroy(txn->tuplecid_hash);
		txn->tuplecid_hash = NULL;
	}

	if (txn->invalidations)
	{
		pfree(txn->invalidations);
		txn->invalidations = NULL;
	}

	/* Reset the toast hash */
	ReorderBufferToastReset(rb, txn);

	/* All changes must be deallocated */
	Assert(txn->size == 0);

	pfree(txn);
}
/*
 * Get a fresh ReorderBufferChange.
 */
ReorderBufferChange *
ReorderBufferGetChange(ReorderBuffer *rb)
{
	ReorderBufferChange *change;

	change = (ReorderBufferChange *)
		MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));

	memset(change, 0, sizeof(ReorderBufferChange));

	return change;
}
/*
 * Free a ReorderBufferChange and update memory accounting, if requested.
 */
void
ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change,
						  bool upd_mem)
{
	/* update memory accounting info */
	if (upd_mem)
		ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
										ReorderBufferChangeSize(change));

	/* free contained data */
	switch (change->action)
	{
		case REORDER_BUFFER_CHANGE_INSERT:
		case REORDER_BUFFER_CHANGE_UPDATE:
		case REORDER_BUFFER_CHANGE_DELETE:
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
			if (change->data.tp.newtuple)
			{
				ReorderBufferReturnTupleBuf(change->data.tp.newtuple);
				change->data.tp.newtuple = NULL;
			}

			if (change->data.tp.oldtuple)
			{
				ReorderBufferReturnTupleBuf(change->data.tp.oldtuple);
				change->data.tp.oldtuple = NULL;
			}
			break;
		case REORDER_BUFFER_CHANGE_MESSAGE:
			if (change->data.msg.prefix != NULL)
				pfree(change->data.msg.prefix);
			change->data.msg.prefix = NULL;
			if (change->data.msg.message != NULL)
				pfree(change->data.msg.message);
			change->data.msg.message = NULL;
			break;
		case REORDER_BUFFER_CHANGE_INVALIDATION:
			if (change->data.inval.invalidations)
				pfree(change->data.inval.invalidations);
			change->data.inval.invalidations = NULL;
			break;
		case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
			if (change->data.snapshot)
			{
				ReorderBufferFreeSnap(rb, change->data.snapshot);
				change->data.snapshot = NULL;
			}
			break;
			/* no data in addition to the struct itself */
		case REORDER_BUFFER_CHANGE_TRUNCATE:
			if (change->data.truncate.relids != NULL)
			{
				ReorderBufferReturnRelids(rb, change->data.truncate.relids);
				change->data.truncate.relids = NULL;
			}
			break;
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
		case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
		case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
			break;
	}

	pfree(change);
}
/*
 * Get a fresh HeapTuple fitting a tuple of size tuple_len (excluding header
 * overhead).
 */
HeapTuple
ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
{
	HeapTuple	tuple;
	Size		alloc_len;

	alloc_len = tuple_len + SizeofHeapTupleHeader;

	tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
										   HEAPTUPLESIZE + alloc_len);
	tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);

	return tuple;
}

/*
 * Free a HeapTuple returned by ReorderBufferGetTupleBuf().
 */
void
ReorderBufferReturnTupleBuf(HeapTuple tuple)
{
	pfree(tuple);
}
/*
 * Get an array for relids of truncated relations.
 *
 * We use the global memory context (for the whole reorder buffer), because
 * none of the existing ones seems like a good match (some are SLAB, so we
 * can't use those, and tup_context is meant for tuple data, not relids). We
 * could add yet another context, but it seems like overkill - TRUNCATE is
 * not a particularly common operation, so it does not seem worth it.
 */
Oid *
ReorderBufferGetRelids(ReorderBuffer *rb, int nrelids)
{
	Oid		   *relids;
	Size		alloc_len;

	alloc_len = sizeof(Oid) * nrelids;

	relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);

	return relids;
}

/*
 * Free an array of relids.
 */
void
ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids)
{
	pfree(relids);
}
/*
 * Return the ReorderBufferTXN from the given buffer, specified by Xid.
 * If create is true, and a transaction doesn't already exist, create it
 * (with the given LSN, and as top transaction if that's specified);
 * when this happens, is_new is set to true.
 */
static ReorderBufferTXN *
ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
					  bool *is_new, XLogRecPtr lsn, bool create_as_top)
{
	ReorderBufferTXN *txn;
	ReorderBufferTXNByIdEnt *ent;
	bool		found;

	Assert(TransactionIdIsValid(xid));

	/*
	 * Check the one-entry lookup cache first
	 */
	if (TransactionIdIsValid(rb->by_txn_last_xid) &&
		rb->by_txn_last_xid == xid)
	{
		txn = rb->by_txn_last_txn;

		if (txn != NULL)
		{
			/* found it, and it's valid */
			if (is_new)
				*is_new = false;
			return txn;
		}

		/*
		 * cached as non-existent, and asked not to create? Then nothing else
		 * to do.
		 */
		if (!create)
			return NULL;
		/* otherwise fall through to create it */
	}

	/*
	 * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
	 * create an entry.
	 */

	/* search the lookup table */
	ent = (ReorderBufferTXNByIdEnt *)
		hash_search(rb->by_txn,
					&xid,
					create ? HASH_ENTER : HASH_FIND,
					&found);
	if (found)
		txn = ent->txn;
	else if (create)
	{
		/* initialize the new entry, if creation was requested */
		Assert(ent != NULL);
		Assert(lsn != InvalidXLogRecPtr);

		ent->txn = ReorderBufferGetTXN(rb);
		ent->txn->xid = xid;
		txn = ent->txn;
		txn->first_lsn = lsn;
		txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;

		if (create_as_top)
		{
			dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
			AssertTXNLsnOrder(rb);
		}
	}
	else
		txn = NULL;				/* not found and not asked to create */

	/* update cache */
	rb->by_txn_last_xid = xid;
	rb->by_txn_last_txn = txn;

	if (is_new)
		*is_new = !found;

	Assert(!create || txn != NULL);

	return txn;
}
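/*
 * Illustrative call patterns for the lookup above (a sketch, not exhaustive):
 *
 *	// lookup only; returns NULL if the xid is unknown
 *	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
 *
 *	// create a toplevel transaction at 'lsn' if it doesn't exist yet
 *	txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
 *
 *	// create an entry that is already known to be a subtransaction
 *	subtxn = ReorderBufferTXNByXid(rb, subxid, true, &is_new, lsn, false);
 *
 * Note that the one-entry cache also remembers negative lookups
 * (by_txn_last_txn == NULL), so repeated misses for the same xid stay cheap.
 */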
/*
 * Record the partial change for the streaming of in-progress transactions. We
 * can stream only complete changes, so if we have a partial change like a
 * toast table insert or a speculative insert then we mark such a 'txn' so
 * that it can't be streamed. We also ensure that if the changes in such a
 * 'txn' can be streamed and are above the logical_decoding_work_mem threshold
 * then we stream them as soon as we have a complete change.
 */
static void
ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
								  ReorderBufferChange *change,
								  bool toast_insert)
{
	ReorderBufferTXN *toptxn;

	/*
	 * The partial changes need to be processed only while streaming
	 * in-progress transactions.
	 */
	if (!ReorderBufferCanStream(rb))
		return;

	/* Get the top transaction. */
	toptxn = rbtxn_get_toptxn(txn);

	/*
	 * Indicate a partial change for toast inserts. The change will be
	 * considered as complete once we get the insert or update on the main
	 * table and we are sure that the pending toast chunks are not required
	 * anymore.
	 *
	 * If we allow streaming when there are pending toast chunks then such
	 * chunks won't be released till the insert (multi_insert) is complete and
	 * we expect the txn to have streamed all changes after streaming. This
	 * restriction is mainly to ensure the correctness of streamed
	 * transactions and it doesn't seem worth uplifting such a restriction
	 * just to allow this case because anyway we will stream the transaction
	 * once such an insert is complete.
	 */
	if (toast_insert)
		toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
	else if (rbtxn_has_partial_change(toptxn) &&
			 IsInsertOrUpdate(change->action) &&
			 change->data.tp.clear_toast_afterwards)
		toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;

	/*
	 * Indicate a partial change for speculative inserts. The change will be
	 * considered as complete once we get the speculative confirm or abort
	 * token.
	 */
	if (IsSpecInsert(change->action))
		toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
	else if (rbtxn_has_partial_change(toptxn) &&
			 IsSpecConfirmOrAbort(change->action))
		toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;

	/*
	 * Stream the transaction if it is serialized before and the changes are
	 * now complete in the top-level transaction.
	 *
	 * The reason for doing the streaming of such a transaction as soon as we
	 * get the complete change for it is that previously it would have reached
	 * the memory threshold and wouldn't get streamed because of incomplete
	 * changes. Delaying such transactions would increase apply lag for them.
	 */
	if (ReorderBufferCanStartStreaming(rb) &&
		!(rbtxn_has_partial_change(toptxn)) &&
		rbtxn_is_serialized(txn) &&
		rbtxn_has_streamable_change(toptxn))
		ReorderBufferStreamTXN(rb, toptxn);
}
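/*
 * Worked example of the partial-change tracking above (a sketch; relation
 * names are made up). Decoding an INSERT of a row with one out-of-line
 * toasted column produces, in WAL order:
 *
 *	INSERT pg_toast chunk 1  (toast_insert = true)       -> flag set
 *	INSERT pg_toast chunk 2  (toast_insert = true)       -> flag stays set
 *	INSERT main_table row    (clear_toast_afterwards)    -> flag cleared
 *
 * Streaming of the transaction is suppressed while RBTXN_HAS_PARTIAL_CHANGE
 * is set, so it can only be streamed once the main-table insert completes
 * the change.
 */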
/*
 * Queue a change into a transaction so it can be replayed upon commit or will
 * be streamed when we reach the logical_decoding_work_mem threshold.
 */
void
ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
						 ReorderBufferChange *change, bool toast_insert)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);

	/*
	 * While streaming the previous changes we have detected that the
	 * transaction is aborted. So there is no point in collecting further
	 * changes for it.
	 */
	if (txn->concurrent_abort)
	{
		/*
		 * We don't need to update memory accounting for this change as we
		 * have not added it to the queue yet.
		 */
		ReorderBufferReturnChange(rb, change, false);
		return;
	}

	/*
	 * The changes that are sent downstream are considered streamable. We
	 * remember such transactions so that only those will later be considered
	 * for streaming.
	 */
	if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
		change->action == REORDER_BUFFER_CHANGE_UPDATE ||
		change->action == REORDER_BUFFER_CHANGE_DELETE ||
		change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
		change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
		change->action == REORDER_BUFFER_CHANGE_MESSAGE)
	{
		ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);

		toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
	}

	change->lsn = lsn;
	change->txn = txn;

	Assert(InvalidXLogRecPtr != lsn);
	dlist_push_tail(&txn->changes, &change->node);
	txn->nentries++;
	txn->nentries_mem++;

	/* update memory accounting information */
	ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
									ReorderBufferChangeSize(change));

	/* process partial change */
	ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);

	/* check the memory limits and evict something if needed */
	ReorderBufferCheckMemoryLimit(rb);
}
/*
 * A transactional message is queued to be processed upon commit and a
 * non-transactional message gets processed immediately.
 */
void
ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
						  Snapshot snap, XLogRecPtr lsn,
						  bool transactional, const char *prefix,
						  Size message_size, const char *message)
{
	if (transactional)
	{
		MemoryContext oldcontext;
		ReorderBufferChange *change;

		Assert(xid != InvalidTransactionId);

		/*
		 * We don't expect snapshots for transactional changes - we'll use the
		 * snapshot derived later during apply (unless the change gets
		 * skipped).
		 */
		Assert(!snap);

		oldcontext = MemoryContextSwitchTo(rb->context);

		change = ReorderBufferGetChange(rb);
		change->action = REORDER_BUFFER_CHANGE_MESSAGE;
		change->data.msg.prefix = pstrdup(prefix);
		change->data.msg.message_size = message_size;
		change->data.msg.message = palloc(message_size);
		memcpy(change->data.msg.message, message, message_size);

		ReorderBufferQueueChange(rb, xid, lsn, change, false);

		MemoryContextSwitchTo(oldcontext);
	}
	else
	{
		ReorderBufferTXN *txn = NULL;
		volatile Snapshot snapshot_now = snap;

		/* Non-transactional changes require a valid snapshot. */
		Assert(snapshot_now);

		if (xid != InvalidTransactionId)
			txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);

		/* setup snapshot to allow catalog access */
		SetupHistoricSnapshot(snapshot_now, NULL);
		PG_TRY();
		{
			rb->message(rb, txn, lsn, false, prefix, message_size, message);

			TeardownHistoricSnapshot(false);
		}
		PG_CATCH();
		{
			TeardownHistoricSnapshot(true);
			PG_RE_THROW();
		}
		PG_END_TRY();
	}
}
/*
 * Verify LSN ordering of transaction lists in the reorderbuffer
 *
 * Other LSN-related invariants are checked too.
 *
 * No-op if assertions are not in use.
 */
static void
AssertTXNLsnOrder(ReorderBuffer *rb)
{
#ifdef USE_ASSERT_CHECKING
	LogicalDecodingContext *ctx = rb->private_data;
	dlist_iter	iter;
	XLogRecPtr	prev_first_lsn = InvalidXLogRecPtr;
	XLogRecPtr	prev_base_snap_lsn = InvalidXLogRecPtr;

	/*
	 * Skip the verification if we haven't yet reached the LSN at which we
	 * start decoding the contents of transactions, because until then we
	 * could have transactions that don't yet have the association between
	 * the top-level transaction and subtransaction and consequently have the
	 * same LSN. We don't guarantee this association until we try to decode
	 * the actual contents of the transaction. The ordering of the records
	 * prior to the start_decoding_at LSN should have been checked before the
	 * restart.
	 */
	if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
		return;

	dlist_foreach(iter, &rb->toplevel_by_lsn)
	{
		ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
													iter.cur);

		/* start LSN must be set */
		Assert(cur_txn->first_lsn != InvalidXLogRecPtr);

		/* If there is an end LSN, it must be higher than start LSN */
		if (cur_txn->end_lsn != InvalidXLogRecPtr)
			Assert(cur_txn->first_lsn <= cur_txn->end_lsn);

		/* Current initial LSN must be strictly higher than previous */
		if (prev_first_lsn != InvalidXLogRecPtr)
			Assert(prev_first_lsn < cur_txn->first_lsn);

		/* known-as-subtxn txns must not be listed */
		Assert(!rbtxn_is_known_subxact(cur_txn));

		prev_first_lsn = cur_txn->first_lsn;
	}

	dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
	{
		ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
													base_snapshot_node,
													iter.cur);

		/* base snapshot (and its LSN) must be set */
		Assert(cur_txn->base_snapshot != NULL);
		Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr);

		/* current LSN must be strictly higher than previous */
		if (prev_base_snap_lsn != InvalidXLogRecPtr)
			Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);

		/* known-as-subtxn txns must not be listed */
		Assert(!rbtxn_is_known_subxact(cur_txn));

		prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
	}
#endif
}
/*
 * AssertChangeLsnOrder
 *
 * Check ordering of changes in the (sub)transaction.
 */
static void
AssertChangeLsnOrder(ReorderBufferTXN *txn)
{
#ifdef USE_ASSERT_CHECKING
	dlist_iter	iter;
	XLogRecPtr	prev_lsn = txn->first_lsn;

	dlist_foreach(iter, &txn->changes)
	{
		ReorderBufferChange *cur_change;

		cur_change = dlist_container(ReorderBufferChange, node, iter.cur);

		Assert(txn->first_lsn != InvalidXLogRecPtr);
		Assert(cur_change->lsn != InvalidXLogRecPtr);
		Assert(txn->first_lsn <= cur_change->lsn);

		if (txn->end_lsn != InvalidXLogRecPtr)
			Assert(cur_change->lsn <= txn->end_lsn);

		Assert(prev_lsn <= cur_change->lsn);

		prev_lsn = cur_change->lsn;
	}
#endif
}
/*
 * ReorderBufferGetOldestTXN
 *		Return oldest transaction in reorderbuffer
 */
ReorderBufferTXN *
ReorderBufferGetOldestTXN(ReorderBuffer *rb)
{
	ReorderBufferTXN *txn;

	AssertTXNLsnOrder(rb);

	if (dlist_is_empty(&rb->toplevel_by_lsn))
		return NULL;

	txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);

	Assert(!rbtxn_is_known_subxact(txn));
	Assert(txn->first_lsn != InvalidXLogRecPtr);

	return txn;
}
/*
 * ReorderBufferGetOldestXmin
 *		Return oldest Xmin in reorderbuffer
 *
 * Returns oldest possibly running Xid from the point of view of snapshots
 * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
 * there are none.
 *
 * Since snapshots are assigned monotonically, this equals the Xmin of the
 * base snapshot with minimal base_snapshot_lsn.
 */
TransactionId
ReorderBufferGetOldestXmin(ReorderBuffer *rb)
{
	ReorderBufferTXN *txn;

	AssertTXNLsnOrder(rb);

	if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
		return InvalidTransactionId;

	txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
							 &rb->txns_by_base_snapshot_lsn);
	return txn->base_snapshot->xmin;
}
void
ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
{
	rb->current_restart_decoding_lsn = ptr;
}
/*
 * ReorderBufferAssignChild
 *
 * Make note that we know that subxid is a subtransaction of xid, seen as of
 * the given lsn.
 */
void
ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
						 TransactionId subxid, XLogRecPtr lsn)
{
	ReorderBufferTXN *txn;
	ReorderBufferTXN *subtxn;
	bool		new_top;
	bool		new_sub;

	txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
	subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);

	if (!new_sub)
	{
		if (rbtxn_is_known_subxact(subtxn))
		{
			/* already associated, nothing to do */
			return;
		}
		else
		{
			/*
			 * We already saw this transaction, but initially added it to the
			 * list of top-level txns. Now that we know it's not top-level,
			 * remove it from there.
			 */
			dlist_delete(&subtxn->node);
		}
	}

	subtxn->txn_flags |= RBTXN_IS_SUBXACT;
	subtxn->toplevel_xid = xid;
	Assert(subtxn->nsubtxns == 0);

	/* set the reference to top-level transaction */
	subtxn->toptxn = txn;

	/* add to subtransaction list */
	dlist_push_tail(&txn->subtxns, &subtxn->node);
	txn->nsubtxns++;

	/* Possibly transfer the subtxn's snapshot to its top-level txn. */
	ReorderBufferTransferSnapToParent(txn, subtxn);

	/* Verify LSN-ordering invariant */
	AssertTXNLsnOrder(rb);
}
/*
 * ReorderBufferTransferSnapToParent
 *		Transfer base snapshot from subtxn to top-level txn, if needed
 *
 * This is done if the top-level txn doesn't have a base snapshot, or if the
 * subtxn's base snapshot has an earlier LSN than the top-level txn's base
 * snapshot's LSN. This can happen if there are no changes in the toplevel
 * txn but there are some in the subtxn, or the first change in subtxn has
 * earlier LSN than first change in the top-level txn and we learned about
 * their kinship only now.
 *
 * The subtransaction's snapshot is cleared regardless of the transfer
 * happening, since it's not needed anymore in either case.
 *
 * We do this as soon as we become aware of their kinship, to avoid queueing
 * extra snapshots to txns known-as-subtxns -- only top-level txns will
 * receive further snapshots.
 */
static void
ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
								  ReorderBufferTXN *subtxn)
{
	Assert(subtxn->toplevel_xid == txn->xid);

	if (subtxn->base_snapshot != NULL)
	{
		if (txn->base_snapshot == NULL ||
			subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
		{
			/*
			 * If the toplevel transaction already has a base snapshot but
			 * it's newer than the subxact's, purge it.
			 */
			if (txn->base_snapshot != NULL)
			{
				SnapBuildSnapDecRefcount(txn->base_snapshot);
				dlist_delete(&txn->base_snapshot_node);
			}

			/*
			 * The snapshot is now the top transaction's; transfer it, and
			 * adjust the list position of the top transaction in the list by
			 * moving it to where the subtransaction is.
			 */
			txn->base_snapshot = subtxn->base_snapshot;
			txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
			dlist_insert_before(&subtxn->base_snapshot_node,
								&txn->base_snapshot_node);

			/*
			 * The subtransaction doesn't have a snapshot anymore (so it
			 * mustn't be in the list.)
			 */
			subtxn->base_snapshot = NULL;
			subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
			dlist_delete(&subtxn->base_snapshot_node);
		}
		else
		{
			/* Base snap of toplevel is fine, so subxact's is not needed */
			SnapBuildSnapDecRefcount(subtxn->base_snapshot);
			dlist_delete(&subtxn->base_snapshot_node);
			subtxn->base_snapshot = NULL;
			subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
		}
	}
}
/*
 * Associate a subtransaction with its toplevel transaction at commit
 * time. There may be no further changes added after this.
 */
void
ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
						 TransactionId subxid, XLogRecPtr commit_lsn,
						 XLogRecPtr end_lsn)
{
	ReorderBufferTXN *subtxn;

	subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
								   InvalidXLogRecPtr, false);

	/*
	 * No need to do anything if that subtxn didn't contain any changes
	 */
	if (!subtxn)
		return;

	subtxn->final_lsn = commit_lsn;
	subtxn->end_lsn = end_lsn;

	/*
	 * Assign this subxact as a child of the toplevel xact (no-op if already
	 * done.)
	 */
	ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
}
/*
 * Support for efficiently iterating over a transaction's and its
 * subtransactions' changes.
 *
 * We do this by doing a k-way merge between transactions/subtransactions. For
 * that we model the current heads of the different transactions as a binary
 * heap so we easily know which (sub-)transaction has the change with the
 * smallest lsn next.
 *
 * We assume the changes in individual transactions are already sorted by LSN.
 */

/*
 * Binary heap comparison function.
 */
static int
ReorderBufferIterCompare(Datum a, Datum b, void *arg)
{
	ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
	XLogRecPtr	pos_a = state->entries[DatumGetInt32(a)].lsn;
	XLogRecPtr	pos_b = state->entries[DatumGetInt32(b)].lsn;

	if (pos_a < pos_b)
		return 1;
	else if (pos_a == pos_b)
		return 0;
	return -1;
}
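/*
 * Note on the comparator above: lib/binaryheap.h builds a max-heap, so the
 * comparison is intentionally inverted (returning 1 when pos_a < pos_b) to
 * make the heap surface the entry with the *smallest* LSN first. For
 * example, with three change streams whose next changes sit at LSNs 0/10,
 * 0/5 and 0/7, binaryheap_first() returns the entry at 0/5.
 */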
1253 * Allocate & initialize an iterator which iterates in lsn order over a
1254 * transaction and all its subtransactions.
1256 * Note: The iterator state is returned through iter_state parameter rather
1257 * than the function's return value. This is because the state gets cleaned up
1258 * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1259 * back the state even if this function throws an exception.
1262 ReorderBufferIterTXNInit(ReorderBuffer
*rb
, ReorderBufferTXN
*txn
,
1263 ReorderBufferIterTXNState
*volatile *iter_state
)
1266 ReorderBufferIterTXNState
*state
;
1267 dlist_iter cur_txn_i
;
1272 /* Check ordering of changes in the toplevel transaction. */
1273 AssertChangeLsnOrder(txn
);
1276 * Calculate the size of our heap: one element for every transaction that
1277 * contains changes. (Besides the transactions already in the reorder
1278 * buffer, we count the one we were directly passed.)
1280 if (txn
->nentries
> 0)
1283 dlist_foreach(cur_txn_i
, &txn
->subtxns
)
1285 ReorderBufferTXN
*cur_txn
;
1287 cur_txn
= dlist_container(ReorderBufferTXN
, node
, cur_txn_i
.cur
);
1289 /* Check ordering of changes in this subtransaction. */
1290 AssertChangeLsnOrder(cur_txn
);
1292 if (cur_txn
->nentries
> 0)
1296 /* allocate iteration state */
1297 state
= (ReorderBufferIterTXNState
*)
1298 MemoryContextAllocZero(rb
->context
,
1299 sizeof(ReorderBufferIterTXNState
) +
1300 sizeof(ReorderBufferIterTXNEntry
) * nr_txns
);
1302 state
->nr_txns
= nr_txns
;
1303 dlist_init(&state
->old_change
);
1305 for (off
= 0; off
< state
->nr_txns
; off
++)
1307 state
->entries
[off
].file
.vfd
= -1;
1308 state
->entries
[off
].segno
= 0;
1312 state
->heap
= binaryheap_allocate(state
->nr_txns
,
1313 ReorderBufferIterCompare
,
1316 /* Now that the state fields are initialized, it is safe to return it. */
1317 *iter_state
= state
;
1320 * Now insert items into the binary heap, in an unordered fashion. (We
1321 * will run a heap assembly step at the end; this is more efficient.)
1326 /* add toplevel transaction if it contains changes */
1327 if (txn
->nentries
> 0)
1329 ReorderBufferChange
*cur_change
;
1331 if (rbtxn_is_serialized(txn
))
1333 /* serialize remaining changes */
1334 ReorderBufferSerializeTXN(rb
, txn
);
1335 ReorderBufferRestoreChanges(rb
, txn
, &state
->entries
[off
].file
,
1336 &state
->entries
[off
].segno
);
1339 cur_change
= dlist_head_element(ReorderBufferChange
, node
,
1342 state
->entries
[off
].lsn
= cur_change
->lsn
;
1343 state
->entries
[off
].change
= cur_change
;
1344 state
->entries
[off
].txn
= txn
;
1346 binaryheap_add_unordered(state
->heap
, Int32GetDatum(off
++));
1349 /* add subtransactions if they contain changes */
1350 dlist_foreach(cur_txn_i
, &txn
->subtxns
)
1352 ReorderBufferTXN
*cur_txn
;
1354 cur_txn
= dlist_container(ReorderBufferTXN
, node
, cur_txn_i
.cur
);
1356 if (cur_txn
->nentries
> 0)
1358 ReorderBufferChange
*cur_change
;
1360 if (rbtxn_is_serialized(cur_txn
))
1362 /* serialize remaining changes */
1363 ReorderBufferSerializeTXN(rb
, cur_txn
);
1364 ReorderBufferRestoreChanges(rb
, cur_txn
,
1365 &state
->entries
[off
].file
,
1366 &state
->entries
[off
].segno
);
1368 cur_change
= dlist_head_element(ReorderBufferChange
, node
,
1371 state
->entries
[off
].lsn
= cur_change
->lsn
;
1372 state
->entries
[off
].change
= cur_change
;
1373 state
->entries
[off
].txn
= cur_txn
;
1375 binaryheap_add_unordered(state
->heap
, Int32GetDatum(off
++));
1379 /* assemble a valid binary heap */
1380 binaryheap_build(state
->heap
);
/*
 * Return the next change when iterating over a transaction and its
 * subtransactions.
 *
 * Returns NULL when no further changes exist.
 */
static ReorderBufferChange *
ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
{
	ReorderBufferChange *change;
	ReorderBufferIterTXNEntry *entry;
	int32		off;

	/* nothing there anymore */
	if (state->heap->bh_size == 0)
		return NULL;

	off = DatumGetInt32(binaryheap_first(state->heap));
	entry = &state->entries[off];

	/* free memory we might have "leaked" in the previous *Next call */
	if (!dlist_is_empty(&state->old_change))
	{
		change = dlist_container(ReorderBufferChange, node,
								 dlist_pop_head_node(&state->old_change));
		ReorderBufferReturnChange(rb, change, true);
		Assert(dlist_is_empty(&state->old_change));
	}

	change = entry->change;

	/*
	 * update heap with information about which transaction has the next
	 * relevant change in LSN order
	 */

	/* there are in-memory changes */
	if (dlist_has_next(&entry->txn->changes, &entry->change->node))
	{
		dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
		ReorderBufferChange *next_change =
			dlist_container(ReorderBufferChange, node, next);

		/* txn stays the same */
		state->entries[off].lsn = next_change->lsn;
		state->entries[off].change = next_change;

		binaryheap_replace_first(state->heap, Int32GetDatum(off));
		return change;
	}

	/* try to load changes from disk */
	if (entry->txn->nentries != entry->txn->nentries_mem)
	{
		/*
		 * Ugly: restoring changes will reuse *Change records, thus delete the
		 * current one from the per-tx list and only free in the next call.
		 */
		dlist_delete(&change->node);
		dlist_push_tail(&state->old_change, &change->node);

		/*
		 * Update the total bytes processed by the txn for which we are
		 * releasing the current set of changes and restoring the new set of
		 * changes.
		 */
		rb->totalBytes += entry->txn->size;
		if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
										&state->entries[off].segno))
		{
			/* successfully restored changes from disk */
			ReorderBufferChange *next_change =
				dlist_head_element(ReorderBufferChange, node,
								   &entry->txn->changes);

			elog(DEBUG2, "restored %u/%u changes from disk",
				 (uint32) entry->txn->nentries_mem,
				 (uint32) entry->txn->nentries);

			Assert(entry->txn->nentries_mem);
			/* txn stays the same */
			state->entries[off].lsn = next_change->lsn;
			state->entries[off].change = next_change;
			binaryheap_replace_first(state->heap, Int32GetDatum(off));

			return change;
		}
	}

	/* ok, no changes there anymore, remove */
	binaryheap_remove_first(state->heap);

	return change;
}
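/*
 * Sketch of the intended call sequence (simplified from the consumer in
 * ReorderBufferProcessTXN(); the PG_TRY/PG_CATCH error handling that
 * guarantees ReorderBufferIterTXNFinish() runs on error is omitted):
 *
 *	ReorderBufferIterTXNState *volatile iterstate = NULL;
 *	ReorderBufferChange *change;
 *
 *	ReorderBufferIterTXNInit(rb, txn, &iterstate);
 *	while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
 *	{
 *		// changes arrive here in LSN order, merged across subtransactions
 *	}
 *	ReorderBufferIterTXNFinish(rb, iterstate);
 */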
/*
 * Deallocate the iterator
 */
static void
ReorderBufferIterTXNFinish(ReorderBuffer *rb,
						   ReorderBufferIterTXNState *state)
{
	int32		off;

	for (off = 0; off < state->nr_txns; off++)
	{
		if (state->entries[off].file.vfd != -1)
			FileClose(state->entries[off].file.vfd);
	}

	/* free memory we might have "leaked" in the last *Next call */
	if (!dlist_is_empty(&state->old_change))
	{
		ReorderBufferChange *change;

		change = dlist_container(ReorderBufferChange, node,
								 dlist_pop_head_node(&state->old_change));
		ReorderBufferReturnChange(rb, change, true);
		Assert(dlist_is_empty(&state->old_change));
	}

	binaryheap_free(state->heap);
	pfree(state);
}
/*
 * Cleanup the contents of a transaction, usually after the transaction
 * committed or aborted.
 */
static void
ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
	bool		found;
	dlist_mutable_iter iter;
	Size		mem_freed = 0;

	/* cleanup subtransactions & their changes */
	dlist_foreach_modify(iter, &txn->subtxns)
	{
		ReorderBufferTXN *subtxn;

		subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);

		/*
		 * Subtransactions are always associated to the toplevel TXN, even if
		 * they originally were happening inside another subtxn, so we won't
		 * ever recurse more than one level deep here.
		 */
		Assert(rbtxn_is_known_subxact(subtxn));
		Assert(subtxn->nsubtxns == 0);

		ReorderBufferCleanupTXN(rb, subtxn);
	}

	/* cleanup changes in the txn */
	dlist_foreach_modify(iter, &txn->changes)
	{
		ReorderBufferChange *change;

		change = dlist_container(ReorderBufferChange, node, iter.cur);

		/* Check we're not mixing changes from different transactions. */
		Assert(change->txn == txn);

		/*
		 * Instead of updating the memory counter for individual changes, we
		 * sum up the size of memory to free so we can update the memory
		 * counter all together below. This saves the cost of maintaining the
		 * max-heap.
		 */
		mem_freed += ReorderBufferChangeSize(change);

		ReorderBufferReturnChange(rb, change, false);
	}

	/* Update the memory counter */
	ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);

	/*
	 * Cleanup the tuplecids we stored for decoding catalog snapshot access.
	 * They are always stored in the toplevel transaction.
	 */
	dlist_foreach_modify(iter, &txn->tuplecids)
	{
		ReorderBufferChange *change;

		change = dlist_container(ReorderBufferChange, node, iter.cur);

		/* Check we're not mixing changes from different transactions. */
		Assert(change->txn == txn);
		Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);

		ReorderBufferReturnChange(rb, change, true);
	}

	/*
	 * Cleanup the base snapshot, if set.
	 */
	if (txn->base_snapshot != NULL)
	{
		SnapBuildSnapDecRefcount(txn->base_snapshot);
		dlist_delete(&txn->base_snapshot_node);
	}

	/*
	 * Cleanup the snapshot for the last streamed run.
	 */
	if (txn->snapshot_now != NULL)
	{
		Assert(rbtxn_is_streamed(txn));
		ReorderBufferFreeSnap(rb, txn->snapshot_now);
	}

	/*
	 * Remove TXN from its containing lists.
	 *
	 * Note: if txn is known as subxact, we are deleting the TXN from its
	 * parent's list of known subxacts; this leaves the parent's nsubxacts
	 * count too high, but we don't care. Otherwise, we are deleting the TXN
	 * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
	 * list of catalog modifying transactions as well.
	 */
	dlist_delete(&txn->node);
	if (rbtxn_has_catalog_changes(txn))
		dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);

	/* now remove reference from buffer */
	hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
	Assert(found);

	/* remove entries spilled to disk */
	if (rbtxn_is_serialized(txn))
		ReorderBufferRestoreCleanup(rb, txn);

	/* deallocate */
	ReorderBufferReturnTXN(rb, txn);
}
/*
 * Discard changes from a transaction (and subtransactions), either after
 * streaming them or after decoding them at PREPARE. Keep the remaining info -
 * transactions, tuplecids, invalidations and snapshots.
 *
 * We additionally remove tuplecids after decoding the transaction at prepare
 * time as we only need to perform invalidation at rollback or commit prepared.
 *
 * 'txn_prepared' indicates that we have decoded the transaction at prepare
 * time.
 */
static void
ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
{
	dlist_mutable_iter iter;
	Size		mem_freed = 0;

	/* cleanup subtransactions & their changes */
	dlist_foreach_modify(iter, &txn->subtxns)
	{
		ReorderBufferTXN *subtxn;

		subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);

		/*
		 * Subtransactions are always associated to the toplevel TXN, even if
		 * they originally were happening inside another subtxn, so we won't
		 * ever recurse more than one level deep here.
		 */
		Assert(rbtxn_is_known_subxact(subtxn));
		Assert(subtxn->nsubtxns == 0);

		ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
	}

	/* cleanup changes in the txn */
	dlist_foreach_modify(iter, &txn->changes)
	{
		ReorderBufferChange *change;

		change = dlist_container(ReorderBufferChange, node, iter.cur);

		/* Check we're not mixing changes from different transactions. */
		Assert(change->txn == txn);

		/* remove the change from its containing list */
		dlist_delete(&change->node);

		/*
		 * Instead of updating the memory counter for individual changes, we
		 * sum up the size of memory to free so we can update the memory
		 * counter all together below. This saves the cost of maintaining the
		 * max-heap.
		 */
		mem_freed += ReorderBufferChangeSize(change);

		ReorderBufferReturnChange(rb, change, false);
	}

	/* Update the memory counter */
	ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);

	/*
	 * Mark the transaction as streamed.
	 *
	 * The top-level transaction is always marked as streamed, even if it
	 * does not contain any changes (that is, when all the changes are in
	 * subtransactions).
	 *
	 * For subtransactions, we only mark them as streamed when there are
	 * changes in them.
	 *
	 * We do it this way because of aborts - we don't want to send aborts for
	 * XIDs the downstream is not aware of. And of course, it always knows
	 * about the toplevel xact (we send the XID in all messages), but we never
	 * stream XIDs of empty subxacts.
	 */
	if ((!txn_prepared) && (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0)))
		txn->txn_flags |= RBTXN_IS_STREAMED;

	if (txn_prepared)
	{
		/*
		 * If this is a prepared txn, cleanup the tuplecids we stored for
		 * decoding catalog snapshot access. They are always stored in the
		 * toplevel transaction.
		 */
		dlist_foreach_modify(iter, &txn->tuplecids)
		{
			ReorderBufferChange *change;

			change = dlist_container(ReorderBufferChange, node, iter.cur);

			/* Check we're not mixing changes from different transactions. */
			Assert(change->txn == txn);
			Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);

			/* Remove the change from its containing list. */
			dlist_delete(&change->node);

			ReorderBufferReturnChange(rb, change, true);
		}

		/*
		 * Destroy the (relfilelocator, ctid) hashtable, so that we don't
		 * leak any memory. We could also keep the hash table and update it
		 * with new ctid values, but this seems simpler and good enough for
		 * now.
		 */
		if (txn->tuplecid_hash != NULL)
		{
			hash_destroy(txn->tuplecid_hash);
			txn->tuplecid_hash = NULL;
		}
	}

	/* If this txn is serialized then clean the disk space. */
	if (rbtxn_is_serialized(txn))
	{
		ReorderBufferRestoreCleanup(rb, txn);
		txn->txn_flags &= ~RBTXN_IS_SERIALIZED;

		/*
		 * We set this flag to indicate if the transaction is ever serialized.
		 * We need this to accurately update the stats as otherwise the same
		 * transaction can be counted as serialized multiple times.
		 */
		txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
	}

	/* also reset the number of entries in the transaction */
	txn->nentries_mem = 0;
	txn->nentries = 0;
}
/*
 * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
 * HeapTupleSatisfiesHistoricMVCC.
 */
static void
ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
	dlist_iter	iter;
	HASHCTL		hash_ctl;

	if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
		return;

	hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
	hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
	hash_ctl.hcxt = rb->context;

	/*
	 * create the hash with the exact number of to-be-stored tuplecids from
	 * the start
	 */
	txn->tuplecid_hash =
		hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
					HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	dlist_foreach(iter, &txn->tuplecids)
	{
		ReorderBufferTupleCidKey key;
		ReorderBufferTupleCidEnt *ent;
		bool		found;
		ReorderBufferChange *change;

		change = dlist_container(ReorderBufferChange, node, iter.cur);

		Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);

		/* be careful about padding */
		memset(&key, 0, sizeof(ReorderBufferTupleCidKey));

		key.rlocator = change->data.tuplecid.locator;

		ItemPointerCopy(&change->data.tuplecid.tid,
						&key.tid);

		ent = (ReorderBufferTupleCidEnt *)
			hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
		if (!found)
		{
			ent->cmin = change->data.tuplecid.cmin;
			ent->cmax = change->data.tuplecid.cmax;
			ent->combocid = change->data.tuplecid.combocid;
		}
		else
		{
			/*
			 * Maybe we already saw this tuple before in this transaction, but
			 * if so it must have the same cmin.
			 */
			Assert(ent->cmin == change->data.tuplecid.cmin);

			/*
			 * cmax may be initially invalid, but once set it can only grow,
			 * and never become invalid again.
			 */
			Assert((ent->cmax == InvalidCommandId) ||
				   ((change->data.tuplecid.cmax != InvalidCommandId) &&
					(change->data.tuplecid.cmax > ent->cmax)));
			ent->cmax = change->data.tuplecid.cmax;
		}
	}
}
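/*
 * The hash built above is probed during catalog access, keyed the same way
 * (a sketch; cf. ResolveCminCmaxDuringDecoding() for the real lookup):
 *
 *	ReorderBufferTupleCidKey key;
 *	ReorderBufferTupleCidEnt *ent;
 *
 *	memset(&key, 0, sizeof(key));		// zero padding bytes, as above
 *	key.rlocator = locator;
 *	ItemPointerCopy(tid, &key.tid);
 *
 *	ent = hash_search(txn->tuplecid_hash, &key, HASH_FIND, NULL);
 *	if (ent != NULL)
 *		... consult ent->cmin / ent->cmax ...
 *
 * The memset matters because the key struct may contain padding bytes and
 * dynahash hashes/compares the key as raw memory (HASH_BLOBS).
 */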
/*
 * Copy a provided snapshot so we can modify it privately. This is needed so
 * that catalog modifying transactions can look into intermediate catalog
 * states.
 */
static Snapshot
ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
					  ReorderBufferTXN *txn, CommandId cid)
{
	Snapshot	snap;
	dlist_iter	iter;
	int			i = 0;
	Size		size;

	size = sizeof(SnapshotData) +
		sizeof(TransactionId) * orig_snap->xcnt +
		sizeof(TransactionId) * (txn->nsubtxns + 1);

	snap = MemoryContextAllocZero(rb->context, size);
	memcpy(snap, orig_snap, sizeof(SnapshotData));

	snap->copied = true;
	snap->active_count = 1;		/* mark as active so nobody frees it */
	snap->regd_count = 0;
	snap->xip = (TransactionId *) (snap + 1);

	memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);

	/*
	 * snap->subxip contains all txids that belong to our transaction which we
	 * need to check via cmin/cmax. That's why we store the toplevel
	 * transaction in there as well.
	 */
	snap->subxip = snap->xip + snap->xcnt;
	snap->subxip[i++] = txn->xid;

	/*
	 * subxcnt isn't decreased when subtransactions abort, so count manually.
	 * Since it's an upper boundary it is safe to use it for the allocation
	 * above.
	 */
	snap->subxcnt = 1;

	dlist_foreach(iter, &txn->subtxns)
	{
		ReorderBufferTXN *sub_txn;

		sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
		snap->subxip[i++] = sub_txn->xid;
		snap->subxcnt++;
	}

	/* sort so we can bsearch() later */
	qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);

	/* store the specified current CommandId */
	snap->curcid = cid;

	return snap;
}
/*
 * Free a previously ReorderBufferCopySnap'ed snapshot
 */
static void
ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
{
	if (snap->copied)
		pfree(snap);
	else
		SnapBuildSnapDecRefcount(snap);
}
/*
 * If the transaction was (partially) streamed, we need to prepare or commit
 * it in a 'streamed' way. That is, we first stream the remaining part of the
 * transaction, and then invoke the stream_prepare or stream_commit message as
 * the case requires.
 */
static void
ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
	/* we should only call this for previously streamed transactions */
	Assert(rbtxn_is_streamed(txn));

	ReorderBufferStreamTXN(rb, txn);

	if (rbtxn_prepared(txn))
	{
		/*
		 * Note, we send stream prepare even if a concurrent abort is
		 * detected. See DecodePrepare for more information.
		 */
		rb->stream_prepare(rb, txn, txn->final_lsn);

		/*
		 * This is a PREPARED transaction, part of a two-phase commit. The
		 * full cleanup will happen as part of the COMMIT PREPAREDs, so now
		 * just truncate txn by removing changes and tuplecids.
		 */
		ReorderBufferTruncateTXN(rb, txn, true);
		/* Reset the CheckXidAlive */
		CheckXidAlive = InvalidTransactionId;
	}
	else
	{
		rb->stream_commit(rb, txn, txn->final_lsn);
		ReorderBufferCleanupTXN(rb, txn);
	}
}
/*
 * Set xid to detect concurrent aborts.
 *
 * While streaming an in-progress transaction or decoding a prepared
 * transaction there is a possibility that the (sub)transaction might get
 * aborted concurrently. In such case if the (sub)transaction has catalog
 * update then we might decode the tuple using wrong catalog version. For
 * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
 * the transaction 501 updates the catalog tuple and after that we will have
 * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
 * aborted and some other transaction say 502 updates the same catalog tuple
 * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
 * problem is that when we try to decode the tuple inserted/updated in 501
 * after the catalog update, we will see the catalog tuple with (xmin: 500,
 * xmax: 502) as visible because it will consider that the tuple is deleted by
 * xid 502 which is not visible to our snapshot. And when we will try to
 * decode with that catalog tuple, it can lead to a wrong result or a crash.
 * So, it is necessary to detect concurrent aborts to allow streaming of
 * in-progress transactions or decoding of prepared transactions.
 *
 * For detecting the concurrent abort we set CheckXidAlive to the xid of the
 * current (sub)transaction that the change belongs to. During catalog scans
 * we then check the status of that xid; if it has aborted, a specific error
 * is reported so that we can stop streaming the current transaction and
 * discard the already streamed changes on such an error. We might have
 * already streamed some of the changes for the aborted (sub)transaction, but
 * that is fine because when we decode the abort we will stream an abort
 * message to truncate the changes in the subscriber. Similarly, for prepared
 * transactions, we stop decoding if a concurrent abort is detected and then
 * roll back the changes when rollback prepared is encountered. See
 * DecodePrepare.
 */
static inline void
SetupCheckXidLive(TransactionId xid)
{
	/*
	 * If the input transaction id is already set as a CheckXidAlive then
	 * nothing to do.
	 */
	if (TransactionIdEquals(CheckXidAlive, xid))
		return;

	/*
	 * setup CheckXidAlive if it's not committed yet. We don't check if the
	 * xid is aborted. That will happen during catalog access.
	 */
	if (!TransactionIdDidCommit(xid))
		CheckXidAlive = xid;
	else
		CheckXidAlive = InvalidTransactionId;
}

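/*
 * The consuming side of CheckXidAlive lives in the system-table scan code;
 * as a sketch (simplified from the check in genam.c), each catalog scan
 * effectively does:
 *
 *	if (TransactionIdIsValid(CheckXidAlive) &&
 *		!TransactionIdIsInProgress(CheckXidAlive) &&
 *		!TransactionIdDidCommit(CheckXidAlive))
 *		ereport(ERROR,
 *				(errcode(ERRCODE_TRANSACTION_ROLLBACK),
 *				 errmsg("transaction aborted during system catalog scan")));
 *
 * and that ERRCODE_TRANSACTION_ROLLBACK is what the PG_CATCH block in
 * ReorderBufferProcessTXN below recognizes as a concurrent abort.
 */
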
/*
 * Helper function for ReorderBufferProcessTXN for applying change.
 */
static inline void
ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
						 Relation relation, ReorderBufferChange *change,
						 bool streaming)
{
	if (streaming)
		rb->stream_change(rb, txn, relation, change);
	else
		rb->apply_change(rb, txn, relation, change);
}

/*
 * Helper function for ReorderBufferProcessTXN for applying the truncate.
 */
static inline void
ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
						   int nrelations, Relation *relations,
						   ReorderBufferChange *change, bool streaming)
{
	if (streaming)
		rb->stream_truncate(rb, txn, nrelations, relations, change);
	else
		rb->apply_truncate(rb, txn, nrelations, relations, change);
}

/*
 * Helper function for ReorderBufferProcessTXN for applying the message.
 */
static inline void
ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
						  ReorderBufferChange *change, bool streaming)
{
	if (streaming)
		rb->stream_message(rb, txn, change->lsn, true,
						   change->data.msg.prefix,
						   change->data.msg.message_size,
						   change->data.msg.message);
	else
		rb->message(rb, txn, change->lsn, true,
					change->data.msg.prefix,
					change->data.msg.message_size,
					change->data.msg.message);
}

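/*
 * On the receiving end these calls arrive via the output plugin's callbacks.
 * A minimal message callback could look like this (illustrative sketch only;
 * "my_message_cb" is hypothetical, the signature is LogicalDecodeMessageCB):
 *
 *	static void
 *	my_message_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
 *				  XLogRecPtr message_lsn, bool transactional,
 *				  const char *prefix, Size message_size, const char *message)
 *	{
 *		OutputPluginPrepareWrite(ctx, true);
 *		appendStringInfoString(ctx->out, prefix);
 *		appendBinaryStringInfo(ctx->out, message, message_size);
 *		OutputPluginWrite(ctx, true);
 *	}
 */
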
/*
 * Function to store the command id and snapshot at the end of the current
 * stream so that we can reuse the same while sending the next stream.
 */
static void
ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
							 Snapshot snapshot_now, CommandId command_id)
{
	txn->command_id = command_id;

	/* Avoid copying if it's already copied. */
	if (snapshot_now->copied)
		txn->snapshot_now = snapshot_now;
	else
		txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
												  txn, command_id);
}

/*
 * Helper function for ReorderBufferProcessTXN to handle the concurrent
 * abort of the streaming transaction. This resets the TXN such that it
 * can be used to stream the remaining data of transaction being processed.
 * This can happen when the subtransaction is aborted and we still want to
 * continue processing the main or other subtransactions data.
 */
static void
ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
					  Snapshot snapshot_now,
					  CommandId command_id,
					  XLogRecPtr last_lsn,
					  ReorderBufferChange *specinsert)
{
	/* Discard the changes that we just streamed */
	ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn));

	/* Free all resources allocated for toast reconstruction */
	ReorderBufferToastReset(rb, txn);

	/* Return the spec insert change if it is not NULL */
	if (specinsert != NULL)
	{
		ReorderBufferReturnChange(rb, specinsert, true);
		specinsert = NULL;
	}

	/*
	 * For the streaming case, stop the stream and remember the command ID and
	 * snapshot for the streaming run.
	 */
	if (rbtxn_is_streamed(txn))
	{
		rb->stream_stop(rb, txn, last_lsn);
		ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
	}

	/* All changes must be deallocated */
	Assert(txn->size == 0);
}

/*
 * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
 *
 * Send data of a transaction (and its subtransactions) to the
 * output plugin. We iterate over the top and subtransactions (using a k-way
 * merge) and replay the changes in lsn order.
 *
 * If streaming is true then data will be sent using stream API.
 *
 * Note: "volatile" markers on some parameters are to avoid trouble with
 * PG_TRY inside the function.
 */
static void
ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
						XLogRecPtr commit_lsn,
						volatile Snapshot snapshot_now,
						volatile CommandId command_id,
						bool streaming)
{
	bool		using_subtxn;
	MemoryContext ccxt = CurrentMemoryContext;
	ReorderBufferIterTXNState *volatile iterstate = NULL;
	volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
	ReorderBufferChange *volatile specinsert = NULL;
	volatile bool stream_started = false;
	ReorderBufferTXN *volatile curtxn = NULL;

	/* build data to be able to lookup the CommandIds of catalog tuples */
	ReorderBufferBuildTupleCidHash(rb, txn);

	/* setup the initial snapshot */
	SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);

	/*
	 * Decoding needs access to syscaches et al., which in turn use
	 * heavyweight locks and such. Thus we need to have enough state around to
	 * keep track of those. The easiest way is to simply use a transaction
	 * internally. That also allows us to easily enforce that nothing writes
	 * to the database by checking for xid assignments.
	 *
	 * When we're called via the SQL SRF there's already a transaction
	 * started, so start an explicit subtransaction there.
	 */
	using_subtxn = IsTransactionOrTransactionBlock();

	PG_TRY();
	{
		ReorderBufferChange *change;
		int			changes_count = 0; /* used to accumulate the number of
										* changes */

		if (using_subtxn)
			BeginInternalSubTransaction(streaming ? "stream" : "replay");
		else
			StartTransactionCommand();

		/*
		 * We only need to send begin/begin-prepare for non-streamed
		 * transactions.
		 */
		if (!streaming)
		{
			if (rbtxn_prepared(txn))
				rb->begin_prepare(rb, txn);
			else
				rb->begin(rb, txn);
		}

		ReorderBufferIterTXNInit(rb, txn, &iterstate);
		while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
		{
			Relation	relation = NULL;
			Oid			reloid;

			CHECK_FOR_INTERRUPTS();

			/*
			 * We can't call start stream callback before processing first
			 * change.
			 */
			if (prev_lsn == InvalidXLogRecPtr)
			{
				if (streaming)
				{
					txn->origin_id = change->origin_id;
					rb->stream_start(rb, txn, change->lsn);
					stream_started = true;
				}
			}

			/*
			 * Enforce correct ordering of changes, merged from multiple
			 * subtransactions. The changes may have the same LSN due to
			 * MULTI_INSERT xlog records.
			 */
			Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);

			prev_lsn = change->lsn;

			/*
			 * Set the current xid to detect concurrent aborts. This is
			 * required for the cases when we decode the changes before the
			 * COMMIT record is processed.
			 */
			if (streaming || rbtxn_prepared(change->txn))
			{
				curtxn = change->txn;
				SetupCheckXidLive(curtxn->xid);
			}

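			/*
			 * The speculative-insertion cases below cooperate as a small
			 * protocol (sketch): INTERNAL_SPEC_INSERT stashes the change in
			 * "specinsert" instead of applying it; a later
			 * INTERNAL_SPEC_CONFIRM turns it into a plain INSERT, while
			 * INTERNAL_SPEC_ABORT (or a new INTERNAL_SPEC_INSERT) discards
			 * it. This is how INSERT ... ON CONFLICT is decoded, since only
			 * the confirmation record tells us whether the inserted tuple
			 * survived.
			 */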
			switch (change->action)
			{
				case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:

					/*
					 * Confirmation for speculative insertion arrived. Simply
					 * use as a normal record. It'll be cleaned up at the end
					 * of INSERT processing.
					 */
					if (specinsert == NULL)
						elog(ERROR, "invalid ordering of speculative insertion changes");
					Assert(specinsert->data.tp.oldtuple == NULL);
					change = specinsert;
					change->action = REORDER_BUFFER_CHANGE_INSERT;

					/* intentionally fall through */
				case REORDER_BUFFER_CHANGE_INSERT:
				case REORDER_BUFFER_CHANGE_UPDATE:
				case REORDER_BUFFER_CHANGE_DELETE:
					Assert(snapshot_now);

					reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
												  change->data.tp.rlocator.relNumber);

					/*
					 * Mapped catalog tuple without data, emitted while
					 * catalog table was in the process of being rewritten. We
					 * can fail to look up the relfilenumber, because the
					 * relmapper has no "historic" view, in contrast to the
					 * normal catalog during decoding. Thus repeated rewrites
					 * can cause a lookup failure. That's OK because we do not
					 * decode catalog changes anyway. Normally such tuples
					 * would be skipped over below, but we can't identify
					 * whether the table should be logically logged without
					 * mapping the relfilenumber to the oid.
					 */
					if (reloid == InvalidOid &&
						change->data.tp.newtuple == NULL &&
						change->data.tp.oldtuple == NULL)
						goto change_done;
					else if (reloid == InvalidOid)
						elog(ERROR, "could not map filenumber \"%s\" to relation OID",
							 relpathperm(change->data.tp.rlocator,
										 MAIN_FORKNUM));

					relation = RelationIdGetRelation(reloid);

					if (!RelationIsValid(relation))
						elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
							 reloid,
							 relpathperm(change->data.tp.rlocator,
										 MAIN_FORKNUM));

					if (!RelationIsLogicallyLogged(relation))
						goto change_done;

					/*
					 * Ignore temporary heaps created during DDL unless the
					 * plugin has asked for them.
					 */
					if (relation->rd_rel->relrewrite && !rb->output_rewrites)
						goto change_done;

					/*
					 * For now ignore sequence changes entirely. Most of the
					 * time they don't log changes using records we
					 * understand, so it doesn't make sense to handle the few
					 * cases we do.
					 */
					if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
						goto change_done;

					/* user-triggered change */
					if (!IsToastRelation(relation))
					{
						ReorderBufferToastReplace(rb, txn, relation, change);
						ReorderBufferApplyChange(rb, txn, relation, change,
												 streaming);

						/*
						 * Only clear reassembled toast chunks if we're sure
						 * they're not required anymore. The creator of the
						 * tuple tells us.
						 */
						if (change->data.tp.clear_toast_afterwards)
							ReorderBufferToastReset(rb, txn);
					}
					/* we're not interested in toast deletions */
					else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
					{
						/*
						 * Need to reassemble the full toasted Datum in
						 * memory, to ensure the chunks don't get reused till
						 * we're done, remove it from the list of this
						 * transaction's changes. Otherwise it will get
						 * freed/reused while restoring spooled data from
						 * disk.
						 */
						Assert(change->data.tp.newtuple != NULL);

						dlist_delete(&change->node);
						ReorderBufferToastAppendChunk(rb, txn, relation,
													  change);
					}

			change_done:

					/*
					 * If speculative insertion was confirmed, the record
					 * isn't needed anymore.
					 */
					if (specinsert != NULL)
					{
						ReorderBufferReturnChange(rb, specinsert, true);
						specinsert = NULL;
					}

					if (RelationIsValid(relation))
					{
						RelationClose(relation);
						relation = NULL;
					}
					break;

				case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:

					/*
					 * Speculative insertions are dealt with by delaying the
					 * processing of the insert until the confirmation record
					 * arrives. For that we simply unlink the record from the
					 * chain, so it does not get freed/reused while restoring
					 * spooled data from disk.
					 *
					 * This is safe in the face of concurrent catalog changes
					 * because the relevant relation can't be changed between
					 * speculative insertion and confirmation due to
					 * CheckTableNotInUse() and locking.
					 */

					/* clear out a pending (and thus failed) speculation */
					if (specinsert != NULL)
					{
						ReorderBufferReturnChange(rb, specinsert, true);
						specinsert = NULL;
					}

					/* and memorize the pending insertion */
					dlist_delete(&change->node);
					specinsert = change;
					break;

				case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:

					/*
					 * Abort for speculative insertion arrived. So cleanup the
					 * specinsert tuple and toast hash.
					 *
					 * Note that we get the spec abort change for each toast
					 * entry but we need to perform the cleanup only the first
					 * time we get it for the main table.
					 */
					if (specinsert != NULL)
					{
						/*
						 * We must clean the toast hash before processing a
						 * completely new tuple to avoid confusion about the
						 * previous tuple's toast chunks.
						 */
						Assert(change->data.tp.clear_toast_afterwards);
						ReorderBufferToastReset(rb, txn);

						/* We don't need this record anymore. */
						ReorderBufferReturnChange(rb, specinsert, true);
						specinsert = NULL;
					}
					break;

				case REORDER_BUFFER_CHANGE_TRUNCATE:
					{
						int			i;
						int			nrelids = change->data.truncate.nrelids;
						int			nrelations = 0;
						Relation   *relations;

						relations = palloc0(nrelids * sizeof(Relation));
						for (i = 0; i < nrelids; i++)
						{
							Oid			relid = change->data.truncate.relids[i];
							Relation	rel;

							rel = RelationIdGetRelation(relid);

							if (!RelationIsValid(rel))
								elog(ERROR, "could not open relation with OID %u", relid);

							if (!RelationIsLogicallyLogged(rel))
								continue;

							relations[nrelations++] = rel;
						}

						/* Apply the truncate. */
						ReorderBufferApplyTruncate(rb, txn, nrelations,
												   relations, change,
												   streaming);

						for (i = 0; i < nrelations; i++)
							RelationClose(relations[i]);

						break;
					}

				case REORDER_BUFFER_CHANGE_MESSAGE:
					ReorderBufferApplyMessage(rb, txn, change, streaming);
					break;

				case REORDER_BUFFER_CHANGE_INVALIDATION:
					/* Execute the invalidation messages locally */
					ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
													  change->data.inval.invalidations);
					break;

				case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
					/* get rid of the old */
					TeardownHistoricSnapshot(false);

					if (snapshot_now->copied)
					{
						ReorderBufferFreeSnap(rb, snapshot_now);
						snapshot_now =
							ReorderBufferCopySnap(rb, change->data.snapshot,
												  txn, command_id);
					}

					/*
					 * Restored from disk, need to be careful not to double
					 * free. We could introduce refcounting for that, but for
					 * now this seems infrequent enough not to care.
					 */
					else if (change->data.snapshot->copied)
					{
						snapshot_now =
							ReorderBufferCopySnap(rb, change->data.snapshot,
												  txn, command_id);
					}
					else
					{
						snapshot_now = change->data.snapshot;
					}

					/* and continue with the new one */
					SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
					break;

				case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
					Assert(change->data.command_id != InvalidCommandId);

					if (command_id < change->data.command_id)
					{
						command_id = change->data.command_id;

						if (!snapshot_now->copied)
						{
							/* we don't use the global one anymore */
							snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
																 txn, command_id);
						}

						snapshot_now->curcid = command_id;

						TeardownHistoricSnapshot(false);
						SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
					}

					break;

				case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
					elog(ERROR, "tuplecid value in changequeue");
					break;
			}

			/*
			 * It is possible that the data is not sent to downstream for a
			 * long time either because the output plugin filtered it or there
			 * is a DDL that generates a lot of data that is not processed by
			 * the plugin. So, in such cases, the downstream can timeout. To
			 * avoid that we try to send a keepalive message if required.
			 * Trying to send a keepalive message after every change has some
			 * overhead, but testing showed there is no noticeable overhead if
			 * we do it after every ~100 changes.
			 */
#define CHANGES_THRESHOLD 100

			if (++changes_count >= CHANGES_THRESHOLD)
			{
				rb->update_progress_txn(rb, txn, change->lsn);
				changes_count = 0;
			}
		}

		/* speculative insertion record must be freed by now */
		Assert(!specinsert);

		/* clean up the iterator */
		ReorderBufferIterTXNFinish(rb, iterstate);
		iterstate = NULL;

		/*
		 * Update total transaction count and total bytes processed by the
		 * transaction and its subtransactions. Ensure to not count the
		 * streamed transaction multiple times.
		 *
		 * Note that the statistics computation has to be done after
		 * ReorderBufferIterTXNFinish as it releases the serialized change
		 * which we have already accounted in ReorderBufferIterTXNNext.
		 */
		if (!rbtxn_is_streamed(txn))
			rb->totalTxns++;

		rb->totalBytes += txn->total_size;

		/*
		 * Done with current changes, send the last message for this set of
		 * changes depending upon streaming mode.
		 */
		if (streaming)
		{
			if (stream_started)
			{
				rb->stream_stop(rb, txn, prev_lsn);
				stream_started = false;
			}
		}
		else
		{
			/*
			 * Call either PREPARE (for two-phase transactions) or COMMIT (for
			 * regular ones).
			 */
			if (rbtxn_prepared(txn))
				rb->prepare(rb, txn, commit_lsn);
			else
				rb->commit(rb, txn, commit_lsn);
		}

		/* this is just a sanity check against bad output plugin behaviour */
		if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
			elog(ERROR, "output plugin used XID %u",
				 GetCurrentTransactionId());

		/*
		 * Remember the command ID and snapshot for the next set of changes in
		 * streaming mode.
		 */
		if (streaming)
			ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
		else if (snapshot_now->copied)
			ReorderBufferFreeSnap(rb, snapshot_now);

		/* cleanup */
		TeardownHistoricSnapshot(false);

		/*
		 * Aborting the current (sub-)transaction as a whole has the right
		 * semantics. We want all locks acquired in here to be released, not
		 * reassigned to the parent, and we do not want any database access to
		 * have persistent effects.
		 */
		AbortCurrentTransaction();

		/* make sure there's no cache pollution */
		ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);

		if (using_subtxn)
			RollbackAndReleaseCurrentSubTransaction();

		/*
		 * We are here due to one of the four reasons: 1. Decoding an
		 * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
		 * prepared txn that was (partially) streamed. 4. Decoding a committed
		 * txn.
		 *
		 * For 1, we allow truncation of txn data by removing the changes
		 * already streamed but still keeping other things like invalidations,
		 * snapshot, and tuplecids. For 2 and 3, we indicate
		 * ReorderBufferTruncateTXN to do more elaborate truncation of txn
		 * data as the entire transaction has been decoded except for commit.
		 * For 4, as the entire txn has been decoded, we can fully clean up
		 * the TXN reorder buffer.
		 */
		if (streaming || rbtxn_prepared(txn))
		{
			ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn));
			/* Reset the CheckXidAlive */
			CheckXidAlive = InvalidTransactionId;
		}
		else
			ReorderBufferCleanupTXN(rb, txn);
	}
	PG_CATCH();
	{
		MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
		ErrorData  *errdata = CopyErrorData();

		/* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
		if (iterstate)
			ReorderBufferIterTXNFinish(rb, iterstate);

		TeardownHistoricSnapshot(true);

		/*
		 * Force cache invalidation to happen outside of a valid transaction
		 * to prevent catalog access as we just caught an error.
		 */
		AbortCurrentTransaction();

		/* make sure there's no cache pollution */
		ReorderBufferExecuteInvalidations(txn->ninvalidations,
										  txn->invalidations);

		if (using_subtxn)
			RollbackAndReleaseCurrentSubTransaction();

		/*
		 * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
		 * abort of the (sub)transaction we are streaming or preparing. We
		 * need to do the cleanup and return gracefully on this error, see
		 * SetupCheckXidLive.
		 *
		 * This error code can be thrown by one of the callbacks we call
		 * during decoding so we need to ensure that we return gracefully only
		 * when we are sending the data in streaming mode and the streaming is
		 * not finished yet or when we are sending the data out on a PREPARE
		 * during a two-phase commit.
		 */
		if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
			(stream_started || rbtxn_prepared(txn)))
		{
			/* curtxn must be set for streaming or prepared transactions */
			Assert(curtxn);

			/* Cleanup the temporary error state. */
			FlushErrorState();
			FreeErrorData(errdata);
			errdata = NULL;
			curtxn->concurrent_abort = true;

			/* Reset the TXN so that it is allowed to stream remaining data. */
			ReorderBufferResetTXN(rb, txn, snapshot_now,
								  command_id, prev_lsn,
								  specinsert);
		}
		else
		{
			ReorderBufferCleanupTXN(rb, txn);
			MemoryContextSwitchTo(ecxt);
			PG_RE_THROW();
		}
	}
	PG_END_TRY();
}

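/*
 * To summarize the concurrent-abort path implemented above (sketch): a
 * catalog scan notices that CheckXidAlive has aborted and throws
 * ERRCODE_TRANSACTION_ROLLBACK; the PG_CATCH block swallows that error,
 * marks the transaction with concurrent_abort, and resets the TXN so the
 * remaining data can still be streamed. The downstream copy of the
 * already-streamed changes is discarded later, when the abort record itself
 * is decoded and a stream_abort (or rollback_prepared) is sent.
 */
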
/*
 * Perform the replay of a transaction and its non-aborted subtransactions.
 *
 * Subtransactions previously have to be processed by
 * ReorderBufferCommitChild(), even if previously assigned to the toplevel
 * transaction with ReorderBufferAssignChild.
 *
 * This interface is called once a prepare or toplevel commit is read for both
 * streamed as well as non-streamed transactions.
 */
static void
ReorderBufferReplay(ReorderBufferTXN *txn,
					ReorderBuffer *rb, TransactionId xid,
					XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
					TimestampTz commit_time,
					RepOriginId origin_id, XLogRecPtr origin_lsn)
{
	Snapshot	snapshot_now;
	CommandId	command_id = FirstCommandId;

	txn->final_lsn = commit_lsn;
	txn->end_lsn = end_lsn;
	txn->xact_time.commit_time = commit_time;
	txn->origin_id = origin_id;
	txn->origin_lsn = origin_lsn;

	/*
	 * If the transaction was (partially) streamed, we need to commit it in a
	 * 'streamed' way. That is, we first stream the remaining part of the
	 * transaction, and then invoke stream_commit message.
	 *
	 * Called after everything (origin ID, LSN, ...) is stored in the
	 * transaction to avoid passing that information directly.
	 */
	if (rbtxn_is_streamed(txn))
	{
		ReorderBufferStreamCommit(rb, txn);
		return;
	}

	/*
	 * If this transaction has no snapshot, it didn't make any changes to the
	 * database, so there's nothing to decode. Note that
	 * ReorderBufferCommitChild will have transferred any snapshots from
	 * subtransactions if there were any.
	 */
	if (txn->base_snapshot == NULL)
	{
		Assert(txn->ninvalidations == 0);

		/*
		 * Removing this txn before a commit might result in the computation
		 * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
		 */
		if (!rbtxn_prepared(txn))
			ReorderBufferCleanupTXN(rb, txn);
		return;
	}

	snapshot_now = txn->base_snapshot;

	/* Process and send the changes to output plugin. */
	ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
							command_id, false);
}

/*
 * Commit a transaction.
 *
 * See comments for ReorderBufferReplay().
 */
void
ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
					XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
					TimestampTz commit_time,
					RepOriginId origin_id, XLogRecPtr origin_lsn)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
								false);

	/* unknown transaction, nothing to replay */
	if (txn == NULL)
		return;

	ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
						origin_id, origin_lsn);
}

/*
 * Record the prepare information for a transaction.
 */
bool
ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
								 XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
								 TimestampTz prepare_time,
								 RepOriginId origin_id, XLogRecPtr origin_lsn)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);

	/* unknown transaction, nothing to do */
	if (txn == NULL)
		return false;

	/*
	 * Remember the prepare information to be later used by commit prepared in
	 * case we skip doing prepare.
	 */
	txn->final_lsn = prepare_lsn;
	txn->end_lsn = end_lsn;
	txn->xact_time.prepare_time = prepare_time;
	txn->origin_id = origin_id;
	txn->origin_lsn = origin_lsn;

	return true;
}

/* Remember that we have skipped prepare */
void
ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);

	/* unknown transaction, nothing to do */
	if (txn == NULL)
		return;

	txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
}

/*
 * Prepare a two-phase transaction.
 *
 * See comments for ReorderBufferReplay().
 */
void
ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
					 char *gid)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
								false);

	/* unknown transaction, nothing to replay */
	if (txn == NULL)
		return;

	txn->txn_flags |= RBTXN_PREPARE;
	txn->gid = pstrdup(gid);

	/* The prepare info must have been updated in txn by now. */
	Assert(txn->final_lsn != InvalidXLogRecPtr);

	ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
						txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);

	/*
	 * We send the prepare for the concurrently aborted xacts so that later
	 * when rollback prepared is decoded and sent, the downstream should be
	 * able to rollback such a xact. See comments atop DecodePrepare.
	 *
	 * Note, for the concurrent_abort + streaming case a stream_prepare was
	 * already sent within the ReorderBufferReplay call above.
	 */
	if (txn->concurrent_abort && !rbtxn_is_streamed(txn))
		rb->prepare(rb, txn, txn->final_lsn);
}

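/*
 * Putting the two-phase pieces together, the decode-time sequence is roughly
 * (sketch): a PREPARE record leads to ReorderBufferPrepare(), which replays
 * the changes and emits the prepare callback; a later COMMIT PREPARED or
 * ROLLBACK PREPARED record leads to ReorderBufferFinishPrepared() below,
 * which emits commit_prepared or rollback_prepared, re-replaying first only
 * if the PREPARE itself was never decoded (e.g. no consistent snapshot yet,
 * or two_phase disabled at that time).
 */
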
/*
 * This is used to handle COMMIT/ROLLBACK PREPARED.
 */
void
ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
							XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
							XLogRecPtr two_phase_at,
							TimestampTz commit_time, RepOriginId origin_id,
							XLogRecPtr origin_lsn, char *gid, bool is_commit)
{
	ReorderBufferTXN *txn;
	XLogRecPtr	prepare_end_lsn;
	TimestampTz prepare_time;

	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);

	/* unknown transaction, nothing to do */
	if (txn == NULL)
		return;

	/*
	 * By this time the txn has the prepare record information, remember it to
	 * be later used for rollback.
	 */
	prepare_end_lsn = txn->end_lsn;
	prepare_time = txn->xact_time.prepare_time;

	/* add the gid in the txn */
	txn->gid = pstrdup(gid);

	/*
	 * It is possible that this transaction is not decoded at prepare time
	 * either because by that time we didn't have a consistent snapshot, or
	 * two_phase was not enabled, or it was decoded earlier but we have
	 * restarted. We only need to send the prepare if it was not decoded
	 * earlier. We don't need to decode the xact for aborts if it is not done
	 * already.
	 */
	if ((txn->final_lsn < two_phase_at) && is_commit)
	{
		txn->txn_flags |= RBTXN_PREPARE;

		/*
		 * The prepare info must have been updated in txn even if we skip
		 * prepare.
		 */
		Assert(txn->final_lsn != InvalidXLogRecPtr);

		/*
		 * By this time the txn has the prepare record information and it is
		 * important to use that so that downstream gets the accurate
		 * information. If instead, we have passed commit information here
		 * then downstream can behave as it has already replayed commit
		 * prepared after the restart.
		 */
		ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
							txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
	}

	txn->final_lsn = commit_lsn;
	txn->end_lsn = end_lsn;
	txn->xact_time.commit_time = commit_time;
	txn->origin_id = origin_id;
	txn->origin_lsn = origin_lsn;

	if (is_commit)
		rb->commit_prepared(rb, txn, commit_lsn);
	else
		rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);

	/* cleanup: make sure there's no cache pollution */
	ReorderBufferExecuteInvalidations(txn->ninvalidations,
									  txn->invalidations);
	ReorderBufferCleanupTXN(rb, txn);
}

/*
 * Abort a transaction that possibly has previous changes. Needs to be first
 * called for subtransactions and then for the toplevel xid.
 *
 * NB: Transactions handled here have to have actively aborted (i.e. have
 * produced an abort record). Implicitly aborted transactions are handled via
 * ReorderBufferAbortOld(); transactions we're just not interested in, but
 * which have committed are handled in ReorderBufferForget().
 *
 * This function purges this transaction and its contents from memory and
 * disk.
 */
void
ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
				   TimestampTz abort_time)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
								false);

	/* unknown, nothing to remove */
	if (txn == NULL)
		return;

	txn->xact_time.abort_time = abort_time;

	/* For streamed transactions notify the remote node about the abort. */
	if (rbtxn_is_streamed(txn))
	{
		rb->stream_abort(rb, txn, lsn);

		/*
		 * We might have decoded changes for this transaction that could load
		 * the cache as per the current transaction's view (consider DDL's
		 * happened in this transaction). We don't want the decoding of future
		 * transactions to use those cache entries so execute invalidations.
		 */
		if (txn->ninvalidations > 0)
			ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
											   txn->invalidations);
	}

	/* cosmetic... */
	txn->final_lsn = lsn;

	/* remove potential on-disk data, and deallocate */
	ReorderBufferCleanupTXN(rb, txn);
}

/*
 * Abort all transactions that aren't actually running anymore because the
 * server restarted.
 *
 * NB: These really have to be transactions that have aborted due to a server
 * crash/immediate restart, as we don't deal with invalidations here.
 */
void
ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
{
	dlist_mutable_iter it;

	/*
	 * Iterate through all (potential) toplevel TXNs and abort all that are
	 * older than what possibly can be running. Once we've found the first
	 * that is alive we stop, there might be some that acquired an xid earlier
	 * but started writing later, but it's unlikely and they will be cleaned
	 * up in a later call to this function.
	 */
	dlist_foreach_modify(it, &rb->toplevel_by_lsn)
	{
		ReorderBufferTXN *txn;

		txn = dlist_container(ReorderBufferTXN, node, it.cur);

		if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
		{
			elog(DEBUG2, "aborting old transaction %u", txn->xid);

			/* Notify the remote node about the crash/immediate restart. */
			if (rbtxn_is_streamed(txn))
				rb->stream_abort(rb, txn, InvalidXLogRecPtr);

			/* remove potential on-disk data, and deallocate this tx */
			ReorderBufferCleanupTXN(rb, txn);
		}
		else
			return;
	}
}

/*
 * Forget the contents of a transaction if we aren't interested in its
 * contents. Needs to be first called for subtransactions and then for the
 * toplevel xid.
 *
 * This is significantly different to ReorderBufferAbort() because
 * transactions that have committed need to be treated differently from aborted
 * ones since they may have modified the catalog.
 *
 * Note that this is only allowed to be called in the moment a transaction
 * commit has just been read, not earlier; otherwise later records referring
 * to this xid might re-create the transaction incompletely.
 */
void
ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
								false);

	/* unknown, nothing to forget */
	if (txn == NULL)
		return;

	/* this transaction mustn't be streamed */
	Assert(!rbtxn_is_streamed(txn));

	/* cosmetic... */
	txn->final_lsn = lsn;

	/*
	 * Process cache invalidation messages if there are any. Even if we're not
	 * interested in the transaction's contents, it could have manipulated the
	 * catalog and we need to update the caches according to that.
	 */
	if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
		ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
										   txn->invalidations);
	else
		Assert(txn->ninvalidations == 0);

	/* remove potential on-disk data, and deallocate */
	ReorderBufferCleanupTXN(rb, txn);
}

/*
 * Invalidate cache for those transactions that need to be skipped just in case
 * catalogs were manipulated as part of the transaction.
 *
 * Note that this is a special-purpose function for prepared transactions where
 * we don't want to clean up the TXN even when we decide to skip it. See
 * DecodePrepare.
 */
void
ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
								false);

	/* unknown, nothing to do */
	if (txn == NULL)
		return;

	/*
	 * Process cache invalidation messages if there are any. Even if we're not
	 * interested in the transaction's contents, it could have manipulated the
	 * catalog and we need to update the caches according to that.
	 */
	if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
		ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
										   txn->invalidations);
	else
		Assert(txn->ninvalidations == 0);
}

/*
 * Execute invalidations happening outside the context of a decoded
 * transaction. That currently happens either for xid-less commits
 * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
 * transactions (via ReorderBufferForget()).
 */
void
ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
								   SharedInvalidationMessage *invalidations)
{
	bool		use_subtxn = IsTransactionOrTransactionBlock();
	int			i;

	if (use_subtxn)
		BeginInternalSubTransaction("replay");

	/*
	 * Force invalidations to happen outside of a valid transaction - that way
	 * entries will just be marked as invalid without accessing the catalog.
	 * That's advantageous because we don't need to setup the full state
	 * necessary for catalog access.
	 */
	if (use_subtxn)
		AbortCurrentTransaction();

	for (i = 0; i < ninvalidations; i++)
		LocalExecuteInvalidationMessage(&invalidations[i]);

	if (use_subtxn)
		RollbackAndReleaseCurrentSubTransaction();
}

/*
 * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
 * least once for every xid in XLogRecord->xl_xid (other places in records
 * may, but do not have to be passed through here).
 *
 * Reorderbuffer keeps some data structures about transactions in LSN order,
 * for efficiency. To do that it has to know about when transactions are seen
 * first in the WAL. As many types of records are not actually interesting for
 * logical decoding, they do not necessarily pass through here.
 */
void
ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
{
	/* many records won't have an xid assigned, centralize check here */
	if (xid != InvalidTransactionId)
		ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
}

/*
 * Add a new snapshot to this transaction that may only be used after lsn
 * 'lsn' because the previous snapshot doesn't describe the catalog correctly
 * for following rows.
 */
void
ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
						 XLogRecPtr lsn, Snapshot snap)
{
	ReorderBufferChange *change = ReorderBufferGetChange(rb);

	change->data.snapshot = snap;
	change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;

	ReorderBufferQueueChange(rb, xid, lsn, change, false);
}

/*
 * Set up the transaction's base snapshot.
 *
 * If we know that xid is a subtransaction, set the base snapshot on the
 * top-level transaction instead.
 */
void
ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
							 XLogRecPtr lsn, Snapshot snap)
{
	ReorderBufferTXN *txn;
	bool		is_new;

	Assert(snap != NULL);

	/*
	 * Fetch the transaction to operate on. If we know it's a subtransaction,
	 * operate on its top-level transaction instead.
	 */
	txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
	if (rbtxn_is_known_subxact(txn))
		txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
									NULL, InvalidXLogRecPtr, false);
	Assert(txn->base_snapshot == NULL);

	txn->base_snapshot = snap;
	txn->base_snapshot_lsn = lsn;
	dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);

	AssertTXNLsnOrder(rb);
}

/*
 * Access the catalog with this CommandId at this point in the changestream.
 *
 * May only be called for command ids > 1
 */
void
ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
							 XLogRecPtr lsn, CommandId cid)
{
	ReorderBufferChange *change = ReorderBufferGetChange(rb);

	change->data.command_id = cid;
	change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;

	ReorderBufferQueueChange(rb, xid, lsn, change, false);
}

/*
 * Update memory counters to account for the new or removed change.
 *
 * We update two counters - in the reorder buffer, and in the transaction
 * containing the change. The reorder buffer counter allows us to quickly
 * decide if we reached the memory limit, the transaction counter allows
 * us to quickly pick the largest transaction for eviction.
 *
 * Either txn or change must be non-NULL at least. We update the memory
 * counter of txn if it's non-NULL, otherwise change->txn.
 *
 * When streaming is enabled, we need to update the toplevel transaction
 * counters instead - we don't really care about subtransactions as we
 * can't stream them individually anyway, and we only pick toplevel
 * transactions for eviction. So only toplevel transactions matter.
 */
static void
ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
								ReorderBufferChange *change,
								ReorderBufferTXN *txn,
								bool addition, Size sz)
{
	ReorderBufferTXN *toptxn;

	Assert(txn || change);

	/*
	 * Ignore tuple CID changes, because those are not evicted when reaching
	 * memory limit. So we just don't count them, because it might easily
	 * trigger a pointless attempt to spill.
	 */
	if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
		return;

	if (sz == 0)
		return;

	if (txn == NULL)
		txn = change->txn;
	Assert(txn != NULL);

	/*
	 * Update the total size in top level as well. This is later used to
	 * compute the decoding stats.
	 */
	toptxn = rbtxn_get_toptxn(txn);

	if (addition)
	{
		Size		oldsize = txn->size;

		txn->size += sz;
		rb->size += sz;

		/* Update the total size in the top transaction. */
		toptxn->total_size += sz;

		/* Update the max-heap */
		if (oldsize != 0)
			pairingheap_remove(rb->txn_heap, &txn->txn_node);
		pairingheap_add(rb->txn_heap, &txn->txn_node);
	}
	else
	{
		Assert((rb->size >= sz) && (txn->size >= sz));

		txn->size -= sz;
		rb->size -= sz;

		/* Update the total size in the top transaction. */
		toptxn->total_size -= sz;

		/* Update the max-heap */
		pairingheap_remove(rb->txn_heap, &txn->txn_node);
		if (txn->size != 0)
			pairingheap_add(rb->txn_heap, &txn->txn_node);
	}

	Assert(txn->size <= rb->size);
}

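/*
 * A worked example of the accounting above (illustrative): queueing a
 * 100-byte change into an empty subtransaction S of toplevel T does
 *
 *	S.size       += 100;	-- per-(sub)txn counter, drives spill-to-disk
 *	rb->size     += 100;	-- global counter, compared against the limit
 *	T.total_size += 100;	-- toplevel counter, drives streaming and stats
 *
 * and inserts S into rb->txn_heap (no remove first, since S was empty);
 * releasing the change reverses all three and drops S from the heap once
 * S.size reaches zero again.
 */
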
/*
 * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
 *
 * We do not include this change type in memory accounting, because we
 * keep CIDs in a separate list and do not evict them when reaching
 * the memory limit.
 */
void
ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
							 XLogRecPtr lsn, RelFileLocator locator,
							 ItemPointerData tid, CommandId cmin,
							 CommandId cmax, CommandId combocid)
{
	ReorderBufferChange *change = ReorderBufferGetChange(rb);
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);

	change->data.tuplecid.locator = locator;
	change->data.tuplecid.tid = tid;
	change->data.tuplecid.cmin = cmin;
	change->data.tuplecid.cmax = cmax;
	change->data.tuplecid.combocid = combocid;
	change->lsn = lsn;
	change->txn = txn;
	change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;

	dlist_push_tail(&txn->tuplecids, &change->node);
	txn->ntuplecids++;
}

/*
 * Accumulate the invalidations for executing them later.
 *
 * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
 * accumulates all the invalidation messages in the toplevel transaction, if
 * available, otherwise in the current transaction, as well as in the form of
 * a change in the reorder buffer. We record it as a change so that we can
 * execute only the required invalidations instead of executing all the
 * invalidations on each CommandId increment. We also need to accumulate these
 * in the txn buffer because in some cases where we skip processing the
 * transaction (see ReorderBufferForget), we need to execute all the
 * invalidations together.
 */
void
ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
							  XLogRecPtr lsn, Size nmsgs,
							  SharedInvalidationMessage *msgs)
{
	ReorderBufferTXN *txn;
	MemoryContext oldcontext;
	ReorderBufferChange *change;

	txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);

	oldcontext = MemoryContextSwitchTo(rb->context);

	/*
	 * Collect all the invalidations under the top transaction, if available,
	 * so that we can execute them all together. See comments atop this
	 * function.
	 */
	txn = rbtxn_get_toptxn(txn);

	Assert(nmsgs > 0);

	/* Accumulate invalidations. */
	if (txn->ninvalidations == 0)
	{
		txn->ninvalidations = nmsgs;
		txn->invalidations = (SharedInvalidationMessage *)
			palloc(sizeof(SharedInvalidationMessage) * nmsgs);
		memcpy(txn->invalidations, msgs,
			   sizeof(SharedInvalidationMessage) * nmsgs);
	}
	else
	{
		txn->invalidations = (SharedInvalidationMessage *)
			repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) *
					 (txn->ninvalidations + nmsgs));

		memcpy(txn->invalidations + txn->ninvalidations, msgs,
			   nmsgs * sizeof(SharedInvalidationMessage));
		txn->ninvalidations += nmsgs;
	}

	change = ReorderBufferGetChange(rb);
	change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
	change->data.inval.ninvalidations = nmsgs;
	change->data.inval.invalidations = (SharedInvalidationMessage *)
		palloc(sizeof(SharedInvalidationMessage) * nmsgs);
	memcpy(change->data.inval.invalidations, msgs,
		   sizeof(SharedInvalidationMessage) * nmsgs);

	ReorderBufferQueueChange(rb, xid, lsn, change, false);

	MemoryContextSwitchTo(oldcontext);
}

/*
 * Apply all invalidations we know. Possibly we only need parts at this point
 * in the changestream but we don't know which those are.
 */
static void
ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
{
	int			i;

	for (i = 0; i < nmsgs; i++)
		LocalExecuteInvalidationMessage(&msgs[i]);
}

/*
 * Mark a transaction as containing catalog changes
 */
void
ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
								  XLogRecPtr lsn)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);

	if (!rbtxn_has_catalog_changes(txn))
	{
		txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
		dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
	}

	/*
	 * Mark top-level transaction as having catalog changes too if one of its
	 * children has so that the ReorderBufferBuildTupleCidHash can
	 * conveniently check just top-level transaction and decide whether to
	 * build the hash table or not.
	 */
	if (rbtxn_is_subtxn(txn))
	{
		ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);

		if (!rbtxn_has_catalog_changes(toptxn))
		{
			toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
			dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
		}
	}
}

/*
 * Return palloc'ed array of the transactions that have changed catalogs.
 * The returned array is sorted in xidComparator order.
 *
 * The caller must free the returned array when done with it.
 */
TransactionId *
ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
{
	dlist_iter	iter;
	TransactionId *xids = NULL;
	size_t		xcnt = 0;

	/* Quick return if the list is empty */
	if (dclist_count(&rb->catchange_txns) == 0)
		return xids;

	/* Initialize XID array */
	xids = (TransactionId *) palloc(sizeof(TransactionId) *
									dclist_count(&rb->catchange_txns));
	dclist_foreach(iter, &rb->catchange_txns)
	{
		ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
												 catchange_node,
												 iter.cur);

		Assert(rbtxn_has_catalog_changes(txn));

		xids[xcnt++] = txn->xid;
	}

	qsort(xids, xcnt, sizeof(TransactionId), xidComparator);

	Assert(xcnt == dclist_count(&rb->catchange_txns));
	return xids;
}

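/*
 * Since the result is sorted with xidComparator, a caller that also knows the
 * entry count can probe it with bsearch(), e.g. (hypothetical usage sketch):
 *
 *	TransactionId *xids = ReorderBufferGetCatalogChangesXacts(rb);
 *	Size		xcnt = dclist_count(&rb->catchange_txns);
 *	bool		found = xids != NULL &&
 *		bsearch(&xid, xids, xcnt, sizeof(TransactionId), xidComparator) != NULL;
 */
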
/*
 * Query whether a transaction is already *known* to contain catalog
 * changes. This can be wrong until directly before the commit!
 */
bool
ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
								false);
	if (txn == NULL)
		return false;

	return rbtxn_has_catalog_changes(txn);
}

/*
 * ReorderBufferXidHasBaseSnapshot
 *		Have we already set the base snapshot for the given txn/subtxn?
 */
bool
ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
{
	ReorderBufferTXN *txn;

	txn = ReorderBufferTXNByXid(rb, xid, false,
								NULL, InvalidXLogRecPtr, false);

	/* transaction isn't known yet, ergo no snapshot */
	if (txn == NULL)
		return false;

	/* a known subtxn? operate on top-level txn instead */
	if (rbtxn_is_known_subxact(txn))
		txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
									NULL, InvalidXLogRecPtr, false);

	return txn->base_snapshot != NULL;
}

/*
 * ---------------------------------------
 * Disk serialization support
 * ---------------------------------------
 */

/*
 * Ensure the IO buffer is >= sz.
 */
static void
ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
{
	if (!rb->outbufsize)
	{
		rb->outbuf = MemoryContextAlloc(rb->context, sz);
		rb->outbufsize = sz;
	}
	else if (rb->outbufsize < sz)
	{
		rb->outbuf = repalloc(rb->outbuf, sz);
		rb->outbufsize = sz;
	}
}

/* Compare two transactions by size */
static int
ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
{
	const ReorderBufferTXN *ta = pairingheap_const_container(ReorderBufferTXN, txn_node, a);
	const ReorderBufferTXN *tb = pairingheap_const_container(ReorderBufferTXN, txn_node, b);

	if (ta->size < tb->size)
		return -1;
	if (ta->size > tb->size)
		return 1;
	return 0;
}

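/*
 * Note on the comparator above: the pairing heap keeps the element that
 * compares greatest at the root, so with this size comparator
 * pairingheap_first() yields the largest transaction - exactly what
 * ReorderBufferLargestTXN below relies on.
 */
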
/*
 * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
 */
static ReorderBufferTXN *
ReorderBufferLargestTXN(ReorderBuffer *rb)
{
	ReorderBufferTXN *largest;

	/* Get the largest transaction from the max-heap */
	largest = pairingheap_container(ReorderBufferTXN, txn_node,
									pairingheap_first(rb->txn_heap));

	Assert(largest);
	Assert(largest->size > 0);
	Assert(largest->size <= rb->size);

	return largest;
}

/*
 * Find the largest streamable toplevel transaction to evict (by streaming).
 *
 * This can be seen as an optimized version of ReorderBufferLargestTXN, which
 * should give us the same transaction (because with streaming we don't update
 * the memory accounting for subtransactions, so it's always 0). But we can
 * simply iterate over the limited number of toplevel transactions that have a
 * base snapshot. There is no point in selecting a transaction that doesn't
 * have a base snapshot because we don't decode such transactions. Also, we do
 * not select a transaction that doesn't have any streamable change.
 *
 * Note that we skip transactions that contain incomplete changes. There
 * is a scope of optimization here such that we can select the largest
 * transaction which has incomplete changes. But that will make the code and
 * design quite complex and that might not be worth the benefit. If we plan to
 * stream the transactions that contain incomplete changes then we need to
 * find a way to partially stream/truncate the transaction changes in-memory
 * and build a mechanism to partially truncate the spilled files.
 * Additionally, whenever we partially stream the transaction we need to
 * maintain the last streamed lsn and next time we need to restore from that
 * segment and the offset in WAL. As we stream the changes from the top
 * transaction and restore them subtransaction wise, we need to even remember
 * the subxact from where we streamed the last change.
 */
static ReorderBufferTXN *
ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
{
	dlist_iter	iter;
	Size		largest_size = 0;
	ReorderBufferTXN *largest = NULL;

	/* Find the largest top-level transaction having a base snapshot. */
	dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
	{
		ReorderBufferTXN *txn;

		txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);

		/* must not be a subtxn */
		Assert(!rbtxn_is_known_subxact(txn));
		/* base_snapshot must be set */
		Assert(txn->base_snapshot != NULL);

		if ((largest == NULL || txn->total_size > largest_size) &&
			(txn->total_size > 0) && !(rbtxn_has_partial_change(txn)) &&
			rbtxn_has_streamable_change(txn))
		{
			largest = txn;
			largest_size = txn->total_size;
		}
	}

	return largest;
}

/*
 * Check whether the logical_decoding_work_mem limit was reached, and if yes
 * pick the largest (sub)transaction one at a time, evicting its changes to
 * disk or sending them to the output plugin, until we get back under the
 * memory limit.
 *
 * If debug_logical_replication_streaming is set to "immediate", stream or
 * serialize the changes immediately.
 *
 * XXX At this point we select transactions until we get back under the memory
 * limit, but we might also adopt a more elaborate eviction strategy - for
 * example evicting enough transactions to free a certain fraction (e.g. 50%)
 * of the memory limit.
 */
static void
ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
{
	ReorderBufferTXN *txn;

	/*
	 * Bail out if debug_logical_replication_streaming is buffered and we
	 * haven't exceeded the memory limit.
	 */
	if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED &&
		rb->size < logical_decoding_work_mem * 1024L)
		return;

	/*
	 * If debug_logical_replication_streaming is immediate, loop until there's
	 * no change. Otherwise, loop until we reach under the memory limit. One
	 * might think that just by evicting the largest (sub)transaction we will
	 * come under the memory limit based on assumption that the selected
	 * transaction is at least as large as the most recent change (which
	 * caused us to go over the memory limit). However, that is not true
	 * because a user can reduce the logical_decoding_work_mem to a smaller
	 * value before the most recent change.
	 */
	while (rb->size >= logical_decoding_work_mem * 1024L ||
		   (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
			rb->size > 0))
	{
		/*
		 * Pick the largest transaction and evict it from memory by streaming,
		 * if possible. Otherwise, spill to disk.
		 */
		if (ReorderBufferCanStartStreaming(rb) &&
			(txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
		{
			/* we know there has to be one, because the size is not zero */
			Assert(txn && rbtxn_is_toptxn(txn));
			Assert(txn->total_size > 0);
			Assert(rb->size >= txn->total_size);

			ReorderBufferStreamTXN(rb, txn);
		}
		else
		{
			/*
			 * Pick the largest transaction (or subtransaction) and evict it
			 * from memory by serializing it to disk.
			 */
			txn = ReorderBufferLargestTXN(rb);

			/* we know there has to be one, because the size is not zero */
			Assert(txn);
			Assert(txn->size > 0);
			Assert(rb->size >= txn->size);

			ReorderBufferSerializeTXN(rb, txn);
		}

		/*
		 * After eviction, the transaction should have no entries in memory,
		 * and should use 0 bytes for changes.
		 */
		Assert(txn->size == 0);
		Assert(txn->nentries_mem == 0);
	}

	/* We must be under the memory limit now. */
	Assert(rb->size < logical_decoding_work_mem * 1024L);
}

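/*
 * logical_decoding_work_mem is a GUC measured in kilobytes, hence the
 * "* 1024L" above. With the default of 64MB, the threshold works out to
 * 65536 kB * 1024 = 67108864 bytes, so eviction starts once rb->size
 * reaches 64MB worth of decoded changes.
 */
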
/*
 * Spill data of a large transaction (and its subtransactions) to disk.
 */
static void
ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
	dlist_iter	subtxn_i;
	dlist_mutable_iter change_i;
	int			fd = -1;
	XLogSegNo	curOpenSegNo = 0;
	Size		spilled = 0;
	Size		size = txn->size;

	elog(DEBUG2, "spill %u changes in XID %u to disk",
		 (uint32) txn->nentries_mem, txn->xid);

	/* do the same to all child TXs */
	dlist_foreach(subtxn_i, &txn->subtxns)
	{
		ReorderBufferTXN *subtxn;

		subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
		ReorderBufferSerializeTXN(rb, subtxn);
	}

	/* serialize changestream */
	dlist_foreach_modify(change_i, &txn->changes)
	{
		ReorderBufferChange *change;

		change = dlist_container(ReorderBufferChange, node, change_i.cur);

		/*
		 * store in segment in which it belongs by start lsn, don't split over
		 * multiple segments tho
		 */
		if (fd == -1 ||
			!XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
		{
			char		path[MAXPGPATH];

			if (fd != -1)
				CloseTransientFile(fd);

			XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);

			/*
			 * No need to care about TLIs here, only used during a single run,
			 * so each LSN only maps to a specific WAL record.
			 */
			ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
										curOpenSegNo);

			/* open segment, create it if necessary */
			fd = OpenTransientFile(path,
								   O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);

			if (fd < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\": %m", path)));
		}

		ReorderBufferSerializeChange(rb, txn, fd, change);
		dlist_delete(&change->node);
		ReorderBufferReturnChange(rb, change, false);

		spilled++;
	}

	/* Update the memory counter */
	ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);

	/* update the statistics iff we have spilled anything */
	if (spilled)
	{
		rb->spillCount += 1;
		rb->spillBytes += size;

		/* don't consider already serialized transactions */
		rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;

		/* update the decoding stats */
		UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
	}

	Assert(spilled == txn->nentries_mem);
	Assert(dlist_is_empty(&txn->changes));
	txn->nentries_mem = 0;
	txn->txn_flags |= RBTXN_IS_SERIALIZED;

	if (fd != -1)
		CloseTransientFile(fd);
}

/*
 * Serialize individual change to disk.
 */
static void
ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
                             int fd, ReorderBufferChange *change)
{
    ReorderBufferDiskChange *ondisk;
    Size        sz = sizeof(ReorderBufferDiskChange);

    ReorderBufferSerializeReserve(rb, sz);

    ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));

    switch (change->action)
    {
            /* fall through these, they're all similar enough */
        case REORDER_BUFFER_CHANGE_INSERT:
        case REORDER_BUFFER_CHANGE_UPDATE:
        case REORDER_BUFFER_CHANGE_DELETE:
        case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
            {
                char       *data;
                HeapTuple   oldtup,
                            newtup;
                Size        oldlen = 0;
                Size        newlen = 0;

                oldtup = change->data.tp.oldtuple;
                newtup = change->data.tp.newtuple;

                if (oldtup)
                {
                    sz += sizeof(HeapTupleData);
                    oldlen = oldtup->t_len;
                    sz += oldlen;
                }

                if (newtup)
                {
                    sz += sizeof(HeapTupleData);
                    newlen = newtup->t_len;
                    sz += newlen;
                }

                /* make sure we have enough space */
                ReorderBufferSerializeReserve(rb, sz);

                data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
                /* might have been reallocated above */
                ondisk = (ReorderBufferDiskChange *) rb->outbuf;

                if (oldlen)
                {
                    memcpy(data, oldtup, sizeof(HeapTupleData));
                    data += sizeof(HeapTupleData);

                    memcpy(data, oldtup->t_data, oldlen);
                    data += oldlen;
                }

                if (newlen)
                {
                    memcpy(data, newtup, sizeof(HeapTupleData));
                    data += sizeof(HeapTupleData);

                    memcpy(data, newtup->t_data, newlen);
                    data += newlen;
                }
                break;
            }
        case REORDER_BUFFER_CHANGE_MESSAGE:
            {
                char       *data;
                Size        prefix_size = strlen(change->data.msg.prefix) + 1;

                sz += prefix_size + change->data.msg.message_size +
                    sizeof(Size) + sizeof(Size);
                ReorderBufferSerializeReserve(rb, sz);

                data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);

                /* might have been reallocated above */
                ondisk = (ReorderBufferDiskChange *) rb->outbuf;

                /* write the prefix including the size */
                memcpy(data, &prefix_size, sizeof(Size));
                data += sizeof(Size);
                memcpy(data, change->data.msg.prefix,
                       prefix_size);
                data += prefix_size;

                /* write the message including the size */
                memcpy(data, &change->data.msg.message_size, sizeof(Size));
                data += sizeof(Size);
                memcpy(data, change->data.msg.message,
                       change->data.msg.message_size);
                data += change->data.msg.message_size;

                break;
            }
        case REORDER_BUFFER_CHANGE_INVALIDATION:
            {
                char       *data;
                Size        inval_size = sizeof(SharedInvalidationMessage) *
                    change->data.inval.ninvalidations;

                sz += inval_size;

                ReorderBufferSerializeReserve(rb, sz);
                data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);

                /* might have been reallocated above */
                ondisk = (ReorderBufferDiskChange *) rb->outbuf;
                memcpy(data, change->data.inval.invalidations, inval_size);
                data += inval_size;

                break;
            }
        case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
            {
                Snapshot    snap;
                char       *data;

                snap = change->data.snapshot;

                sz += sizeof(SnapshotData) +
                    sizeof(TransactionId) * snap->xcnt +
                    sizeof(TransactionId) * snap->subxcnt;

                /* make sure we have enough space */
                ReorderBufferSerializeReserve(rb, sz);
                data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
                /* might have been reallocated above */
                ondisk = (ReorderBufferDiskChange *) rb->outbuf;

                memcpy(data, snap, sizeof(SnapshotData));
                data += sizeof(SnapshotData);

                if (snap->xcnt)
                {
                    memcpy(data, snap->xip,
                           sizeof(TransactionId) * snap->xcnt);
                    data += sizeof(TransactionId) * snap->xcnt;
                }

                if (snap->subxcnt)
                {
                    memcpy(data, snap->subxip,
                           sizeof(TransactionId) * snap->subxcnt);
                    data += sizeof(TransactionId) * snap->subxcnt;
                }
                break;
            }
        case REORDER_BUFFER_CHANGE_TRUNCATE:
            {
                Size        size;
                char       *data;

                /* account for the OIDs of truncated relations */
                size = sizeof(Oid) * change->data.truncate.nrelids;
                sz += size;

                /* make sure we have enough space */
                ReorderBufferSerializeReserve(rb, sz);

                data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
                /* might have been reallocated above */
                ondisk = (ReorderBufferDiskChange *) rb->outbuf;

                memcpy(data, change->data.truncate.relids, size);
                data += size;

                break;
            }
        case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
        case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
        case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
        case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
            /* ReorderBufferChange contains everything important */
            break;
    }

    ondisk->size = sz;

    errno = 0;
    pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
    if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
    {
        int         save_errno = errno;

        CloseTransientFile(fd);

        /* if write didn't set errno, assume problem is no disk space */
        errno = save_errno ? save_errno : ENOSPC;
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not write to data file for XID %u: %m",
                        txn->xid)));
    }
    pgstat_report_wait_end();

    /*
     * Keep the transaction's final_lsn up to date with each change we send to
     * disk, so that ReorderBufferRestoreCleanup works correctly.  (We used to
     * only do this on commit and abort records, but that doesn't work if a
     * system crash leaves a transaction without its abort record).
     *
     * Make sure not to move it backwards.
     */
    if (txn->final_lsn < change->lsn)
        txn->final_lsn = change->lsn;

    Assert(ondisk->change.action == change->action);
}
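/*
 * Illustrative sketch (not part of the original file): the on-disk record
 * produced above is a fixed-size ReorderBufferDiskChange header followed by
 * a type-specific payload.  For an INSERT carrying only a new tuple the
 * layout is:
 *
 *   [ReorderBufferDiskChange][HeapTupleData][tuple data (t_len bytes)]
 *
 * The hypothetical helper below, compiled out via the made-up
 * REORDER_BUFFER_EXAMPLES guard, walks such a buffer to the start of the
 * tuple body, mirroring what ReorderBufferRestoreChange() does for this
 * case.
 */
#ifdef REORDER_BUFFER_EXAMPLES
static char *
example_tuple_payload(char *buf)
{
    /* skip the fixed-size header that is always written first */
    char       *data = buf + sizeof(ReorderBufferDiskChange);

    /* skip the HeapTupleData copy; the tuple body follows it */
    return data + sizeof(HeapTupleData);
}
#endif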
/* Returns true if the output plugin supports streaming, false otherwise. */
static inline bool
ReorderBufferCanStream(ReorderBuffer *rb)
{
    LogicalDecodingContext *ctx = rb->private_data;

    return ctx->streaming;
}
/* Returns true if streaming can be started now, false otherwise. */
static inline bool
ReorderBufferCanStartStreaming(ReorderBuffer *rb)
{
    LogicalDecodingContext *ctx = rb->private_data;
    SnapBuild  *builder = ctx->snapshot_builder;

    /* We can't start streaming unless a consistent state is reached. */
    if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
        return false;

    /*
     * We can't start streaming immediately even if the streaming is enabled
     * because we previously decoded this transaction and are now just
     * restarting.
     */
    if (ReorderBufferCanStream(rb) &&
        !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
        return true;

    return false;
}
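/*
 * Illustrative sketch (not part of the original file): a caller would
 * typically gate streaming on the check above, roughly (hypothetical use):
 *
 *  if (ReorderBufferCanStartStreaming(rb))
 *      ReorderBufferStreamTXN(rb, txn);
 *  else
 *      ReorderBufferSerializeTXN(rb, txn);
 *
 * In the actual flow this choice between streaming a transaction and
 * spilling it to disk is made when the configured memory limit is exceeded.
 */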
/*
 * Send data of a large transaction (and its subtransactions) to the
 * output plugin, but using the stream API.
 */
static void
ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
    Snapshot    snapshot_now;
    CommandId   command_id;
    Size        stream_bytes;
    bool        txn_is_streamed;

    /* We can never reach here for a subtransaction. */
    Assert(rbtxn_is_toptxn(txn));

    /*
     * We can't make any assumptions about base snapshot here, similar to what
     * ReorderBufferCommit() does.  That relies on base_snapshot getting
     * transferred from subxact in ReorderBufferCommitChild(), but that was
     * not yet called as the transaction is in-progress.
     *
     * So just walk the subxacts and use the same logic here. But we only need
     * to do that once, when the transaction is streamed for the first time.
     * After that we need to reuse the snapshot from the previous run.
     *
     * Unlike DecodeCommit which adds xids of all the subtransactions in
     * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
     * we do add them to subxip array instead via ReorderBufferCopySnap. This
     * allows the catalog changes made in subtransactions decoded till now to
     * be visible.
     */
    if (txn->snapshot_now == NULL)
    {
        dlist_iter  subxact_i;

        /* make sure this transaction is streamed for the first time */
        Assert(!rbtxn_is_streamed(txn));

        /* at the beginning we should have invalid command ID */
        Assert(txn->command_id == InvalidCommandId);

        dlist_foreach(subxact_i, &txn->subtxns)
        {
            ReorderBufferTXN *subtxn;

            subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
            ReorderBufferTransferSnapToParent(txn, subtxn);
        }

        /*
         * If this transaction has no snapshot, it didn't make any changes to
         * the database till now, so there's nothing to decode.
         */
        if (txn->base_snapshot == NULL)
        {
            Assert(txn->ninvalidations == 0);
            return;
        }

        command_id = FirstCommandId;
        snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
                                             txn, command_id);
    }
    else
    {
        /* the transaction must have been already streamed */
        Assert(rbtxn_is_streamed(txn));

        /*
         * We already have a snapshot from the previous streaming run. We
         * assume new subxacts can't move the LSN backwards, and so can't beat
         * the LSN condition in the previous branch (so no need to walk
         * through subxacts again). In fact, we must not do that as we may be
         * using snapshot half-way through the subxact.
         */
        command_id = txn->command_id;

        /*
         * We can't use txn->snapshot_now directly because after the last
         * streaming run, we might have got some new sub-transactions. So we
         * need to add them to the snapshot.
         */
        snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
                                             txn, command_id);

        /* Free the previously copied snapshot. */
        Assert(txn->snapshot_now->copied);
        ReorderBufferFreeSnap(rb, txn->snapshot_now);
        txn->snapshot_now = NULL;
    }

    /*
     * Remember this information to be used later to update stats. We can't
     * update the stats here as an error while processing the changes would
     * lead to the accumulation of stats even though we haven't streamed all
     * the changes.
     */
    txn_is_streamed = rbtxn_is_streamed(txn);
    stream_bytes = txn->total_size;

    /* Process and send the changes to output plugin. */
    ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
                            command_id, true);

    rb->streamCount += 1;
    rb->streamBytes += stream_bytes;

    /* Don't consider already streamed transaction. */
    rb->streamTxns += (txn_is_streamed) ? 0 : 1;

    /* update the decoding stats */
    UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);

    Assert(dlist_is_empty(&txn->changes));
    Assert(txn->nentries == 0);
    Assert(txn->nentries_mem == 0);
}
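/*
 * Illustrative sketch (not part of the original file) of the snapshot
 * handling above, per streaming run:
 *
 *   run 1: txn->snapshot_now == NULL
 *            -> transfer subxact base snapshots to the parent,
 *               copy txn->base_snapshot, start at FirstCommandId
 *   run N: txn->snapshot_now != NULL
 *            -> copy txn->snapshot_now (picking up new subxacts),
 *               resume at txn->command_id
 *
 * The streaming machinery is expected to save the snapshot and command ID
 * back into the transaction for the next run, which is why the freshly
 * copied snapshot can be freed here before reprocessing.
 */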
/*
 * Size of a change in memory.
 */
static Size
ReorderBufferChangeSize(ReorderBufferChange *change)
{
    Size        sz = sizeof(ReorderBufferChange);

    switch (change->action)
    {
            /* fall through these, they're all similar enough */
        case REORDER_BUFFER_CHANGE_INSERT:
        case REORDER_BUFFER_CHANGE_UPDATE:
        case REORDER_BUFFER_CHANGE_DELETE:
        case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
            {
                HeapTuple   oldtup,
                            newtup;
                Size        oldlen = 0;
                Size        newlen = 0;

                oldtup = change->data.tp.oldtuple;
                newtup = change->data.tp.newtuple;

                if (oldtup)
                {
                    sz += sizeof(HeapTupleData);
                    oldlen = oldtup->t_len;
                    sz += oldlen;
                }

                if (newtup)
                {
                    sz += sizeof(HeapTupleData);
                    newlen = newtup->t_len;
                    sz += newlen;
                }

                break;
            }
        case REORDER_BUFFER_CHANGE_MESSAGE:
            {
                Size        prefix_size = strlen(change->data.msg.prefix) + 1;

                sz += prefix_size + change->data.msg.message_size +
                    sizeof(Size) + sizeof(Size);

                break;
            }
        case REORDER_BUFFER_CHANGE_INVALIDATION:
            {
                sz += sizeof(SharedInvalidationMessage) *
                    change->data.inval.ninvalidations;

                break;
            }
        case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
            {
                Snapshot    snap;

                snap = change->data.snapshot;

                sz += sizeof(SnapshotData) +
                    sizeof(TransactionId) * snap->xcnt +
                    sizeof(TransactionId) * snap->subxcnt;

                break;
            }
        case REORDER_BUFFER_CHANGE_TRUNCATE:
            {
                sz += sizeof(Oid) * change->data.truncate.nrelids;

                break;
            }
        case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
        case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
        case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
        case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
            /* ReorderBufferChange contains everything important */
            break;
    }

    return sz;
}
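/*
 * Illustrative sketch (not part of the original file): the size computed
 * above feeds the memory accounting in both directions, e.g. (hypothetical
 * pairing):
 *
 *  Size sz = ReorderBufferChangeSize(change);
 *
 *  ReorderBufferChangeMemoryUpdate(rb, change, NULL, true, sz);   // queue
 *  ...
 *  ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, sz);  // release
 *
 * Using the same function on both sides keeps the per-transaction and
 * buffer-wide counters from under- or overflowing.
 */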
/*
 * Restore a number of changes spilled to disk back into memory.
 */
static Size
ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
                            TXNEntryFile *file, XLogSegNo *segno)
{
    Size        restored = 0;
    XLogSegNo   last_segno;
    dlist_mutable_iter cleanup_iter;
    File       *fd = &file->vfd;

    Assert(txn->first_lsn != InvalidXLogRecPtr);
    Assert(txn->final_lsn != InvalidXLogRecPtr);

    /* free current entries, so we have memory for more */
    dlist_foreach_modify(cleanup_iter, &txn->changes)
    {
        ReorderBufferChange *cleanup =
            dlist_container(ReorderBufferChange, node, cleanup_iter.cur);

        dlist_delete(&cleanup->node);
        ReorderBufferReturnChange(rb, cleanup, true);
    }
    txn->nentries_mem = 0;
    Assert(dlist_is_empty(&txn->changes));

    XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);

    while (restored < max_changes_in_memory && *segno <= last_segno)
    {
        int         readBytes;
        ReorderBufferDiskChange *ondisk;

        CHECK_FOR_INTERRUPTS();

        if (*fd == -1)
        {
            char        path[MAXPGPATH];

            /* first time in */
            if (*segno == 0)
                XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);

            Assert(*segno != 0 || dlist_is_empty(&txn->changes));

            /*
             * No need to care about TLIs here, only used during a single run,
             * so each LSN only maps to a specific WAL record.
             */
            ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
                                        *segno);

            *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);

            /* No harm in resetting the offset even in case of failure */
            file->curOffset = 0;

            if (*fd < 0 && errno == ENOENT)
            {
                *fd = -1;
                (*segno)++;
                continue;
            }
            else if (*fd < 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not open file \"%s\": %m",
                                path)));
        }

        /*
         * Read the statically sized part of a change which has information
         * about the total size. If we couldn't read a record, we're at the
         * end of this file.
         */
        ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
        readBytes = FileRead(file->vfd, rb->outbuf,
                             sizeof(ReorderBufferDiskChange),
                             file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);

        /* eof */
        if (readBytes == 0)
        {
            FileClose(*fd);
            *fd = -1;
            (*segno)++;
            continue;
        }
        else if (readBytes < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read from reorderbuffer spill file: %m")));
        else if (readBytes != sizeof(ReorderBufferDiskChange))
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
                            readBytes,
                            (uint32) sizeof(ReorderBufferDiskChange))));

        file->curOffset += readBytes;

        ondisk = (ReorderBufferDiskChange *) rb->outbuf;

        ReorderBufferSerializeReserve(rb,
                                      sizeof(ReorderBufferDiskChange) + ondisk->size);
        ondisk = (ReorderBufferDiskChange *) rb->outbuf;

        readBytes = FileRead(file->vfd,
                             rb->outbuf + sizeof(ReorderBufferDiskChange),
                             ondisk->size - sizeof(ReorderBufferDiskChange),
                             file->curOffset,
                             WAIT_EVENT_REORDER_BUFFER_READ);

        if (readBytes < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read from reorderbuffer spill file: %m")));
        else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
                            readBytes,
                            (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));

        file->curOffset += readBytes;

        /*
         * ok, read a full change from disk, now restore it into proper
         * in-memory format
         */
        ReorderBufferRestoreChange(rb, txn, rb->outbuf);
        restored++;
    }

    return restored;
}
/*
 * Convert change from its on-disk format to in-memory format and queue it onto
 * the TXN's ->changes list.
 *
 * Note: although "data" is declared char*, at entry it points to a
 * maxalign'd buffer, making it safe in most of this function to assume
 * that the pointed-to data is suitably aligned for direct access.
 */
static void
ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
                           char *data)
{
    ReorderBufferDiskChange *ondisk;
    ReorderBufferChange *change;

    ondisk = (ReorderBufferDiskChange *) data;

    change = ReorderBufferGetChange(rb);

    /* copy static part */
    memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));

    data += sizeof(ReorderBufferDiskChange);

    /* restore individual stuff */
    switch (change->action)
    {
            /* fall through these, they're all similar enough */
        case REORDER_BUFFER_CHANGE_INSERT:
        case REORDER_BUFFER_CHANGE_UPDATE:
        case REORDER_BUFFER_CHANGE_DELETE:
        case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
            if (change->data.tp.oldtuple)
            {
                uint32      tuplelen = ((HeapTuple) data)->t_len;

                change->data.tp.oldtuple =
                    ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);

                /* restore ->tuple */
                memcpy(change->data.tp.oldtuple, data,
                       sizeof(HeapTupleData));
                data += sizeof(HeapTupleData);

                /* reset t_data pointer into the new tuplebuf */
                change->data.tp.oldtuple->t_data =
                    (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);

                /* restore tuple data itself */
                memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
                data += tuplelen;
            }

            if (change->data.tp.newtuple)
            {
                /* here, data might not be suitably aligned! */
                uint32      tuplelen;

                memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
                       sizeof(uint32));

                change->data.tp.newtuple =
                    ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);

                /* restore ->tuple */
                memcpy(change->data.tp.newtuple, data,
                       sizeof(HeapTupleData));
                data += sizeof(HeapTupleData);

                /* reset t_data pointer into the new tuplebuf */
                change->data.tp.newtuple->t_data =
                    (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);

                /* restore tuple data itself */
                memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
                data += tuplelen;
            }

            break;
        case REORDER_BUFFER_CHANGE_MESSAGE:
            {
                Size        prefix_size;

                /* read prefix */
                memcpy(&prefix_size, data, sizeof(Size));
                data += sizeof(Size);
                change->data.msg.prefix = MemoryContextAlloc(rb->context,
                                                             prefix_size);
                memcpy(change->data.msg.prefix, data, prefix_size);
                Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
                data += prefix_size;

                /* read the message */
                memcpy(&change->data.msg.message_size, data, sizeof(Size));
                data += sizeof(Size);
                change->data.msg.message = MemoryContextAlloc(rb->context,
                                                              change->data.msg.message_size);
                memcpy(change->data.msg.message, data,
                       change->data.msg.message_size);
                data += change->data.msg.message_size;

                break;
            }
        case REORDER_BUFFER_CHANGE_INVALIDATION:
            {
                Size        inval_size = sizeof(SharedInvalidationMessage) *
                    change->data.inval.ninvalidations;

                change->data.inval.invalidations =
                    MemoryContextAlloc(rb->context, inval_size);

                /* read the message */
                memcpy(change->data.inval.invalidations, data, inval_size);

                break;
            }
        case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
            {
                Snapshot    oldsnap;
                Snapshot    newsnap;
                Size        size;

                oldsnap = (Snapshot) data;

                size = sizeof(SnapshotData) +
                    sizeof(TransactionId) * oldsnap->xcnt +
                    sizeof(TransactionId) * (oldsnap->subxcnt + 0);

                change->data.snapshot = MemoryContextAllocZero(rb->context, size);

                newsnap = change->data.snapshot;

                memcpy(newsnap, data, size);
                newsnap->xip = (TransactionId *)
                    (((char *) newsnap) + sizeof(SnapshotData));
                newsnap->subxip = newsnap->xip + newsnap->xcnt;
                newsnap->copied = true;
                break;
            }
            /* the base struct contains all the data, easy peasy */
        case REORDER_BUFFER_CHANGE_TRUNCATE:
            {
                Oid        *relids;

                relids = ReorderBufferGetRelids(rb,
                                                change->data.truncate.nrelids);
                memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
                change->data.truncate.relids = relids;

                break;
            }
        case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
        case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
        case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
        case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
            break;
    }

    dlist_push_tail(&txn->changes, &change->node);
    txn->nentries_mem++;

    /*
     * Update memory accounting for the restored change.  We need to do this
     * although we don't check the memory limit when restoring the changes in
     * this branch (we only do that when initially queueing the changes after
     * decoding), because we will release the changes later, and that will
     * update the accounting too (subtracting the size from the counters). And
     * we don't want to underflow there.
     */
    ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
                                    ReorderBufferChangeSize(change));
}
/*
 * Remove all on-disk data stored for the passed-in transaction.
 */
static void
ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
    XLogSegNo   first;
    XLogSegNo   cur;
    XLogSegNo   last;

    Assert(txn->first_lsn != InvalidXLogRecPtr);
    Assert(txn->final_lsn != InvalidXLogRecPtr);

    XLByteToSeg(txn->first_lsn, first, wal_segment_size);
    XLByteToSeg(txn->final_lsn, last, wal_segment_size);

    /* iterate over all possible filenames, and delete them */
    for (cur = first; cur <= last; cur++)
    {
        char        path[MAXPGPATH];

        ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
        if (unlink(path) != 0 && errno != ENOENT)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not remove file \"%s\": %m", path)));
    }
}
/*
 * Remove any leftover serialized reorder buffers from a slot directory after a
 * prior crash or decoding session exit.
 */
static void
ReorderBufferCleanupSerializedTXNs(const char *slotname)
{
    DIR        *spill_dir;
    struct dirent *spill_de;
    struct stat statbuf;
    char        path[MAXPGPATH * 2 + sizeof(PG_REPLSLOT_DIR)];

    sprintf(path, "%s/%s", PG_REPLSLOT_DIR, slotname);

    /* we're only handling directories here, skip if it's not ours */
    if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
        return;

    spill_dir = AllocateDir(path);
    while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
    {
        /* only look at names that can be ours */
        if (strncmp(spill_de->d_name, "xid", 3) == 0)
        {
            snprintf(path, sizeof(path),
                     "%s/%s/%s", PG_REPLSLOT_DIR, slotname,
                     spill_de->d_name);

            if (unlink(path) != 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not remove file \"%s\" during removal of %s/%s/xid*: %m",
                                path, PG_REPLSLOT_DIR, slotname)));
        }
    }
    FreeDir(spill_dir);
}
/*
 * Given a replication slot, transaction ID and segment number, fill in the
 * corresponding spill file into 'path', which is a caller-owned buffer of size
 * at least MAXPGPATH.
 */
static void
ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
                            XLogSegNo segno)
{
    XLogRecPtr  recptr;

    XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);

    snprintf(path, MAXPGPATH, "%s/%s/xid-%u-lsn-%X-%X.spill",
             PG_REPLSLOT_DIR,
             NameStr(MyReplicationSlot->data.name),
             xid, LSN_FORMAT_ARGS(recptr));
}
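/*
 * Illustrative sketch (not part of the original file): with a 16MB
 * wal_segment_size, xid 1234 and segno 3, the function above produces a
 * path like
 *
 *  pg_replslot/<slotname>/xid-1234-lsn-0-3000000.spill
 *
 * i.e. the segment start LSN (segno * wal_segment_size = 0x3000000) is
 * formatted as the usual %X/%X pair, with the slash replaced by a dash so
 * the result is a valid filename.
 */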
/*
 * Delete all data spilled to disk after we've restarted/crashed. It will be
 * recreated when the respective slots are reused.
 */
void
StartupReorderBuffer(void)
{
    DIR        *logical_dir;
    struct dirent *logical_de;

    logical_dir = AllocateDir(PG_REPLSLOT_DIR);
    while ((logical_de = ReadDir(logical_dir, PG_REPLSLOT_DIR)) != NULL)
    {
        if (strcmp(logical_de->d_name, ".") == 0 ||
            strcmp(logical_de->d_name, "..") == 0)
            continue;

        /* if it cannot be a slot, skip the directory */
        if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
            continue;

        /*
         * ok, has to be a surviving logical slot, iterate and delete
         * everything starting with xid-*
         */
        ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
    }
    FreeDir(logical_dir);
}
/* ---------------------------------------
 * toast reassembly support
 * ---------------------------------------
 */

/*
 * Initialize per tuple toast reconstruction support.
 */
static void
ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
    HASHCTL     hash_ctl;

    Assert(txn->toast_hash == NULL);

    hash_ctl.keysize = sizeof(Oid);
    hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
    hash_ctl.hcxt = rb->context;
    txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
                                  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
}
/*
 * Per toast-chunk handling for toast reconstruction
 *
 * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
 * toasted Datum comes along.
 */
static void
ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
                              Relation relation, ReorderBufferChange *change)
{
    ReorderBufferToastEnt *ent;
    HeapTuple   newtup;
    bool        found;
    int32       chunksize;
    bool        isnull;
    Pointer     chunk;
    TupleDesc   desc = RelationGetDescr(relation);
    Oid         chunk_id;
    int32       chunk_seq;

    if (txn->toast_hash == NULL)
        ReorderBufferToastInitHash(rb, txn);

    Assert(IsToastRelation(relation));

    newtup = change->data.tp.newtuple;
    chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
    Assert(!isnull);
    chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
    Assert(!isnull);

    ent = (ReorderBufferToastEnt *)
        hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);

    if (!found)
    {
        Assert(ent->chunk_id == chunk_id);
        ent->num_chunks = 0;
        ent->last_chunk_seq = 0;
        ent->size = 0;
        ent->reconstructed = NULL;
        dlist_init(&ent->chunks);

        if (chunk_seq != 0)
            elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
                 chunk_seq, chunk_id);
    }
    else if (found && chunk_seq != ent->last_chunk_seq + 1)
        elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
             chunk_seq, chunk_id, ent->last_chunk_seq + 1);

    chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
    Assert(!isnull);

    /* calculate size so we can allocate the right size at once later */
    if (!VARATT_IS_EXTENDED(chunk))
        chunksize = VARSIZE(chunk) - VARHDRSZ;
    else if (VARATT_IS_SHORT(chunk))
        /* could happen due to heap_form_tuple doing its thing */
        chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
    else
        elog(ERROR, "unexpected type of toast chunk");

    ent->size += chunksize;
    ent->last_chunk_seq = chunk_seq;
    ent->num_chunks++;
    dlist_push_tail(&ent->chunks, &change->node);
}
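/*
 * Illustrative sketch (not part of the original file): for a datum stored as
 * three toast chunks the function above is called once per chunk and
 * enforces a dense sequence (hypothetical chunk_id):
 *
 *  chunk_id 16385, chunk_seq 0   -> new hash entry, size += chunk 0
 *  chunk_id 16385, chunk_seq 1   -> size += chunk 1
 *  chunk_id 16385, chunk_seq 2   -> size += chunk 2
 *
 * Any gap in chunk_seq raises an ERROR, since a partial chunk list could not
 * be stitched back together by ReorderBufferToastReplace().
 */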
/*
 * Rejigger change->newtuple to point to in-memory toast tuples instead of
 * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
 *
 * We cannot replace unchanged toast tuples though, so those will still point
 * to on-disk toast data.
 *
 * While updating the existing change with detoasted tuple data, we need to
 * update the memory accounting info, because the change size will differ.
 * Otherwise the accounting may get out of sync, triggering serialization
 * at unexpected times.
 *
 * We simply subtract size of the change before rejiggering the tuple, and
 * then add the new size. This makes it look like the change was removed
 * and then added back, except it only tweaks the accounting info.
 *
 * In particular it can't trigger serialization, which would be pointless
 * anyway as it happens during commit processing right before handing
 * the change to the output plugin.
 */
static void
ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
                          Relation relation, ReorderBufferChange *change)
{
    TupleDesc   desc;
    int         natt;
    Datum      *attrs;
    bool       *isnull;
    bool       *free;
    HeapTuple   tmphtup;
    Relation    toast_rel;
    TupleDesc   toast_desc;
    MemoryContext oldcontext;
    HeapTuple   newtup;
    Size        old_size;

    /* no toast tuples changed */
    if (txn->toast_hash == NULL)
        return;

    /*
     * We're going to modify the size of the change. So, to make sure the
     * accounting is correct we record the current change size and then after
     * re-computing the change we'll subtract the recorded size and then
     * re-add the new change size at the end. We don't immediately subtract
     * the old size because if there is any error before we add the new size,
     * we will release the changes and that will update the accounting info
     * (subtracting the size from the counters). And we don't want to
     * underflow there.
     */
    old_size = ReorderBufferChangeSize(change);

    oldcontext = MemoryContextSwitchTo(rb->context);

    /* we should only have toast tuples in an INSERT or UPDATE */
    Assert(change->data.tp.newtuple);

    desc = RelationGetDescr(relation);

    toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
    if (!RelationIsValid(toast_rel))
        elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
             relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));

    toast_desc = RelationGetDescr(toast_rel);

    /* should we allocate from stack instead? */
    attrs = palloc0(sizeof(Datum) * desc->natts);
    isnull = palloc0(sizeof(bool) * desc->natts);
    free = palloc0(sizeof(bool) * desc->natts);

    newtup = change->data.tp.newtuple;

    heap_deform_tuple(newtup, desc, attrs, isnull);

    for (natt = 0; natt < desc->natts; natt++)
    {
        Form_pg_attribute attr = TupleDescAttr(desc, natt);
        ReorderBufferToastEnt *ent;
        struct varlena *varlena;

        /* va_rawsize is the size of the original datum -- including header */
        struct varatt_external toast_pointer;
        struct varatt_indirect redirect_pointer;
        struct varlena *new_datum = NULL;
        struct varlena *reconstructed;
        dlist_iter  it;
        Size        data_done = 0;

        /* system columns aren't toasted */
        if (attr->attnum < 0)
            continue;

        if (attr->attisdropped)
            continue;

        /* not a varlena datatype */
        if (attr->attlen != -1)
            continue;

        /* no data */
        if (isnull[natt])
            continue;

        /* ok, we know we have a toast datum */
        varlena = (struct varlena *) DatumGetPointer(attrs[natt]);

        /* no need to do anything if the tuple isn't external */
        if (!VARATT_IS_EXTERNAL(varlena))
            continue;

        VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);

        /*
         * Check whether the toast tuple changed, replace if so.
         */
        ent = (ReorderBufferToastEnt *)
            hash_search(txn->toast_hash,
                        &toast_pointer.va_valueid,
                        HASH_FIND,
                        NULL);
        if (ent == NULL)
            continue;

        new_datum =
            (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);

        free[natt] = true;

        reconstructed = palloc0(toast_pointer.va_rawsize);

        ent->reconstructed = reconstructed;

        /* stitch toast tuple back together from its parts */
        dlist_foreach(it, &ent->chunks)
        {
            bool        cisnull;
            ReorderBufferChange *cchange;
            HeapTuple   ctup;
            Pointer     chunk;

            cchange = dlist_container(ReorderBufferChange, node, it.cur);
            ctup = cchange->data.tp.newtuple;
            chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));

            Assert(!cisnull);
            Assert(!VARATT_IS_EXTERNAL(chunk));
            Assert(!VARATT_IS_SHORT(chunk));

            memcpy(VARDATA(reconstructed) + data_done,
                   VARDATA(chunk),
                   VARSIZE(chunk) - VARHDRSZ);
            data_done += VARSIZE(chunk) - VARHDRSZ;
        }
        Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));

        /* make sure it's marked as compressed or not */
        if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
            SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
        else
            SET_VARSIZE(reconstructed, data_done + VARHDRSZ);

        memset(&redirect_pointer, 0, sizeof(redirect_pointer));
        redirect_pointer.pointer = reconstructed;

        SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
        memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
               sizeof(redirect_pointer));

        attrs[natt] = PointerGetDatum(new_datum);
    }

    /*
     * Build tuple in separate memory & copy tuple back into the tuplebuf
     * passed to the output plugin. We can't directly heap_fill_tuple() into
     * the tuplebuf because attrs[] will point back into the current content.
     */
    tmphtup = heap_form_tuple(desc, attrs, isnull);
    Assert(newtup->t_len <= MaxHeapTupleSize);
    Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));

    memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
    newtup->t_len = tmphtup->t_len;

    /*
     * free resources we won't further need, more persistent stuff will be
     * free'd in ReorderBufferToastReset().
     */
    RelationClose(toast_rel);
    pfree(tmphtup);
    for (natt = 0; natt < desc->natts; natt++)
    {
        if (free[natt])
            pfree(DatumGetPointer(attrs[natt]));
    }
    pfree(attrs);
    pfree(free);
    pfree(isnull);

    MemoryContextSwitchTo(oldcontext);

    /* subtract the old change size */
    ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
    /* now add the change back, with the correct size */
    ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
                                    ReorderBufferChangeSize(change));
}
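/*
 * Illustrative sketch (not part of the original file): after the rejigger
 * above, a toasted attribute no longer points at on-disk toast data but at
 * an in-memory indirect datum:
 *
 *  before:  attrs[natt] -> varatt_external  (valueid in the toast table)
 *  after:   attrs[natt] -> varatt_indirect -> reconstructed varlena
 *
 * Consumers detoast this through the normal VARATT_IS_EXTERNAL_INDIRECT
 * path, so output plugins see the full value without any toast-table
 * access.
 */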
/*
 * Free all resources allocated for toast reconstruction.
 */
static void
ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
    HASH_SEQ_STATUS hstat;
    ReorderBufferToastEnt *ent;

    if (txn->toast_hash == NULL)
        return;

    /* sequentially walk over the hash and free everything */
    hash_seq_init(&hstat, txn->toast_hash);
    while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
    {
        dlist_mutable_iter it;

        if (ent->reconstructed != NULL)
            pfree(ent->reconstructed);

        dlist_foreach_modify(it, &ent->chunks)
        {
            ReorderBufferChange *change =
                dlist_container(ReorderBufferChange, node, it.cur);

            dlist_delete(&change->node);
            ReorderBufferReturnChange(rb, change, true);
        }
    }

    hash_destroy(txn->toast_hash);
    txn->toast_hash = NULL;
}
/* ---------------------------------------
 * Visibility support for logical decoding
 *
 *
 * Lookup actual cmin/cmax values when using decoding snapshot. We can't
 * always rely on stored cmin/cmax values because of two scenarios:
 *
 * * A tuple got changed multiple times during a single transaction and thus
 *   has got a combo CID. Combo CIDs are only valid for the duration of a
 *   single transaction.
 * * A tuple with a cmin but no cmax (and thus no combo CID) got
 *   deleted/updated in a transaction other than the one that created it,
 *   and it is the creating transaction we are looking at right now. As only
 *   one of cmin, cmax or combo CID is actually stored in the heap we don't
 *   have access to the value we need anymore.
 *
 * To resolve those problems we have a per-transaction hash of (cmin,
 * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
 * (cmin, cmax) values. That also takes care of combo CIDs by simply
 * not caring about them at all. As we have the real cmin/cmax values
 * combo CIDs aren't interesting.
 *
 * As we only care about catalog tuples here the overhead of this
 * hashtable should be acceptable.
 *
 * Heap rewrites complicate this a bit, check rewriteheap.c for details.
 * -------------------------------------------------------------------------
 */

/* struct for sorting mapping files by LSN efficiently */
typedef struct RewriteMappingFile
{
    XLogRecPtr  lsn;
    char        fname[MAXPGPATH];
} RewriteMappingFile;
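/*
 * Illustrative sketch (not part of the original file): the per-transaction
 * hash described above maps
 *
 *  (relfilelocator, ctid)  ->  (cmin, cmax, combocid)
 *
 * via ReorderBufferTupleCidKey/ReorderBufferTupleCidEnt (defined earlier in
 * this file).  A lookup therefore zeroes the key first (to avoid padding
 * issues), fills in rlocator and tid, and probes with HASH_FIND, exactly as
 * ResolveCminCmaxDuringDecoding() does below.
 */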
#ifdef NOT_USED
static void
DisplayMapping(HTAB *tuplecid_data)
{
    HASH_SEQ_STATUS hstat;
    ReorderBufferTupleCidEnt *ent;

    hash_seq_init(&hstat, tuplecid_data);
    while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
    {
        elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
             ent->key.rlocator.dbOid,
             ent->key.rlocator.spcOid,
             ent->key.rlocator.relNumber,
             ItemPointerGetBlockNumber(&ent->key.tid),
             ItemPointerGetOffsetNumber(&ent->key.tid),
             ent->cmin,
             ent->cmax
            );
    }
}
#endif
/*
 * Apply a single mapping file to tuplecid_data.
 *
 * The mapping file has to have been verified to be a) committed b) for our
 * transaction c) applied in LSN order.
 */
static void
ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
{
    char        path[MAXPGPATH];
    int         fd;
    int         readBytes;
    LogicalRewriteMappingData map;

    sprintf(path, "%s/%s", PG_LOGICAL_MAPPINGS_DIR, fname);
    fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));

    while (true)
    {
        ReorderBufferTupleCidKey key;
        ReorderBufferTupleCidEnt *ent;
        ReorderBufferTupleCidEnt *new_ent;
        bool        found;

        /* be careful about padding */
        memset(&key, 0, sizeof(ReorderBufferTupleCidKey));

        /* read all mappings till the end of the file */
        pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
        readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
        pgstat_report_wait_end();

        if (readBytes < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read file \"%s\": %m",
                            path)));
        else if (readBytes == 0)    /* EOF */
            break;
        else if (readBytes != sizeof(LogicalRewriteMappingData))
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read from file \"%s\": read %d instead of %d bytes",
                            path, readBytes,
                            (int32) sizeof(LogicalRewriteMappingData))));

        key.rlocator = map.old_locator;
        ItemPointerCopy(&map.old_tid,
                        &key.tid);

        ent = (ReorderBufferTupleCidEnt *)
            hash_search(tuplecid_data, &key, HASH_FIND, NULL);

        /* no existing mapping, no need to update */
        if (!ent)
            continue;

        key.rlocator = map.new_locator;
        ItemPointerCopy(&map.new_tid,
                        &key.tid);

        new_ent = (ReorderBufferTupleCidEnt *)
            hash_search(tuplecid_data, &key, HASH_ENTER, &found);

        if (found)
        {
            /*
             * Make sure the existing mapping makes sense. We sometimes update
             * old records that did not yet have a cmax (e.g. pg_class' own
             * entry while rewriting it) during rewrites, so allow that.
             */
            Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
            Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
        }

        /* update mapping */
        new_ent->cmin = ent->cmin;
        new_ent->cmax = ent->cmax;
        new_ent->combocid = ent->combocid;
    }

    if (CloseTransientFile(fd) != 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not close file \"%s\": %m", path)));
}
/*
 * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
 */
static bool
TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
{
    return bsearch(&xid, xip, num,
                   sizeof(TransactionId), xidComparator) != NULL;
}
/*
 * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
 */
static int
file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
{
    RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
    RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);

    return pg_cmp_u64(a->lsn, b->lsn);
}
/*
 * Apply any existing logical remapping files if there are any targeted at our
 * transaction for relid.
 */
static void
UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
{
    DIR        *mapping_dir;
    struct dirent *mapping_de;
    List       *files = NIL;
    ListCell   *file;
    Oid         dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;

    mapping_dir = AllocateDir(PG_LOGICAL_MAPPINGS_DIR);
    while ((mapping_de = ReadDir(mapping_dir, PG_LOGICAL_MAPPINGS_DIR)) != NULL)
    {
        Oid         f_dboid;
        Oid         f_relid;
        TransactionId f_mapped_xid;
        TransactionId f_create_xid;
        XLogRecPtr  f_lsn;
        uint32      f_hi,
                    f_lo;
        RewriteMappingFile *f;

        if (strcmp(mapping_de->d_name, ".") == 0 ||
            strcmp(mapping_de->d_name, "..") == 0)
            continue;

        /* Ignore files that aren't ours */
        if (strncmp(mapping_de->d_name, "map-", 4) != 0)
            continue;

        if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
                   &f_dboid, &f_relid, &f_hi, &f_lo,
                   &f_mapped_xid, &f_create_xid) != 6)
            elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);

        f_lsn = ((uint64) f_hi) << 32 | f_lo;

        /* mapping for another database */
        if (f_dboid != dboid)
            continue;

        /* mapping for another relation */
        if (f_relid != relid)
            continue;

        /* did the creating transaction abort? */
        if (!TransactionIdDidCommit(f_create_xid))
            continue;

        /* not for our transaction */
        if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
            continue;

        /* ok, relevant, queue for apply */
        f = palloc(sizeof(RewriteMappingFile));
        f->lsn = f_lsn;
        strcpy(f->fname, mapping_de->d_name);
        files = lappend(files, f);
    }
    FreeDir(mapping_dir);

    /* sort files so we apply them in LSN order */
    list_sort(files, file_sort_by_lsn);

    foreach(file, files)
    {
        RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);

        elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
             snapshot->subxip[0]);
        ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
        pfree(f);
    }
}
/*
 * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
 * combo CIDs.
 */
bool
ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
                              Snapshot snapshot,
                              HeapTuple htup, Buffer buffer,
                              CommandId *cmin, CommandId *cmax)
{
    ReorderBufferTupleCidKey key;
    ReorderBufferTupleCidEnt *ent;
    ForkNumber  forkno;
    BlockNumber blockno;
    bool        updated_mapping = false;

    /*
     * Return unresolved if tuplecid_data is not valid. That's because when
     * streaming in-progress transactions we may run into tuples with the CID
     * before actually decoding them. Think e.g. about INSERT followed by
     * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
     * INSERT. So in such cases, we assume the CID is from the future
     * command.
     */
    if (tuplecid_data == NULL)
        return false;

    /* be careful about padding */
    memset(&key, 0, sizeof(key));

    Assert(!BufferIsLocal(buffer));

    /*
     * get relfilelocator from the buffer, no convenient way to access it
     * other than that.
     */
    BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);

    /* tuples can only be in the main fork */
    Assert(forkno == MAIN_FORKNUM);
    Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));

    ItemPointerCopy(&htup->t_self,
                    &key.tid);

restart:
    ent = (ReorderBufferTupleCidEnt *)
        hash_search(tuplecid_data, &key, HASH_FIND, NULL);

    /*
     * failed to find a mapping, check whether the table was rewritten and
     * apply mapping if so, but only do that once - there can be no new
     * mappings while we are in here since we have to hold a lock on the
     * relation.
     */
    if (ent == NULL && !updated_mapping)
    {
        UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
        /* now check but don't update for a mapping again */
        updated_mapping = true;
        goto restart;
    }
    else if (ent == NULL)