1 /*-------------------------------------------------------------------------
4 * header file for postgres btree xlog routines
6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
9 * src/include/access/nbtxlog.h
11 *-------------------------------------------------------------------------
16 #include "access/transam.h"
17 #include "access/xlogreader.h"
18 #include "lib/stringinfo.h"
19 #include "storage/off.h"
22 * XLOG records for btree operations
24 * XLOG allows storing some information in the high 4 bits of the log
25 * record's xl_info field
/*
 * Btree WAL record type codes.
 *
 * Every code has its low 4 bits clear (consecutive codes differ by 0x10), so
 * a code fits entirely in the high 4 bits of the record's xl_info field.
 */
#define XLOG_BTREE_INSERT_LEAF	0x00	/* add index tuple without split */
#define XLOG_BTREE_INSERT_UPPER 0x10	/* same, on a non-leaf page */
#define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
#define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
#define XLOG_BTREE_INSERT_POST	0x50	/* add index tuple with posting split */
#define XLOG_BTREE_DEDUP		0x60	/* deduplicate tuples for a page */
#define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
#define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
#define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
#define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0	/* mark a leaf as half-dead */
#define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
										 * vacuum */
#define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
										 * FSM */
#define XLOG_BTREE_META_CLEANUP 0xE0	/* update cleanup-related data in the
										 * metapage */
47 * All that we need to regenerate the meta-data page
49 typedef struct xl_btree_metadata
56 uint32 last_cleanup_num_delpages
;
61 * This is what we need to know about simple (without split) insert.
63 * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
64 * INSERT_POST. Note that INSERT_META and INSERT_UPPER imply it's not a
65 * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf page.
68 * Backup Blk 0: original page
69 * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
70 * Backup Blk 2: xl_btree_metadata, if INSERT_META
72 * Note: The new tuple is actually the "original" new item in the posting
73 * list split insert case (i.e. the INSERT_POST case). A split offset for
74 * the posting list is logged before the original new item. Recovery needs
75 * both, since it must do an in-place update of the existing posting list
76 * that was split as an extra step. Also, recovery generates a "final"
77 * newitem. See _bt_swap_posting() for details on posting list splits.
79 typedef struct xl_btree_insert
83 /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
84 /* NEW TUPLE ALWAYS FOLLOWS AT THE END */
87 #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
90 * On insert with split, we save all the items going into the right sibling
91 * so that we can restore it completely from the log record. This way takes
92 * less xlog space than the normal approach, because if we did it standardly,
93 * XLogInsert would almost always think the right page is new and store its
94 * whole page image. The left page, however, is handled in the normal
95 * incremental-update fashion.
97 * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
98 * There are two variants to indicate whether the inserted tuple went into the
99 * left or right split page (and thus, whether the new item is stored or not).
100 * We always log the left page high key because suffix truncation can generate
101 * a new leaf high key using user-defined code. This is also necessary on
102 * internal pages, since the firstright item that the left page's high key was
103 * based on will have been truncated to zero attributes in the right page (the
104 * separator key is unavailable from the right page).
106 * Backup Blk 0: original page / new left page
108 * The left page's data portion contains the new item, if it's the _L variant.
109 * _R variant split records generally do not have a newitem (_R variant leaf
110 * page split records that must deal with a posting list split will include an
111 * explicit newitem, though it is never used on the right page -- it is
112 * actually an orignewitem needed to update existing posting list). The new
113 * high key of the left/original page appears last of all (and must always be present).
116 * Page split records that need the REDO routine to deal with a posting list
117 * split directly will have an explicit newitem, which is actually an
118 * orignewitem (the newitem as it was before the posting list split, not
119 * after). A posting list split always has a newitem that comes immediately
120 * after the posting list being split (which would have overlapped with
121 * orignewitem prior to split). Usually REDO must deal with posting list
122 * splits with an _L variant page split record, and usually both the new
123 * posting list and the final newitem go on the left page (the new
124 * posting list will be inserted instead of the old, and the final newitem
125 * will be inserted next to that). However, _R variant split records will
126 * include an orignewitem when the split point for the page happens to have a
127 * lastleft tuple that is also the posting list being split (leaving newitem
128 * as the page split's firstright tuple). The existence of this corner case
129 * does not change the basic fact about newitem/orignewitem for the REDO
130 * routine: it is always state used for the left page alone. (This is why the
131 * record's postingoff field isn't a reliable indicator of whether or not a
132 * posting list split occurred during the page split; a non-zero value merely
133 * indicates that the REDO routine must reconstruct a new posting list tuple
134 * that is needed for the left page.)
136 * This posting list split handling is equivalent to the xl_btree_insert REDO
137 * routine's INSERT_POST handling. While the details are more complicated
138 * here, the concept and goals are exactly the same. See _bt_swap_posting()
139 * for details on posting list splits.
141 * Backup Blk 1: new right page
143 * The right page's data portion contains the right page's tuples in the form
144 * used by _bt_restore_page. This includes the new item, if it's the _R
145 * variant. The right page's tuples also include the right page's high key
146 * with either variant (moved from the left/original page during the split),
147 * unless the split happened to be of the rightmost page on its level, where
148 * there is no high key for new right page.
150 * Backup Blk 2: next block (orig page's rightlink), if any
151 * Backup Blk 3: child's left sibling, if non-leaf split
153 typedef struct xl_btree_split
155 uint32 level
; /* tree level of page being split */
156 OffsetNumber firstrightoff
; /* first origpage item on rightpage */
157 OffsetNumber newitemoff
; /* new item's offset */
158 uint16 postingoff
; /* offset inside orig posting tuple */
161 #define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16))
164 * When page is deduplicated, consecutive groups of tuples with equal keys are
165 * merged together into posting list tuples.
167 * The WAL record represents a deduplication pass for a leaf page. An array
168 * of BTDedupInterval structs follows.
170 typedef struct xl_btree_dedup
174 /* DEDUPLICATION INTERVALS FOLLOW */
177 #define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
180 * This is what we need to know about page reuse within btree. This record
181 * only exists to generate a conflict point for Hot Standby.
183 * Note that we must include a RelFileNode in the record because we don't
184 * actually register the buffer with the record.
186 typedef struct xl_btree_reuse_page
190 FullTransactionId latestRemovedFullXid
;
191 } xl_btree_reuse_page
;
193 #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
196 * xl_btree_vacuum and xl_btree_delete records describe deletion of index
197 * tuples on a leaf page. The former variant is used by VACUUM, while the
198 * latter variant is used by the ad-hoc deletions that sometimes take place
199 * when btinsert() is called.
201 * The records are very similar. The only difference is that xl_btree_delete
202 * has to include a latestRemovedXid field to generate recovery conflicts.
203 * (VACUUM operations can just rely on earlier conflicts generated during
204 * pruning of the table whose TIDs the to-be-deleted index tuples point to.
205 * There are also small differences between each REDO routine that we don't go into here.)
208 * xl_btree_vacuum and xl_btree_delete both represent deletion of any number
209 * of index tuples on a single leaf page using page offset numbers. Both also
210 * support "updates" of index tuples, which is how deletes of a subset of TIDs
211 * contained in an existing posting list tuple are implemented.
213 * Updated posting list tuples are represented using xl_btree_update metadata.
214 * The REDO routines each use the xl_btree_update entries (plus each
215 * corresponding original index tuple from the target leaf page) to generate
216 * the final updated tuple.
218 * Updates are only used when there will be some remaining TIDs left by the
219 * REDO routine. Otherwise the posting list tuple just gets deleted outright.
221 typedef struct xl_btree_vacuum
226 /* DELETED TARGET OFFSET NUMBERS FOLLOW */
227 /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
228 /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
231 #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
233 typedef struct xl_btree_delete
235 TransactionId latestRemovedXid
;
239 /* DELETED TARGET OFFSET NUMBERS FOLLOW */
240 /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
241 /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
244 #define SizeOfBtreeDelete (offsetof(xl_btree_delete, nupdated) + sizeof(uint16))
247 * The offsets that appear in xl_btree_update metadata are offsets into the
248 * posting list of the original tuple, not page offset numbers. These are
249 * 0-based. The page offset number for the original posting list tuple comes
250 * from the main xl_btree_vacuum/xl_btree_delete record.
252 typedef struct xl_btree_update
256 /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
259 #define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
262 * This is what we need to know about marking an empty subtree for deletion.
263 * The target identifies the tuple removed from the parent page (note that we
264 * remove this tuple's downlink and the *following* tuple's key). Note that
265 * the leaf page is empty, so we don't need to store its content --- it is
266 * just reinitialized during recovery using the rest of the fields.
268 * Backup Blk 0: leaf block
269 * Backup Blk 1: top parent
271 typedef struct xl_btree_mark_page_halfdead
273 OffsetNumber poffset
; /* deleted tuple id in parent page */
275 /* information needed to recreate the leaf page: */
276 BlockNumber leafblk
; /* leaf block ultimately being deleted */
277 BlockNumber leftblk
; /* leaf block's left sibling, if any */
278 BlockNumber rightblk
; /* leaf block's right sibling */
279 BlockNumber topparent
; /* topmost internal page in the subtree */
280 } xl_btree_mark_page_halfdead
;
282 #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
285 * This is what we need to know about deletion of a btree page. Note that we
286 * only leave behind a small amount of bookkeeping information in deleted
287 * pages (deleted pages must be kept around as tombstones for a while). It is
288 * convenient for the REDO routine to regenerate its target page from scratch.
289 * This is why the WAL record describes certain details that are actually directly
290 * available from the target page.
292 * Backup Blk 0: target block being deleted
293 * Backup Blk 1: target block's left sibling, if any
294 * Backup Blk 2: target block's right sibling
295 * Backup Blk 3: leaf block (if different from target)
296 * Backup Blk 4: metapage (if rightsib becomes new fast root)
298 typedef struct xl_btree_unlink_page
300 BlockNumber leftsib
; /* target block's left sibling, if any */
301 BlockNumber rightsib
; /* target block's right sibling */
302 uint32 level
; /* target block's level */
303 FullTransactionId safexid
; /* target block's BTPageSetDeleted() XID */
306 * Information needed to recreate a half-dead leaf page with correct
307 * topparent link. The fields are only used when deletion operation's
308 * target page is an internal page. REDO routine creates half-dead page
309 * from scratch to keep things simple (this is the same convenient
310 * approach used for the target page itself).
312 BlockNumber leafleftsib
;
313 BlockNumber leafrightsib
;
314 BlockNumber leaftopparent
; /* next child down in the subtree */
316 /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
317 } xl_btree_unlink_page
;
319 #define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
322 * New root log record. There are zero tuples if this is to establish an
323 * empty root, or two if it is the result of splitting an old root.
325 * Note that although this implies rewriting the metadata page, we don't need
326 * an xl_btree_metadata record --- the rootblk and level are sufficient.
328 * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
329 * Backup Blk 1: left child (if splitting an old root)
330 * Backup Blk 2: metapage
332 typedef struct xl_btree_newroot
334 BlockNumber rootblk
; /* location of new root (redundant with blk 0) */
335 uint32 level
; /* its tree level */
338 #define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
342 * prototypes for functions in nbtxlog.c
344 extern void btree_redo(XLogReaderState
*record
);
345 extern void btree_desc(StringInfo buf
, XLogReaderState
*record
);
346 extern const char *btree_identify(uint8 info
);
347 extern void btree_xlog_startup(void);
348 extern void btree_xlog_cleanup(void);
349 extern void btree_mask(char *pagedata
, BlockNumber blkno
);
351 #endif /* NBTXLOG_H */