src/include/access/nbtxlog.h

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtxlog.h
   4  *        header file for postgres btree xlog routines
   5  *
   6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  * src/include/access/nbtxlog.h
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13 #ifndef NBTXLOG_H
  14 #define NBTXLOG_H
  15
  16 #include "access/transam.h"
  17 #include "access/xlogreader.h"
  18 #include "lib/stringinfo.h"
  19 #include "storage/off.h"
  20
  21 /*
  22  * XLOG records for btree operations
  23  *
  24  * XLOG allows us to store some information in the high 4 bits of a log
  25  * record's xl_info field
  26  */
  27 #define XLOG_BTREE_INSERT_LEAF  0x00    /* add index tuple to a leaf page, without split */
  28 #define XLOG_BTREE_INSERT_UPPER 0x10    /* same, but on a non-leaf page */
  29 #define XLOG_BTREE_INSERT_META  0x20    /* non-leaf insert, plus update metapage */
  30 #define XLOG_BTREE_SPLIT_L              0x30    /* add index tuple with split, new item on left */
  31 #define XLOG_BTREE_SPLIT_R              0x40    /* as above, new item on right */
  32 #define XLOG_BTREE_INSERT_POST  0x50    /* add leaf index tuple with posting list split */
  33 #define XLOG_BTREE_DEDUP                0x60    /* deduplicate tuples for a leaf page */
  34 #define XLOG_BTREE_DELETE               0x70    /* delete leaf index tuples for a page (non-VACUUM) */
  35 #define XLOG_BTREE_UNLINK_PAGE  0x80    /* delete a half-dead page */
  36 #define XLOG_BTREE_UNLINK_PAGE_META 0x90        /* same, and update metapage */
  37 #define XLOG_BTREE_NEWROOT              0xA0    /* new root page */
  38 #define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0      /* mark a leaf as half-dead */
  39 #define XLOG_BTREE_VACUUM               0xC0    /* delete leaf index tuples on a page
  40                                                                                  * during VACUUM */
  41 #define XLOG_BTREE_REUSE_PAGE   0xD0    /* old page is about to be reused from
  42                                                                                  * FSM; no buffer is registered */
  43 #define XLOG_BTREE_META_CLEANUP 0xE0    /* update cleanup-related data in the
  44                                                                                  * metapage */
  45
  46 /*
  47  * All that we need to regenerate the meta-data page
  48  */
  49 typedef struct xl_btree_metadata   /* NOTE(review): field notes below are inferred from names; confirm against the metapage struct */
  50 {
  51         uint32          version;        /* presumably the metapage's btree version -- TODO confirm */
  52         BlockNumber root;       /* presumably block number of the root page */
  53         uint32          level;  /* presumably tree level of the root page */
  54         BlockNumber fastroot;   /* presumably block number of the "fast root" */
  55         uint32          fastlevel;      /* presumably tree level of the fast root */
  56         uint32          last_cleanup_num_delpages;      /* presumably part of the cleanup-related metapage data */
  57         bool            allequalimage;  /* NOTE(review): meaning not visible in this header */
  58 } xl_btree_metadata;
  59
  60 /*
  61  * This is what we need to know about simple (without split) insert.
  62  *
  63  * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
  64  * INSERT_POST.  Note that INSERT_META and INSERT_UPPER imply it's not a
  65  * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
  66  * page.
  67  *
  68  * Backup Blk 0: original page
  69  * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
  70  * Backup Blk 2: xl_btree_metadata, if INSERT_META
  71  *
  72  * Note: The new tuple is actually the "original" new item in the posting
  73  * list split insert case (i.e. the INSERT_POST case).  A split offset for
  74  * the posting list is logged before the original new item.  Recovery needs
  75  * both, since it must do an in-place update of the existing posting list
  76  * that was split as an extra step.  Also, recovery generates a "final"
  77  * newitem.  See _bt_swap_posting() for details on posting list splits.
  78  */
  79 typedef struct xl_btree_insert
  80 {
  81         OffsetNumber offnum;    /* presumably the page offset the new tuple is inserted at -- TODO confirm */
  82
  83         /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
  84         /* NEW TUPLE ALWAYS FOLLOWS AT THE END */
  85 } xl_btree_insert;
  86
  87 #define SizeOfBtreeInsert       (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))     /* fixed-size part only: through offnum, no trailing padding, none of the data that follows */
  88
  89 /*
  90  * On insert with split, we save all the items going into the right sibling
  91  * so that we can restore it completely from the log record.  This way takes
  92  * less xlog space than the normal approach, because if we did it standardly,
  93  * XLogInsert would almost always think the right page is new and store its
  94  * whole page image.  The left page, however, is handled in the normal
  95  * incremental-update fashion.
  96  *
  97  * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
  98  * There are two variants to indicate whether the inserted tuple went into the
  99  * left or right split page (and thus, whether the new item is stored or not).
 100  * We always log the left page high key because suffix truncation can generate
 101  * a new leaf high key using user-defined code.  This is also necessary on
 102  * internal pages, since the firstright item that the left page's high key was
 103  * based on will have been truncated to zero attributes in the right page (the
 104  * separator key is unavailable from the right page).
 105  *
 106  * Backup Blk 0: original page / new left page
 107  *
 108  * The left page's data portion contains the new item, if it's the _L variant.
 109  * _R variant split records generally do not have a newitem (_R variant leaf
 110  * page split records that must deal with a posting list split will include an
 111  * explicit newitem, though it is never used on the right page -- it is
 112  * actually an orignewitem needed to update existing posting list).  The new
 113  * high key of the left/original page appears last of all (and must always be
 114  * present).
 115  *
 116  * Page split records that need the REDO routine to deal with a posting list
 117  * split directly will have an explicit newitem, which is actually an
 118  * orignewitem (the newitem as it was before the posting list split, not
 119  * after).  A posting list split always has a newitem that comes immediately
 120  * after the posting list being split (which would have overlapped with
 121  * orignewitem prior to split).  Usually REDO must deal with posting list
 122  * splits with an _L variant page split record, and usually both the new
 123  * posting list and the final newitem go on the left page (the existing
 124  * posting list will be inserted instead of the old, and the final newitem
 125  * will be inserted next to that).  However, _R variant split records will
 126  * include an orignewitem when the split point for the page happens to have a
 127  * lastleft tuple that is also the posting list being split (leaving newitem
 128  * as the page split's firstright tuple).  The existence of this corner case
 129  * does not change the basic fact about newitem/orignewitem for the REDO
 130  * routine: it is always state used for the left page alone.  (This is why the
 131  * record's postingoff field isn't a reliable indicator of whether or not a
 132  * posting list split occurred during the page split; a non-zero value merely
 133  * indicates that the REDO routine must reconstruct a new posting list tuple
 134  * that is needed for the left page.)
 135  *
 136  * This posting list split handling is equivalent to the xl_btree_insert REDO
 137  * routine's INSERT_POST handling.  While the details are more complicated
 138  * here, the concept and goals are exactly the same.  See _bt_swap_posting()
 139  * for details on posting list splits.
 140  *
 141  * Backup Blk 1: new right page
 142  *
 143  * The right page's data portion contains the right page's tuples in the form
 144  * used by _bt_restore_page.  This includes the new item, if it's the _R
 145  * variant.  The right page's tuples also include the right page's high key
 146  * with either variant (moved from the left/original page during the split),
 147  * unless the split happened to be of the rightmost page on its level, where
 148  * there is no high key for new right page.
 149  *
 150  * Backup Blk 2: next block (orig page's rightlink), if any
 151  * Backup Blk 3: child's left sibling, if non-leaf split
 152  */
 153 typedef struct xl_btree_split
 154 {
 155         uint32          level;                  /* tree level of page being split */
 156         OffsetNumber firstrightoff; /* first origpage item on rightpage */
 157         OffsetNumber newitemoff;        /* new item's offset */
 158         uint16          postingoff;             /* offset inside orig posting tuple;
 159                                                  * non-zero means REDO must rebuild a
                                                 * posting list tuple for the left page */ } xl_btree_split;
 160
 161 #define SizeOfBtreeSplit        (offsetof(xl_btree_split, postingoff) + sizeof(uint16))        /* size through postingoff, without trailing struct padding */
 162
 163 /*
 164  * When page is deduplicated, consecutive groups of tuples with equal keys are
 165  * merged together into posting list tuples.
 166  *
 167  * The WAL record represents a deduplication pass for a leaf page.  An array
 168  * of BTDedupInterval structs follows.
 169  */
 170 typedef struct xl_btree_dedup
 171 {
 172         uint16          nintervals;     /* presumably the number of BTDedupInterval structs that follow -- TODO confirm */
 173
 174         /* DEDUPLICATION INTERVALS FOLLOW */
 175 } xl_btree_dedup;
 176
 177 #define SizeOfBtreeDedup        (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))        /* fixed-size part only: through nintervals, excluding the interval array */
 178
 179 /*
 180  * This is what we need to know about page reuse within btree.  This record
 181  * only exists to generate a conflict point for Hot Standby.
 182  *
 183  * Note that we must include a RelFileNode in the record because we don't
 184  * actually register the buffer with the record.
 185  */
 186 typedef struct xl_btree_reuse_page
 187 {
 188         RelFileNode node;       /* relation identity; carried explicitly because no buffer is registered */
 189         BlockNumber block;      /* presumably the block being reused -- TODO confirm */
 190         FullTransactionId latestRemovedFullXid; /* presumably the XID used for the Hot Standby conflict point */
 191 } xl_btree_reuse_page;
 192
 193 #define SizeOfBtreeReusePage    (sizeof(xl_btree_reuse_page))  /* whole-struct sizeof, unlike the offsetof-based SizeOf macros of the other records */
 194
 195 /*
 196  * xl_btree_vacuum and xl_btree_delete records describe deletion of index
 197  * tuples on a leaf page.  The former variant is used by VACUUM, while the
 198  * latter variant is used by the ad-hoc deletions that sometimes take place
 199  * when btinsert() is called.
 200  *
 201  * The records are very similar.  The only difference is that xl_btree_delete
 202  * has to include a latestRemovedXid field to generate recovery conflicts.
 203  * (VACUUM operations can just rely on earlier conflicts generated during
 204  * pruning of the table whose TIDs the to-be-deleted index tuples point to.
 205  * There are also small differences between each REDO routine that we don't go
 206  * into here.)
 207  *
 208  * xl_btree_vacuum and xl_btree_delete both represent deletion of any number
 209  * of index tuples on a single leaf page using page offset numbers.  Both also
 210  * support "updates" of index tuples, which is how deletes of a subset of TIDs
 211  * contained in an existing posting list tuple are implemented.
 212  *
 213  * Updated posting list tuples are represented using xl_btree_update metadata.
 214  * The REDO routines each use the xl_btree_update entries (plus each
 215  * corresponding original index tuple from the target leaf page) to generate
 216  * the final updated tuple.
 217  *
 218  * Updates are only used when there will be some remaining TIDs left by the
 219  * REDO routine.  Otherwise the posting list tuple just gets deleted outright.
 220  */
 221 typedef struct xl_btree_vacuum
 222 {
 223         uint16          ndeleted;       /* presumably count of deleted target offset numbers that follow */
 224         uint16          nupdated;       /* presumably count of updated target offset numbers (and xl_btree_update entries) */
 225
 226         /* DELETED TARGET OFFSET NUMBERS FOLLOW */
 227         /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
 228         /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
 229 } xl_btree_vacuum;
 230
 231 #define SizeOfBtreeVacuum       (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16)) /* fixed-size part only: through nupdated, excluding the arrays that follow */
 232
 233 typedef struct xl_btree_delete  /* same layout as xl_btree_vacuum, preceded by latestRemovedXid */
 234 {
 235         TransactionId latestRemovedXid; /* used to generate recovery conflicts */
 236         uint16          ndeleted;       /* presumably count of deleted target offset numbers that follow */
 237         uint16          nupdated;       /* presumably count of updated target offset numbers (and xl_btree_update entries) */
 238
 239         /* DELETED TARGET OFFSET NUMBERS FOLLOW */
 240         /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
 241         /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
 242 } xl_btree_delete;
 243
 244 #define SizeOfBtreeDelete       (offsetof(xl_btree_delete, nupdated) + sizeof(uint16)) /* fixed-size part only: through nupdated, excluding the arrays that follow */
 245
 246 /*
 247  * The offsets that appear in xl_btree_update metadata are offsets into the
 248  * original tuple's posting list, not page offset numbers.  These are
 249  * 0-based.  The page offset number for the original posting list tuple comes
 250  * from the main xl_btree_vacuum/xl_btree_delete record.
 251  */
 252 typedef struct xl_btree_update
 253 {
 254         uint16          ndeletedtids;   /* presumably count of the uint16 posting list offsets that follow */
 255
 256         /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
 257 } xl_btree_update;
 258
 259 #define SizeOfBtreeUpdate       (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))     /* fixed-size part only: through ndeletedtids, excluding the offsets that follow */
 260
 261 /*
 262  * This is what we need to know about marking an empty subtree for deletion.
 263  * poffset identifies the tuple removed from the parent page (note that we
 264  * remove this tuple's downlink and the *following* tuple's key).  Note that
 265  * the leaf page is empty, so we don't need to store its content --- it is
 266  * just reinitialized during recovery using the rest of the fields.
 267  *
 268  * Backup Blk 0: leaf block
 269  * Backup Blk 1: top parent
 270  */
 271 typedef struct xl_btree_mark_page_halfdead
 272 {
 273         OffsetNumber poffset;           /* offset of the deleted tuple in the parent page */
 274
 275         /* information needed to recreate the (empty) leaf page during recovery: */
 276         BlockNumber leafblk;            /* leaf block ultimately being deleted */
 277         BlockNumber leftblk;            /* leaf block's left sibling, if any */
 278         BlockNumber rightblk;           /* leaf block's right sibling */
 279         BlockNumber topparent;          /* topmost internal page in the subtree */
 280 } xl_btree_mark_page_halfdead;
 281
 282 #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))   /* size through topparent, without trailing struct padding */
 283
 284 /*
 285  * This is what we need to know about deletion of a btree page.  Note that we
 286  * only leave behind a small amount of bookkeeping information in deleted
 287  * pages (deleted pages must be kept around as tombstones for a while).  It is
 288  * convenient for the REDO routine to regenerate its target page from scratch.
 289  * This is why the WAL record describes certain details that are actually
 290  * directly available from the target page.
 291  *
 292  * Backup Blk 0: target block being deleted
 293  * Backup Blk 1: target block's left sibling, if any
 294  * Backup Blk 2: target block's right sibling
 295  * Backup Blk 3: leaf block (if different from target)
 296  * Backup Blk 4: metapage (if rightsib becomes new fast root)
 297  */
 298 typedef struct xl_btree_unlink_page
 299 {
 300         BlockNumber leftsib;            /* target block's left sibling, if any */
 301         BlockNumber rightsib;           /* target block's right sibling */
 302         uint32          level;                  /* target block's level */
 303         FullTransactionId safexid;      /* target block's BTPageSetDeleted() XID */
 304
 305         /*
 306          * Information needed to recreate a half-dead leaf page with the correct
 307          * topparent link.  These fields are only used when the deletion
 308          * operation's target page is an internal page.  The REDO routine creates
 309          * the half-dead page from scratch to keep things simple (this is the same
 310          * convenient approach used for the target page itself).
 311          */
 312         BlockNumber leafleftsib;        /* presumably the half-dead leaf page's left sibling -- TODO confirm */
 313         BlockNumber leafrightsib;       /* presumably the half-dead leaf page's right sibling -- TODO confirm */
 314         BlockNumber leaftopparent;      /* next child down in the subtree */
 315
 316         /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
 317 } xl_btree_unlink_page;
 318
 319 #define SizeOfBtreeUnlinkPage   (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))  /* fixed-size part only: through leaftopparent, excluding any xl_btree_metadata that follows */
 320
 321 /*
 322  * New root log record.  There are zero tuples if this is to establish an
 323  * empty root, or two if it is the result of splitting an old root.
 324  *
 325  * Note that although this implies rewriting the metadata page, we don't need
 326  * an xl_btree_metadata record --- the rootblk and level are sufficient.
 327  *
 328  * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
 329  * Backup Blk 1: left child (if splitting an old root)
 330  * Backup Blk 2: metapage
 331  */
 332 typedef struct xl_btree_newroot
 333 {
 334         BlockNumber rootblk;            /* location of new root (redundant with blk 0) */
 335         uint32          level;                  /* the new root's tree level */
 336 } xl_btree_newroot;
 337
 338 #define SizeOfBtreeNewroot      (offsetof(xl_btree_newroot, level) + sizeof(uint32))   /* size through level, without trailing struct padding */
 339
 340
 341 /*
 342  * prototypes for functions in nbtxlog.c (NOTE(review): confirm that all of them, in particular the desc/identify routines, are defined there)
 343  */
 344 extern void btree_redo(XLogReaderState *record);       /* presumably replays one btree WAL record -- TODO confirm */
 345 extern void btree_desc(StringInfo buf, XLogReaderState *record);       /* presumably appends a description of the record to buf */
 346 extern const char *btree_identify(uint8 info); /* presumably maps an xl_info opcode to a name */
 347 extern void btree_xlog_startup(void);  /* NOTE(review): purpose not visible in this header */
 348 extern void btree_xlog_cleanup(void);  /* NOTE(review): purpose not visible in this header */
 349 extern void btree_mask(char *pagedata, BlockNumber blkno);     /* presumably masks page contents before comparison -- TODO confirm */
 350
 351 #endif                                                  /* NBTXLOG_H */