src/include/access/nbtree.h

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtree.h
   4  *        header file for postgres btree access method implementation.
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/include/access/nbtree.h
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #ifndef NBTREE_H
  15 #define NBTREE_H
  16
  17 #include "access/amapi.h"
  18 #include "access/itup.h"
  19 #include "access/sdir.h"
  20 #include "access/tableam.h"
  21 #include "access/xlogreader.h"
  22 #include "catalog/pg_am_d.h"
  23 #include "catalog/pg_index.h"
  24 #include "lib/stringinfo.h"
  25 #include "storage/bufmgr.h"
  26 #include "storage/shm_toc.h"
  27
  28 /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
  29 typedef uint16 BTCycleId;
  30
  31 /*
  32  *      BTPageOpaqueData -- At the end of every page, we store a pointer
  33  *      to both siblings in the tree.  This is used to do forward/backward
  34  *      index scans.  The next-page link is also critical for recovery when
  35  *      a search has navigated to the wrong page due to concurrent page splits
  36  *      or deletions; see src/backend/access/nbtree/README for more info.
  37  *
  38  *      In addition, we store the page's btree level (counting upwards from
  39  *      zero at a leaf page) as well as some flag bits indicating the page type
  40  *      and status.  If the page is deleted, a BTDeletedPageData struct is stored
  41  *      in the page's tuple area, while a standard BTPageOpaqueData struct is
  42  *      stored in the page special area.
  43  *
  44  *      We also store a "vacuum cycle ID".  When a page is split while VACUUM is
  45  *      processing the index, a nonzero value associated with the VACUUM run is
  46  *      stored into both halves of the split page.  (If VACUUM is not running,
   47  *      both pages receive zero cycleids.)  This allows VACUUM to detect whether
  48  *      a page was split since it started, with a small probability of false match
  49  *      if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
  50  *      ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
  51  *      (original) page, and set in the right page, but only if the next page
  52  *      to its right has a different cycleid.
  53  *
  54  *      NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
  55  *      instead.
  56  *
  57  *      NOTE: the btpo_level field used to be a union type in order to allow
  58  *      deleted pages to store a 32-bit safexid in the same field.  We now store
  59  *      64-bit/full safexid values using BTDeletedPageData instead.
  60  */
  61
  62 typedef struct BTPageOpaqueData
  63 {
  64         BlockNumber btpo_prev;          /* left sibling, or P_NONE if leftmost */
  65         BlockNumber btpo_next;          /* right sibling, or P_NONE if rightmost */
  66         uint32          btpo_level;             /* tree level --- zero for leaf pages */
  67         uint16          btpo_flags;             /* flag bits, see below */
  68         BTCycleId       btpo_cycleid;   /* vacuum cycle ID of latest split */
  69 } BTPageOpaqueData;
  70
  71 typedef BTPageOpaqueData *BTPageOpaque;
  72
  73 /* Bits defined in btpo_flags */
  74 #define BTP_LEAF                (1 << 0)        /* leaf page, i.e. not internal page */
  75 #define BTP_ROOT                (1 << 1)        /* root page (has no parent) */
  76 #define BTP_DELETED             (1 << 2)        /* page has been deleted from tree */
  77 #define BTP_META                (1 << 3)        /* meta-page */
  78 #define BTP_HALF_DEAD   (1 << 4)        /* empty, but still in tree */
  79 #define BTP_SPLIT_END   (1 << 5)        /* rightmost page of split group */
  80 #define BTP_HAS_GARBAGE (1 << 6)        /* page has LP_DEAD tuples (deprecated) */
  81 #define BTP_INCOMPLETE_SPLIT (1 << 7)   /* right sibling's downlink is missing */
  82 #define BTP_HAS_FULLXID (1 << 8)        /* contains BTDeletedPageData */
  83
  84 /*
  85  * The max allowed value of a cycle ID is a bit less than 64K.  This is
  86  * for convenience of pg_filedump and similar utilities: we want to use
  87  * the last 2 bytes of special space as an index type indicator, and
  88  * restricting cycle ID lets btree use that space for vacuum cycle IDs
  89  * while still allowing index type to be identified.
  90  */
  91 #define MAX_BT_CYCLE_ID         0xFF7F
  92
  93
  94 /*
  95  * The Meta page is always the first page in the btree index.
  96  * Its primary purpose is to point to the location of the btree root page.
  97  * We also point to the "fast" root, which is the current effective root;
  98  * see README for discussion.
  99  */
 100
 101 typedef struct BTMetaPageData
 102 {
 103         uint32          btm_magic;              /* should contain BTREE_MAGIC */
 104         uint32          btm_version;    /* nbtree version (always <= BTREE_VERSION) */
 105         BlockNumber btm_root;           /* current root location */
 106         uint32          btm_level;              /* tree level of the root page */
 107         BlockNumber btm_fastroot;       /* current "fast" root location */
 108         uint32          btm_fastlevel;  /* tree level of the "fast" root page */
 109         /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
 110
 111         /* number of deleted, non-recyclable pages during last cleanup */
 112         uint32          btm_last_cleanup_num_delpages;
 113         /* number of heap tuples during last cleanup (deprecated) */
 114         float8          btm_last_cleanup_num_heap_tuples;
 115
 116         bool            btm_allequalimage;      /* are all columns "equalimage"? */
 117 } BTMetaPageData;
 118
 119 #define BTPageGetMeta(p) \
 120         ((BTMetaPageData *) PageGetContents(p))
 121
 122 /*
 123  * The current Btree version is 4.  That's what you'll get when you create
 124  * a new index.
 125  *
 126  * Btree version 3 was used in PostgreSQL v11.  It is mostly the same as
 127  * version 4, but heap TIDs were not part of the keyspace.  Index tuples
 128  * with duplicate keys could be stored in any order.  We continue to
 129  * support reading and writing Btree versions 2 and 3, so that they don't
 130  * need to be immediately re-indexed at pg_upgrade.  In order to get the
 131  * new heapkeyspace semantics, however, a REINDEX is needed.
 132  *
 133  * Deduplication is safe to use when the btm_allequalimage field is set to
 134  * true.  It's safe to read the btm_allequalimage field on version 3, but
 135  * only version 4 indexes make use of deduplication.  Even version 4
 136  * indexes created on PostgreSQL v12 will need a REINDEX to make use of
 137  * deduplication, though, since there is no other way to set
 138  * btm_allequalimage to true (pg_upgrade hasn't been taught to set the
 139  * metapage field).
 140  *
 141  * Btree version 2 is mostly the same as version 3.  There are two new
 142  * fields in the metapage that were introduced in version 3.  A version 2
 143  * metapage will be automatically upgraded to version 3 on the first
 144  * insert to it.  INCLUDE indexes cannot use version 2.
 145  */
 146 #define BTREE_METAPAGE  0               /* first page is meta */
 147 #define BTREE_MAGIC             0x053162        /* magic number in metapage */
 148 #define BTREE_VERSION   4               /* current version number */
 149 #define BTREE_MIN_VERSION       2       /* minimum supported version */
 150 #define BTREE_NOVAC_VERSION     3       /* version with all meta fields set */
 151
 152 /*
 153  * Maximum size of a btree index entry, including its tuple header.
 154  *
 155  * We actually need to be able to fit three items on every page,
 156  * so restrict any one item to 1/3 the per-page available space.
 157  *
 158  * There are rare cases where _bt_truncate() will need to enlarge
 159  * a heap index tuple to make space for a tiebreaker heap TID
 160  * attribute, which we account for here.
 161  */
 162 #define BTMaxItemSize(page) \
 163         MAXALIGN_DOWN((PageGetPageSize(page) - \
 164                                    MAXALIGN(SizeOfPageHeaderData + \
 165                                                         3*sizeof(ItemIdData)  + \
 166                                                         3*sizeof(ItemPointerData)) - \
 167                                    MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
 168 #define BTMaxItemSizeNoHeapTid(page) \
 169         MAXALIGN_DOWN((PageGetPageSize(page) - \
 170                                    MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
 171                                    MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
 172
 173 /*
 174  * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples
 175  * that may be stored on a btree leaf page.  It is used to size the
 176  * per-page temporary buffers.
 177  *
 178  * Note: we don't bother considering per-tuple overheads here to keep
 179  * things simple (value is based on how many elements a single array of
 180  * heap TIDs must have to fill the space between the page header and
 181  * special area).  The value is slightly higher (i.e. more conservative)
 182  * than necessary as a result, which is considered acceptable.
 183  */
 184 #define MaxTIDsPerBTreePage \
 185         (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
 186                    sizeof(ItemPointerData))
 187
 188 /*
 189  * The leaf-page fillfactor defaults to 90% but is user-adjustable.
 190  * For pages above the leaf level, we use a fixed 70% fillfactor.
 191  * The fillfactor is applied during index build and when splitting
 192  * a rightmost page; when splitting non-rightmost pages we try to
 193  * divide the data equally.  When splitting a page that's entirely
 194  * filled with a single value (duplicates), the effective leaf-page
 195  * fillfactor is 96%, regardless of whether the page is a rightmost
 196  * page.
 197  */
 198 #define BTREE_MIN_FILLFACTOR            10
 199 #define BTREE_DEFAULT_FILLFACTOR        90
 200 #define BTREE_NONLEAF_FILLFACTOR        70
 201 #define BTREE_SINGLEVAL_FILLFACTOR      96
 202
 203 /*
 204  *      In general, the btree code tries to localize its knowledge about
 205  *      page layout to a couple of routines.  However, we need a special
 206  *      value to indicate "no page number" in those places where we expect
 207  *      page numbers.  We can use zero for this because we never need to
 208  *      make a pointer to the metadata page.
 209  */
 210
 211 #define P_NONE                  0
 212
 213 /*
 214  * Macros to test whether a page is leftmost or rightmost on its tree level,
 215  * as well as other state info kept in the opaque data.
 216  */
 217 #define P_LEFTMOST(opaque)              ((opaque)->btpo_prev == P_NONE)
 218 #define P_RIGHTMOST(opaque)             ((opaque)->btpo_next == P_NONE)
 219 #define P_ISLEAF(opaque)                (((opaque)->btpo_flags & BTP_LEAF) != 0)
 220 #define P_ISROOT(opaque)                (((opaque)->btpo_flags & BTP_ROOT) != 0)
 221 #define P_ISDELETED(opaque)             (((opaque)->btpo_flags & BTP_DELETED) != 0)
 222 #define P_ISMETA(opaque)                (((opaque)->btpo_flags & BTP_META) != 0)
 223 #define P_ISHALFDEAD(opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 224 #define P_IGNORE(opaque)                (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 225 #define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 226 #define P_INCOMPLETE_SPLIT(opaque)      (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 227 #define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
 228
 229 /*
 230  * BTDeletedPageData is the page contents of a deleted page
 231  */
 232 typedef struct BTDeletedPageData
 233 {
 234         FullTransactionId safexid;      /* See BTPageIsRecyclable() */
 235 } BTDeletedPageData;
 236
 237 static inline void
 238 BTPageSetDeleted(Page page, FullTransactionId safexid)
 239 {
 240         BTPageOpaque opaque;
 241         PageHeader      header;
 242         BTDeletedPageData *contents;
 243
 244         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 245         header = ((PageHeader) page);
 246
 247         opaque->btpo_flags &= ~BTP_HALF_DEAD;
 248         opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
 249         header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
 250                 sizeof(BTDeletedPageData);
 251         header->pd_upper = header->pd_special;
 252
 253         /* Set safexid in deleted page */
 254         contents = ((BTDeletedPageData *) PageGetContents(page));
 255         contents->safexid = safexid;
 256 }
 257
 258 static inline FullTransactionId
 259 BTPageGetDeleteXid(Page page)
 260 {
 261         BTPageOpaque opaque;
 262         BTDeletedPageData *contents;
 263
 264         /* We only expect to be called with a deleted page */
 265         Assert(!PageIsNew(page));
 266         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 267         Assert(P_ISDELETED(opaque));
 268
 269         /* pg_upgrade'd deleted page -- must be safe to delete now */
 270         if (!P_HAS_FULLXID(opaque))
 271                 return FirstNormalFullTransactionId;
 272
 273         /* Get safexid from deleted page */
 274         contents = ((BTDeletedPageData *) PageGetContents(page));
 275         return contents->safexid;
 276 }
 277
 278 /*
 279  * Is an existing page recyclable?
 280  *
 281  * This exists to centralize the policy on which deleted pages are now safe to
 282  * re-use.  However, _bt_pendingfsm_finalize() duplicates some of the same
 283  * logic because it doesn't work directly with pages -- keep the two in sync.
 284  *
 285  * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
 286  * them here (caller is responsible for that case themselves).  Caller might
 287  * well need special handling for new pages anyway.
 288  */
 289 static inline bool
 290 BTPageIsRecyclable(Page page)
 291 {
 292         BTPageOpaque opaque;
 293
 294         Assert(!PageIsNew(page));
 295
 296         /* Recycling okay iff page is deleted and safexid is old enough */
 297         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 298         if (P_ISDELETED(opaque))
 299         {
 300                 /*
 301                  * The page was deleted, but when? If it was just deleted, a scan
 302                  * might have seen the downlink to it, and will read the page later.
 303                  * As long as that can happen, we must keep the deleted page around as
 304                  * a tombstone.
 305                  *
 306                  * For that check if the deletion XID could still be visible to
 307                  * anyone. If not, then no scan that's still in progress could have
 308                  * seen its downlink, and we can recycle it.
 309                  *
 310                  * XXX: If we had the heap relation we could be more aggressive about
 311                  * recycling deleted pages in non-catalog relations.  For now we just
 312                  * pass NULL.  That is at least simple and consistent.
 313                  */
 314                 return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
 315         }
 316
 317         return false;
 318 }
 319
 320 /*
 321  * BTVacState and BTPendingFSM are private nbtree.c state used during VACUUM.
 322  * They are exported for use by page deletion related code in nbtpage.c.
 323  */
 324 typedef struct BTPendingFSM
 325 {
 326         BlockNumber target;                     /* Page deleted by current VACUUM */
 327         FullTransactionId safexid;      /* Page's BTDeletedPageData.safexid */
 328 } BTPendingFSM;
 329
 330 typedef struct BTVacState
 331 {
 332         IndexVacuumInfo *info;
 333         IndexBulkDeleteResult *stats;
 334         IndexBulkDeleteCallback callback;
 335         void       *callback_state;
 336         BTCycleId       cycleid;
 337         MemoryContext pagedelcontext;
 338
 339         /*
 340          * _bt_pendingfsm_finalize() state
 341          */
 342         int                     bufsize;                /* pendingpages space (in # elements) */
 343         int                     maxbufsize;             /* max bufsize that respects work_mem */
 344         BTPendingFSM *pendingpages; /* One entry per newly deleted page */
 345         int                     npendingpages;  /* current # valid pendingpages */
 346 } BTVacState;
 347
 348 /*
 349  *      Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
 350  *      page.  The high key is not a tuple that is used to visit the heap.  It is
 351  *      a pivot tuple (see "Notes on B-Tree tuple format" below for definition).
 352  *      The high key on a page is required to be greater than or equal to any
 353  *      other key that appears on the page.  If we find ourselves trying to
 354  *      insert a key that is strictly > high key, we know we need to move right
 355  *      (this should only happen if the page was split since we examined the
 356  *      parent page).
 357  *
 358  *      Our insertion algorithm guarantees that we can use the initial least key
 359  *      on our right sibling as the high key.  Once a page is created, its high
 360  *      key changes only if the page is split.
 361  *
 362  *      On a non-rightmost page, the high key lives in item 1 and data items
 363  *      start in item 2.  Rightmost pages have no high key, so we store data
 364  *      items beginning in item 1.
 365  */
 366
 367 #define P_HIKEY                         ((OffsetNumber) 1)
 368 #define P_FIRSTKEY                      ((OffsetNumber) 2)
 369 #define P_FIRSTDATAKEY(opaque)  (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 370
 371 /*
 372  * Notes on B-Tree tuple format, and key and non-key attributes:
 373  *
 374  * INCLUDE B-Tree indexes have non-key attributes.  These are extra
 375  * attributes that may be returned by index-only scans, but do not influence
 376  * the order of items in the index (formally, non-key attributes are not
 377  * considered to be part of the key space).  Non-key attributes are only
 378  * present in leaf index tuples whose item pointers actually point to heap
 379  * tuples (non-pivot tuples).  _bt_check_natts() enforces the rules
 380  * described here.
 381  *
 382  * Non-pivot tuple format (plain/non-posting variant):
 383  *
 384  *  t_tid | t_info | key values | INCLUDE columns, if any
 385  *
 386  * t_tid points to the heap TID, which is a tiebreaker key column as of
 387  * BTREE_VERSION 4.
 388  *
 389  * Non-pivot tuples complement pivot tuples, which only have key columns.
 390  * The sole purpose of pivot tuples is to represent how the key space is
 391  * separated.  In general, any B-Tree index that has more than one level
 392  * (i.e. any index that does not just consist of a metapage and a single
 393  * leaf root page) must have some number of pivot tuples, since pivot
 394  * tuples are used for traversing the tree.  Suffix truncation can omit
 395  * trailing key columns when a new pivot is formed, which makes minus
 396  * infinity their logical value.  Since BTREE_VERSION 4 indexes treat heap
 397  * TID as a trailing key column that ensures that all index tuples are
 398  * physically unique, it is necessary to represent heap TID as a trailing
 399  * key column in pivot tuples, though very often this can be truncated
 400  * away, just like any other key column. (Actually, the heap TID is
 401  * omitted rather than truncated, since its representation is different to
 402  * the non-pivot representation.)
 403  *
 404  * Pivot tuple format:
 405  *
 406  *  t_tid | t_info | key values | [heap TID]
 407  *
 408  * We store the number of columns present inside pivot tuples by abusing
 409  * their t_tid offset field, since pivot tuples never need to store a real
 410  * offset (pivot tuples generally store a downlink in t_tid, though).  The
 411  * offset field only stores the number of columns/attributes when the
 412  * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
 413  * TID column sometimes stored in pivot tuples -- that's represented by
 414  * the presence of BT_PIVOT_HEAP_TID_ATTR.  The INDEX_ALT_TID_MASK bit in
 415  * t_info is always set on BTREE_VERSION 4 pivot tuples, since
 416  * BTreeTupleIsPivot() must work reliably on heapkeyspace versions.
 417  *
 418  * In version 2 or version 3 (!heapkeyspace) indexes, INDEX_ALT_TID_MASK
 419  * might not be set in pivot tuples.  BTreeTupleIsPivot() won't work
 420  * reliably as a result.  The number of columns stored is implicitly the
 421  * same as the number of columns in the index, just like any non-pivot
 422  * tuple. (The number of columns stored should not vary, since suffix
 423  * truncation of key columns is unsafe within any !heapkeyspace index.)
 424  *
 425  * The 12 least significant bits from t_tid's offset number are used to
 426  * represent the number of key columns within a pivot tuple.  This leaves 4
 427  * status bits (BT_STATUS_OFFSET_MASK bits), which are shared by all tuples
 428  * that have the INDEX_ALT_TID_MASK bit set (set in t_info) to store basic
 429  * tuple metadata.  BTreeTupleIsPivot() and BTreeTupleIsPosting() use the
 430  * BT_STATUS_OFFSET_MASK bits.
 431  *
 432  * Sometimes non-pivot tuples also use a representation that repurposes
 433  * t_tid to store metadata rather than a TID.  PostgreSQL v13 introduced a
 434  * new non-pivot tuple format to support deduplication: posting list
 435  * tuples.  Deduplication merges together multiple equal non-pivot tuples
 436  * into a logically equivalent, space efficient representation.  A posting
 437  * list is an array of ItemPointerData elements.  Non-pivot tuples are
 438  * merged together to form posting list tuples lazily, at the point where
 439  * we'd otherwise have to split a leaf page.
 440  *
 441  * Posting tuple format (alternative non-pivot tuple representation):
 442  *
 443  *  t_tid | t_info | key values | posting list (TID array)
 444  *
 445  * Posting list tuples are recognized as such by having the
 446  * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status
 447  * bit set in t_tid's offset number.  These flags redefine the content of
 448  * the posting tuple's t_tid to store the location of the posting list
 449  * (instead of a block number), as well as the total number of heap TIDs
 450  * present in the tuple (instead of a real offset number).
 451  *
 452  * The 12 least significant bits from t_tid's offset number are used to
 453  * represent the number of heap TIDs present in the tuple, leaving 4 status
 454  * bits (the BT_STATUS_OFFSET_MASK bits).  Like any non-pivot tuple, the
 455  * number of columns stored is always implicitly the total number in the
 456  * index (in practice there can never be non-key columns stored, since
 457  * deduplication is not supported with INCLUDE indexes).
 458  */
 459 #define INDEX_ALT_TID_MASK                      INDEX_AM_RESERVED_BIT
 460
 461 /* Item pointer offset bit masks */
 462 #define BT_OFFSET_MASK                          0x0FFF
 463 #define BT_STATUS_OFFSET_MASK           0xF000
 464 /* BT_STATUS_OFFSET_MASK status bits */
 465 #define BT_PIVOT_HEAP_TID_ATTR          0x1000
 466 #define BT_IS_POSTING                           0x2000
 467
 468 /*
 469  * Note: BTreeTupleIsPivot() can have false negatives (but not false
 470  * positives) when used with !heapkeyspace indexes
 471  */
 472 static inline bool
 473 BTreeTupleIsPivot(IndexTuple itup)
 474 {
 475         if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
 476                 return false;
 477         /* absence of BT_IS_POSTING in offset number indicates pivot tuple */
 478         if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0)
 479                 return false;
 480
 481         return true;
 482 }
 483
 484 static inline bool
 485 BTreeTupleIsPosting(IndexTuple itup)
 486 {
 487         if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
 488                 return false;
 489         /* presence of BT_IS_POSTING in offset number indicates posting tuple */
 490         if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0)
 491                 return false;
 492
 493         return true;
 494 }
 495
 496 static inline void
 497 BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
 498 {
 499         Assert(nhtids > 1);
 500         Assert((nhtids & BT_STATUS_OFFSET_MASK) == 0);
 501         Assert((size_t) postingoffset == MAXALIGN(postingoffset));
 502         Assert(postingoffset < INDEX_SIZE_MASK);
 503         Assert(!BTreeTupleIsPivot(itup));
 504
 505         itup->t_info |= INDEX_ALT_TID_MASK;
 506         ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
 507         ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
 508 }
 509
 510 static inline uint16
 511 BTreeTupleGetNPosting(IndexTuple posting)
 512 {
 513         OffsetNumber existing;
 514
 515         Assert(BTreeTupleIsPosting(posting));
 516
 517         existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid);
 518         return (existing & BT_OFFSET_MASK);
 519 }
 520
 521 static inline uint32
 522 BTreeTupleGetPostingOffset(IndexTuple posting)
 523 {
 524         Assert(BTreeTupleIsPosting(posting));
 525
 526         return ItemPointerGetBlockNumberNoCheck(&posting->t_tid);
 527 }
 528
 529 static inline ItemPointer
 530 BTreeTupleGetPosting(IndexTuple posting)
 531 {
 532         return (ItemPointer) ((char *) posting +
 533                                                   BTreeTupleGetPostingOffset(posting));
 534 }
 535
 536 static inline ItemPointer
 537 BTreeTupleGetPostingN(IndexTuple posting, int n)
 538 {
 539         return BTreeTupleGetPosting(posting) + n;
 540 }
 541
 542 /*
 543  * Get/set downlink block number in pivot tuple.
 544  *
 545  * Note: Cannot assert that tuple is a pivot tuple.  If we did so then
 546  * !heapkeyspace indexes would exhibit false positive assertion failures.
 547  */
 548 static inline BlockNumber
 549 BTreeTupleGetDownLink(IndexTuple pivot)
 550 {
 551         return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid);
 552 }
 553
 554 static inline void
 555 BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
 556 {
 557         ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
 558 }
 559
 560 /*
 561  * Get number of attributes within tuple.
 562  *
 563  * Note that this does not include an implicit tiebreaker heap TID
 564  * attribute, if any.  Note also that the number of key attributes must be
 565  * explicitly represented in all heapkeyspace pivot tuples.
 566  *
 567  * Note: This is defined as a macro rather than an inline function to
 568  * avoid including rel.h.
 569  */
 570 #define BTreeTupleGetNAtts(itup, rel)   \
 571         ( \
 572                 (BTreeTupleIsPivot(itup)) ? \
 573                 ( \
 574                         ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
 575                 ) \
 576                 : \
 577                 IndexRelationGetNumberOfAttributes(rel) \
 578         )
 579
 580 /*
 581  * Set number of key attributes in tuple.
 582  *
 583  * The heap TID tiebreaker attribute bit may also be set here, indicating that
 584  * a heap TID value will be stored at the end of the tuple (i.e. using the
 585  * special pivot tuple representation).
 586  */
 587 static inline void
 588 BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
 589 {
 590         Assert(nkeyatts <= INDEX_MAX_KEYS);
 591         Assert((nkeyatts & BT_STATUS_OFFSET_MASK) == 0);
 592         Assert(!heaptid || nkeyatts > 0);
 593         Assert(!BTreeTupleIsPivot(itup) || nkeyatts == 0);
 594
 595         itup->t_info |= INDEX_ALT_TID_MASK;
 596
 597         if (heaptid)
 598                 nkeyatts |= BT_PIVOT_HEAP_TID_ATTR;
 599
 600         /* BT_IS_POSTING bit is deliberately unset here */
 601         ItemPointerSetOffsetNumber(&itup->t_tid, nkeyatts);
 602         Assert(BTreeTupleIsPivot(itup));
 603 }
 604
 605 /*
 606  * Get/set leaf page's "top parent" link from its high key.  Used during page
 607  * deletion.
 608  *
 609  * Note: Cannot assert that tuple is a pivot tuple.  If we did so then
 610  * !heapkeyspace indexes would exhibit false positive assertion failures.
 611  */
 612 static inline BlockNumber
 613 BTreeTupleGetTopParent(IndexTuple leafhikey)
 614 {
 615         return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
 616 }
 617
 618 static inline void
 619 BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
 620 {
 621         ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno);
 622         BTreeTupleSetNAtts(leafhikey, 0, false);
 623 }
 624
 625 /*
 626  * Get tiebreaker heap TID attribute, if any.
 627  *
 628  * This returns the first/lowest heap TID in the case of a posting list tuple.
 629  */
 630 static inline ItemPointer
 631 BTreeTupleGetHeapTID(IndexTuple itup)
 632 {
 633         if (BTreeTupleIsPivot(itup))
 634         {
 635                 /* Pivot tuple heap TID representation? */
 636                 if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
 637                          BT_PIVOT_HEAP_TID_ATTR) != 0)
 638                         return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
 639                                                                   sizeof(ItemPointerData));
 640
 641                 /* Heap TID attribute was truncated */
 642                 return NULL;
 643         }
 644         else if (BTreeTupleIsPosting(itup))
 645                 return BTreeTupleGetPosting(itup);
 646
 647         return &itup->t_tid;
 648 }
 649
 650 /*
 651  * Get maximum heap TID attribute, which could be the only TID in the case of
 652  * a non-pivot tuple that does not have a posting list tuple.
 653  *
 654  * Works with non-pivot tuples only.
 655  */
 656 static inline ItemPointer
 657 BTreeTupleGetMaxHeapTID(IndexTuple itup)
 658 {
 659         Assert(!BTreeTupleIsPivot(itup));
 660
 661         if (BTreeTupleIsPosting(itup))
 662         {
 663                 uint16          nposting = BTreeTupleGetNPosting(itup);
 664
 665                 return BTreeTupleGetPostingN(itup, nposting - 1);
 666         }
 667
 668         return &itup->t_tid;
 669 }
 670
 671 /*
 672  *      Operator strategy numbers for B-tree have been moved to access/stratnum.h,
 673  *      because many places need to use them in ScanKeyInit() calls.
 674  *
 675  *      The strategy numbers are chosen so that we can commute them by
 676  *      subtraction, thus:
 677  */
 678 #define BTCommuteStrategyNumber(strat)  (BTMaxStrategyNumber + 1 - (strat))
 679
 680 /*
 681  *      When a new operator class is declared, we require that the user
 682  *      supply us with an amproc procedure (BTORDER_PROC) for determining
 683  *      whether, for two keys a and b, a < b, a = b, or a > b.  This routine
 684  *      must return < 0, 0, > 0, respectively, in these three cases.
 685  *
 686  *      To facilitate accelerated sorting, an operator class may choose to
 687  *      offer a second procedure (BTSORTSUPPORT_PROC).  For full details, see
 688  *      src/include/utils/sortsupport.h.
 689  *
 690  *      To support window frames defined by "RANGE offset PRECEDING/FOLLOWING",
 691  *      an operator class may choose to offer a third amproc procedure
 692  *      (BTINRANGE_PROC), independently of whether it offers sortsupport.
 693  *      For full details, see doc/src/sgml/btree.sgml.
 694  *
 695  *      To facilitate B-Tree deduplication, an operator class may choose to
 696  *      offer a fourth amproc procedure (BTEQUALIMAGE_PROC).  For full details,
 697  *      see doc/src/sgml/btree.sgml.
 698  */
 699
 700 #define BTORDER_PROC            1       /* required: three-way comparison */
 701 #define BTSORTSUPPORT_PROC      2       /* optional: accelerated sorting */
 702 #define BTINRANGE_PROC          3       /* optional: RANGE offset window frames */
 703 #define BTEQUALIMAGE_PROC       4       /* optional: B-Tree deduplication */
 704 #define BTOPTIONS_PROC          5       /* NOTE(review): purpose not stated in nearby comments -- confirm */
 705 #define BTNProcs                        5       /* equals the highest proc number above */
 706
 707 /*
 708  *      We need to be able to tell the difference between read and write
 709  *      requests for pages, in order to do locking correctly.
 710  */
 711
 712 #define BT_READ                 BUFFER_LOCK_SHARE       /* read request: share lock */
 713 #define BT_WRITE                BUFFER_LOCK_EXCLUSIVE   /* write request: exclusive lock */
 714
 715 /*
 716  * BTStackData -- As we descend a tree, we push the location of pivot
 717  * tuples whose downlink we are about to follow onto a private stack.  If
 718  * we split a leaf, we use this stack to walk back up the tree and insert
 719  * data into its parent page at the correct location.  We also have to
 720  * recursively insert into the grandparent page if and when the parent page
 721  * splits.  Our private stack can become stale due to concurrent page
 722  * splits and page deletions, but it should never give us an irredeemably
 723  * bad picture.
 724  */
 725 typedef struct BTStackData
 726 {
 727         BlockNumber bts_blkno;          /* block holding the pivot tuple we followed */
 728         OffsetNumber bts_offset;        /* that pivot tuple's offset (may be stale) */
 729         struct BTStackData *bts_parent; /* entry one level up -- NOTE(review): presumably NULL at top; confirm */
 730 } BTStackData;
 731
 732 typedef BTStackData *BTStack;
 733
 734 /*
 735  * BTScanInsertData is the btree-private state needed to find an initial
 736  * position for an indexscan, or to insert new tuples -- an "insertion
 737  * scankey" (not to be confused with a search scankey).  It's used to descend
 738  * a B-Tree using _bt_search.
 739  *
 740  * heapkeyspace indicates if we expect all keys in the index to be physically
 741  * unique because heap TID is used as a tiebreaker attribute, and if index may
 742  * have truncated key attributes in pivot tuples.  This is actually a property
 743  * of the index relation itself (not an indexscan).  heapkeyspace indexes are
 744  * indexes whose version is >= version 4.  It's convenient to keep this close
 745  * by, rather than accessing the metapage repeatedly.
 746  *
 747  * allequalimage is set to indicate that deduplication is safe for the index.
 748  * This is also a property of the index relation rather than an indexscan.
 749  *
 750  * anynullkeys indicates if any of the keys had NULL value when scankey was
 751  * built from index tuple (note that already-truncated tuple key attributes
 752  * set NULL as a placeholder key value, which also affects value of
 753  * anynullkeys).  This is a convenience for unique index non-pivot tuple
 754  * insertion, which usually temporarily unsets scantid, but shouldn't iff
 755  * anynullkeys is true.  Value generally matches non-pivot tuple's HasNulls
 756  * bit, but may not when inserting into an INCLUDE index (tuple header value
 757  * is affected by the NULL-ness of both key and non-key attributes).
 758  *
 759  * When nextkey is false (the usual case), _bt_search and _bt_binsrch will
 760  * locate the first item >= scankey.  When nextkey is true, they will locate
 761  * the first item > scan key.
 762  *
 763  * pivotsearch is set to true by callers that want to re-find a leaf page
 764  * using a scankey built from a leaf page's high key.  Most callers set this
 765  * to false.
 766  *
 767  * scantid is the heap TID that is used as a final tiebreaker attribute.  It
 768  * is set to NULL when index scan doesn't need to find a position for a
 769  * specific physical tuple.  Must be set when inserting new tuples into
 770  * heapkeyspace indexes, since every tuple in the tree unambiguously belongs
 771  * in one exact position (it's never set with !heapkeyspace indexes, though).
 772  * Despite the representational difference, nbtree search code considers
 773  * scantid to be just another insertion scankey attribute.
 774  *
 775  * scankeys is an array of scan key entries for attributes that are compared
 776  * before scantid (user-visible attributes).  keysz is the size of the array.
 777  * During insertion, there must be a scan key for every attribute, but when
 778  * starting a regular index scan some can be omitted.  The array is used as a
 779  * flexible array member, though it's sized in a way that makes it possible to
 780  * use stack allocations.  See nbtree/README for full details.
 781  */
 782 typedef struct BTScanInsertData
 783 {
 784         bool            heapkeyspace;   /* heap TID used as tiebreaker key? */
 785         bool            allequalimage;  /* deduplication safe for index? */
 786         bool            anynullkeys;    /* any NULL key when scankey was built? */
 787         bool            nextkey;                /* find first item > key, rather than >= ? */
 788         bool            pivotsearch;    /* re-finding leaf page from its high key? */
 789         ItemPointer scantid;            /* tiebreaker for scankeys; NULL if unused */
 790         int                     keysz;                  /* Size of scankeys array */
 791         ScanKeyData scankeys[INDEX_MAX_KEYS];   /* Must appear last */
 792 } BTScanInsertData;
 793
 794 typedef BTScanInsertData *BTScanInsert;
 795
 796 /*
 797  * BTInsertStateData is a working area used during insertion.
 798  *
 799  * This is filled in after descending the tree to the first leaf page the new
 800  * tuple might belong on.  Tracks the current position while performing
 801  * uniqueness check, before we have determined which exact page to insert
 802  * to.
 803  *
 804  * (This should be private to nbtinsert.c, but it's also used by
 805  * _bt_binsrch_insert)
 806  */
 807 typedef struct BTInsertStateData
 808 {
 809         IndexTuple      itup;                   /* Item we're inserting */
 810         Size            itemsz;                 /* Size of itup -- should be MAXALIGN()'d */
 811         BTScanInsert itup_key;          /* Insertion scankey */
 812
 813         /* Buffer containing leaf page we're likely to insert itup on */
 814         Buffer          buf;
 815
 816         /*
 817          * Cache of bounds within the current buffer.  Only used for insertions
 818          * where _bt_check_unique is called.  See _bt_binsrch_insert and
 819          * _bt_findinsertloc for details.
 820          */
 821         bool            bounds_valid;   /* NOTE(review): presumably "low/stricthigh usable?" -- confirm */
 822         OffsetNumber low;                       /* NOTE(review): presumably inclusive lower bound -- confirm */
 823         OffsetNumber stricthigh;        /* NOTE(review): presumably exclusive upper bound -- confirm */
 824
 825         /*
 826          * If _bt_binsrch_insert found the location inside existing posting list,
 827          * save the position inside the list.  -1 sentinel value indicates overlap
 828          * with an existing posting list tuple that has its LP_DEAD bit set.
 829          */
 830         int                     postingoff;
 831 } BTInsertStateData;
 832
 833 typedef BTInsertStateData *BTInsertState;
 834
 835 /*
 836  * State used to represent an individual pending tuple during
 837  * deduplication.
 838  */
 839 typedef struct BTDedupInterval
 840 {
 841         OffsetNumber baseoff;           /* page offset of base tuple (cf. BTDedupStateData.baseoff) */
 842         uint16          nitems;                 /* existing page items merged (cf. BTDedupStateData.nitems) */
 843 } BTDedupInterval;
 844
 845 /*
 846  * BTDedupStateData is a working area used during deduplication.
 847  *
 848  * The status info fields track the state of a whole-page deduplication pass.
 849  * State about the current pending posting list is also tracked.
 850  *
 851  * A pending posting list is comprised of a contiguous group of equal items
 852  * from the page, starting from page offset number 'baseoff'.  This is the
 853  * offset number of the "base" tuple for new posting list.  'nitems' is the
 854  * current total number of existing items from the page that will be merged to
 855  * make a new posting list tuple, including the base tuple item.  (Existing
 856  * items may themselves be posting list tuples, or regular non-pivot tuples.)
 857  *
 858  * The total size of the existing tuples to be freed when pending posting list
 859  * is processed gets tracked by 'phystupsize'.  This information allows
 860  * deduplication to calculate the space saving for each new posting list
 861  * tuple, and for the entire pass over the page as a whole.
 862  */
 863 typedef struct BTDedupStateData
 864 {
 865         /* Deduplication status info for entire pass over page */
 866         bool            deduplicate;    /* Still deduplicating page? */
 867         int                     nmaxitems;              /* Number of max-sized tuples so far */
 868         Size            maxpostingsize; /* Limit on size of final tuple */
 869
 870         /* Metadata about base tuple of current pending posting list */
 871         IndexTuple      base;                   /* Use to form new posting list */
 872         OffsetNumber baseoff;           /* page offset of base */
 873         Size            basetupsize;    /* base size without original posting list */
 874
 875         /* Other metadata about pending posting list */
 876         ItemPointer htids;                      /* Heap TIDs in pending posting list */
 877         int                     nhtids;                 /* Number of heap TIDs in htids array */
 878         int                     nitems;                 /* Number of existing tuples/line pointers */
 879         Size            phystupsize;    /* Size of existing tuples to be freed; includes line pointer overhead */
 880
 881         /*
 882          * Array of tuples to go on new version of the page.  Contains one entry
 883          * for each group of consecutive items.  Note that existing tuples that
 884          * will not become posting list tuples do not appear in the array (they
 885          * are implicitly unchanged by deduplication pass).
 886          */
 887         int                     nintervals;             /* current number of intervals in array */
 888         BTDedupInterval intervals[MaxIndexTuplesPerPage];       /* nintervals entries in use */
 889 } BTDedupStateData;
 890
 891 typedef BTDedupStateData *BTDedupState;
 892
 893 /*
 894  * BTVacuumPostingData is state that represents how to VACUUM (or delete) a
 895  * posting list tuple when some (though not all) of its TIDs are to be
 896  * deleted.
 897  *
 898  * Convention is that itup field is the original posting list tuple on input,
 899  * and palloc()'d final tuple used to overwrite existing tuple on output.
 900  */
 901 typedef struct BTVacuumPostingData
 902 {
 903         /* Tuple that will be/was updated */
 904         IndexTuple      itup;
 905         OffsetNumber updatedoffset;     /* NOTE(review): presumably itup's page offset -- confirm */
 906
 907         /* State needed to describe final itup in WAL */
 908         uint16          ndeletedtids;   /* NOTE(review): presumably entry count of deletetids[] -- confirm */
 909         uint16          deletetids[FLEXIBLE_ARRAY_MEMBER];      /* variable-length; must stay last */
 910 } BTVacuumPostingData;
 911
 912 typedef BTVacuumPostingData *BTVacuumPosting;
 913
 914 /*
 915  * BTScanOpaqueData is the btree-private state needed for an indexscan.
 916  * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
 917  * details of the preprocessing), information about the current location
 918  * of the scan, and information about the marked location, if any.  (We use
 919  * BTScanPosData to represent the data needed for each of current and marked
 920  * locations.)  In addition we can remember some known-killed index entries
 921  * that must be marked before we can move off the current page.
 922  *
 923  * Index scans work a page at a time: we pin and read-lock the page, identify
 924  * all the matching items on the page and save them in BTScanPosData, then
 925  * release the read-lock while returning the items to the caller for
 926  * processing.  This approach minimizes lock/unlock traffic.  Note that we
 927  * keep the pin on the index page until the caller is done with all the items
 928  * (this is needed for VACUUM synchronization, see nbtree/README).  When we
 929  * are ready to step to the next page, if the caller has told us any of the
 930  * items were killed, we re-lock the page to mark them killed, then unlock.
 931  * Finally we drop the pin and step to the next page in the appropriate
 932  * direction.
 933  *
 934  * If we are doing an index-only scan, we save the entire IndexTuple for each
 935  * matched item, otherwise only its heap TID and offset.  The IndexTuples go
 936  * into a separate workspace array; each BTScanPosItem stores its tuple's
 937  * offset within that array.  Posting list tuples store a "base" tuple once,
 938  * allowing the same key to be returned for each TID in the posting list
 939  * tuple.
 940  */
 941
 942 typedef struct BTScanPosItem    /* what we remember about each match */
 943 {
 944         ItemPointerData heapTid;        /* TID of referenced heap item */
 945         OffsetNumber indexOffset;       /* index item's location within page */
 946         LocationIndex tupleOffset;      /* IndexTuple's offset in workspace (index-only scans) */
 947 } BTScanPosItem;
 948
 949 typedef struct BTScanPosData
 950 {
 951         Buffer          buf;                    /* if valid, the buffer is pinned */
 952
 953         XLogRecPtr      lsn;                    /* pos in the WAL stream when page was read */
 954         BlockNumber currPage;           /* page referenced by items array */
 955         BlockNumber nextPage;           /* page's right link when we scanned it */
 956
 957         /*
 958          * moreLeft and moreRight track whether we think there may be matching
 959          * index entries to the left and right of the current page, respectively.
 960          * We can clear the appropriate one of these flags when _bt_checkkeys()
 961          * returns continuescan = false.
 962          */
 963         bool            moreLeft;
 964         bool            moreRight;
 965
 966         /*
 967          * If we are doing an index-only scan, nextTupleOffset is the first free
 968          * location in the associated tuple storage workspace.
 969          */
 970         int                     nextTupleOffset;
 971
 972         /*
 973          * The items array is always ordered in index order (ie, increasing
 974          * indexOffset).  When scanning backwards it is convenient to fill the
 975          * array back-to-front, so we start at the last slot and fill downwards.
 976          * Hence we need both a first-valid-entry and a last-valid-entry counter.
 977          * itemIndex is a cursor showing which entry was last returned to caller.
 978          */
 979         int                     firstItem;              /* first valid index in items[] */
 980         int                     lastItem;               /* last valid index in items[] */
 981         int                     itemIndex;              /* current index in items[] */
 982
 983         BTScanPosItem items[MaxTIDsPerBTreePage];       /* MUST BE LAST */
 984 } BTScanPosData;
 985
 986 typedef BTScanPosData *BTScanPos;
 987
 988 #define BTScanPosIsPinned(scanpos) \
 989 ( \
 990         AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
 991                                 !BufferIsValid((scanpos).buf)), \
 992         BufferIsValid((scanpos).buf) \
 993 )       /* true iff scanpos.buf is a valid buffer */
 994 #define BTScanPosUnpin(scanpos) \
 995         do { \
 996                 ReleaseBuffer((scanpos).buf); \
 997                 (scanpos).buf = InvalidBuffer; \
 998         } while (0)     /* releases buf unconditionally, then marks it invalid */
 999 #define BTScanPosUnpinIfPinned(scanpos) \
1000         do { \
1001                 if (BTScanPosIsPinned(scanpos)) \
1002                         BTScanPosUnpin(scanpos); \
1003         } while (0)     /* no-op unless BTScanPosIsPinned() */
1004
1005 #define BTScanPosIsValid(scanpos) \
1006 ( \
1007         AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
1008                                 !BufferIsValid((scanpos).buf)), \
1009         BlockNumberIsValid((scanpos).currPage) \
1010 )       /* true iff currPage is a valid block number */
1011 #define BTScanPosInvalidate(scanpos) \
1012         do { \
1013                 (scanpos).currPage = InvalidBlockNumber; \
1014                 (scanpos).nextPage = InvalidBlockNumber; \
1015                 (scanpos).buf = InvalidBuffer; \
1016                 (scanpos).lsn = InvalidXLogRecPtr; \
1017                 (scanpos).nextTupleOffset = 0; \
1018         } while (0)     /* resets fields only; does not release any buffer pin */
1019
1020 /* One per equality-type SK_SEARCHARRAY scan key (see BTScanOpaqueData.arrayKeys) */
1021 typedef struct BTArrayKeyInfo
1022 {
1023         int                     scan_key;               /* index of associated key in arrayKeyData */
1024         int                     cur_elem;               /* index of current element in elem_values */
1025         int                     mark_elem;              /* index of marked element in elem_values */
1026         int                     num_elems;              /* number of elems in current array value */
1027         Datum      *elem_values;        /* array of num_elems Datums */
1028 } BTArrayKeyInfo;
1029
1030 typedef struct BTScanOpaqueData
1031 {
1032         /* these fields are set by _bt_preprocess_keys(): */
1033         bool            qual_ok;                /* false if qual can never be satisfied */
1034         int                     numberOfKeys;   /* number of preprocessed scan keys */
1035         ScanKey         keyData;                /* array of preprocessed scan keys */
1036
1037         /* workspace for SK_SEARCHARRAY support */
1038         ScanKey         arrayKeyData;   /* modified copy of scan->keyData */
1039         int                     numArrayKeys;   /* number of equality-type array keys (-1 if
1040                                                                  * there are any unsatisfiable array keys) */
1041         int                     arrayKeyCount;  /* count indicating number of array scan keys
1042                                                                  * processed */
1043         BTArrayKeyInfo *arrayKeys;      /* info about each equality-type array key */
1044         MemoryContext arrayContext; /* scan-lifespan context for array data */
1045
1046         /* info about killed items if any (killedItems is NULL if never used) */
1047         int                *killedItems;        /* currPos.items indexes of killed items */
1048         int                     numKilled;              /* number of entries stored in killedItems */
1049
1050         /*
1051          * If we are doing an index-only scan, these are the tuple storage
1052          * workspaces for the currPos and markPos respectively.  Each is of size
1053          * BLCKSZ, so it can hold as much as a full page's worth of tuples.
1054          */
1055         char       *currTuples;         /* tuple storage for currPos */
1056         char       *markTuples;         /* tuple storage for markPos */
1057
1058         /*
1059          * If the marked position is on the same page as current position, we
1060          * don't use markPos, but just keep the marked itemIndex in markItemIndex
1061          * (all the rest of currPos is valid for the mark position). Hence, to
1062          * determine if there is a mark, first look at markItemIndex, then at
1063          * markPos.
1064          */
1065         int                     markItemIndex;  /* itemIndex, or -1 if not valid */
1066
1067         /* keep these last in struct for efficiency */
1068         BTScanPosData currPos;          /* current position data */
1069         BTScanPosData markPos;          /* marked position, if any */
1070 } BTScanOpaqueData;
1071
1072 typedef BTScanOpaqueData *BTScanOpaque;
1073
1074 /*
1075  * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
1076  * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
1077  * index's indoption[] array entry for the index attribute.
1078  */
1079 #define SK_BT_REQFWD    0x00010000      /* required to continue forward scan */
1080 #define SK_BT_REQBKWD   0x00020000      /* required to continue backward scan */
1081 #define SK_BT_INDOPTION_SHIFT  24       /* must clear the above bits */
1082 #define SK_BT_DESC                      (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)       /* indoption DESC bit, shifted into sk_flags */
1083 #define SK_BT_NULLS_FIRST       (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)        /* indoption NULLS FIRST bit, shifted likewise */
1084
1085 typedef struct BTOptions
1086 {
1087         int32           varlena_header_;        /* varlena header (do not touch directly!) */
1088         int                     fillfactor;             /* page fill factor in percent (0..100) */
1089         float8          vacuum_cleanup_index_scale_factor;      /* deprecated */
1090         bool            deduplicate_items;      /* Try to deduplicate items? */
1091 } BTOptions;
1092 /* Accessors: each asserts a btree index and uses a default when rd_options is NULL */
1093 #define BTGetFillFactor(relation) \
1094         (AssertMacro((relation)->rd_rel->relkind == RELKIND_INDEX && \
1095                                  (relation)->rd_rel->relam == BTREE_AM_OID), \
1096          (relation)->rd_options ? \
1097          ((BTOptions *) (relation)->rd_options)->fillfactor : \
1098          BTREE_DEFAULT_FILLFACTOR)
1099 #define BTGetTargetPageFreeSpace(relation) \
1100         (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
1101 #define BTGetDeduplicateItems(relation) \
1102         (AssertMacro((relation)->rd_rel->relkind == RELKIND_INDEX && \
1103                                  (relation)->rd_rel->relam == BTREE_AM_OID), \
1104         ((relation)->rd_options ? \
1105          ((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
1106
1107 /*
1108  * Constant definitions for progress reporting.  Phase numbers must match
1109  * those handled by btbuildphasename().
1110  */
1111 /* PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE is 1 (see progress.h) */
1112 #define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN               2
1113 #define PROGRESS_BTREE_PHASE_PERFORMSORT_1                              3
1114 #define PROGRESS_BTREE_PHASE_PERFORMSORT_2                              4
1115 #define PROGRESS_BTREE_PHASE_LEAF_LOAD                                  5
1116
1117 /*
1118  * external entry points for btree, in nbtree.c
1119  */
1120 extern void btbuildempty(Relation index);
1121 extern bool btinsert(Relation rel, Datum *values, bool *isnull,
1122                                          ItemPointer ht_ctid, Relation heapRel,
1123                                          IndexUniqueCheck checkUnique,
1124                                          bool indexUnchanged,
1125                                          struct IndexInfo *indexInfo);
1126 extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
1127 extern Size btestimateparallelscan(void);
1128 extern void btinitparallelscan(void *target);
1129 extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
1130 extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
1131 extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
1132                                          ScanKey orderbys, int norderbys);
1133 extern void btparallelrescan(IndexScanDesc scan);
1134 extern void btendscan(IndexScanDesc scan);
1135 extern void btmarkpos(IndexScanDesc scan);
1136 extern void btrestrpos(IndexScanDesc scan);
1137 extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
1138                                                                                    IndexBulkDeleteResult *stats,
1139                                                                                    IndexBulkDeleteCallback callback,
1140                                                                                    void *callback_state);
1141 extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
1142                                                                                           IndexBulkDeleteResult *stats);
1143 extern bool btcanreturn(Relation index, int attno);
1144
1145 /*
1146  * prototypes for internal functions in nbtree.c
1147  */
1148 extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno);
1149 extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
1150 extern void _bt_parallel_done(IndexScanDesc scan);
1151 extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
1152
1153 /*
1154  * prototypes for functions in nbtdedup.c
1155  */
1156 extern void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel,
1157                                                    IndexTuple newitem, Size newitemsz,
1158                                                    bool checkingunique);
1159 extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
1160                                                                  Size newitemsz);
1161 extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
1162                                                                         OffsetNumber baseoff);
1163 extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
1164 extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state);
1165 extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids,
1166                                                                    int nhtids);
1167 extern void _bt_update_posting(BTVacuumPosting vacposting);
1168 extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
1169                                                                    int postingoff);
1170
1171 /*
1172  * prototypes for functions in nbtinsert.c
1173  */
1174 extern bool _bt_doinsert(Relation rel, IndexTuple itup,
1175                                                  IndexUniqueCheck checkUnique, bool indexUnchanged,
1176                                                  Relation heapRel);
1177 extern void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack);
1178 extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child);
1179
1180 /*
1181  * prototypes for functions in nbtsplitloc.c
1182  */
1183 extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
1184                                                                          OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1185                                                                          bool *newitemonleft);
1186
1187 /*
1188  * prototypes for functions in nbtpage.c
1189  */
1190 extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
1191                                                          bool allequalimage);
1192 extern bool _bt_vacuum_needs_cleanup(Relation rel);
1193 extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages);
1194 extern void _bt_upgrademetapage(Page page);
1195 extern Buffer _bt_getroot(Relation rel, int access);
1196 extern Buffer _bt_gettrueroot(Relation rel);
1197 extern int      _bt_getrootheight(Relation rel);
1198 extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
1199                                                         bool *allequalimage);
1200 extern void _bt_checkpage(Relation rel, Buffer buf);
1201 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
1202 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
1203                                                            BlockNumber blkno, int access);
1204 extern void _bt_relbuf(Relation rel, Buffer buf);
1205 extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
1206 extern void _bt_unlockbuf(Relation rel, Buffer buf);
1207 extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
1208 extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
1209 extern void _bt_pageinit(Page page, Size size);
1210 extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
1211                                                                 OffsetNumber *deletable, int ndeletable,
1212                                                                 BTVacuumPosting *updatable, int nupdatable);
1213 extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
1214                                                                           Relation heapRel,
1215                                                                           TM_IndexDeleteOp *delstate);
1216 extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate);
1217 extern void _bt_pendingfsm_init(Relation rel, BTVacState *vstate,
1218                                                                 bool cleanuponly);
1219 extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate);
1220
1221 /*
1222  * prototypes for functions in nbtsearch.c
1223  */
1224 extern BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP,
1225                                                   int access, Snapshot snapshot);
1226 extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf,
1227                                                         bool forupdate, BTStack stack, int access, Snapshot snapshot);
1228 extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
1229 extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
1230 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
1231 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
1232 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
1233                                                            Snapshot snapshot);
1234
1235 /*
1236  * prototypes for functions in nbtutils.c
1237  */
1238 extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
1239 extern void _bt_freestack(BTStack stack);
1240 extern void _bt_preprocess_array_keys(IndexScanDesc scan);
1241 extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
1242 extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
1243 extern void _bt_mark_array_keys(IndexScanDesc scan);
1244 extern void _bt_restore_array_keys(IndexScanDesc scan);
1245 extern void _bt_preprocess_keys(IndexScanDesc scan);
1246 extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
1247                                                   int tupnatts, ScanDirection dir, bool *continuescan);
1248 extern void _bt_killitems(IndexScanDesc scan);
1249 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
1250 extern BTCycleId _bt_start_vacuum(Relation rel);
1251 extern void _bt_end_vacuum(Relation rel);
1252 extern void _bt_end_vacuum_callback(int code, Datum arg);
1253 extern Size BTreeShmemSize(void);
1254 extern void BTreeShmemInit(void);
1255 extern bytea *btoptions(Datum reloptions, bool validate);
1256 extern bool btproperty(Oid index_oid, int attno,
1257                                            IndexAMProperty prop, const char *propname,
1258                                            bool *res, bool *isnull);
1259 extern char *btbuildphasename(int64 phasenum);
1260 extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
1261                                                            IndexTuple firstright, BTScanInsert itup_key);
1262 extern int      _bt_keep_natts_fast(Relation rel, IndexTuple lastleft,
1263                                                                 IndexTuple firstright);
1264 extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
1265                                                         OffsetNumber offnum);
1266 extern void _bt_check_third_page(Relation rel, Relation heap,
1267                                                                  bool needheaptidspace, Page page, IndexTuple newtup);
1268 extern bool _bt_allequalimage(Relation rel, bool debugmessage);
1269
1270 /*
1271  * prototypes for functions in nbtvalidate.c
1272  */
1273 extern bool btvalidate(Oid opclassoid);
1274 extern void btadjustmembers(Oid opfamilyoid,
1275                                                         Oid opclassoid,
1276                                                         List *operators,
1277                                                         List *functions);
1278
1279 /*
1280  * prototypes for functions in nbtsort.c
1281  */
1282 extern IndexBuildResult *btbuild(Relation heap, Relation index,
1283                                                                  struct IndexInfo *indexInfo);
1284 extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
1285
1286 #endif                                                  /* NBTREE_H */