src/include/access/nbtree.h

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtree.h
   4  *        header file for postgres btree access method implementation.
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/include/access/nbtree.h
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #ifndef NBTREE_H
  15 #define NBTREE_H
  16
  17 #include "access/amapi.h"
  18 #include "access/itup.h"
  19 #include "access/sdir.h"
  20 #include "access/tableam.h"
  21 #include "access/xlogreader.h"
  22 #include "catalog/pg_am_d.h"
  23 #include "catalog/pg_index.h"
  24 #include "lib/stringinfo.h"
  25 #include "storage/bufmgr.h"
  26 #include "storage/shm_toc.h"
  27
  28 /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
  29 typedef uint16 BTCycleId;
  30
  31 /*
  32  *      BTPageOpaqueData -- At the end of every page, we store a pointer
  33  *      to both siblings in the tree.  This is used to do forward/backward
  34  *      index scans.  The next-page link is also critical for recovery when
  35  *      a search has navigated to the wrong page due to concurrent page splits
  36  *      or deletions; see src/backend/access/nbtree/README for more info.
  37  *
  38  *      In addition, we store the page's btree level (counting upwards from
  39  *      zero at a leaf page) as well as some flag bits indicating the page type
  40  *      and status.  If the page is deleted, a BTDeletedPageData struct is stored
  41  *      in the page's tuple area, while a standard BTPageOpaqueData struct is
  42  *      stored in the page special area.
  43  *
  44  *      We also store a "vacuum cycle ID".  When a page is split while VACUUM is
  45  *      processing the index, a nonzero value associated with the VACUUM run is
  46  *      stored into both halves of the split page.  (If VACUUM is not running,
  47  *      both pages receive zero cycleids.)      This allows VACUUM to detect whether
  48  *      a page was split since it started, with a small probability of false match
  49  *      if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
  50  *      ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
  51  *      (original) page, and set in the right page, but only if the next page
  52  *      to its right has a different cycleid.
  53  *
  54  *      NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
  55  *      instead.
  56  *
  57  *      NOTE: the btpo_level field used to be a union type in order to allow
  58  *      deleted pages to store a 32-bit safexid in the same field.  We now store
  59  *      64-bit/full safexid values using BTDeletedPageData instead.
  60  */
  61
  62 typedef struct BTPageOpaqueData
  63 {
  64         BlockNumber btpo_prev;          /* left sibling, or P_NONE if leftmost */
  65         BlockNumber btpo_next;          /* right sibling, or P_NONE if rightmost */
  66         uint32          btpo_level;             /* tree level --- zero for leaf pages */
  67         uint16          btpo_flags;             /* flag bits, see below */
  68         BTCycleId       btpo_cycleid;   /* vacuum cycle ID of latest split */
  69 } BTPageOpaqueData;
  70
  71 typedef BTPageOpaqueData *BTPageOpaque;
  72
  73 #define BTPageGetOpaque(page) ((BTPageOpaque) PageGetSpecialPointer(page))
  74
  75 /* Bits defined in btpo_flags */
  76 #define BTP_LEAF                (1 << 0)        /* leaf page, i.e. not internal page */
  77 #define BTP_ROOT                (1 << 1)        /* root page (has no parent) */
  78 #define BTP_DELETED             (1 << 2)        /* page has been deleted from tree */
  79 #define BTP_META                (1 << 3)        /* meta-page */
  80 #define BTP_HALF_DEAD   (1 << 4)        /* empty, but still in tree */
  81 #define BTP_SPLIT_END   (1 << 5)        /* rightmost page of split group */
  82 #define BTP_HAS_GARBAGE (1 << 6)        /* page has LP_DEAD tuples (deprecated) */
  83 #define BTP_INCOMPLETE_SPLIT (1 << 7)   /* right sibling's downlink is missing */
  84 #define BTP_HAS_FULLXID (1 << 8)        /* contains BTDeletedPageData */
  85
  86 /*
  87  * The max allowed value of a cycle ID is a bit less than 64K.  This is
  88  * for convenience of pg_filedump and similar utilities: we want to use
  89  * the last 2 bytes of special space as an index type indicator, and
  90  * restricting cycle ID lets btree use that space for vacuum cycle IDs
  91  * while still allowing index type to be identified.
  92  */
  93 #define MAX_BT_CYCLE_ID         0xFF7F
  94
  95
  96 /*
  97  * The Meta page is always the first page in the btree index.
  98  * Its primary purpose is to point to the location of the btree root page.
  99  * We also point to the "fast" root, which is the current effective root;
 100  * see README for discussion.
 101  */
 102
 103 typedef struct BTMetaPageData
 104 {
 105         uint32          btm_magic;              /* should contain BTREE_MAGIC */
 106         uint32          btm_version;    /* nbtree version (always <= BTREE_VERSION) */
 107         BlockNumber btm_root;           /* current root location */
 108         uint32          btm_level;              /* tree level of the root page */
 109         BlockNumber btm_fastroot;       /* current "fast" root location */
 110         uint32          btm_fastlevel;  /* tree level of the "fast" root page */
 111         /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
 112
 113         /* number of deleted, non-recyclable pages during last cleanup */
 114         uint32          btm_last_cleanup_num_delpages;
 115         /* number of heap tuples during last cleanup (deprecated) */
 116         float8          btm_last_cleanup_num_heap_tuples;
 117
 118         bool            btm_allequalimage;      /* are all columns "equalimage"? */
 119 } BTMetaPageData;
 120
 121 #define BTPageGetMeta(p) \
 122         ((BTMetaPageData *) PageGetContents(p))
 123
 124 /*
 125  * The current Btree version is 4.  That's what you'll get when you create
 126  * a new index.
 127  *
 128  * Btree version 3 was used in PostgreSQL v11.  It is mostly the same as
 129  * version 4, but heap TIDs were not part of the keyspace.  Index tuples
 130  * with duplicate keys could be stored in any order.  We continue to
 131  * support reading and writing Btree versions 2 and 3, so that they don't
 132  * need to be immediately re-indexed at pg_upgrade.  In order to get the
 133  * new heapkeyspace semantics, however, a REINDEX is needed.
 134  *
 135  * Deduplication is safe to use when the btm_allequalimage field is set to
 136  * true.  It's safe to read the btm_allequalimage field on version 3, but
 137  * only version 4 indexes make use of deduplication.  Even version 4
 138  * indexes created on PostgreSQL v12 will need a REINDEX to make use of
 139  * deduplication, though, since there is no other way to set
 140  * btm_allequalimage to true (pg_upgrade hasn't been taught to set the
 141  * metapage field).
 142  *
 143  * Btree version 2 is mostly the same as version 3.  There are two new
 144  * fields in the metapage that were introduced in version 3.  A version 2
 145  * metapage will be automatically upgraded to version 3 on the first
 146  * insert to it.  INCLUDE indexes cannot use version 2.
 147  */
 148 #define BTREE_METAPAGE  0               /* first page is meta */
 149 #define BTREE_MAGIC             0x053162        /* magic number in metapage */
 150 #define BTREE_VERSION   4               /* current version number */
 151 #define BTREE_MIN_VERSION       2       /* minimum supported version */
 152 #define BTREE_NOVAC_VERSION     3       /* version with all meta fields set */
 153
 154 /*
 155  * Maximum size of a btree index entry, including its tuple header.
 156  *
 157  * We actually need to be able to fit three items on every page,
 158  * so restrict any one item to 1/3 the per-page available space.
 159  *
 160  * There are rare cases where _bt_truncate() will need to enlarge
 161  * a heap index tuple to make space for a tiebreaker heap TID
 162  * attribute, which we account for here.
 163  */
 164 #define BTMaxItemSize(page) \
 165         (MAXALIGN_DOWN((PageGetPageSize(page) - \
 166                                         MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
 167                                         MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \
 168                                         MAXALIGN(sizeof(ItemPointerData)))
 169 #define BTMaxItemSizeNoHeapTid(page) \
 170         MAXALIGN_DOWN((PageGetPageSize(page) - \
 171                                    MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
 172                                    MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
 173
 174 /*
 175  * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples
 176  * that may be stored on a btree leaf page.  It is used to size the
 177  * per-page temporary buffers.
 178  *
 179  * Note: we don't bother considering per-tuple overheads here to keep
 180  * things simple (value is based on how many elements a single array of
 181  * heap TIDs must have to fill the space between the page header and
 182  * special area).  The value is slightly higher (i.e. more conservative)
 183  * than necessary as a result, which is considered acceptable.
 184  */
 185 #define MaxTIDsPerBTreePage \
 186         (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
 187                    sizeof(ItemPointerData))
 188
 189 /*
 190  * The leaf-page fillfactor defaults to 90% but is user-adjustable.
 191  * For pages above the leaf level, we use a fixed 70% fillfactor.
 192  * The fillfactor is applied during index build and when splitting
 193  * a rightmost page; when splitting non-rightmost pages we try to
 194  * divide the data equally.  When splitting a page that's entirely
 195  * filled with a single value (duplicates), the effective leaf-page
 196  * fillfactor is 96%, regardless of whether the page is a rightmost
 197  * page.
 198  */
 199 #define BTREE_MIN_FILLFACTOR            10
 200 #define BTREE_DEFAULT_FILLFACTOR        90
 201 #define BTREE_NONLEAF_FILLFACTOR        70
 202 #define BTREE_SINGLEVAL_FILLFACTOR      96
 203
 204 /*
 205  *      In general, the btree code tries to localize its knowledge about
 206  *      page layout to a couple of routines.  However, we need a special
 207  *      value to indicate "no page number" in those places where we expect
 208  *      page numbers.  We can use zero for this because we never need to
 209  *      make a pointer to the metadata page.
 210  */
 211
 212 #define P_NONE                  0
 213
 214 /*
 215  * Macros to test whether a page is leftmost or rightmost on its tree level,
 216  * as well as other state info kept in the opaque data.
 217  */
 218 #define P_LEFTMOST(opaque)              ((opaque)->btpo_prev == P_NONE)
 219 #define P_RIGHTMOST(opaque)             ((opaque)->btpo_next == P_NONE)
 220 #define P_ISLEAF(opaque)                (((opaque)->btpo_flags & BTP_LEAF) != 0)
 221 #define P_ISROOT(opaque)                (((opaque)->btpo_flags & BTP_ROOT) != 0)
 222 #define P_ISDELETED(opaque)             (((opaque)->btpo_flags & BTP_DELETED) != 0)
 223 #define P_ISMETA(opaque)                (((opaque)->btpo_flags & BTP_META) != 0)
 224 #define P_ISHALFDEAD(opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 225 #define P_IGNORE(opaque)                (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 226 #define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 227 #define P_INCOMPLETE_SPLIT(opaque)      (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 228 #define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
 229
 230 /*
 231  * BTDeletedPageData is the page contents of a deleted page
 232  */
 233 typedef struct BTDeletedPageData
 234 {
 235         FullTransactionId safexid;      /* See BTPageIsRecyclable() */
 236 } BTDeletedPageData;
 237
 238 static inline void
 239 BTPageSetDeleted(Page page, FullTransactionId safexid)
 240 {
 241         BTPageOpaque opaque;
 242         PageHeader      header;
 243         BTDeletedPageData *contents;
 244
 245         opaque = BTPageGetOpaque(page);
 246         header = ((PageHeader) page);
 247
 248         opaque->btpo_flags &= ~BTP_HALF_DEAD;
 249         opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
 250         header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
 251                 sizeof(BTDeletedPageData);
 252         header->pd_upper = header->pd_special;
 253
 254         /* Set safexid in deleted page */
 255         contents = ((BTDeletedPageData *) PageGetContents(page));
 256         contents->safexid = safexid;
 257 }
 258
 259 static inline FullTransactionId
 260 BTPageGetDeleteXid(Page page)
 261 {
 262         BTPageOpaque opaque;
 263         BTDeletedPageData *contents;
 264
 265         /* We only expect to be called with a deleted page */
 266         Assert(!PageIsNew(page));
 267         opaque = BTPageGetOpaque(page);
 268         Assert(P_ISDELETED(opaque));
 269
 270         /* pg_upgrade'd deleted page -- must be safe to recycle now */
 271         if (!P_HAS_FULLXID(opaque))
 272                 return FirstNormalFullTransactionId;
 273
 274         /* Get safexid from deleted page */
 275         contents = ((BTDeletedPageData *) PageGetContents(page));
 276         return contents->safexid;
 277 }
 278
 279 /*
 280  * Is an existing page recyclable?
 281  *
 282  * This exists to centralize the policy on which deleted pages are now safe to
 283  * re-use.  However, _bt_pendingfsm_finalize() duplicates some of the same
 284  * logic because it doesn't work directly with pages -- keep the two in sync.
 285  *
 286  * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
 287  * them here (caller is responsible for that case themselves).  Caller might
 288  * well need special handling for new pages anyway.
 289  */
 290 static inline bool
 291 BTPageIsRecyclable(Page page, Relation heaprel)
 292 {
 293         BTPageOpaque opaque;
 294
 295         Assert(!PageIsNew(page));
 296         Assert(heaprel != NULL);
 297
 298         /* Recycling okay iff page is deleted and safexid is old enough */
 299         opaque = BTPageGetOpaque(page);
 300         if (P_ISDELETED(opaque))
 301         {
 302                 FullTransactionId safexid = BTPageGetDeleteXid(page);
 303
 304                 /*
 305                  * The page was deleted, but when? If it was just deleted, a scan
 306                  * might have seen the downlink to it, and will read the page later.
 307                  * As long as that can happen, we must keep the deleted page around as
 308                  * a tombstone.
 309                  *
 310                  * For that check if the deletion XID could still be visible to
 311                  * anyone. If not, then no scan that's still in progress could have
 312                  * seen its downlink, and we can recycle it.
 313                  */
 314                 return GlobalVisCheckRemovableFullXid(heaprel, safexid);
 315         }
 316
 317         return false;
 318 }
 319
 320 /*
 321  * BTVacState and BTPendingFSM are private nbtree.c state used during VACUUM.
 322  * They are exported for use by page deletion related code in nbtpage.c.
 323  */
 324 typedef struct BTPendingFSM
 325 {
 326         BlockNumber target;                     /* Page deleted by current VACUUM */
 327         FullTransactionId safexid;      /* Page's BTDeletedPageData.safexid */
 328 } BTPendingFSM;
 329
 330 typedef struct BTVacState
 331 {
 332         IndexVacuumInfo *info;
 333         IndexBulkDeleteResult *stats;
 334         IndexBulkDeleteCallback callback;
 335         void       *callback_state;
 336         BTCycleId       cycleid;
 337         MemoryContext pagedelcontext;
 338
 339         /*
 340          * _bt_pendingfsm_finalize() state
 341          */
 342         int                     bufsize;                /* pendingpages space (in # elements) */
 343         int                     maxbufsize;             /* max bufsize that respects work_mem */
 344         BTPendingFSM *pendingpages; /* One entry per newly deleted page */
 345         int                     npendingpages;  /* current # valid pendingpages */
 346 } BTVacState;
 347
 348 /*
 349  *      Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
 350  *      page.  The high key is not a tuple that is used to visit the heap.  It is
 351  *      a pivot tuple (see "Notes on B-Tree tuple format" below for definition).
 352  *      The high key on a page is required to be greater than or equal to any
 353  *      other key that appears on the page.  If we find ourselves trying to
 354  *      insert a key that is strictly > high key, we know we need to move right
 355  *      (this should only happen if the page was split since we examined the
 356  *      parent page).
 357  *
 358  *      Our insertion algorithm guarantees that we can use the initial least key
 359  *      on our right sibling as the high key.  Once a page is created, its high
 360  *      key changes only if the page is split.
 361  *
 362  *      On a non-rightmost page, the high key lives in item 1 and data items
 363  *      start in item 2.  Rightmost pages have no high key, so we store data
 364  *      items beginning in item 1.
 365  */
 366
 367 #define P_HIKEY                         ((OffsetNumber) 1)
 368 #define P_FIRSTKEY                      ((OffsetNumber) 2)
 369 #define P_FIRSTDATAKEY(opaque)  (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 370
 371 /*
 372  * Notes on B-Tree tuple format, and key and non-key attributes:
 373  *
 374  * INCLUDE B-Tree indexes have non-key attributes.  These are extra
 375  * attributes that may be returned by index-only scans, but do not influence
 376  * the order of items in the index (formally, non-key attributes are not
 377  * considered to be part of the key space).  Non-key attributes are only
 378  * present in leaf index tuples whose item pointers actually point to heap
 379  * tuples (non-pivot tuples).  _bt_check_natts() enforces the rules
 380  * described here.
 381  *
 382  * Non-pivot tuple format (plain/non-posting variant):
 383  *
 384  *  t_tid | t_info | key values | INCLUDE columns, if any
 385  *
 386  * t_tid points to the heap TID, which is a tiebreaker key column as of
 387  * BTREE_VERSION 4.
 388  *
 389  * Non-pivot tuples complement pivot tuples, which only have key columns.
 390  * The sole purpose of pivot tuples is to represent how the key space is
 391  * separated.  In general, any B-Tree index that has more than one level
 392  * (i.e. any index that does not just consist of a metapage and a single
 393  * leaf root page) must have some number of pivot tuples, since pivot
 394  * tuples are used for traversing the tree.  Suffix truncation can omit
 395  * trailing key columns when a new pivot is formed, which makes minus
 396  * infinity their logical value.  Since BTREE_VERSION 4 indexes treat heap
 397  * TID as a trailing key column that ensures that all index tuples are
 398  * physically unique, it is necessary to represent heap TID as a trailing
 399  * key column in pivot tuples, though very often this can be truncated
 400  * away, just like any other key column. (Actually, the heap TID is
 401  * omitted rather than truncated, since its representation is different to
 402  * the non-pivot representation.)
 403  *
 404  * Pivot tuple format:
 405  *
 406  *  t_tid | t_info | key values | [heap TID]
 407  *
 408  * We store the number of columns present inside pivot tuples by abusing
 409  * their t_tid offset field, since pivot tuples never need to store a real
 410  * offset (pivot tuples generally store a downlink in t_tid, though).  The
 411  * offset field only stores the number of columns/attributes when the
 412  * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
 413  * TID column sometimes stored in pivot tuples -- that's represented by
 414  * the presence of BT_PIVOT_HEAP_TID_ATTR.  The INDEX_ALT_TID_MASK bit in
 415  * t_info is always set on BTREE_VERSION 4 pivot tuples, since
 416  * BTreeTupleIsPivot() must work reliably on heapkeyspace versions.
 417  *
 418  * In version 2 or version 3 (!heapkeyspace) indexes, INDEX_ALT_TID_MASK
 419  * might not be set in pivot tuples.  BTreeTupleIsPivot() won't work
 420  * reliably as a result.  The number of columns stored is implicitly the
 421  * same as the number of columns in the index, just like any non-pivot
 422  * tuple. (The number of columns stored should not vary, since suffix
 423  * truncation of key columns is unsafe within any !heapkeyspace index.)
 424  *
 425  * The 12 least significant bits from t_tid's offset number are used to
 426  * represent the number of key columns within a pivot tuple.  This leaves 4
 427  * status bits (BT_STATUS_OFFSET_MASK bits), which are shared by all tuples
 428  * that have the INDEX_ALT_TID_MASK bit set (set in t_info) to store basic
 429  * tuple metadata.  BTreeTupleIsPivot() and BTreeTupleIsPosting() use the
 430  * BT_STATUS_OFFSET_MASK bits.
 431  *
 432  * Sometimes non-pivot tuples also use a representation that repurposes
 433  * t_tid to store metadata rather than a TID.  PostgreSQL v13 introduced a
 434  * new non-pivot tuple format to support deduplication: posting list
 435  * tuples.  Deduplication merges together multiple equal non-pivot tuples
 436  * into a logically equivalent, space efficient representation.  A posting
 437  * list is an array of ItemPointerData elements.  Non-pivot tuples are
 438  * merged together to form posting list tuples lazily, at the point where
 439  * we'd otherwise have to split a leaf page.
 440  *
 441  * Posting tuple format (alternative non-pivot tuple representation):
 442  *
 443  *  t_tid | t_info | key values | posting list (TID array)
 444  *
 445  * Posting list tuples are recognized as such by having the
 446  * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status
 447  * bit set in t_tid's offset number.  These flags redefine the content of
 448  * the posting tuple's t_tid to store the location of the posting list
 449  * (instead of a block number), as well as the total number of heap TIDs
 450  * present in the tuple (instead of a real offset number).
 451  *
 452  * The 12 least significant bits from t_tid's offset number are used to
 453  * represent the number of heap TIDs present in the tuple, leaving 4 status
 454  * bits (the BT_STATUS_OFFSET_MASK bits).  Like any non-pivot tuple, the
 455  * number of columns stored is always implicitly the total number in the
 456  * index (in practice there can never be non-key columns stored, since
 457  * deduplication is not supported with INCLUDE indexes).
 458  */
 459 #define INDEX_ALT_TID_MASK                      INDEX_AM_RESERVED_BIT
 460
 461 /* Item pointer offset bit masks */
 462 #define BT_OFFSET_MASK                          0x0FFF
 463 #define BT_STATUS_OFFSET_MASK           0xF000
 464 /* BT_STATUS_OFFSET_MASK status bits */
 465 #define BT_PIVOT_HEAP_TID_ATTR          0x1000
 466 #define BT_IS_POSTING                           0x2000
 467
 468 /*
 469  * Mask allocated for number of keys in index tuple must be able to fit
 470  * maximum possible number of index attributes
 471  */
 472 StaticAssertDecl(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
 473                                  "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
 474
 475 /*
 476  * Note: BTreeTupleIsPivot() can have false negatives (but not false
 477  * positives) when used with !heapkeyspace indexes
 478  */
 479 static inline bool
 480 BTreeTupleIsPivot(IndexTuple itup)
 481 {
 482         if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
 483                 return false;
 484         /* absence of BT_IS_POSTING in offset number indicates pivot tuple */
 485         if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0)
 486                 return false;
 487
 488         return true;
 489 }
 490
 491 static inline bool
 492 BTreeTupleIsPosting(IndexTuple itup)
 493 {
 494         if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
 495                 return false;
 496         /* presence of BT_IS_POSTING in offset number indicates posting tuple */
 497         if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0)
 498                 return false;
 499
 500         return true;
 501 }
 502
 503 static inline void
 504 BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
 505 {
 506         Assert(nhtids > 1);
 507         Assert((nhtids & BT_STATUS_OFFSET_MASK) == 0);
 508         Assert((size_t) postingoffset == MAXALIGN(postingoffset));
 509         Assert(postingoffset < INDEX_SIZE_MASK);
 510         Assert(!BTreeTupleIsPivot(itup));
 511
 512         itup->t_info |= INDEX_ALT_TID_MASK;
 513         ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
 514         ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
 515 }
 516
 517 static inline uint16
 518 BTreeTupleGetNPosting(IndexTuple posting)
 519 {
 520         OffsetNumber existing;
 521
 522         Assert(BTreeTupleIsPosting(posting));
 523
 524         existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid);
 525         return (existing & BT_OFFSET_MASK);
 526 }
 527
 528 static inline uint32
 529 BTreeTupleGetPostingOffset(IndexTuple posting)
 530 {
 531         Assert(BTreeTupleIsPosting(posting));
 532
 533         return ItemPointerGetBlockNumberNoCheck(&posting->t_tid);
 534 }
 535
 536 static inline ItemPointer
 537 BTreeTupleGetPosting(IndexTuple posting)
 538 {
 539         return (ItemPointer) ((char *) posting +
 540                                                   BTreeTupleGetPostingOffset(posting));
 541 }
 542
 543 static inline ItemPointer
 544 BTreeTupleGetPostingN(IndexTuple posting, int n)
 545 {
 546         return BTreeTupleGetPosting(posting) + n;
 547 }
 548
 549 /*
 550  * Get/set downlink block number in pivot tuple.
 551  *
 552  * Note: Cannot assert that tuple is a pivot tuple.  If we did so then
 553  * !heapkeyspace indexes would exhibit false positive assertion failures.
 554  */
 555 static inline BlockNumber
 556 BTreeTupleGetDownLink(IndexTuple pivot)
 557 {
 558         return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid);
 559 }
 560
 561 static inline void
 562 BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
 563 {
 564         ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
 565 }
 566
 567 /*
 568  * Get number of attributes within tuple.
 569  *
 570  * Note that this does not include an implicit tiebreaker heap TID
 571  * attribute, if any.  Note also that the number of key attributes must be
 572  * explicitly represented in all heapkeyspace pivot tuples.
 573  *
 574  * Note: This is defined as a macro rather than an inline function to
 575  * avoid including rel.h.
 576  */
 577 #define BTreeTupleGetNAtts(itup, rel)   \
 578         ( \
 579                 (BTreeTupleIsPivot(itup)) ? \
 580                 ( \
 581                         ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
 582                 ) \
 583                 : \
 584                 IndexRelationGetNumberOfAttributes(rel) \
 585         )
 586
 587 /*
 588  * Set number of key attributes in tuple.
 589  *
 590  * The heap TID tiebreaker attribute bit may also be set here, indicating that
 591  * a heap TID value will be stored at the end of the tuple (i.e. using the
 592  * special pivot tuple representation).
 593  */
 594 static inline void
 595 BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
 596 {
 597         Assert(nkeyatts <= INDEX_MAX_KEYS);
 598         Assert((nkeyatts & BT_STATUS_OFFSET_MASK) == 0);
 599         Assert(!heaptid || nkeyatts > 0);
 600         Assert(!BTreeTupleIsPivot(itup) || nkeyatts == 0);
 601
 602         itup->t_info |= INDEX_ALT_TID_MASK;
 603
 604         if (heaptid)
 605                 nkeyatts |= BT_PIVOT_HEAP_TID_ATTR;
 606
 607         /* BT_IS_POSTING bit is deliberately unset here */
 608         ItemPointerSetOffsetNumber(&itup->t_tid, nkeyatts);
 609         Assert(BTreeTupleIsPivot(itup));
 610 }
 611
 612 /*
 613  * Get/set leaf page's "top parent" link from its high key.  Used during page
 614  * deletion.
 615  *
 616  * Note: Cannot assert that tuple is a pivot tuple.  If we did so then
 617  * !heapkeyspace indexes would exhibit false positive assertion failures.
 618  */
 619 static inline BlockNumber
 620 BTreeTupleGetTopParent(IndexTuple leafhikey)
 621 {
 622         return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
 623 }
 624
 625 static inline void
 626 BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
 627 {
 628         ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno);
 629         BTreeTupleSetNAtts(leafhikey, 0, false);
 630 }
 631
 632 /*
 633  * Get tiebreaker heap TID attribute, if any.
 634  *
 635  * This returns the first/lowest heap TID in the case of a posting list tuple.
 636  */
 637 static inline ItemPointer
 638 BTreeTupleGetHeapTID(IndexTuple itup)
 639 {
 640         if (BTreeTupleIsPivot(itup))
 641         {
 642                 /* Pivot tuple heap TID representation? */
 643                 if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
 644                          BT_PIVOT_HEAP_TID_ATTR) != 0)
 645                         return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
 646                                                                   sizeof(ItemPointerData));
 647
 648                 /* Heap TID attribute was truncated */
 649                 return NULL;
 650         }
 651         else if (BTreeTupleIsPosting(itup))
 652                 return BTreeTupleGetPosting(itup);
 653
 654         return &itup->t_tid;
 655 }
 656
 657 /*
 658  * Get maximum heap TID attribute, which could be the only TID in the case of
 659  * a non-pivot tuple that does not have a posting list.
 660  *
 661  * Works with non-pivot tuples only.
 662  */
 663 static inline ItemPointer
 664 BTreeTupleGetMaxHeapTID(IndexTuple itup)
 665 {
 666         Assert(!BTreeTupleIsPivot(itup));
 667
 668         if (BTreeTupleIsPosting(itup))
 669         {
 670                 uint16          nposting = BTreeTupleGetNPosting(itup);
 671
 672                 return BTreeTupleGetPostingN(itup, nposting - 1);
 673         }
 674
 675         return &itup->t_tid;
 676 }
 677
 678 /*
 679  *      Operator strategy numbers for B-tree have been moved to access/stratnum.h,
 680  *      because many places need to use them in ScanKeyInit() calls.
 681  *
 682  *      The strategy numbers are chosen so that we can commute them by
 683  *      subtraction, thus:
 684  */
 685 #define BTCommuteStrategyNumber(strat)  (BTMaxStrategyNumber + 1 - (strat))
 686
 687 /*
 688  *      When a new operator class is declared, we require that the user
 689  *      supply us with an amproc procedure (BTORDER_PROC) for determining
 690  *      whether, for two keys a and b, a < b, a = b, or a > b.  This routine
 691  *      must return < 0, 0, > 0, respectively, in these three cases.
 692  *
 693  *      To facilitate accelerated sorting, an operator class may choose to
 694  *      offer a second procedure (BTSORTSUPPORT_PROC).  For full details, see
 695  *      src/include/utils/sortsupport.h.
 696  *
 697  *      To support window frames defined by "RANGE offset PRECEDING/FOLLOWING",
 698  *      an operator class may choose to offer a third amproc procedure
 699  *      (BTINRANGE_PROC), independently of whether it offers sortsupport.
 700  *      For full details, see doc/src/sgml/btree.sgml.
 701  *
 702  *      To facilitate B-Tree deduplication, an operator class may choose to
 703  *      offer a forth amproc procedure (BTEQUALIMAGE_PROC).  For full details,
 704  *      see doc/src/sgml/btree.sgml.
 705  */
 706
 707 #define BTORDER_PROC            1
 708 #define BTSORTSUPPORT_PROC      2
 709 #define BTINRANGE_PROC          3
 710 #define BTEQUALIMAGE_PROC       4
 711 #define BTOPTIONS_PROC          5
 712 #define BTNProcs                        5
 713
 714 /*
 715  *      We need to be able to tell the difference between read and write
 716  *      requests for pages, in order to do locking correctly.
 717  */
 718
 719 #define BT_READ                 BUFFER_LOCK_SHARE
 720 #define BT_WRITE                BUFFER_LOCK_EXCLUSIVE
 721
 722 /*
 723  * BTStackData -- As we descend a tree, we push the location of pivot
 724  * tuples whose downlink we are about to follow onto a private stack.  If
 725  * we split a leaf, we use this stack to walk back up the tree and insert
 726  * data into its parent page at the correct location.  We also have to
 727  * recursively insert into the grandparent page if and when the parent page
 728  * splits.  Our private stack can become stale due to concurrent page
 729  * splits and page deletions, but it should never give us an irredeemably
 730  * bad picture.
 731  */
 732 typedef struct BTStackData
 733 {
 734         BlockNumber bts_blkno;
 735         OffsetNumber bts_offset;
 736         struct BTStackData *bts_parent;
 737 } BTStackData;
 738
 739 typedef BTStackData *BTStack;
 740
 741 /*
 742  * BTScanInsertData is the btree-private state needed to find an initial
 743  * position for an indexscan, or to insert new tuples -- an "insertion
 744  * scankey" (not to be confused with a search scankey).  It's used to descend
 745  * a B-Tree using _bt_search.
 746  *
 747  * heapkeyspace indicates if we expect all keys in the index to be physically
 748  * unique because heap TID is used as a tiebreaker attribute, and if index may
 749  * have truncated key attributes in pivot tuples.  This is actually a property
 750  * of the index relation itself (not an indexscan).  heapkeyspace indexes are
 751  * indexes whose version is >= version 4.  It's convenient to keep this close
 752  * by, rather than accessing the metapage repeatedly.
 753  *
 754  * allequalimage is set to indicate that deduplication is safe for the index.
 755  * This is also a property of the index relation rather than an indexscan.
 756  *
 757  * anynullkeys indicates if any of the keys had NULL value when scankey was
 758  * built from index tuple (note that already-truncated tuple key attributes
 759  * set NULL as a placeholder key value, which also affects value of
 760  * anynullkeys).  This is a convenience for unique index non-pivot tuple
 761  * insertion, which usually temporarily unsets scantid, but shouldn't iff
 762  * anynullkeys is true.  Value generally matches non-pivot tuple's HasNulls
 763  * bit, but may not when inserting into an INCLUDE index (tuple header value
 764  * is affected by the NULL-ness of both key and non-key attributes).
 765  *
 766  * See comments in _bt_first for an explanation of the nextkey and backward
 767  * fields.
 768  *
 769  * scantid is the heap TID that is used as a final tiebreaker attribute.  It
 770  * is set to NULL when index scan doesn't need to find a position for a
 771  * specific physical tuple.  Must be set when inserting new tuples into
 772  * heapkeyspace indexes, since every tuple in the tree unambiguously belongs
 773  * in one exact position (it's never set with !heapkeyspace indexes, though).
 774  * Despite the representational difference, nbtree search code considers
 775  * scantid to be just another insertion scankey attribute.
 776  *
 777  * scankeys is an array of scan key entries for attributes that are compared
 778  * before scantid (user-visible attributes).  keysz is the size of the array.
 779  * During insertion, there must be a scan key for every attribute, but when
 780  * starting a regular index scan some can be omitted.  The array is used as a
 781  * flexible array member, though it's sized in a way that makes it possible to
 782  * use stack allocations.  See nbtree/README for full details.
 783  */
 784 typedef struct BTScanInsertData
 785 {
 786         bool            heapkeyspace;
 787         bool            allequalimage;
 788         bool            anynullkeys;
 789         bool            nextkey;
 790         bool            backward;               /* backward index scan? */
 791         ItemPointer scantid;            /* tiebreaker for scankeys */
 792         int                     keysz;                  /* Size of scankeys array */
 793         ScanKeyData scankeys[INDEX_MAX_KEYS];   /* Must appear last */
 794 } BTScanInsertData;
 795
 796 typedef BTScanInsertData *BTScanInsert;
 797
 798 /*
 799  * BTInsertStateData is a working area used during insertion.
 800  *
 801  * This is filled in after descending the tree to the first leaf page the new
 802  * tuple might belong on.  Tracks the current position while performing
 803  * uniqueness check, before we have determined which exact page to insert
 804  * to.
 805  *
 806  * (This should be private to nbtinsert.c, but it's also used by
 807  * _bt_binsrch_insert)
 808  */
 809 typedef struct BTInsertStateData
 810 {
 811         IndexTuple      itup;                   /* Item we're inserting */
 812         Size            itemsz;                 /* Size of itup -- should be MAXALIGN()'d */
 813         BTScanInsert itup_key;          /* Insertion scankey */
 814
 815         /* Buffer containing leaf page we're likely to insert itup on */
 816         Buffer          buf;
 817
 818         /*
 819          * Cache of bounds within the current buffer.  Only used for insertions
 820          * where _bt_check_unique is called.  See _bt_binsrch_insert and
 821          * _bt_findinsertloc for details.
 822          */
 823         bool            bounds_valid;
 824         OffsetNumber low;
 825         OffsetNumber stricthigh;
 826
 827         /*
 828          * if _bt_binsrch_insert found the location inside existing posting list,
 829          * save the position inside the list.  -1 sentinel value indicates overlap
 830          * with an existing posting list tuple that has its LP_DEAD bit set.
 831          */
 832         int                     postingoff;
 833 } BTInsertStateData;
 834
 835 typedef BTInsertStateData *BTInsertState;
 836
 837 /*
 838  * State used to representing an individual pending tuple during
 839  * deduplication.
 840  */
 841 typedef struct BTDedupInterval
 842 {
 843         OffsetNumber baseoff;
 844         uint16          nitems;
 845 } BTDedupInterval;
 846
 847 /*
 848  * BTDedupStateData is a working area used during deduplication.
 849  *
 850  * The status info fields track the state of a whole-page deduplication pass.
 851  * State about the current pending posting list is also tracked.
 852  *
 853  * A pending posting list is comprised of a contiguous group of equal items
 854  * from the page, starting from page offset number 'baseoff'.  This is the
 855  * offset number of the "base" tuple for new posting list.  'nitems' is the
 856  * current total number of existing items from the page that will be merged to
 857  * make a new posting list tuple, including the base tuple item.  (Existing
 858  * items may themselves be posting list tuples, or regular non-pivot tuples.)
 859  *
 860  * The total size of the existing tuples to be freed when pending posting list
 861  * is processed gets tracked by 'phystupsize'.  This information allows
 862  * deduplication to calculate the space saving for each new posting list
 863  * tuple, and for the entire pass over the page as a whole.
 864  */
 865 typedef struct BTDedupStateData
 866 {
 867         /* Deduplication status info for entire pass over page */
 868         bool            deduplicate;    /* Still deduplicating page? */
 869         int                     nmaxitems;              /* Number of max-sized tuples so far */
 870         Size            maxpostingsize; /* Limit on size of final tuple */
 871
 872         /* Metadata about base tuple of current pending posting list */
 873         IndexTuple      base;                   /* Use to form new posting list */
 874         OffsetNumber baseoff;           /* page offset of base */
 875         Size            basetupsize;    /* base size without original posting list */
 876
 877         /* Other metadata about pending posting list */
 878         ItemPointer htids;                      /* Heap TIDs in pending posting list */
 879         int                     nhtids;                 /* Number of heap TIDs in htids array */
 880         int                     nitems;                 /* Number of existing tuples/line pointers */
 881         Size            phystupsize;    /* Includes line pointer overhead */
 882
 883         /*
 884          * Array of tuples to go on new version of the page.  Contains one entry
 885          * for each group of consecutive items.  Note that existing tuples that
 886          * will not become posting list tuples do not appear in the array (they
 887          * are implicitly unchanged by deduplication pass).
 888          */
 889         int                     nintervals;             /* current number of intervals in array */
 890         BTDedupInterval intervals[MaxIndexTuplesPerPage];
 891 } BTDedupStateData;
 892
 893 typedef BTDedupStateData *BTDedupState;
 894
 895 /*
 896  * BTVacuumPostingData is state that represents how to VACUUM (or delete) a
 897  * posting list tuple when some (though not all) of its TIDs are to be
 898  * deleted.
 899  *
 900  * Convention is that itup field is the original posting list tuple on input,
 901  * and palloc()'d final tuple used to overwrite existing tuple on output.
 902  */
 903 typedef struct BTVacuumPostingData
 904 {
 905         /* Tuple that will be/was updated */
 906         IndexTuple      itup;
 907         OffsetNumber updatedoffset;
 908
 909         /* State needed to describe final itup in WAL */
 910         uint16          ndeletedtids;
 911         uint16          deletetids[FLEXIBLE_ARRAY_MEMBER];
 912 } BTVacuumPostingData;
 913
 914 typedef BTVacuumPostingData *BTVacuumPosting;
 915
 916 /*
 917  * BTScanOpaqueData is the btree-private state needed for an indexscan.
 918  * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
 919  * details of the preprocessing), information about the current location
 920  * of the scan, and information about the marked location, if any.  (We use
 921  * BTScanPosData to represent the data needed for each of current and marked
 922  * locations.)  In addition we can remember some known-killed index entries
 923  * that must be marked before we can move off the current page.
 924  *
 925  * Index scans work a page at a time: we pin and read-lock the page, identify
 926  * all the matching items on the page and save them in BTScanPosData, then
 927  * release the read-lock while returning the items to the caller for
 928  * processing.  This approach minimizes lock/unlock traffic.  We must always
 929  * drop the lock to make it okay for caller to process the returned items.
 930  * Whether or not we can also release the pin during this window will vary.
 931  * We drop the pin eagerly (when safe) to avoid blocking progress by VACUUM
 932  * (see nbtree/README section about making concurrent TID recycling safe).
 933  * We'll always release both the lock and the pin on the current page before
 934  * moving on to its sibling page.
 935  *
 936  * If we are doing an index-only scan, we save the entire IndexTuple for each
 937  * matched item, otherwise only its heap TID and offset.  The IndexTuples go
 938  * into a separate workspace array; each BTScanPosItem stores its tuple's
 939  * offset within that array.  Posting list tuples store a "base" tuple once,
 940  * allowing the same key to be returned for each TID in the posting list
 941  * tuple.
 942  */
 943
 944 typedef struct BTScanPosItem    /* what we remember about each match */
 945 {
 946         ItemPointerData heapTid;        /* TID of referenced heap item */
 947         OffsetNumber indexOffset;       /* index item's location within page */
 948         LocationIndex tupleOffset;      /* IndexTuple's offset in workspace, if any */
 949 } BTScanPosItem;
 950
 951 typedef struct BTScanPosData
 952 {
 953         Buffer          buf;                    /* currPage buf (invalid means unpinned) */
 954
 955         /* page details as of the saved position's call to _bt_readpage */
 956         BlockNumber currPage;           /* page referenced by items array */
 957         BlockNumber prevPage;           /* currPage's left link */
 958         BlockNumber nextPage;           /* currPage's right link */
 959         XLogRecPtr      lsn;                    /* currPage's LSN */
 960
 961         /* scan direction for the saved position's call to _bt_readpage */
 962         ScanDirection dir;
 963
 964         /*
 965          * If we are doing an index-only scan, nextTupleOffset is the first free
 966          * location in the associated tuple storage workspace.
 967          */
 968         int                     nextTupleOffset;
 969
 970         /*
 971          * moreLeft and moreRight track whether we think there may be matching
 972          * index entries to the left and right of the current page, respectively.
 973          */
 974         bool            moreLeft;
 975         bool            moreRight;
 976
 977         /*
 978          * The items array is always ordered in index order (ie, increasing
 979          * indexoffset).  When scanning backwards it is convenient to fill the
 980          * array back-to-front, so we start at the last slot and fill downwards.
 981          * Hence we need both a first-valid-entry and a last-valid-entry counter.
 982          * itemIndex is a cursor showing which entry was last returned to caller.
 983          */
 984         int                     firstItem;              /* first valid index in items[] */
 985         int                     lastItem;               /* last valid index in items[] */
 986         int                     itemIndex;              /* current index in items[] */
 987
 988         BTScanPosItem items[MaxTIDsPerBTreePage];       /* MUST BE LAST */
 989 } BTScanPosData;
 990
 991 typedef BTScanPosData *BTScanPos;
 992
 993 #define BTScanPosIsPinned(scanpos) \
 994 ( \
 995         AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
 996                                 !BufferIsValid((scanpos).buf)), \
 997         BufferIsValid((scanpos).buf) \
 998 )
 999 #define BTScanPosUnpin(scanpos) \
1000         do { \
1001                 ReleaseBuffer((scanpos).buf); \
1002                 (scanpos).buf = InvalidBuffer; \
1003         } while (0)
1004 #define BTScanPosUnpinIfPinned(scanpos) \
1005         do { \
1006                 if (BTScanPosIsPinned(scanpos)) \
1007                         BTScanPosUnpin(scanpos); \
1008         } while (0)
1009
1010 #define BTScanPosIsValid(scanpos) \
1011 ( \
1012         AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
1013                                 !BufferIsValid((scanpos).buf)), \
1014         BlockNumberIsValid((scanpos).currPage) \
1015 )
1016 #define BTScanPosInvalidate(scanpos) \
1017         do { \
1018                 (scanpos).buf = InvalidBuffer; \
1019                 (scanpos).currPage = InvalidBlockNumber; \
1020         } while (0)
1021
1022 /* We need one of these for each equality-type SK_SEARCHARRAY scan key */
1023 typedef struct BTArrayKeyInfo
1024 {
1025         int                     scan_key;               /* index of associated key in keyData */
1026         int                     cur_elem;               /* index of current element in elem_values */
1027         int                     num_elems;              /* number of elems in current array value */
1028         Datum      *elem_values;        /* array of num_elems Datums */
1029 } BTArrayKeyInfo;
1030
1031 typedef struct BTScanOpaqueData
1032 {
1033         /* these fields are set by _bt_preprocess_keys(): */
1034         bool            qual_ok;                /* false if qual can never be satisfied */
1035         int                     numberOfKeys;   /* number of preprocessed scan keys */
1036         ScanKey         keyData;                /* array of preprocessed scan keys */
1037
1038         /* workspace for SK_SEARCHARRAY support */
1039         int                     numArrayKeys;   /* number of equality-type array keys */
1040         bool            needPrimScan;   /* New prim scan to continue in current dir? */
1041         bool            scanBehind;             /* Last array advancement matched -inf attr? */
1042         bool            oppositeDirCheck;       /* explicit scanBehind recheck needed? */
1043         BTArrayKeyInfo *arrayKeys;      /* info about each equality-type array key */
1044         FmgrInfo   *orderProcs;         /* ORDER procs for required equality keys */
1045         MemoryContext arrayContext; /* scan-lifespan context for array data */
1046
1047         /* info about killed items if any (killedItems is NULL if never used) */
1048         int                *killedItems;        /* currPos.items indexes of killed items */
1049         int                     numKilled;              /* number of currently stored items */
1050
1051         /*
1052          * If we are doing an index-only scan, these are the tuple storage
1053          * workspaces for the currPos and markPos respectively.  Each is of size
1054          * BLCKSZ, so it can hold as much as a full page's worth of tuples.
1055          */
1056         char       *currTuples;         /* tuple storage for currPos */
1057         char       *markTuples;         /* tuple storage for markPos */
1058
1059         /*
1060          * If the marked position is on the same page as current position, we
1061          * don't use markPos, but just keep the marked itemIndex in markItemIndex
1062          * (all the rest of currPos is valid for the mark position). Hence, to
1063          * determine if there is a mark, first look at markItemIndex, then at
1064          * markPos.
1065          */
1066         int                     markItemIndex;  /* itemIndex, or -1 if not valid */
1067
1068         /* keep these last in struct for efficiency */
1069         BTScanPosData currPos;          /* current position data */
1070         BTScanPosData markPos;          /* marked position, if any */
1071 } BTScanOpaqueData;
1072
1073 typedef BTScanOpaqueData *BTScanOpaque;
1074
1075 /*
1076  * _bt_readpage state used across _bt_checkkeys calls for a page
1077  */
1078 typedef struct BTReadPageState
1079 {
1080         /* Input parameters, set by _bt_readpage for _bt_checkkeys */
1081         OffsetNumber minoff;            /* Lowest non-pivot tuple's offset */
1082         OffsetNumber maxoff;            /* Highest non-pivot tuple's offset */
1083         IndexTuple      finaltup;               /* Needed by scans with array keys */
1084         Page            page;                   /* Page being read */
1085
1086         /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */
1087         OffsetNumber offnum;            /* current tuple's page offset number */
1088
1089         /* Output parameter, set by _bt_checkkeys for _bt_readpage */
1090         OffsetNumber skip;                      /* Array keys "look ahead" skip offnum */
1091         bool            continuescan;   /* Terminate ongoing (primitive) index scan? */
1092
1093         /*
1094          * Input and output parameters, set and unset by both _bt_readpage and
1095          * _bt_checkkeys to manage precheck optimizations
1096          */
1097         bool            prechecked;             /* precheck set continuescan to 'true'? */
1098         bool            firstmatch;             /* at least one match so far?  */
1099
1100         /*
1101          * Private _bt_checkkeys state used to manage "look ahead" optimization
1102          * (only used during scans with array keys)
1103          */
1104         int16           rechecks;
1105         int16           targetdistance;
1106
1107 } BTReadPageState;
1108
1109 /*
1110  * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
1111  * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
1112  * index's indoption[] array entry for the index attribute.
1113  */
1114 #define SK_BT_REQFWD    0x00010000      /* required to continue forward scan */
1115 #define SK_BT_REQBKWD   0x00020000      /* required to continue backward scan */
1116 #define SK_BT_INDOPTION_SHIFT  24       /* must clear the above bits */
1117 #define SK_BT_DESC                      (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
1118 #define SK_BT_NULLS_FIRST       (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
1119
1120 typedef struct BTOptions
1121 {
1122         int32           varlena_header_;        /* varlena header (do not touch directly!) */
1123         int                     fillfactor;             /* page fill factor in percent (0..100) */
1124         float8          vacuum_cleanup_index_scale_factor;      /* deprecated */
1125         bool            deduplicate_items;      /* Try to deduplicate items? */
1126 } BTOptions;
1127
1128 #define BTGetFillFactor(relation) \
1129         (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
1130                                  relation->rd_rel->relam == BTREE_AM_OID), \
1131          (relation)->rd_options ? \
1132          ((BTOptions *) (relation)->rd_options)->fillfactor : \
1133          BTREE_DEFAULT_FILLFACTOR)
1134 #define BTGetTargetPageFreeSpace(relation) \
1135         (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
1136 #define BTGetDeduplicateItems(relation) \
1137         (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
1138                                  relation->rd_rel->relam == BTREE_AM_OID), \
1139         ((relation)->rd_options ? \
1140          ((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
1141
1142 /*
1143  * Constant definition for progress reporting.  Phase numbers must match
1144  * btbuildphasename.
1145  */
1146 /* PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE is 1 (see progress.h) */
1147 #define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN               2
1148 #define PROGRESS_BTREE_PHASE_PERFORMSORT_1                              3
1149 #define PROGRESS_BTREE_PHASE_PERFORMSORT_2                              4
1150 #define PROGRESS_BTREE_PHASE_LEAF_LOAD                                  5
1151
1152 /*
1153  * external entry points for btree, in nbtree.c
1154  */
1155 extern void btbuildempty(Relation index);
1156 extern bool btinsert(Relation rel, Datum *values, bool *isnull,
1157                                          ItemPointer ht_ctid, Relation heapRel,
1158                                          IndexUniqueCheck checkUnique,
1159                                          bool indexUnchanged,
1160                                          struct IndexInfo *indexInfo);
1161 extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
1162 extern Size btestimateparallelscan(int nkeys, int norderbys);
1163 extern void btinitparallelscan(void *target);
1164 extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
1165 extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
1166 extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
1167                                          ScanKey orderbys, int norderbys);
1168 extern void btparallelrescan(IndexScanDesc scan);
1169 extern void btendscan(IndexScanDesc scan);
1170 extern void btmarkpos(IndexScanDesc scan);
1171 extern void btrestrpos(IndexScanDesc scan);
1172 extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
1173                                                                                    IndexBulkDeleteResult *stats,
1174                                                                                    IndexBulkDeleteCallback callback,
1175                                                                                    void *callback_state);
1176 extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
1177                                                                                           IndexBulkDeleteResult *stats);
1178 extern bool btcanreturn(Relation index, int attno);
1179 extern int      btgettreeheight(Relation rel);
1180
1181 /*
1182  * prototypes for internal functions in nbtree.c
1183  */
1184 extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
1185                                                            BlockNumber *last_curr_page, bool first);
1186 extern void _bt_parallel_release(IndexScanDesc scan,
1187                                                                  BlockNumber next_scan_page,
1188                                                                  BlockNumber curr_page);
1189 extern void _bt_parallel_done(IndexScanDesc scan);
1190 extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
1191                                                                                    BlockNumber curr_page);
1192
1193 /*
1194  * prototypes for functions in nbtdedup.c
1195  */
1196 extern void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem,
1197                                                    Size newitemsz, bool bottomupdedup);
1198 extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
1199                                                                  Size newitemsz);
1200 extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
1201                                                                         OffsetNumber baseoff);
1202 extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
1203 extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state);
1204 extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids,
1205                                                                    int nhtids);
1206 extern void _bt_update_posting(BTVacuumPosting vacposting);
1207 extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
1208                                                                    int postingoff);
1209
1210 /*
1211  * prototypes for functions in nbtinsert.c
1212  */
1213 extern bool _bt_doinsert(Relation rel, IndexTuple itup,
1214                                                  IndexUniqueCheck checkUnique, bool indexUnchanged,
1215                                                  Relation heapRel);
1216 extern void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf,
1217                                                          BTStack stack);
1218 extern Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack,
1219                                                           BlockNumber child);
1220
1221 /*
1222  * prototypes for functions in nbtsplitloc.c
1223  */
1224 extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
1225                                                                          OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1226                                                                          bool *newitemonleft);
1227
1228 /*
1229  * prototypes for functions in nbtpage.c
1230  */
1231 extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
1232                                                          bool allequalimage);
1233 extern bool _bt_vacuum_needs_cleanup(Relation rel);
1234 extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages);
1235 extern void _bt_upgrademetapage(Page page);
1236 extern Buffer _bt_getroot(Relation rel, Relation heaprel, int access);
1237 extern Buffer _bt_gettrueroot(Relation rel);
1238 extern int      _bt_getrootheight(Relation rel);
1239 extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
1240                                                         bool *allequalimage);
1241 extern void _bt_checkpage(Relation rel, Buffer buf);
1242 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
1243 extern Buffer _bt_allocbuf(Relation rel, Relation heaprel);
1244 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
1245                                                            BlockNumber blkno, int access);
1246 extern void _bt_relbuf(Relation rel, Buffer buf);
1247 extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
1248 extern void _bt_unlockbuf(Relation rel, Buffer buf);
1249 extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
1250 extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
1251 extern void _bt_pageinit(Page page, Size size);
1252 extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
1253                                                                 OffsetNumber *deletable, int ndeletable,
1254                                                                 BTVacuumPosting *updatable, int nupdatable);
1255 extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
1256                                                                           Relation heapRel,
1257                                                                           TM_IndexDeleteOp *delstate);
1258 extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate);
1259 extern void _bt_pendingfsm_init(Relation rel, BTVacState *vstate,
1260                                                                 bool cleanuponly);
1261 extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate);
1262
1263 /*
1264  * prototypes for functions in nbtsearch.c
1265  */
1266 extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key,
1267                                                   Buffer *bufP, int access);
1268 extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
1269 extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
1270 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
1271 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
1272 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
1273
1274 /*
1275  * prototypes for functions in nbtutils.c
1276  */
1277 extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
1278 extern void _bt_freestack(BTStack stack);
1279 extern bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir);
1280 extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
1281 extern void _bt_preprocess_keys(IndexScanDesc scan);
1282 extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
1283                                                   IndexTuple tuple, int tupnatts);
1284 extern bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
1285                                                                   IndexTuple finaltup);
1286 extern void _bt_killitems(IndexScanDesc scan);
1287 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
1288 extern BTCycleId _bt_start_vacuum(Relation rel);
1289 extern void _bt_end_vacuum(Relation rel);
1290 extern void _bt_end_vacuum_callback(int code, Datum arg);
1291 extern Size BTreeShmemSize(void);
1292 extern void BTreeShmemInit(void);
1293 extern bytea *btoptions(Datum reloptions, bool validate);
1294 extern bool btproperty(Oid index_oid, int attno,
1295                                            IndexAMProperty prop, const char *propname,
1296                                            bool *res, bool *isnull);
1297 extern char *btbuildphasename(int64 phasenum);
1298 extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
1299                                                            IndexTuple firstright, BTScanInsert itup_key);
1300 extern int      _bt_keep_natts_fast(Relation rel, IndexTuple lastleft,
1301                                                                 IndexTuple firstright);
1302 extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
1303                                                         OffsetNumber offnum);
1304 extern void _bt_check_third_page(Relation rel, Relation heap,
1305                                                                  bool needheaptidspace, Page page, IndexTuple newtup);
1306 extern bool _bt_allequalimage(Relation rel, bool debugmessage);
1307
1308 /*
1309  * prototypes for functions in nbtvalidate.c
1310  */
1311 extern bool btvalidate(Oid opclassoid);
1312 extern void btadjustmembers(Oid opfamilyoid,
1313                                                         Oid opclassoid,
1314                                                         List *operators,
1315                                                         List *functions);
1316
1317 /*
1318  * prototypes for functions in nbtsort.c
1319  */
1320 extern IndexBuildResult *btbuild(Relation heap, Relation index,
1321                                                                  struct IndexInfo *indexInfo);
1322 extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
1323
1324 #endif                                                  /* NBTREE_H */