nbtree: fix read page recheck typo.
[pgsql.git] src/backend/access/nbtree/nbtsearch.c
blob a662665b55fc87f8acdabb321384a730871041be
/*-------------------------------------------------------------------------
 *
 * nbtsearch.c
 *	  Search code for postgres btrees.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsearch.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"


static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
                            Buffer buf, bool forupdate, BTStack stack,
                            int access);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int  _bt_binsrch_posting(BTScanInsert key, Page page,
                                OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
                         OffsetNumber offnum, bool firstPage);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
                         OffsetNumber offnum, IndexTuple itup);
static int  _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
                                  OffsetNumber offnum, ItemPointer heapTid,
                                  IndexTuple itup);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
                                       OffsetNumber offnum,
                                       ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
                                  ScanDirection dir);
static Buffer _bt_walk_left(Relation rel, Buffer buf);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);

/*
 *	_bt_drop_lock_and_maybe_pin()
 *
 * Unlock the buffer; and if it is safe to release the pin, do that, too.
 * This will prevent vacuum from stalling in a blocked state trying to read a
 * page when a cursor is sitting on it.
 *
 * See nbtree/README section on making concurrent TID recycling safe.
 */
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
    _bt_unlockbuf(scan->indexRelation, sp->buf);

    if (IsMVCCSnapshot(scan->xs_snapshot) &&
        RelationNeedsWAL(scan->indexRelation) &&
        !scan->xs_want_itup)
    {
        ReleaseBuffer(sp->buf);
        sp->buf = InvalidBuffer;
    }
}
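
/*
 * Note the three conditions above: the pin is only dropped for an MVCC
 * snapshot on a WAL-logged index when no index-only scan is in progress
 * (!xs_want_itup).  In every other case (e.g. a non-MVCC snapshot, or an
 * unlogged or temporary index, where RelationNeedsWAL is false), the pin is
 * kept, and it continues to act as the interlock against concurrent TID
 * recycling described in the README section cited above.
 */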

/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * The passed scankey is an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * Return value is a stack of parent-page pointers (i.e. there is no entry for
 * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
 * which is locked and pinned.  No locks are held on the parent pages,
 * however!
 *
 * The returned buffer is locked according to access parameter.  Additionally,
 * access = BT_WRITE will allow an empty root page to be created and returned.
 * When access = BT_READ, an empty index will result in *bufP being set to
 * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
 * during the search will be finished.
 *
 * heaprel must be provided by callers that pass access = BT_WRITE, since we
 * might need to allocate a new root page for caller -- see _bt_allocbuf.
 */
BTStack
_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
           int access)
{
    BTStack     stack_in = NULL;
    int         page_access = BT_READ;

    /* heaprel must be set whenever _bt_allocbuf is reachable */
    Assert(access == BT_READ || access == BT_WRITE);
    Assert(access == BT_READ || heaprel != NULL);

    /* Get the root page to start with */
    *bufP = _bt_getroot(rel, heaprel, access);

    /* If index is empty and access = BT_READ, no root page is created. */
    if (!BufferIsValid(*bufP))
        return (BTStack) NULL;

    /* Loop iterates once per level descended in the tree */
    for (;;)
    {
        Page        page;
        BTPageOpaque opaque;
        OffsetNumber offnum;
        ItemId      itemid;
        IndexTuple  itup;
        BlockNumber child;
        BTStack     new_stack;

        /*
         * Race -- the page we just grabbed may have split since we read its
         * downlink in its parent page (or the metapage).  If it has, we may
         * need to move right to its new sibling.  Do that.
         *
         * In write-mode, allow _bt_moveright to finish any incomplete splits
         * along the way.  Strictly speaking, we'd only need to finish an
         * incomplete split on the leaf page we're about to insert to, not on
         * any of the upper levels (internal pages with incomplete splits are
         * also taken care of in _bt_getstackbuf).  But this is a good
         * opportunity to finish splits of internal pages too.
         */
        *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
                              stack_in, page_access);

        /* if this is a leaf page, we're done */
        page = BufferGetPage(*bufP);
        opaque = BTPageGetOpaque(page);
        if (P_ISLEAF(opaque))
            break;

        /*
         * Find the appropriate pivot tuple on this page.  Its downlink points
         * to the child page that we're about to descend to.
         */
        offnum = _bt_binsrch(rel, key, *bufP);
        itemid = PageGetItemId(page, offnum);
        itup = (IndexTuple) PageGetItem(page, itemid);
        Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
        child = BTreeTupleGetDownLink(itup);

        /*
         * We need to save the location of the pivot tuple we chose in a new
         * stack entry for this page/level.  If caller ends up splitting a
         * page one level down, it usually ends up inserting a new pivot
         * tuple/downlink immediately after the location recorded here.
         */
        new_stack = (BTStack) palloc(sizeof(BTStackData));
        new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
        new_stack->bts_offset = offnum;
        new_stack->bts_parent = stack_in;

        /*
         * Page level 1 is the lowest non-leaf level, directly above the
         * leaves.  So, if we're on level 1 and were asked to lock the leaf
         * page in write mode, lock the next page in write mode, because it
         * must be a leaf.
         */
        if (opaque->btpo_level == 1 && access == BT_WRITE)
            page_access = BT_WRITE;

        /* drop the read lock on the page, then acquire one on its child */
        *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);

        /* okay, all set to move down a level */
        stack_in = new_stack;
    }

    /*
     * If we're asked to lock leaf in write mode, but didn't manage to, then
     * relock.  This should only happen when the root page is a leaf page (and
     * the only page in the index other than the metapage).
     */
    if (access == BT_WRITE && page_access == BT_READ)
    {
        /* trade in our read lock for a write lock */
        _bt_unlockbuf(rel, *bufP);
        _bt_lockbuf(rel, *bufP, BT_WRITE);

        /*
         * Race -- the leaf page may have split after we dropped the read lock
         * but before we acquired a write lock.  If it has, we may need to
         * move right to its new sibling.  Do that.
         */
        *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
    }

    return stack_in;
}
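
/*
 * Descent sketch (hypothetical shape): in a three-level index (root at
 * level 2), a BT_WRITE caller reads the root and the level-1 internal page
 * under BT_READ locks; once btpo_level == 1, it takes the child (leaf) lock
 * as BT_WRITE up front.  The read-to-write relock path above is only reached
 * when the root itself is the leaf, so the loop never visited a level-1 page.
 */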

/*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
 * When we follow a pointer to reach a page, it is possible that
 * the page has changed in the meanwhile.  If this happens, we're
 * guaranteed that the page has "split right" -- that is, that any
 * data that appeared on the page originally is either on the page
 * or strictly to the right of it.
 *
 * This routine decides whether or not we need to move right in the
 * tree by examining the high key entry on the page.  If that entry is
 * strictly less than the scankey, or <= the scankey in the
 * key.nextkey=true case, then we followed the wrong link and we need
 * to move right.
 *
 * The passed insertion-type scankey can omit the rightmost column(s) of the
 * index.  (see nbtree/README)
 *
 * When key.nextkey is false (the usual case), we are looking for the first
 * item >= key.  When key.nextkey is true, we are looking for the first item
 * strictly greater than key.
 *
 * If forupdate is true, we will attempt to finish any incomplete splits
 * that we encounter.  This is required when locking a target page for an
 * insertion, because we don't allow inserting on a page before the split is
 * completed.  'heaprel' and 'stack' are only used if forupdate is true.
 *
 * On entry, we have the buffer pinned and a lock of the type specified by
 * 'access'.  If we move right, we release the buffer and lock and acquire
 * the same on the right sibling.  Return value is the buffer we stop at.
 */
static Buffer
_bt_moveright(Relation rel,
              Relation heaprel,
              BTScanInsert key,
              Buffer buf,
              bool forupdate,
              BTStack stack,
              int access)
{
    Page        page;
    BTPageOpaque opaque;
    int32       cmpval;

    Assert(!forupdate || heaprel != NULL);

    /*
     * When nextkey = false (normal case): if the scan key that brought us to
     * this page is > the high key stored on the page, then the page has split
     * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
     * have some duplicates to the right as well as the left, but that's
     * something that's only ever dealt with on the leaf level, after
     * _bt_search has found an initial leaf page.)
     *
     * When nextkey = true: move right if the scan key is >= page's high key.
     * (Note that key.scantid cannot be set in this case.)
     *
     * The page could even have split more than once, so scan as far as
     * needed.
     *
     * We also have to move right if we followed a link that brought us to a
     * dead page.
     */
    cmpval = key->nextkey ? 0 : 1;

    for (;;)
    {
        page = BufferGetPage(buf);
        opaque = BTPageGetOpaque(page);

        if (P_RIGHTMOST(opaque))
            break;

        /*
         * Finish any incomplete splits we encounter along the way.
         */
        if (forupdate && P_INCOMPLETE_SPLIT(opaque))
        {
            BlockNumber blkno = BufferGetBlockNumber(buf);

            /* upgrade our lock if necessary */
            if (access == BT_READ)
            {
                _bt_unlockbuf(rel, buf);
                _bt_lockbuf(rel, buf, BT_WRITE);
            }

            if (P_INCOMPLETE_SPLIT(opaque))
                _bt_finish_split(rel, heaprel, buf, stack);
            else
                _bt_relbuf(rel, buf);

            /* re-acquire the lock in the right mode, and re-check */
            buf = _bt_getbuf(rel, blkno, access);
            continue;
        }

        if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
        {
            /* step right one page */
            buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
            continue;
        }
        else
            break;
    }

    if (P_IGNORE(opaque))
        elog(ERROR, "fell off the end of index \"%s\"",
             RelationGetRelationName(rel));

    return buf;
}
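
/*
 * To make the cmpval trick concrete: _bt_compare returns <0, 0, or >0.
 * With nextkey = false, cmpval is 1, so we only step right when
 * scankey > high key.  With nextkey = true, cmpval is 0, so we also step
 * right when scankey == high key, since the first item > scankey could
 * then only be on a right sibling.
 */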

/*
 *	_bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey, or last key <= given scankey if nextkey
 * is true.  (Since _bt_compare treats the first data key of such a page as
 * minus infinity, there will be at least one key < scankey, so the result
 * always points at one of the keys on the page.)
 *
 * On a leaf page, _bt_binsrch() returns the final result of the initial
 * positioning process that started with _bt_first's call to _bt_search.
 * We're returning a non-pivot tuple offset, so things are a little different.
 * It is possible that we'll return an offset that's either past the last
 * non-pivot slot, or (in the case of a backward scan) before the first slot.
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.  _bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
static OffsetNumber
_bt_binsrch(Relation rel,
            BTScanInsert key,
            Buffer buf)
{
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber low,
                high;
    int32       result,
                cmpval;

    page = BufferGetPage(buf);
    opaque = BTPageGetOpaque(page);

    /* Requesting nextkey semantics while using scantid seems nonsensical */
    Assert(!key->nextkey || key->scantid == NULL);
    /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
    Assert(!P_ISLEAF(opaque) || key->scantid == NULL);

    low = P_FIRSTDATAKEY(opaque);
    high = PageGetMaxOffsetNumber(page);

    /*
     * If there are no keys on the page, return the first available slot.  Note
     * this covers two cases: the page is really empty (no keys), or it
     * contains only a high key.  The latter case is possible after vacuuming.
     * This can never happen on an internal page, however, since they are
     * never empty (an internal page must have at least one child).
     */
    if (unlikely(high < low))
        return low;

    /*
     * Binary search to find the first key on the page >= scan key, or first
     * key > scankey when nextkey is true.
     *
     * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     *
     * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     *
     * We can fall out when high == low.
     */
    high++;                     /* establish the loop invariant for high */

    cmpval = key->nextkey ? 0 : 1;  /* select comparison value */

    while (high > low)
    {
        OffsetNumber mid = low + ((high - low) / 2);

        /* We have low <= mid < high, so mid points at a real slot */

        result = _bt_compare(rel, key, page, mid);

        if (result >= cmpval)
            low = mid + 1;
        else
            high = mid;
    }

    /*
     * At this point we have high == low.
     *
     * On a leaf page we always return the first non-pivot tuple >= scan key
     * (resp. > scan key) for forward scan callers.  For backward scans, it's
     * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
     */
    if (P_ISLEAF(opaque))
    {
        /*
         * In the backward scan case we're supposed to locate the last
         * matching tuple on the leaf level -- not the first matching tuple
         * (the last tuple will be the first one returned by the scan).
         *
         * At this point we've located the first non-pivot tuple immediately
         * after the last matching tuple (which might just be maxoff + 1).
         * Compensate by stepping back.
         */
        if (key->backward)
            return OffsetNumberPrev(low);

        return low;
    }

    /*
     * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     * There must be one if _bt_compare() is playing by the rules.
     *
     * _bt_compare() will seldom see any exactly-matching pivot tuples, since
     * a truncated -inf heap TID is usually enough to prevent it altogether.
     * Even omitted scan key entries are treated as > truncated attributes.
     *
     * However, during backward scans _bt_compare() interprets omitted scan
     * key attributes as == corresponding truncated -inf attributes instead.
     * This works just like < would work here.  Under this scheme, < strategy
     * backward scans will always directly descend to the correct leaf page.
     * In particular, they will never incur an "extra" leaf page access with a
     * scan key that happens to contain the same prefix of values as some
     * pivot tuple's untruncated prefix.  VACUUM relies on this guarantee when
     * it uses a leaf page high key to "re-find" a page undergoing deletion.
     */
    Assert(low > P_FIRSTDATAKEY(opaque));

    return OffsetNumberPrev(low);
}
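
/*
 * Worked example (hypothetical leaf page): slots 1..4 hold keys
 * 10, 20, 20, 30 and the scan key is 20 with nextkey = false (cmpval = 1).
 * low starts at 1, high at 5.  mid = 3 compares equal (result 0 < cmpval),
 * so high = 3; mid = 2 also compares equal, so high = 2; mid = 1 gives
 * result > 0, so low = 2.  We return offset 2, the first slot >= 20, or
 * offset 1 (OffsetNumberPrev) for a backward scan.
 */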

/*
 *	_bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
 *
 * Like _bt_binsrch(), but with support for caching the binary search
 * bounds.  Only used during insertion, and only on the leaf page that it
 * looks like caller will insert tuple on.  Exclusive-locked and pinned
 * leaf page is contained within insertstate.
 *
 * Caches the bounds fields in insertstate so that a subsequent call can
 * reuse the low and strict high bounds of original binary search.  Callers
 * that use these fields directly must be prepared for the case where low
 * and/or stricthigh are not on the same page (one or both exceed maxoff
 * for the page).  The case where there are no items on the page (high <
 * low) makes bounds invalid.
 *
 * Caller is responsible for invalidating bounds when it modifies the page
 * before calling here a second time, and for dealing with posting list
 * tuple matches (callers can use insertstate's postingoff field to
 * determine which existing heap TID will need to be replaced by a posting
 * list split).
 */
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
    BTScanInsert key = insertstate->itup_key;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber low,
                high,
                stricthigh;
    int32       result,
                cmpval;

    page = BufferGetPage(insertstate->buf);
    opaque = BTPageGetOpaque(page);

    Assert(P_ISLEAF(opaque));
    Assert(!key->nextkey);
    Assert(insertstate->postingoff == 0);

    if (!insertstate->bounds_valid)
    {
        /* Start new binary search */
        low = P_FIRSTDATAKEY(opaque);
        high = PageGetMaxOffsetNumber(page);
    }
    else
    {
        /* Restore result of previous binary search against same page */
        low = insertstate->low;
        high = insertstate->stricthigh;
    }

    /* If there are no keys on the page, return the first available slot */
    if (unlikely(high < low))
    {
        /* Caller can't reuse bounds */
        insertstate->low = InvalidOffsetNumber;
        insertstate->stricthigh = InvalidOffsetNumber;
        insertstate->bounds_valid = false;
        return low;
    }

    /*
     * Binary search to find the first key on the page >= scan key.  (nextkey
     * is always false when inserting).
     *
     * The loop invariant is: all slots before 'low' are < scan key, all slots
     * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
     * maintained to save additional search effort for caller.
     *
     * We can fall out when high == low.
     */
    if (!insertstate->bounds_valid)
        high++;                 /* establish the loop invariant for high */
    stricthigh = high;          /* high initially strictly higher */

    cmpval = 1;                 /* !nextkey comparison value */

    while (high > low)
    {
        OffsetNumber mid = low + ((high - low) / 2);

        /* We have low <= mid < high, so mid points at a real slot */

        result = _bt_compare(rel, key, page, mid);

        if (result >= cmpval)
            low = mid + 1;
        else
        {
            high = mid;
            if (result != 0)
                stricthigh = high;
        }

        /*
         * If tuple at offset located by binary search is a posting list whose
         * TID range overlaps with caller's scantid, perform posting list
         * binary search to set postingoff for caller.  Caller must split the
         * posting list when postingoff is set.  This should happen
         * infrequently.
         */
        if (unlikely(result == 0 && key->scantid != NULL))
        {
            /*
             * postingoff should never be set more than once per leaf page
             * binary search.  That would mean that there are duplicate table
             * TIDs in the index, which is never okay.  Check for that here.
             */
            if (insertstate->postingoff != 0)
                ereport(ERROR,
                        (errcode(ERRCODE_INDEX_CORRUPTED),
                         errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
                                         ItemPointerGetBlockNumber(key->scantid),
                                         ItemPointerGetOffsetNumber(key->scantid),
                                         low, stricthigh,
                                         BufferGetBlockNumber(insertstate->buf),
                                         RelationGetRelationName(rel))));

            insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
        }
    }

    /*
     * On a leaf page, a binary search always returns the first key >= scan
     * key (at least in !nextkey case), which could be the last slot + 1.  This
     * is also the lower bound of cached search.
     *
     * stricthigh may also be the last slot + 1, which prevents caller from
     * using bounds directly, but is still useful to us if we're called a
     * second time with cached bounds (cached low will be < stricthigh when
     * that happens).
     */
    insertstate->low = low;
    insertstate->stricthigh = stricthigh;
    insertstate->bounds_valid = true;

    return low;
}
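
/*
 * Cached-bounds sketch: if a first call here returns low = 5 with
 * stricthigh = 8, a second call against the unmodified page restarts the
 * search with low = 5 and high = 8 rather than spanning the whole page,
 * because every slot before 5 is already known to be < the scan key and
 * every slot at or after 8 is known to be > it.
 */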

/*----------
 *	_bt_binsrch_posting() -- posting list binary search.
 *
 * Helper routine for _bt_binsrch_insert().
 *
 * Returns offset into posting list where caller's scantid belongs.
 *----------
 */
static int
_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
{
    IndexTuple  itup;
    ItemId      itemid;
    int         low,
                high,
                mid,
                res;

    /*
     * If this isn't a posting tuple, then the index must be corrupt (if it is
     * an ordinary non-pivot tuple then there must be an existing tuple with a
     * heap TID that equals inserter's new heap TID/scantid).  Defensively
     * check that tuple is a posting list tuple whose posting list range
     * includes caller's scantid.
     *
     * (This is also needed because contrib/amcheck's rootdescend option needs
     * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
     */
    itemid = PageGetItemId(page, offnum);
    itup = (IndexTuple) PageGetItem(page, itemid);
    if (!BTreeTupleIsPosting(itup))
        return 0;

    Assert(key->heapkeyspace && key->allequalimage);

    /*
     * In the event that posting list tuple has LP_DEAD bit set, indicate this
     * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
     * second call to _bt_binsrch_insert() can take place when its caller has
     * removed the dead item.
     */
    if (ItemIdIsDead(itemid))
        return -1;

    /* "high" is past end of posting list for loop invariant */
    low = 0;
    high = BTreeTupleGetNPosting(itup);
    Assert(high >= 2);

    while (high > low)
    {
        mid = low + ((high - low) / 2);
        res = ItemPointerCompare(key->scantid,
                                 BTreeTupleGetPostingN(itup, mid));

        if (res > 0)
            low = mid + 1;
        else if (res < 0)
            high = mid;
        else
            return mid;
    }

    /* Exact match not found */
    return low;
}
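
/*
 * For example, with a hypothetical posting list of TIDs (2,1), (2,5), (4,3)
 * and a scantid of (2,5), the loop finds an exact match and returns
 * offset 1.  With a scantid of (3,1), it returns 2, the offset where the
 * new TID belongs when the caller splits the posting list.
 */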

/*----------
 *	_bt_compare() -- Compare insertion-type scankey to tuple on a page.
 *
 *	page/offnum: location of btree item to be compared to.
 *
 *		This routine returns:
 *			<0 if scankey < tuple at offnum;
 *			 0 if scankey == tuple at offnum;
 *			>0 if scankey > tuple at offnum.
 *
 * NULLs in the keys are treated as sortable values.  Therefore
 * "equality" does not necessarily mean that the item should be returned
 * to the caller as a matching key.  Similarly, an insertion scankey
 * with its scantid set is treated as equal to a posting tuple whose TID
 * range overlaps with its scantid.  There generally won't be a
 * matching TID in the posting tuple, which caller must handle
 * themselves (e.g., by splitting the posting list tuple).
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored is explicitly truncated to 0
 * attributes (explicitly minus infinity) with version 3+ indexes, but
 * that isn't relied upon.  This allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first
 * key.  See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
            BTScanInsert key,
            Page page,
            OffsetNumber offnum)
{
    TupleDesc   itupdesc = RelationGetDescr(rel);
    BTPageOpaque opaque = BTPageGetOpaque(page);
    IndexTuple  itup;
    ItemPointer heapTid;
    ScanKey     scankey;
    int         ncmpkey;
    int         ntupatts;
    int32       result;

    Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
    Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
    Assert(key->heapkeyspace || key->scantid == NULL);

    /*
     * Force result ">" if target item is first data item on an internal page
     * --- see NOTE above.
     */
    if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
        return 1;

    itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    ntupatts = BTreeTupleGetNAtts(itup, rel);

    /*
     * The scan key is set up with the attribute number associated with each
     * term in the key.  It is important that, if the index is multi-key, the
     * scan contain the first k key attributes, and that they be in order.  If
     * you think about how multi-key ordering works, you'll understand why
     * this is.
     *
     * We don't test for violation of this condition here, however.  The
     * initial setup for the index scan had better have gotten it right (see
     * _bt_first).
     */

    ncmpkey = Min(ntupatts, key->keysz);
    Assert(key->heapkeyspace || ncmpkey == key->keysz);
    Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
    scankey = key->scankeys;
    for (int i = 1; i <= ncmpkey; i++)
    {
        Datum       datum;
        bool        isNull;

        datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

        if (scankey->sk_flags & SK_ISNULL)  /* key is NULL */
        {
            if (isNull)
                result = 0;     /* NULL "=" NULL */
            else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = -1;    /* NULL "<" NOT_NULL */
            else
                result = 1;     /* NULL ">" NOT_NULL */
        }
        else if (isNull)        /* key is NOT_NULL and item is NULL */
        {
            if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = 1;     /* NOT_NULL ">" NULL */
            else
                result = -1;    /* NOT_NULL "<" NULL */
        }
        else
        {
            /*
             * The sk_func needs to be passed the index value as left arg and
             * the sk_argument as right arg (they might be of different
             * types).  Since it is convenient for callers to think of
             * _bt_compare as comparing the scankey to the index item, we have
             * to flip the sign of the comparison result.  (Unless it's a DESC
             * column, in which case we *don't* flip the sign.)
             */
            result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
                                                     scankey->sk_collation,
                                                     datum,
                                                     scankey->sk_argument));

            if (!(scankey->sk_flags & SK_BT_DESC))
                INVERT_COMPARE_RESULT(result);
        }

        /* if the keys are unequal, return the difference */
        if (result != 0)
            return result;

        scankey++;
    }

    /*
     * All non-truncated attributes (other than heap TID) were found to be
     * equal.  Treat truncated attributes as minus infinity when scankey has a
     * key attribute value that would otherwise be compared directly.
     *
     * Note: it doesn't matter if ntupatts includes non-key attributes;
     * scankey won't, so explicitly excluding non-key attributes isn't
     * necessary.
     */
    if (key->keysz > ntupatts)
        return 1;

    /*
     * Use the heap TID attribute and scantid to try to break the tie.  The
     * rules are the same as any other key attribute -- only the
     * representation differs.
     */
    heapTid = BTreeTupleGetHeapTID(itup);
    if (key->scantid == NULL)
    {
        /*
         * Forward scans have a scankey that is considered greater than a
         * truncated pivot tuple if and when the scankey has equal values for
         * attributes up to and including the least significant untruncated
         * attribute in tuple.  Even attributes that were omitted from the
         * scan key are considered greater than -inf truncated attributes.
         * (See _bt_binsrch for an explanation of our backward scan behavior.)
         *
         * For example, if an index has the minimum two attributes (single
         * user key attribute, plus heap TID attribute), and a page's high key
         * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
         * will not descend to the page to the left.  The search will descend
         * right instead.  The truncated attribute in pivot tuple means that
         * all non-pivot tuples on the page to the left are strictly < 'foo',
         * so it isn't necessary to descend left.  In other words, search
         * doesn't have to descend left because it isn't interested in a match
         * that has a heap TID value of -inf.
         *
         * Note: the heap TID part of the test ensures that scankey is being
         * compared to a pivot tuple with one or more truncated -inf key
         * attributes.  The heap TID attribute is the last key attribute in
         * every index, of course, but other than that it isn't special.
         */
        if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
            key->heapkeyspace)
            return 1;

        /* All provided scankey arguments found to be equal */
        return 0;
    }

    /*
     * Treat truncated heap TID as minus infinity, since scankey has a key
     * attribute value (scantid) that would otherwise be compared directly
     */
    Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
    if (heapTid == NULL)
        return 1;

    /*
     * Scankey must be treated as equal to a posting list tuple if its scantid
     * value falls within the range of the posting list.  In all other cases
     * there can only be a single heap TID value, which is compared directly
     * with scantid.
     */
    Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
    result = ItemPointerCompare(key->scantid, heapTid);
    if (result <= 0 || !BTreeTupleIsPosting(itup))
        return result;
    else
    {
        result = ItemPointerCompare(key->scantid,
                                    BTreeTupleGetMaxHeapTID(itup));
        if (result > 0)
            return 1;
    }

    return 0;
}
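
/*
 * NULL-ordering sketch, implied by the branches above: with the default
 * NULLS LAST on an ASC column, a NULL scan key compares as greater than any
 * non-NULL item (result 1), so NULLs sort to the high end of the index.
 * Under NULLS FIRST (SK_BT_NULLS_FIRST) the signs flip and NULLs sort low.
 * DESC columns skip INVERT_COMPARE_RESULT in the ordinary-datum case, which
 * is what reverses the sort order there.
 */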

/*
 *	_bt_first() -- Find the first item in a scan.
 *
 *		We need to be clever about the direction of scan, the search
 *		conditions, and the tree ordering.  We find the first item (or,
 *		if backwards scan, the last item) in the tree that satisfies the
 *		qualifications in the scan key.  On success exit, the page containing
 *		the current index tuple is pinned but not locked, and data about
 *		the matching tuple(s) on the page has been loaded into so->currPos.
 *		scan->xs_heaptid is set to the heap TID of the current tuple, and if
 *		requested, scan->xs_itup points to a copy of the index tuple.
 *
 * If there are no matching items in the index, we return false, with no
 * pins or locks held.
 *
 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 * are both search-type scankeys (see nbtree/README for more about this).
 * Within this routine, we build a temporary insertion-type scankey to use
 * in locating the scan start position.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    BTStack     stack;
    OffsetNumber offnum;
    StrategyNumber strat;
    BTScanInsertData inskey;
    ScanKey     startKeys[INDEX_MAX_KEYS];
    ScanKeyData notnullkeys[INDEX_MAX_KEYS];
    int         keysz = 0;
    int         i;
    bool        status;
    StrategyNumber strat_total;
    BTScanPosItem *currItem;
    BlockNumber blkno;

    Assert(!BTScanPosIsValid(so->currPos));

    /*
     * Examine the scan keys and eliminate any redundant keys; also mark the
     * keys that must be matched to continue the scan.
     */
    _bt_preprocess_keys(scan);

    /*
     * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     * never be satisfied (eg, x == 1 AND x > 2).
     */
    if (!so->qual_ok)
    {
        _bt_parallel_done(scan);
        return false;
    }

    /*
     * For parallel scans, get the starting page from shared state.  If the
     * scan has not started, proceed to find the first leaf page in the usual
     * way while keeping other participating processes waiting.  If the scan
     * has already begun, use the page number from the shared structure.
     *
     * When a parallel scan has another primitive index scan scheduled, a
     * parallel worker will seize the scan for that purpose now.  This is
     * similar to the case where the top-level scan hasn't started.
     */
    if (scan->parallel_scan != NULL)
    {
        status = _bt_parallel_seize(scan, &blkno, true);

        /*
         * Initialize arrays (when _bt_parallel_seize didn't already set up
         * the next primitive index scan)
         */
        if (so->numArrayKeys && !so->needPrimScan)
            _bt_start_array_keys(scan, dir);

        if (!status)
            return false;
        else if (blkno == P_NONE)
        {
            _bt_parallel_done(scan);
            return false;
        }
        else if (blkno != InvalidBlockNumber)
        {
            if (!_bt_parallel_readpage(scan, blkno, dir))
                return false;
            goto readcomplete;
        }
    }
    else if (so->numArrayKeys && !so->needPrimScan)
    {
        /*
         * First _bt_first call (for current btrescan) without parallelism.
         *
         * Initialize arrays, and the corresponding scan keys that were just
         * output by _bt_preprocess_keys.
         */
        _bt_start_array_keys(scan, dir);
    }

    /*
     * Count an indexscan for stats, now that we know that we'll call
     * _bt_search/_bt_endpoint below
     */
    pgstat_count_index_scan(rel);

    /*----------
     * Examine the scan keys to discover where we need to start the scan.
     *
     * We want to identify the keys that can be used as starting boundaries;
     * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     * a backwards scan.  We can use keys for multiple attributes so long as
     * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     * a > or < boundary or find an attribute with no boundary (which can be
     * thought of as the same as "> -infinity"), we can't use keys for any
     * attributes to its right, because it would break our simplistic notion
     * of what initial positioning strategy to use.
     *
     * When the scan keys include cross-type operators, _bt_preprocess_keys
     * may not be able to eliminate redundant keys; in such cases we will
     * arbitrarily pick a usable one for each attribute.  This is correct
     * but possibly not optimal behavior.  (For example, with keys like
     * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     * x=5 would be more efficient.)  Since the situation only arises given
     * a poorly-worded query plus an incomplete opfamily, live with it.
     *
     * When both equality and inequality keys appear for a single attribute
     * (again, only possible when cross-type operators appear), we *must*
     * select one of the equality keys for the starting point, because
     * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     * start at x=4, we will fail and stop before reaching x=10.  If multiple
     * equality quals survive preprocessing, however, it doesn't matter which
     * one we use --- by definition, they are either redundant or
     * contradictory.
     *
     * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     * If the index stores nulls at the end of the index we'll be starting
     * from, and we have no boundary key for the column (which means the key
     * we deduced NOT NULL from is an inequality key that constrains the other
     * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     * use as a boundary key.  If we didn't do this, we might find ourselves
     * traversing a lot of null entries at the start of the scan.
     *
     * In this loop, row-comparison keys are treated the same as keys on their
     * first (leftmost) columns.  We'll add on lower-order columns of the row
     * comparison below, if possible.
     *
     * The selected scan keys (at most one per index column) are remembered by
     * storing their addresses into the local startKeys[] array.
     *
     * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
     * the next primitive index scan (for scans with array keys) based in part
     * on an understanding of how it'll enable us to reposition the scan.
     * They're directly aware of how we'll sometimes cons up an explicit
     * SK_SEARCHNOTNULL key.  They'll even end primitive scans by applying a
     * symmetric "deduce NOT NULL" rule of their own.  This allows top-level
     * scans to skip large groups of NULLs through repeated deductions about
     * key strictness (for a required inequality key) and whether NULLs in the
     * key's index column are stored last or first (relative to non-NULLs).
     * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
     * need to be kept in sync.
     *----------
     */
    strat_total = BTEqualStrategyNumber;
    if (so->numberOfKeys > 0)
    {
        AttrNumber  curattr;
        ScanKey     chosen;
        ScanKey     impliesNN;
        ScanKey     cur;

        /*
         * chosen is the so-far-chosen key for the current attribute, if any.
         * We don't cast the decision in stone until we reach keys for the
         * next attribute.
         */
        curattr = 1;
        chosen = NULL;
        /* Also remember any scankey that implies a NOT NULL constraint */
        impliesNN = NULL;

        /*
         * Loop iterates from 0 to numberOfKeys inclusive; we use the last
         * pass to handle after-last-key processing.  Actual exit from the
         * loop is at one of the "break" statements below.
         */
        for (cur = so->keyData, i = 0;; cur++, i++)
        {
            if (i >= so->numberOfKeys || cur->sk_attno != curattr)
            {
                /*
                 * Done looking at keys for curattr.  If we didn't find a
                 * usable boundary key, see if we can deduce a NOT NULL key.
                 */
                if (chosen == NULL && impliesNN != NULL &&
                    ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                     ScanDirectionIsForward(dir) :
                     ScanDirectionIsBackward(dir)))
                {
                    /* Yes, so build the key in notnullkeys[keysz] */
                    chosen = &notnullkeys[keysz];
                    ScanKeyEntryInitialize(chosen,
                                           (SK_SEARCHNOTNULL | SK_ISNULL |
                                            (impliesNN->sk_flags &
                                             (SK_BT_DESC | SK_BT_NULLS_FIRST))),
                                           curattr,
                                           ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                                            BTGreaterStrategyNumber :
                                            BTLessStrategyNumber),
                                           InvalidOid,
                                           InvalidOid,
                                           InvalidOid,
                                           (Datum) 0);
                }

                /*
                 * If we still didn't find a usable boundary key, quit; else
                 * save the boundary key pointer in startKeys.
                 */
                if (chosen == NULL)
                    break;
                startKeys[keysz++] = chosen;

                /*
                 * Adjust strat_total, and quit if we have stored a > or <
                 * key.
                 */
                strat = chosen->sk_strategy;
                if (strat != BTEqualStrategyNumber)
                {
                    strat_total = strat;
                    if (strat == BTGreaterStrategyNumber ||
                        strat == BTLessStrategyNumber)
                        break;
                }

                /*
                 * Done if that was the last attribute, or if next key is not
                 * in sequence (implying no boundary key is available for the
                 * next attribute).
                 */
                if (i >= so->numberOfKeys ||
                    cur->sk_attno != curattr + 1)
                    break;

                /*
                 * Reset for next attr.
                 */
                curattr = cur->sk_attno;
                chosen = NULL;
                impliesNN = NULL;
            }

            /*
             * Can we use this key as a starting boundary for this attr?
             *
             * If not, does it imply a NOT NULL constraint?  (Because
             * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
             * *any* inequality key works for that; we need not test.)
             */
            switch (cur->sk_strategy)
            {
                case BTLessStrategyNumber:
                case BTLessEqualStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsBackward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
                case BTEqualStrategyNumber:
                    /* override any non-equality choice */
                    chosen = cur;
                    break;
                case BTGreaterEqualStrategyNumber:
                case BTGreaterStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsForward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
            }
        }
    }

    /*
     * If we found no usable boundary keys, we have to start from one end of
     * the tree.  Walk down that edge to the first or last key, and scan from
     * there.
     */
    if (keysz == 0)
    {
        bool        match;

        match = _bt_endpoint(scan, dir);

        if (!match)
        {
            /* No match, so mark (parallel) scan finished */
            _bt_parallel_done(scan);
        }

        return match;
    }

    /*
     * We want to start the scan somewhere within the index.  Set up an
     * insertion scankey we can use to search for the boundary point we
     * identified above.  The insertion scankey is built using the keys
     * identified by startKeys[].  (Remaining insertion scankey fields are
     * initialized after initial-positioning strategy is finalized.)
     */
    Assert(keysz <= INDEX_MAX_KEYS);
    for (i = 0; i < keysz; i++)
    {
        ScanKey     cur = startKeys[i];

        Assert(cur->sk_attno == i + 1);

        if (cur->sk_flags & SK_ROW_HEADER)
        {
            /*
             * Row comparison header: look to the first row member instead.
             *
             * The member scankeys are already in insertion format (ie, they
             * have sk_func = 3-way-comparison function), but we have to watch
             * out for nulls, which _bt_preprocess_keys didn't check.  A null
             * in the first row member makes the condition unmatchable, just
             * like qual_ok = false.
             */
            ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            if (subkey->sk_flags & SK_ISNULL)
            {
                _bt_parallel_done(scan);
                return false;
            }
            memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));

            /*
             * If the row comparison is the last positioning key we accepted,
             * try to add additional keys from the lower-order row members.
             * (If we accepted independent conditions on additional index
             * columns, we use those instead --- doesn't seem worth trying to
             * determine which is more restrictive.)  Note that this is OK
             * even if the row comparison is of ">" or "<" type, because the
             * condition applied to all but the last row member is effectively
             * ">=" or "<=", and so the extra keys don't break the positioning
             * scheme.  But, by the same token, if we aren't able to use all
             * the row members, then the part of the row comparison that we
             * did use has to be treated as just a ">=" or "<=" condition, and
             * so we'd better adjust strat_total accordingly.
             */
            if (i == keysz - 1)
            {
                bool        used_all_subkeys = false;

                Assert(!(subkey->sk_flags & SK_ROW_END));
                for (;;)
                {
                    subkey++;
                    Assert(subkey->sk_flags & SK_ROW_MEMBER);
                    if (subkey->sk_attno != keysz + 1)
                        break;  /* out-of-sequence, can't use it */
                    if (subkey->sk_strategy != cur->sk_strategy)
                        break;  /* wrong direction, can't use it */
                    if (subkey->sk_flags & SK_ISNULL)
                        break;  /* can't use null keys */
                    Assert(keysz < INDEX_MAX_KEYS);
                    memcpy(inskey.scankeys + keysz, subkey,
                           sizeof(ScanKeyData));
                    keysz++;
                    if (subkey->sk_flags & SK_ROW_END)
                    {
                        used_all_subkeys = true;
                        break;
                    }
                }
                if (!used_all_subkeys)
                {
                    switch (strat_total)
                    {
                        case BTLessStrategyNumber:
                            strat_total = BTLessEqualStrategyNumber;
                            break;
                        case BTGreaterStrategyNumber:
                            strat_total = BTGreaterEqualStrategyNumber;
                            break;
                    }
                }
                break;          /* done with outer loop */
            }
        }
        else
        {
            /*
             * Ordinary comparison key.  Transform the search-style scan key
             * to an insertion scan key by replacing the sk_func with the
             * appropriate btree comparison function.
             *
             * If scankey operator is not a cross-type comparison, we can use
             * the cached comparison function; otherwise gotta look it up in
             * the catalogs.  (That can't lead to infinite recursion, since no
             * indexscan initiated by syscache lookup will use cross-data-type
             * operators.)
             *
             * We support the convention that sk_subtype == InvalidOid means
             * the opclass input type; this is a hack to simplify life for
             * ScanKeyInit().
             */
            if (cur->sk_subtype == rel->rd_opcintype[i] ||
                cur->sk_subtype == InvalidOid)
            {
                FmgrInfo   *procinfo;

                procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
                ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
                                               cur->sk_flags,
                                               cur->sk_attno,
                                               InvalidStrategy,
                                               cur->sk_subtype,
                                               cur->sk_collation,
                                               procinfo,
                                               cur->sk_argument);
            }
            else
            {
                RegProcedure cmp_proc;

                cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
                                             rel->rd_opcintype[i],
                                             cur->sk_subtype,
                                             BTORDER_PROC);
                if (!RegProcedureIsValid(cmp_proc))
                    elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
                         BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
                         cur->sk_attno, RelationGetRelationName(rel));
                ScanKeyEntryInitialize(inskey.scankeys + i,
                                       cur->sk_flags,
                                       cur->sk_attno,
                                       InvalidStrategy,
                                       cur->sk_subtype,
                                       cur->sk_collation,
                                       cmp_proc,
                                       cur->sk_argument);
            }
        }
    }

    /*----------
     * Examine the selected initial-positioning strategy to determine exactly
     * where we need to start the scan, and set flag variables to control the
     * initial descent by _bt_search (and our _bt_binsrch call for the leaf
     * page _bt_search returns).
     *----------
     */
    _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
    inskey.anynullkeys = false; /* unused */
    inskey.scantid = NULL;
    inskey.keysz = keysz;
    switch (strat_total)
    {
        case BTLessStrategyNumber:

            inskey.nextkey = false;
            inskey.backward = true;
            break;

        case BTLessEqualStrategyNumber:

            inskey.nextkey = true;
            inskey.backward = true;
            break;

        case BTEqualStrategyNumber:

            /*
             * If a backward scan was specified, need to start with last equal
             * item not first one.
             */
            if (ScanDirectionIsBackward(dir))
            {
                /*
                 * This is the same as the <= strategy
                 */
                inskey.nextkey = true;
                inskey.backward = true;
            }
            else
            {
                /*
                 * This is the same as the >= strategy
                 */
                inskey.nextkey = false;
                inskey.backward = false;
            }
            break;

        case BTGreaterEqualStrategyNumber:

            /*
             * Find first item >= scankey
             */
            inskey.nextkey = false;
            inskey.backward = false;
            break;

        case BTGreaterStrategyNumber:

            /*
             * Find first item > scankey
             */
            inskey.nextkey = true;
            inskey.backward = false;
            break;

        default:
            /* can't get here, but keep compiler quiet */
            elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
            return false;
    }

    /*
     * Use the manufactured insertion scan key to descend the tree and
     * position ourselves on the target leaf page.
     */
    Assert(ScanDirectionIsBackward(dir) == inskey.backward);
    stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);

    /* don't need to keep the stack around... */
    _bt_freestack(stack);

    if (!BufferIsValid(buf))
    {
        /*
         * We only get here if the index is completely empty.  Lock relation
         * because nothing finer to lock exists.  Without a buffer lock, it's
         * possible for another transaction to insert data between
         * _bt_search() and PredicateLockRelation().  We have to try again
         * after taking the relation-level predicate lock, to close a narrow
         * window where we wouldn't scan concurrently inserted tuples, but the
         * writer wouldn't see our predicate lock.
         */
        if (IsolationIsSerializable())
        {
            PredicateLockRelation(rel, scan->xs_snapshot);
            stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
            _bt_freestack(stack);
        }

        if (!BufferIsValid(buf))
        {
            /*
             * Mark parallel scan as done, so that all the workers can finish
             * their scan.
             */
            _bt_parallel_done(scan);
            BTScanPosInvalidate(so->currPos);
            return false;
        }
    }

    PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);

    _bt_initialize_more_data(so, dir);

    /* position to the precise item on the page */
    offnum = _bt_binsrch(rel, &inskey, buf);
    Assert(!BTScanPosIsValid(so->currPos));
    so->currPos.buf = buf;

    /*
     * Now load data from the first page of the scan.
     *
     * If inskey.nextkey = false and inskey.backward = false, offnum is
     * positioned at the first non-pivot tuple >= inskey.scankeys.
     *
     * If inskey.nextkey = false and inskey.backward = true, offnum is
     * positioned at the last non-pivot tuple < inskey.scankeys.
     *
     * If inskey.nextkey = true and inskey.backward = false, offnum is
     * positioned at the first non-pivot tuple > inskey.scankeys.
     *
     * If inskey.nextkey = true and inskey.backward = true, offnum is
     * positioned at the last non-pivot tuple <= inskey.scankeys.
     *
     * It's possible that _bt_binsrch returned an offnum that is out of bounds
     * for the page.  For example, when inskey is both < the leaf page's high
     * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
     */
    if (!_bt_readpage(scan, dir, offnum, true))
    {
        /*
         * There's no actually-matching data on this page.  Try to advance to
         * the next page.  Return false if there's no matching data at all.
         */
        _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
        if (!_bt_steppage(scan, dir))
            return false;
    }
    else
    {
        /* We have at least one item to return as scan's first item */
        _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    }

readcomplete:
    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_heaptid = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}
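
/*
 * Quick reference for the strat_total switch above (implied by its cases):
 *
 *		strategy			nextkey		backward
 *		<					false		true
 *		<=					true		true
 *		= (backward scan)	true		true
 *		= (forward scan)	false		false
 *		>=					false		false
 *		>					true		false
 */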

/*
 *	_bt_next() -- Get the next item in a scan.
 *
 *		On entry, so->currPos describes the current page, which may be pinned
 *		but is not locked, and so->currPos.itemIndex identifies which item was
 *		previously returned.
 *
 *		On successful exit, scan->xs_heaptid is set to the TID of the next
 *		heap tuple, and if requested, scan->xs_itup points to a copy of the
 *		index tuple.  so->currPos is updated as needed.
 *
 *		On failure exit (no more tuples), we release pin and set
 *		so->currPos.buf to InvalidBuffer.
 */
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    BTScanPosItem *currItem;

    /*
     * Advance to next tuple on current page; or if there's no more, try to
     * step to the next page with data.
     */
    if (ScanDirectionIsForward(dir))
    {
        if (++so->currPos.itemIndex > so->currPos.lastItem)
        {
            if (!_bt_steppage(scan, dir))
                return false;
        }
    }
    else
    {
        if (--so->currPos.itemIndex < so->currPos.firstItem)
        {
            if (!_bt_steppage(scan, dir))
                return false;
        }
    }

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_heaptid = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}
1535 * _bt_readpage() -- Load data from current index page into so->currPos
1537 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1538 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1539 * they are updated as appropriate. All other fields of so->currPos are
1540 * initialized from scratch here.
1542 * We scan the current page starting at offnum and moving in the indicated
1543 * direction. All items matching the scan keys are loaded into currPos.items.
1544 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1545 * that there can be no more matching tuples in the current scan direction
1546 * (could just be for the current primitive index scan when scan has arrays).
1548 * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
1549 * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
1550 * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
1551 * its scan key was marked required), so _bt_first _must_ pass us an offnum
1552 * exactly at the beginning of where equal tuples are to be found. When we're
1553 * passed an offnum past the end of the page, we might still manage to stop
1554 * the scan on this page by calling _bt_checkkeys against the high key.
1556 * In the case of a parallel scan, caller must have called _bt_parallel_seize
1557 * prior to calling this function; this function will invoke
1558 * _bt_parallel_release before returning.
1560 * Returns true if any matching items found on the page, false if none.
1562 static bool
1563 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
1564 bool firstPage)
1566 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1567 Page page;
1568 BTPageOpaque opaque;
1569 OffsetNumber minoff;
1570 OffsetNumber maxoff;
1571 BTReadPageState pstate;
1572 bool arrayKeys;
1573 int itemIndex,
1574 indnatts;
1577 * We must have the buffer pinned and locked, but the usual macro can't be
1578 * used here; this function is what makes it good for currPos.
1580 Assert(BufferIsValid(so->currPos.buf));
1582 page = BufferGetPage(so->currPos.buf);
1583 opaque = BTPageGetOpaque(page);
1585 /* allow next page be processed by parallel worker */
1586 if (scan->parallel_scan)
1588 if (ScanDirectionIsForward(dir))
1589 pstate.prev_scan_page = opaque->btpo_next;
1590 else
1591 pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
1593 _bt_parallel_release(scan, pstate.prev_scan_page);
1596 indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
1597 arrayKeys = so->numArrayKeys != 0;
1598 minoff = P_FIRSTDATAKEY(opaque);
1599 maxoff = PageGetMaxOffsetNumber(page);
1601 /* initialize page-level state that we'll pass to _bt_checkkeys */
1602 pstate.dir = dir;
1603 pstate.minoff = minoff;
1604 pstate.maxoff = maxoff;
1605 pstate.finaltup = NULL;
1606 pstate.page = page;
1607 pstate.offnum = InvalidOffsetNumber;
1608 pstate.skip = InvalidOffsetNumber;
1609 pstate.continuescan = true; /* default assumption */
1610 pstate.prechecked = false;
1611 pstate.firstmatch = false;
1612 pstate.rechecks = 0;
1613 pstate.targetdistance = 0;
1616 * We note the buffer's block number so that we can release the pin later.
1617 * This allows us to re-read the buffer if it is needed again for hinting.
1619 so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1622 * We save the LSN of the page as we read it, so that we know whether it
1623 * safe to apply LP_DEAD hints to the page later. This allows us to drop
1624 * the pin for MVCC scans, which allows vacuum to avoid blocking.
1626 so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1629 * we must save the page's right-link while scanning it; this tells us
1630 * where to step right to after we're done with these items. There is no
1631 * corresponding need for the left-link, since splits always go right.
1633 so->currPos.nextPage = opaque->btpo_next;
1635 /* initialize tuple workspace to empty */
1636 so->currPos.nextTupleOffset = 0;
1639 * Now that the current page has been made consistent, the macro should be
1640 * good.
1642 Assert(BTScanPosIsPinned(so->currPos));
1645 * Prechecking the value of the continuescan flag for the last item on the
1646 * page (for backwards scan it will be the first item on a page). If we
1647 * observe it to be true, then it should be true for all other items. This
1648 * allows us to do significant optimizations in the _bt_checkkeys()
1649 * function for all the items on the page.
1651 * With the forward scan, we do this check for the last item on the page
1652 * instead of the high key. It's relatively likely that the most
1653 * significant column in the high key will be different from the
1654 * corresponding value from the last item on the page. So checking with
1655 * the last item on the page would give a more precise answer.
1657 * We skip this for the first page read by each (primitive) scan, to avoid
1658 * slowing down point queries. They typically don't stand to gain much
1659 * when the optimization can be applied, and are more likely to notice the
1660 * overhead of the precheck.
1662 * The optimization is unsafe and must be avoided whenever _bt_checkkeys
1663 * just set a low-order required array's key to the best available match
1664 * for a truncated -inf attribute value from the prior page's high key
1665 * (array element 0 is always the best available match in this scenario).
1666 * It's quite likely that matches for array element 0 begin on this page,
1667 * but the start of matches won't necessarily align with page boundaries.
1668 * When the start of matches is somewhere in the middle of this page, it
1669 * would be wrong to treat page's final non-pivot tuple as representative.
1670 * Doing so might lead us to treat some of the page's earlier tuples as
1671 * being part of a group of tuples thought to satisfy the required keys.
1673 * Note: Conversely, in the case where the scan's arrays just advanced
1674 * using the prior page's HIKEY _without_ advancement setting scanBehind,
1675 * the start of matches must be aligned with page boundaries, which makes
1676 * it safe to attempt the optimization here now. It's also safe when the
1677 * prior page's HIKEY simply didn't need to advance any required array. In
1678 * both cases we can safely assume that the _first_ tuple from this page
1679 * must be >= the current set of array keys/equality constraints. And so
1680 * if the final tuple is == those same keys (and also satisfies any
1681 * required < or <= strategy scan keys) during the precheck, we can safely
1682 * assume that this must also be true of all earlier tuples from the page.
1684 if (!firstPage && !so->scanBehind && minoff < maxoff)
1686 ItemId iid;
1687 IndexTuple itup;
1689 iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
1690 itup = (IndexTuple) PageGetItem(page, iid);
1692 /* Call with arrayKeys=false to avoid undesirable side-effects */
1693 _bt_checkkeys(scan, &pstate, false, itup, indnatts);
1694 pstate.prechecked = pstate.continuescan;
1695 pstate.continuescan = true; /* reset */
1698 if (ScanDirectionIsForward(dir))
1700 /* SK_SEARCHARRAY forward scans must provide high key up front */
1701 if (arrayKeys && !P_RIGHTMOST(opaque))
1703 ItemId iid = PageGetItemId(page, P_HIKEY);
1705 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1707 if (unlikely(so->oppositeDirCheck))
1709 Assert(so->scanBehind);
1712 * Last _bt_readpage call scheduled a recheck of finaltup for
1713 * required scan keys up to and including a > or >= scan key.
1715 * _bt_checkkeys won't consider the scanBehind flag unless the
1716 * scan is stopped by a scan key required in the current scan
1717 * direction. We need this recheck so that we'll notice when
1718 * all tuples on this page are still before the _bt_first-wise
1719 * start of matches for the current set of array keys.
1721 if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
1723 /* Schedule another primitive index scan after all */
1724 so->currPos.moreRight = false;
1725 so->needPrimScan = true;
1726 return false;
1729 /* Deliberately don't unset scanBehind flag just yet */
1733 /* load items[] in ascending order */
1734 itemIndex = 0;
1736 offnum = Max(offnum, minoff);
1738 while (offnum <= maxoff)
1740 ItemId iid = PageGetItemId(page, offnum);
1741 IndexTuple itup;
1742 bool passes_quals;
1745 * If the scan specifies not to return killed tuples, then we
1746 * treat a killed tuple as not passing the qual.
1748 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1750 offnum = OffsetNumberNext(offnum);
1751 continue;
1754 itup = (IndexTuple) PageGetItem(page, iid);
1755 Assert(!BTreeTupleIsPivot(itup));
1757 pstate.offnum = offnum;
1758 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1759 itup, indnatts);
1762 * Check if we need to skip ahead to a later tuple (only possible
1763 * when the scan uses array keys)
1765 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1767 Assert(!passes_quals && pstate.continuescan);
1768 Assert(offnum < pstate.skip);
1770 offnum = pstate.skip;
1771 pstate.skip = InvalidOffsetNumber;
1772 continue;
1775 if (passes_quals)
1777 /* tuple passes all scan key conditions */
1778 pstate.firstmatch = true;
1779 if (!BTreeTupleIsPosting(itup))
1781 /* Remember it */
1782 _bt_saveitem(so, itemIndex, offnum, itup);
1783 itemIndex++;
1785 else
1787 int tupleOffset;
1790 * Set up state to return posting list, and remember first
1791 * TID
1793 tupleOffset =
1794 _bt_setuppostingitems(so, itemIndex, offnum,
1795 BTreeTupleGetPostingN(itup, 0),
1796 itup);
1797 itemIndex++;
1798 /* Remember additional TIDs */
1799 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1801 _bt_savepostingitem(so, itemIndex, offnum,
1802 BTreeTupleGetPostingN(itup, i),
1803 tupleOffset);
1804 itemIndex++;
1808 /* When !continuescan, there can't be any more matches, so stop */
1809 if (!pstate.continuescan)
1810 break;
1812 offnum = OffsetNumberNext(offnum);
1816 * We don't need to visit the page to the right when the high key
1817 * indicates that no more matches will be found there.
1819 * Checking the high key like this works out more often than you might
1820 * think. Leaf page splits pick a split point between the two most
1821 * dissimilar tuples (this is weighed against the need to evenly share
1822 * free space). Leaf pages with high key attribute values that can
1823 * only appear on non-pivot tuples on the right sibling page are
1824 * common.
1826 if (pstate.continuescan && !P_RIGHTMOST(opaque))
1828 ItemId iid = PageGetItemId(page, P_HIKEY);
1829 IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1830 int truncatt;
1832 truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1833 pstate.prechecked = false; /* precheck didn't cover HIKEY */
1834 _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1837 if (!pstate.continuescan)
1838 so->currPos.moreRight = false;
1840 Assert(itemIndex <= MaxTIDsPerBTreePage);
1841 so->currPos.firstItem = 0;
1842 so->currPos.lastItem = itemIndex - 1;
1843 so->currPos.itemIndex = 0;
1845 else
1847 /* SK_SEARCHARRAY backward scans must provide final tuple up front */
1848 if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
1850 ItemId iid = PageGetItemId(page, minoff);
1852 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1855 /* load items[] in descending order */
1856 itemIndex = MaxTIDsPerBTreePage;
1858 offnum = Min(offnum, maxoff);
1860 while (offnum >= minoff)
1862 ItemId iid = PageGetItemId(page, offnum);
1863 IndexTuple itup;
1864 bool tuple_alive;
1865 bool passes_quals;
1868 * If the scan specifies not to return killed tuples, then we
1869 * treat a killed tuple as not passing the qual. Most of the
1870 * time, it's a win to not bother examining the tuple's index
1871 * keys, but just skip to the next tuple (previous, actually,
1872 * since we're scanning backwards). However, if this is the first
1873 * tuple on the page, we do check the index keys, to prevent
1874 * uselessly advancing to the page to the left. This is similar
1875 * to the high key optimization used by forward scans.
1877 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1879 Assert(offnum >= P_FIRSTDATAKEY(opaque));
1880 if (offnum > P_FIRSTDATAKEY(opaque))
1882 offnum = OffsetNumberPrev(offnum);
1883 continue;
1886 tuple_alive = false;
1888 else
1889 tuple_alive = true;
1891 itup = (IndexTuple) PageGetItem(page, iid);
1892 Assert(!BTreeTupleIsPivot(itup));
1894 pstate.offnum = offnum;
1895 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1896 itup, indnatts);
1899 * Check if we need to skip ahead to a tuple further on in the scan's
1900 * direction (only possible when the scan uses array keys)
1902 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1904 Assert(!passes_quals && pstate.continuescan);
1905 Assert(offnum > pstate.skip);
1907 offnum = pstate.skip;
1908 pstate.skip = InvalidOffsetNumber;
1909 continue;
1912 if (passes_quals && tuple_alive)
1914 /* tuple passes all scan key conditions */
1915 pstate.firstmatch = true;
1916 if (!BTreeTupleIsPosting(itup))
1918 /* Remember it */
1919 itemIndex--;
1920 _bt_saveitem(so, itemIndex, offnum, itup);
1922 else
1924 int tupleOffset;
1927 * Set up state to return posting list, and remember first
1928 * TID.
1930 * Note that we deliberately save/return items from
1931 * posting lists in ascending heap TID order for backwards
1932 * scans. This allows _bt_killitems() to make a
1933 * consistent assumption about the order of items
1934 * associated with the same posting list tuple.
1936 itemIndex--;
1937 tupleOffset =
1938 _bt_setuppostingitems(so, itemIndex, offnum,
1939 BTreeTupleGetPostingN(itup, 0),
1940 itup);
1941 /* Remember additional TIDs */
1942 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1944 itemIndex--;
1945 _bt_savepostingitem(so, itemIndex, offnum,
1946 BTreeTupleGetPostingN(itup, i),
1947 tupleOffset);
1951 /* When !continuescan, there can't be any more matches, so stop */
1952 if (!pstate.continuescan)
1953 break;
1955 offnum = OffsetNumberPrev(offnum);
1959 * We don't need to visit the page to the left when no more matches
1960 * will be found there.
1962 if (!pstate.continuescan || P_LEFTMOST(opaque))
1963 so->currPos.moreLeft = false;
1965 Assert(itemIndex >= 0);
1966 so->currPos.firstItem = itemIndex;
1967 so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
1968 so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
1971 return (so->currPos.firstItem <= so->currPos.lastItem);
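/*
 * Usage note (a sketch mirroring _bt_endpoint() below, not new behavior): on
 * a true return from _bt_readpage, the caller consumes the saved items
 * through so->currPos.itemIndex, e.g.
 *
 *     BTScanPosItem *item = &so->currPos.items[so->currPos.itemIndex];
 *
 *     scan->xs_heaptid = item->heapTid;
 *     if (scan->xs_want_itup)
 *         scan->xs_itup = (IndexTuple) (so->currTuples + item->tupleOffset);
 */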
1974 /* Save an index item into so->currPos.items[itemIndex] */
1975 static void
1976 _bt_saveitem(BTScanOpaque so, int itemIndex,
1977 OffsetNumber offnum, IndexTuple itup)
1979 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1981 Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1983 currItem->heapTid = itup->t_tid;
1984 currItem->indexOffset = offnum;
1985 if (so->currTuples)
1987 Size itupsz = IndexTupleSize(itup);
1989 currItem->tupleOffset = so->currPos.nextTupleOffset;
1990 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1991 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
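/*
 * Note on the MAXALIGN above (illustrative values, not project code):
 * rounding each tuple's size up to the platform's maximum alignment keeps the
 * next tuple copied into so->currTuples on an aligned boundary.  For example,
 * with 8-byte maximum alignment:
 *
 *     Size itupsz = 13;
 *     Size padded = MAXALIGN(itupsz);    // 16
 */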
1996 * Set up state to save TIDs/items from a single posting list tuple.
1998 * Saves an index item into so->currPos.items[itemIndex] for the TID that is
1999 * returned to the scan first. Second and subsequent TIDs from the posting
2000 * list should be saved by calling _bt_savepostingitem().
2002 * Returns the offset into tuple storage space at which the base tuple is
2003 * stored, if one was saved (otherwise returns 0).
2005 static int
2006 _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2007 ItemPointer heapTid, IndexTuple itup)
2009 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2011 Assert(BTreeTupleIsPosting(itup));
2013 currItem->heapTid = *heapTid;
2014 currItem->indexOffset = offnum;
2015 if (so->currTuples)
2017 /* Save base IndexTuple (truncate posting list) */
2018 IndexTuple base;
2019 Size itupsz = BTreeTupleGetPostingOffset(itup);
2021 itupsz = MAXALIGN(itupsz);
2022 currItem->tupleOffset = so->currPos.nextTupleOffset;
2023 base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
2024 memcpy(base, itup, itupsz);
2025 /* Defensively reduce work area index tuple header size */
2026 base->t_info &= ~INDEX_SIZE_MASK;
2027 base->t_info |= itupsz;
2028 so->currPos.nextTupleOffset += itupsz;
2030 return currItem->tupleOffset;
2033 return 0;
2037 * Save an index item into so->currPos.items[itemIndex] for the current
2038 * posting tuple.
2040 * Assumes that _bt_setuppostingitems() has already been called for the
2041 * current posting list tuple. Caller passes its return value as tupleOffset.
2043 static inline void
2044 _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2045 ItemPointer heapTid, int tupleOffset)
2047 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2049 currItem->heapTid = *heapTid;
2050 currItem->indexOffset = offnum;
2053 * Have index-only scans return the same base IndexTuple for every TID
2054 * that originates from the same posting list.
2056 if (so->currTuples)
2057 currItem->tupleOffset = tupleOffset;
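/*
 * Usage sketch (mirrors the posting list handling in _bt_readpage above; so,
 * itemIndex, offnum and itup are assumed to be set up as they are there).
 * One _bt_setuppostingitems() call saves the first TID; the remaining TIDs
 * reuse the tuple offset it returns:
 *
 *     int tupleOffset = _bt_setuppostingitems(so, itemIndex++, offnum,
 *                                             BTreeTupleGetPostingN(itup, 0),
 *                                             itup);
 *
 *     for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
 *         _bt_savepostingitem(so, itemIndex++, offnum,
 *                             BTreeTupleGetPostingN(itup, i), tupleOffset);
 */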
2061 * _bt_steppage() -- Step to next page containing valid data for scan
2063 * On entry, if so->currPos.buf is valid the buffer is pinned but not
2064 * locked; if pinned, we'll drop the pin before moving to the next page.
2067 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
2068 * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
2069 * to InvalidBuffer. We return true to indicate success.
2071 static bool
2072 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
2074 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2075 BlockNumber blkno = InvalidBlockNumber;
2076 bool status;
2078 Assert(BTScanPosIsValid(so->currPos));
2080 /* Before leaving current page, deal with any killed items */
2081 if (so->numKilled > 0)
2082 _bt_killitems(scan);
2085 * Before we modify currPos, make a copy of the page data if there was a
2086 * mark position that needs it.
2088 if (so->markItemIndex >= 0)
2090 /* bump pin on current buffer for assignment to mark buffer */
2091 if (BTScanPosIsPinned(so->currPos))
2092 IncrBufferRefCount(so->currPos.buf);
2093 memcpy(&so->markPos, &so->currPos,
2094 offsetof(BTScanPosData, items[1]) +
2095 so->currPos.lastItem * sizeof(BTScanPosItem));
2096 if (so->markTuples)
2097 memcpy(so->markTuples, so->currTuples,
2098 so->currPos.nextTupleOffset);
2099 so->markPos.itemIndex = so->markItemIndex;
2100 so->markItemIndex = -1;
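		/*
		 * Note on the memcpy size above: offsetof(BTScanPosData, items[1])
		 * is the size of the struct including exactly one items[] entry, so
		 * adding lastItem * sizeof(BTScanPosItem) copies items[0..lastItem]
		 * (lastItem + 1 entries) rather than the full MaxTIDsPerBTreePage
		 * array.
		 */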
2103 * If we're just about to start the next primitive index scan
2104 * (possible with a scan that has array keys, and needs to skip to
2105 * continue in the current scan direction), moreLeft/moreRight only
2106 * indicate the end of the current primitive index scan. They must
2107 * never be taken to indicate that the top-level index scan has ended
2108 * (that would be wrong).
2110 * We could handle this case by treating the current array keys as
2111 * markPos state. But depending on the current array state like this
2112 * would add complexity. Instead, we just unset markPos's copy of
2113 * moreRight or moreLeft (whichever might be affected), while making
2114 * btrestpos reset the scan's arrays to their initial scan positions.
2115 * In effect, btrestpos leaves advancing the arrays up to the first
2116 * _bt_readpage call (that takes place after it has restored markPos).
2118 Assert(so->markPos.dir == dir);
2119 if (so->needPrimScan)
2121 if (ScanDirectionIsForward(dir))
2122 so->markPos.moreRight = true;
2123 else
2124 so->markPos.moreLeft = true;
2128 if (ScanDirectionIsForward(dir))
2130 /* Walk right to the next page with data */
2131 if (scan->parallel_scan != NULL)
2134 * Seize the scan to get the next block number; if the scan has
2135 * ended already, bail out.
2137 status = _bt_parallel_seize(scan, &blkno, false);
2138 if (!status)
2140 /* release the previous buffer, if pinned */
2141 BTScanPosUnpinIfPinned(so->currPos);
2142 BTScanPosInvalidate(so->currPos);
2143 return false;
2146 else
2148 /* Not parallel, so use the previously-saved nextPage link. */
2149 blkno = so->currPos.nextPage;
2152 /* Remember we left a page with data */
2153 so->currPos.moreLeft = true;
2155 /* release the previous buffer, if pinned */
2156 BTScanPosUnpinIfPinned(so->currPos);
2158 else
2160 /* Remember we left a page with data */
2161 so->currPos.moreRight = true;
2163 if (scan->parallel_scan != NULL)
2166 * Seize the scan to get the current block number; if the scan has
2167 * ended already, bail out.
2169 status = _bt_parallel_seize(scan, &blkno, false);
2170 BTScanPosUnpinIfPinned(so->currPos);
2171 if (!status)
2173 BTScanPosInvalidate(so->currPos);
2174 return false;
2177 else
2179 /* Not parallel, so just use our own notion of the current page */
2180 blkno = so->currPos.currPage;
2184 if (!_bt_readnextpage(scan, blkno, dir))
2185 return false;
2187 /* We have at least one item to return as scan's next item */
2188 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2190 return true;
2194 * _bt_readnextpage() -- Read next page containing valid data for scan
2196 * On success exit, so->currPos is updated to contain data from the next
2197 * interesting page, and we return true. Caller must release the lock (and
2198 * maybe the pin) on the buffer on success exit.
2200 * If there are no more matching records in the given direction, we drop all
2201 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
2203 static bool
2204 _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2206 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2207 Relation rel;
2208 Page page;
2209 BTPageOpaque opaque;
2210 bool status;
2212 rel = scan->indexRelation;
2214 if (ScanDirectionIsForward(dir))
2216 for (;;)
2219 * If we're at the end of the scan, give up and mark the parallel scan
2220 * as done, so that all the workers can finish their scans
2222 if (blkno == P_NONE || !so->currPos.moreRight)
2224 _bt_parallel_done(scan);
2225 BTScanPosInvalidate(so->currPos);
2226 return false;
2228 /* check for interrupts while we're not holding any buffer lock */
2229 CHECK_FOR_INTERRUPTS();
2230 /* step right one page */
2231 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2232 page = BufferGetPage(so->currPos.buf);
2233 opaque = BTPageGetOpaque(page);
2234 /* check for deleted page */
2235 if (!P_IGNORE(opaque))
2237 PredicateLockPage(rel, blkno, scan->xs_snapshot);
2238 /* see if there are any matches on this page */
2239 /* note that this will clear moreRight if we can stop */
2240 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
2241 break;
2243 else if (scan->parallel_scan != NULL)
2245 /* allow the next page to be processed by a parallel worker */
2246 _bt_parallel_release(scan, opaque->btpo_next);
2249 /* nope, keep going */
2250 if (scan->parallel_scan != NULL)
2252 _bt_relbuf(rel, so->currPos.buf);
2253 status = _bt_parallel_seize(scan, &blkno, false);
2254 if (!status)
2256 BTScanPosInvalidate(so->currPos);
2257 return false;
2260 else
2262 blkno = opaque->btpo_next;
2263 _bt_relbuf(rel, so->currPos.buf);
2267 else
2270 * Should only happen in parallel cases, when some other backend
2271 * advanced the scan.
2273 if (so->currPos.currPage != blkno)
2275 BTScanPosUnpinIfPinned(so->currPos);
2276 so->currPos.currPage = blkno;
2279 /* Done if we know that the left sibling link isn't of interest */
2280 if (!so->currPos.moreLeft)
2282 BTScanPosUnpinIfPinned(so->currPos);
2283 _bt_parallel_done(scan);
2284 BTScanPosInvalidate(so->currPos);
2285 return false;
2289 * Walk left to the next page with data. This is much more complex
2290 * than the walk-right case because of the possibility that the page
2291 * to our left splits while we are in flight to it, plus the
2292 * possibility that the page we were on gets deleted after we leave
2293 * it. See nbtree/README for details.
2295 * It might be possible to rearrange this code to have less overhead
2296 * in pinning and locking, but that would require capturing the left
2297 * sibling block number when the page is initially read, and then
2298 * optimistically starting there (rather than pinning the page twice).
2299 * It is not clear that this would be worth the complexity.
2301 if (BTScanPosIsPinned(so->currPos))
2302 _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2303 else
2304 so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
2306 for (;;)
2308 /* Done if we know that the left sibling link isn't of interest */
2309 if (!so->currPos.moreLeft)
2311 _bt_relbuf(rel, so->currPos.buf);
2312 _bt_parallel_done(scan);
2313 BTScanPosInvalidate(so->currPos);
2314 return false;
2317 /* Step to next physical page */
2318 so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
2320 /* if we're physically at end of index, return failure */
2321 if (so->currPos.buf == InvalidBuffer)
2323 _bt_parallel_done(scan);
2324 BTScanPosInvalidate(so->currPos);
2325 return false;
2329 * Okay, we managed to move left to a non-deleted page. Done if
2330 * it's not half-dead and contains matching tuples. Else loop back
2331 * and do it all again.
2333 page = BufferGetPage(so->currPos.buf);
2334 opaque = BTPageGetOpaque(page);
2335 if (!P_IGNORE(opaque))
2337 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
2338 /* see if there are any matches on this page */
2339 /* note that this will clear moreLeft if we can stop */
2340 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
2341 break;
2343 else if (scan->parallel_scan != NULL)
2345 /* allow the next page to be processed by a parallel worker */
2346 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
2350 * For parallel scans, get the last page scanned as it is quite
2351 * possible that by the time we try to seize the scan, some other
2352 * worker has already advanced the scan to a different page. We
2353 * must continue based on the latest page scanned by any worker.
2355 if (scan->parallel_scan != NULL)
2357 _bt_relbuf(rel, so->currPos.buf);
2358 status = _bt_parallel_seize(scan, &blkno, false);
2359 if (!status)
2361 BTScanPosInvalidate(so->currPos);
2362 return false;
2364 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2369 return true;
2373 * _bt_parallel_readpage() -- Read current page containing valid data for scan
2375 * On success, we release the lock (and possibly the pin) on the buffer and
2376 * return true.
2378 static bool
2379 _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2381 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2383 Assert(!so->needPrimScan);
2385 _bt_initialize_more_data(so, dir);
2387 if (!_bt_readnextpage(scan, blkno, dir))
2388 return false;
2390 /* We have at least one item to return as scan's next item */
2391 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2393 return true;
2397 * _bt_walk_left() -- step left one page, if possible
2399 * The given buffer must be pinned and read-locked. Both the pin and the
2400 * lock will be dropped before stepping left. On return, we hold a pin and
2401 * read lock on the returned page instead.
2403 * Returns InvalidBuffer if there is no page to the left (no lock is held
2404 * in that case).
2406 * It is possible for the returned leaf page to be half-dead; caller must
2407 * check that condition and step left again when required.
2409 static Buffer
2410 _bt_walk_left(Relation rel, Buffer buf)
2412 Page page;
2413 BTPageOpaque opaque;
2415 page = BufferGetPage(buf);
2416 opaque = BTPageGetOpaque(page);
2418 for (;;)
2420 BlockNumber obknum;
2421 BlockNumber lblkno;
2422 BlockNumber blkno;
2423 int tries;
2425 /* if we're at end of tree, release buf and return failure */
2426 if (P_LEFTMOST(opaque))
2428 _bt_relbuf(rel, buf);
2429 break;
2431 /* remember original page we are stepping left from */
2432 obknum = BufferGetBlockNumber(buf);
2433 /* step left */
2434 blkno = lblkno = opaque->btpo_prev;
2435 _bt_relbuf(rel, buf);
2436 /* check for interrupts while we're not holding any buffer lock */
2437 CHECK_FOR_INTERRUPTS();
2438 buf = _bt_getbuf(rel, blkno, BT_READ);
2439 page = BufferGetPage(buf);
2440 opaque = BTPageGetOpaque(page);
2443 * If this isn't the page we want, walk right till we find what we
2444 * want --- but go no more than four hops (an arbitrary limit). If we
2445 * don't find the correct page by then, the most likely bet is that
2446 * the original page got deleted and isn't in the sibling chain at all
2447 * anymore, not that its left sibling got split more than four times.
2449 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2450 * because half-dead pages are still in the sibling chain.
2452 tries = 0;
2453 for (;;)
2455 if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2457 /* Found desired page, return it */
2458 return buf;
2460 if (P_RIGHTMOST(opaque) || ++tries > 4)
2461 break;
2462 blkno = opaque->btpo_next;
2463 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2464 page = BufferGetPage(buf);
2465 opaque = BTPageGetOpaque(page);
2468 /* Return to the original page to see what's up */
2469 buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2470 page = BufferGetPage(buf);
2471 opaque = BTPageGetOpaque(page);
2472 if (P_ISDELETED(opaque))
2475 * It was deleted. Move right to first nondeleted page (there
2476 * must be one); that is the page that has acquired the deleted
2477 * one's keyspace, so stepping left from it will take us where we
2478 * want to be.
2480 for (;;)
2482 if (P_RIGHTMOST(opaque))
2483 elog(ERROR, "fell off the end of index \"%s\"",
2484 RelationGetRelationName(rel));
2485 blkno = opaque->btpo_next;
2486 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2487 page = BufferGetPage(buf);
2488 opaque = BTPageGetOpaque(page);
2489 if (!P_ISDELETED(opaque))
2490 break;
2494 * Now return to top of loop, resetting obknum to point to this
2495 * nondeleted page, and try again.
2498 else
2501 * It wasn't deleted; the explanation had better be that the page
2502 * to the left got split or deleted. Without this check, we'd go
2503 * into an infinite loop if there's anything wrong.
2505 if (opaque->btpo_prev == lblkno)
2506 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2507 obknum, RelationGetRelationName(rel));
2508 /* Okay to try again with new lblkno value */
2512 return InvalidBuffer;
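/*
 * Illustrative analogue (hypothetical list code, not nbtree code): the loop
 * above amounts to re-verifying a cached left-link after the fact, because
 * the left sibling may have split after we read its block number:
 *
 *     left = read_page(cur->prev);       // may already be stale
 *     while (left->next != cur_blkno)    // left sibling split in flight?
 *         left = read_page(left->next);  // walk right to reconverge
 *
 * nbtree additionally bounds the walk (four hops) and restarts from the
 * original page when it fails, since that page may itself have been deleted.
 */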
2516 * _bt_get_endpoint() -- Find the first or last page on a given tree level
2518 * If the index is empty, we will return InvalidBuffer; any other failure
2519 * condition causes ereport(). We will not return a dead page.
2521 * The returned buffer is pinned and read-locked.
2523 Buffer
2524 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2526 Buffer buf;
2527 Page page;
2528 BTPageOpaque opaque;
2529 OffsetNumber offnum;
2530 BlockNumber blkno;
2531 IndexTuple itup;
2534 * If we are looking for a leaf page, okay to descend from fast root;
2535 * otherwise better descend from true root. (There is no point in being
2536 * smarter about intermediate levels.)
2538 if (level == 0)
2539 buf = _bt_getroot(rel, NULL, BT_READ);
2540 else
2541 buf = _bt_gettrueroot(rel);
2543 if (!BufferIsValid(buf))
2544 return InvalidBuffer;
2546 page = BufferGetPage(buf);
2547 opaque = BTPageGetOpaque(page);
2549 for (;;)
2552 * If we landed on a deleted page, step right to find a live page
2553 * (there must be one). Also, if we want the rightmost page, step
2554 * right if needed to get to it (this could happen if the page split
2555 * since we obtained a pointer to it).
2557 while (P_IGNORE(opaque) ||
2558 (rightmost && !P_RIGHTMOST(opaque)))
2560 blkno = opaque->btpo_next;
2561 if (blkno == P_NONE)
2562 elog(ERROR, "fell off the end of index \"%s\"",
2563 RelationGetRelationName(rel));
2564 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2565 page = BufferGetPage(buf);
2566 opaque = BTPageGetOpaque(page);
2569 /* Done? */
2570 if (opaque->btpo_level == level)
2571 break;
2572 if (opaque->btpo_level < level)
2573 ereport(ERROR,
2574 (errcode(ERRCODE_INDEX_CORRUPTED),
2575 errmsg_internal("btree level %u not found in index \"%s\"",
2576 level, RelationGetRelationName(rel))));
2578 /* Descend to leftmost or rightmost child page */
2579 if (rightmost)
2580 offnum = PageGetMaxOffsetNumber(page);
2581 else
2582 offnum = P_FIRSTDATAKEY(opaque);
2584 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2585 blkno = BTreeTupleGetDownLink(itup);
2587 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2588 page = BufferGetPage(buf);
2589 opaque = BTPageGetOpaque(page);
2592 return buf;
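/*
 * Usage sketch (how _bt_endpoint() below uses this for a forward scan): the
 * leftmost leaf page is fetched with level 0 and rightmost = false:
 *
 *     Buffer leaf = _bt_get_endpoint(rel, 0, false);
 *
 *     if (!BufferIsValid(leaf))
 *         ...                     // empty index
 */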
2596 * _bt_endpoint() -- Find the first or last page in the index, and scan
2597 * from there to the first key satisfying all the quals.
2599 * This is used by _bt_first() to set up a scan when we've determined
2600 * that the scan must start at the beginning or end of the index (for
2601 * a forward or backward scan respectively). Exit conditions are the
2602 * same as for _bt_first().
2604 static bool
2605 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2607 Relation rel = scan->indexRelation;
2608 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2609 Buffer buf;
2610 Page page;
2611 BTPageOpaque opaque;
2612 OffsetNumber start;
2613 BTScanPosItem *currItem;
2616 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2617 * version of _bt_search(). We don't maintain a stack since we know we
2618 * won't need it.
2620 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2622 if (!BufferIsValid(buf))
2625 * Empty index. Lock the whole relation, as nothing finer to lock
2626 * exists.
2628 PredicateLockRelation(rel, scan->xs_snapshot);
2629 BTScanPosInvalidate(so->currPos);
2630 return false;
2633 PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2634 page = BufferGetPage(buf);
2635 opaque = BTPageGetOpaque(page);
2636 Assert(P_ISLEAF(opaque));
2638 if (ScanDirectionIsForward(dir))
2640 /* There could be dead pages to the left, so not this: */
2641 /* Assert(P_LEFTMOST(opaque)); */
2643 start = P_FIRSTDATAKEY(opaque);
2645 else if (ScanDirectionIsBackward(dir))
2647 Assert(P_RIGHTMOST(opaque));
2649 start = PageGetMaxOffsetNumber(page);
2651 else
2653 elog(ERROR, "invalid scan direction: %d", (int) dir);
2654 start = 0; /* keep compiler quiet */
2657 /* remember which buffer we have pinned */
2658 so->currPos.buf = buf;
2660 _bt_initialize_more_data(so, dir);
2663 * Now load data from the first page of the scan.
2665 if (!_bt_readpage(scan, dir, start, true))
2668 * There's no actually-matching data on this page. Try to advance to
2669 * the next page. Return false if there's no matching data at all.
2671 _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2672 if (!_bt_steppage(scan, dir))
2673 return false;
2675 else
2677 /* We have at least one item to return as scan's first item */
2678 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2681 /* OK, itemIndex says what to return */
2682 currItem = &so->currPos.items[so->currPos.itemIndex];
2683 scan->xs_heaptid = currItem->heapTid;
2684 if (scan->xs_want_itup)
2685 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2687 return true;
2691 * _bt_initialize_more_data() -- initialize currPos's moreLeft, moreRight and
2692 * scan direction for the given dir
2694 static inline void
2695 _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2697 so->currPos.dir = dir;
2698 if (so->needPrimScan)
2700 Assert(so->numArrayKeys);
2702 so->currPos.moreLeft = true;
2703 so->currPos.moreRight = true;
2704 so->needPrimScan = false;
2706 else if (ScanDirectionIsForward(dir))
2708 so->currPos.moreLeft = false;
2709 so->currPos.moreRight = true;
2711 else
2713 so->currPos.moreLeft = true;
2714 so->currPos.moreRight = false;
2716 so->numKilled = 0; /* just paranoia */
2717 so->markItemIndex = -1; /* ditto */
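/*
 * Summary of the cases above:
 *
 *     state                      moreLeft    moreRight
 *     ------------------------   --------    ---------
 *     needPrimScan (any dir)     true        true
 *     forward scan               false       true
 *     backward scan              true        false
 */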