src/backend/access/nbtree/nbtsearch.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtsearch.c
   4  *        Search code for postgres btrees.
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL$
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15
  16 #include "postgres.h"
  17
  18 #include "access/genam.h"
  19 #include "access/nbtree.h"
  20 #include "access/relscan.h"
  21 #include "miscadmin.h"
  22 #include "pgstat.h"
  23 #include "storage/bufmgr.h"
  24 #include "utils/lsyscache.h"
  25 #include "utils/rel.h"
  26
  27
  28 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
  29                          OffsetNumber offnum);
  30 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
  31 static Buffer _bt_walk_left(Relation rel, Buffer buf);
  32 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  33
  34
  35 /*
  36  *      _bt_search() -- Search the tree for a particular scankey,
  37  *              or more precisely for the first leaf page it could be on.
  38  *
  39  * The passed scankey must be an insertion-type scankey (see nbtree/README),
  40  * but it can omit the rightmost column(s) of the index.
  41  *
  42  * When nextkey is false (the usual case), we are looking for the first
  43  * item >= scankey.  When nextkey is true, we are looking for the first
  44  * item strictly greater than scankey.
  45  *
  46  * Return value is a stack of parent-page pointers.  *bufP is set to the
  47  * address of the leaf-page buffer, which is read-locked and pinned.
  48  * No locks are held on the parent pages, however!
  49  *
  50  * NOTE that the returned buffer is read-locked regardless of the access
  51  * parameter.  However, access = BT_WRITE will allow an empty root page
  52  * to be created and returned.  When access = BT_READ, an empty index
  53  * will result in *bufP being set to InvalidBuffer.
  54  */
  55 BTStack
  56 _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
  57                    Buffer *bufP, int access)
  58 {
  59         BTStack         stack_in = NULL;
  60
  61         /* Get the root page to start with */
  62         *bufP = _bt_getroot(rel, access);
  63
  64         /* If index is empty and access = BT_READ, no root page is created. */
  65         if (!BufferIsValid(*bufP))
  66                 return (BTStack) NULL;
  67
  68         /* Loop iterates once per level descended in the tree */
  69         for (;;)
  70         {
  71                 Page            page;
  72                 BTPageOpaque opaque;
  73                 OffsetNumber offnum;
  74                 ItemId          itemid;
  75                 IndexTuple      itup;
  76                 BlockNumber blkno;
  77                 BlockNumber par_blkno;
  78                 BTStack         new_stack;
  79
  80                 /*
  81                  * Race -- the page we just grabbed may have split since we read its
  82                  * pointer in the parent (or metapage).  If it has, we may need to
  83                  * move right to its new sibling.  Do that.
  84                  */
  85                 *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
  86
  87                 /* if this is a leaf page, we're done */
  88                 page = BufferGetPage(*bufP);
  89                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  90                 if (P_ISLEAF(opaque))
  91                         break;
  92
  93                 /*
  94                  * Find the appropriate item on the internal page, and get the child
  95                  * page that it points to.
  96                  */
  97                 offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
  98                 itemid = PageGetItemId(page, offnum);
  99                 itup = (IndexTuple) PageGetItem(page, itemid);
 100                 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
 101                 par_blkno = BufferGetBlockNumber(*bufP);
 102
 103                 /*
 104                  * We need to save the location of the index entry we chose in the
 105                  * parent page on a stack. In case we split the tree, we'll use the
 106                  * stack to work back up to the parent page.  We also save the actual
 107                  * downlink (TID) to uniquely identify the index entry, in case it
 108                  * moves right while we're working lower in the tree.  See the paper
 109                  * by Lehman and Yao for how this is detected and handled. (We use the
 110                  * child link to disambiguate duplicate keys in the index -- Lehman
 111                  * and Yao disallow duplicate keys.)
 112                  */
 113                 new_stack = (BTStack) palloc(sizeof(BTStackData));
 114                 new_stack->bts_blkno = par_blkno;
 115                 new_stack->bts_offset = offnum;
 116                 memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
 117                 new_stack->bts_parent = stack_in;
 118
 119                 /* drop the read lock on the parent page, acquire one on the child */
 120                 *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
 121
 122                 /* okay, all set to move down a level */
 123                 stack_in = new_stack;
 124         }
 125
 126         return stack_in;
 127 }
 128
 129 /*
 130  *      _bt_moveright() -- move right in the btree if necessary.
 131  *
 132  * When we follow a pointer to reach a page, it is possible that
 133  * the page has changed in the meanwhile.  If this happens, we're
 134  * guaranteed that the page has "split right" -- that is, that any
 135  * data that appeared on the page originally is either on the page
 136  * or strictly to the right of it.
 137  *
 138  * This routine decides whether or not we need to move right in the
 139  * tree by examining the high key entry on the page.  If that entry
 140  * is strictly less than the scankey, or <= the scankey in the nextkey=true
 141  * case, then we followed the wrong link and we need to move right.
 142  *
 143  * The passed scankey must be an insertion-type scankey (see nbtree/README),
 144  * but it can omit the rightmost column(s) of the index.
 145  *
 146  * When nextkey is false (the usual case), we are looking for the first
 147  * item >= scankey.  When nextkey is true, we are looking for the first
 148  * item strictly greater than scankey.
 149  *
 150  * On entry, we have the buffer pinned and a lock of the type specified by
 151  * 'access'.  If we move right, we release the buffer and lock and acquire
 152  * the same on the right sibling.  Return value is the buffer we stop at.
 153  */
 154 Buffer
 155 _bt_moveright(Relation rel,
 156                           Buffer buf,
 157                           int keysz,
 158                           ScanKey scankey,
 159                           bool nextkey,
 160                           int access)
 161 {
 162         Page            page;
 163         BTPageOpaque opaque;
 164         int32           cmpval;
 165
 166         page = BufferGetPage(buf);
 167         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 168
 169         /*
 170          * When nextkey = false (normal case): if the scan key that brought us to
 171          * this page is > the high key stored on the page, then the page has split
 172          * and we need to move right.  (If the scan key is equal to the high key,
 173          * we might or might not need to move right; have to scan the page first
 174          * anyway.)
 175          *
 176          * When nextkey = true: move right if the scan key is >= page's high key.
 177          *
 178          * The page could even have split more than once, so scan as far as
 179          * needed.
 180          *
 181          * We also have to move right if we followed a link that brought us to a
 182          * dead page.
 183          */
 184         cmpval = nextkey ? 0 : 1;
 185
 186         while (!P_RIGHTMOST(opaque) &&
 187                    (P_IGNORE(opaque) ||
 188                         _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
 189         {
 190                 /* step right one page */
 191                 BlockNumber rblkno = opaque->btpo_next;
 192
 193                 buf = _bt_relandgetbuf(rel, buf, rblkno, access);
 194                 page = BufferGetPage(buf);
 195                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 196         }
 197
 198         if (P_IGNORE(opaque))
 199                 elog(ERROR, "fell off the end of index \"%s\"",
 200                          RelationGetRelationName(rel));
 201
 202         return buf;
 203 }
 204
 205 /*
 206  *      _bt_binsrch() -- Do a binary search for a key on a particular page.
 207  *
 208  * The passed scankey must be an insertion-type scankey (see nbtree/README),
 209  * but it can omit the rightmost column(s) of the index.
 210  *
 211  * When nextkey is false (the usual case), we are looking for the first
 212  * item >= scankey.  When nextkey is true, we are looking for the first
 213  * item strictly greater than scankey.
 214  *
 215  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
 216  * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
 217  * particular, this means it is possible to return a value 1 greater than the
 218  * number of keys on the page, if the scankey is > all keys on the page.)
 219  *
 220  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 221  * of the last key < given scankey, or last key <= given scankey if nextkey
 222  * is true.  (Since _bt_compare treats the first data key of such a page as
 223  * minus infinity, there will be at least one key < scankey, so the result
 224  * always points at one of the keys on the page.)  This key indicates the
 225  * right place to descend to be sure we find all leaf keys >= given scankey
 226  * (or leaf keys > given scankey when nextkey is true).
 227  *
 228  * This procedure is not responsible for walking right, it just examines
 229  * the given page.      _bt_binsrch() has no lock or refcount side effects
 230  * on the buffer.
 231  */
 232 OffsetNumber
 233 _bt_binsrch(Relation rel,
 234                         Buffer buf,
 235                         int keysz,
 236                         ScanKey scankey,
 237                         bool nextkey)
 238 {
 239         Page            page;
 240         BTPageOpaque opaque;
 241         OffsetNumber low,
 242                                 high;
 243         int32           result,
 244                                 cmpval;
 245
 246         page = BufferGetPage(buf);
 247         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 248
 249         low = P_FIRSTDATAKEY(opaque);
 250         high = PageGetMaxOffsetNumber(page);
 251
 252         /*
 253          * If there are no keys on the page, return the first available slot. Note
 254          * this covers two cases: the page is really empty (no keys), or it
 255          * contains only a high key.  The latter case is possible after vacuuming.
 256          * This can never happen on an internal page, however, since they are
 257          * never empty (an internal page must have children).
 258          */
 259         if (high < low)
 260                 return low;
 261
 262         /*
 263          * Binary search to find the first key on the page >= scan key, or first
 264          * key > scankey when nextkey is true.
 265          *
 266          * For nextkey=false (cmpval=1), the loop invariant is: all slots before
 267          * 'low' are < scan key, all slots at or after 'high' are >= scan key.
 268          *
 269          * For nextkey=true (cmpval=0), the loop invariant is: all slots before
 270          * 'low' are <= scan key, all slots at or after 'high' are > scan key.
 271          *
 272          * We can fall out when high == low.
 273          */
 274         high++;                                         /* establish the loop invariant for high */
 275
 276         cmpval = nextkey ? 0 : 1;       /* select comparison value */
 277
 278         while (high > low)
 279         {
 280                 OffsetNumber mid = low + ((high - low) / 2);
 281
 282                 /* We have low <= mid < high, so mid points at a real slot */
 283
 284                 result = _bt_compare(rel, keysz, scankey, page, mid);
 285
 286                 if (result >= cmpval)
 287                         low = mid + 1;
 288                 else
 289                         high = mid;
 290         }
 291
 292         /*
 293          * At this point we have high == low, but be careful: they could point
 294          * past the last slot on the page.
 295          *
 296          * On a leaf page, we always return the first key >= scan key (resp. >
 297          * scan key), which could be the last slot + 1.
 298          */
 299         if (P_ISLEAF(opaque))
 300                 return low;
 301
 302         /*
 303          * On a non-leaf page, return the last key < scan key (resp. <= scan key).
 304          * There must be one if _bt_compare() is playing by the rules.
 305          */
 306         Assert(low > P_FIRSTDATAKEY(opaque));
 307
 308         return OffsetNumberPrev(low);
 309 }
 310
 311 /*----------
 312  *      _bt_compare() -- Compare scankey to a particular tuple on the page.
 313  *
 314  * The passed scankey must be an insertion-type scankey (see nbtree/README),
 315  * but it can omit the rightmost column(s) of the index.
 316  *
 317  *      keysz: number of key conditions to be checked (might be less than the
 318  *              number of index columns!)
 319  *      page/offnum: location of btree item to be compared to.
 320  *
 321  *              This routine returns:
 322  *                      <0 if scankey < tuple at offnum;
 323  *                       0 if scankey == tuple at offnum;
 324  *                      >0 if scankey > tuple at offnum.
 325  *              NULLs in the keys are treated as sortable values.  Therefore
 326  *              "equality" does not necessarily mean that the item should be
 327  *              returned to the caller as a matching key!
 328  *
 329  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 330  * "minus infinity": this routine will always claim it is less than the
 331  * scankey.  The actual key value stored (if any, which there probably isn't)
 332  * does not matter.  This convention allows us to implement the Lehman and
 333  * Yao convention that the first down-link pointer is before the first key.
 334  * See backend/access/nbtree/README for details.
 335  *----------
 336  */
 337 int32
 338 _bt_compare(Relation rel,
 339                         int keysz,
 340                         ScanKey scankey,
 341                         Page page,
 342                         OffsetNumber offnum)
 343 {
 344         TupleDesc       itupdesc = RelationGetDescr(rel);
 345         BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 346         IndexTuple      itup;
 347         int                     i;
 348
 349         /*
 350          * Force result ">" if target item is first data item on an internal page
 351          * --- see NOTE above.
 352          */
 353         if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
 354                 return 1;
 355
 356         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
 357
 358         /*
 359          * The scan key is set up with the attribute number associated with each
 360          * term in the key.  It is important that, if the index is multi-key, the
 361          * scan contain the first k key attributes, and that they be in order.  If
 362          * you think about how multi-key ordering works, you'll understand why
 363          * this is.
 364          *
 365          * We don't test for violation of this condition here, however.  The
 366          * initial setup for the index scan had better have gotten it right (see
 367          * _bt_first).
 368          */
 369
 370         for (i = 1; i <= keysz; i++)
 371         {
 372                 Datum           datum;
 373                 bool            isNull;
 374                 int32           result;
 375
 376                 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
 377
 378                 /* see comments about NULLs handling in btbuild */
 379                 if (scankey->sk_flags & SK_ISNULL)              /* key is NULL */
 380                 {
 381                         if (isNull)
 382                                 result = 0;             /* NULL "=" NULL */
 383                         else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
 384                                 result = -1;    /* NULL "<" NOT_NULL */
 385                         else
 386                                 result = 1;             /* NULL ">" NOT_NULL */
 387                 }
 388                 else if (isNull)                /* key is NOT_NULL and item is NULL */
 389                 {
 390                         if (scankey->sk_flags & SK_BT_NULLS_FIRST)
 391                                 result = 1;             /* NOT_NULL ">" NULL */
 392                         else
 393                                 result = -1;    /* NOT_NULL "<" NULL */
 394                 }
 395                 else
 396                 {
 397                         /*
 398                          * The sk_func needs to be passed the index value as left arg and
 399                          * the sk_argument as right arg (they might be of different
 400                          * types).      Since it is convenient for callers to think of
 401                          * _bt_compare as comparing the scankey to the index item, we have
 402                          * to flip the sign of the comparison result.  (Unless it's a DESC
 403                          * column, in which case we *don't* flip the sign.)
 404                          */
 405                         result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
 406                                                                                                  datum,
 407                                                                                                  scankey->sk_argument));
 408
 409                         if (!(scankey->sk_flags & SK_BT_DESC))
 410                                 result = -result;
 411                 }
 412
 413                 /* if the keys are unequal, return the difference */
 414                 if (result != 0)
 415                         return result;
 416
 417                 scankey++;
 418         }
 419
 420         /* if we get here, the keys are equal */
 421         return 0;
 422 }
 423
 424 /*
 425  *      _bt_first() -- Find the first item in a scan.
 426  *
 427  *              We need to be clever about the direction of scan, the search
 428  *              conditions, and the tree ordering.      We find the first item (or,
 429  *              if backwards scan, the last item) in the tree that satisfies the
 430  *              qualifications in the scan key.  On success exit, the page containing
 431  *              the current index tuple is pinned but not locked, and data about
 432  *              the matching tuple(s) on the page has been loaded into so->currPos,
 433  *              and scan->xs_ctup.t_self is set to the heap TID of the current tuple.
 434  *
 435  * If there are no matching items in the index, we return FALSE, with no
 436  * pins or locks held.
 437  *
 438  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 439  * are both search-type scankeys (see nbtree/README for more about this).
 440  * Within this routine, we build a temporary insertion-type scankey to use
 441  * in locating the scan start position.
 442  */
 443 bool
 444 _bt_first(IndexScanDesc scan, ScanDirection dir)
 445 {
 446         Relation        rel = scan->indexRelation;
 447         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 448         Buffer          buf;
 449         BTStack         stack;
 450         OffsetNumber offnum;
 451         StrategyNumber strat;
 452         bool            nextkey;
 453         bool            goback;
 454         ScanKey         startKeys[INDEX_MAX_KEYS];
 455         ScanKeyData scankeys[INDEX_MAX_KEYS];
 456         int                     keysCount = 0;
 457         int                     i;
 458         StrategyNumber strat_total;
 459
 460         pgstat_count_index_scan(rel);
 461
 462         /*
 463          * Examine the scan keys and eliminate any redundant keys; also mark the
 464          * keys that must be matched to continue the scan.
 465          */
 466         _bt_preprocess_keys(scan);
 467
 468         /*
 469          * Quit now if _bt_preprocess_keys() discovered that the scan keys can
 470          * never be satisfied (eg, x == 1 AND x > 2).
 471          */
 472         if (!so->qual_ok)
 473                 return false;
 474
 475         /*----------
 476          * Examine the scan keys to discover where we need to start the scan.
 477          *
 478          * We want to identify the keys that can be used as starting boundaries;
 479          * these are =, >, or >= keys for a forward scan or =, <, <= keys for
 480          * a backwards scan.  We can use keys for multiple attributes so long as
 481          * the prior attributes had only =, >= (resp. =, <=) keys.      Once we accept
 482          * a > or < boundary or find an attribute with no boundary (which can be
 483          * thought of as the same as "> -infinity"), we can't use keys for any
 484          * attributes to its right, because it would break our simplistic notion
 485          * of what initial positioning strategy to use.
 486          *
 487          * When the scan keys include cross-type operators, _bt_preprocess_keys
 488          * may not be able to eliminate redundant keys; in such cases we will
 489          * arbitrarily pick a usable one for each attribute.  This is correct
 490          * but possibly not optimal behavior.  (For example, with keys like
 491          * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
 492          * x=5 would be more efficient.)  Since the situation only arises given
 493          * a poorly-worded query plus an incomplete opfamily, live with it.
 494          *
 495          * When both equality and inequality keys appear for a single attribute
 496          * (again, only possible when cross-type operators appear), we *must*
 497          * select one of the equality keys for the starting point, because
 498          * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
 499          * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
 500          * start at x=4, we will fail and stop before reaching x=10.  If multiple
 501          * equality quals survive preprocessing, however, it doesn't matter which
 502          * one we use --- by definition, they are either redundant or
 503          * contradictory.
 504          *
 505          * In this loop, row-comparison keys are treated the same as keys on their
 506          * first (leftmost) columns.  We'll add on lower-order columns of the row
 507          * comparison below, if possible.
 508          *
 509          * The selected scan keys (at most one per index column) are remembered by
 510          * storing their addresses into the local startKeys[] array.
 511          *----------
 512          */
 513         strat_total = BTEqualStrategyNumber;
 514         if (so->numberOfKeys > 0)
 515         {
 516                 AttrNumber      curattr;
 517                 ScanKey         chosen;
 518                 ScanKey         cur;
 519
 520                 /*
 521                  * chosen is the so-far-chosen key for the current attribute, if any.
 522                  * We don't cast the decision in stone until we reach keys for the
 523                  * next attribute.
 524                  */
 525                 curattr = 1;
 526                 chosen = NULL;
 527
 528                 /*
 529                  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
 530                  * pass to handle after-last-key processing.  Actual exit from the
 531                  * loop is at one of the "break" statements below.
 532                  */
 533                 for (cur = so->keyData, i = 0;; cur++, i++)
 534                 {
 535                         if (i >= so->numberOfKeys || cur->sk_attno != curattr)
 536                         {
 537                                 /*
 538                                  * Done looking at keys for curattr.  If we didn't find a
 539                                  * usable boundary key, quit; else save the boundary key
 540                                  * pointer in startKeys.
 541                                  */
 542                                 if (chosen == NULL)
 543                                         break;
 544                                 startKeys[keysCount++] = chosen;
 545
 546                                 /*
 547                                  * Adjust strat_total, and quit if we have stored a > or <
 548                                  * key.
 549                                  */
 550                                 strat = chosen->sk_strategy;
 551                                 if (strat != BTEqualStrategyNumber)
 552                                 {
 553                                         strat_total = strat;
 554                                         if (strat == BTGreaterStrategyNumber ||
 555                                                 strat == BTLessStrategyNumber)
 556                                                 break;
 557                                 }
 558
 559                                 /*
 560                                  * Done if that was the last attribute, or if next key is not
 561                                  * in sequence (implying no boundary key is available for the
 562                                  * next attribute).
 563                                  */
 564                                 if (i >= so->numberOfKeys ||
 565                                         cur->sk_attno != curattr + 1)
 566                                         break;
 567
 568                                 /*
 569                                  * Reset for next attr.
 570                                  */
 571                                 curattr = cur->sk_attno;
 572                                 chosen = NULL;
 573                         }
 574
 575                         /* Can we use this key as a starting boundary for this attr? */
 576                         switch (cur->sk_strategy)
 577                         {
 578                                 case BTLessStrategyNumber:
 579                                 case BTLessEqualStrategyNumber:
 580                                         if (chosen == NULL && ScanDirectionIsBackward(dir))
 581                                                 chosen = cur;
 582                                         break;
 583                                 case BTEqualStrategyNumber:
 584                                         /* override any non-equality choice */
 585                                         chosen = cur;
 586                                         break;
 587                                 case BTGreaterEqualStrategyNumber:
 588                                 case BTGreaterStrategyNumber:
 589                                         if (chosen == NULL && ScanDirectionIsForward(dir))
 590                                                 chosen = cur;
 591                                         break;
 592                         }
 593                 }
 594         }
 595
 596         /*
 597          * If we found no usable boundary keys, we have to start from one end of
 598          * the tree.  Walk down that edge to the first or last key, and scan from
 599          * there.
 600          */
 601         if (keysCount == 0)
 602                 return _bt_endpoint(scan, dir);
 603
 604         /*
 605          * We want to start the scan somewhere within the index.  Set up an
 606          * insertion scankey we can use to search for the boundary point we
 607          * identified above.  The insertion scankey is built in the local
 608          * scankeys[] array, using the keys identified by startKeys[].
 609          */
 610         Assert(keysCount <= INDEX_MAX_KEYS);
 611         for (i = 0; i < keysCount; i++)
 612         {
 613                 ScanKey         cur = startKeys[i];
 614
 615                 Assert(cur->sk_attno == i + 1);
 616
 617                 if (cur->sk_flags & SK_ROW_HEADER)
 618                 {
 619                         /*
 620                          * Row comparison header: look to the first row member instead.
 621                          *
 622                          * The member scankeys are already in insertion format (ie, they
 623                          * have sk_func = 3-way-comparison function), but we have to watch
 624                          * out for nulls, which _bt_preprocess_keys didn't check. A null
 625                          * in the first row member makes the condition unmatchable, just
 626                          * like qual_ok = false.
 627                          */
 628                         ScanKey         subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
 629
 630                         Assert(subkey->sk_flags & SK_ROW_MEMBER);
 631                         if (subkey->sk_flags & SK_ISNULL)
 632                                 return false;
 633                         memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
 634
 635                         /*
 636                          * If the row comparison is the last positioning key we accepted,
 637                          * try to add additional keys from the lower-order row members.
 638                          * (If we accepted independent conditions on additional index
 639                          * columns, we use those instead --- doesn't seem worth trying to
 640                          * determine which is more restrictive.)  Note that this is OK
 641                          * even if the row comparison is of ">" or "<" type, because the
 642                          * condition applied to all but the last row member is effectively
 643                          * ">=" or "<=", and so the extra keys don't break the positioning
 644                          * scheme.      But, by the same token, if we aren't able to use all
 645                          * the row members, then the part of the row comparison that we
 646                          * did use has to be treated as just a ">=" or "<=" condition, and
 647                          * so we'd better adjust strat_total accordingly.
 648                          */
 649                         if (i == keysCount - 1)
 650                         {
 651                                 bool            used_all_subkeys = false;
 652
 653                                 Assert(!(subkey->sk_flags & SK_ROW_END));
 654                                 for (;;)
 655                                 {
 656                                         subkey++;
 657                                         Assert(subkey->sk_flags & SK_ROW_MEMBER);
 658                                         if (subkey->sk_attno != keysCount + 1)
 659                                                 break;  /* out-of-sequence, can't use it */
 660                                         if (subkey->sk_strategy != cur->sk_strategy)
 661                                                 break;  /* wrong direction, can't use it */
 662                                         if (subkey->sk_flags & SK_ISNULL)
 663                                                 break;  /* can't use null keys */
 664                                         Assert(keysCount < INDEX_MAX_KEYS);
 665                                         memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
 666                                         keysCount++;
 667                                         if (subkey->sk_flags & SK_ROW_END)
 668                                         {
 669                                                 used_all_subkeys = true;
 670                                                 break;
 671                                         }
 672                                 }
 673                                 if (!used_all_subkeys)
 674                                 {
 675                                         switch (strat_total)
 676                                         {
 677                                                 case BTLessStrategyNumber:
 678                                                         strat_total = BTLessEqualStrategyNumber;
 679                                                         break;
 680                                                 case BTGreaterStrategyNumber:
 681                                                         strat_total = BTGreaterEqualStrategyNumber;
 682                                                         break;
 683                                         }
 684                                 }
 685                                 break;                  /* done with outer loop */
 686                         }
 687                 }
 688                 else
 689                 {
 690                         /*
 691                          * Ordinary comparison key.  Transform the search-style scan key
 692                          * to an insertion scan key by replacing the sk_func with the
 693                          * appropriate btree comparison function.
 694                          *
 695                          * If scankey operator is not a cross-type comparison, we can use
 696                          * the cached comparison function; otherwise gotta look it up in
 697                          * the catalogs.  (That can't lead to infinite recursion, since no
 698                          * indexscan initiated by syscache lookup will use cross-data-type
 699                          * operators.)
 700                          *
 701                          * We support the convention that sk_subtype == InvalidOid means
 702                          * the opclass input type; this is a hack to simplify life for
 703                          * ScanKeyInit().
 704                          */
 705                         if (cur->sk_subtype == rel->rd_opcintype[i] ||
 706                                 cur->sk_subtype == InvalidOid)
 707                         {
 708                                 FmgrInfo   *procinfo;
 709
 710                                 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
 711                                 ScanKeyEntryInitializeWithInfo(scankeys + i,
 712                                                                                            cur->sk_flags,
 713                                                                                            cur->sk_attno,
 714                                                                                            InvalidStrategy,
 715                                                                                            cur->sk_subtype,
 716                                                                                            procinfo,
 717                                                                                            cur->sk_argument);
 718                         }
 719                         else
 720                         {
 721                                 RegProcedure cmp_proc;
 722
 723                                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
 724                                                                                          rel->rd_opcintype[i],
 725                                                                                          cur->sk_subtype,
 726                                                                                          BTORDER_PROC);
 727                                 if (!RegProcedureIsValid(cmp_proc))
 728                                         elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
 729                                                  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
 730                                                  cur->sk_attno, RelationGetRelationName(rel));
 731                                 ScanKeyEntryInitialize(scankeys + i,
 732                                                                            cur->sk_flags,
 733                                                                            cur->sk_attno,
 734                                                                            InvalidStrategy,
 735                                                                            cur->sk_subtype,
 736                                                                            cmp_proc,
 737                                                                            cur->sk_argument);
 738                         }
 739                 }
 740         }
 741
 742         /*----------
 743          * Examine the selected initial-positioning strategy to determine exactly
 744          * where we need to start the scan, and set flag variables to control the
 745          * code below.
 746          *
 747          * If nextkey = false, _bt_search and _bt_binsrch will locate the first
 748          * item >= scan key.  If nextkey = true, they will locate the first
 749          * item > scan key.
 750          *
 751          * If goback = true, we will then step back one item, while if
 752          * goback = false, we will start the scan on the located item.
 753          *----------
 754          */
 755         switch (strat_total)
 756         {
 757                 case BTLessStrategyNumber:
 758
 759                         /*
 760                          * Find first item >= scankey, then back up one to arrive at last
 761                          * item < scankey.      (Note: this positioning strategy is only used
 762                          * for a backward scan, so that is always the correct starting
 763                          * position.)
 764                          */
 765                         nextkey = false;
 766                         goback = true;
 767                         break;
 768
 769                 case BTLessEqualStrategyNumber:
 770
 771                         /*
 772                          * Find first item > scankey, then back up one to arrive at last
 773                          * item <= scankey.  (Note: this positioning strategy is only used
 774                          * for a backward scan, so that is always the correct starting
 775                          * position.)
 776                          */
 777                         nextkey = true;
 778                         goback = true;
 779                         break;
 780
 781                 case BTEqualStrategyNumber:
 782
 783                         /*
 784                          * If a backward scan was specified, need to start with last equal
 785                          * item not first one.
 786                          */
 787                         if (ScanDirectionIsBackward(dir))
 788                         {
 789                                 /*
 790                                  * This is the same as the <= strategy.  We will check at the
 791                                  * end whether the found item is actually =.
 792                                  */
 793                                 nextkey = true;
 794                                 goback = true;
 795                         }
 796                         else
 797                         {
 798                                 /*
 799                                  * This is the same as the >= strategy.  We will check at the
 800                                  * end whether the found item is actually =.
 801                                  */
 802                                 nextkey = false;
 803                                 goback = false;
 804                         }
 805                         break;
 806
 807                 case BTGreaterEqualStrategyNumber:
 808
 809                         /*
 810                          * Find first item >= scankey.  (This is only used for forward
 811                          * scans.)
 812                          */
 813                         nextkey = false;
 814                         goback = false;
 815                         break;
 816
 817                 case BTGreaterStrategyNumber:
 818
 819                         /*
 820                          * Find first item > scankey.  (This is only used for forward
 821                          * scans.)
 822                          */
 823                         nextkey = true;
 824                         goback = false;
 825                         break;
 826
 827                 default:
 828                         /* can't get here, but keep compiler quiet */
 829                         elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
 830                         return false;
 831         }
 832
 833         /*
 834          * Use the manufactured insertion scan key to descend the tree and
 835          * position ourselves on the target leaf page.
 836          */
 837         stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
 838
 839         /* don't need to keep the stack around... */
 840         _bt_freestack(stack);
 841
 842         /* remember which buffer we have pinned, if any */
 843         so->currPos.buf = buf;
 844
 845         if (!BufferIsValid(buf))
 846         {
 847                 /* Only get here if index is completely empty */
 848                 return false;
 849         }
 850
 851         /* initialize moreLeft/moreRight appropriately for scan direction */
 852         if (ScanDirectionIsForward(dir))
 853         {
 854                 so->currPos.moreLeft = false;
 855                 so->currPos.moreRight = true;
 856         }
 857         else
 858         {
 859                 so->currPos.moreLeft = true;
 860                 so->currPos.moreRight = false;
 861         }
 862         so->numKilled = 0;                      /* just paranoia */
 863         so->markItemIndex = -1;         /* ditto */
 864
 865         /* position to the precise item on the page */
 866         offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
 867
 868         /*
 869          * If nextkey = false, we are positioned at the first item >= scan key, or
 870          * possibly at the end of a page on which all the existing items are less
 871          * than the scan key and we know that everything on later pages is greater
 872          * than or equal to scan key.
 873          *
 874          * If nextkey = true, we are positioned at the first item > scan key, or
 875          * possibly at the end of a page on which all the existing items are less
 876          * than or equal to the scan key and we know that everything on later
 877          * pages is greater than scan key.
 878          *
 879          * The actually desired starting point is either this item or the prior
 880          * one, or in the end-of-page case it's the first item on the next page or
 881          * the last item on this page.  Adjust the starting offset if needed. (If
 882          * this results in an offset before the first item or after the last one,
 883          * _bt_readpage will report no items found, and then we'll step to the
 884          * next page as needed.)
 885          */
 886         if (goback)
 887                 offnum = OffsetNumberPrev(offnum);
 888
 889         /*
 890          * Now load data from the first page of the scan.
 891          */
 892         if (!_bt_readpage(scan, dir, offnum))
 893         {
 894                 /*
 895                  * There's no actually-matching data on this page.  Try to advance to
 896                  * the next page.  Return false if there's no matching data at all.
 897                  */
 898                 if (!_bt_steppage(scan, dir))
 899                         return false;
 900         }
 901
 902         /* Drop the lock, but not pin, on the current page */
 903         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
 904
 905         /* OK, itemIndex says what to return */
 906         scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
 907
 908         return true;
 909 }
 910
 911 /*
 912  *      _bt_next() -- Get the next item in a scan.
 913  *
 914  *              On entry, so->currPos describes the current page, which is pinned
 915  *              but not locked, and so->currPos.itemIndex identifies which item was
 916  *              previously returned.
 917  *
 918  *              On successful exit, scan->xs_ctup.t_self is set to the TID of the
 919  *              next heap tuple, and so->currPos is updated as needed.
 920  *
 921  *              On failure exit (no more tuples), we release pin and set
 922  *              so->currPos.buf to InvalidBuffer.
 923  */
 924 bool
 925 _bt_next(IndexScanDesc scan, ScanDirection dir)
 926 {
 927         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 928
 929         /*
 930          * Advance to next tuple on current page; or if there's no more, try to
 931          * step to the next page with data.
 932          */
 933         if (ScanDirectionIsForward(dir))
 934         {
 935                 if (++so->currPos.itemIndex > so->currPos.lastItem)
 936                 {
 937                         /* We must acquire lock before applying _bt_steppage */
 938                         Assert(BufferIsValid(so->currPos.buf));
 939                         LockBuffer(so->currPos.buf, BT_READ);
 940                         if (!_bt_steppage(scan, dir))
 941                                 return false;
 942                         /* Drop the lock, but not pin, on the new page */
 943                         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
 944                 }
 945         }
 946         else
 947         {
 948                 if (--so->currPos.itemIndex < so->currPos.firstItem)
 949                 {
 950                         /* We must acquire lock before applying _bt_steppage */
 951                         Assert(BufferIsValid(so->currPos.buf));
 952                         LockBuffer(so->currPos.buf, BT_READ);
 953                         if (!_bt_steppage(scan, dir))
 954                                 return false;
 955                         /* Drop the lock, but not pin, on the new page */
 956                         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
 957                 }
 958         }
 959
 960         /* OK, itemIndex says what to return */
 961         scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
 962
 963         return true;
 964 }
 965
 966 /*
 967  *      _bt_readpage() -- Load data from current index page into so->currPos
 968  *
 969  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
 970  * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
 971  * they are updated as appropriate.  All other fields of so->currPos are
 972  * initialized from scratch here.
 973  *
 974  * We scan the current page starting at offnum and moving in the indicated
 975  * direction.  All items matching the scan keys are loaded into currPos.items.
 976  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
 977  * that there can be no more matching tuples in the current scan direction.
 978  *
 979  * Returns true if any matching items found on the page, false if none.
 980  */
 981 static bool
 982 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 983 {
 984         BTScanOpaque so = (BTScanOpaque) scan->opaque;
 985         Page            page;
 986         BTPageOpaque opaque;
 987         OffsetNumber minoff;
 988         OffsetNumber maxoff;
 989         int                     itemIndex;
 990         bool            continuescan;
 991
 992         /* we must have the buffer pinned and locked */
 993         Assert(BufferIsValid(so->currPos.buf));
 994
 995         page = BufferGetPage(so->currPos.buf);
 996         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 997         minoff = P_FIRSTDATAKEY(opaque);
 998         maxoff = PageGetMaxOffsetNumber(page);
 999
1000         /*
1001          * we must save the page's right-link while scanning it; this tells us
1002          * where to step right to after we're done with these items.  There is no
1003          * corresponding need for the left-link, since splits always go right.
1004          */
1005         so->currPos.nextPage = opaque->btpo_next;
1006
1007         if (ScanDirectionIsForward(dir))
1008         {
1009                 /* load items[] in ascending order */
1010                 itemIndex = 0;
1011
1012                 offnum = Max(offnum, minoff);
1013
1014                 while (offnum <= maxoff)
1015                 {
1016                         if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
1017                         {
1018                                 /* tuple passes all scan key conditions, so remember it */
1019                                 /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
1020                                 so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
1021                                 so->currPos.items[itemIndex].indexOffset = offnum;
1022                                 itemIndex++;
1023                         }
1024                         if (!continuescan)
1025                         {
1026                                 /* there can't be any more matches, so stop */
1027                                 so->currPos.moreRight = false;
1028                                 break;
1029                         }
1030
1031                         offnum = OffsetNumberNext(offnum);
1032                 }
1033
1034                 Assert(itemIndex <= MaxIndexTuplesPerPage);
1035                 so->currPos.firstItem = 0;
1036                 so->currPos.lastItem = itemIndex - 1;
1037                 so->currPos.itemIndex = 0;
1038         }
1039         else
1040         {
1041                 /* load items[] in descending order */
1042                 itemIndex = MaxIndexTuplesPerPage;
1043
1044                 offnum = Min(offnum, maxoff);
1045
1046                 while (offnum >= minoff)
1047                 {
1048                         if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
1049                         {
1050                                 /* tuple passes all scan key conditions, so remember it */
1051                                 /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
1052                                 itemIndex--;
1053                                 so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
1054                                 so->currPos.items[itemIndex].indexOffset = offnum;
1055                         }
1056                         if (!continuescan)
1057                         {
1058                                 /* there can't be any more matches, so stop */
1059                                 so->currPos.moreLeft = false;
1060                                 break;
1061                         }
1062
1063                         offnum = OffsetNumberPrev(offnum);
1064                 }
1065
1066                 Assert(itemIndex >= 0);
1067                 so->currPos.firstItem = itemIndex;
1068                 so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1069                 so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1070         }
1071
1072         return (so->currPos.firstItem <= so->currPos.lastItem);
1073 }
1074
1075 /*
1076  *      _bt_steppage() -- Step to next page containing valid data for scan
1077  *
1078  * On entry, so->currPos.buf must be pinned and read-locked.  We'll drop
1079  * the lock and pin before moving to next page.
1080  *
1081  * On success exit, we hold pin and read-lock on the next interesting page,
1082  * and so->currPos is updated to contain data from that page.
1083  *
1084  * If there are no more matching records in the given direction, we drop all
1085  * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
1086  */
1087 static bool
1088 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1089 {
1090         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1091         Relation        rel;
1092         Page            page;
1093         BTPageOpaque opaque;
1094
1095         /* we must have the buffer pinned and locked */
1096         Assert(BufferIsValid(so->currPos.buf));
1097
1098         /* Before leaving current page, deal with any killed items */
1099         if (so->numKilled > 0)
1100                 _bt_killitems(scan, true);
1101
1102         /*
1103          * Before we modify currPos, make a copy of the page data if there was a
1104          * mark position that needs it.
1105          */
1106         if (so->markItemIndex >= 0)
1107         {
1108                 /* bump pin on current buffer for assignment to mark buffer */
1109                 IncrBufferRefCount(so->currPos.buf);
1110                 memcpy(&so->markPos, &so->currPos,
1111                            offsetof(BTScanPosData, items[1]) +
1112                            so->currPos.lastItem * sizeof(BTScanPosItem));
1113                 so->markPos.itemIndex = so->markItemIndex;
1114                 so->markItemIndex = -1;
1115         }
1116
1117         rel = scan->indexRelation;
1118
1119         if (ScanDirectionIsForward(dir))
1120         {
1121                 /* Walk right to the next page with data */
1122                 /* We must rely on the previously saved nextPage link! */
1123                 BlockNumber blkno = so->currPos.nextPage;
1124
1125                 /* Remember we left a page with data */
1126                 so->currPos.moreLeft = true;
1127
1128                 for (;;)
1129                 {
1130                         /* release the previous buffer */
1131                         _bt_relbuf(rel, so->currPos.buf);
1132                         so->currPos.buf = InvalidBuffer;
1133                         /* if we're at end of scan, give up */
1134                         if (blkno == P_NONE || !so->currPos.moreRight)
1135                                 return false;
1136                         /* check for interrupts while we're not holding any buffer lock */
1137                         CHECK_FOR_INTERRUPTS();
1138                         /* step right one page */
1139                         so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1140                         /* check for deleted page */
1141                         page = BufferGetPage(so->currPos.buf);
1142                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1143                         if (!P_IGNORE(opaque))
1144                         {
1145                                 /* see if there are any matches on this page */
1146                                 /* note that this will clear moreRight if we can stop */
1147                                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1148                                         break;
1149                         }
1150                         /* nope, keep going */
1151                         blkno = opaque->btpo_next;
1152                 }
1153         }
1154         else
1155         {
1156                 /* Remember we left a page with data */
1157                 so->currPos.moreRight = true;
1158
1159                 /*
1160                  * Walk left to the next page with data.  This is much more complex
1161                  * than the walk-right case because of the possibility that the page
1162                  * to our left splits while we are in flight to it, plus the
1163                  * possibility that the page we were on gets deleted after we leave
1164                  * it.  See nbtree/README for details.
1165                  */
1166                 for (;;)
1167                 {
1168                         /* Done if we know there are no matching keys to the left */
1169                         if (!so->currPos.moreLeft)
1170                         {
1171                                 _bt_relbuf(rel, so->currPos.buf);
1172                                 so->currPos.buf = InvalidBuffer;
1173                                 return false;
1174                         }
1175
1176                         /* Step to next physical page */
1177                         so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
1178
1179                         /* if we're physically at end of index, return failure */
1180                         if (so->currPos.buf == InvalidBuffer)
1181                                 return false;
1182
1183                         /*
1184                          * Okay, we managed to move left to a non-deleted page. Done if
1185                          * it's not half-dead and contains matching tuples. Else loop back
1186                          * and do it all again.
1187                          */
1188                         page = BufferGetPage(so->currPos.buf);
1189                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1190                         if (!P_IGNORE(opaque))
1191                         {
1192                                 /* see if there are any matches on this page */
1193                                 /* note that this will clear moreLeft if we can stop */
1194                                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1195                                         break;
1196                         }
1197                 }
1198         }
1199
1200         return true;
1201 }
1202
1203 /*
1204  * _bt_walk_left() -- step left one page, if possible
1205  *
1206  * The given buffer must be pinned and read-locked.  This will be dropped
1207  * before stepping left.  On return, we have pin and read lock on the
1208  * returned page, instead.
1209  *
1210  * Returns InvalidBuffer if there is no page to the left (no lock is held
1211  * in that case).
1212  *
1213  * When working on a non-leaf level, it is possible for the returned page
1214  * to be half-dead; the caller should check that condition and step left
1215  * again if it's important.
1216  */
1217 static Buffer
1218 _bt_walk_left(Relation rel, Buffer buf)
1219 {
1220         Page            page;
1221         BTPageOpaque opaque;
1222
1223         page = BufferGetPage(buf);
1224         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1225
1226         for (;;)
1227         {
1228                 BlockNumber obknum;
1229                 BlockNumber lblkno;
1230                 BlockNumber blkno;
1231                 int                     tries;
1232
1233                 /* if we're at end of tree, release buf and return failure */
1234                 if (P_LEFTMOST(opaque))
1235                 {
1236                         _bt_relbuf(rel, buf);
1237                         break;
1238                 }
1239                 /* remember original page we are stepping left from */
1240                 obknum = BufferGetBlockNumber(buf);
1241                 /* step left */
1242                 blkno = lblkno = opaque->btpo_prev;
1243                 _bt_relbuf(rel, buf);
1244                 /* check for interrupts while we're not holding any buffer lock */
1245                 CHECK_FOR_INTERRUPTS();
1246                 buf = _bt_getbuf(rel, blkno, BT_READ);
1247                 page = BufferGetPage(buf);
1248                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1249
1250                 /*
1251                  * If this isn't the page we want, walk right till we find what we
1252                  * want --- but go no more than four hops (an arbitrary limit). If we
1253                  * don't find the correct page by then, the most likely bet is that
1254                  * the original page got deleted and isn't in the sibling chain at all
1255                  * anymore, not that its left sibling got split more than four times.
1256                  *
1257                  * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1258                  * because half-dead pages are still in the sibling chain.      Caller
1259                  * must reject half-dead pages if wanted.
1260                  */
1261                 tries = 0;
1262                 for (;;)
1263                 {
1264                         if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1265                         {
1266                                 /* Found desired page, return it */
1267                                 return buf;
1268                         }
1269                         if (P_RIGHTMOST(opaque) || ++tries > 4)
1270                                 break;
1271                         blkno = opaque->btpo_next;
1272                         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1273                         page = BufferGetPage(buf);
1274                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1275                 }
1276
1277                 /* Return to the original page to see what's up */
1278                 buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1279                 page = BufferGetPage(buf);
1280                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1281                 if (P_ISDELETED(opaque))
1282                 {
1283                         /*
1284                          * It was deleted.      Move right to first nondeleted page (there
1285                          * must be one); that is the page that has acquired the deleted
1286                          * one's keyspace, so stepping left from it will take us where we
1287                          * want to be.
1288                          */
1289                         for (;;)
1290                         {
1291                                 if (P_RIGHTMOST(opaque))
1292                                         elog(ERROR, "fell off the end of index \"%s\"",
1293                                                  RelationGetRelationName(rel));
1294                                 blkno = opaque->btpo_next;
1295                                 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1296                                 page = BufferGetPage(buf);
1297                                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1298                                 if (!P_ISDELETED(opaque))
1299                                         break;
1300                         }
1301
1302                         /*
1303                          * Now return to top of loop, resetting obknum to point to this
1304                          * nondeleted page, and try again.
1305                          */
1306                 }
1307                 else
1308                 {
1309                         /*
1310                          * It wasn't deleted; the explanation had better be that the page
1311                          * to the left got split or deleted. Without this check, we'd go
1312                          * into an infinite loop if there's anything wrong.
1313                          */
1314                         if (opaque->btpo_prev == lblkno)
1315                                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
1316                                          obknum, RelationGetRelationName(rel));
1317                         /* Okay to try again with new lblkno value */
1318                 }
1319         }
1320
1321         return InvalidBuffer;
1322 }
1323
1324 /*
1325  * _bt_get_endpoint() -- Find the first or last page on a given tree level
1326  *
1327  * If the index is empty, we will return InvalidBuffer; any other failure
1328  * condition causes ereport().  We will not return a dead page.
1329  *
1330  * The returned buffer is pinned and read-locked.
1331  */
1332 Buffer
1333 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
1334 {
1335         Buffer          buf;
1336         Page            page;
1337         BTPageOpaque opaque;
1338         OffsetNumber offnum;
1339         BlockNumber blkno;
1340         IndexTuple      itup;
1341
1342         /*
1343          * If we are looking for a leaf page, okay to descend from fast root;
1344          * otherwise better descend from true root.  (There is no point in being
1345          * smarter about intermediate levels.)
1346          */
1347         if (level == 0)
1348                 buf = _bt_getroot(rel, BT_READ);
1349         else
1350                 buf = _bt_gettrueroot(rel);
1351
1352         if (!BufferIsValid(buf))
1353         {
1354                 /* empty index... */
1355                 return InvalidBuffer;
1356         }
1357
1358         page = BufferGetPage(buf);
1359         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1360
1361         for (;;)
1362         {
1363                 /*
1364                  * If we landed on a deleted page, step right to find a live page
1365                  * (there must be one).  Also, if we want the rightmost page, step
1366                  * right if needed to get to it (this could happen if the page split
1367                  * since we obtained a pointer to it).
1368                  */
1369                 while (P_IGNORE(opaque) ||
1370                            (rightmost && !P_RIGHTMOST(opaque)))
1371                 {
1372                         blkno = opaque->btpo_next;
1373                         if (blkno == P_NONE)
1374                                 elog(ERROR, "fell off the end of index \"%s\"",
1375                                          RelationGetRelationName(rel));
1376                         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1377                         page = BufferGetPage(buf);
1378                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1379                 }
1380
1381                 /* Done? */
1382                 if (opaque->btpo.level == level)
1383                         break;
1384                 if (opaque->btpo.level < level)
1385                         elog(ERROR, "btree level %u not found in index \"%s\"",
1386                                  level, RelationGetRelationName(rel));
1387
1388                 /* Descend to leftmost or rightmost child page */
1389                 if (rightmost)
1390                         offnum = PageGetMaxOffsetNumber(page);
1391                 else
1392                         offnum = P_FIRSTDATAKEY(opaque);
1393
1394                 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1395                 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1396
1397                 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1398                 page = BufferGetPage(buf);
1399                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1400         }
1401
1402         return buf;
1403 }
1404
1405 /*
1406  *      _bt_endpoint() -- Find the first or last page in the index, and scan
1407  * from there to the first key satisfying all the quals.
1408  *
1409  * This is used by _bt_first() to set up a scan when we've determined
1410  * that the scan must start at the beginning or end of the index (for
1411  * a forward or backward scan respectively).  Exit conditions are the
1412  * same as for _bt_first().
1413  */
1414 static bool
1415 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
1416 {
1417         Relation        rel = scan->indexRelation;
1418         BTScanOpaque so = (BTScanOpaque) scan->opaque;
1419         Buffer          buf;
1420         Page            page;
1421         BTPageOpaque opaque;
1422         OffsetNumber start;
1423
1424         /*
1425          * Scan down to the leftmost or rightmost leaf page.  This is a simplified
1426          * version of _bt_search().  We don't maintain a stack since we know we
1427          * won't need it.
1428          */
1429         buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
1430
1431         if (!BufferIsValid(buf))
1432         {
1433                 /* empty index... */
1434                 so->currPos.buf = InvalidBuffer;
1435                 return false;
1436         }
1437
1438         page = BufferGetPage(buf);
1439         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1440         Assert(P_ISLEAF(opaque));
1441
1442         if (ScanDirectionIsForward(dir))
1443         {
1444                 /* There could be dead pages to the left, so not this: */
1445                 /* Assert(P_LEFTMOST(opaque)); */
1446
1447                 start = P_FIRSTDATAKEY(opaque);
1448         }
1449         else if (ScanDirectionIsBackward(dir))
1450         {
1451                 Assert(P_RIGHTMOST(opaque));
1452
1453                 start = PageGetMaxOffsetNumber(page);
1454         }
1455         else
1456         {
1457                 elog(ERROR, "invalid scan direction: %d", (int) dir);
1458                 start = 0;                              /* keep compiler quiet */
1459         }
1460
1461         /* remember which buffer we have pinned */
1462         so->currPos.buf = buf;
1463
1464         /* initialize moreLeft/moreRight appropriately for scan direction */
1465         if (ScanDirectionIsForward(dir))
1466         {
1467                 so->currPos.moreLeft = false;
1468                 so->currPos.moreRight = true;
1469         }
1470         else
1471         {
1472                 so->currPos.moreLeft = true;
1473                 so->currPos.moreRight = false;
1474         }
1475         so->numKilled = 0;                      /* just paranoia */
1476         so->markItemIndex = -1;         /* ditto */
1477
1478         /*
1479          * Now load data from the first page of the scan.
1480          */
1481         if (!_bt_readpage(scan, dir, start))
1482         {
1483                 /*
1484                  * There's no actually-matching data on this page.  Try to advance to
1485                  * the next page.  Return false if there's no matching data at all.
1486                  */
1487                 if (!_bt_steppage(scan, dir))
1488                         return false;
1489         }
1490
1491         /* Drop the lock, but not pin, on the current page */
1492         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1493
1494         /* OK, itemIndex says what to return */
1495         scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
1496
1497         return true;
1498 }