nbtree: fix read page recheck typo.
[pgsql.git] src/backend/access/nbtree/nbtsearch.c
blob a662665b55fc87f8acdabb321384a730871041be
/*-------------------------------------------------------------------------
 *
 * nbtsearch.c
 *	  Search code for postgres btrees.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsearch.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"


static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
                            Buffer buf, bool forupdate, BTStack stack,
                            int access);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int  _bt_binsrch_posting(BTScanInsert key, Page page,
                                OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
                         OffsetNumber offnum, bool firstPage);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
                         OffsetNumber offnum, IndexTuple itup);
static int  _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
                                  OffsetNumber offnum, ItemPointer heapTid,
                                  IndexTuple itup);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
                                       OffsetNumber offnum,
                                       ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
                                  ScanDirection dir);
static Buffer _bt_walk_left(Relation rel, Buffer buf);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);

/*
 *	_bt_drop_lock_and_maybe_pin()
 *
 * Unlock the buffer; and if it is safe to release the pin, do that, too.
 * This will prevent vacuum from stalling in a blocked state trying to read a
 * page when a cursor is sitting on it.
 *
 * See nbtree/README section on making concurrent TID recycling safe.
 */
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
    _bt_unlockbuf(scan->indexRelation, sp->buf);

    if (IsMVCCSnapshot(scan->xs_snapshot) &&
        RelationNeedsWAL(scan->indexRelation) &&
        !scan->xs_want_itup)
    {
        ReleaseBuffer(sp->buf);
        sp->buf = InvalidBuffer;
    }
}
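
/*
 * Note the three conditions above: the pin is only dropped for an MVCC
 * snapshot on a WAL-logged index when no index-only scan is in progress
 * (!xs_want_itup).  In every other case (e.g. a non-MVCC snapshot, or an
 * unlogged or temporary index, where RelationNeedsWAL is false), the pin is
 * kept, and it continues to act as the interlock against concurrent TID
 * recycling described in the README section cited above.
 */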

/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * The passed scankey is an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * Return value is a stack of parent-page pointers (i.e. there is no entry for
 * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
 * which is locked and pinned.  No locks are held on the parent pages,
 * however!
 *
 * The returned buffer is locked according to access parameter.  Additionally,
 * access = BT_WRITE will allow an empty root page to be created and returned.
 * When access = BT_READ, an empty index will result in *bufP being set to
 * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
 * during the search will be finished.
 *
 * heaprel must be provided by callers that pass access = BT_WRITE, since we
 * might need to allocate a new root page for caller -- see _bt_allocbuf.
 */
BTStack
_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
           int access)
{
    BTStack     stack_in = NULL;
    int         page_access = BT_READ;

    /* heaprel must be set whenever _bt_allocbuf is reachable */
    Assert(access == BT_READ || access == BT_WRITE);
    Assert(access == BT_READ || heaprel != NULL);

    /* Get the root page to start with */
    *bufP = _bt_getroot(rel, heaprel, access);

    /* If index is empty and access = BT_READ, no root page is created. */
    if (!BufferIsValid(*bufP))
        return (BTStack) NULL;

    /* Loop iterates once per level descended in the tree */
    for (;;)
    {
        Page        page;
        BTPageOpaque opaque;
        OffsetNumber offnum;
        ItemId      itemid;
        IndexTuple  itup;
        BlockNumber child;
        BTStack     new_stack;

        /*
         * Race -- the page we just grabbed may have split since we read its
         * downlink in its parent page (or the metapage).  If it has, we may
         * need to move right to its new sibling.  Do that.
         *
         * In write-mode, allow _bt_moveright to finish any incomplete splits
         * along the way.  Strictly speaking, we'd only need to finish an
         * incomplete split on the leaf page we're about to insert to, not on
         * any of the upper levels (internal pages with incomplete splits are
         * also taken care of in _bt_getstackbuf).  But this is a good
         * opportunity to finish splits of internal pages too.
         */
        *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
                              stack_in, page_access);

        /* if this is a leaf page, we're done */
        page = BufferGetPage(*bufP);
        opaque = BTPageGetOpaque(page);
        if (P_ISLEAF(opaque))
            break;

        /*
         * Find the appropriate pivot tuple on this page.  Its downlink points
         * to the child page that we're about to descend to.
         */
        offnum = _bt_binsrch(rel, key, *bufP);
        itemid = PageGetItemId(page, offnum);
        itup = (IndexTuple) PageGetItem(page, itemid);
        Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
        child = BTreeTupleGetDownLink(itup);

        /*
         * We need to save the location of the pivot tuple we chose in a new
         * stack entry for this page/level.  If caller ends up splitting a
         * page one level down, it usually ends up inserting a new pivot
         * tuple/downlink immediately after the location recorded here.
         */
        new_stack = (BTStack) palloc(sizeof(BTStackData));
        new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
        new_stack->bts_offset = offnum;
        new_stack->bts_parent = stack_in;

        /*
         * Page level 1 is the lowest non-leaf level, directly above the
         * leaves.  So, if we're on level 1 and were asked to lock the leaf
         * page in write mode, lock the next page in write mode, because it
         * must be a leaf.
         */
        if (opaque->btpo_level == 1 && access == BT_WRITE)
            page_access = BT_WRITE;

        /* drop the read lock on the page, then acquire one on its child */
        *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);

        /* okay, all set to move down a level */
        stack_in = new_stack;
    }

    /*
     * If we're asked to lock leaf in write mode, but didn't manage to, then
     * relock.  This should only happen when the root page is a leaf page (and
     * the only page in the index other than the metapage).
     */
    if (access == BT_WRITE && page_access == BT_READ)
    {
        /* trade in our read lock for a write lock */
        _bt_unlockbuf(rel, *bufP);
        _bt_lockbuf(rel, *bufP, BT_WRITE);

        /*
         * Race -- the leaf page may have split after we dropped the read lock
         * but before we acquired a write lock.  If it has, we may need to
         * move right to its new sibling.  Do that.
         */
        *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
    }

    return stack_in;
}
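
/*
 * Descent sketch (hypothetical shape): in a three-level index (root at
 * level 2), a BT_WRITE caller reads the root and the level-1 internal page
 * under BT_READ locks; once btpo_level == 1, it takes the child (leaf) lock
 * as BT_WRITE up front.  The read-to-write relock path above is only reached
 * when the root itself is the leaf, so the loop never visited a level-1 page.
 */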

/*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
 * When we follow a pointer to reach a page, it is possible that
 * the page has changed in the meanwhile.  If this happens, we're
 * guaranteed that the page has "split right" -- that is, that any
 * data that appeared on the page originally is either on the page
 * or strictly to the right of it.
 *
 * This routine decides whether or not we need to move right in the
 * tree by examining the high key entry on the page.  If that entry is
 * strictly less than the scankey, or <= the scankey in the
 * key.nextkey=true case, then we followed the wrong link and we need
 * to move right.
 *
 * The passed insertion-type scankey can omit the rightmost column(s) of the
 * index.  (see nbtree/README)
 *
 * When key.nextkey is false (the usual case), we are looking for the first
 * item >= key.  When key.nextkey is true, we are looking for the first item
 * strictly greater than key.
 *
 * If forupdate is true, we will attempt to finish any incomplete splits
 * that we encounter.  This is required when locking a target page for an
 * insertion, because we don't allow inserting on a page before the split is
 * completed.  'heaprel' and 'stack' are only used if forupdate is true.
 *
 * On entry, we have the buffer pinned and a lock of the type specified by
 * 'access'.  If we move right, we release the buffer and lock and acquire
 * the same on the right sibling.  Return value is the buffer we stop at.
 */
static Buffer
_bt_moveright(Relation rel,
              Relation heaprel,
              BTScanInsert key,
              Buffer buf,
              bool forupdate,
              BTStack stack,
              int access)
{
    Page        page;
    BTPageOpaque opaque;
    int32       cmpval;

    Assert(!forupdate || heaprel != NULL);

    /*
     * When nextkey = false (normal case): if the scan key that brought us to
     * this page is > the high key stored on the page, then the page has split
     * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
     * have some duplicates to the right as well as the left, but that's
     * something that's only ever dealt with on the leaf level, after
     * _bt_search has found an initial leaf page.)
     *
     * When nextkey = true: move right if the scan key is >= page's high key.
     * (Note that key.scantid cannot be set in this case.)
     *
     * The page could even have split more than once, so scan as far as
     * needed.
     *
     * We also have to move right if we followed a link that brought us to a
     * dead page.
     */
    cmpval = key->nextkey ? 0 : 1;

    for (;;)
    {
        page = BufferGetPage(buf);
        opaque = BTPageGetOpaque(page);

        if (P_RIGHTMOST(opaque))
            break;

        /*
         * Finish any incomplete splits we encounter along the way.
         */
        if (forupdate && P_INCOMPLETE_SPLIT(opaque))
        {
            BlockNumber blkno = BufferGetBlockNumber(buf);

            /* upgrade our lock if necessary */
            if (access == BT_READ)
            {
                _bt_unlockbuf(rel, buf);
                _bt_lockbuf(rel, buf, BT_WRITE);
            }

            if (P_INCOMPLETE_SPLIT(opaque))
                _bt_finish_split(rel, heaprel, buf, stack);
            else
                _bt_relbuf(rel, buf);

            /* re-acquire the lock in the right mode, and re-check */
            buf = _bt_getbuf(rel, blkno, access);
            continue;
        }

        if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
        {
            /* step right one page */
            buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
            continue;
        }
        else
            break;
    }

    if (P_IGNORE(opaque))
        elog(ERROR, "fell off the end of index \"%s\"",
             RelationGetRelationName(rel));

    return buf;
}
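
/*
 * To make the cmpval trick concrete: _bt_compare returns <0, 0, or >0.
 * With nextkey = false, cmpval is 1, so we only step right when
 * scankey > high key.  With nextkey = true, cmpval is 0, so we also step
 * right when scankey == high key, since the first item > scankey could
 * then only be on a right sibling.
 */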

/*
 *	_bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey, or last key <= given scankey if nextkey
 * is true.  (Since _bt_compare treats the first data key of such a page as
 * minus infinity, there will be at least one key < scankey, so the result
 * always points at one of the keys on the page.)
 *
 * On a leaf page, _bt_binsrch() returns the final result of the initial
 * positioning process that started with _bt_first's call to _bt_search.
 * We're returning a non-pivot tuple offset, so things are a little different.
 * It is possible that we'll return an offset that's either past the last
 * non-pivot slot, or (in the case of a backward scan) before the first slot.
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.  _bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
static OffsetNumber
_bt_binsrch(Relation rel,
            BTScanInsert key,
            Buffer buf)
{
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber low,
                high;
    int32       result,
                cmpval;

    page = BufferGetPage(buf);
    opaque = BTPageGetOpaque(page);

    /* Requesting nextkey semantics while using scantid seems nonsensical */
    Assert(!key->nextkey || key->scantid == NULL);
    /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
    Assert(!P_ISLEAF(opaque) || key->scantid == NULL);

    low = P_FIRSTDATAKEY(opaque);
    high = PageGetMaxOffsetNumber(page);

    /*
     * If there are no keys on the page, return the first available slot.  Note
     * this covers two cases: the page is really empty (no keys), or it
     * contains only a high key.  The latter case is possible after vacuuming.
     * This can never happen on an internal page, however, since they are
     * never empty (an internal page must have at least one child).
     */
    if (unlikely(high < low))
        return low;

    /*
     * Binary search to find the first key on the page >= scan key, or first
     * key > scankey when nextkey is true.
     *
     * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     *
     * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     *
     * We can fall out when high == low.
     */
    high++;                     /* establish the loop invariant for high */

    cmpval = key->nextkey ? 0 : 1;  /* select comparison value */

    while (high > low)
    {
        OffsetNumber mid = low + ((high - low) / 2);

        /* We have low <= mid < high, so mid points at a real slot */

        result = _bt_compare(rel, key, page, mid);

        if (result >= cmpval)
            low = mid + 1;
        else
            high = mid;
    }

    /*
     * At this point we have high == low.
     *
     * On a leaf page we always return the first non-pivot tuple >= scan key
     * (resp. > scan key) for forward scan callers.  For backward scans, it's
     * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
     */
    if (P_ISLEAF(opaque))
    {
        /*
         * In the backward scan case we're supposed to locate the last
         * matching tuple on the leaf level -- not the first matching tuple
         * (the last tuple will be the first one returned by the scan).
         *
         * At this point we've located the first non-pivot tuple immediately
         * after the last matching tuple (which might just be maxoff + 1).
         * Compensate by stepping back.
         */
        if (key->backward)
            return OffsetNumberPrev(low);

        return low;
    }

    /*
     * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     * There must be one if _bt_compare() is playing by the rules.
     *
     * _bt_compare() will seldom see any exactly-matching pivot tuples, since
     * a truncated -inf heap TID is usually enough to prevent it altogether.
     * Even omitted scan key entries are treated as > truncated attributes.
     *
     * However, during backward scans _bt_compare() interprets omitted scan
     * key attributes as == corresponding truncated -inf attributes instead.
     * This works just like < would work here.  Under this scheme, < strategy
     * backward scans will always directly descend to the correct leaf page.
     * In particular, they will never incur an "extra" leaf page access with a
     * scan key that happens to contain the same prefix of values as some
     * pivot tuple's untruncated prefix.  VACUUM relies on this guarantee when
     * it uses a leaf page high key to "re-find" a page undergoing deletion.
     */
    Assert(low > P_FIRSTDATAKEY(opaque));

    return OffsetNumberPrev(low);
}
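
/*
 * Worked example (hypothetical leaf page): slots 1..4 hold keys
 * 10, 20, 20, 30 and the scan key is 20 with nextkey = false (cmpval = 1).
 * low starts at 1, high at 5.  mid = 3 compares equal (result 0 < cmpval),
 * so high = 3; mid = 2 also compares equal, so high = 2; mid = 1 gives
 * result > 0, so low = 2.  We return offset 2, the first slot >= 20, or
 * offset 1 (OffsetNumberPrev) for a backward scan.
 */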

/*
 *	_bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
 *
 * Like _bt_binsrch(), but with support for caching the binary search
 * bounds.  Only used during insertion, and only on the leaf page that it
 * looks like caller will insert tuple on.  Exclusive-locked and pinned
 * leaf page is contained within insertstate.
 *
 * Caches the bounds fields in insertstate so that a subsequent call can
 * reuse the low and strict high bounds of original binary search.  Callers
 * that use these fields directly must be prepared for the case where low
 * and/or stricthigh are not on the same page (one or both exceed maxoff
 * for the page).  The case where there are no items on the page (high <
 * low) makes bounds invalid.
 *
 * Caller is responsible for invalidating bounds when it modifies the page
 * before calling here a second time, and for dealing with posting list
 * tuple matches (callers can use insertstate's postingoff field to
 * determine which existing heap TID will need to be replaced by a posting
 * list split).
 */
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
    BTScanInsert key = insertstate->itup_key;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber low,
                high,
                stricthigh;
    int32       result,
                cmpval;

    page = BufferGetPage(insertstate->buf);
    opaque = BTPageGetOpaque(page);

    Assert(P_ISLEAF(opaque));
    Assert(!key->nextkey);
    Assert(insertstate->postingoff == 0);

    if (!insertstate->bounds_valid)
    {
        /* Start new binary search */
        low = P_FIRSTDATAKEY(opaque);
        high = PageGetMaxOffsetNumber(page);
    }
    else
    {
        /* Restore result of previous binary search against same page */
        low = insertstate->low;
        high = insertstate->stricthigh;
    }

    /* If there are no keys on the page, return the first available slot */
    if (unlikely(high < low))
    {
        /* Caller can't reuse bounds */
        insertstate->low = InvalidOffsetNumber;
        insertstate->stricthigh = InvalidOffsetNumber;
        insertstate->bounds_valid = false;
        return low;
    }

    /*
     * Binary search to find the first key on the page >= scan key.  (nextkey
     * is always false when inserting).
     *
     * The loop invariant is: all slots before 'low' are < scan key, all slots
     * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
     * maintained to save additional search effort for caller.
     *
     * We can fall out when high == low.
     */
    if (!insertstate->bounds_valid)
        high++;                 /* establish the loop invariant for high */
    stricthigh = high;          /* high initially strictly higher */

    cmpval = 1;                 /* !nextkey comparison value */

    while (high > low)
    {
        OffsetNumber mid = low + ((high - low) / 2);

        /* We have low <= mid < high, so mid points at a real slot */

        result = _bt_compare(rel, key, page, mid);

        if (result >= cmpval)
            low = mid + 1;
        else
        {
            high = mid;
            if (result != 0)
                stricthigh = high;
        }

        /*
         * If tuple at offset located by binary search is a posting list whose
         * TID range overlaps with caller's scantid, perform posting list
         * binary search to set postingoff for caller.  Caller must split the
         * posting list when postingoff is set.  This should happen
         * infrequently.
         */
        if (unlikely(result == 0 && key->scantid != NULL))
        {
            /*
             * postingoff should never be set more than once per leaf page
             * binary search.  That would mean that there are duplicate table
             * TIDs in the index, which is never okay.  Check for that here.
             */
            if (insertstate->postingoff != 0)
                ereport(ERROR,
                        (errcode(ERRCODE_INDEX_CORRUPTED),
                         errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
                                         ItemPointerGetBlockNumber(key->scantid),
                                         ItemPointerGetOffsetNumber(key->scantid),
                                         low, stricthigh,
                                         BufferGetBlockNumber(insertstate->buf),
                                         RelationGetRelationName(rel))));

            insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
        }
    }

    /*
     * On a leaf page, a binary search always returns the first key >= scan
     * key (at least in !nextkey case), which could be the last slot + 1.  This
     * is also the lower bound of cached search.
     *
     * stricthigh may also be the last slot + 1, which prevents caller from
     * using bounds directly, but is still useful to us if we're called a
     * second time with cached bounds (cached low will be < stricthigh when
     * that happens).
     */
    insertstate->low = low;
    insertstate->stricthigh = stricthigh;
    insertstate->bounds_valid = true;

    return low;
}
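
/*
 * Cached-bounds sketch: if a first call here returns low = 5 with
 * stricthigh = 8, a second call against the unmodified page restarts the
 * search with low = 5 and high = 8 rather than spanning the whole page,
 * because every slot before 5 is already known to be < the scan key and
 * every slot at or after 8 is known to be > it.
 */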

/*----------
 *	_bt_binsrch_posting() -- posting list binary search.
 *
 * Helper routine for _bt_binsrch_insert().
 *
 * Returns offset into posting list where caller's scantid belongs.
 *----------
 */
static int
_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
{
    IndexTuple  itup;
    ItemId      itemid;
    int         low,
                high,
                mid,
                res;

    /*
     * If this isn't a posting tuple, then the index must be corrupt (if it is
     * an ordinary non-pivot tuple then there must be an existing tuple with a
     * heap TID that equals inserter's new heap TID/scantid).  Defensively
     * check that tuple is a posting list tuple whose posting list range
     * includes caller's scantid.
     *
     * (This is also needed because contrib/amcheck's rootdescend option needs
     * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
     */
    itemid = PageGetItemId(page, offnum);
    itup = (IndexTuple) PageGetItem(page, itemid);
    if (!BTreeTupleIsPosting(itup))
        return 0;

    Assert(key->heapkeyspace && key->allequalimage);

    /*
     * In the event that posting list tuple has LP_DEAD bit set, indicate this
     * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
     * second call to _bt_binsrch_insert() can take place when its caller has
     * removed the dead item.
     */
    if (ItemIdIsDead(itemid))
        return -1;

    /* "high" is past end of posting list for loop invariant */
    low = 0;
    high = BTreeTupleGetNPosting(itup);
    Assert(high >= 2);

    while (high > low)
    {
        mid = low + ((high - low) / 2);
        res = ItemPointerCompare(key->scantid,
                                 BTreeTupleGetPostingN(itup, mid));

        if (res > 0)
            low = mid + 1;
        else if (res < 0)
            high = mid;
        else
            return mid;
    }

    /* Exact match not found */
    return low;
}
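
/*
 * For example, with a hypothetical posting list of TIDs (2,1), (2,5), (4,3)
 * and a scantid of (2,5), the loop finds an exact match and returns
 * offset 1.  With a scantid of (3,1), it returns 2, the offset where the
 * new TID belongs when the caller splits the posting list.
 */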

/*----------
 *	_bt_compare() -- Compare insertion-type scankey to tuple on a page.
 *
 *	page/offnum: location of btree item to be compared to.
 *
 *		This routine returns:
 *			<0 if scankey < tuple at offnum;
 *			 0 if scankey == tuple at offnum;
 *			>0 if scankey > tuple at offnum.
 *
 * NULLs in the keys are treated as sortable values.  Therefore
 * "equality" does not necessarily mean that the item should be returned
 * to the caller as a matching key.  Similarly, an insertion scankey
 * with its scantid set is treated as equal to a posting tuple whose TID
 * range overlaps with its scantid.  There generally won't be a
 * matching TID in the posting tuple, which caller must handle
 * themselves (e.g., by splitting the posting list tuple).
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored is explicitly truncated to 0
 * attributes (explicitly minus infinity) with version 3+ indexes, but
 * that isn't relied upon.  This allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first
 * key.  See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
            BTScanInsert key,
            Page page,
            OffsetNumber offnum)
{
    TupleDesc   itupdesc = RelationGetDescr(rel);
    BTPageOpaque opaque = BTPageGetOpaque(page);
    IndexTuple  itup;
    ItemPointer heapTid;
    ScanKey     scankey;
    int         ncmpkey;
    int         ntupatts;
    int32       result;

    Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
    Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
    Assert(key->heapkeyspace || key->scantid == NULL);

    /*
     * Force result ">" if target item is first data item on an internal page
     * --- see NOTE above.
     */
    if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
        return 1;

    itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    ntupatts = BTreeTupleGetNAtts(itup, rel);

    /*
     * The scan key is set up with the attribute number associated with each
     * term in the key.  It is important that, if the index is multi-key, the
     * scan contain the first k key attributes, and that they be in order.  If
     * you think about how multi-key ordering works, you'll understand why
     * this is.
     *
     * We don't test for violation of this condition here, however.  The
     * initial setup for the index scan had better have gotten it right (see
     * _bt_first).
     */

    ncmpkey = Min(ntupatts, key->keysz);
    Assert(key->heapkeyspace || ncmpkey == key->keysz);
    Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
    scankey = key->scankeys;
    for (int i = 1; i <= ncmpkey; i++)
    {
        Datum       datum;
        bool        isNull;

        datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

        if (scankey->sk_flags & SK_ISNULL)  /* key is NULL */
        {
            if (isNull)
                result = 0;     /* NULL "=" NULL */
            else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = -1;    /* NULL "<" NOT_NULL */
            else
                result = 1;     /* NULL ">" NOT_NULL */
        }
        else if (isNull)        /* key is NOT_NULL and item is NULL */
        {
            if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = 1;     /* NOT_NULL ">" NULL */
            else
                result = -1;    /* NOT_NULL "<" NULL */
        }
        else
        {
            /*
             * The sk_func needs to be passed the index value as left arg and
             * the sk_argument as right arg (they might be of different
             * types).  Since it is convenient for callers to think of
             * _bt_compare as comparing the scankey to the index item, we have
             * to flip the sign of the comparison result.  (Unless it's a DESC
             * column, in which case we *don't* flip the sign.)
             */
            result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
                                                     scankey->sk_collation,
                                                     datum,
                                                     scankey->sk_argument));

            if (!(scankey->sk_flags & SK_BT_DESC))
                INVERT_COMPARE_RESULT(result);
        }

        /* if the keys are unequal, return the difference */
        if (result != 0)
            return result;

        scankey++;
    }

    /*
     * All non-truncated attributes (other than heap TID) were found to be
     * equal.  Treat truncated attributes as minus infinity when scankey has a
     * key attribute value that would otherwise be compared directly.
     *
     * Note: it doesn't matter if ntupatts includes non-key attributes;
     * scankey won't, so explicitly excluding non-key attributes isn't
     * necessary.
     */
    if (key->keysz > ntupatts)
        return 1;

    /*
     * Use the heap TID attribute and scantid to try to break the tie.  The
     * rules are the same as any other key attribute -- only the
     * representation differs.
     */
    heapTid = BTreeTupleGetHeapTID(itup);
    if (key->scantid == NULL)
    {
        /*
         * Forward scans have a scankey that is considered greater than a
         * truncated pivot tuple if and when the scankey has equal values for
         * attributes up to and including the least significant untruncated
         * attribute in tuple.  Even attributes that were omitted from the
         * scan key are considered greater than -inf truncated attributes.
         * (See _bt_binsrch for an explanation of our backward scan behavior.)
         *
         * For example, if an index has the minimum two attributes (single
         * user key attribute, plus heap TID attribute), and a page's high key
         * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
         * will not descend to the page to the left.  The search will descend
         * right instead.  The truncated attribute in pivot tuple means that
         * all non-pivot tuples on the page to the left are strictly < 'foo',
         * so it isn't necessary to descend left.  In other words, search
         * doesn't have to descend left because it isn't interested in a match
         * that has a heap TID value of -inf.
         *
         * Note: the heap TID part of the test ensures that scankey is being
         * compared to a pivot tuple with one or more truncated -inf key
         * attributes.  The heap TID attribute is the last key attribute in
         * every index, of course, but other than that it isn't special.
         */
        if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
            key->heapkeyspace)
            return 1;

        /* All provided scankey arguments found to be equal */
        return 0;
    }

    /*
     * Treat truncated heap TID as minus infinity, since scankey has a key
     * attribute value (scantid) that would otherwise be compared directly
     */
    Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
    if (heapTid == NULL)
        return 1;

    /*
     * Scankey must be treated as equal to a posting list tuple if its scantid
     * value falls within the range of the posting list.  In all other cases
     * there can only be a single heap TID value, which is compared directly
     * with scantid.
     */
    Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
    result = ItemPointerCompare(key->scantid, heapTid);
    if (result <= 0 || !BTreeTupleIsPosting(itup))
        return result;
    else
    {
        result = ItemPointerCompare(key->scantid,
                                    BTreeTupleGetMaxHeapTID(itup));
        if (result > 0)
            return 1;
    }

    return 0;
}
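
/*
 * NULL-ordering sketch, implied by the branches above: with the default
 * NULLS LAST on an ASC column, a NULL scan key compares as greater than any
 * non-NULL item (result 1), so NULLs sort to the high end of the index.
 * Under NULLS FIRST (SK_BT_NULLS_FIRST) the signs flip and NULLs sort low.
 * DESC columns skip INVERT_COMPARE_RESULT in the ordinary-datum case, which
 * is what reverses the sort order there.
 */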

/*
 *	_bt_first() -- Find the first item in a scan.
 *
 *		We need to be clever about the direction of scan, the search
 *		conditions, and the tree ordering.  We find the first item (or,
 *		if backwards scan, the last item) in the tree that satisfies the
 *		qualifications in the scan key.  On success exit, the page containing
 *		the current index tuple is pinned but not locked, and data about
 *		the matching tuple(s) on the page has been loaded into so->currPos.
 *		scan->xs_heaptid is set to the heap TID of the current tuple, and if
 *		requested, scan->xs_itup points to a copy of the index tuple.
 *
 * If there are no matching items in the index, we return false, with no
 * pins or locks held.
 *
 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 * are both search-type scankeys (see nbtree/README for more about this).
 * Within this routine, we build a temporary insertion-type scankey to use
 * in locating the scan start position.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    BTStack     stack;
    OffsetNumber offnum;
    StrategyNumber strat;
    BTScanInsertData inskey;
    ScanKey     startKeys[INDEX_MAX_KEYS];
    ScanKeyData notnullkeys[INDEX_MAX_KEYS];
    int         keysz = 0;
    int         i;
    bool        status;
    StrategyNumber strat_total;
    BTScanPosItem *currItem;
    BlockNumber blkno;

    Assert(!BTScanPosIsValid(so->currPos));

    /*
     * Examine the scan keys and eliminate any redundant keys; also mark the
     * keys that must be matched to continue the scan.
     */
    _bt_preprocess_keys(scan);

    /*
     * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     * never be satisfied (eg, x == 1 AND x > 2).
     */
    if (!so->qual_ok)
    {
        _bt_parallel_done(scan);
        return false;
    }

    /*
     * For parallel scans, get the starting page from shared state.  If the
     * scan has not started, proceed to find the first leaf page in the usual
     * way while keeping other participating processes waiting.  If the scan
     * has already begun, use the page number from the shared structure.
     *
     * When a parallel scan has another primitive index scan scheduled, a
     * parallel worker will seize the scan for that purpose now.  This is
     * similar to the case where the top-level scan hasn't started.
     */
    if (scan->parallel_scan != NULL)
    {
        status = _bt_parallel_seize(scan, &blkno, true);

        /*
         * Initialize arrays (when _bt_parallel_seize didn't already set up
         * the next primitive index scan)
         */
        if (so->numArrayKeys && !so->needPrimScan)
            _bt_start_array_keys(scan, dir);

        if (!status)
            return false;
        else if (blkno == P_NONE)
        {
            _bt_parallel_done(scan);
            return false;
        }
        else if (blkno != InvalidBlockNumber)
        {
            if (!_bt_parallel_readpage(scan, blkno, dir))
                return false;
            goto readcomplete;
        }
    }
    else if (so->numArrayKeys && !so->needPrimScan)
    {
        /*
         * First _bt_first call (for current btrescan) without parallelism.
         *
         * Initialize arrays, and the corresponding scan keys that were just
         * output by _bt_preprocess_keys.
         */
        _bt_start_array_keys(scan, dir);
    }

    /*
     * Count an indexscan for stats, now that we know that we'll call
     * _bt_search/_bt_endpoint below
     */
    pgstat_count_index_scan(rel);

    /*----------
     * Examine the scan keys to discover where we need to start the scan.
     *
     * We want to identify the keys that can be used as starting boundaries;
     * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     * a backwards scan.  We can use keys for multiple attributes so long as
     * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     * a > or < boundary or find an attribute with no boundary (which can be
     * thought of as the same as "> -infinity"), we can't use keys for any
     * attributes to its right, because it would break our simplistic notion
     * of what initial positioning strategy to use.
     *
     * When the scan keys include cross-type operators, _bt_preprocess_keys
     * may not be able to eliminate redundant keys; in such cases we will
     * arbitrarily pick a usable one for each attribute.  This is correct
     * but possibly not optimal behavior.  (For example, with keys like
     * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     * x=5 would be more efficient.)  Since the situation only arises given
     * a poorly-worded query plus an incomplete opfamily, live with it.
     *
     * When both equality and inequality keys appear for a single attribute
     * (again, only possible when cross-type operators appear), we *must*
     * select one of the equality keys for the starting point, because
     * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     * start at x=4, we will fail and stop before reaching x=10.  If multiple
     * equality quals survive preprocessing, however, it doesn't matter which
     * one we use --- by definition, they are either redundant or
     * contradictory.
     *
     * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     * If the index stores nulls at the end of the index we'll be starting
     * from, and we have no boundary key for the column (which means the key
     * we deduced NOT NULL from is an inequality key that constrains the other
     * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     * use as a boundary key.  If we didn't do this, we might find ourselves
     * traversing a lot of null entries at the start of the scan.
     *
     * In this loop, row-comparison keys are treated the same as keys on their
     * first (leftmost) columns.  We'll add on lower-order columns of the row
     * comparison below, if possible.
     *
     * The selected scan keys (at most one per index column) are remembered by
     * storing their addresses into the local startKeys[] array.
     *
     * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
     * the next primitive index scan (for scans with array keys) based in part
     * on an understanding of how it'll enable us to reposition the scan.
     * They're directly aware of how we'll sometimes cons up an explicit
     * SK_SEARCHNOTNULL key.  They'll even end primitive scans by applying a
     * symmetric "deduce NOT NULL" rule of their own.  This allows top-level
     * scans to skip large groups of NULLs through repeated deductions about
     * key strictness (for a required inequality key) and whether NULLs in the
     * key's index column are stored last or first (relative to non-NULLs).
     * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
     * need to be kept in sync.
     *----------
     */
    strat_total = BTEqualStrategyNumber;
    if (so->numberOfKeys > 0)
    {
        AttrNumber  curattr;
        ScanKey     chosen;
        ScanKey     impliesNN;
        ScanKey     cur;

        /*
         * chosen is the so-far-chosen key for the current attribute, if any.
         * We don't cast the decision in stone until we reach keys for the
         * next attribute.
         */
        curattr = 1;
        chosen = NULL;
        /* Also remember any scankey that implies a NOT NULL constraint */
        impliesNN = NULL;

        /*
         * Loop iterates from 0 to numberOfKeys inclusive; we use the last
         * pass to handle after-last-key processing.  Actual exit from the
         * loop is at one of the "break" statements below.
         */
        for (cur = so->keyData, i = 0;; cur++, i++)
        {
            if (i >= so->numberOfKeys || cur->sk_attno != curattr)
            {
                /*
                 * Done looking at keys for curattr.  If we didn't find a
                 * usable boundary key, see if we can deduce a NOT NULL key.
                 */
                if (chosen == NULL && impliesNN != NULL &&
                    ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                     ScanDirectionIsForward(dir) :
                     ScanDirectionIsBackward(dir)))
                {
                    /* Yes, so build the key in notnullkeys[keysz] */
                    chosen = &notnullkeys[keysz];
                    ScanKeyEntryInitialize(chosen,
                                           (SK_SEARCHNOTNULL | SK_ISNULL |
                                            (impliesNN->sk_flags &
                                             (SK_BT_DESC | SK_BT_NULLS_FIRST))),
                                           curattr,
                                           ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                                            BTGreaterStrategyNumber :
                                            BTLessStrategyNumber),
                                           InvalidOid,
                                           InvalidOid,
                                           InvalidOid,
                                           (Datum) 0);
                }

                /*
                 * If we still didn't find a usable boundary key, quit; else
                 * save the boundary key pointer in startKeys.
                 */
                if (chosen == NULL)
                    break;
                startKeys[keysz++] = chosen;

                /*
                 * Adjust strat_total, and quit if we have stored a > or <
                 * key.
                 */
                strat = chosen->sk_strategy;
                if (strat != BTEqualStrategyNumber)
                {
                    strat_total = strat;
                    if (strat == BTGreaterStrategyNumber ||
                        strat == BTLessStrategyNumber)
                        break;
                }

                /*
                 * Done if that was the last attribute, or if next key is not
                 * in sequence (implying no boundary key is available for the
                 * next attribute).
                 */
                if (i >= so->numberOfKeys ||
                    cur->sk_attno != curattr + 1)
                    break;

                /*
                 * Reset for next attr.
                 */
                curattr = cur->sk_attno;
                chosen = NULL;
                impliesNN = NULL;
            }

            /*
             * Can we use this key as a starting boundary for this attr?
             *
             * If not, does it imply a NOT NULL constraint?  (Because
             * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
             * *any* inequality key works for that; we need not test.)
             */
            switch (cur->sk_strategy)
            {
                case BTLessStrategyNumber:
                case BTLessEqualStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsBackward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
                case BTEqualStrategyNumber:
                    /* override any non-equality choice */
                    chosen = cur;
                    break;
                case BTGreaterEqualStrategyNumber:
                case BTGreaterStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsForward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
            }
        }
    }

    /*
     * If we found no usable boundary keys, we have to start from one end of
     * the tree.  Walk down that edge to the first or last key, and scan from
     * there.
     */
    if (keysz == 0)
    {
        bool        match;

        match = _bt_endpoint(scan, dir);

        if (!match)
        {
            /* No match, so mark (parallel) scan finished */
            _bt_parallel_done(scan);
        }

        return match;
    }

    /*
     * We want to start the scan somewhere within the index.  Set up an
     * insertion scankey we can use to search for the boundary point we
     * identified above.  The insertion scankey is built using the keys
     * identified by startKeys[].  (Remaining insertion scankey fields are
     * initialized after initial-positioning strategy is finalized.)
     */
    Assert(keysz <= INDEX_MAX_KEYS);
    for (i = 0; i < keysz; i++)
    {
        ScanKey     cur = startKeys[i];

        Assert(cur->sk_attno == i + 1);

        if (cur->sk_flags & SK_ROW_HEADER)
        {
            /*
             * Row comparison header: look to the first row member instead.
             *
             * The member scankeys are already in insertion format (ie, they
             * have sk_func = 3-way-comparison function), but we have to watch
             * out for nulls, which _bt_preprocess_keys didn't check.  A null
             * in the first row member makes the condition unmatchable, just
             * like qual_ok = false.
             */
            ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            if (subkey->sk_flags & SK_ISNULL)
            {
                _bt_parallel_done(scan);
                return false;
            }
            memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));

            /*
             * If the row comparison is the last positioning key we accepted,
             * try to add additional keys from the lower-order row members.
             * (If we accepted independent conditions on additional index
             * columns, we use those instead --- doesn't seem worth trying to
             * determine which is more restrictive.)  Note that this is OK
             * even if the row comparison is of ">" or "<" type, because the
             * condition applied to all but the last row member is effectively
             * ">=" or "<=", and so the extra keys don't break the positioning
             * scheme.  But, by the same token, if we aren't able to use all
             * the row members, then the part of the row comparison that we
             * did use has to be treated as just a ">=" or "<=" condition, and
             * so we'd better adjust strat_total accordingly.
             */
            if (i == keysz - 1)
            {
                bool        used_all_subkeys = false;

                Assert(!(subkey->sk_flags & SK_ROW_END));
                for (;;)
                {
                    subkey++;
                    Assert(subkey->sk_flags & SK_ROW_MEMBER);
                    if (subkey->sk_attno != keysz + 1)
                        break;  /* out-of-sequence, can't use it */
                    if (subkey->sk_strategy != cur->sk_strategy)
                        break;  /* wrong direction, can't use it */
                    if (subkey->sk_flags & SK_ISNULL)
                        break;  /* can't use null keys */
                    Assert(keysz < INDEX_MAX_KEYS);
                    memcpy(inskey.scankeys + keysz, subkey,
                           sizeof(ScanKeyData));
                    keysz++;
                    if (subkey->sk_flags & SK_ROW_END)
                    {
                        used_all_subkeys = true;
                        break;
                    }
                }
                if (!used_all_subkeys)
                {
                    switch (strat_total)
                    {
                        case BTLessStrategyNumber:
                            strat_total = BTLessEqualStrategyNumber;
                            break;
                        case BTGreaterStrategyNumber:
                            strat_total = BTGreaterEqualStrategyNumber;
                            break;
                    }
                }
                break;          /* done with outer loop */
            }
        }
        else
        {
            /*
             * Ordinary comparison key.  Transform the search-style scan key
             * to an insertion scan key by replacing the sk_func with the
             * appropriate btree comparison function.
             *
             * If scankey operator is not a cross-type comparison, we can use
             * the cached comparison function; otherwise gotta look it up in
             * the catalogs.  (That can't lead to infinite recursion, since no
             * indexscan initiated by syscache lookup will use cross-data-type
             * operators.)
             *
             * We support the convention that sk_subtype == InvalidOid means
             * the opclass input type; this is a hack to simplify life for
             * ScanKeyInit().
             */
            if (cur->sk_subtype == rel->rd_opcintype[i] ||
                cur->sk_subtype == InvalidOid)
            {
                FmgrInfo   *procinfo;

                procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
                ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
                                               cur->sk_flags,
                                               cur->sk_attno,
                                               InvalidStrategy,
                                               cur->sk_subtype,
                                               cur->sk_collation,
                                               procinfo,
                                               cur->sk_argument);
            }
            else
            {
                RegProcedure cmp_proc;

                cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
                                             rel->rd_opcintype[i],
                                             cur->sk_subtype,
                                             BTORDER_PROC);
                if (!RegProcedureIsValid(cmp_proc))
                    elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
                         BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
                         cur->sk_attno, RelationGetRelationName(rel));
                ScanKeyEntryInitialize(inskey.scankeys + i,
                                       cur->sk_flags,
                                       cur->sk_attno,
                                       InvalidStrategy,
                                       cur->sk_subtype,
                                       cur->sk_collation,
                                       cmp_proc,
                                       cur->sk_argument);
            }
        }
    }

    /*----------
     * Examine the selected initial-positioning strategy to determine exactly
     * where we need to start the scan, and set flag variables to control the
     * initial descent by _bt_search (and our _bt_binsrch call for the leaf
     * page _bt_search returns).
     *----------
     */
    _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
    inskey.anynullkeys = false; /* unused */
    inskey.scantid = NULL;
    inskey.keysz = keysz;
    switch (strat_total)
    {
        case BTLessStrategyNumber:

            inskey.nextkey = false;
            inskey.backward = true;
            break;

        case BTLessEqualStrategyNumber:

            inskey.nextkey = true;
            inskey.backward = true;
            break;

        case BTEqualStrategyNumber:

            /*
             * If a backward scan was specified, need to start with last equal
             * item not first one.
             */
            if (ScanDirectionIsBackward(dir))
            {
                /*
                 * This is the same as the <= strategy
                 */
                inskey.nextkey = true;
                inskey.backward = true;
            }
            else
            {
                /*
                 * This is the same as the >= strategy
                 */
                inskey.nextkey = false;
                inskey.backward = false;
            }
            break;

        case BTGreaterEqualStrategyNumber:

            /*
             * Find first item >= scankey
             */
            inskey.nextkey = false;
            inskey.backward = false;
            break;

        case BTGreaterStrategyNumber:

            /*
             * Find first item > scankey
             */
            inskey.nextkey = true;
            inskey.backward = false;
            break;

        default:
            /* can't get here, but keep compiler quiet */
            elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
            return false;
    }

    /*
     * Use the manufactured insertion scan key to descend the tree and
     * position ourselves on the target leaf page.
     */
    Assert(ScanDirectionIsBackward(dir) == inskey.backward);
    stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);

    /* don't need to keep the stack around... */
    _bt_freestack(stack);

    if (!BufferIsValid(buf))
    {
        /*
         * We only get here if the index is completely empty.  Lock relation
         * because nothing finer to lock exists.  Without a buffer lock, it's
         * possible for another transaction to insert data between
         * _bt_search() and PredicateLockRelation().  We have to try again
         * after taking the relation-level predicate lock, to close a narrow
         * window where we wouldn't scan concurrently inserted tuples, but the
         * writer wouldn't see our predicate lock.
         */
        if (IsolationIsSerializable())
        {
            PredicateLockRelation(rel, scan->xs_snapshot);
            stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
            _bt_freestack(stack);
        }

        if (!BufferIsValid(buf))
        {
            /*
             * Mark parallel scan as done, so that all the workers can finish
             * their scan.
             */
            _bt_parallel_done(scan);
            BTScanPosInvalidate(so->currPos);
            return false;
        }
    }

    PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);

    _bt_initialize_more_data(so, dir);

    /* position to the precise item on the page */
    offnum = _bt_binsrch(rel, &inskey, buf);
    Assert(!BTScanPosIsValid(so->currPos));
    so->currPos.buf = buf;

    /*
     * Now load data from the first page of the scan.
     *
     * If inskey.nextkey = false and inskey.backward = false, offnum is
     * positioned at the first non-pivot tuple >= inskey.scankeys.
     *
     * If inskey.nextkey = false and inskey.backward = true, offnum is
     * positioned at the last non-pivot tuple < inskey.scankeys.
     *
     * If inskey.nextkey = true and inskey.backward = false, offnum is
     * positioned at the first non-pivot tuple > inskey.scankeys.
     *
     * If inskey.nextkey = true and inskey.backward = true, offnum is
     * positioned at the last non-pivot tuple <= inskey.scankeys.
     *
     * It's possible that _bt_binsrch returned an offnum that is out of bounds
     * for the page.  For example, when inskey is both < the leaf page's high
     * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
     */
    if (!_bt_readpage(scan, dir, offnum, true))
    {
        /*
         * There's no actually-matching data on this page.  Try to advance to
         * the next page.  Return false if there's no matching data at all.
         */
        _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
        if (!_bt_steppage(scan, dir))
            return false;
    }
    else
    {
        /* We have at least one item to return as scan's first item */
        _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    }

readcomplete:
    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_heaptid = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}
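
/*
 * Quick reference for the strat_total switch above (implied by its cases):
 *
 *		strategy			nextkey		backward
 *		<					false		true
 *		<=					true		true
 *		= (backward scan)	true		true
 *		= (forward scan)	false		false
 *		>=					false		false
 *		>					true		false
 */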

/*
 *	_bt_next() -- Get the next item in a scan.
 *
 *		On entry, so->currPos describes the current page, which may be pinned
 *		but is not locked, and so->currPos.itemIndex identifies which item was
 *		previously returned.
 *
 *		On successful exit, scan->xs_heaptid is set to the TID of the next
 *		heap tuple, and if requested, scan->xs_itup points to a copy of the
 *		index tuple.  so->currPos is updated as needed.
 *
 *		On failure exit (no more tuples), we release pin and set
 *		so->currPos.buf to InvalidBuffer.
 */
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    BTScanPosItem *currItem;

    /*
     * Advance to next tuple on current page; or if there's no more, try to
     * step to the next page with data.
     */
    if (ScanDirectionIsForward(dir))
    {
        if (++so->currPos.itemIndex > so->currPos.lastItem)
        {
            if (!_bt_steppage(scan, dir))
                return false;
        }
    }
    else
    {
        if (--so->currPos.itemIndex < so->currPos.firstItem)
        {
            if (!_bt_steppage(scan, dir))
                return false;
        }
    }

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_heaptid = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}
1535 * _bt_readpage() -- Load data from current index page into so->currPos
1537 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1538 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1539 * they are updated as appropriate. All other fields of so->currPos are
1540 * initialized from scratch here.
1542 * We scan the current page starting at offnum and moving in the indicated
1543 * direction. All items matching the scan keys are loaded into currPos.items.
1544 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1545 * that there can be no more matching tuples in the current scan direction
1546 * (could just be for the current primitive index scan when scan has arrays).
1548 * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
1549 * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
1550 * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
1551 * its scan key was marked required), so _bt_first _must_ pass us an offnum
1552 * exactly at the beginning of where equal tuples are to be found. When we're
1553 * passed an offnum past the end of the page, we might still manage to stop
1554 * the scan on this page by calling _bt_checkkeys against the high key.
1556 * In the case of a parallel scan, caller must have called _bt_parallel_seize
1557 * prior to calling this function; this function will invoke
1558 * _bt_parallel_release before returning.
1560 * Returns true if any matching items found on the page, false if none.
1562 static bool
1563 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
1564 bool firstPage)
1566 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1567 Page page;
1568 BTPageOpaque opaque;
1569 OffsetNumber minoff;
1570 OffsetNumber maxoff;
1571 BTReadPageState pstate;
1572 bool arrayKeys;
1573 int itemIndex,
1574 indnatts;
1577 * We must have the buffer pinned and locked, but the usual macro can't be
1578 * used here; this function is what makes it good for currPos.
1580 Assert(BufferIsValid(so->currPos.buf));
1582 page = BufferGetPage(so->currPos.buf);
1583 opaque = BTPageGetOpaque(page);
1585 /* allow next page be processed by parallel worker */
1586 if (scan->parallel_scan)
1588 if (ScanDirectionIsForward(dir))
1589 pstate.prev_scan_page = opaque->btpo_next;
1590 else
1591 pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
1593 _bt_parallel_release(scan, pstate.prev_scan_page);
1596 indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
1597 arrayKeys = so->numArrayKeys != 0;
1598 minoff = P_FIRSTDATAKEY(opaque);
1599 maxoff = PageGetMaxOffsetNumber(page);
1601 /* initialize page-level state that we'll pass to _bt_checkkeys */
1602 pstate.dir = dir;
1603 pstate.minoff = minoff;
1604 pstate.maxoff = maxoff;
1605 pstate.finaltup = NULL;
1606 pstate.page = page;
1607 pstate.offnum = InvalidOffsetNumber;
1608 pstate.skip = InvalidOffsetNumber;
1609 pstate.continuescan = true; /* default assumption */
1610 pstate.prechecked = false;
1611 pstate.firstmatch = false;
1612 pstate.rechecks = 0;
1613 pstate.targetdistance = 0;
1616 * We note the buffer's block number so that we can release the pin later.
1617 * This allows us to re-read the buffer if it is needed again for hinting.
1619 so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1622 * We save the LSN of the page as we read it, so that we know whether it
1623 * safe to apply LP_DEAD hints to the page later. This allows us to drop
1624 * the pin for MVCC scans, which allows vacuum to avoid blocking.
1626 so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1629 * we must save the page's right-link while scanning it; this tells us
1630 * where to step right to after we're done with these items. There is no
1631 * corresponding need for the left-link, since splits always go right.
1633 so->currPos.nextPage = opaque->btpo_next;
1635 /* initialize tuple workspace to empty */
1636 so->currPos.nextTupleOffset = 0;
1639 * Now that the current page has been made consistent, the macro should be
1640 * good.
1642 Assert(BTScanPosIsPinned(so->currPos));
1645 * Prechecking the value of the continuescan flag for the last item on the
1646 * page (for backwards scan it will be the first item on a page). If we
1647 * observe it to be true, then it should be true for all other items. This
1648 * allows us to do significant optimizations in the _bt_checkkeys()
1649 * function for all the items on the page.
1651 * With the forward scan, we do this check for the last item on the page
1652 * instead of the high key. It's relatively likely that the most
1653 * significant column in the high key will be different from the
1654 * corresponding value from the last item on the page. So checking with
1655 * the last item on the page would give a more precise answer.
1657 * We skip this for the first page read by each (primitive) scan, to avoid
1658 * slowing down point queries. They typically don't stand to gain much
1659 * when the optimization can be applied, and are more likely to notice the
1660 * overhead of the precheck.
1662 * The optimization is unsafe and must be avoided whenever _bt_checkkeys
1663 * just set a low-order required array's key to the best available match
1664 * for a truncated -inf attribute value from the prior page's high key
1665 * (array element 0 is always the best available match in this scenario).
1666 * It's quite likely that matches for array element 0 begin on this page,
1667 * but the start of matches won't necessarily align with page boundaries.
1668 * When the start of matches is somewhere in the middle of this page, it
1669 * would be wrong to treat page's final non-pivot tuple as representative.
1670 * Doing so might lead us to treat some of the page's earlier tuples as
1671 * being part of a group of tuples thought to satisfy the required keys.
1673 * Note: Conversely, in the case where the scan's arrays just advanced
1674 * using the prior page's HIKEY _without_ advancement setting scanBehind,
1675 * the start of matches must be aligned with page boundaries, which makes
1676 * it safe to attempt the optimization here now. It's also safe when the
1677 * prior page's HIKEY simply didn't need to advance any required array. In
1678 * both cases we can safely assume that the _first_ tuple from this page
1679 * must be >= the current set of array keys/equality constraints. And so
1680 * if the final tuple is == those same keys (and also satisfies any
1681 * required < or <= strategy scan keys) during the precheck, we can safely
1682 * assume that this must also be true of all earlier tuples from the page.
1684 if (!firstPage && !so->scanBehind && minoff < maxoff)
1686 ItemId iid;
1687 IndexTuple itup;
1689 iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
1690 itup = (IndexTuple) PageGetItem(page, iid);
1692 /* Call with arrayKeys=false to avoid undesirable side-effects */
1693 _bt_checkkeys(scan, &pstate, false, itup, indnatts);
1694 pstate.prechecked = pstate.continuescan;
1695 pstate.continuescan = true; /* reset */
1698 if (ScanDirectionIsForward(dir))
1700 /* SK_SEARCHARRAY forward scans must provide high key up front */
1701 if (arrayKeys && !P_RIGHTMOST(opaque))
1703 ItemId iid = PageGetItemId(page, P_HIKEY);
1705 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1707 if (unlikely(so->oppositeDirCheck))
1709 Assert(so->scanBehind);
1712 * Last _bt_readpage call scheduled a recheck of finaltup for
1713 * required scan keys up to and including a > or >= scan key.
1715 * _bt_checkkeys won't consider the scanBehind flag unless the
1716 * scan is stopped by a scan key required in the current scan
1717 * direction. We need this recheck so that we'll notice when
1718 * all tuples on this page are still before the _bt_first-wise
1719 * start of matches for the current set of array keys.
1721 if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
1723 /* Schedule another primitive index scan after all */
1724 so->currPos.moreRight = false;
1725 so->needPrimScan = true;
1726 return false;
1729 /* Deliberately don't unset scanBehind flag just yet */
1733 /* load items[] in ascending order */
1734 itemIndex = 0;
1736 offnum = Max(offnum, minoff);
1738 while (offnum <= maxoff)
1740 ItemId iid = PageGetItemId(page, offnum);
1741 IndexTuple itup;
1742 bool passes_quals;
1745 * If the scan specifies not to return killed tuples, then we
1746 * treat a killed tuple as not passing the qual.
1748 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1750 offnum = OffsetNumberNext(offnum);
1751 continue;
1754 itup = (IndexTuple) PageGetItem(page, iid);
1755 Assert(!BTreeTupleIsPivot(itup));
1757 pstate.offnum = offnum;
1758 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1759 itup, indnatts);
1762 * Check if we need to skip ahead to a later tuple (only possible
1763 * when the scan uses array keys)
1765 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1767 Assert(!passes_quals && pstate.continuescan);
1768 Assert(offnum < pstate.skip);
1770 offnum = pstate.skip;
1771 pstate.skip = InvalidOffsetNumber;
1772 continue;
1775 if (passes_quals)
1777 /* tuple passes all scan key conditions */
1778 pstate.firstmatch = true;
1779 if (!BTreeTupleIsPosting(itup))
1781 /* Remember it */
1782 _bt_saveitem(so, itemIndex, offnum, itup);
1783 itemIndex++;
1785 else
1787 int tupleOffset;
1790 * Set up state to return posting list, and remember first
1791 * TID
1793 tupleOffset =
1794 _bt_setuppostingitems(so, itemIndex, offnum,
1795 BTreeTupleGetPostingN(itup, 0),
1796 itup);
1797 itemIndex++;
1798 /* Remember additional TIDs */
1799 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1801 _bt_savepostingitem(so, itemIndex, offnum,
1802 BTreeTupleGetPostingN(itup, i),
1803 tupleOffset);
1804 itemIndex++;
1808 /* When !continuescan, there can't be any more matches, so stop */
1809 if (!pstate.continuescan)
1810 break;
1812 offnum = OffsetNumberNext(offnum);
1816 * We don't need to visit the page to the right when the high key
1817 * indicates that no more matches will be found there.
1819 * Checking the high key like this works out more often than you might
1820 * think. Leaf page splits pick a split point between the two most
1821 * dissimilar tuples (this is weighed against the need to evenly share
1822 * free space). Leaf pages with high key attribute values that can
1823 * only appear on non-pivot tuples on the right sibling page are
1824 * common.
1826 if (pstate.continuescan && !P_RIGHTMOST(opaque))
1828 ItemId iid = PageGetItemId(page, P_HIKEY);
1829 IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1830 int truncatt;
1832 truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1833 pstate.prechecked = false; /* precheck didn't cover HIKEY */
1834 _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1837 if (!pstate.continuescan)
1838 so->currPos.moreRight = false;
1840 Assert(itemIndex <= MaxTIDsPerBTreePage);
1841 so->currPos.firstItem = 0;
1842 so->currPos.lastItem = itemIndex - 1;
1843 so->currPos.itemIndex = 0;
1845 else
1847 /* SK_SEARCHARRAY backward scans must provide final tuple up front */
1848 if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
1850 ItemId iid = PageGetItemId(page, minoff);
1852 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1855 /* load items[] in descending order */
1856 itemIndex = MaxTIDsPerBTreePage;
1858 offnum = Min(offnum, maxoff);
1860 while (offnum >= minoff)
1862 ItemId iid = PageGetItemId(page, offnum);
1863 IndexTuple itup;
1864 bool tuple_alive;
1865 bool passes_quals;
1868 * If the scan specifies not to return killed tuples, then we
1869 * treat a killed tuple as not passing the qual. Most of the
1870 * time, it's a win to not bother examining the tuple's index
1871 * keys, but just skip to the next tuple (previous, actually,
1872 * since we're scanning backwards). However, if this is the first
1873 * tuple on the page, we do check the index keys, to prevent
1874 * uselessly advancing to the page to the left. This is similar
1875 * to the high key optimization used by forward scans.
1877 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1879 Assert(offnum >= P_FIRSTDATAKEY(opaque));
1880 if (offnum > P_FIRSTDATAKEY(opaque))
1882 offnum = OffsetNumberPrev(offnum);
1883 continue;
1886 tuple_alive = false;
1888 else
1889 tuple_alive = true;
1891 itup = (IndexTuple) PageGetItem(page, iid);
1892 Assert(!BTreeTupleIsPivot(itup));
1894 pstate.offnum = offnum;
1895 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1896 itup, indnatts);
1899 * Check if we need to skip ahead to a tuple further on in the scan's
1900 * direction (only possible when the scan uses array keys)
1902 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1904 Assert(!passes_quals && pstate.continuescan);
1905 Assert(offnum > pstate.skip);
1907 offnum = pstate.skip;
1908 pstate.skip = InvalidOffsetNumber;
1909 continue;
1912 if (passes_quals && tuple_alive)
1914 /* tuple passes all scan key conditions */
1915 pstate.firstmatch = true;
1916 if (!BTreeTupleIsPosting(itup))
1918 /* Remember it */
1919 itemIndex--;
1920 _bt_saveitem(so, itemIndex, offnum, itup);
1922 else
1924 int tupleOffset;
1927 * Set up state to return posting list, and remember first
1928 * TID.
1930 * Note that we deliberately save/return items from
1931 * posting lists in ascending heap TID order for backwards
1932 * scans. This allows _bt_killitems() to make a
1933 * consistent assumption about the order of items
1934 * associated with the same posting list tuple.
1936 itemIndex--;
1937 tupleOffset =
1938 _bt_setuppostingitems(so, itemIndex, offnum,
1939 BTreeTupleGetPostingN(itup, 0),
1940 itup);
1941 /* Remember additional TIDs */
1942 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1944 itemIndex--;
1945 _bt_savepostingitem(so, itemIndex, offnum,
1946 BTreeTupleGetPostingN(itup, i),
1947 tupleOffset);
1951 /* When !continuescan, there can't be any more matches, so stop */
1952 if (!pstate.continuescan)
1953 break;
1955 offnum = OffsetNumberPrev(offnum);
1959 * We don't need to visit the page to the left when no more matches
1960 * will be found there.
1962 if (!pstate.continuescan || P_LEFTMOST(opaque))
1963 so->currPos.moreLeft = false;
1965 Assert(itemIndex >= 0);
1966 so->currPos.firstItem = itemIndex;
1967 so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
1968 so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
1971 return (so->currPos.firstItem <= so->currPos.lastItem);
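/*
 * Usage note (a sketch mirroring _bt_endpoint() below, not new behavior): on
 * a true return from _bt_readpage, the caller consumes the saved items
 * through so->currPos.itemIndex, e.g.
 *
 *     BTScanPosItem *item = &so->currPos.items[so->currPos.itemIndex];
 *
 *     scan->xs_heaptid = item->heapTid;
 *     if (scan->xs_want_itup)
 *         scan->xs_itup = (IndexTuple) (so->currTuples + item->tupleOffset);
 */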
1974 /* Save an index item into so->currPos.items[itemIndex] */
1975 static void
1976 _bt_saveitem(BTScanOpaque so, int itemIndex,
1977 OffsetNumber offnum, IndexTuple itup)
1979 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1981 Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1983 currItem->heapTid = itup->t_tid;
1984 currItem->indexOffset = offnum;
1985 if (so->currTuples)
1987 Size itupsz = IndexTupleSize(itup);
1989 currItem->tupleOffset = so->currPos.nextTupleOffset;
1990 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1991 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
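/*
 * Note on the MAXALIGN above (illustrative values, not project code):
 * rounding each tuple's size up to the platform's maximum alignment keeps the
 * next tuple copied into so->currTuples on an aligned boundary.  For example,
 * with 8-byte maximum alignment:
 *
 *     Size itupsz = 13;
 *     Size padded = MAXALIGN(itupsz);    // 16
 */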
1996 * Set up state to save TIDs/items from a single posting list tuple.
1998 * Saves an index item into so->currPos.items[itemIndex] for the TID that is
1999 * returned to the scan first. Second and subsequent TIDs from the posting
2000 * list should be saved by calling _bt_savepostingitem().
2002 * Returns the offset into tuple storage space at which the base tuple is
2003 * stored, if one was saved (otherwise returns 0).
2005 static int
2006 _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2007 ItemPointer heapTid, IndexTuple itup)
2009 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2011 Assert(BTreeTupleIsPosting(itup));
2013 currItem->heapTid = *heapTid;
2014 currItem->indexOffset = offnum;
2015 if (so->currTuples)
2017 /* Save base IndexTuple (truncate posting list) */
2018 IndexTuple base;
2019 Size itupsz = BTreeTupleGetPostingOffset(itup);
2021 itupsz = MAXALIGN(itupsz);
2022 currItem->tupleOffset = so->currPos.nextTupleOffset;
2023 base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
2024 memcpy(base, itup, itupsz);
2025 /* Defensively reduce work area index tuple header size */
2026 base->t_info &= ~INDEX_SIZE_MASK;
2027 base->t_info |= itupsz;
2028 so->currPos.nextTupleOffset += itupsz;
2030 return currItem->tupleOffset;
2033 return 0;
2037 * Save an index item into so->currPos.items[itemIndex] for the current
2038 * posting tuple.
2040 * Assumes that _bt_setuppostingitems() has already been called for the
2041 * current posting list tuple. Caller passes its return value as tupleOffset.
2043 static inline void
2044 _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2045 ItemPointer heapTid, int tupleOffset)
2047 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2049 currItem->heapTid = *heapTid;
2050 currItem->indexOffset = offnum;
2053 * Have index-only scans return the same base IndexTuple for every TID
2054 * that originates from the same posting list.
2056 if (so->currTuples)
2057 currItem->tupleOffset = tupleOffset;
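/*
 * Usage sketch (mirrors the posting list handling in _bt_readpage above; so,
 * itemIndex, offnum and itup are assumed to be set up as they are there).
 * One _bt_setuppostingitems() call saves the first TID; the remaining TIDs
 * reuse the tuple offset it returns:
 *
 *     int tupleOffset = _bt_setuppostingitems(so, itemIndex++, offnum,
 *                                             BTreeTupleGetPostingN(itup, 0),
 *                                             itup);
 *
 *     for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
 *         _bt_savepostingitem(so, itemIndex++, offnum,
 *                             BTreeTupleGetPostingN(itup, i), tupleOffset);
 */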
2061 * _bt_steppage() -- Step to next page containing valid data for scan
2063 * On entry, if so->currPos.buf is valid the buffer is pinned but not
2064 * locked; if pinned, we'll drop the pin before moving to the next page.
2067 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
2068 * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
2069 * to InvalidBuffer. We return true to indicate success.
2071 static bool
2072 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
2074 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2075 BlockNumber blkno = InvalidBlockNumber;
2076 bool status;
2078 Assert(BTScanPosIsValid(so->currPos));
2080 /* Before leaving current page, deal with any killed items */
2081 if (so->numKilled > 0)
2082 _bt_killitems(scan);
2085 * Before we modify currPos, make a copy of the page data if there was a
2086 * mark position that needs it.
2088 if (so->markItemIndex >= 0)
2090 /* bump pin on current buffer for assignment to mark buffer */
2091 if (BTScanPosIsPinned(so->currPos))
2092 IncrBufferRefCount(so->currPos.buf);
2093 memcpy(&so->markPos, &so->currPos,
2094 offsetof(BTScanPosData, items[1]) +
2095 so->currPos.lastItem * sizeof(BTScanPosItem));
2096 if (so->markTuples)
2097 memcpy(so->markTuples, so->currTuples,
2098 so->currPos.nextTupleOffset);
2099 so->markPos.itemIndex = so->markItemIndex;
2100 so->markItemIndex = -1;
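		/*
		 * Note on the memcpy size above: offsetof(BTScanPosData, items[1])
		 * is the size of the struct including exactly one items[] entry, so
		 * adding lastItem * sizeof(BTScanPosItem) copies items[0..lastItem]
		 * (lastItem + 1 entries) rather than the full MaxTIDsPerBTreePage
		 * array.
		 */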
2103 * If we're just about to start the next primitive index scan
2104 * (possible with a scan that has array keys, and needs to skip to
2105 * continue in the current scan direction), moreLeft/moreRight only
2106 * indicate the end of the current primitive index scan. They must
2107 * never be taken to indicate that the top-level index scan has ended
2108 * (that would be wrong).
2110 * We could handle this case by treating the current array keys as
2111 * markPos state. But depending on the current array state like this
2112 * would add complexity. Instead, we just unset markPos's copy of
2113 * moreRight or moreLeft (whichever might be affected), while making
2114 * btrestpos reset the scan's arrays to their initial scan positions.
2115 * In effect, btrestpos leaves advancing the arrays up to the first
2116 * _bt_readpage call (that takes place after it has restored markPos).
2118 Assert(so->markPos.dir == dir);
2119 if (so->needPrimScan)
2121 if (ScanDirectionIsForward(dir))
2122 so->markPos.moreRight = true;
2123 else
2124 so->markPos.moreLeft = true;
2128 if (ScanDirectionIsForward(dir))
2130 /* Walk right to the next page with data */
2131 if (scan->parallel_scan != NULL)
2134 * Seize the scan to get the next block number; if the scan has
2135 * ended already, bail out.
2137 status = _bt_parallel_seize(scan, &blkno, false);
2138 if (!status)
2140 /* release the previous buffer, if pinned */
2141 BTScanPosUnpinIfPinned(so->currPos);
2142 BTScanPosInvalidate(so->currPos);
2143 return false;
2146 else
2148 /* Not parallel, so use the previously-saved nextPage link. */
2149 blkno = so->currPos.nextPage;
2152 /* Remember we left a page with data */
2153 so->currPos.moreLeft = true;
2155 /* release the previous buffer, if pinned */
2156 BTScanPosUnpinIfPinned(so->currPos);
2158 else
2160 /* Remember we left a page with data */
2161 so->currPos.moreRight = true;
2163 if (scan->parallel_scan != NULL)
2166 * Seize the scan to get the current block number; if the scan has
2167 * ended already, bail out.
2169 status = _bt_parallel_seize(scan, &blkno, false);
2170 BTScanPosUnpinIfPinned(so->currPos);
2171 if (!status)
2173 BTScanPosInvalidate(so->currPos);
2174 return false;
2177 else
2179 /* Not parallel, so just use our own notion of the current page */
2180 blkno = so->currPos.currPage;
2184 if (!_bt_readnextpage(scan, blkno, dir))
2185 return false;
2187 /* We have at least one item to return as scan's next item */
2188 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2190 return true;
2194 * _bt_readnextpage() -- Read next page containing valid data for scan
2196 * On success exit, so->currPos is updated to contain data from the next
2197 * interesting page, and we return true. Caller must release the lock (and
2198 * maybe the pin) on the buffer on success exit.
2200 * If there are no more matching records in the given direction, we drop all
2201 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
2203 static bool
2204 _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2206 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2207 Relation rel;
2208 Page page;
2209 BTPageOpaque opaque;
2210 bool status;
2212 rel = scan->indexRelation;
2214 if (ScanDirectionIsForward(dir))
2216 for (;;)
2219 * If we're at the end of the scan, give up and mark the parallel scan
2220 * as done, so that all the workers can finish their scans
2222 if (blkno == P_NONE || !so->currPos.moreRight)
2224 _bt_parallel_done(scan);
2225 BTScanPosInvalidate(so->currPos);
2226 return false;
2228 /* check for interrupts while we're not holding any buffer lock */
2229 CHECK_FOR_INTERRUPTS();
2230 /* step right one page */
2231 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2232 page = BufferGetPage(so->currPos.buf);
2233 opaque = BTPageGetOpaque(page);
2234 /* check for deleted page */
2235 if (!P_IGNORE(opaque))
2237 PredicateLockPage(rel, blkno, scan->xs_snapshot);
2238 /* see if there are any matches on this page */
2239 /* note that this will clear moreRight if we can stop */
2240 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
2241 break;
2243 else if (scan->parallel_scan != NULL)
2245 /* allow the next page to be processed by a parallel worker */
2246 _bt_parallel_release(scan, opaque->btpo_next);
2249 /* nope, keep going */
2250 if (scan->parallel_scan != NULL)
2252 _bt_relbuf(rel, so->currPos.buf);
2253 status = _bt_parallel_seize(scan, &blkno, false);
2254 if (!status)
2256 BTScanPosInvalidate(so->currPos);
2257 return false;
2260 else
2262 blkno = opaque->btpo_next;
2263 _bt_relbuf(rel, so->currPos.buf);
2267 else
2270 * Should only happen in parallel cases, when some other backend
2271 * advanced the scan.
2273 if (so->currPos.currPage != blkno)
2275 BTScanPosUnpinIfPinned(so->currPos);
2276 so->currPos.currPage = blkno;
2279 /* Done if we know that the left sibling link isn't of interest */
2280 if (!so->currPos.moreLeft)
2282 BTScanPosUnpinIfPinned(so->currPos);
2283 _bt_parallel_done(scan);
2284 BTScanPosInvalidate(so->currPos);
2285 return false;
2289 * Walk left to the next page with data. This is much more complex
2290 * than the walk-right case because of the possibility that the page
2291 * to our left splits while we are in flight to it, plus the
2292 * possibility that the page we were on gets deleted after we leave
2293 * it. See nbtree/README for details.
2295 * It might be possible to rearrange this code to have less overhead
2296 * in pinning and locking, but that would require capturing the left
2297 * sibling block number when the page is initially read, and then
2298 * optimistically starting there (rather than pinning the page twice).
2299 * It is not clear that this would be worth the complexity.
2301 if (BTScanPosIsPinned(so->currPos))
2302 _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2303 else
2304 so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
2306 for (;;)
2308 /* Done if we know that the left sibling link isn't of interest */
2309 if (!so->currPos.moreLeft)
2311 _bt_relbuf(rel, so->currPos.buf);
2312 _bt_parallel_done(scan);
2313 BTScanPosInvalidate(so->currPos);
2314 return false;
2317 /* Step to next physical page */
2318 so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
2320 /* if we're physically at end of index, return failure */
2321 if (so->currPos.buf == InvalidBuffer)
2323 _bt_parallel_done(scan);
2324 BTScanPosInvalidate(so->currPos);
2325 return false;
2329 * Okay, we managed to move left to a non-deleted page. Done if
2330 * it's not half-dead and contains matching tuples. Else loop back
2331 * and do it all again.
2333 page = BufferGetPage(so->currPos.buf);
2334 opaque = BTPageGetOpaque(page);
2335 if (!P_IGNORE(opaque))
2337 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
2338 /* see if there are any matches on this page */
2339 /* note that this will clear moreLeft if we can stop */
2340 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
2341 break;
2343 else if (scan->parallel_scan != NULL)
2345 /* allow the next page to be processed by a parallel worker */
2346 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
2350 * For parallel scans, get the last page scanned as it is quite
2351 * possible that by the time we try to seize the scan, some other
2352 * worker has already advanced the scan to a different page. We
2353 * must continue based on the latest page scanned by any worker.
2355 if (scan->parallel_scan != NULL)
2357 _bt_relbuf(rel, so->currPos.buf);
2358 status = _bt_parallel_seize(scan, &blkno, false);
2359 if (!status)
2361 BTScanPosInvalidate(so->currPos);
2362 return false;
2364 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2369 return true;
2373 * _bt_parallel_readpage() -- Read current page containing valid data for scan
2375 * On success, we release the lock (and possibly the pin) on the buffer and
2376 * return true.
2378 static bool
2379 _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2381 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2383 Assert(!so->needPrimScan);
2385 _bt_initialize_more_data(so, dir);
2387 if (!_bt_readnextpage(scan, blkno, dir))
2388 return false;
2390 /* We have at least one item to return as scan's next item */
2391 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2393 return true;
2397 * _bt_walk_left() -- step left one page, if possible
2399 * The given buffer must be pinned and read-locked. Both the pin and the
2400 * lock will be dropped before stepping left. On return, we hold a pin and
2401 * read lock on the returned page instead.
2403 * Returns InvalidBuffer if there is no page to the left (no lock is held
2404 * in that case).
2406 * It is possible for the returned leaf page to be half-dead; caller must
2407 * check that condition and step left again when required.
2409 static Buffer
2410 _bt_walk_left(Relation rel, Buffer buf)
2412 Page page;
2413 BTPageOpaque opaque;
2415 page = BufferGetPage(buf);
2416 opaque = BTPageGetOpaque(page);
2418 for (;;)
2420 BlockNumber obknum;
2421 BlockNumber lblkno;
2422 BlockNumber blkno;
2423 int tries;
2425 /* if we're at end of tree, release buf and return failure */
2426 if (P_LEFTMOST(opaque))
2428 _bt_relbuf(rel, buf);
2429 break;
2431 /* remember original page we are stepping left from */
2432 obknum = BufferGetBlockNumber(buf);
2433 /* step left */
2434 blkno = lblkno = opaque->btpo_prev;
2435 _bt_relbuf(rel, buf);
2436 /* check for interrupts while we're not holding any buffer lock */
2437 CHECK_FOR_INTERRUPTS();
2438 buf = _bt_getbuf(rel, blkno, BT_READ);
2439 page = BufferGetPage(buf);
2440 opaque = BTPageGetOpaque(page);
2443 * If this isn't the page we want, walk right till we find what we
2444 * want --- but go no more than four hops (an arbitrary limit). If we
2445 * don't find the correct page by then, the most likely bet is that
2446 * the original page got deleted and isn't in the sibling chain at all
2447 * anymore, not that its left sibling got split more than four times.
2449 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2450 * because half-dead pages are still in the sibling chain.
2452 tries = 0;
2453 for (;;)
2455 if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2457 /* Found desired page, return it */
2458 return buf;
2460 if (P_RIGHTMOST(opaque) || ++tries > 4)
2461 break;
2462 blkno = opaque->btpo_next;
2463 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2464 page = BufferGetPage(buf);
2465 opaque = BTPageGetOpaque(page);
2468 /* Return to the original page to see what's up */
2469 buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2470 page = BufferGetPage(buf);
2471 opaque = BTPageGetOpaque(page);
2472 if (P_ISDELETED(opaque))
2475 * It was deleted. Move right to first nondeleted page (there
2476 * must be one); that is the page that has acquired the deleted
2477 * one's keyspace, so stepping left from it will take us where we
2478 * want to be.
2480 for (;;)
2482 if (P_RIGHTMOST(opaque))
2483 elog(ERROR, "fell off the end of index \"%s\"",
2484 RelationGetRelationName(rel));
2485 blkno = opaque->btpo_next;
2486 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2487 page = BufferGetPage(buf);
2488 opaque = BTPageGetOpaque(page);
2489 if (!P_ISDELETED(opaque))
2490 break;
2494 * Now return to top of loop, resetting obknum to point to this
2495 * nondeleted page, and try again.
2498 else
2501 * It wasn't deleted; the explanation had better be that the page
2502 * to the left got split or deleted. Without this check, we'd go
2503 * into an infinite loop if there's anything wrong.
2505 if (opaque->btpo_prev == lblkno)
2506 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2507 obknum, RelationGetRelationName(rel));
2508 /* Okay to try again with new lblkno value */
2512 return InvalidBuffer;
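/*
 * Illustrative analogue (hypothetical list code, not nbtree code): the loop
 * above amounts to re-verifying a cached left-link after the fact, because
 * the left sibling may have split after we read its block number:
 *
 *     left = read_page(cur->prev);       // may already be stale
 *     while (left->next != cur_blkno)    // left sibling split in flight?
 *         left = read_page(left->next);  // walk right to reconverge
 *
 * nbtree additionally bounds the walk (four hops) and restarts from the
 * original page when it fails, since that page may itself have been deleted.
 */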
2516 * _bt_get_endpoint() -- Find the first or last page on a given tree level
2518 * If the index is empty, we will return InvalidBuffer; any other failure
2519 * condition causes ereport(). We will not return a dead page.
2521 * The returned buffer is pinned and read-locked.
2523 Buffer
2524 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2526 Buffer buf;
2527 Page page;
2528 BTPageOpaque opaque;
2529 OffsetNumber offnum;
2530 BlockNumber blkno;
2531 IndexTuple itup;
2534 * If we are looking for a leaf page, okay to descend from fast root;
2535 * otherwise better descend from true root. (There is no point in being
2536 * smarter about intermediate levels.)
2538 if (level == 0)
2539 buf = _bt_getroot(rel, NULL, BT_READ);
2540 else
2541 buf = _bt_gettrueroot(rel);
2543 if (!BufferIsValid(buf))
2544 return InvalidBuffer;
2546 page = BufferGetPage(buf);
2547 opaque = BTPageGetOpaque(page);
2549 for (;;)
2552 * If we landed on a deleted page, step right to find a live page
2553 * (there must be one). Also, if we want the rightmost page, step
2554 * right if needed to get to it (this could happen if the page split
2555 * since we obtained a pointer to it).
2557 while (P_IGNORE(opaque) ||
2558 (rightmost && !P_RIGHTMOST(opaque)))
2560 blkno = opaque->btpo_next;
2561 if (blkno == P_NONE)
2562 elog(ERROR, "fell off the end of index \"%s\"",
2563 RelationGetRelationName(rel));
2564 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2565 page = BufferGetPage(buf);
2566 opaque = BTPageGetOpaque(page);
2569 /* Done? */
2570 if (opaque->btpo_level == level)
2571 break;
2572 if (opaque->btpo_level < level)
2573 ereport(ERROR,
2574 (errcode(ERRCODE_INDEX_CORRUPTED),
2575 errmsg_internal("btree level %u not found in index \"%s\"",
2576 level, RelationGetRelationName(rel))));
2578 /* Descend to leftmost or rightmost child page */
2579 if (rightmost)
2580 offnum = PageGetMaxOffsetNumber(page);
2581 else
2582 offnum = P_FIRSTDATAKEY(opaque);
2584 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2585 blkno = BTreeTupleGetDownLink(itup);
2587 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2588 page = BufferGetPage(buf);
2589 opaque = BTPageGetOpaque(page);
2592 return buf;
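/*
 * Usage sketch (how _bt_endpoint() below uses this for a forward scan): the
 * leftmost leaf page is fetched with level 0 and rightmost = false:
 *
 *     Buffer leaf = _bt_get_endpoint(rel, 0, false);
 *
 *     if (!BufferIsValid(leaf))
 *         ...                     // empty index
 */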
2596 * _bt_endpoint() -- Find the first or last page in the index, and scan
2597 * from there to the first key satisfying all the quals.
2599 * This is used by _bt_first() to set up a scan when we've determined
2600 * that the scan must start at the beginning or end of the index (for
2601 * a forward or backward scan respectively). Exit conditions are the
2602 * same as for _bt_first().
2604 static bool
2605 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2607 Relation rel = scan->indexRelation;
2608 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2609 Buffer buf;
2610 Page page;
2611 BTPageOpaque opaque;
2612 OffsetNumber start;
2613 BTScanPosItem *currItem;
2616 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2617 * version of _bt_search(). We don't maintain a stack since we know we
2618 * won't need it.
2620 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2622 if (!BufferIsValid(buf))
2625 * Empty index. Lock the whole relation, as nothing finer to lock
2626 * exists.
2628 PredicateLockRelation(rel, scan->xs_snapshot);
2629 BTScanPosInvalidate(so->currPos);
2630 return false;
2633 PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2634 page = BufferGetPage(buf);
2635 opaque = BTPageGetOpaque(page);
2636 Assert(P_ISLEAF(opaque));
2638 if (ScanDirectionIsForward(dir))
2640 /* There could be dead pages to the left, so not this: */
2641 /* Assert(P_LEFTMOST(opaque)); */
2643 start = P_FIRSTDATAKEY(opaque);
2645 else if (ScanDirectionIsBackward(dir))
2647 Assert(P_RIGHTMOST(opaque));
2649 start = PageGetMaxOffsetNumber(page);
2651 else
2653 elog(ERROR, "invalid scan direction: %d", (int) dir);
2654 start = 0; /* keep compiler quiet */
2657 /* remember which buffer we have pinned */
2658 so->currPos.buf = buf;
2660 _bt_initialize_more_data(so, dir);
2663 * Now load data from the first page of the scan.
2665 if (!_bt_readpage(scan, dir, start, true))
2668 * There's no actually-matching data on this page. Try to advance to
2669 * the next page. Return false if there's no matching data at all.
2671 _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2672 if (!_bt_steppage(scan, dir))
2673 return false;
2675 else
2677 /* We have at least one item to return as scan's first item */
2678 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2681 /* OK, itemIndex says what to return */
2682 currItem = &so->currPos.items[so->currPos.itemIndex];
2683 scan->xs_heaptid = currItem->heapTid;
2684 if (scan->xs_want_itup)
2685 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2687 return true;
2691 * _bt_initialize_more_data() -- initialize currPos's moreLeft, moreRight and
2692 * scan direction for the given dir
2694 static inline void
2695 _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2697 so->currPos.dir = dir;
2698 if (so->needPrimScan)
2700 Assert(so->numArrayKeys);
2702 so->currPos.moreLeft = true;
2703 so->currPos.moreRight = true;
2704 so->needPrimScan = false;
2706 else if (ScanDirectionIsForward(dir))
2708 so->currPos.moreLeft = false;
2709 so->currPos.moreRight = true;
2711 else
2713 so->currPos.moreLeft = true;
2714 so->currPos.moreRight = false;
2716 so->numKilled = 0; /* just paranoia */
2717 so->markItemIndex = -1; /* ditto */
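/*
 * Summary of the cases above:
 *
 *     state                      moreLeft    moreRight
 *     ------------------------   --------    ---------
 *     needPrimScan (any dir)     true        true
 *     forward scan               false       true
 *     backward scan              true        false
 */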