/*-------------------------------------------------------------------------
 *
 * nbtsearch.c
 *	  Search code for postgres btrees.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsearch.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"

static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
							Buffer buf, bool forupdate, BTStack stack,
							int access);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int	_bt_binsrch_posting(BTScanInsert key, Page page,
								OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
						 OffsetNumber offnum, bool firstPage);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
						 OffsetNumber offnum, IndexTuple itup);
static int	_bt_setuppostingitems(BTScanOpaque so, int itemIndex,
								  OffsetNumber offnum, ItemPointer heapTid,
								  IndexTuple itup);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
									   OffsetNumber offnum,
									   ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
							 ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
								  ScanDirection dir);
static Buffer _bt_walk_left(Relation rel, Buffer buf);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);

/*
 *	_bt_drop_lock_and_maybe_pin()
 *
 * Unlock the buffer; and if it is safe to release the pin, do that, too.
 * This will prevent vacuum from stalling in a blocked state trying to read a
 * page when a cursor is sitting on it.
 *
 * See nbtree/README section on making concurrent TID recycling safe.
 */
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
	_bt_unlockbuf(scan->indexRelation, sp->buf);

	if (IsMVCCSnapshot(scan->xs_snapshot) &&
		RelationNeedsWAL(scan->indexRelation) &&
		!scan->xs_want_itup)
	{
		ReleaseBuffer(sp->buf);
		sp->buf = InvalidBuffer;
	}
}
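
/*
 * Illustrative sketch of the caller's side of this contract, drawn from
 * _bt_first below.  The lock is always dropped, while the pin is retained
 * only when it is still needed to keep TID recycling safe (non-MVCC
 * snapshot, unlogged/temp index, or a scan that wants index tuples):
 *
 *		if (!_bt_readpage(scan, dir, offnum, true))
 *			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 *		else
 *			_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
 */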

/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * The passed scankey is an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * Return value is a stack of parent-page pointers (i.e. there is no entry for
 * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
 * which is locked and pinned.  No locks are held on the parent pages,
 * however!
 *
 * The returned buffer is locked according to access parameter.  Additionally,
 * access = BT_WRITE will allow an empty root page to be created and returned.
 * When access = BT_READ, an empty index will result in *bufP being set to
 * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
 * during the search will be finished.
 *
 * heaprel must be provided by callers that pass access = BT_WRITE, since we
 * might need to allocate a new root page for caller -- see _bt_allocbuf.
 */
BTStack
_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
		   int access)
{
	BTStack		stack_in = NULL;
	int			page_access = BT_READ;

	/* heaprel must be set whenever _bt_allocbuf is reachable */
	Assert(access == BT_READ || access == BT_WRITE);
	Assert(access == BT_READ || heaprel != NULL);

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, heaprel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber child;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read its
		 * downlink in its parent page (or the metapage).  If it has, we may
		 * need to move right to its new sibling.  Do that.
		 *
		 * In write-mode, allow _bt_moveright to finish any incomplete splits
		 * along the way.  Strictly speaking, we'd only need to finish an
		 * incomplete split on the leaf page we're about to insert to, not on
		 * any of the upper levels (internal pages with incomplete splits are
		 * also taken care of in _bt_getstackbuf).  But this is a good
		 * opportunity to finish splits of internal pages too.
		 */
		*bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
							  stack_in, page_access);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = BTPageGetOpaque(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate pivot tuple on this page.  Its downlink points
		 * to the child page that we're about to descend to.
		 */
		offnum = _bt_binsrch(rel, key, *bufP);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
		child = BTreeTupleGetDownLink(itup);

		/*
		 * We need to save the location of the pivot tuple we chose in a new
		 * stack entry for this page/level.  If caller ends up splitting a
		 * page one level down, it usually ends up inserting a new pivot
		 * tuple/downlink immediately after the location recorded here.
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
		new_stack->bts_offset = offnum;
		new_stack->bts_parent = stack_in;

		/*
		 * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
		 * we're on the level 1 and asked to lock leaf page in write mode,
		 * then lock next page in write mode, because it must be a leaf.
		 */
		if (opaque->btpo_level == 1 && access == BT_WRITE)
			page_access = BT_WRITE;

		/* drop the read lock on the page, then acquire one on its child */
		*bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	/*
	 * If we're asked to lock leaf in write mode, but didn't manage to, then
	 * relock.  This should only happen when the root page is a leaf page (and
	 * the only page in the index other than the metapage).
	 */
	if (access == BT_WRITE && page_access == BT_READ)
	{
		/* trade in our read lock for a write lock */
		_bt_unlockbuf(rel, *bufP);
		_bt_lockbuf(rel, *bufP, BT_WRITE);

		/*
		 * Race -- the leaf page may have split after we dropped the read lock
		 * but before we acquired a write lock.  If it has, we may need to
		 * move right to its new sibling.  Do that.
		 */
		*bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in,
							  BT_WRITE);
	}

	return stack_in;
}
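
/*
 * A minimal sketch of read-only usage, mirroring _bt_first below ("inskey"
 * stands for a fully initialized BTScanInsertData):
 *
 *		stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
 *		_bt_freestack(stack);		-- readers don't need parent pointers
 *		if (!BufferIsValid(buf))
 *			... index is completely empty ...
 *		offnum = _bt_binsrch(rel, &inskey, buf);
 *
 * BT_WRITE callers instead pass a valid heaprel and keep the stack around,
 * since _bt_getstackbuf uses it to re-find parent pages after a split.
 */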

/*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
 * When we follow a pointer to reach a page, it is possible that
 * the page has changed in the meanwhile.  If this happens, we're
 * guaranteed that the page has "split right" -- that is, that any
 * data that appeared on the page originally is either on the page
 * or strictly to the right of it.
 *
 * This routine decides whether or not we need to move right in the
 * tree by examining the high key entry on the page.  If that entry is
 * strictly less than the scankey, or <= the scankey in the
 * key.nextkey=true case, then we followed the wrong link and we need
 * to move right.
 *
 * The passed insertion-type scankey can omit the rightmost column(s) of the
 * index.  (see nbtree/README)
 *
 * When key.nextkey is false (the usual case), we are looking for the first
 * item >= key.  When key.nextkey is true, we are looking for the first item
 * strictly greater than key.
 *
 * If forupdate is true, we will attempt to finish any incomplete splits
 * that we encounter.  This is required when locking a target page for an
 * insertion, because we don't allow inserting on a page before the split is
 * completed.  'heaprel' and 'stack' are only used if forupdate is true.
 *
 * On entry, we have the buffer pinned and a lock of the type specified by
 * 'access'.  If we move right, we release the buffer and lock and acquire
 * the same on the right sibling.  Return value is the buffer we stop at.
 */
static Buffer
_bt_moveright(Relation rel,
			  Relation heaprel,
			  BTScanInsert key,
			  Buffer buf,
			  bool forupdate,
			  BTStack stack,
			  int access)
{
	Page		page;
	BTPageOpaque opaque;
	int32		cmpval;

	Assert(!forupdate || heaprel != NULL);

	/*
	 * When nextkey = false (normal case): if the scan key that brought us to
	 * this page is > the high key stored on the page, then the page has split
	 * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
	 * have some duplicates to the right as well as the left, but that's
	 * something that's only ever dealt with on the leaf level, after
	 * _bt_search has found an initial leaf page.)
	 *
	 * When nextkey = true: move right if the scan key is >= page's high key.
	 * (Note that key.scantid cannot be set in this case.)
	 *
	 * The page could even have split more than once, so scan as far as
	 * needed.
	 *
	 * We also have to move right if we followed a link that brought us to a
	 * dead page.
	 */
	cmpval = key->nextkey ? 0 : 1;

	for (;;)
	{
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		if (P_RIGHTMOST(opaque))
			break;

		/*
		 * Finish any incomplete splits we encounter along the way.
		 */
		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
		{
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* upgrade our lock if necessary */
			if (access == BT_READ)
			{
				_bt_unlockbuf(rel, buf);
				_bt_lockbuf(rel, buf, BT_WRITE);
			}

			if (P_INCOMPLETE_SPLIT(opaque))
				_bt_finish_split(rel, heaprel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* re-acquire the lock in the right mode, and re-check */
			buf = _bt_getbuf(rel, blkno, access);
			continue;
		}

		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
		{
			/* step right one page */
			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
			continue;
		}
		else
			break;
	}

	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}
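
/*
 * Example of the invariant this depends on: suppose we followed a downlink
 * to a page whose high key is 50 while descending with a scankey of 70.
 * Any matching tuple that was ever on this page can now only live on a
 * right sibling, so comparing the scankey against each page's high key and
 * chasing btpo_next links must eventually reach the correct page, however
 * many times the original page has split.
 */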

/*
 *	_bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey, or last key <= given scankey if nextkey
 * is true.  (Since _bt_compare treats the first data key of such a page as
 * minus infinity, there will be at least one key < scankey, so the result
 * always points at one of the keys on the page.)
 *
 * On a leaf page, _bt_binsrch() returns the final result of the initial
 * positioning process that started with _bt_first's call to _bt_search.
 * We're returning a non-pivot tuple offset, so things are a little different.
 * It is possible that we'll return an offset that's either past the last
 * non-pivot slot, or (in the case of a backward scan) before the first slot.
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.  _bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
static OffsetNumber
_bt_binsrch(Relation rel,
			BTScanInsert key,
			Buffer buf)
{
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high;
	int32		result,
				cmpval;

	page = BufferGetPage(buf);
	opaque = BTPageGetOpaque(page);

	/* Requesting nextkey semantics while using scantid seems nonsensical */
	Assert(!key->nextkey || key->scantid == NULL);
	/* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
	Assert(!P_ISLEAF(opaque) || key->scantid == NULL);

	low = P_FIRSTDATAKEY(opaque);
	high = PageGetMaxOffsetNumber(page);

	/*
	 * If there are no keys on the page, return the first available slot. Note
	 * this covers two cases: the page is really empty (no keys), or it
	 * contains only a high key.  The latter case is possible after vacuuming.
	 * This can never happen on an internal page, however, since they are
	 * never empty (an internal page must have at least one child).
	 */
	if (unlikely(high < low))
		return low;

	/*
	 * Binary search to find the first key on the page >= scan key, or first
	 * key > scankey when nextkey is true.
	 *
	 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
	 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
	 *
	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
	 *
	 * We can fall out when high == low.
	 */
	high++;						/* establish the loop invariant for high */

	cmpval = key->nextkey ? 0 : 1;	/* select comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, key, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
			high = mid;
	}

	/*
	 * At this point we have high == low.
	 *
	 * On a leaf page we always return the first non-pivot tuple >= scan key
	 * (resp. > scan key) for forward scan callers.  For backward scans, it's
	 * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
	 */
	if (P_ISLEAF(opaque))
	{
		/*
		 * In the backward scan case we're supposed to locate the last
		 * matching tuple on the leaf level -- not the first matching tuple
		 * (the last tuple will be the first one returned by the scan).
		 *
		 * At this point we've located the first non-pivot tuple immediately
		 * after the last matching tuple (which might just be maxoff + 1).
		 * Compensate by stepping back.
		 */
		if (key->backward)
			return OffsetNumberPrev(low);

		return low;
	}

	/*
	 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
	 * There must be one if _bt_compare() is playing by the rules.
	 *
	 * _bt_compare() will seldom see any exactly-matching pivot tuples, since
	 * a truncated -inf heap TID is usually enough to prevent it altogether.
	 * Even omitted scan key entries are treated as > truncated attributes.
	 *
	 * However, during backward scans _bt_compare() interprets omitted scan
	 * key attributes as == corresponding truncated -inf attributes instead.
	 * This works just like < would work here.  Under this scheme, < strategy
	 * backward scans will always directly descend to the correct leaf page.
	 * In particular, they will never incur an "extra" leaf page access with a
	 * scan key that happens to contain the same prefix of values as some
	 * pivot tuple's untruncated prefix.  VACUUM relies on this guarantee when
	 * it uses a leaf page high key to "re-find" a page undergoing deletion.
	 */
	Assert(low > P_FIRSTDATAKEY(opaque));

	return OffsetNumberPrev(low);
}
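
/*
 * A worked example of the convergence behavior, for a hypothetical leaf
 * page holding keys (2, 4, 4, 7) at offsets 1..4 and a scan key of 4:
 *
 *		nextkey=false (cmpval=1): returns offset 2, the first key >= 4
 *		nextkey=true  (cmpval=0): returns offset 4, the first key > 4
 *
 * A backward scan caller (key->backward) then steps back by one, yielding
 * the last key < 4 (offset 1) or the last key <= 4 (offset 3) instead.
 */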

/*
 *	_bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
 *
 * Like _bt_binsrch(), but with support for caching the binary search
 * bounds.  Only used during insertion, and only on the leaf page that it
 * looks like caller will insert tuple on.  Exclusive-locked and pinned
 * leaf page is contained within insertstate.
 *
 * Caches the bounds fields in insertstate so that a subsequent call can
 * reuse the low and strict high bounds of original binary search.  Callers
 * that use these fields directly must be prepared for the case where low
 * and/or stricthigh are not on the same page (one or both exceed maxoff
 * for the page).  The case where there are no items on the page (high <
 * low) makes bounds invalid.
 *
 * Caller is responsible for invalidating bounds when it modifies the page
 * before calling here a second time, and for dealing with posting list
 * tuple matches (callers can use insertstate's postingoff field to
 * determine which existing heap TID will need to be replaced by a posting
 * list split).
 */
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
	BTScanInsert key = insertstate->itup_key;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high,
				stricthigh;
	int32		result,
				cmpval;

	page = BufferGetPage(insertstate->buf);
	opaque = BTPageGetOpaque(page);

	Assert(P_ISLEAF(opaque));
	Assert(!key->nextkey);
	Assert(insertstate->postingoff == 0);

	if (!insertstate->bounds_valid)
	{
		/* Start new binary search */
		low = P_FIRSTDATAKEY(opaque);
		high = PageGetMaxOffsetNumber(page);
	}
	else
	{
		/* Restore result of previous binary search against same page */
		low = insertstate->low;
		high = insertstate->stricthigh;
	}

	/* If there are no keys on the page, return the first available slot */
	if (unlikely(high < low))
	{
		/* Caller can't reuse bounds */
		insertstate->low = InvalidOffsetNumber;
		insertstate->stricthigh = InvalidOffsetNumber;
		insertstate->bounds_valid = false;
		return low;
	}

	/*
	 * Binary search to find the first key on the page >= scan key. (nextkey
	 * is always false when inserting).
	 *
	 * The loop invariant is: all slots before 'low' are < scan key, all slots
	 * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
	 * maintained to save additional search effort for caller.
	 *
	 * We can fall out when high == low.
	 */
	if (!insertstate->bounds_valid)
		high++;					/* establish the loop invariant for high */
	stricthigh = high;			/* high initially strictly higher */

	cmpval = 1;					/* !nextkey comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, key, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
		{
			high = mid;
			if (result != 0)
				stricthigh = high;
		}

		/*
		 * If tuple at offset located by binary search is a posting list whose
		 * TID range overlaps with caller's scantid, perform posting list
		 * binary search to set postingoff for caller.  Caller must split the
		 * posting list when postingoff is set.  This should happen
		 * infrequently.
		 */
		if (unlikely(result == 0 && key->scantid != NULL))
		{
			/*
			 * postingoff should never be set more than once per leaf page
			 * binary search.  That would mean that there are duplicate table
			 * TIDs in the index, which is never okay.  Check for that here.
			 */
			if (insertstate->postingoff != 0)
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
										 ItemPointerGetBlockNumber(key->scantid),
										 ItemPointerGetOffsetNumber(key->scantid),
										 low, stricthigh,
										 BufferGetBlockNumber(insertstate->buf),
										 RelationGetRelationName(rel))));

			insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
		}
	}

	/*
	 * On a leaf page, a binary search always returns the first key >= scan
	 * key (at least in !nextkey case), which could be the last slot + 1. This
	 * is also the lower bound of cached search.
	 *
	 * stricthigh may also be the last slot + 1, which prevents caller from
	 * using bounds directly, but is still useful to us if we're called a
	 * second time with cached bounds (cached low will be < stricthigh when
	 * that happens).
	 */
	insertstate->low = low;
	insertstate->stricthigh = stricthigh;
	insertstate->bounds_valid = true;

	return low;
}
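
/*
 * Example of the bounds caching at work: if an initial call against a page
 * with maxoff = 100 converges with low = 42 and stricthigh = 45, a repeat
 * call for the same key (page unmodified) only binary searches the slots
 * in [42, 45) rather than [1, 101).  The insert code path (unique checking
 * followed by the final choice of insert location) is what benefits from
 * the cached bounds.
 */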

/*----------
 *	_bt_binsrch_posting() -- posting list binary search.
 *
 * Helper routine for _bt_binsrch_insert().
 *
 * Returns offset into posting list where caller's scantid belongs.
 *----------
 */
static int
_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
{
	IndexTuple	itup;
	ItemId		itemid;
	int			low,
				high,
				mid,
				res;

	/*
	 * If this isn't a posting tuple, then the index must be corrupt (if it is
	 * an ordinary non-pivot tuple then there must be an existing tuple with a
	 * heap TID that equals inserter's new heap TID/scantid).  Defensively
	 * check that tuple is a posting list tuple whose posting list range
	 * includes caller's scantid.
	 *
	 * (This is also needed because contrib/amcheck's rootdescend option needs
	 * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
	 */
	itemid = PageGetItemId(page, offnum);
	itup = (IndexTuple) PageGetItem(page, itemid);
	if (!BTreeTupleIsPosting(itup))
		return 0;

	Assert(key->heapkeyspace && key->allequalimage);

	/*
	 * In the event that posting list tuple has LP_DEAD bit set, indicate this
	 * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
	 * second call to _bt_binsrch_insert() can take place when its caller has
	 * removed the dead item.
	 */
	if (ItemIdIsDead(itemid))
		return -1;

	/* "high" is past end of posting list for loop invariant */
	low = 0;
	high = BTreeTupleGetNPosting(itup);
	Assert(high >= 2);

	while (high > low)
	{
		mid = low + ((high - low) / 2);
		res = ItemPointerCompare(key->scantid,
								 BTreeTupleGetPostingN(itup, mid));

		if (res > 0)
			low = mid + 1;
		else if (res < 0)
			high = mid;
		else
			return mid;
	}

	/* Exact match not found */
	return low;
}
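
/*
 * Example: given a posting list tuple whose TIDs are (12,1), (12,5) and
 * (30,2), a scantid of (12,7) returns postingoff = 2, the position that
 * the incoming TID must occupy; caller splits the posting list there.  A
 * scantid exactly equal to an existing TID returns its offset directly.
 */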

/*----------
 *	_bt_compare() -- Compare insertion-type scankey to tuple on a page.
 *
 *	page/offnum: location of btree item to be compared to.
 *
 *		This routine returns:
 *			<0 if scankey < tuple at offnum;
 *			 0 if scankey == tuple at offnum;
 *			>0 if scankey > tuple at offnum.
 *
 * NULLs in the keys are treated as sortable values.  Therefore
 * "equality" does not necessarily mean that the item should be returned
 * to the caller as a matching key.  Similarly, an insertion scankey
 * with its scantid set is treated as equal to a posting tuple whose TID
 * range overlaps with their scantid.  There generally won't be a
 * matching TID in the posting tuple, which caller must handle
 * themselves (e.g., by splitting the posting list tuple).
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored is explicitly truncated to 0
 * attributes (explicitly minus infinity) with version 3+ indexes, but
 * that isn't relied upon.  This allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first
 * key.  See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
			BTScanInsert key,
			Page page,
			OffsetNumber offnum)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	IndexTuple	itup;
	ItemPointer heapTid;
	ScanKey		scankey;
	int			ncmpkey;
	int			ntupatts;
	int32		result;

	Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
	Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
	Assert(key->heapkeyspace || key->scantid == NULL);

	/*
	 * Force result ">" if target item is first data item on an internal page
	 * --- see NOTE above.
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
		return 1;

	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	ntupatts = BTreeTupleGetNAtts(itup, rel);

	/*
	 * The scan key is set up with the attribute number associated with each
	 * term in the key.  It is important that, if the index is multi-key, the
	 * scan contain the first k key attributes, and that they be in order. If
	 * you think about how multi-key ordering works, you'll understand why
	 * this is.
	 *
	 * We don't test for violation of this condition here, however.  The
	 * initial setup for the index scan had better have gotten it right (see
	 * _bt_first).
	 */

	ncmpkey = Min(ntupatts, key->keysz);
	Assert(key->heapkeyspace || ncmpkey == key->keysz);
	Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
	scankey = key->scankeys;
	for (int i = 1; i <= ncmpkey; i++)
	{
		Datum		datum;
		bool		isNull;

		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

		if (scankey->sk_flags & SK_ISNULL)	/* key is NULL */
		{
			if (isNull)
				result = 0;		/* NULL "=" NULL */
			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = -1;	/* NULL "<" NOT_NULL */
			else
				result = 1;		/* NULL ">" NOT_NULL */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = 1;		/* NOT_NULL ">" NULL */
			else
				result = -1;	/* NOT_NULL "<" NULL */
		}
		else
		{
			/*
			 * The sk_func needs to be passed the index value as left arg and
			 * the sk_argument as right arg (they might be of different
			 * types).  Since it is convenient for callers to think of
			 * _bt_compare as comparing the scankey to the index item, we have
			 * to flip the sign of the comparison result.  (Unless it's a DESC
			 * column, in which case we *don't* flip the sign.)
			 */
			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
													 scankey->sk_collation,
													 datum,
													 scankey->sk_argument));

			if (!(scankey->sk_flags & SK_BT_DESC))
				INVERT_COMPARE_RESULT(result);
		}

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;

		scankey++;
	}

	/*
	 * All non-truncated attributes (other than heap TID) were found to be
	 * equal.  Treat truncated attributes as minus infinity when scankey has a
	 * key attribute value that would otherwise be compared directly.
	 *
	 * Note: it doesn't matter if ntupatts includes non-key attributes;
	 * scankey won't, so explicitly excluding non-key attributes isn't
	 * necessary.
	 */
	if (key->keysz > ntupatts)
		return 1;

	/*
	 * Use the heap TID attribute and scantid to try to break the tie.  The
	 * rules are the same as any other key attribute -- only the
	 * representation differs.
	 */
	heapTid = BTreeTupleGetHeapTID(itup);
	if (key->scantid == NULL)
	{
		/*
		 * Forward scans have a scankey that is considered greater than a
		 * truncated pivot tuple if and when the scankey has equal values for
		 * attributes up to and including the least significant untruncated
		 * attribute in tuple.  Even attributes that were omitted from the
		 * scan key are considered greater than -inf truncated attributes.
		 * (See _bt_binsrch for an explanation of our backward scan behavior.)
		 *
		 * For example, if an index has the minimum two attributes (single
		 * user key attribute, plus heap TID attribute), and a page's high key
		 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
		 * will not descend to the page to the left.  The search will descend
		 * right instead.  The truncated attribute in pivot tuple means that
		 * all non-pivot tuples on the page to the left are strictly < 'foo',
		 * so it isn't necessary to descend left.  In other words, search
		 * doesn't have to descend left because it isn't interested in a match
		 * that has a heap TID value of -inf.
		 *
		 * Note: the heap TID part of the test ensures that scankey is being
		 * compared to a pivot tuple with one or more truncated -inf key
		 * attributes.  The heap TID attribute is the last key attribute in
		 * every index, of course, but other than that it isn't special.
		 */
		if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
			key->heapkeyspace)
			return 1;

		/* All provided scankey arguments found to be equal */
		return 0;
	}

	/*
	 * Treat truncated heap TID as minus infinity, since scankey has a key
	 * attribute value (scantid) that would otherwise be compared directly
	 */
	Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
	if (heapTid == NULL)
		return 1;

	/*
	 * Scankey must be treated as equal to a posting list tuple if its scantid
	 * value falls within the range of the posting list.  In all other cases
	 * there can only be a single heap TID value, which is compared directly
	 * with heapTid.
	 */
	Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
	result = ItemPointerCompare(key->scantid, heapTid);
	if (result <= 0 || !BTreeTupleIsPosting(itup))
		return result;
	else
	{
		result = ItemPointerCompare(key->scantid,
									BTreeTupleGetMaxHeapTID(itup));
		if (result > 0)
			return 1;
	}

	return 0;
}
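
/*
 * Example of the return convention, assuming a single-column integer index:
 * an insertion scankey of 7 compared against tuples holding 5, 7 and 9
 * yields >0, 0 and <0, respectively.  When scantid is set, "equal" against
 * a posting list tuple only means the TID range overlaps: a scantid of
 * (12,3) compares equal to a posting tuple spanning (10,1)..(20,4) whether
 * or not (12,3) itself is present.
 */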

/*
 *	_bt_first() -- Find the first item in a scan.
 *
 *		We need to be clever about the direction of scan, the search
 *		conditions, and the tree ordering.  We find the first item (or,
 *		if backwards scan, the last item) in the tree that satisfies the
 *		qualifications in the scan key.  On success exit, the page containing
 *		the current index tuple is pinned but not locked, and data about
 *		the matching tuple(s) on the page has been loaded into so->currPos.
 *		scan->xs_heaptid is set to the heap TID of the current tuple, and if
 *		requested, scan->xs_itup points to a copy of the index tuple.
 *
 * If there are no matching items in the index, we return false, with no
 * pins or locks held.
 *
 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 * are both search-type scankeys (see nbtree/README for more about this).
 * Within this routine, we build a temporary insertion-type scankey to use
 * in locating the scan start position.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	BTStack		stack;
	OffsetNumber offnum;
	StrategyNumber strat;
	BTScanInsertData inskey;
	ScanKey		startKeys[INDEX_MAX_KEYS];
	ScanKeyData notnullkeys[INDEX_MAX_KEYS];
	int			keysz = 0;
	int			i;
	bool		status;
	StrategyNumber strat_total;
	BTScanPosItem *currItem;
	BlockNumber blkno = InvalidBlockNumber;

	Assert(!BTScanPosIsValid(so->currPos));

	/*
	 * Examine the scan keys and eliminate any redundant keys; also mark the
	 * keys that must be matched to continue the scan.
	 */
	_bt_preprocess_keys(scan);

	/*
	 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
	 * never be satisfied (eg, x == 1 AND x > 2).
	 */
	if (!so->qual_ok)
	{
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * For parallel scans, get the starting page from shared state. If the
	 * scan has not started, proceed to find out first leaf page in the usual
	 * way while keeping other participating processes waiting.  If the scan
	 * has already begun, use the page number from the shared structure.
	 *
	 * When a parallel scan has another primitive index scan scheduled, a
	 * parallel worker will seize the scan for that purpose now.  This is
	 * similar to the case where the top-level scan hasn't started.
	 */
	if (scan->parallel_scan != NULL)
	{
		status = _bt_parallel_seize(scan, &blkno, true);

		/*
		 * Initialize arrays (when _bt_parallel_seize didn't already set up
		 * the next primitive index scan)
		 */
		if (so->numArrayKeys && !so->needPrimScan)
			_bt_start_array_keys(scan, dir);

		if (!status)
			return false;
		else if (blkno == P_NONE)
		{
			_bt_parallel_done(scan);
			return false;
		}
		else if (blkno != InvalidBlockNumber)
		{
			if (!_bt_parallel_readpage(scan, blkno, dir))
				return false;
			goto readcomplete;
		}
	}
	else if (so->numArrayKeys && !so->needPrimScan)
	{
		/*
		 * First _bt_first call (for current btrescan) without parallelism.
		 *
		 * Initialize arrays, and the corresponding scan keys that were just
		 * output by _bt_preprocess_keys.
		 */
		_bt_start_array_keys(scan, dir);
	}

	/*
	 * Count an indexscan for stats, now that we know that we'll call
	 * _bt_search/_bt_endpoint below
	 */
	pgstat_count_index_scan(rel);

	/*----------
	 * Examine the scan keys to discover where we need to start the scan.
	 *
	 * We want to identify the keys that can be used as starting boundaries;
	 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
	 * a backwards scan.  We can use keys for multiple attributes so long as
	 * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
	 * a > or < boundary or find an attribute with no boundary (which can be
	 * thought of as the same as "> -infinity"), we can't use keys for any
	 * attributes to its right, because it would break our simplistic notion
	 * of what initial positioning strategy to use.
	 *
	 * When the scan keys include cross-type operators, _bt_preprocess_keys
	 * may not be able to eliminate redundant keys; in such cases we will
	 * arbitrarily pick a usable one for each attribute.  This is correct
	 * but possibly not optimal behavior.  (For example, with keys like
	 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
	 * x=5 would be more efficient.)  Since the situation only arises given
	 * a poorly-worded query plus an incomplete opfamily, live with it.
	 *
	 * When both equality and inequality keys appear for a single attribute
	 * (again, only possible when cross-type operators appear), we *must*
	 * select one of the equality keys for the starting point, because
	 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
	 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
	 * start at x=4, we will fail and stop before reaching x=10.  If multiple
	 * equality quals survive preprocessing, however, it doesn't matter which
	 * one we use --- by definition, they are either redundant or
	 * contradictory.
	 *
	 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
	 * If the index stores nulls at the end of the index we'll be starting
	 * from, and we have no boundary key for the column (which means the key
	 * we deduced NOT NULL from is an inequality key that constrains the other
	 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
	 * use as a boundary key.  If we didn't do this, we might find ourselves
	 * traversing a lot of null entries at the start of the scan.
	 *
	 * In this loop, row-comparison keys are treated the same as keys on their
	 * first (leftmost) columns.  We'll add on lower-order columns of the row
	 * comparison below, if possible.
	 *
	 * The selected scan keys (at most one per index column) are remembered by
	 * storing their addresses into the local startKeys[] array.
	 *
	 * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
	 * the next primitive index scan (for scans with array keys) based in part
	 * on an understanding of how it'll enable us to reposition the scan.
	 * They're directly aware of how we'll sometimes cons up an explicit
	 * SK_SEARCHNOTNULL key.  They'll even end primitive scans by applying a
	 * symmetric "deduce NOT NULL" rule of their own.  This allows top-level
	 * scans to skip large groups of NULLs through repeated deductions about
	 * key strictness (for a required inequality key) and whether NULLs in the
	 * key's index column are stored last or first (relative to non-NULLs).
	 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
	 * need to be kept in sync.
	 *----------
	 */
	strat_total = BTEqualStrategyNumber;
	if (so->numberOfKeys > 0)
	{
		AttrNumber	curattr;
		ScanKey		chosen;
		ScanKey		impliesNN;
		ScanKey		cur;

		/*
		 * chosen is the so-far-chosen key for the current attribute, if any.
		 * We don't cast the decision in stone until we reach keys for the
		 * next attribute.
		 */
		curattr = 1;
		chosen = NULL;
		/* Also remember any scankey that implies a NOT NULL constraint */
		impliesNN = NULL;

		/*
		 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
		 * pass to handle after-last-key processing.  Actual exit from the
		 * loop is at one of the "break" statements below.
		 */
		for (cur = so->keyData, i = 0;; cur++, i++)
		{
			if (i >= so->numberOfKeys || cur->sk_attno != curattr)
			{
				/*
				 * Done looking at keys for curattr.  If we didn't find a
				 * usable boundary key, see if we can deduce a NOT NULL key.
				 */
				if (chosen == NULL && impliesNN != NULL &&
					((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
					 ScanDirectionIsForward(dir) :
					 ScanDirectionIsBackward(dir)))
				{
					/* Yes, so build the key in notnullkeys[keysz] */
					chosen = &notnullkeys[keysz];
					ScanKeyEntryInitialize(chosen,
										   (SK_SEARCHNOTNULL | SK_ISNULL |
											(impliesNN->sk_flags &
											 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
										   curattr,
										   ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
											BTGreaterStrategyNumber :
											BTLessStrategyNumber),
										   InvalidOid,
										   InvalidOid,
										   InvalidOid,
										   (Datum) 0);
				}

				/*
				 * If we still didn't find a usable boundary key, quit; else
				 * save the boundary key pointer in startKeys.
				 */
				if (chosen == NULL)
					break;
				startKeys[keysz++] = chosen;

				/*
				 * Adjust strat_total, and quit if we have stored a > or <
				 * key.
				 */
				strat = chosen->sk_strategy;
				if (strat != BTEqualStrategyNumber)
				{
					strat_total = strat;
					if (strat == BTGreaterStrategyNumber ||
						strat == BTLessStrategyNumber)
						break;
				}

				/*
				 * Done if that was the last attribute, or if next key is not
				 * in sequence (implying no boundary key is available for the
				 * next attribute).
				 */
				if (i >= so->numberOfKeys ||
					cur->sk_attno != curattr + 1)
					break;

				/*
				 * Reset for next attr.
				 */
				curattr = cur->sk_attno;
				chosen = NULL;
				impliesNN = NULL;
			}

			/*
			 * Can we use this key as a starting boundary for this attr?
			 *
			 * If not, does it imply a NOT NULL constraint?  (Because
			 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
			 * *any* inequality key works for that; we need not test.)
			 */
			switch (cur->sk_strategy)
			{
				case BTLessStrategyNumber:
				case BTLessEqualStrategyNumber:
					if (chosen == NULL)
					{
						if (ScanDirectionIsBackward(dir))
							chosen = cur;
						else
							impliesNN = cur;
					}
					break;
				case BTEqualStrategyNumber:
					/* override any non-equality choice */
					chosen = cur;
					break;
				case BTGreaterEqualStrategyNumber:
				case BTGreaterStrategyNumber:
					if (chosen == NULL)
					{
						if (ScanDirectionIsForward(dir))
							chosen = cur;
						else
							impliesNN = cur;
					}
					break;
			}
		}
	}

	/*
	 * If we found no usable boundary keys, we have to start from one end of
	 * the tree.  Walk down that edge to the first or last key, and scan from
	 * there.
	 */
	if (keysz == 0)
	{
		bool		match;

		match = _bt_endpoint(scan, dir);

		if (!match)
		{
			/* No match, so mark (parallel) scan finished */
			_bt_parallel_done(scan);
		}

		return match;
	}

	/*
	 * We want to start the scan somewhere within the index.  Set up an
	 * insertion scankey we can use to search for the boundary point we
	 * identified above.  The insertion scankey is built using the keys
	 * identified by startKeys[].  (Remaining insertion scankey fields are
	 * initialized after initial-positioning strategy is finalized.)
	 */
	Assert(keysz <= INDEX_MAX_KEYS);
	for (i = 0; i < keysz; i++)
	{
		ScanKey		cur = startKeys[i];

		Assert(cur->sk_attno == i + 1);

		if (cur->sk_flags & SK_ROW_HEADER)
		{
			/*
			 * Row comparison header: look to the first row member instead.
			 *
			 * The member scankeys are already in insertion format (ie, they
			 * have sk_func = 3-way-comparison function), but we have to watch
			 * out for nulls, which _bt_preprocess_keys didn't check. A null
			 * in the first row member makes the condition unmatchable, just
			 * like qual_ok = false.
			 */
			ScanKey		subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

			Assert(subkey->sk_flags & SK_ROW_MEMBER);
			if (subkey->sk_flags & SK_ISNULL)
			{
				_bt_parallel_done(scan);
				return false;
			}
			memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));

			/*
			 * If the row comparison is the last positioning key we accepted,
			 * try to add additional keys from the lower-order row members.
			 * (If we accepted independent conditions on additional index
			 * columns, we use those instead --- doesn't seem worth trying to
			 * determine which is more restrictive.)  Note that this is OK
			 * even if the row comparison is of ">" or "<" type, because the
			 * condition applied to all but the last row member is effectively
			 * ">=" or "<=", and so the extra keys don't break the positioning
			 * scheme.  But, by the same token, if we aren't able to use all
			 * the row members, then the part of the row comparison that we
			 * did use has to be treated as just a ">=" or "<=" condition, and
			 * so we'd better adjust strat_total accordingly.
			 */
			if (i == keysz - 1)
			{
				bool		used_all_subkeys = false;

				Assert(!(subkey->sk_flags & SK_ROW_END));
				for (;;)
				{
					subkey++;
					Assert(subkey->sk_flags & SK_ROW_MEMBER);
					if (subkey->sk_attno != keysz + 1)
						break;	/* out-of-sequence, can't use it */
					if (subkey->sk_strategy != cur->sk_strategy)
						break;	/* wrong direction, can't use it */
					if (subkey->sk_flags & SK_ISNULL)
						break;	/* can't use null keys */
					Assert(keysz < INDEX_MAX_KEYS);
					memcpy(inskey.scankeys + keysz, subkey,
						   sizeof(ScanKeyData));
					keysz++;
					if (subkey->sk_flags & SK_ROW_END)
					{
						used_all_subkeys = true;
						break;
					}
				}
				if (!used_all_subkeys)
				{
					switch (strat_total)
					{
						case BTLessStrategyNumber:
							strat_total = BTLessEqualStrategyNumber;
							break;
						case BTGreaterStrategyNumber:
							strat_total = BTGreaterEqualStrategyNumber;
							break;
					}
				}
				break;			/* done with outer loop */
			}
		}
		else
		{
			/*
			 * Ordinary comparison key.  Transform the search-style scan key
			 * to an insertion scan key by replacing the sk_func with the
			 * appropriate btree comparison function.
			 *
			 * If scankey operator is not a cross-type comparison, we can use
			 * the cached comparison function; otherwise gotta look it up in
			 * the catalogs.  (That can't lead to infinite recursion, since no
			 * indexscan initiated by syscache lookup will use cross-data-type
			 * operators.)
			 *
			 * We support the convention that sk_subtype == InvalidOid means
			 * the opclass input type; this is a hack to simplify life for
			 * ScanKeyInit().
			 */
			if (cur->sk_subtype == rel->rd_opcintype[i] ||
				cur->sk_subtype == InvalidOid)
			{
				FmgrInfo   *procinfo;

				procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
				ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
											   cur->sk_flags,
											   cur->sk_attno,
											   InvalidStrategy,
											   cur->sk_subtype,
											   cur->sk_collation,
											   procinfo,
											   cur->sk_argument);
			}
			else
			{
				RegProcedure cmp_proc;

				cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
											 rel->rd_opcintype[i],
											 cur->sk_subtype,
											 BTORDER_PROC);
				if (!RegProcedureIsValid(cmp_proc))
					elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
						 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
						 cur->sk_attno, RelationGetRelationName(rel));
				ScanKeyEntryInitialize(inskey.scankeys + i,
									   cur->sk_flags,
									   cur->sk_attno,
									   InvalidStrategy,
									   cur->sk_subtype,
									   cur->sk_collation,
									   cmp_proc,
									   cur->sk_argument);
			}
		}
	}

	/*
	 * Examine the selected initial-positioning strategy to determine exactly
	 * where we need to start the scan, and set flag variables to control the
	 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
	 * page _bt_search returns).
	 */
	_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
	inskey.anynullkeys = false; /* unused */
	inskey.scantid = NULL;
	inskey.keysz = keysz;
	switch (strat_total)
	{
		case BTLessStrategyNumber:

			inskey.nextkey = false;
			inskey.backward = true;
			break;

		case BTLessEqualStrategyNumber:

			inskey.nextkey = true;
			inskey.backward = true;
			break;

		case BTEqualStrategyNumber:

			/*
			 * If a backward scan was specified, need to start with last equal
			 * item not first one.
			 */
			if (ScanDirectionIsBackward(dir))
			{
				/*
				 * This is the same as the <= strategy
				 */
				inskey.nextkey = true;
				inskey.backward = true;
			}
			else
			{
				/*
				 * This is the same as the >= strategy
				 */
				inskey.nextkey = false;
				inskey.backward = false;
			}
			break;

		case BTGreaterEqualStrategyNumber:

			/*
			 * Find first item >= scankey
			 */
			inskey.nextkey = false;
			inskey.backward = false;
			break;

		case BTGreaterStrategyNumber:

			/*
			 * Find first item > scankey
			 */
			inskey.nextkey = true;
			inskey.backward = false;
			break;

		default:
			/* can't get here, but keep compiler quiet */
			elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
			return false;
	}

	/*
	 * Use the manufactured insertion scan key to descend the tree and
	 * position ourselves on the target leaf page.
	 */
	Assert(ScanDirectionIsBackward(dir) == inskey.backward);
	stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);

	/* don't need to keep the stack around... */
	_bt_freestack(stack);

	if (!BufferIsValid(buf))
	{
		/*
		 * We only get here if the index is completely empty. Lock relation
		 * because nothing finer to lock exists.  Without a buffer lock, it's
		 * possible for another transaction to insert data between
		 * _bt_search() and PredicateLockRelation().  We have to try again
		 * after taking the relation-level predicate lock, to close a narrow
		 * window where we wouldn't scan concurrently inserted tuples, but the
		 * writer wouldn't see our predicate lock.
		 */
		if (IsolationIsSerializable())
		{
			PredicateLockRelation(rel, scan->xs_snapshot);
			stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
			_bt_freestack(stack);
		}

		if (!BufferIsValid(buf))
		{
			/*
			 * Mark parallel scan as done, so that all the workers can finish
			 * their scan.
			 */
			_bt_parallel_done(scan);
			BTScanPosInvalidate(so->currPos);
			return false;
		}
	}

	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);

	_bt_initialize_more_data(so, dir);

	/* position to the precise item on the page */
	offnum = _bt_binsrch(rel, &inskey, buf);
	Assert(!BTScanPosIsValid(so->currPos));
	so->currPos.buf = buf;

	/*
	 * Now load data from the first page of the scan.
	 *
	 * If inskey.nextkey = false and inskey.backward = false, offnum is
	 * positioned at the first non-pivot tuple >= inskey.scankeys.
	 *
	 * If inskey.nextkey = false and inskey.backward = true, offnum is
	 * positioned at the last non-pivot tuple < inskey.scankeys.
	 *
	 * If inskey.nextkey = true and inskey.backward = false, offnum is
	 * positioned at the first non-pivot tuple > inskey.scankeys.
	 *
	 * If inskey.nextkey = true and inskey.backward = true, offnum is
	 * positioned at the last non-pivot tuple <= inskey.scankeys.
	 *
	 * It's possible that _bt_binsrch returned an offnum that is out of bounds
	 * for the page.  For example, when inskey is both < the leaf page's high
	 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
	 */
	if (!_bt_readpage(scan, dir, offnum, true))
	{
		/*
		 * There's no actually-matching data on this page.  Try to advance to
		 * the next page.  Return false if there's no matching data at all.
		 */
		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
		if (!_bt_steppage(scan, dir))
			return false;
	}
	else
	{
		/* We have at least one item to return as scan's first item */
		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
	}

readcomplete:
	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}
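
/*
 * Example of startKeys[] selection, assuming a two-column index scanned
 * forward: for "x = 5 AND y > 10" both keys are usable boundaries, so the
 * insertion scankey is (5, 10) with strat_total = BTGreaterStrategyNumber,
 * hence nextkey = true, and the descent lands on the first (5, >10) item.
 * For "x > 5 AND y = 10" only the x key is used: accepting a > boundary
 * ends the collection of keys for later attributes.
 */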

/*
 *	_bt_next() -- Get the next item in a scan.
 *
 *		On entry, so->currPos describes the current page, which may be pinned
 *		but is not locked, and so->currPos.itemIndex identifies which item was
 *		previously returned.
 *
 *		On successful exit, scan->xs_heaptid is set to the TID of the next
 *		heap tuple, and if requested, scan->xs_itup points to a copy of the
 *		index tuple.  so->currPos is updated as needed.
 *
 *		On failure exit (no more tuples), we release pin and set
 *		so->currPos.buf to InvalidBuffer.
 */
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTScanPosItem *currItem;

	/*
	 * Advance to next tuple on current page; or if there's no more, try to
	 * step to the next page with data.
	 */
	if (ScanDirectionIsForward(dir))
	{
		if (++so->currPos.itemIndex > so->currPos.lastItem)
		{
			if (!_bt_steppage(scan, dir))
				return false;
		}
	}
	else
	{
		if (--so->currPos.itemIndex < so->currPos.firstItem)
		{
			if (!_bt_steppage(scan, dir))
				return false;
		}
	}

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}
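
/*
 * Example: once _bt_first has loaded items[0..9] from a page and returned
 * items[0], nine successive _bt_next calls return items 1..9 straight from
 * so->currPos without touching the buffer; the tenth call advances
 * itemIndex past lastItem and hands control to _bt_steppage.
 */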
1535 * _bt_readpage() -- Load data from current index page into so->currPos
1537 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1538 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1539 * they are updated as appropriate. All other fields of so->currPos are
1540 * initialized from scratch here.
1542 * We scan the current page starting at offnum and moving in the indicated
1543 * direction. All items matching the scan keys are loaded into currPos.items.
1544 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1545 * that there can be no more matching tuples in the current scan direction
1546 * (could just be for the current primitive index scan when scan has arrays).
1548 * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
1549 * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
1550 * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
1551 * its scan key was marked required), so _bt_first _must_ pass us an offnum
1552 * exactly at the beginning of where equal tuples are to be found. When we're
1553 * passed an offnum past the end of the page, we might still manage to stop
1554 * the scan on this page by calling _bt_checkkeys against the high key.
1556 * In the case of a parallel scan, caller must have called _bt_parallel_seize
1557 * prior to calling this function; this function will invoke
1558 * _bt_parallel_release before returning.
1560 * Returns true if any matching items found on the page, false if none.
1563 _bt_readpage(IndexScanDesc scan
, ScanDirection dir
, OffsetNumber offnum
,
1566 BTScanOpaque so
= (BTScanOpaque
) scan
->opaque
;
1568 BTPageOpaque opaque
;
1569 OffsetNumber minoff
;
1570 OffsetNumber maxoff
;
1571 BTReadPageState pstate
;
1577 * We must have the buffer pinned and locked, but the usual macro can't be
1578 * used here; this function is what makes it good for currPos.
1580 Assert(BufferIsValid(so
->currPos
.buf
));
1582 page
= BufferGetPage(so
->currPos
.buf
);
1583 opaque
= BTPageGetOpaque(page
);
1585 /* allow next page be processed by parallel worker */
1586 if (scan
->parallel_scan
)
1588 if (ScanDirectionIsForward(dir
))
1589 pstate
.prev_scan_page
= opaque
->btpo_next
;
1591 pstate
.prev_scan_page
= BufferGetBlockNumber(so
->currPos
.buf
);
1593 _bt_parallel_release(scan
, pstate
.prev_scan_page
);
1596 indnatts
= IndexRelationGetNumberOfAttributes(scan
->indexRelation
);
1597 arrayKeys
= so
->numArrayKeys
!= 0;
1598 minoff
= P_FIRSTDATAKEY(opaque
);
1599 maxoff
= PageGetMaxOffsetNumber(page
);
1601 /* initialize page-level state that we'll pass to _bt_checkkeys */
1603 pstate
.minoff
= minoff
;
1604 pstate
.maxoff
= maxoff
;
1605 pstate
.finaltup
= NULL
;
1607 pstate
.offnum
= InvalidOffsetNumber
;
1608 pstate
.skip
= InvalidOffsetNumber
;
1609 pstate
.continuescan
= true; /* default assumption */
1610 pstate
.prechecked
= false;
1611 pstate
.firstmatch
= false;
1612 pstate
.rechecks
= 0;
1613 pstate
.targetdistance
= 0;
1616 * We note the buffer's block number so that we can release the pin later.
1617 * This allows us to re-read the buffer if it is needed again for hinting.
1619 so
->currPos
.currPage
= BufferGetBlockNumber(so
->currPos
.buf
);
1622 * We save the LSN of the page as we read it, so that we know whether it
1623 * safe to apply LP_DEAD hints to the page later. This allows us to drop
1624 * the pin for MVCC scans, which allows vacuum to avoid blocking.
1626 so
->currPos
.lsn
= BufferGetLSNAtomic(so
->currPos
.buf
);
1629 * we must save the page's right-link while scanning it; this tells us
1630 * where to step right to after we're done with these items. There is no
1631 * corresponding need for the left-link, since splits always go right.
1633 so
->currPos
.nextPage
= opaque
->btpo_next
;
1635 /* initialize tuple workspace to empty */
1636 so
->currPos
.nextTupleOffset
= 0;
1639 * Now that the current page has been made consistent, the macro should be
1642 Assert(BTScanPosIsPinned(so
->currPos
));
1645 * Prechecking the value of the continuescan flag for the last item on the
1646 * page (for backwards scan it will be the first item on a page). If we
1647 * observe it to be true, then it should be true for all other items. This
1648 * allows us to do significant optimizations in the _bt_checkkeys()
1649 * function for all the items on the page.
1651 * With the forward scan, we do this check for the last item on the page
1652 * instead of the high key. It's relatively likely that the most
1653 * significant column in the high key will be different from the
1654 * corresponding value from the last item on the page. So checking with
1655 * the last item on the page would give a more precise answer.
1657 * We skip this for the first page read by each (primitive) scan, to avoid
1658 * slowing down point queries. They typically don't stand to gain much
1659 * when the optimization can be applied, and are more likely to notice the
1660 * overhead of the precheck.
1662 * The optimization is unsafe and must be avoided whenever _bt_checkkeys
1663 * just set a low-order required array's key to the best available match
1664 * for a truncated -inf attribute value from the prior page's high key
1665 * (array element 0 is always the best available match in this scenario).
1666 * It's quite likely that matches for array element 0 begin on this page,
1667 * but the start of matches won't necessarily align with page boundaries.
1668 * When the start of matches is somewhere in the middle of this page, it
1669 * would be wrong to treat page's final non-pivot tuple as representative.
1670 * Doing so might lead us to treat some of the page's earlier tuples as
1671 * being part of a group of tuples thought to satisfy the required keys.
1673 * Note: Conversely, in the case where the scan's arrays just advanced
1674 * using the prior page's HIKEY _without_ advancement setting scanBehind,
1675 * the start of matches must be aligned with page boundaries, which makes
1676 * it safe to attempt the optimization here now. It's also safe when the
1677 * prior page's HIKEY simply didn't need to advance any required array. In
1678 * both cases we can safely assume that the _first_ tuple from this page
1679 * must be >= the current set of array keys/equality constraints. And so
1680 * if the final tuple is == those same keys (and also satisfies any
1681 * required < or <= strategy scan keys) during the precheck, we can safely
1682 * assume that this must also be true of all earlier tuples from the page.
1684 if (!firstPage
&& !so
->scanBehind
&& minoff
< maxoff
)
1689 iid
= PageGetItemId(page
, ScanDirectionIsForward(dir
) ? maxoff
: minoff
);
1690 itup
= (IndexTuple
) PageGetItem(page
, iid
);
1692 /* Call with arrayKeys=false to avoid undesirable side-effects */
1693 _bt_checkkeys(scan
, &pstate
, false, itup
, indnatts
);
1694 pstate
.prechecked
= pstate
.continuescan
;
1695 pstate
.continuescan
= true; /* reset */
1698 if (ScanDirectionIsForward(dir
))
1700 /* SK_SEARCHARRAY forward scans must provide high key up front */
1701 if (arrayKeys
&& !P_RIGHTMOST(opaque
))
1703 ItemId iid
= PageGetItemId(page
, P_HIKEY
);
1705 pstate
.finaltup
= (IndexTuple
) PageGetItem(page
, iid
);
1707 if (unlikely(so
->oppositeDirCheck
))
1709 Assert(so
->scanBehind
);
1712 * Last _bt_readpage call scheduled a recheck of finaltup for
				 * required scan keys up to and including a > or >= scan key.
				 *
				 * _bt_checkkeys won't consider the scanBehind flag unless the
				 * scan is stopped by a scan key required in the current scan
				 * direction.  We need this recheck so that we'll notice when
				 * all tuples on this page are still before the _bt_first-wise
				 * start of matches for the current set of array keys.
				 */
				if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
				{
					/* Schedule another primitive index scan after all */
					so->currPos.moreRight = false;
					so->needPrimScan = true;
					return false;
				}

				/* Deliberately don't unset scanBehind flag just yet */
			}
		}

		/* load items[] in ascending order */
		itemIndex = 0;

		offnum = Max(offnum, minoff);

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				offnum = OffsetNumberNext(offnum);
				continue;
			}

			itup = (IndexTuple) PageGetItem(page, iid);
			Assert(!BTreeTupleIsPivot(itup));

			pstate.offnum = offnum;
			passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
										 itup, indnatts);

			/*
			 * Check if we need to skip ahead to a later tuple (only possible
			 * when the scan uses array keys)
			 */
			if (arrayKeys && OffsetNumberIsValid(pstate.skip))
			{
				Assert(!passes_quals && pstate.continuescan);
				Assert(offnum < pstate.skip);

				offnum = pstate.skip;
				pstate.skip = InvalidOffsetNumber;
				continue;
			}

			if (passes_quals)
			{
				/* tuple passes all scan key conditions */
				pstate.firstmatch = true;
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					_bt_saveitem(so, itemIndex, offnum, itup);
					itemIndex++;
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID
					 */
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					itemIndex++;
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
						itemIndex++;
					}
				}
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!pstate.continuescan)
				break;

			offnum = OffsetNumberNext(offnum);
		}

		/*
		 * We don't need to visit page to the right when the high key
		 * indicates that no more matches will be found there.
		 *
		 * Checking the high key like this works out more often than you might
		 * think.  Leaf page splits pick a split point between the two most
		 * dissimilar tuples (this is weighed against the need to evenly share
		 * free space).  Leaf pages with high key attribute values that can
		 * only appear on non-pivot tuples on the right sibling page are
		 * common.
		 */
		if (pstate.continuescan && !P_RIGHTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, P_HIKEY);
			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
			int			truncatt;

			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
			pstate.prechecked = false;	/* precheck didn't cover HIKEY */
			_bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
		}

		if (!pstate.continuescan)
			so->currPos.moreRight = false;

		Assert(itemIndex <= MaxTIDsPerBTreePage);
		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		/* SK_SEARCHARRAY backward scans must provide final tuple up front */
		if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, minoff);

			pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
		}

		/* load items[] in descending order */
		itemIndex = MaxTIDsPerBTreePage;

		offnum = Min(offnum, maxoff);

		while (offnum >= minoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		tuple_alive;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual.  Most of the
			 * time, it's a win to not bother examining the tuple's index
			 * keys, but just skip to the next tuple (previous, actually,
			 * since we're scanning backwards).  However, if this is the first
			 * tuple on the page, we do check the index keys, to prevent
			 * uselessly advancing to the page to the left.  This is similar
			 * to the high key optimization used by forward scans.
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				Assert(offnum >= P_FIRSTDATAKEY(opaque));
				if (offnum > P_FIRSTDATAKEY(opaque))
				{
					offnum = OffsetNumberPrev(offnum);
					continue;
				}

				tuple_alive = false;
			}
			else
				tuple_alive = true;

			itup = (IndexTuple) PageGetItem(page, iid);
			Assert(!BTreeTupleIsPivot(itup));

			pstate.offnum = offnum;
			passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
										 itup, indnatts);

			/*
			 * Check if we need to skip ahead to a later tuple (only possible
			 * when the scan uses array keys)
			 */
			if (arrayKeys && OffsetNumberIsValid(pstate.skip))
			{
				Assert(!passes_quals && pstate.continuescan);
				Assert(offnum > pstate.skip);

				offnum = pstate.skip;
				pstate.skip = InvalidOffsetNumber;
				continue;
			}

			if (passes_quals && tuple_alive)
			{
				/* tuple passes all scan key conditions */
				pstate.firstmatch = true;
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					itemIndex--;
					_bt_saveitem(so, itemIndex, offnum, itup);
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID.
					 *
					 * Note that we deliberately save/return items from
					 * posting lists in ascending heap TID order for backwards
					 * scans.  This allows _bt_killitems() to make a
					 * consistent assumption about the order of items
					 * associated with the same posting list tuple.
					 */
					itemIndex--;
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						itemIndex--;
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
					}
				}
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!pstate.continuescan)
				break;

			offnum = OffsetNumberPrev(offnum);
		}

		/*
		 * We don't need to visit page to the left when no more matches will
		 * be found there
		 */
		if (!pstate.continuescan || P_LEFTMOST(opaque))
			so->currPos.moreLeft = false;

		Assert(itemIndex >= 0);
		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
		so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
	}

	return (so->currPos.firstItem <= so->currPos.lastItem);
}
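
/*
 * Note on items[] conventions: forward scans fill so->currPos.items[] in
 * ascending order starting at index 0, while backward scans fill it in
 * descending order starting at index MaxTIDsPerBTreePage - 1.  Either way,
 * [firstItem, lastItem] bounds the valid entries, and itemIndex starts at
 * whichever end the scan returns first.  For example, a backward scan that
 * saved three items leaves firstItem = MaxTIDsPerBTreePage - 3 and
 * itemIndex = lastItem = MaxTIDsPerBTreePage - 1.
 */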

/* Save an index item into so->currPos.items[itemIndex] */
static void
_bt_saveitem(BTScanOpaque so, int itemIndex,
			 OffsetNumber offnum, IndexTuple itup)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));

	currItem->heapTid = itup->t_tid;
	currItem->indexOffset = offnum;
	if (so->currTuples)
	{
		Size		itupsz = IndexTupleSize(itup);

		currItem->tupleOffset = so->currPos.nextTupleOffset;
		memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
		so->currPos.nextTupleOffset += MAXALIGN(itupsz);
	}
}
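
/*
 * Illustrative sketch of the so->currTuples work area (an editorial example,
 * assuming nextTupleOffset starts at 0 for this page and 8-byte MAXALIGN):
 * after saving a 16-byte tuple and then a 24-byte tuple,
 *
 *		offset  0: first IndexTuple		(tupleOffset = 0)
 *		offset 16: second IndexTuple	(tupleOffset = 16, MAXALIGN(16) == 16)
 *		offset 40: next free byte		(nextTupleOffset = 16 + MAXALIGN(24))
 *
 * MAXALIGN'ing each allocation keeps every saved tuple suitably aligned for
 * later use as scan->xs_itup by index-only scans.
 */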

/*
 * Setup state to save TIDs/items from a single posting list tuple.
 *
 * Saves an index item into so->currPos.items[itemIndex] for TID that is
 * returned to scan first.  Second or subsequent TIDs for posting list should
 * be saved by calling _bt_savepostingitem().
 *
 * Returns an offset into tuple storage space that main tuple is stored at if
 * needed.
 */
static int
_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
					  ItemPointer heapTid, IndexTuple itup)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	Assert(BTreeTupleIsPosting(itup));

	currItem->heapTid = *heapTid;
	currItem->indexOffset = offnum;
	if (so->currTuples)
	{
		/* Save base IndexTuple (truncate posting list) */
		IndexTuple	base;
		Size		itupsz = BTreeTupleGetPostingOffset(itup);

		itupsz = MAXALIGN(itupsz);
		currItem->tupleOffset = so->currPos.nextTupleOffset;
		base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
		memcpy(base, itup, itupsz);
		/* Defensively reduce work area index tuple header size */
		base->t_info &= ~INDEX_SIZE_MASK;
		base->t_info |= itupsz;
		so->currPos.nextTupleOffset += itupsz;

		return currItem->tupleOffset;
	}

	return 0;
}
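
/*
 * A note on the t_info manipulation above: the low-order bits of t_info
 * hold the tuple's size (INDEX_SIZE_MASK), so after copying only the
 * posting list tuple's "base" prefix we patch the copied header to claim
 * the truncated size.  The saved tuple then looks like an ordinary
 * non-posting tuple to any code that reads it back out of the work area.
 */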

/*
 * Save an index item into so->currPos.items[itemIndex] for current posting
 * tuple.
 *
 * Assumes that _bt_setuppostingitems() has already been called for current
 * posting list tuple.  Caller passes its return value as tupleOffset.
 */
static inline void
_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
					ItemPointer heapTid, int tupleOffset)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	currItem->heapTid = *heapTid;
	currItem->indexOffset = offnum;

	/*
	 * Have index-only scans return the same base IndexTuple for every TID
	 * that originates from the same posting list
	 */
	if (so->currTuples)
		currItem->tupleOffset = tupleOffset;
}
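
/*
 * Expected calling pattern, as in the forward scan case of _bt_readpage
 * above; every TID from one posting list shares a single copied base tuple:
 *
 *		tupleOffset =
 *			_bt_setuppostingitems(so, itemIndex, offnum,
 *								  BTreeTupleGetPostingN(itup, 0), itup);
 *		itemIndex++;
 *		for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
 *		{
 *			_bt_savepostingitem(so, itemIndex, offnum,
 *								BTreeTupleGetPostingN(itup, i), tupleOffset);
 *			itemIndex++;
 *		}
 */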

/*
 *	_bt_steppage() -- Step to next page containing valid data for scan
 *
 * On entry, if so->currPos.buf is valid the buffer is pinned but not
 * locked; if pinned, we'll drop the pin before moving to next page.  The
 * buffer is not locked on exit.
 *
 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
 * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
 * to InvalidBuffer.  We return true to indicate success.
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BlockNumber blkno = InvalidBlockNumber;
	bool		status;

	Assert(BTScanPosIsValid(so->currPos));

	/* Before leaving current page, deal with any killed items */
	if (so->numKilled > 0)
		_bt_killitems(scan);

	/*
	 * Before we modify currPos, make a copy of the page data if there was a
	 * mark position that needs it.
	 */
	if (so->markItemIndex >= 0)
	{
		/* bump pin on current buffer for assignment to mark buffer */
		if (BTScanPosIsPinned(so->currPos))
			IncrBufferRefCount(so->currPos.buf);
		memcpy(&so->markPos, &so->currPos,
			   offsetof(BTScanPosData, items[1]) +
			   so->currPos.lastItem * sizeof(BTScanPosItem));
		if (so->markTuples)
			memcpy(so->markTuples, so->currTuples,
				   so->currPos.nextTupleOffset);
		so->markPos.itemIndex = so->markItemIndex;
		so->markItemIndex = -1;

		/*
		 * If we're just about to start the next primitive index scan
		 * (possible with a scan that has array keys, and needs to skip to
		 * continue in the current scan direction), moreLeft/moreRight only
		 * indicate the end of the current primitive index scan.  They must
		 * never be taken to indicate that the top-level index scan has ended
		 * (that would be wrong).
		 *
		 * We could handle this case by treating the current array keys as
		 * markPos state.  But depending on the current array state like this
		 * would add complexity.  Instead, we just unset markPos's copy of
		 * moreRight or moreLeft (whichever might be affected), while making
		 * btrestrpos reset the scan's arrays to their initial scan positions.
		 * In effect, btrestrpos leaves advancing the arrays up to the first
		 * _bt_readpage call (that takes place after it has restored markPos).
		 */
		Assert(so->markPos.dir == dir);
		if (so->needPrimScan)
		{
			if (ScanDirectionIsForward(dir))
				so->markPos.moreRight = true;
			else
				so->markPos.moreLeft = true;
		}
	}

	if (ScanDirectionIsForward(dir))
	{
		/* Walk right to the next page with data */
		if (scan->parallel_scan != NULL)
		{
			/*
			 * Seize the scan to get the next block number; if the scan has
			 * ended already, bail out.
			 */
			status = _bt_parallel_seize(scan, &blkno, false);
			if (!status)
			{
				/* release the previous buffer, if pinned */
				BTScanPosUnpinIfPinned(so->currPos);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
		}
		else
		{
			/* Not parallel, so use the previously-saved nextPage link. */
			blkno = so->currPos.nextPage;
		}

		/* Remember we left a page with data */
		so->currPos.moreLeft = true;

		/* release the previous buffer, if pinned */
		BTScanPosUnpinIfPinned(so->currPos);
	}
	else
	{
		/* Remember we left a page with data */
		so->currPos.moreRight = true;

		if (scan->parallel_scan != NULL)
		{
			/*
			 * Seize the scan to get the current block number; if the scan
			 * has ended already, bail out.
			 */
			status = _bt_parallel_seize(scan, &blkno, false);
			BTScanPosUnpinIfPinned(so->currPos);
			if (!status)
			{
				BTScanPosInvalidate(so->currPos);
				return false;
			}
		}
		else
		{
			/* Not parallel, so just use our own notion of the current page */
			blkno = so->currPos.currPage;
		}
	}

	if (!_bt_readnextpage(scan, blkno, dir))
		return false;

	/* We have at least one item to return as scan's next item */
	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);

	return true;
}

/*
 *	_bt_readnextpage() -- Read next page containing valid data for scan
 *
 * On success exit, so->currPos is updated to contain data from the next
 * interesting page, and we return true.  Caller must release the lock (and
 * maybe the pin) on the buffer on success exit.
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Relation	rel;
	Page		page;
	BTPageOpaque opaque;
	bool		status;

	rel = scan->indexRelation;

	if (ScanDirectionIsForward(dir))
	{
		for (;;)
		{
			/*
			 * if we're at end of scan, give up and mark parallel scan as
			 * done, so that all the workers can finish their scan
			 */
			if (blkno == P_NONE || !so->currPos.moreRight)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
			/* check for interrupts while we're not holding any buffer lock */
			CHECK_FOR_INTERRUPTS();
			/* step right one page */
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			page = BufferGetPage(so->currPos.buf);
			opaque = BTPageGetOpaque(page);
			/* check for deleted page */
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, blkno, scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow next page be processed by parallel worker */
				_bt_parallel_release(scan, opaque->btpo_next);
			}

			/* nope, keep going */
			if (scan->parallel_scan != NULL)
			{
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno, false);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
			}
			else
			{
				blkno = opaque->btpo_next;
				_bt_relbuf(rel, so->currPos.buf);
			}
		}
	}
	else
	{
		/*
		 * Should only happen in parallel cases, when some other backend
		 * advanced the scan.
		 */
		if (so->currPos.currPage != blkno)
		{
			BTScanPosUnpinIfPinned(so->currPos);
			so->currPos.currPage = blkno;
		}

		/* Done if we know that the left sibling link isn't of interest */
		if (!so->currPos.moreLeft)
		{
			BTScanPosUnpinIfPinned(so->currPos);
			_bt_parallel_done(scan);
			BTScanPosInvalidate(so->currPos);
			return false;
		}

		/*
		 * Walk left to the next page with data.  This is much more complex
		 * than the walk-right case because of the possibility that the page
		 * to our left splits while we are in flight to it, plus the
		 * possibility that the page we were on gets deleted after we leave
		 * it.  See nbtree/README for details.
		 *
		 * It might be possible to rearrange this code to have less overhead
		 * in pinning and locking, but that would require capturing the left
		 * sibling block number when the page is initially read, and then
		 * optimistically starting there (rather than pinning the page twice).
		 * It is not clear that this would be worth the complexity.
		 */
		if (BTScanPosIsPinned(so->currPos))
			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
		else
			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);

		for (;;)
		{
			/* Done if we know that the left sibling link isn't of interest */
			if (!so->currPos.moreLeft)
			{
				_bt_relbuf(rel, so->currPos.buf);
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/* Step to next physical page */
			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);

			/* if we're physically at end of index, return failure */
			if (so->currPos.buf == InvalidBuffer)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/*
			 * Okay, we managed to move left to a non-deleted page. Done if
			 * it's not half-dead and contains matching tuples. Else loop back
			 * and do it all again.
			 */
			page = BufferGetPage(so->currPos.buf);
			opaque = BTPageGetOpaque(page);
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow next page be processed by parallel worker */
				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
			}

			/*
			 * For parallel scans, get the last page scanned as it is quite
			 * possible that by the time we try to seize the scan, some other
			 * worker has already advanced the scan to a different page.  We
			 * must continue based on the latest page scanned by any worker.
			 */
			if (scan->parallel_scan != NULL)
			{
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno, false);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
				so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			}
		}
	}

	return true;
}
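
/*
 * Informal summary of the parallel-scan handshake used above: a worker
 * calls _bt_parallel_seize() to obtain the right to advance the scan and
 * learn the next block number, advertises the page that follows via
 * _bt_parallel_release() so other workers can proceed, and calls
 * _bt_parallel_done() once no further pages remain.  A worker whose seize
 * attempt fails knows some other participant already finished the scan.
 */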

/*
 *	_bt_parallel_readpage() -- Read current page containing valid data for scan
 *
 * On success, release lock and maybe pin on buffer.  We return true to
 * indicate success.
 */
static bool
_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	Assert(!so->needPrimScan);

	_bt_initialize_more_data(so, dir);

	if (!_bt_readnextpage(scan, blkno, dir))
		return false;

	/* We have at least one item to return as scan's next item */
	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);

	return true;
}
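
/*
 * Note: this is reached from _bt_first() when a parallel worker joins a
 * scan whose current page was already established by another worker; the
 * worker picks up at the block number it was handed rather than descending
 * the tree again.
 */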

/*
 *	_bt_walk_left() -- step left one page, if possible
 *
 * The given buffer must be pinned and read-locked.  This will be dropped
 * before stepping left.  On return, we have pin and read lock on the
 * returned page, instead.
 *
 * Returns InvalidBuffer if there is no page to the left (no lock is held
 * in that case).
 *
 * It is possible for the returned leaf page to be half-dead; caller must
 * check that condition and step left again when required.
 */
static Buffer
_bt_walk_left(Relation rel, Buffer buf)
{
	Page		page;
	BTPageOpaque opaque;

	page = BufferGetPage(buf);
	opaque = BTPageGetOpaque(page);

	for (;;)
	{
		BlockNumber obknum;
		BlockNumber lblkno;
		BlockNumber blkno;
		int			tries;

		/* if we're at end of tree, release buf and return failure */
		if (P_LEFTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			break;
		}
		/* remember original page we are stepping left from */
		obknum = BufferGetBlockNumber(buf);
		/* step left */
		blkno = lblkno = opaque->btpo_prev;
		_bt_relbuf(rel, buf);
		/* check for interrupts while we're not holding any buffer lock */
		CHECK_FOR_INTERRUPTS();
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		/*
		 * If this isn't the page we want, walk right till we find what we
		 * want --- but go no more than four hops (an arbitrary limit). If we
		 * don't find the correct page by then, the most likely bet is that
		 * the original page got deleted and isn't in the sibling chain at all
		 * anymore, not that its left sibling got split more than four times.
		 *
		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
		 * because half-dead pages are still in the sibling chain.
		 */
		tries = 0;
		for (;;)
		{
			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
			{
				/* Found desired page, return it */
				return buf;
			}
			if (P_RIGHTMOST(opaque) || ++tries > 4)
				break;
			blkno = opaque->btpo_next;
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			opaque = BTPageGetOpaque(page);
		}

		/* Return to the original page to see what's up */
		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * It was deleted.  Move right to first nondeleted page (there
			 * must be one); that is the page that has acquired the deleted
			 * one's keyspace, so stepping left from it will take us where we
			 * want to be.
			 */
			for (;;)
			{
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
				blkno = opaque->btpo_next;
				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
				page = BufferGetPage(buf);
				opaque = BTPageGetOpaque(page);
				if (!P_ISDELETED(opaque))
					break;
			}

			/*
			 * Now return to top of loop, resetting obknum to point to this
			 * nondeleted page, and try again.
			 */
		}
		else
		{
			/*
			 * It wasn't deleted; the explanation had better be that the page
			 * to the left got split or deleted.  Without this check, we'd go
			 * into an infinite loop if there's anything wrong.
			 */
			if (opaque->btpo_prev == lblkno)
				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
					 obknum, RelationGetRelationName(rel));
			/* Okay to try again with new lblkno value */
		}
	}

	return InvalidBuffer;
}
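
/*
 * Worked example with hypothetical block numbers: we step left from block 7,
 * whose btpo_prev is 3, but 3 splits while we're in flight, leaving the
 * sibling chain 3 -> 9 -> 7.  Landing on 3, we see btpo_next == 9 rather
 * than 7, so we walk right; block 9 has btpo_next == 7, making it the true
 * left sibling, and it is returned.  The four-hop cap bounds this walk:
 * beyond that, the original page was more likely deleted, so we revisit it
 * instead.
 */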

/*
 *	_bt_get_endpoint() -- Find the first or last page on a given tree level
 *
 * If the index is empty, we will return InvalidBuffer; any other failure
 * condition causes ereport().  We will not return a dead page.
 *
 * The returned buffer is pinned and read-locked.
 */
Buffer
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
{
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber offnum;
	BlockNumber blkno;
	IndexTuple	itup;

	/*
	 * If we are looking for a leaf page, okay to descend from fast root;
	 * otherwise better descend from true root.  (There is no point in being
	 * smarter about intermediate levels.)
	 */
	if (level == 0)
		buf = _bt_getroot(rel, NULL, BT_READ);
	else
		buf = _bt_gettrueroot(rel);

	if (!BufferIsValid(buf))
		return InvalidBuffer;

	page = BufferGetPage(buf);
	opaque = BTPageGetOpaque(page);

	for (;;)
	{
		/*
		 * If we landed on a deleted page, step right to find a live page
		 * (there must be one).  Also, if we want the rightmost page, step
		 * right if needed to get to it (this could happen if the page split
		 * since we obtained a pointer to it).
		 */
		while (P_IGNORE(opaque) ||
			   (rightmost && !P_RIGHTMOST(opaque)))
		{
			blkno = opaque->btpo_next;
			if (blkno == P_NONE)
				elog(ERROR, "fell off the end of index \"%s\"",
					 RelationGetRelationName(rel));
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			opaque = BTPageGetOpaque(page);
		}

		/* Done? */
		if (opaque->btpo_level == level)
			break;
		if (opaque->btpo_level < level)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("btree level %u not found in index \"%s\"",
									 level, RelationGetRelationName(rel))));

		/* Descend to leftmost or rightmost child page */
		if (rightmost)
			offnum = PageGetMaxOffsetNumber(page);
		else
			offnum = P_FIRSTDATAKEY(opaque);

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
		blkno = BTreeTupleGetDownLink(itup);

		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);
	}

	return buf;
}
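
/*
 * For example, _bt_endpoint() below calls
 * _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)) to land directly
 * on the leftmost (forward scan) or rightmost (backward scan) leaf page.
 */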

/*
 *	_bt_endpoint() -- Find the first or last page in the index, and scan
 * from there to the first key satisfying all the quals.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).  Exit conditions are the
 * same as for _bt_first().
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber start;
	BTScanPosItem *currItem;

	/*
	 * Scan down to the leftmost or rightmost leaf page.  This is a
	 * simplified version of _bt_search().  We don't maintain a stack since
	 * we know we won't need it.
	 */
	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

	if (!BufferIsValid(buf))
	{
		/*
		 * Empty index.  Lock the whole relation, as nothing finer to lock
		 * exists.
		 */
		PredicateLockRelation(rel, scan->xs_snapshot);
		BTScanPosInvalidate(so->currPos);
		return false;
	}

	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
	page = BufferGetPage(buf);
	opaque = BTPageGetOpaque(page);
	Assert(P_ISLEAF(opaque));

	if (ScanDirectionIsForward(dir))
	{
		/* There could be dead pages to the left, so not this: */
		/* Assert(P_LEFTMOST(opaque)); */

		start = P_FIRSTDATAKEY(opaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(opaque));

		start = PageGetMaxOffsetNumber(page);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		start = 0;				/* keep compiler quiet */
	}

	/* remember which buffer we have pinned */
	so->currPos.buf = buf;

	_bt_initialize_more_data(so, dir);

	/*
	 * Now load data from the first page of the scan.
	 */
	if (!_bt_readpage(scan, dir, start, true))
	{
		/*
		 * There's no actually-matching data on this page.  Try to advance to
		 * the next page.  Return false if there's no matching data at all.
		 */
		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
		if (!_bt_steppage(scan, dir))
			return false;
	}
	else
	{
		/* We have at least one item to return as scan's first item */
		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
	}

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}

/*
 * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir
 */
static inline void
_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
{
	so->currPos.dir = dir;
	if (so->needPrimScan)
	{
		Assert(so->numArrayKeys);

		so->currPos.moreLeft = true;
		so->currPos.moreRight = true;
		so->needPrimScan = false;
	}
	else if (ScanDirectionIsForward(dir))
	{
		so->currPos.moreLeft = false;
		so->currPos.moreRight = true;
	}
	else
	{
		so->currPos.moreLeft = true;
		so->currPos.moreRight = false;
	}
	so->numKilled = 0;			/* just paranoia */
	so->markItemIndex = -1;		/* ditto */
}
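
/*
 * The resulting flag settings, summarized:
 *
 *		state						moreLeft	moreRight
 *		needPrimScan				true		true
 *		!needPrimScan, forward		false		true
 *		!needPrimScan, backward		true		false
 *
 * A fresh forward primitive scan can have nothing of interest to the left
 * of its starting point, and symmetrically for backward scans; a new
 * primitive scan within a scan that uses array keys must leave both
 * directions open until _bt_readpage says otherwise.
 */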