Fix xslt_process() to ensure that it inserts a NULL terminator after the
[PostgreSQL.git] / src / backend / access / nbtree / nbtsearch.c
blob12b4528f7b9ad4050de196cedef22257991efeb4
1 /*-------------------------------------------------------------------------
3 * nbtsearch.c
4 * Search code for postgres btrees.
7 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * $PostgreSQL$
13 *-------------------------------------------------------------------------
16 #include "postgres.h"
18 #include "access/genam.h"
19 #include "access/nbtree.h"
20 #include "access/relscan.h"
21 #include "miscadmin.h"
22 #include "pgstat.h"
23 #include "storage/bufmgr.h"
24 #include "utils/lsyscache.h"
25 #include "utils/rel.h"
28 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
29 OffsetNumber offnum);
30 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
31 static Buffer _bt_walk_left(Relation rel, Buffer buf);
32 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
36 * _bt_search() -- Search the tree for a particular scankey,
37 * or more precisely for the first leaf page it could be on.
39 * The passed scankey must be an insertion-type scankey (see nbtree/README),
40 * but it can omit the rightmost column(s) of the index.
42 * When nextkey is false (the usual case), we are looking for the first
43 * item >= scankey. When nextkey is true, we are looking for the first
44 * item strictly greater than scankey.
46 * Return value is a stack of parent-page pointers. *bufP is set to the
47 * address of the leaf-page buffer, which is read-locked and pinned.
48 * No locks are held on the parent pages, however!
50 * NOTE that the returned buffer is read-locked regardless of the access
51 * parameter. However, access = BT_WRITE will allow an empty root page
52 * to be created and returned. When access = BT_READ, an empty index
53 * will result in *bufP being set to InvalidBuffer.
55 BTStack
56 _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
57 Buffer *bufP, int access)
59 BTStack stack_in = NULL;
61 /* Get the root page to start with */
62 *bufP = _bt_getroot(rel, access);
64 /* If index is empty and access = BT_READ, no root page is created. */
65 if (!BufferIsValid(*bufP))
66 return (BTStack) NULL;
68 /* Loop iterates once per level descended in the tree */
69 for (;;)
71 Page page;
72 BTPageOpaque opaque;
73 OffsetNumber offnum;
74 ItemId itemid;
75 IndexTuple itup;
76 BlockNumber blkno;
77 BlockNumber par_blkno;
78 BTStack new_stack;
81 * Race -- the page we just grabbed may have split since we read its
82 * pointer in the parent (or metapage). If it has, we may need to
83 * move right to its new sibling. Do that.
85 *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
87 /* if this is a leaf page, we're done */
88 page = BufferGetPage(*bufP);
89 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
90 if (P_ISLEAF(opaque))
91 break;
94 * Find the appropriate item on the internal page, and get the child
95 * page that it points to.
97 offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
98 itemid = PageGetItemId(page, offnum);
99 itup = (IndexTuple) PageGetItem(page, itemid);
100 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
101 par_blkno = BufferGetBlockNumber(*bufP);
104 * We need to save the location of the index entry we chose in the
105 * parent page on a stack. In case we split the tree, we'll use the
106 * stack to work back up to the parent page. We also save the actual
107 * downlink (TID) to uniquely identify the index entry, in case it
108 * moves right while we're working lower in the tree. See the paper
109 * by Lehman and Yao for how this is detected and handled. (We use the
110 * child link to disambiguate duplicate keys in the index -- Lehman
111 * and Yao disallow duplicate keys.)
113 new_stack = (BTStack) palloc(sizeof(BTStackData));
114 new_stack->bts_blkno = par_blkno;
115 new_stack->bts_offset = offnum;
116 memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
117 new_stack->bts_parent = stack_in;
119 /* drop the read lock on the parent page, acquire one on the child */
120 *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
122 /* okay, all set to move down a level */
123 stack_in = new_stack;
126 return stack_in;
130 * _bt_moveright() -- move right in the btree if necessary.
132 * When we follow a pointer to reach a page, it is possible that
133 * the page has changed in the meanwhile. If this happens, we're
134 * guaranteed that the page has "split right" -- that is, that any
135 * data that appeared on the page originally is either on the page
136 * or strictly to the right of it.
138 * This routine decides whether or not we need to move right in the
139 * tree by examining the high key entry on the page. If that entry
140 * is strictly less than the scankey, or <= the scankey in the nextkey=true
141 * case, then we followed the wrong link and we need to move right.
143 * The passed scankey must be an insertion-type scankey (see nbtree/README),
144 * but it can omit the rightmost column(s) of the index.
146 * When nextkey is false (the usual case), we are looking for the first
147 * item >= scankey. When nextkey is true, we are looking for the first
148 * item strictly greater than scankey.
150 * On entry, we have the buffer pinned and a lock of the type specified by
151 * 'access'. If we move right, we release the buffer and lock and acquire
152 * the same on the right sibling. Return value is the buffer we stop at.
154 Buffer
155 _bt_moveright(Relation rel,
156 Buffer buf,
157 int keysz,
158 ScanKey scankey,
159 bool nextkey,
160 int access)
162 Page page;
163 BTPageOpaque opaque;
164 int32 cmpval;
166 page = BufferGetPage(buf);
167 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
170 * When nextkey = false (normal case): if the scan key that brought us to
171 * this page is > the high key stored on the page, then the page has split
172 * and we need to move right. (If the scan key is equal to the high key,
173 * we might or might not need to move right; have to scan the page first
174 * anyway.)
176 * When nextkey = true: move right if the scan key is >= page's high key.
178 * The page could even have split more than once, so scan as far as
179 * needed.
181 * We also have to move right if we followed a link that brought us to a
182 * dead page.
184 cmpval = nextkey ? 0 : 1;
186 while (!P_RIGHTMOST(opaque) &&
187 (P_IGNORE(opaque) ||
188 _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
190 /* step right one page */
191 BlockNumber rblkno = opaque->btpo_next;
193 buf = _bt_relandgetbuf(rel, buf, rblkno, access);
194 page = BufferGetPage(buf);
195 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
198 if (P_IGNORE(opaque))
199 elog(ERROR, "fell off the end of index \"%s\"",
200 RelationGetRelationName(rel));
202 return buf;
206 * _bt_binsrch() -- Do a binary search for a key on a particular page.
208 * The passed scankey must be an insertion-type scankey (see nbtree/README),
209 * but it can omit the rightmost column(s) of the index.
211 * When nextkey is false (the usual case), we are looking for the first
212 * item >= scankey. When nextkey is true, we are looking for the first
213 * item strictly greater than scankey.
215 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
216 * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
217 * particular, this means it is possible to return a value 1 greater than the
218 * number of keys on the page, if the scankey is > all keys on the page.)
220 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
221 * of the last key < given scankey, or last key <= given scankey if nextkey
222 * is true. (Since _bt_compare treats the first data key of such a page as
223 * minus infinity, there will be at least one key < scankey, so the result
224 * always points at one of the keys on the page.) This key indicates the
225 * right place to descend to be sure we find all leaf keys >= given scankey
226 * (or leaf keys > given scankey when nextkey is true).
228 * This procedure is not responsible for walking right, it just examines
229 * the given page. _bt_binsrch() has no lock or refcount side effects
230 * on the buffer.
232 OffsetNumber
233 _bt_binsrch(Relation rel,
234 Buffer buf,
235 int keysz,
236 ScanKey scankey,
237 bool nextkey)
239 Page page;
240 BTPageOpaque opaque;
241 OffsetNumber low,
242 high;
243 int32 result,
244 cmpval;
246 page = BufferGetPage(buf);
247 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
249 low = P_FIRSTDATAKEY(opaque);
250 high = PageGetMaxOffsetNumber(page);
253 * If there are no keys on the page, return the first available slot. Note
254 * this covers two cases: the page is really empty (no keys), or it
255 * contains only a high key. The latter case is possible after vacuuming.
256 * This can never happen on an internal page, however, since they are
257 * never empty (an internal page must have children).
259 if (high < low)
260 return low;
263 * Binary search to find the first key on the page >= scan key, or first
264 * key > scankey when nextkey is true.
266 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
267 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
269 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
270 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
272 * We can fall out when high == low.
274 high++; /* establish the loop invariant for high */
276 cmpval = nextkey ? 0 : 1; /* select comparison value */
278 while (high > low)
280 OffsetNumber mid = low + ((high - low) / 2);
282 /* We have low <= mid < high, so mid points at a real slot */
284 result = _bt_compare(rel, keysz, scankey, page, mid);
286 if (result >= cmpval)
287 low = mid + 1;
288 else
289 high = mid;
293 * At this point we have high == low, but be careful: they could point
294 * past the last slot on the page.
296 * On a leaf page, we always return the first key >= scan key (resp. >
297 * scan key), which could be the last slot + 1.
299 if (P_ISLEAF(opaque))
300 return low;
303 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
304 * There must be one if _bt_compare() is playing by the rules.
306 Assert(low > P_FIRSTDATAKEY(opaque));
308 return OffsetNumberPrev(low);
311 /*----------
312 * _bt_compare() -- Compare scankey to a particular tuple on the page.
314 * The passed scankey must be an insertion-type scankey (see nbtree/README),
315 * but it can omit the rightmost column(s) of the index.
317 * keysz: number of key conditions to be checked (might be less than the
318 * number of index columns!)
319 * page/offnum: location of btree item to be compared to.
321 * This routine returns:
322 * <0 if scankey < tuple at offnum;
323 * 0 if scankey == tuple at offnum;
324 * >0 if scankey > tuple at offnum.
325 * NULLs in the keys are treated as sortable values. Therefore
326 * "equality" does not necessarily mean that the item should be
327 * returned to the caller as a matching key!
329 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
330 * "minus infinity": this routine will always claim it is less than the
331 * scankey. The actual key value stored (if any, which there probably isn't)
332 * does not matter. This convention allows us to implement the Lehman and
333 * Yao convention that the first down-link pointer is before the first key.
334 * See backend/access/nbtree/README for details.
335 *----------
337 int32
338 _bt_compare(Relation rel,
339 int keysz,
340 ScanKey scankey,
341 Page page,
342 OffsetNumber offnum)
344 TupleDesc itupdesc = RelationGetDescr(rel);
345 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
346 IndexTuple itup;
347 int i;
350 * Force result ">" if target item is first data item on an internal page
351 * --- see NOTE above.
353 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
354 return 1;
356 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
359 * The scan key is set up with the attribute number associated with each
360 * term in the key. It is important that, if the index is multi-key, the
361 * scan contain the first k key attributes, and that they be in order. If
362 * you think about how multi-key ordering works, you'll understand why
363 * this is.
365 * We don't test for violation of this condition here, however. The
366 * initial setup for the index scan had better have gotten it right (see
367 * _bt_first).
370 for (i = 1; i <= keysz; i++)
372 Datum datum;
373 bool isNull;
374 int32 result;
376 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
378 /* see comments about NULLs handling in btbuild */
379 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
381 if (isNull)
382 result = 0; /* NULL "=" NULL */
383 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
384 result = -1; /* NULL "<" NOT_NULL */
385 else
386 result = 1; /* NULL ">" NOT_NULL */
388 else if (isNull) /* key is NOT_NULL and item is NULL */
390 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
391 result = 1; /* NOT_NULL ">" NULL */
392 else
393 result = -1; /* NOT_NULL "<" NULL */
395 else
398 * The sk_func needs to be passed the index value as left arg and
399 * the sk_argument as right arg (they might be of different
400 * types). Since it is convenient for callers to think of
401 * _bt_compare as comparing the scankey to the index item, we have
402 * to flip the sign of the comparison result. (Unless it's a DESC
403 * column, in which case we *don't* flip the sign.)
405 result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
406 datum,
407 scankey->sk_argument));
409 if (!(scankey->sk_flags & SK_BT_DESC))
410 result = -result;
413 /* if the keys are unequal, return the difference */
414 if (result != 0)
415 return result;
417 scankey++;
420 /* if we get here, the keys are equal */
421 return 0;
425 * _bt_first() -- Find the first item in a scan.
427 * We need to be clever about the direction of scan, the search
428 * conditions, and the tree ordering. We find the first item (or,
429 * if backwards scan, the last item) in the tree that satisfies the
430 * qualifications in the scan key. On success exit, the page containing
431 * the current index tuple is pinned but not locked, and data about
432 * the matching tuple(s) on the page has been loaded into so->currPos,
433 * and scan->xs_ctup.t_self is set to the heap TID of the current tuple.
435 * If there are no matching items in the index, we return FALSE, with no
436 * pins or locks held.
438 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
439 * are both search-type scankeys (see nbtree/README for more about this).
440 * Within this routine, we build a temporary insertion-type scankey to use
441 * in locating the scan start position.
443 bool
444 _bt_first(IndexScanDesc scan, ScanDirection dir)
446 Relation rel = scan->indexRelation;
447 BTScanOpaque so = (BTScanOpaque) scan->opaque;
448 Buffer buf;
449 BTStack stack;
450 OffsetNumber offnum;
451 StrategyNumber strat;
452 bool nextkey;
453 bool goback;
454 ScanKey startKeys[INDEX_MAX_KEYS];
455 ScanKeyData scankeys[INDEX_MAX_KEYS];
456 int keysCount = 0;
457 int i;
458 StrategyNumber strat_total;
460 pgstat_count_index_scan(rel);
463 * Examine the scan keys and eliminate any redundant keys; also mark the
464 * keys that must be matched to continue the scan.
466 _bt_preprocess_keys(scan);
469 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
470 * never be satisfied (eg, x == 1 AND x > 2).
472 if (!so->qual_ok)
473 return false;
475 /*----------
476 * Examine the scan keys to discover where we need to start the scan.
478 * We want to identify the keys that can be used as starting boundaries;
479 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
480 * a backwards scan. We can use keys for multiple attributes so long as
481 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
482 * a > or < boundary or find an attribute with no boundary (which can be
483 * thought of as the same as "> -infinity"), we can't use keys for any
484 * attributes to its right, because it would break our simplistic notion
485 * of what initial positioning strategy to use.
487 * When the scan keys include cross-type operators, _bt_preprocess_keys
488 * may not be able to eliminate redundant keys; in such cases we will
489 * arbitrarily pick a usable one for each attribute. This is correct
490 * but possibly not optimal behavior. (For example, with keys like
491 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
492 * x=5 would be more efficient.) Since the situation only arises given
493 * a poorly-worded query plus an incomplete opfamily, live with it.
495 * When both equality and inequality keys appear for a single attribute
496 * (again, only possible when cross-type operators appear), we *must*
497 * select one of the equality keys for the starting point, because
498 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
499 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
500 * start at x=4, we will fail and stop before reaching x=10. If multiple
501 * equality quals survive preprocessing, however, it doesn't matter which
502 * one we use --- by definition, they are either redundant or
503 * contradictory.
505 * In this loop, row-comparison keys are treated the same as keys on their
506 * first (leftmost) columns. We'll add on lower-order columns of the row
507 * comparison below, if possible.
509 * The selected scan keys (at most one per index column) are remembered by
510 * storing their addresses into the local startKeys[] array.
511 *----------
513 strat_total = BTEqualStrategyNumber;
514 if (so->numberOfKeys > 0)
516 AttrNumber curattr;
517 ScanKey chosen;
518 ScanKey cur;
521 * chosen is the so-far-chosen key for the current attribute, if any.
522 * We don't cast the decision in stone until we reach keys for the
523 * next attribute.
525 curattr = 1;
526 chosen = NULL;
529 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
530 * pass to handle after-last-key processing. Actual exit from the
531 * loop is at one of the "break" statements below.
533 for (cur = so->keyData, i = 0;; cur++, i++)
535 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
538 * Done looking at keys for curattr. If we didn't find a
539 * usable boundary key, quit; else save the boundary key
540 * pointer in startKeys.
542 if (chosen == NULL)
543 break;
544 startKeys[keysCount++] = chosen;
547 * Adjust strat_total, and quit if we have stored a > or <
548 * key.
550 strat = chosen->sk_strategy;
551 if (strat != BTEqualStrategyNumber)
553 strat_total = strat;
554 if (strat == BTGreaterStrategyNumber ||
555 strat == BTLessStrategyNumber)
556 break;
560 * Done if that was the last attribute, or if next key is not
561 * in sequence (implying no boundary key is available for the
562 * next attribute).
564 if (i >= so->numberOfKeys ||
565 cur->sk_attno != curattr + 1)
566 break;
569 * Reset for next attr.
571 curattr = cur->sk_attno;
572 chosen = NULL;
575 /* Can we use this key as a starting boundary for this attr? */
576 switch (cur->sk_strategy)
578 case BTLessStrategyNumber:
579 case BTLessEqualStrategyNumber:
580 if (chosen == NULL && ScanDirectionIsBackward(dir))
581 chosen = cur;
582 break;
583 case BTEqualStrategyNumber:
584 /* override any non-equality choice */
585 chosen = cur;
586 break;
587 case BTGreaterEqualStrategyNumber:
588 case BTGreaterStrategyNumber:
589 if (chosen == NULL && ScanDirectionIsForward(dir))
590 chosen = cur;
591 break;
597 * If we found no usable boundary keys, we have to start from one end of
598 * the tree. Walk down that edge to the first or last key, and scan from
599 * there.
601 if (keysCount == 0)
602 return _bt_endpoint(scan, dir);
605 * We want to start the scan somewhere within the index. Set up an
606 * insertion scankey we can use to search for the boundary point we
607 * identified above. The insertion scankey is built in the local
608 * scankeys[] array, using the keys identified by startKeys[].
610 Assert(keysCount <= INDEX_MAX_KEYS);
611 for (i = 0; i < keysCount; i++)
613 ScanKey cur = startKeys[i];
615 Assert(cur->sk_attno == i + 1);
617 if (cur->sk_flags & SK_ROW_HEADER)
620 * Row comparison header: look to the first row member instead.
622 * The member scankeys are already in insertion format (ie, they
623 * have sk_func = 3-way-comparison function), but we have to watch
624 * out for nulls, which _bt_preprocess_keys didn't check. A null
625 * in the first row member makes the condition unmatchable, just
626 * like qual_ok = false.
628 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
630 Assert(subkey->sk_flags & SK_ROW_MEMBER);
631 if (subkey->sk_flags & SK_ISNULL)
632 return false;
633 memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
636 * If the row comparison is the last positioning key we accepted,
637 * try to add additional keys from the lower-order row members.
638 * (If we accepted independent conditions on additional index
639 * columns, we use those instead --- doesn't seem worth trying to
640 * determine which is more restrictive.) Note that this is OK
641 * even if the row comparison is of ">" or "<" type, because the
642 * condition applied to all but the last row member is effectively
643 * ">=" or "<=", and so the extra keys don't break the positioning
644 * scheme. But, by the same token, if we aren't able to use all
645 * the row members, then the part of the row comparison that we
646 * did use has to be treated as just a ">=" or "<=" condition, and
647 * so we'd better adjust strat_total accordingly.
649 if (i == keysCount - 1)
651 bool used_all_subkeys = false;
653 Assert(!(subkey->sk_flags & SK_ROW_END));
654 for (;;)
656 subkey++;
657 Assert(subkey->sk_flags & SK_ROW_MEMBER);
658 if (subkey->sk_attno != keysCount + 1)
659 break; /* out-of-sequence, can't use it */
660 if (subkey->sk_strategy != cur->sk_strategy)
661 break; /* wrong direction, can't use it */
662 if (subkey->sk_flags & SK_ISNULL)
663 break; /* can't use null keys */
664 Assert(keysCount < INDEX_MAX_KEYS);
665 memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
666 keysCount++;
667 if (subkey->sk_flags & SK_ROW_END)
669 used_all_subkeys = true;
670 break;
673 if (!used_all_subkeys)
675 switch (strat_total)
677 case BTLessStrategyNumber:
678 strat_total = BTLessEqualStrategyNumber;
679 break;
680 case BTGreaterStrategyNumber:
681 strat_total = BTGreaterEqualStrategyNumber;
682 break;
685 break; /* done with outer loop */
688 else
691 * Ordinary comparison key. Transform the search-style scan key
692 * to an insertion scan key by replacing the sk_func with the
693 * appropriate btree comparison function.
695 * If scankey operator is not a cross-type comparison, we can use
696 * the cached comparison function; otherwise gotta look it up in
697 * the catalogs. (That can't lead to infinite recursion, since no
698 * indexscan initiated by syscache lookup will use cross-data-type
699 * operators.)
701 * We support the convention that sk_subtype == InvalidOid means
702 * the opclass input type; this is a hack to simplify life for
703 * ScanKeyInit().
705 if (cur->sk_subtype == rel->rd_opcintype[i] ||
706 cur->sk_subtype == InvalidOid)
708 FmgrInfo *procinfo;
710 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
711 ScanKeyEntryInitializeWithInfo(scankeys + i,
712 cur->sk_flags,
713 cur->sk_attno,
714 InvalidStrategy,
715 cur->sk_subtype,
716 procinfo,
717 cur->sk_argument);
719 else
721 RegProcedure cmp_proc;
723 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
724 rel->rd_opcintype[i],
725 cur->sk_subtype,
726 BTORDER_PROC);
727 if (!RegProcedureIsValid(cmp_proc))
728 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
729 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
730 cur->sk_attno, RelationGetRelationName(rel));
731 ScanKeyEntryInitialize(scankeys + i,
732 cur->sk_flags,
733 cur->sk_attno,
734 InvalidStrategy,
735 cur->sk_subtype,
736 cmp_proc,
737 cur->sk_argument);
742 /*----------
743 * Examine the selected initial-positioning strategy to determine exactly
744 * where we need to start the scan, and set flag variables to control the
745 * code below.
747 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
748 * item >= scan key. If nextkey = true, they will locate the first
749 * item > scan key.
751 * If goback = true, we will then step back one item, while if
752 * goback = false, we will start the scan on the located item.
753 *----------
755 switch (strat_total)
757 case BTLessStrategyNumber:
760 * Find first item >= scankey, then back up one to arrive at last
761 * item < scankey. (Note: this positioning strategy is only used
762 * for a backward scan, so that is always the correct starting
763 * position.)
765 nextkey = false;
766 goback = true;
767 break;
769 case BTLessEqualStrategyNumber:
772 * Find first item > scankey, then back up one to arrive at last
773 * item <= scankey. (Note: this positioning strategy is only used
774 * for a backward scan, so that is always the correct starting
775 * position.)
777 nextkey = true;
778 goback = true;
779 break;
781 case BTEqualStrategyNumber:
784 * If a backward scan was specified, need to start with last equal
785 * item not first one.
787 if (ScanDirectionIsBackward(dir))
790 * This is the same as the <= strategy. We will check at the
791 * end whether the found item is actually =.
793 nextkey = true;
794 goback = true;
796 else
799 * This is the same as the >= strategy. We will check at the
800 * end whether the found item is actually =.
802 nextkey = false;
803 goback = false;
805 break;
807 case BTGreaterEqualStrategyNumber:
810 * Find first item >= scankey. (This is only used for forward
811 * scans.)
813 nextkey = false;
814 goback = false;
815 break;
817 case BTGreaterStrategyNumber:
820 * Find first item > scankey. (This is only used for forward
821 * scans.)
823 nextkey = true;
824 goback = false;
825 break;
827 default:
828 /* can't get here, but keep compiler quiet */
829 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
830 return false;
834 * Use the manufactured insertion scan key to descend the tree and
835 * position ourselves on the target leaf page.
837 stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
839 /* don't need to keep the stack around... */
840 _bt_freestack(stack);
842 /* remember which buffer we have pinned, if any */
843 so->currPos.buf = buf;
845 if (!BufferIsValid(buf))
847 /* Only get here if index is completely empty */
848 return false;
851 /* initialize moreLeft/moreRight appropriately for scan direction */
852 if (ScanDirectionIsForward(dir))
854 so->currPos.moreLeft = false;
855 so->currPos.moreRight = true;
857 else
859 so->currPos.moreLeft = true;
860 so->currPos.moreRight = false;
862 so->numKilled = 0; /* just paranoia */
863 so->markItemIndex = -1; /* ditto */
865 /* position to the precise item on the page */
866 offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
869 * If nextkey = false, we are positioned at the first item >= scan key, or
870 * possibly at the end of a page on which all the existing items are less
871 * than the scan key and we know that everything on later pages is greater
872 * than or equal to scan key.
874 * If nextkey = true, we are positioned at the first item > scan key, or
875 * possibly at the end of a page on which all the existing items are less
876 * than or equal to the scan key and we know that everything on later
877 * pages is greater than scan key.
879 * The actually desired starting point is either this item or the prior
880 * one, or in the end-of-page case it's the first item on the next page or
881 * the last item on this page. Adjust the starting offset if needed. (If
882 * this results in an offset before the first item or after the last one,
883 * _bt_readpage will report no items found, and then we'll step to the
884 * next page as needed.)
886 if (goback)
887 offnum = OffsetNumberPrev(offnum);
890 * Now load data from the first page of the scan.
892 if (!_bt_readpage(scan, dir, offnum))
895 * There's no actually-matching data on this page. Try to advance to
896 * the next page. Return false if there's no matching data at all.
898 if (!_bt_steppage(scan, dir))
899 return false;
902 /* Drop the lock, but not pin, on the current page */
903 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
905 /* OK, itemIndex says what to return */
906 scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
908 return true;
912 * _bt_next() -- Get the next item in a scan.
914 * On entry, so->currPos describes the current page, which is pinned
915 * but not locked, and so->currPos.itemIndex identifies which item was
916 * previously returned.
918 * On successful exit, scan->xs_ctup.t_self is set to the TID of the
919 * next heap tuple, and so->currPos is updated as needed.
921 * On failure exit (no more tuples), we release pin and set
922 * so->currPos.buf to InvalidBuffer.
924 bool
925 _bt_next(IndexScanDesc scan, ScanDirection dir)
927 BTScanOpaque so = (BTScanOpaque) scan->opaque;
930 * Advance to next tuple on current page; or if there's no more, try to
931 * step to the next page with data.
933 if (ScanDirectionIsForward(dir))
935 if (++so->currPos.itemIndex > so->currPos.lastItem)
937 /* We must acquire lock before applying _bt_steppage */
938 Assert(BufferIsValid(so->currPos.buf));
939 LockBuffer(so->currPos.buf, BT_READ);
940 if (!_bt_steppage(scan, dir))
941 return false;
942 /* Drop the lock, but not pin, on the new page */
943 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
946 else
948 if (--so->currPos.itemIndex < so->currPos.firstItem)
950 /* We must acquire lock before applying _bt_steppage */
951 Assert(BufferIsValid(so->currPos.buf));
952 LockBuffer(so->currPos.buf, BT_READ);
953 if (!_bt_steppage(scan, dir))
954 return false;
955 /* Drop the lock, but not pin, on the new page */
956 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
960 /* OK, itemIndex says what to return */
961 scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
963 return true;
967 * _bt_readpage() -- Load data from current index page into so->currPos
969 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
970 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
971 * they are updated as appropriate. All other fields of so->currPos are
972 * initialized from scratch here.
974 * We scan the current page starting at offnum and moving in the indicated
975 * direction. All items matching the scan keys are loaded into currPos.items.
976 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
977 * that there can be no more matching tuples in the current scan direction.
979 * Returns true if any matching items found on the page, false if none.
981 static bool
982 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
984 BTScanOpaque so = (BTScanOpaque) scan->opaque;
985 Page page;
986 BTPageOpaque opaque;
987 OffsetNumber minoff;
988 OffsetNumber maxoff;
989 int itemIndex;
990 bool continuescan;
992 /* we must have the buffer pinned and locked */
993 Assert(BufferIsValid(so->currPos.buf));
995 page = BufferGetPage(so->currPos.buf);
996 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
997 minoff = P_FIRSTDATAKEY(opaque);
998 maxoff = PageGetMaxOffsetNumber(page);
1001 * we must save the page's right-link while scanning it; this tells us
1002 * where to step right to after we're done with these items. There is no
1003 * corresponding need for the left-link, since splits always go right.
1005 so->currPos.nextPage = opaque->btpo_next;
1007 if (ScanDirectionIsForward(dir))
1009 /* load items[] in ascending order */
1010 itemIndex = 0;
1012 offnum = Max(offnum, minoff);
1014 while (offnum <= maxoff)
1016 if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
1018 /* tuple passes all scan key conditions, so remember it */
1019 /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
1020 so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
1021 so->currPos.items[itemIndex].indexOffset = offnum;
1022 itemIndex++;
1024 if (!continuescan)
1026 /* there can't be any more matches, so stop */
1027 so->currPos.moreRight = false;
1028 break;
1031 offnum = OffsetNumberNext(offnum);
1034 Assert(itemIndex <= MaxIndexTuplesPerPage);
1035 so->currPos.firstItem = 0;
1036 so->currPos.lastItem = itemIndex - 1;
1037 so->currPos.itemIndex = 0;
1039 else
1041 /* load items[] in descending order */
1042 itemIndex = MaxIndexTuplesPerPage;
1044 offnum = Min(offnum, maxoff);
1046 while (offnum >= minoff)
1048 if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
1050 /* tuple passes all scan key conditions, so remember it */
1051 /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
1052 itemIndex--;
1053 so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
1054 so->currPos.items[itemIndex].indexOffset = offnum;
1056 if (!continuescan)
1058 /* there can't be any more matches, so stop */
1059 so->currPos.moreLeft = false;
1060 break;
1063 offnum = OffsetNumberPrev(offnum);
1066 Assert(itemIndex >= 0);
1067 so->currPos.firstItem = itemIndex;
1068 so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1069 so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1072 return (so->currPos.firstItem <= so->currPos.lastItem);
1076 * _bt_steppage() -- Step to next page containing valid data for scan
1078 * On entry, so->currPos.buf must be pinned and read-locked. We'll drop
1079 * the lock and pin before moving to next page.
1081 * On success exit, we hold pin and read-lock on the next interesting page,
1082 * and so->currPos is updated to contain data from that page.
1084 * If there are no more matching records in the given direction, we drop all
1085 * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
1087 static bool
1088 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1090 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1091 Relation rel;
1092 Page page;
1093 BTPageOpaque opaque;
1095 /* we must have the buffer pinned and locked */
1096 Assert(BufferIsValid(so->currPos.buf));
1098 /* Before leaving current page, deal with any killed items */
1099 if (so->numKilled > 0)
1100 _bt_killitems(scan, true);
1103 * Before we modify currPos, make a copy of the page data if there was a
1104 * mark position that needs it.
1106 if (so->markItemIndex >= 0)
1108 /* bump pin on current buffer for assignment to mark buffer */
1109 IncrBufferRefCount(so->currPos.buf);
1110 memcpy(&so->markPos, &so->currPos,
1111 offsetof(BTScanPosData, items[1]) +
1112 so->currPos.lastItem * sizeof(BTScanPosItem));
1113 so->markPos.itemIndex = so->markItemIndex;
1114 so->markItemIndex = -1;
1117 rel = scan->indexRelation;
1119 if (ScanDirectionIsForward(dir))
1121 /* Walk right to the next page with data */
1122 /* We must rely on the previously saved nextPage link! */
1123 BlockNumber blkno = so->currPos.nextPage;
1125 /* Remember we left a page with data */
1126 so->currPos.moreLeft = true;
1128 for (;;)
1130 /* release the previous buffer */
1131 _bt_relbuf(rel, so->currPos.buf);
1132 so->currPos.buf = InvalidBuffer;
1133 /* if we're at end of scan, give up */
1134 if (blkno == P_NONE || !so->currPos.moreRight)
1135 return false;
1136 /* check for interrupts while we're not holding any buffer lock */
1137 CHECK_FOR_INTERRUPTS();
1138 /* step right one page */
1139 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1140 /* check for deleted page */
1141 page = BufferGetPage(so->currPos.buf);
1142 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1143 if (!P_IGNORE(opaque))
1145 /* see if there are any matches on this page */
1146 /* note that this will clear moreRight if we can stop */
1147 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1148 break;
1150 /* nope, keep going */
1151 blkno = opaque->btpo_next;
1154 else
1156 /* Remember we left a page with data */
1157 so->currPos.moreRight = true;
1160 * Walk left to the next page with data. This is much more complex
1161 * than the walk-right case because of the possibility that the page
1162 * to our left splits while we are in flight to it, plus the
1163 * possibility that the page we were on gets deleted after we leave
1164 * it. See nbtree/README for details.
1166 for (;;)
1168 /* Done if we know there are no matching keys to the left */
1169 if (!so->currPos.moreLeft)
1171 _bt_relbuf(rel, so->currPos.buf);
1172 so->currPos.buf = InvalidBuffer;
1173 return false;
1176 /* Step to next physical page */
1177 so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
1179 /* if we're physically at end of index, return failure */
1180 if (so->currPos.buf == InvalidBuffer)
1181 return false;
1184 * Okay, we managed to move left to a non-deleted page. Done if
1185 * it's not half-dead and contains matching tuples. Else loop back
1186 * and do it all again.
1188 page = BufferGetPage(so->currPos.buf);
1189 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1190 if (!P_IGNORE(opaque))
1192 /* see if there are any matches on this page */
1193 /* note that this will clear moreLeft if we can stop */
1194 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1195 break;
1200 return true;
1204 * _bt_walk_left() -- step left one page, if possible
1206 * The given buffer must be pinned and read-locked. This will be dropped
1207 * before stepping left. On return, we have pin and read lock on the
1208 * returned page, instead.
1210 * Returns InvalidBuffer if there is no page to the left (no lock is held
1211 * in that case).
1213 * When working on a non-leaf level, it is possible for the returned page
1214 * to be half-dead; the caller should check that condition and step left
1215 * again if it's important.
1217 static Buffer
1218 _bt_walk_left(Relation rel, Buffer buf)
1220 Page page;
1221 BTPageOpaque opaque;
1223 page = BufferGetPage(buf);
1224 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1226 for (;;)
1228 BlockNumber obknum;
1229 BlockNumber lblkno;
1230 BlockNumber blkno;
1231 int tries;
1233 /* if we're at end of tree, release buf and return failure */
1234 if (P_LEFTMOST(opaque))
1236 _bt_relbuf(rel, buf);
1237 break;
1239 /* remember original page we are stepping left from */
1240 obknum = BufferGetBlockNumber(buf);
1241 /* step left */
1242 blkno = lblkno = opaque->btpo_prev;
1243 _bt_relbuf(rel, buf);
1244 /* check for interrupts while we're not holding any buffer lock */
1245 CHECK_FOR_INTERRUPTS();
1246 buf = _bt_getbuf(rel, blkno, BT_READ);
1247 page = BufferGetPage(buf);
1248 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1251 * If this isn't the page we want, walk right till we find what we
1252 * want --- but go no more than four hops (an arbitrary limit). If we
1253 * don't find the correct page by then, the most likely bet is that
1254 * the original page got deleted and isn't in the sibling chain at all
1255 * anymore, not that its left sibling got split more than four times.
1257 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1258 * because half-dead pages are still in the sibling chain. Caller
1259 * must reject half-dead pages if wanted.
1261 tries = 0;
1262 for (;;)
1264 if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1266 /* Found desired page, return it */
1267 return buf;
1269 if (P_RIGHTMOST(opaque) || ++tries > 4)
1270 break;
1271 blkno = opaque->btpo_next;
1272 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1273 page = BufferGetPage(buf);
1274 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1277 /* Return to the original page to see what's up */
1278 buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1279 page = BufferGetPage(buf);
1280 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1281 if (P_ISDELETED(opaque))
1284 * It was deleted. Move right to first nondeleted page (there
1285 * must be one); that is the page that has acquired the deleted
1286 * one's keyspace, so stepping left from it will take us where we
1287 * want to be.
1289 for (;;)
1291 if (P_RIGHTMOST(opaque))
1292 elog(ERROR, "fell off the end of index \"%s\"",
1293 RelationGetRelationName(rel));
1294 blkno = opaque->btpo_next;
1295 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1296 page = BufferGetPage(buf);
1297 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1298 if (!P_ISDELETED(opaque))
1299 break;
1303 * Now return to top of loop, resetting obknum to point to this
1304 * nondeleted page, and try again.
1307 else
1310 * It wasn't deleted; the explanation had better be that the page
1311 * to the left got split or deleted. Without this check, we'd go
1312 * into an infinite loop if there's anything wrong.
1314 if (opaque->btpo_prev == lblkno)
1315 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
1316 obknum, RelationGetRelationName(rel));
1317 /* Okay to try again with new lblkno value */
1321 return InvalidBuffer;
1325 * _bt_get_endpoint() -- Find the first or last page on a given tree level
1327 * If the index is empty, we will return InvalidBuffer; any other failure
1328 * condition causes ereport(). We will not return a dead page.
1330 * The returned buffer is pinned and read-locked.
1332 Buffer
1333 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
1335 Buffer buf;
1336 Page page;
1337 BTPageOpaque opaque;
1338 OffsetNumber offnum;
1339 BlockNumber blkno;
1340 IndexTuple itup;
1343 * If we are looking for a leaf page, okay to descend from fast root;
1344 * otherwise better descend from true root. (There is no point in being
1345 * smarter about intermediate levels.)
1347 if (level == 0)
1348 buf = _bt_getroot(rel, BT_READ);
1349 else
1350 buf = _bt_gettrueroot(rel);
1352 if (!BufferIsValid(buf))
1354 /* empty index... */
1355 return InvalidBuffer;
1358 page = BufferGetPage(buf);
1359 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1361 for (;;)
1364 * If we landed on a deleted page, step right to find a live page
1365 * (there must be one). Also, if we want the rightmost page, step
1366 * right if needed to get to it (this could happen if the page split
1367 * since we obtained a pointer to it).
1369 while (P_IGNORE(opaque) ||
1370 (rightmost && !P_RIGHTMOST(opaque)))
1372 blkno = opaque->btpo_next;
1373 if (blkno == P_NONE)
1374 elog(ERROR, "fell off the end of index \"%s\"",
1375 RelationGetRelationName(rel));
1376 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1377 page = BufferGetPage(buf);
1378 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1381 /* Done? */
1382 if (opaque->btpo.level == level)
1383 break;
1384 if (opaque->btpo.level < level)
1385 elog(ERROR, "btree level %u not found in index \"%s\"",
1386 level, RelationGetRelationName(rel));
1388 /* Descend to leftmost or rightmost child page */
1389 if (rightmost)
1390 offnum = PageGetMaxOffsetNumber(page);
1391 else
1392 offnum = P_FIRSTDATAKEY(opaque);
1394 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1395 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1397 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1398 page = BufferGetPage(buf);
1399 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1402 return buf;
1406 * _bt_endpoint() -- Find the first or last page in the index, and scan
1407 * from there to the first key satisfying all the quals.
1409 * This is used by _bt_first() to set up a scan when we've determined
1410 * that the scan must start at the beginning or end of the index (for
1411 * a forward or backward scan respectively). Exit conditions are the
1412 * same as for _bt_first().
1414 static bool
1415 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
1417 Relation rel = scan->indexRelation;
1418 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1419 Buffer buf;
1420 Page page;
1421 BTPageOpaque opaque;
1422 OffsetNumber start;
1425 * Scan down to the leftmost or rightmost leaf page. This is a simplified
1426 * version of _bt_search(). We don't maintain a stack since we know we
1427 * won't need it.
1429 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
1431 if (!BufferIsValid(buf))
1433 /* empty index... */
1434 so->currPos.buf = InvalidBuffer;
1435 return false;
1438 page = BufferGetPage(buf);
1439 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1440 Assert(P_ISLEAF(opaque));
1442 if (ScanDirectionIsForward(dir))
1444 /* There could be dead pages to the left, so not this: */
1445 /* Assert(P_LEFTMOST(opaque)); */
1447 start = P_FIRSTDATAKEY(opaque);
1449 else if (ScanDirectionIsBackward(dir))
1451 Assert(P_RIGHTMOST(opaque));
1453 start = PageGetMaxOffsetNumber(page);
1455 else
1457 elog(ERROR, "invalid scan direction: %d", (int) dir);
1458 start = 0; /* keep compiler quiet */
1461 /* remember which buffer we have pinned */
1462 so->currPos.buf = buf;
1464 /* initialize moreLeft/moreRight appropriately for scan direction */
1465 if (ScanDirectionIsForward(dir))
1467 so->currPos.moreLeft = false;
1468 so->currPos.moreRight = true;
1470 else
1472 so->currPos.moreLeft = true;
1473 so->currPos.moreRight = false;
1475 so->numKilled = 0; /* just paranoia */
1476 so->markItemIndex = -1; /* ditto */
1479 * Now load data from the first page of the scan.
1481 if (!_bt_readpage(scan, dir, start))
1484 * There's no actually-matching data on this page. Try to advance to
1485 * the next page. Return false if there's no matching data at all.
1487 if (!_bt_steppage(scan, dir))
1488 return false;
1491 /* Drop the lock, but not pin, on the current page */
1492 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1494 /* OK, itemIndex says what to return */
1495 scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
1497 return true;