/*-------------------------------------------------------------------------
 *
 * nbtutils.c
 *    Utility code for Postgres btree implementation.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <time.h>

#include "access/genam.h"
#include "access/nbtree.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "executor/execdebug.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/lsyscache.h"
static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
                         ScanKey leftarg, ScanKey rightarg,
                         bool *result);
static void _bt_mark_scankey_with_indoption(ScanKey skey, int16 *indoption);
static void _bt_mark_scankey_required(ScanKey skey);
static bool _bt_check_rowcompare(ScanKey skey,
                     IndexTuple tuple, TupleDesc tupdesc,
                     ScanDirection dir, bool *continuescan);
/*
 * _bt_mkscankey
 *      Build an insertion scan key that contains comparison data from itup
 *      as well as comparator routines appropriate to the key datatypes.
 *
 *      The result is intended for use with _bt_compare().
 */
ScanKey
_bt_mkscankey(Relation rel, IndexTuple itup)
{
    ScanKey     skey;
    TupleDesc   itupdesc;
    int         natts;
    int16      *indoption;
    int         i;

    itupdesc = RelationGetDescr(rel);
    natts = RelationGetNumberOfAttributes(rel);
    indoption = rel->rd_indoption;

    skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

    for (i = 0; i < natts; i++)
    {
        FmgrInfo   *procinfo;
        Datum       arg;
        bool        null;
        int         flags;

        /*
         * We can use the cached (default) support procs since no cross-type
         * comparison can be needed.
         */
        procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
        arg = index_getattr(itup, i + 1, itupdesc, &null);
        flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
        ScanKeyEntryInitializeWithInfo(&skey[i],
                                       flags,
                                       (AttrNumber) (i + 1),
                                       InvalidStrategy,
                                       InvalidOid,
                                       procinfo,
                                       arg);
    }

    return skey;
}
/*
 * _bt_mkscankey_nodata
 *      Build an insertion scan key that contains 3-way comparator routines
 *      appropriate to the key datatypes, but no comparison data.  The
 *      comparison data ultimately used must match the key datatypes.
 *
 *      The result cannot be used with _bt_compare(), unless comparison
 *      data is first stored into the key entries.  Currently this
 *      routine is only called by nbtsort.c and tuplesort.c, which have
 *      their own comparison routines.
 */
ScanKey
_bt_mkscankey_nodata(Relation rel)
{
    ScanKey     skey;
    int         natts;
    int16      *indoption;
    int         i;

    natts = RelationGetNumberOfAttributes(rel);
    indoption = rel->rd_indoption;

    skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

    for (i = 0; i < natts; i++)
    {
        FmgrInfo   *procinfo;
        int         flags;

        /*
         * We can use the cached (default) support procs since no cross-type
         * comparison can be needed.
         */
        procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
        flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT);
        ScanKeyEntryInitializeWithInfo(&skey[i],
                                       flags,
                                       (AttrNumber) (i + 1),
                                       InvalidStrategy,
                                       InvalidOid,
                                       procinfo,
                                       (Datum) 0);
    }

    return skey;
}
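/*
 * Hedged sketch of how the "nodata" scankeys can be used by their callers:
 * the prepared 3-way comparator in sk_func is invoked directly on a pair of
 * attribute values that the caller extracts itself, and the DESC indoption
 * bit is honored by negating the result.  The variable names (datum1,
 * datum2, compare) are hypothetical:
 *
 *      int32       compare;
 *
 *      compare = DatumGetInt32(FunctionCall2(&skey[i].sk_func,
 *                                            datum1, datum2));
 *      if (skey[i].sk_flags & SK_BT_DESC)
 *          compare = -compare;
 */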
/*
 * free a scan key made by either _bt_mkscankey or _bt_mkscankey_nodata.
 */
void
_bt_freeskey(ScanKey skey)
{
    pfree(skey);
}

/*
 * free a retracement stack made by _bt_search.
 */
void
_bt_freestack(BTStack stack)
{
    BTStack     ostack;

    while (stack != NULL)
    {
        ostack = stack;
        stack = stack->bts_parent;
        pfree(ostack);
    }
}
/*
 *  _bt_preprocess_keys() -- Preprocess scan keys
 *
 * The caller-supplied search-type keys (in scan->keyData[]) are copied to
 * so->keyData[] with possible transformation.  scan->numberOfKeys is
 * the number of input keys, so->numberOfKeys gets the number of output
 * keys (possibly less, never greater).
 *
 * The output keys are marked with additional sk_flag bits beyond the
 * system-standard bits supplied by the caller.  The DESC and NULLS_FIRST
 * indoption bits for the relevant index attribute are copied into the flags.
 * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
 * so that the index sorts in the desired direction.
 *
 * One key purpose of this routine is to discover how many scan keys
 * must be satisfied to continue the scan.  It also attempts to eliminate
 * redundant keys and detect contradictory keys.  (If the index opfamily
 * provides incomplete sets of cross-type operators, we may fail to detect
 * redundant or contradictory keys, but we can survive that.)
 *
 * The output keys must be sorted by index attribute.  Presently we expect
 * (but verify) that the input keys are already so sorted --- this is done
 * by group_clauses_by_indexkey() in indxpath.c.  Some reordering of the keys
 * within each attribute may be done as a byproduct of the processing here,
 * but no other code depends on that.
 *
 * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
 * if they must be satisfied in order to continue the scan forward or backward
 * respectively.  _bt_checkkeys uses these flags.  For example, if the quals
 * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
 * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
 * But once we reach tuples like (1,4,z) we can stop scanning because no
 * later tuples could match.  This is reflected by marking the x and y keys,
 * but not the z key, with SK_BT_REQFWD.  In general, the keys for leading
 * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
 * For the first attribute without an "=" key, any "<" and "<=" keys are
 * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
 * This can be seen to be correct by considering the above example.  Note
 * in particular that if there are no keys for a given attribute, the keys for
 * subsequent attributes can never be required; for instance "WHERE y = 4"
 * requires a full-index scan.
 *
 * If possible, redundant keys are eliminated: we keep only the tightest
 * >/>= bound and the tightest </<= bound, and if there's an = key then
 * that's the only one returned.  (So, we return either a single = key,
 * or one or two boundary-condition keys for each attr.)  However, if we
 * cannot compare two keys for lack of a suitable cross-type operator,
 * we cannot eliminate either.  If there are two such keys of the same
 * operator strategy, the second one is just pushed into the output array
 * without further processing here.  We may also emit both >/>= or both
 * </<= keys if we can't compare them.  The logic about required keys still
 * works if we don't eliminate redundant keys.
 *
 * As a byproduct of this work, we can detect contradictory quals such
 * as "x = 1 AND x > 2".  If we see that, we return so->qual_ok = FALSE,
 * indicating the scan need not be run at all since no tuples can match.
 * (In this case we do not bother completing the output key array!)
 * Again, missing cross-type operators might cause us to fail to prove the
 * quals contradictory when they really are, but the scan will work correctly.
 *
 * Row comparison keys are currently also treated without any smarts:
 * we just transfer them into the preprocessed array without any
 * editorialization.  We can treat them the same as an ordinary inequality
 * comparison on the row's first index column, for the purposes of the logic
 * about required keys.
 *
 * Note: the reason we have to copy the preprocessed scan keys into private
 * storage is that we are modifying the array based on comparisons of the
 * key argument values, which could change on a rescan.  Therefore we can't
 * overwrite the caller's data structure.
 */
void
_bt_preprocess_keys(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int         numberOfKeys = scan->numberOfKeys;
    int16      *indoption = scan->indexRelation->rd_indoption;
    int         new_numberOfKeys;
    int         numberOfEqualCols;
    ScanKey     inkeys;
    ScanKey     outkeys;
    ScanKey     cur;
    ScanKey     xform[BTMaxStrategyNumber];
    bool        test_result;
    int         i,
                j;
    AttrNumber  attno;

    /* initialize result variables */
    so->qual_ok = true;
    so->numberOfKeys = 0;

    if (numberOfKeys < 1)
        return;                 /* done if qual-less scan */

    inkeys = scan->keyData;
    outkeys = so->keyData;
    cur = &inkeys[0];
    /* we check that input keys are correctly ordered */
    if (cur->sk_attno < 1)
        elog(ERROR, "btree index keys must be ordered by attribute");

    /* We can short-circuit most of the work if there's just one key */
    if (numberOfKeys == 1)
    {
        /*
         * We treat all btree operators as strict (even if they're not so
         * marked in pg_proc). This means that it is impossible for an
         * operator condition with a NULL comparison constant to succeed, and
         * we can reject it right away.
         *
         * However, we now also support "x IS NULL" clauses as search
         * conditions, so in that case keep going. The planner has not filled
         * in any particular strategy in this case, so set it to
         * BTEqualStrategyNumber --- we can treat IS NULL as an equality
         * operator for purposes of search strategy.
         */
        if (cur->sk_flags & SK_ISNULL)
        {
            if (cur->sk_flags & SK_SEARCHNULL)
            {
                cur->sk_strategy = BTEqualStrategyNumber;
                cur->sk_subtype = InvalidOid;
            }
            else
            {
                /* NULL comparison constant: qual can never succeed */
                so->qual_ok = false;
                return;
            }
        }
        _bt_mark_scankey_with_indoption(cur, indoption);
        memcpy(outkeys, cur, sizeof(ScanKeyData));
        so->numberOfKeys = 1;
        /* We can mark the qual as required if it's for first index col */
        if (cur->sk_attno == 1)
            _bt_mark_scankey_required(outkeys);
        return;
    }

    /*
     * Otherwise, do the full set of pushups.
     */
    new_numberOfKeys = 0;
    numberOfEqualCols = 0;

    /*
     * Initialize for processing of keys for attr 1.
     *
     * xform[i] points to the currently best scan key of strategy type i+1; it
     * is NULL if we haven't yet found such a key for this attr.
     */
    attno = 1;
    memset(xform, 0, sizeof(xform));

    /*
     * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
     * handle after-last-key processing.  Actual exit from the loop is at the
     * "break" statement below.
     */
    for (i = 0;; cur++, i++)
    {
        if (i < numberOfKeys)
        {
            /* See comments above about NULLs and IS NULL handling. */
            /* Note: we assume SK_ISNULL is never set in a row header key */
            if (cur->sk_flags & SK_ISNULL)
            {
                if (cur->sk_flags & SK_SEARCHNULL)
                {
                    cur->sk_strategy = BTEqualStrategyNumber;
                    cur->sk_subtype = InvalidOid;
                }
                else
                {
                    /* NULL comparison constant: qual can never succeed */
                    so->qual_ok = false;
                    return;
                }
            }
        }

        /*
         * If we are at the end of the keys for a particular attr, finish up
         * processing and emit the cleaned-up keys.
         */
        if (i == numberOfKeys || cur->sk_attno != attno)
        {
            int         priorNumberOfEqualCols = numberOfEqualCols;

            /* check input keys are correctly ordered */
            if (i < numberOfKeys && cur->sk_attno < attno)
                elog(ERROR, "btree index keys must be ordered by attribute");

            /*
             * If = has been specified, all other keys can be eliminated as
             * redundant.  In case of key > 2 && key == 1 we can set qual_ok
             * to false and abandon further processing.
             */
            if (xform[BTEqualStrategyNumber - 1])
            {
                ScanKey     eq = xform[BTEqualStrategyNumber - 1];

                for (j = BTMaxStrategyNumber; --j >= 0;)
                {
                    ScanKey     chk = xform[j];

                    if (!chk || j == (BTEqualStrategyNumber - 1))
                        continue;

                    /* IS NULL together with any other predicate must fail */
                    if (eq->sk_flags & SK_SEARCHNULL)
                    {
                        so->qual_ok = false;
                        return;
                    }

                    if (_bt_compare_scankey_args(scan, chk, eq, chk,
                                                 &test_result))
                    {
                        if (!test_result)
                        {
                            /* keys proven mutually contradictory */
                            so->qual_ok = false;
                            return;
                        }
                        /* else discard the redundant non-equality key */
                        xform[j] = NULL;
                    }
                    /* else, cannot determine redundancy, keep both keys */
                }
                /* track number of attrs for which we have "=" keys */
                numberOfEqualCols++;
            }

            /* try to keep only one of <, <= */
            if (xform[BTLessStrategyNumber - 1]
                && xform[BTLessEqualStrategyNumber - 1])
            {
                ScanKey     lt = xform[BTLessStrategyNumber - 1];
                ScanKey     le = xform[BTLessEqualStrategyNumber - 1];

                if (_bt_compare_scankey_args(scan, le, lt, le,
                                             &test_result))
                {
                    if (test_result)
                        xform[BTLessEqualStrategyNumber - 1] = NULL;
                    else
                        xform[BTLessStrategyNumber - 1] = NULL;
                }
            }

            /* try to keep only one of >, >= */
            if (xform[BTGreaterStrategyNumber - 1]
                && xform[BTGreaterEqualStrategyNumber - 1])
            {
                ScanKey     gt = xform[BTGreaterStrategyNumber - 1];
                ScanKey     ge = xform[BTGreaterEqualStrategyNumber - 1];

                if (_bt_compare_scankey_args(scan, ge, gt, ge,
                                             &test_result))
                {
                    if (test_result)
                        xform[BTGreaterEqualStrategyNumber - 1] = NULL;
                    else
                        xform[BTGreaterStrategyNumber - 1] = NULL;
                }
            }

            /*
             * Emit the cleaned-up keys into the outkeys[] array, and then
             * mark them if they are required.  They are required (possibly
             * only in one direction) if all attrs before this one had "=".
             */
            for (j = BTMaxStrategyNumber; --j >= 0;)
            {
                if (xform[j])
                {
                    ScanKey     outkey = &outkeys[new_numberOfKeys++];

                    memcpy(outkey, xform[j], sizeof(ScanKeyData));
                    if (priorNumberOfEqualCols == attno - 1)
                        _bt_mark_scankey_required(outkey);
                }
            }

            /*
             * Exit loop here if done.
             */
            if (i == numberOfKeys)
                break;

            /* Re-initialize for new attno */
            attno = cur->sk_attno;
            memset(xform, 0, sizeof(xform));
        }

        /* apply indoption to scankey (might change sk_strategy!) */
        _bt_mark_scankey_with_indoption(cur, indoption);

        /* check strategy this key's operator corresponds to */
        j = cur->sk_strategy - 1;

        /* if row comparison, push it directly to the output array */
        if (cur->sk_flags & SK_ROW_HEADER)
        {
            ScanKey     outkey = &outkeys[new_numberOfKeys++];

            memcpy(outkey, cur, sizeof(ScanKeyData));
            if (numberOfEqualCols == attno - 1)
                _bt_mark_scankey_required(outkey);

            /*
             * We don't support RowCompare using equality; such a qual would
             * mess up the numberOfEqualCols tracking.
             */
            Assert(j != (BTEqualStrategyNumber - 1));
            continue;
        }

        /* have we seen one of these before? */
        if (xform[j] == NULL)
        {
            /* nope, so remember this scankey */
            xform[j] = cur;
        }
        else
        {
            /* yup, keep only the more restrictive key */

            /* if either arg is NULL, don't try to compare */
            if ((cur->sk_flags | xform[j]->sk_flags) & SK_ISNULL)
            {
                /* at least one of them must be an IS NULL clause */
                Assert(j == (BTEqualStrategyNumber - 1));
                Assert((cur->sk_flags | xform[j]->sk_flags) & SK_SEARCHNULL);
                /* if one is and one isn't, the search must fail */
                if ((cur->sk_flags ^ xform[j]->sk_flags) & SK_SEARCHNULL)
                {
                    so->qual_ok = false;
                    return;
                }
                /* we have duplicate IS NULL clauses, ignore the newer one */
                continue;
            }

            if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
                                         &test_result))
            {
                if (test_result)
                    xform[j] = cur;
                else if (j == (BTEqualStrategyNumber - 1))
                {
                    /* key == a && key == b, but a != b */
                    so->qual_ok = false;
                    return;
                }
                /* else old key is more restrictive, keep it */
            }
            else
            {
                /*
                 * We can't determine which key is more restrictive.  Keep the
                 * previous one in xform[j] and push this one directly to the
                 * output array.
                 */
                ScanKey     outkey = &outkeys[new_numberOfKeys++];

                memcpy(outkey, cur, sizeof(ScanKeyData));
                if (numberOfEqualCols == attno - 1)
                    _bt_mark_scankey_required(outkey);
            }
        }
    }

    so->numberOfKeys = new_numberOfKeys;
}
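/*
 * Worked example (informal, not executed code): given the input quals
 *
 *      x > 4 AND x > 9 AND x < 20
 *
 * on a single ASC index column, the two ">" keys are compared via
 * _bt_compare_scankey_args and only the tighter "x > 9" is kept, so the
 * output array ends up with "x > 9" (marked SK_BT_REQBKWD) and "x < 20"
 * (marked SK_BT_REQFWD), both required because x is the first column.
 * Given instead
 *
 *      x = 5 AND x > 10
 *
 * the keys are proven contradictory and so->qual_ok is set to false.
 */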
/*
 * Compare two scankey values using a specified operator.  Both values
 * must be already known non-NULL.
 *
 * The test we want to perform is logically "leftarg op rightarg", where
 * leftarg and rightarg are the sk_argument values in those ScanKeys, and
 * the comparison operator is the one in the op ScanKey.  However, in
 * cross-data-type situations we may need to look up the correct operator in
 * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
 * and amoplefttype/amoprighttype equal to the two argument datatypes.
 *
 * If the opfamily doesn't supply a complete set of cross-type operators we
 * may not be able to make the comparison.  If we can make the comparison
 * we store the operator result in *result and return TRUE.  We return FALSE
 * if the comparison could not be made.
 *
 * Note: op always points at the same ScanKey as either leftarg or rightarg.
 * Since we don't scribble on the scankeys, this aliasing should cause no
 * trouble.
 *
 * Note: this routine needs to be insensitive to any DESC option applied
 * to the index column.  For example, "x < 4" is a tighter constraint than
 * "x < 5" regardless of which way the index is sorted.  We don't worry about
 * NULLS FIRST/LAST either, since the given values are never nulls.
 */
static bool
_bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
                         ScanKey leftarg, ScanKey rightarg,
                         bool *result)
{
    Relation    rel = scan->indexRelation;
    Oid         lefttype,
                righttype,
                optype,
                opcintype,
                cmp_op;
    StrategyNumber strat;

    /*
     * The opfamily we need to worry about is identified by the index column.
     */
    Assert(leftarg->sk_attno == rightarg->sk_attno);

    opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];

    /*
     * Determine the actual datatypes of the ScanKey arguments.  We have to
     * support the convention that sk_subtype == InvalidOid means the opclass
     * input type; this is a hack to simplify life for ScanKeyInit().
     */
    lefttype = leftarg->sk_subtype;
    if (lefttype == InvalidOid)
        lefttype = opcintype;
    righttype = rightarg->sk_subtype;
    if (righttype == InvalidOid)
        righttype = opcintype;
    optype = op->sk_subtype;
    if (optype == InvalidOid)
        optype = opcintype;

    /*
     * If leftarg and rightarg match the types expected for the "op" scankey,
     * we can use its already-looked-up comparison function.
     */
    if (lefttype == opcintype && righttype == optype)
    {
        *result = DatumGetBool(FunctionCall2(&op->sk_func,
                                             leftarg->sk_argument,
                                             rightarg->sk_argument));
        return true;
    }

    /*
     * Otherwise, we need to go to the syscache to find the appropriate
     * operator.  (This cannot result in infinite recursion, since no
     * indexscan initiated by syscache lookup will use cross-data-type
     * operators.)
     *
     * If the sk_strategy was flipped by _bt_mark_scankey_with_indoption, we
     * have to un-flip it to get the correct opfamily member.
     */
    strat = op->sk_strategy;
    if (op->sk_flags & SK_BT_DESC)
        strat = BTCommuteStrategyNumber(strat);

    cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
                                 lefttype,
                                 righttype,
                                 strat);
    if (OidIsValid(cmp_op))
    {
        RegProcedure cmp_proc = get_opcode(cmp_op);

        if (RegProcedureIsValid(cmp_proc))
        {
            *result = DatumGetBool(OidFunctionCall2(cmp_proc,
                                                    leftarg->sk_argument,
                                                    rightarg->sk_argument));
            return true;
        }
    }

    /* Can't make the comparison */
    *result = false;            /* suppress compiler warnings */
    return false;
}
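/*
 * Hedged sketch of the cross-type lookup path above, using hypothetical
 * datatypes int4 (for the scankey that owns the operator) and int8 (for the
 * other argument) and the "<" strategy:
 *
 *      Oid     cmp_op;
 *
 *      cmp_op = get_opfamily_member(rel->rd_opfamily[attno - 1],
 *                                   INT4OID, INT8OID,
 *                                   BTLessStrategyNumber);
 *      ... if OidIsValid(cmp_op), call get_opcode() and OidFunctionCall2()
 *      as in the function body ...
 *
 * If the opfamily lacks such a cross-type member, this routine returns
 * false and the caller keeps both keys rather than eliminating one.
 */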
/*
 * Mark a scankey with info from the index's indoption array.
 *
 * We copy the appropriate indoption value into the scankey sk_flags
 * (shifting to avoid clobbering system-defined flag bits).  Also, if
 * the DESC option is set, commute (flip) the operator strategy number.
 *
 * This function is applied to the *input* scankey structure; therefore
 * on a rescan we will be looking at already-processed scankeys.  Hence
 * we have to be careful not to re-commute the strategy if we already did it.
 * It's a bit ugly to modify the caller's copy of the scankey but in practice
 * there shouldn't be any problem, since the index's indoptions are certainly
 * not going to change while the scankey survives.
 */
static void
_bt_mark_scankey_with_indoption(ScanKey skey, int16 *indoption)
{
    int         addflags;

    addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
    if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
        skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
    skey->sk_flags |= addflags;

    if (skey->sk_flags & SK_ROW_HEADER)
    {
        ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);

        for (;;)
        {
            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
            if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
                subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
            subkey->sk_flags |= addflags;
            if (subkey->sk_flags & SK_ROW_END)
                break;
            subkey++;
        }
    }
}
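/*
 * Informal example of the commutation above, assuming a one-column index
 * declared DESC and a caller-supplied "x < 5" key; ltproc and indoption are
 * assumed to come from the index, not from this file:
 *
 *      ScanKeyData skey;
 *
 *      ScanKeyInit(&skey, 1, BTLessStrategyNumber, ltproc, Int32GetDatum(5));
 *      _bt_mark_scankey_with_indoption(&skey, indoption);
 *      Assert(skey.sk_strategy == BTGreaterStrategyNumber);
 *      Assert(skey.sk_flags & SK_BT_DESC);
 *
 * The flipped strategy lets the rest of the scan code treat a DESC column
 * exactly like an ASC one, while the recorded SK_BT_DESC bit lets
 * _bt_compare_scankey_args recover the original opfamily strategy.
 */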
/*
 * Mark a scankey as "required to continue the scan".
 *
 * Depending on the operator type, the key may be required for both scan
 * directions or just one.  Also, if the key is a row comparison header,
 * we have to mark the appropriate subsidiary ScanKeys as required.  In
 * such cases, the first subsidiary key is required, but subsequent ones
 * are required only as long as they correspond to successive index columns
 * and match the leading column as to sort direction.
 * Otherwise the row comparison ordering is different from the index ordering
 * and so we can't stop the scan on the basis of those lower-order columns.
 *
 * Note: when we set required-key flag bits in a subsidiary scankey, we are
 * scribbling on a data structure belonging to the index AM's caller, not on
 * our private copy.  This should be OK because the marking will not change
 * from scan to scan within a query, and so we'd just re-mark the same way
 * anyway on a rescan.  Something to keep an eye on though.
 */
static void
_bt_mark_scankey_required(ScanKey skey)
{
    int         addflags;

    switch (skey->sk_strategy)
    {
        case BTLessStrategyNumber:
        case BTLessEqualStrategyNumber:
            addflags = SK_BT_REQFWD;
            break;
        case BTEqualStrategyNumber:
            addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
            break;
        case BTGreaterEqualStrategyNumber:
        case BTGreaterStrategyNumber:
            addflags = SK_BT_REQBKWD;
            break;
        default:
            elog(ERROR, "unrecognized StrategyNumber: %d",
                 (int) skey->sk_strategy);
            addflags = 0;       /* keep compiler quiet */
            break;
    }

    skey->sk_flags |= addflags;

    if (skey->sk_flags & SK_ROW_HEADER)
    {
        ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
        AttrNumber  attno = skey->sk_attno;

        /* First subkey should be same as the header says */
        Assert(subkey->sk_attno == attno);

        for (;;)
        {
            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            if (subkey->sk_attno != attno)
                break;          /* non-adjacent key, so not required */
            if (subkey->sk_strategy != skey->sk_strategy)
                break;          /* wrong direction, so not required */
            subkey->sk_flags |= addflags;
            if (subkey->sk_flags & SK_ROW_END)
                break;
            subkey++;
            attno++;
        }
    }
}
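/*
 * Informal example of the row-comparison marking above: for a qual such as
 * (a, b, c) > (1, 2, 3) on an index over (a, b, c) with a uniform sort
 * direction, the header key and all three subsidiary keys receive
 * SK_BT_REQBKWD.  If b were instead indexed DESC, its subsidiary key's
 * strategy (after indoption processing) would no longer match the header's,
 * so only the "a" subkey would be marked; the row ordering diverges from
 * the index ordering starting at b.
 */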
/*
 * Test whether an indextuple satisfies all the scankey conditions.
 *
 * If so, copy its TID into scan->xs_ctup.t_self, and return TRUE.
 * If not, return FALSE (xs_ctup is not changed).
 *
 * If the tuple fails to pass the qual, we also determine whether there's
 * any need to continue the scan beyond this tuple, and set *continuescan
 * accordingly.  See comments for _bt_preprocess_keys(), above, about how
 * this is done.
 *
 * scan: index scan descriptor (containing a search-type scankey)
 * page: buffer page containing index tuple
 * offnum: offset number of index tuple (must be a valid item!)
 * dir: direction we are scanning in
 * continuescan: output parameter (will be set correctly in all cases)
 */
bool
_bt_checkkeys(IndexScanDesc scan,
              Page page, OffsetNumber offnum,
              ScanDirection dir, bool *continuescan)
{
    ItemId      iid = PageGetItemId(page, offnum);
    bool        tuple_valid;
    IndexTuple  tuple;
    TupleDesc   tupdesc;
    BTScanOpaque so;
    int         keysz;
    int         ikey;
    ScanKey     key;

    *continuescan = true;       /* default assumption */

    /*
     * If the scan specifies not to return killed tuples, then we treat a
     * killed tuple as not passing the qual.  Most of the time, it's a win to
     * not bother examining the tuple's index keys, but just return
     * immediately with continuescan = true to proceed to the next tuple.
     * However, if this is the last tuple on the page, we should check the
     * index keys to prevent uselessly advancing to the next page.
     */
    if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    {
        /* return immediately if there are more tuples on the page */
        if (ScanDirectionIsForward(dir))
        {
            if (offnum < PageGetMaxOffsetNumber(page))
                return false;
        }
        else
        {
            BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

            if (offnum > P_FIRSTDATAKEY(opaque))
                return false;
        }

        /*
         * OK, we want to check the keys, but we'll return FALSE even if the
         * tuple passes the key tests.
         */
        tuple_valid = false;
    }
    else
        tuple_valid = true;

    tuple = (IndexTuple) PageGetItem(page, iid);

    IncrIndexProcessed();

    tupdesc = RelationGetDescr(scan->indexRelation);
    so = (BTScanOpaque) scan->opaque;
    keysz = so->numberOfKeys;

    for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
    {
        Datum       datum;
        bool        isNull;
        Datum       test;

        /* row-comparison keys need special processing */
        if (key->sk_flags & SK_ROW_HEADER)
        {
            if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
                continue;
            return false;
        }

        datum = index_getattr(tuple,
                              key->sk_attno,
                              tupdesc,
                              &isNull);

        if (key->sk_flags & SK_ISNULL)
        {
            /* Handle IS NULL tests */
            Assert(key->sk_flags & SK_SEARCHNULL);

            if (isNull)
                continue;       /* tuple satisfies this qual */

            /*
             * Tuple fails this qual.  If it's a required qual for the current
             * scan direction, then we can conclude no further tuples will
             * pass, either.
             */
            if ((key->sk_flags & SK_BT_REQFWD) &&
                ScanDirectionIsForward(dir))
                *continuescan = false;
            else if ((key->sk_flags & SK_BT_REQBKWD) &&
                     ScanDirectionIsBackward(dir))
                *continuescan = false;

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return false;
        }

        if (isNull)
        {
            if (key->sk_flags & SK_BT_NULLS_FIRST)
            {
                /*
                 * Since NULLs are sorted before non-NULLs, we know we have
                 * reached the lower limit of the range of values for this
                 * index attr.  On a backward scan, we can stop if this qual
                 * is one of the "must match" subset.  On a forward scan,
                 * however, we should keep going.
                 */
                if ((key->sk_flags & SK_BT_REQBKWD) &&
                    ScanDirectionIsBackward(dir))
                    *continuescan = false;
            }
            else
            {
                /*
                 * Since NULLs are sorted after non-NULLs, we know we have
                 * reached the upper limit of the range of values for this
                 * index attr.  On a forward scan, we can stop if this qual is
                 * one of the "must match" subset.  On a backward scan,
                 * however, we should keep going.
                 */
                if ((key->sk_flags & SK_BT_REQFWD) &&
                    ScanDirectionIsForward(dir))
                    *continuescan = false;
            }

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return false;
        }

        test = FunctionCall2(&key->sk_func, datum, key->sk_argument);

        if (!DatumGetBool(test))
        {
            /*
             * Tuple fails this qual.  If it's a required qual for the current
             * scan direction, then we can conclude no further tuples will
             * pass, either.
             *
             * Note: because we stop the scan as soon as any required equality
             * qual fails, it is critical that equality quals be used for the
             * initial positioning in _bt_first() when they are available. See
             * comments in _bt_first().
             */
            if ((key->sk_flags & SK_BT_REQFWD) &&
                ScanDirectionIsForward(dir))
                *continuescan = false;
            else if ((key->sk_flags & SK_BT_REQBKWD) &&
                     ScanDirectionIsBackward(dir))
                *continuescan = false;

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return false;
        }
    }

    /* If we get here, the tuple passes all index quals. */
    if (tuple_valid)
        scan->xs_ctup.t_self = tuple->t_tid;

    return tuple_valid;
}
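/*
 * Hedged sketch of a caller's loop (in the spirit of the page-reading code
 * in nbtsearch.c; details assumed rather than copied): every item on the
 * page is run through _bt_checkkeys, and the page scan stops early once a
 * required key can no longer be satisfied in the scan direction:
 *
 *      for (; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
 *      {
 *          if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
 *              ... remember this matching tuple ...
 *          if (!continuescan)
 *              break;      ... no later tuple in this direction can match ...
 *      }
 */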
/*
 * Test whether an indextuple satisfies a row-comparison scan condition.
 *
 * Return true if so, false if not.  If not, also clear *continuescan if
 * it's not possible for any future tuples in the current scan direction
 * to pass the qual.
 *
 * This is a subroutine for _bt_checkkeys, which see for more info.
 */
static bool
_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
                     ScanDirection dir, bool *continuescan)
{
    ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
    int32       cmpresult = 0;
    bool        result;

    /* First subkey should be same as the header says */
    Assert(subkey->sk_attno == skey->sk_attno);

    /* Loop over columns of the row condition */
    for (;;)
    {
        Datum       datum;
        bool        isNull;

        Assert(subkey->sk_flags & SK_ROW_MEMBER);

        datum = index_getattr(tuple,
                              subkey->sk_attno,
                              tupdesc,
                              &isNull);

        if (isNull)
        {
            if (subkey->sk_flags & SK_BT_NULLS_FIRST)
            {
                /*
                 * Since NULLs are sorted before non-NULLs, we know we have
                 * reached the lower limit of the range of values for this
                 * index attr.  On a backward scan, we can stop if this qual is
                 * one of the "must match" subset.  On a forward scan,
                 * however, we should keep going.
                 */
                if ((subkey->sk_flags & SK_BT_REQBKWD) &&
                    ScanDirectionIsBackward(dir))
                    *continuescan = false;
            }
            else
            {
                /*
                 * Since NULLs are sorted after non-NULLs, we know we have
                 * reached the upper limit of the range of values for this
                 * index attr.  On a forward scan, we can stop if this qual is
                 * one of the "must match" subset.  On a backward scan,
                 * however, we should keep going.
                 */
                if ((subkey->sk_flags & SK_BT_REQFWD) &&
                    ScanDirectionIsForward(dir))
                    *continuescan = false;
            }

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return false;
        }

        if (subkey->sk_flags & SK_ISNULL)
        {
            /*
             * Unlike the simple-scankey case, this isn't a disallowed case.
             * But it can never match.  If all the earlier row comparison
             * columns are required for the scan direction, we can stop the
             * scan, because there can't be another tuple that will succeed.
             */
            if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
                subkey--;
            if ((subkey->sk_flags & SK_BT_REQFWD) &&
                ScanDirectionIsForward(dir))
                *continuescan = false;
            else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
                     ScanDirectionIsBackward(dir))
                *continuescan = false;
            return false;
        }

        /* Perform the test --- three-way comparison not bool operator */
        cmpresult = DatumGetInt32(FunctionCall2(&subkey->sk_func,
                                                datum,
                                                subkey->sk_argument));

        if (subkey->sk_flags & SK_BT_DESC)
            cmpresult = -cmpresult;

        /* Done comparing if unequal, else advance to next column */
        if (cmpresult != 0)
            break;

        if (subkey->sk_flags & SK_ROW_END)
            break;
        subkey++;
    }

    /*
     * At this point cmpresult indicates the overall result of the row
     * comparison, and subkey points to the deciding column (or the last
     * column if the result is "=").
     */
    switch (subkey->sk_strategy)
    {
            /* EQ and NE cases aren't allowed here */
        case BTLessStrategyNumber:
            result = (cmpresult < 0);
            break;
        case BTLessEqualStrategyNumber:
            result = (cmpresult <= 0);
            break;
        case BTGreaterEqualStrategyNumber:
            result = (cmpresult >= 0);
            break;
        case BTGreaterStrategyNumber:
            result = (cmpresult > 0);
            break;
        default:
            elog(ERROR, "unrecognized RowCompareType: %d",
                 (int) subkey->sk_strategy);
            result = 0;         /* keep compiler quiet */
            break;
    }

    if (!result)
    {
        /*
         * Tuple fails this qual.  If it's a required qual for the current
         * scan direction, then we can conclude no further tuples will pass,
         * either.  Note we have to look at the deciding column, not
         * necessarily the first or last column of the row condition.
         */
        if ((subkey->sk_flags & SK_BT_REQFWD) &&
            ScanDirectionIsForward(dir))
            *continuescan = false;
        else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
                 ScanDirectionIsBackward(dir))
            *continuescan = false;
    }

    return result;
}
/*
 * _bt_killitems - set LP_DEAD state for items an indexscan caller has
 * told us were killed
 *
 * scan->so contains information about the current page and killed tuples
 * thereon (generally, this should only be called if so->numKilled > 0).
 *
 * The caller must have pin on so->currPos.buf, but may or may not have
 * read-lock, as indicated by haveLock.  Note that we assume read-lock
 * is sufficient for setting LP_DEAD status (which is only a hint).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.  We cope with cases where items have moved right due to insertions.
 * If an item has moved off the current page due to a split, we'll fail to
 * find it and do nothing (this is not an error case --- we assume the item
 * will eventually get marked in a future indexscan).  Note that because we
 * hold pin on the target page continuously from initially reading the items
 * until applying this function, VACUUM cannot have deleted any items from
 * the page, and so there is no need to search left from the recorded offset.
 * (This observation also guarantees that the item is still the right one
 * to delete, which might otherwise be questionable since heap TIDs can get
 * recycled.)
 */
void
_bt_killitems(IndexScanDesc scan, bool haveLock)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber minoff;
    OffsetNumber maxoff;
    int         i;
    bool        killedsomething = false;

    Assert(BufferIsValid(so->currPos.buf));

    if (!haveLock)
        LockBuffer(so->currPos.buf, BT_READ);

    page = BufferGetPage(so->currPos.buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);

    for (i = 0; i < so->numKilled; i++)
    {
        int         itemIndex = so->killedItems[i];
        BTScanPosItem *kitem = &so->currPos.items[itemIndex];
        OffsetNumber offnum = kitem->indexOffset;

        Assert(itemIndex >= so->currPos.firstItem &&
               itemIndex <= so->currPos.lastItem);
        if (offnum < minoff)
            continue;           /* pure paranoia */
        while (offnum <= maxoff)
        {
            ItemId      iid = PageGetItemId(page, offnum);
            IndexTuple  ituple = (IndexTuple) PageGetItem(page, iid);

            if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
            {
                /* found the item */
                ItemIdMarkDead(iid);
                killedsomething = true;
                break;          /* out of inner search loop */
            }
            offnum = OffsetNumberNext(offnum);
        }
    }

    /*
     * Since this can be redone later if needed, it's treated the same as a
     * commit-hint-bit status update for heap tuples: we mark the buffer dirty
     * but don't make a WAL log entry.
     *
     * Whenever we mark anything LP_DEAD, we also set the page's
     * BTP_HAS_GARBAGE flag, which is likewise just a hint.
     */
    if (killedsomething)
    {
        opaque->btpo_flags |= BTP_HAS_GARBAGE;
        SetBufferCommitInfoNeedsSave(so->currPos.buf);
    }

    if (!haveLock)
        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /*
     * Always reset the scan state, so we don't look for same items on other
     * pages.
     */
    so->numKilled = 0;
}
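/*
 * Hedged sketch of the expected calling pattern (roughly what btgettuple
 * and the scan-position code do; details assumed): as the executor reports
 * dead tuples, the caller records their item indexes, then applies them all
 * just before leaving the page:
 *
 *      if (so->killedItems == NULL)
 *          so->killedItems = (int *)
 *              palloc(MaxIndexTuplesPerPage * sizeof(int));
 *      if (so->numKilled < MaxIndexTuplesPerPage)
 *          so->killedItems[so->numKilled++] = so->currPos.itemIndex;
 *      ...
 *      if (so->numKilled > 0)
 *          _bt_killitems(scan, false);
 */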
/*
 * The following routines manage a shared-memory area in which we track
 * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
 * operations.  There is a single counter which increments each time we
 * start a vacuum to assign it a cycle ID.  Since multiple vacuums could
 * be active concurrently, we have to track the cycle ID for each active
 * vacuum; this requires at most MaxBackends entries (usually far fewer).
 * We assume at most one vacuum can be active for a given index.
 *
 * Access to the shared memory area is controlled by BtreeVacuumLock.
 * In principle we could use a separate lmgr locktag for each index,
 * but a single LWLock is much cheaper, and given the short time that
 * the lock is ever held, the concurrency hit should be minimal.
 */
typedef struct BTOneVacInfo
{
    LockRelId   relid;          /* global identifier of an index */
    BTCycleId   cycleid;        /* cycle ID for its active VACUUM */
} BTOneVacInfo;

typedef struct BTVacInfo
{
    BTCycleId   cycle_ctr;      /* cycle ID most recently assigned */
    int         num_vacuums;    /* number of currently active VACUUMs */
    int         max_vacuums;    /* allocated length of vacuums[] array */
    BTOneVacInfo vacuums[1];    /* VARIABLE LENGTH ARRAY */
} BTVacInfo;

static BTVacInfo *btvacinfo;
/*
 * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
 *      or zero if there is no active VACUUM
 *
 * Note: for correct interlocking, the caller must already hold pin and
 * exclusive lock on each buffer it will store the cycle ID into.  This
 * ensures that even if a VACUUM starts immediately afterwards, it cannot
 * process those pages until the page split is complete.
 */
BTCycleId
_bt_vacuum_cycleid(Relation rel)
{
    BTCycleId   result = 0;
    int         i;

    /* Share lock is enough since this is a read-only operation */
    LWLockAcquire(BtreeVacuumLock, LW_SHARED);

    for (i = 0; i < btvacinfo->num_vacuums; i++)
    {
        BTOneVacInfo *vac = &btvacinfo->vacuums[i];

        if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            result = vac->cycleid;
            break;
        }
    }

    LWLockRelease(BtreeVacuumLock);
    return result;
}
/*
 * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
 *
 * Note: the caller must guarantee that it will eventually call
 * _bt_end_vacuum, else we'll permanently leak an array slot.  To ensure
 * that this happens even in elog(FATAL) scenarios, the appropriate coding
 * is not just a PG_TRY, but
 *      PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
 */
BTCycleId
_bt_start_vacuum(Relation rel)
{
    BTCycleId   result;
    int         i;
    BTOneVacInfo *vac;

    LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);

    /*
     * Assign the next cycle ID, being careful to avoid zero as well as the
     * reserved high values.
     */
    result = ++(btvacinfo->cycle_ctr);
    if (result == 0 || result > MAX_BT_CYCLE_ID)
        result = btvacinfo->cycle_ctr = 1;

    /* Let's just make sure there's no entry already for this index */
    for (i = 0; i < btvacinfo->num_vacuums; i++)
    {
        vac = &btvacinfo->vacuums[i];
        if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            /*
             * Unlike most places in the backend, we have to explicitly
             * release our LWLock before throwing an error.  This is because
             * we expect _bt_end_vacuum() to be called before transaction
             * abort cleanup can run to release LWLocks.
             */
            LWLockRelease(BtreeVacuumLock);
            elog(ERROR, "multiple active vacuums for index \"%s\"",
                 RelationGetRelationName(rel));
        }
    }

    /* OK, add an entry */
    if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
    {
        LWLockRelease(BtreeVacuumLock);
        elog(ERROR, "out of btvacinfo slots");
    }
    vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
    vac->relid = rel->rd_lockInfo.lockRelId;
    vac->cycleid = result;
    btvacinfo->num_vacuums++;

    LWLockRelease(BtreeVacuumLock);
    return result;
}
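/*
 * Hedged usage sketch (in the style of btbulkdelete; the exact surrounding
 * code is assumed): the ENSURE block guarantees the shared-memory slot is
 * released even if the vacuum errors out:
 *
 *      PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 *      {
 *          cycleid = _bt_start_vacuum(rel);
 *          ... run the vacuum scan, stamping cycleid into pages being split ...
 *      }
 *      PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 *      _bt_end_vacuum(rel);
 */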
/*
 * _bt_end_vacuum --- mark a btree VACUUM operation as done
 *
 * Note: this is deliberately coded not to complain if no entry is found;
 * this allows the caller to put PG_TRY around the start_vacuum operation.
 */
void
_bt_end_vacuum(Relation rel)
{
    int         i;

    LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);

    /* Find the array entry */
    for (i = 0; i < btvacinfo->num_vacuums; i++)
    {
        BTOneVacInfo *vac = &btvacinfo->vacuums[i];

        if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            /* Remove it by shifting down the last entry */
            *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
            btvacinfo->num_vacuums--;
            break;
        }
    }

    LWLockRelease(BtreeVacuumLock);
}
/*
 * _bt_end_vacuum wrapped as an on_shmem_exit callback function
 */
void
_bt_end_vacuum_callback(int code, Datum arg)
{
    _bt_end_vacuum((Relation) DatumGetPointer(arg));
}
/*
 * BTreeShmemSize --- report amount of shared memory space needed
 */
Size
BTreeShmemSize(void)
{
    Size        size;

    size = offsetof(BTVacInfo, vacuums[0]);
    size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
    return size;
}
/*
 * BTreeShmemInit --- initialize this module's shared memory
 */
void
BTreeShmemInit(void)
{
    bool        found;

    btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
                                              BTreeShmemSize(),
                                              &found);

    if (!IsUnderPostmaster)
    {
        /* Initialize shared memory area */
        Assert(!found);

        /*
         * It doesn't really matter what the cycle counter starts at, but
         * having it always start the same doesn't seem good.  Seed with
         * low-order bits of time() instead.
         */
        btvacinfo->cycle_ctr = (BTCycleId) time(NULL);

        btvacinfo->num_vacuums = 0;
        btvacinfo->max_vacuums = MaxBackends;
    }
    else
        Assert(found);
}
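/*
 * Hedged sketch of how these hooks are wired up at postmaster startup (in
 * the style of ipci.c; the exact call site is assumed): the size estimate
 * feeds the shared-memory sizing pass, and the init function runs once the
 * segment exists:
 *
 *      size = add_size(size, BTreeShmemSize());
 *      ...
 *      BTreeShmemInit();
 */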
Datum
btoptions(PG_FUNCTION_ARGS)
{
    Datum       reloptions = PG_GETARG_DATUM(0);
    bool        validate = PG_GETARG_BOOL(1);
    bytea      *result;

    result = default_reloptions(reloptions, validate, RELOPT_KIND_BTREE);

    if (result)
        PG_RETURN_BYTEA_P(result);
    PG_RETURN_NULL();
}