Fix xslt_process() to ensure that it inserts a NULL terminator after the
[PostgreSQL.git] / src / backend / access / nbtree / nbtutils.c
blob85101ebb027d2145b5cbb95ff6275fbcb59b3632
1 /*-------------------------------------------------------------------------
3 * nbtutils.c
4 * Utility code for Postgres btree implementation.
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * $PostgreSQL$
13 *-------------------------------------------------------------------------
16 #include "postgres.h"
18 #include <time.h>
20 #include "access/genam.h"
21 #include "access/nbtree.h"
22 #include "access/reloptions.h"
23 #include "access/relscan.h"
24 #include "executor/execdebug.h"
25 #include "miscadmin.h"
26 #include "storage/bufmgr.h"
27 #include "storage/lwlock.h"
28 #include "storage/shmem.h"
29 #include "utils/lsyscache.h"
32 static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
33 ScanKey leftarg, ScanKey rightarg,
34 bool *result);
35 static void _bt_mark_scankey_with_indoption(ScanKey skey, int16 *indoption);
36 static void _bt_mark_scankey_required(ScanKey skey);
37 static bool _bt_check_rowcompare(ScanKey skey,
38 IndexTuple tuple, TupleDesc tupdesc,
39 ScanDirection dir, bool *continuescan);
43 * _bt_mkscankey
44 * Build an insertion scan key that contains comparison data from itup
45 * as well as comparator routines appropriate to the key datatypes.
47 * The result is intended for use with _bt_compare().
49 ScanKey
50 _bt_mkscankey(Relation rel, IndexTuple itup)
52 ScanKey skey;
53 TupleDesc itupdesc;
54 int natts;
55 int16 *indoption;
56 int i;
58 itupdesc = RelationGetDescr(rel);
59 natts = RelationGetNumberOfAttributes(rel);
60 indoption = rel->rd_indoption;
62 skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
64 for (i = 0; i < natts; i++)
66 FmgrInfo *procinfo;
67 Datum arg;
68 bool null;
69 int flags;
72 * We can use the cached (default) support procs since no cross-type
73 * comparison can be needed.
75 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
76 arg = index_getattr(itup, i + 1, itupdesc, &null);
77 flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
78 ScanKeyEntryInitializeWithInfo(&skey[i],
79 flags,
80 (AttrNumber) (i + 1),
81 InvalidStrategy,
82 InvalidOid,
83 procinfo,
84 arg);
87 return skey;
91 * _bt_mkscankey_nodata
92 * Build an insertion scan key that contains 3-way comparator routines
93 * appropriate to the key datatypes, but no comparison data. The
94 * comparison data ultimately used must match the key datatypes.
96 * The result cannot be used with _bt_compare(), unless comparison
97 * data is first stored into the key entries. Currently this
98 * routine is only called by nbtsort.c and tuplesort.c, which have
99 * their own comparison routines.
101 ScanKey
102 _bt_mkscankey_nodata(Relation rel)
104 ScanKey skey;
105 int natts;
106 int16 *indoption;
107 int i;
109 natts = RelationGetNumberOfAttributes(rel);
110 indoption = rel->rd_indoption;
112 skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
114 for (i = 0; i < natts; i++)
116 FmgrInfo *procinfo;
117 int flags;
120 * We can use the cached (default) support procs since no cross-type
121 * comparison can be needed.
123 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
124 flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT);
125 ScanKeyEntryInitializeWithInfo(&skey[i],
126 flags,
127 (AttrNumber) (i + 1),
128 InvalidStrategy,
129 InvalidOid,
130 procinfo,
131 (Datum) 0);
134 return skey;
138 * free a scan key made by either _bt_mkscankey or _bt_mkscankey_nodata.
140 void
141 _bt_freeskey(ScanKey skey)
143 pfree(skey);
147 * free a retracement stack made by _bt_search.
149 void
150 _bt_freestack(BTStack stack)
152 BTStack ostack;
154 while (stack != NULL)
156 ostack = stack;
157 stack = stack->bts_parent;
158 pfree(ostack);
164 * _bt_preprocess_keys() -- Preprocess scan keys
166 * The caller-supplied search-type keys (in scan->keyData[]) are copied to
167 * so->keyData[] with possible transformation. scan->numberOfKeys is
168 * the number of input keys, so->numberOfKeys gets the number of output
169 * keys (possibly less, never greater).
171 * The output keys are marked with additional sk_flag bits beyond the
172 * system-standard bits supplied by the caller. The DESC and NULLS_FIRST
173 * indoption bits for the relevant index attribute are copied into the flags.
174 * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
175 * so that the index sorts in the desired direction.
177 * One key purpose of this routine is to discover how many scan keys
178 * must be satisfied to continue the scan. It also attempts to eliminate
179 * redundant keys and detect contradictory keys. (If the index opfamily
180 * provides incomplete sets of cross-type operators, we may fail to detect
181 * redundant or contradictory keys, but we can survive that.)
183 * The output keys must be sorted by index attribute. Presently we expect
184 * (but verify) that the input keys are already so sorted --- this is done
185 * by group_clauses_by_indexkey() in indxpath.c. Some reordering of the keys
186 * within each attribute may be done as a byproduct of the processing here,
187 * but no other code depends on that.
189 * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
190 * if they must be satisfied in order to continue the scan forward or backward
191 * respectively. _bt_checkkeys uses these flags. For example, if the quals
192 * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
193 * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
194 * But once we reach tuples like (1,4,z) we can stop scanning because no
195 * later tuples could match. This is reflected by marking the x and y keys,
196 * but not the z key, with SK_BT_REQFWD. In general, the keys for leading
197 * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
198 * For the first attribute without an "=" key, any "<" and "<=" keys are
199 * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
200 * This can be seen to be correct by considering the above example. Note
201 * in particular that if there are no keys for a given attribute, the keys for
202 * subsequent attributes can never be required; for instance "WHERE y = 4"
203 * requires a full-index scan.
205 * If possible, redundant keys are eliminated: we keep only the tightest
206 * >/>= bound and the tightest </<= bound, and if there's an = key then
207 * that's the only one returned. (So, we return either a single = key,
208 * or one or two boundary-condition keys for each attr.) However, if we
209 * cannot compare two keys for lack of a suitable cross-type operator,
210 * we cannot eliminate either. If there are two such keys of the same
211 * operator strategy, the second one is just pushed into the output array
212 * without further processing here. We may also emit both >/>= or both
213 * </<= keys if we can't compare them. The logic about required keys still
214 * works if we don't eliminate redundant keys.
216 * As a byproduct of this work, we can detect contradictory quals such
217 * as "x = 1 AND x > 2". If we see that, we return so->qual_ok = FALSE,
218 * indicating the scan need not be run at all since no tuples can match.
219 * (In this case we do not bother completing the output key array!)
220 * Again, missing cross-type operators might cause us to fail to prove the
221 * quals contradictory when they really are, but the scan will work correctly.
223 * Row comparison keys are currently also treated without any smarts:
224 * we just transfer them into the preprocessed array without any
225 * editorialization. We can treat them the same as an ordinary inequality
226 * comparison on the row's first index column, for the purposes of the logic
227 * about required keys.
229 * Note: the reason we have to copy the preprocessed scan keys into private
230 * storage is that we are modifying the array based on comparisons of the
231 * key argument values, which could change on a rescan. Therefore we can't
232 * overwrite the caller's data structure.
234 void
235 _bt_preprocess_keys(IndexScanDesc scan)
237 BTScanOpaque so = (BTScanOpaque) scan->opaque;
238 int numberOfKeys = scan->numberOfKeys;
239 int16 *indoption = scan->indexRelation->rd_indoption;
240 int new_numberOfKeys;
241 int numberOfEqualCols;
242 ScanKey inkeys;
243 ScanKey outkeys;
244 ScanKey cur;
245 ScanKey xform[BTMaxStrategyNumber];
246 bool test_result;
247 int i,
249 AttrNumber attno;
251 /* initialize result variables */
252 so->qual_ok = true;
253 so->numberOfKeys = 0;
255 if (numberOfKeys < 1)
256 return; /* done if qual-less scan */
258 inkeys = scan->keyData;
259 outkeys = so->keyData;
260 cur = &inkeys[0];
261 /* we check that input keys are correctly ordered */
262 if (cur->sk_attno < 1)
263 elog(ERROR, "btree index keys must be ordered by attribute");
265 /* We can short-circuit most of the work if there's just one key */
266 if (numberOfKeys == 1)
269 * We treat all btree operators as strict (even if they're not so
270 * marked in pg_proc). This means that it is impossible for an
271 * operator condition with a NULL comparison constant to succeed, and
272 * we can reject it right away.
274 * However, we now also support "x IS NULL" clauses as search
275 * conditions, so in that case keep going. The planner has not filled
276 * in any particular strategy in this case, so set it to
277 * BTEqualStrategyNumber --- we can treat IS NULL as an equality
278 * operator for purposes of search strategy.
280 if (cur->sk_flags & SK_ISNULL)
282 if (cur->sk_flags & SK_SEARCHNULL)
284 cur->sk_strategy = BTEqualStrategyNumber;
285 cur->sk_subtype = InvalidOid;
287 else
288 so->qual_ok = false;
290 _bt_mark_scankey_with_indoption(cur, indoption);
291 memcpy(outkeys, cur, sizeof(ScanKeyData));
292 so->numberOfKeys = 1;
293 /* We can mark the qual as required if it's for first index col */
294 if (cur->sk_attno == 1)
295 _bt_mark_scankey_required(outkeys);
296 return;
300 * Otherwise, do the full set of pushups.
302 new_numberOfKeys = 0;
303 numberOfEqualCols = 0;
306 * Initialize for processing of keys for attr 1.
308 * xform[i] points to the currently best scan key of strategy type i+1; it
309 * is NULL if we haven't yet found such a key for this attr.
311 attno = 1;
312 memset(xform, 0, sizeof(xform));
315 * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
316 * handle after-last-key processing. Actual exit from the loop is at the
317 * "break" statement below.
319 for (i = 0;; cur++, i++)
321 if (i < numberOfKeys)
323 /* See comments above about NULLs and IS NULL handling. */
324 /* Note: we assume SK_ISNULL is never set in a row header key */
325 if (cur->sk_flags & SK_ISNULL)
327 if (cur->sk_flags & SK_SEARCHNULL)
329 cur->sk_strategy = BTEqualStrategyNumber;
330 cur->sk_subtype = InvalidOid;
332 else
334 so->qual_ok = false;
335 return;
341 * If we are at the end of the keys for a particular attr, finish up
342 * processing and emit the cleaned-up keys.
344 if (i == numberOfKeys || cur->sk_attno != attno)
346 int priorNumberOfEqualCols = numberOfEqualCols;
348 /* check input keys are correctly ordered */
349 if (i < numberOfKeys && cur->sk_attno < attno)
350 elog(ERROR, "btree index keys must be ordered by attribute");
353 * If = has been specified, all other keys can be eliminated as
354 * redundant. In case of key > 2 && key == 1 we can set qual_ok
355 * to false and abandon further processing.
357 if (xform[BTEqualStrategyNumber - 1])
359 ScanKey eq = xform[BTEqualStrategyNumber - 1];
361 for (j = BTMaxStrategyNumber; --j >= 0;)
363 ScanKey chk = xform[j];
365 if (!chk || j == (BTEqualStrategyNumber - 1))
366 continue;
368 /* IS NULL together with any other predicate must fail */
369 if (eq->sk_flags & SK_SEARCHNULL)
371 so->qual_ok = false;
372 return;
375 if (_bt_compare_scankey_args(scan, chk, eq, chk,
376 &test_result))
378 if (!test_result)
380 /* keys proven mutually contradictory */
381 so->qual_ok = false;
382 return;
384 /* else discard the redundant non-equality key */
385 xform[j] = NULL;
387 /* else, cannot determine redundancy, keep both keys */
389 /* track number of attrs for which we have "=" keys */
390 numberOfEqualCols++;
393 /* try to keep only one of <, <= */
394 if (xform[BTLessStrategyNumber - 1]
395 && xform[BTLessEqualStrategyNumber - 1])
397 ScanKey lt = xform[BTLessStrategyNumber - 1];
398 ScanKey le = xform[BTLessEqualStrategyNumber - 1];
400 if (_bt_compare_scankey_args(scan, le, lt, le,
401 &test_result))
403 if (test_result)
404 xform[BTLessEqualStrategyNumber - 1] = NULL;
405 else
406 xform[BTLessStrategyNumber - 1] = NULL;
410 /* try to keep only one of >, >= */
411 if (xform[BTGreaterStrategyNumber - 1]
412 && xform[BTGreaterEqualStrategyNumber - 1])
414 ScanKey gt = xform[BTGreaterStrategyNumber - 1];
415 ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1];
417 if (_bt_compare_scankey_args(scan, ge, gt, ge,
418 &test_result))
420 if (test_result)
421 xform[BTGreaterEqualStrategyNumber - 1] = NULL;
422 else
423 xform[BTGreaterStrategyNumber - 1] = NULL;
428 * Emit the cleaned-up keys into the outkeys[] array, and then
429 * mark them if they are required. They are required (possibly
430 * only in one direction) if all attrs before this one had "=".
432 for (j = BTMaxStrategyNumber; --j >= 0;)
434 if (xform[j])
436 ScanKey outkey = &outkeys[new_numberOfKeys++];
438 memcpy(outkey, xform[j], sizeof(ScanKeyData));
439 if (priorNumberOfEqualCols == attno - 1)
440 _bt_mark_scankey_required(outkey);
445 * Exit loop here if done.
447 if (i == numberOfKeys)
448 break;
450 /* Re-initialize for new attno */
451 attno = cur->sk_attno;
452 memset(xform, 0, sizeof(xform));
455 /* apply indoption to scankey (might change sk_strategy!) */
456 _bt_mark_scankey_with_indoption(cur, indoption);
458 /* check strategy this key's operator corresponds to */
459 j = cur->sk_strategy - 1;
461 /* if row comparison, push it directly to the output array */
462 if (cur->sk_flags & SK_ROW_HEADER)
464 ScanKey outkey = &outkeys[new_numberOfKeys++];
466 memcpy(outkey, cur, sizeof(ScanKeyData));
467 if (numberOfEqualCols == attno - 1)
468 _bt_mark_scankey_required(outkey);
471 * We don't support RowCompare using equality; such a qual would
472 * mess up the numberOfEqualCols tracking.
474 Assert(j != (BTEqualStrategyNumber - 1));
475 continue;
478 /* have we seen one of these before? */
479 if (xform[j] == NULL)
481 /* nope, so remember this scankey */
482 xform[j] = cur;
484 else
486 /* yup, keep only the more restrictive key */
488 /* if either arg is NULL, don't try to compare */
489 if ((cur->sk_flags | xform[j]->sk_flags) & SK_ISNULL)
491 /* at least one of them must be an IS NULL clause */
492 Assert(j == (BTEqualStrategyNumber - 1));
493 Assert((cur->sk_flags | xform[j]->sk_flags) & SK_SEARCHNULL);
494 /* if one is and one isn't, the search must fail */
495 if ((cur->sk_flags ^ xform[j]->sk_flags) & SK_SEARCHNULL)
497 so->qual_ok = false;
498 return;
500 /* we have duplicate IS NULL clauses, ignore the newer one */
501 continue;
504 if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
505 &test_result))
507 if (test_result)
508 xform[j] = cur;
509 else if (j == (BTEqualStrategyNumber - 1))
511 /* key == a && key == b, but a != b */
512 so->qual_ok = false;
513 return;
515 /* else old key is more restrictive, keep it */
517 else
520 * We can't determine which key is more restrictive. Keep the
521 * previous one in xform[j] and push this one directly to the
522 * output array.
524 ScanKey outkey = &outkeys[new_numberOfKeys++];
526 memcpy(outkey, cur, sizeof(ScanKeyData));
527 if (numberOfEqualCols == attno - 1)
528 _bt_mark_scankey_required(outkey);
533 so->numberOfKeys = new_numberOfKeys;
537 * Compare two scankey values using a specified operator. Both values
538 * must be already known non-NULL.
540 * The test we want to perform is logically "leftarg op rightarg", where
541 * leftarg and rightarg are the sk_argument values in those ScanKeys, and
542 * the comparison operator is the one in the op ScanKey. However, in
543 * cross-data-type situations we may need to look up the correct operator in
544 * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
545 * and amoplefttype/amoprighttype equal to the two argument datatypes.
547 * If the opfamily doesn't supply a complete set of cross-type operators we
548 * may not be able to make the comparison. If we can make the comparison
549 * we store the operator result in *result and return TRUE. We return FALSE
550 * if the comparison could not be made.
552 * Note: op always points at the same ScanKey as either leftarg or rightarg.
553 * Since we don't scribble on the scankeys, this aliasing should cause no
554 * trouble.
556 * Note: this routine needs to be insensitive to any DESC option applied
557 * to the index column. For example, "x < 4" is a tighter constraint than
558 * "x < 5" regardless of which way the index is sorted. We don't worry about
559 * NULLS FIRST/LAST either, since the given values are never nulls.
561 static bool
562 _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
563 ScanKey leftarg, ScanKey rightarg,
564 bool *result)
566 Relation rel = scan->indexRelation;
567 Oid lefttype,
568 righttype,
569 optype,
570 opcintype,
571 cmp_op;
572 StrategyNumber strat;
575 * The opfamily we need to worry about is identified by the index column.
577 Assert(leftarg->sk_attno == rightarg->sk_attno);
579 opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
582 * Determine the actual datatypes of the ScanKey arguments. We have to
583 * support the convention that sk_subtype == InvalidOid means the opclass
584 * input type; this is a hack to simplify life for ScanKeyInit().
586 lefttype = leftarg->sk_subtype;
587 if (lefttype == InvalidOid)
588 lefttype = opcintype;
589 righttype = rightarg->sk_subtype;
590 if (righttype == InvalidOid)
591 righttype = opcintype;
592 optype = op->sk_subtype;
593 if (optype == InvalidOid)
594 optype = opcintype;
597 * If leftarg and rightarg match the types expected for the "op" scankey,
598 * we can use its already-looked-up comparison function.
600 if (lefttype == opcintype && righttype == optype)
602 *result = DatumGetBool(FunctionCall2(&op->sk_func,
603 leftarg->sk_argument,
604 rightarg->sk_argument));
605 return true;
609 * Otherwise, we need to go to the syscache to find the appropriate
610 * operator. (This cannot result in infinite recursion, since no
611 * indexscan initiated by syscache lookup will use cross-data-type
612 * operators.)
614 * If the sk_strategy was flipped by _bt_mark_scankey_with_indoption, we
615 * have to un-flip it to get the correct opfamily member.
617 strat = op->sk_strategy;
618 if (op->sk_flags & SK_BT_DESC)
619 strat = BTCommuteStrategyNumber(strat);
621 cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
622 lefttype,
623 righttype,
624 strat);
625 if (OidIsValid(cmp_op))
627 RegProcedure cmp_proc = get_opcode(cmp_op);
629 if (RegProcedureIsValid(cmp_proc))
631 *result = DatumGetBool(OidFunctionCall2(cmp_proc,
632 leftarg->sk_argument,
633 rightarg->sk_argument));
634 return true;
638 /* Can't make the comparison */
639 *result = false; /* suppress compiler warnings */
640 return false;
644 * Mark a scankey with info from the index's indoption array.
646 * We copy the appropriate indoption value into the scankey sk_flags
647 * (shifting to avoid clobbering system-defined flag bits). Also, if
648 * the DESC option is set, commute (flip) the operator strategy number.
650 * This function is applied to the *input* scankey structure; therefore
651 * on a rescan we will be looking at already-processed scankeys. Hence
652 * we have to be careful not to re-commute the strategy if we already did it.
653 * It's a bit ugly to modify the caller's copy of the scankey but in practice
654 * there shouldn't be any problem, since the index's indoptions are certainly
655 * not going to change while the scankey survives.
657 static void
658 _bt_mark_scankey_with_indoption(ScanKey skey, int16 *indoption)
660 int addflags;
662 addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
663 if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
664 skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
665 skey->sk_flags |= addflags;
667 if (skey->sk_flags & SK_ROW_HEADER)
669 ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
671 for (;;)
673 Assert(subkey->sk_flags & SK_ROW_MEMBER);
674 addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
675 if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
676 subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
677 subkey->sk_flags |= addflags;
678 if (subkey->sk_flags & SK_ROW_END)
679 break;
680 subkey++;
686 * Mark a scankey as "required to continue the scan".
688 * Depending on the operator type, the key may be required for both scan
689 * directions or just one. Also, if the key is a row comparison header,
690 * we have to mark the appropriate subsidiary ScanKeys as required. In
691 * such cases, the first subsidiary key is required, but subsequent ones
692 * are required only as long as they correspond to successive index columns
693 * and match the leading column as to sort direction.
694 * Otherwise the row comparison ordering is different from the index ordering
695 * and so we can't stop the scan on the basis of those lower-order columns.
697 * Note: when we set required-key flag bits in a subsidiary scankey, we are
698 * scribbling on a data structure belonging to the index AM's caller, not on
699 * our private copy. This should be OK because the marking will not change
700 * from scan to scan within a query, and so we'd just re-mark the same way
701 * anyway on a rescan. Something to keep an eye on though.
703 static void
704 _bt_mark_scankey_required(ScanKey skey)
706 int addflags;
708 switch (skey->sk_strategy)
710 case BTLessStrategyNumber:
711 case BTLessEqualStrategyNumber:
712 addflags = SK_BT_REQFWD;
713 break;
714 case BTEqualStrategyNumber:
715 addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
716 break;
717 case BTGreaterEqualStrategyNumber:
718 case BTGreaterStrategyNumber:
719 addflags = SK_BT_REQBKWD;
720 break;
721 default:
722 elog(ERROR, "unrecognized StrategyNumber: %d",
723 (int) skey->sk_strategy);
724 addflags = 0; /* keep compiler quiet */
725 break;
728 skey->sk_flags |= addflags;
730 if (skey->sk_flags & SK_ROW_HEADER)
732 ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
733 AttrNumber attno = skey->sk_attno;
735 /* First subkey should be same as the header says */
736 Assert(subkey->sk_attno == attno);
738 for (;;)
740 Assert(subkey->sk_flags & SK_ROW_MEMBER);
741 if (subkey->sk_attno != attno)
742 break; /* non-adjacent key, so not required */
743 if (subkey->sk_strategy != skey->sk_strategy)
744 break; /* wrong direction, so not required */
745 subkey->sk_flags |= addflags;
746 if (subkey->sk_flags & SK_ROW_END)
747 break;
748 subkey++;
749 attno++;
755 * Test whether an indextuple satisfies all the scankey conditions.
757 * If so, copy its TID into scan->xs_ctup.t_self, and return TRUE.
758 * If not, return FALSE (xs_ctup is not changed).
760 * If the tuple fails to pass the qual, we also determine whether there's
761 * any need to continue the scan beyond this tuple, and set *continuescan
762 * accordingly. See comments for _bt_preprocess_keys(), above, about how
763 * this is done.
765 * scan: index scan descriptor (containing a search-type scankey)
766 * page: buffer page containing index tuple
767 * offnum: offset number of index tuple (must be a valid item!)
768 * dir: direction we are scanning in
769 * continuescan: output parameter (will be set correctly in all cases)
771 bool
772 _bt_checkkeys(IndexScanDesc scan,
773 Page page, OffsetNumber offnum,
774 ScanDirection dir, bool *continuescan)
776 ItemId iid = PageGetItemId(page, offnum);
777 bool tuple_valid;
778 IndexTuple tuple;
779 TupleDesc tupdesc;
780 BTScanOpaque so;
781 int keysz;
782 int ikey;
783 ScanKey key;
785 *continuescan = true; /* default assumption */
788 * If the scan specifies not to return killed tuples, then we treat a
789 * killed tuple as not passing the qual. Most of the time, it's a win to
790 * not bother examining the tuple's index keys, but just return
791 * immediately with continuescan = true to proceed to the next tuple.
792 * However, if this is the last tuple on the page, we should check the
793 * index keys to prevent uselessly advancing to the next page.
795 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
797 /* return immediately if there are more tuples on the page */
798 if (ScanDirectionIsForward(dir))
800 if (offnum < PageGetMaxOffsetNumber(page))
801 return false;
803 else
805 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
807 if (offnum > P_FIRSTDATAKEY(opaque))
808 return false;
812 * OK, we want to check the keys, but we'll return FALSE even if the
813 * tuple passes the key tests.
815 tuple_valid = false;
817 else
818 tuple_valid = true;
820 tuple = (IndexTuple) PageGetItem(page, iid);
822 IncrIndexProcessed();
824 tupdesc = RelationGetDescr(scan->indexRelation);
825 so = (BTScanOpaque) scan->opaque;
826 keysz = so->numberOfKeys;
828 for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
830 Datum datum;
831 bool isNull;
832 Datum test;
834 /* row-comparison keys need special processing */
835 if (key->sk_flags & SK_ROW_HEADER)
837 if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
838 continue;
839 return false;
842 datum = index_getattr(tuple,
843 key->sk_attno,
844 tupdesc,
845 &isNull);
847 if (key->sk_flags & SK_ISNULL)
849 /* Handle IS NULL tests */
850 Assert(key->sk_flags & SK_SEARCHNULL);
852 if (isNull)
853 continue; /* tuple satisfies this qual */
856 * Tuple fails this qual. If it's a required qual for the current
857 * scan direction, then we can conclude no further tuples will
858 * pass, either.
860 if ((key->sk_flags & SK_BT_REQFWD) &&
861 ScanDirectionIsForward(dir))
862 *continuescan = false;
863 else if ((key->sk_flags & SK_BT_REQBKWD) &&
864 ScanDirectionIsBackward(dir))
865 *continuescan = false;
868 * In any case, this indextuple doesn't match the qual.
870 return false;
873 if (isNull)
875 if (key->sk_flags & SK_BT_NULLS_FIRST)
878 * Since NULLs are sorted before non-NULLs, we know we have
879 * reached the lower limit of the range of values for this
880 * index attr. On a backward scan, we can stop if this qual
881 * is one of the "must match" subset. On a forward scan,
882 * however, we should keep going.
884 if ((key->sk_flags & SK_BT_REQBKWD) &&
885 ScanDirectionIsBackward(dir))
886 *continuescan = false;
888 else
891 * Since NULLs are sorted after non-NULLs, we know we have
892 * reached the upper limit of the range of values for this
893 * index attr. On a forward scan, we can stop if this qual is
894 * one of the "must match" subset. On a backward scan,
895 * however, we should keep going.
897 if ((key->sk_flags & SK_BT_REQFWD) &&
898 ScanDirectionIsForward(dir))
899 *continuescan = false;
903 * In any case, this indextuple doesn't match the qual.
905 return false;
908 test = FunctionCall2(&key->sk_func, datum, key->sk_argument);
910 if (!DatumGetBool(test))
913 * Tuple fails this qual. If it's a required qual for the current
914 * scan direction, then we can conclude no further tuples will
915 * pass, either.
917 * Note: because we stop the scan as soon as any required equality
918 * qual fails, it is critical that equality quals be used for the
919 * initial positioning in _bt_first() when they are available. See
920 * comments in _bt_first().
922 if ((key->sk_flags & SK_BT_REQFWD) &&
923 ScanDirectionIsForward(dir))
924 *continuescan = false;
925 else if ((key->sk_flags & SK_BT_REQBKWD) &&
926 ScanDirectionIsBackward(dir))
927 *continuescan = false;
930 * In any case, this indextuple doesn't match the qual.
932 return false;
936 /* If we get here, the tuple passes all index quals. */
937 if (tuple_valid)
938 scan->xs_ctup.t_self = tuple->t_tid;
940 return tuple_valid;
944 * Test whether an indextuple satisfies a row-comparison scan condition.
946 * Return true if so, false if not. If not, also clear *continuescan if
947 * it's not possible for any future tuples in the current scan direction
948 * to pass the qual.
950 * This is a subroutine for _bt_checkkeys, which see for more info.
952 static bool
953 _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
954 ScanDirection dir, bool *continuescan)
956 ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
957 int32 cmpresult = 0;
958 bool result;
960 /* First subkey should be same as the header says */
961 Assert(subkey->sk_attno == skey->sk_attno);
963 /* Loop over columns of the row condition */
964 for (;;)
966 Datum datum;
967 bool isNull;
969 Assert(subkey->sk_flags & SK_ROW_MEMBER);
971 datum = index_getattr(tuple,
972 subkey->sk_attno,
973 tupdesc,
974 &isNull);
976 if (isNull)
978 if (subkey->sk_flags & SK_BT_NULLS_FIRST)
981 * Since NULLs are sorted before non-NULLs, we know we have
982 * reached the lower limit of the range of values for this
983 * index attr. On a backward scan, we can stop if this qual is
984 * one of the "must match" subset. On a forward scan,
985 * however, we should keep going.
987 if ((subkey->sk_flags & SK_BT_REQBKWD) &&
988 ScanDirectionIsBackward(dir))
989 *continuescan = false;
991 else
994 * Since NULLs are sorted after non-NULLs, we know we have
995 * reached the upper limit of the range of values for this
996 * index attr. On a forward scan, we can stop if this qual is
997 * one of the "must match" subset. On a backward scan,
998 * however, we should keep going.
1000 if ((subkey->sk_flags & SK_BT_REQFWD) &&
1001 ScanDirectionIsForward(dir))
1002 *continuescan = false;
1006 * In any case, this indextuple doesn't match the qual.
1008 return false;
1011 if (subkey->sk_flags & SK_ISNULL)
1014 * Unlike the simple-scankey case, this isn't a disallowed case.
1015 * But it can never match. If all the earlier row comparison
1016 * columns are required for the scan direction, we can stop the
1017 * scan, because there can't be another tuple that will succeed.
1019 if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
1020 subkey--;
1021 if ((subkey->sk_flags & SK_BT_REQFWD) &&
1022 ScanDirectionIsForward(dir))
1023 *continuescan = false;
1024 else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
1025 ScanDirectionIsBackward(dir))
1026 *continuescan = false;
1027 return false;
1030 /* Perform the test --- three-way comparison not bool operator */
1031 cmpresult = DatumGetInt32(FunctionCall2(&subkey->sk_func,
1032 datum,
1033 subkey->sk_argument));
1035 if (subkey->sk_flags & SK_BT_DESC)
1036 cmpresult = -cmpresult;
1038 /* Done comparing if unequal, else advance to next column */
1039 if (cmpresult != 0)
1040 break;
1042 if (subkey->sk_flags & SK_ROW_END)
1043 break;
1044 subkey++;
1048 * At this point cmpresult indicates the overall result of the row
1049 * comparison, and subkey points to the deciding column (or the last
1050 * column if the result is "=").
1052 switch (subkey->sk_strategy)
1054 /* EQ and NE cases aren't allowed here */
1055 case BTLessStrategyNumber:
1056 result = (cmpresult < 0);
1057 break;
1058 case BTLessEqualStrategyNumber:
1059 result = (cmpresult <= 0);
1060 break;
1061 case BTGreaterEqualStrategyNumber:
1062 result = (cmpresult >= 0);
1063 break;
1064 case BTGreaterStrategyNumber:
1065 result = (cmpresult > 0);
1066 break;
1067 default:
1068 elog(ERROR, "unrecognized RowCompareType: %d",
1069 (int) subkey->sk_strategy);
1070 result = 0; /* keep compiler quiet */
1071 break;
1074 if (!result)
1077 * Tuple fails this qual. If it's a required qual for the current
1078 * scan direction, then we can conclude no further tuples will pass,
1079 * either. Note we have to look at the deciding column, not
1080 * necessarily the first or last column of the row condition.
1082 if ((subkey->sk_flags & SK_BT_REQFWD) &&
1083 ScanDirectionIsForward(dir))
1084 *continuescan = false;
1085 else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
1086 ScanDirectionIsBackward(dir))
1087 *continuescan = false;
1090 return result;
1094 * _bt_killitems - set LP_DEAD state for items an indexscan caller has
1095 * told us were killed
1097 * scan->so contains information about the current page and killed tuples
1098 * thereon (generally, this should only be called if so->numKilled > 0).
1100 * The caller must have pin on so->currPos.buf, but may or may not have
1101 * read-lock, as indicated by haveLock. Note that we assume read-lock
1102 * is sufficient for setting LP_DEAD status (which is only a hint).
1104 * We match items by heap TID before assuming they are the right ones to
1105 * delete. We cope with cases where items have moved right due to insertions.
1106 * If an item has moved off the current page due to a split, we'll fail to
1107 * find it and do nothing (this is not an error case --- we assume the item
1108 * will eventually get marked in a future indexscan). Note that because we
1109 * hold pin on the target page continuously from initially reading the items
1110 * until applying this function, VACUUM cannot have deleted any items from
1111 * the page, and so there is no need to search left from the recorded offset.
1112 * (This observation also guarantees that the item is still the right one
1113 * to delete, which might otherwise be questionable since heap TIDs can get
1114 * recycled.)
1116 void
1117 _bt_killitems(IndexScanDesc scan, bool haveLock)
1119 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1120 Page page;
1121 BTPageOpaque opaque;
1122 OffsetNumber minoff;
1123 OffsetNumber maxoff;
1124 int i;
1125 bool killedsomething = false;
1127 Assert(BufferIsValid(so->currPos.buf));
1129 if (!haveLock)
1130 LockBuffer(so->currPos.buf, BT_READ);
1132 page = BufferGetPage(so->currPos.buf);
1133 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1134 minoff = P_FIRSTDATAKEY(opaque);
1135 maxoff = PageGetMaxOffsetNumber(page);
1137 for (i = 0; i < so->numKilled; i++)
1139 int itemIndex = so->killedItems[i];
1140 BTScanPosItem *kitem = &so->currPos.items[itemIndex];
1141 OffsetNumber offnum = kitem->indexOffset;
1143 Assert(itemIndex >= so->currPos.firstItem &&
1144 itemIndex <= so->currPos.lastItem);
1145 if (offnum < minoff)
1146 continue; /* pure paranoia */
1147 while (offnum <= maxoff)
1149 ItemId iid = PageGetItemId(page, offnum);
1150 IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
1152 if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
1154 /* found the item */
1155 ItemIdMarkDead(iid);
1156 killedsomething = true;
1157 break; /* out of inner search loop */
1159 offnum = OffsetNumberNext(offnum);
1164 * Since this can be redone later if needed, it's treated the same as a
1165 * commit-hint-bit status update for heap tuples: we mark the buffer dirty
1166 * but don't make a WAL log entry.
1168 * Whenever we mark anything LP_DEAD, we also set the page's
1169 * BTP_HAS_GARBAGE flag, which is likewise just a hint.
1171 if (killedsomething)
1173 opaque->btpo_flags |= BTP_HAS_GARBAGE;
1174 SetBufferCommitInfoNeedsSave(so->currPos.buf);
1177 if (!haveLock)
1178 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1181 * Always reset the scan state, so we don't look for same items on other
1182 * pages.
1184 so->numKilled = 0;
1189 * The following routines manage a shared-memory area in which we track
1190 * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
1191 * operations. There is a single counter which increments each time we
1192 * start a vacuum to assign it a cycle ID. Since multiple vacuums could
1193 * be active concurrently, we have to track the cycle ID for each active
1194 * vacuum; this requires at most MaxBackends entries (usually far fewer).
1195 * We assume at most one vacuum can be active for a given index.
1197 * Access to the shared memory area is controlled by BtreeVacuumLock.
1198 * In principle we could use a separate lmgr locktag for each index,
1199 * but a single LWLock is much cheaper, and given the short time that
1200 * the lock is ever held, the concurrency hit should be minimal.
1203 typedef struct BTOneVacInfo
1205 LockRelId relid; /* global identifier of an index */
1206 BTCycleId cycleid; /* cycle ID for its active VACUUM */
1207 } BTOneVacInfo;
1209 typedef struct BTVacInfo
1211 BTCycleId cycle_ctr; /* cycle ID most recently assigned */
1212 int num_vacuums; /* number of currently active VACUUMs */
1213 int max_vacuums; /* allocated length of vacuums[] array */
1214 BTOneVacInfo vacuums[1]; /* VARIABLE LENGTH ARRAY */
1215 } BTVacInfo;
1217 static BTVacInfo *btvacinfo;
1221 * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
1222 * or zero if there is no active VACUUM
1224 * Note: for correct interlocking, the caller must already hold pin and
1225 * exclusive lock on each buffer it will store the cycle ID into. This
1226 * ensures that even if a VACUUM starts immediately afterwards, it cannot
1227 * process those pages until the page split is complete.
1229 BTCycleId
1230 _bt_vacuum_cycleid(Relation rel)
1232 BTCycleId result = 0;
1233 int i;
1235 /* Share lock is enough since this is a read-only operation */
1236 LWLockAcquire(BtreeVacuumLock, LW_SHARED);
1238 for (i = 0; i < btvacinfo->num_vacuums; i++)
1240 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1242 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1243 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1245 result = vac->cycleid;
1246 break;
1250 LWLockRelease(BtreeVacuumLock);
1251 return result;
1255 * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
1257 * Note: the caller must guarantee that it will eventually call
1258 * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
1259 * that this happens even in elog(FATAL) scenarios, the appropriate coding
1260 * is not just a PG_TRY, but
1261 * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
1263 BTCycleId
1264 _bt_start_vacuum(Relation rel)
1266 BTCycleId result;
1267 int i;
1268 BTOneVacInfo *vac;
1270 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1273 * Assign the next cycle ID, being careful to avoid zero as well as the
1274 * reserved high values.
1276 result = ++(btvacinfo->cycle_ctr);
1277 if (result == 0 || result > MAX_BT_CYCLE_ID)
1278 result = btvacinfo->cycle_ctr = 1;
1280 /* Let's just make sure there's no entry already for this index */
1281 for (i = 0; i < btvacinfo->num_vacuums; i++)
1283 vac = &btvacinfo->vacuums[i];
1284 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1285 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1288 * Unlike most places in the backend, we have to explicitly
1289 * release our LWLock before throwing an error. This is because
1290 * we expect _bt_end_vacuum() to be called before transaction
1291 * abort cleanup can run to release LWLocks.
1293 LWLockRelease(BtreeVacuumLock);
1294 elog(ERROR, "multiple active vacuums for index \"%s\"",
1295 RelationGetRelationName(rel));
1299 /* OK, add an entry */
1300 if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
1302 LWLockRelease(BtreeVacuumLock);
1303 elog(ERROR, "out of btvacinfo slots");
1305 vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
1306 vac->relid = rel->rd_lockInfo.lockRelId;
1307 vac->cycleid = result;
1308 btvacinfo->num_vacuums++;
1310 LWLockRelease(BtreeVacuumLock);
1311 return result;
1315 * _bt_end_vacuum --- mark a btree VACUUM operation as done
1317 * Note: this is deliberately coded not to complain if no entry is found;
1318 * this allows the caller to put PG_TRY around the start_vacuum operation.
1320 void
1321 _bt_end_vacuum(Relation rel)
1323 int i;
1325 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1327 /* Find the array entry */
1328 for (i = 0; i < btvacinfo->num_vacuums; i++)
1330 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1332 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1333 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1335 /* Remove it by shifting down the last entry */
1336 *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
1337 btvacinfo->num_vacuums--;
1338 break;
1342 LWLockRelease(BtreeVacuumLock);
1346 * _bt_end_vacuum wrapped as an on_shmem_exit callback function
1348 void
1349 _bt_end_vacuum_callback(int code, Datum arg)
1351 _bt_end_vacuum((Relation) DatumGetPointer(arg));
1355 * BTreeShmemSize --- report amount of shared memory space needed
1357 Size
1358 BTreeShmemSize(void)
1360 Size size;
1362 size = offsetof(BTVacInfo, vacuums[0]);
1363 size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
1364 return size;
1368 * BTreeShmemInit --- initialize this module's shared memory
1370 void
1371 BTreeShmemInit(void)
1373 bool found;
1375 btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
1376 BTreeShmemSize(),
1377 &found);
1379 if (!IsUnderPostmaster)
1381 /* Initialize shared memory area */
1382 Assert(!found);
1385 * It doesn't really matter what the cycle counter starts at, but
1386 * having it always start the same doesn't seem good. Seed with
1387 * low-order bits of time() instead.
1389 btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
1391 btvacinfo->num_vacuums = 0;
1392 btvacinfo->max_vacuums = MaxBackends;
1394 else
1395 Assert(found);
1398 Datum
1399 btoptions(PG_FUNCTION_ARGS)
1401 Datum reloptions = PG_GETARG_DATUM(0);
1402 bool validate = PG_GETARG_BOOL(1);
1403 bytea *result;
1405 result = default_reloptions(reloptions, validate, RELOPT_KIND_BTREE);
1406 if (result)
1407 PG_RETURN_BYTEA_P(result);
1408 PG_RETURN_NULL();