/*-------------------------------------------------------------------------
 *
 * nbtutils.c
 *    Utility code for Postgres btree implementation.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <time.h>

#include "access/genam.h"
#include "access/nbtree.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "executor/execdebug.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/lsyscache.h"
static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
                         ScanKey leftarg, ScanKey rightarg,
                         bool *result);
static void _bt_mark_scankey_with_indoption(ScanKey skey, int16 *indoption);
static void _bt_mark_scankey_required(ScanKey skey);
static bool _bt_check_rowcompare(ScanKey skey,
                     IndexTuple tuple, TupleDesc tupdesc,
                     ScanDirection dir, bool *continuescan);
/*
 * _bt_mkscankey
 *      Build an insertion scan key that contains comparison data from itup
 *      as well as comparator routines appropriate to the key datatypes.
 *
 *      The result is intended for use with _bt_compare().
 */
ScanKey
_bt_mkscankey(Relation rel, IndexTuple itup)
{
    ScanKey     skey;
    TupleDesc   itupdesc;
    int         natts;
    int16      *indoption;
    int         i;

    itupdesc = RelationGetDescr(rel);
    natts = RelationGetNumberOfAttributes(rel);
    indoption = rel->rd_indoption;

    skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

    for (i = 0; i < natts; i++)
    {
        FmgrInfo   *procinfo;
        Datum       arg;
        bool        null;
        int         flags;

        /*
         * We can use the cached (default) support procs since no cross-type
         * comparison can be needed.
         */
        procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
        arg = index_getattr(itup, i + 1, itupdesc, &null);
        flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
        ScanKeyEntryInitializeWithInfo(&skey[i],
                                       flags,
                                       (AttrNumber) (i + 1),
                                       InvalidStrategy,
                                       InvalidOid,
                                       procinfo,
                                       arg);
    }

    return skey;
}
/*
 * _bt_mkscankey_nodata
 *      Build an insertion scan key that contains 3-way comparator routines
 *      appropriate to the key datatypes, but no comparison data.  The
 *      comparison data ultimately used must match the key datatypes.
 *
 *      The result cannot be used with _bt_compare(), unless comparison
 *      data is first stored into the key entries.  Currently this
 *      routine is only called by nbtsort.c and tuplesort.c, which have
 *      their own comparison routines.
 */
ScanKey
_bt_mkscankey_nodata(Relation rel)
{
    ScanKey     skey;
    int         natts;
    int16      *indoption;
    int         i;

    natts = RelationGetNumberOfAttributes(rel);
    indoption = rel->rd_indoption;

    skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

    for (i = 0; i < natts; i++)
    {
        FmgrInfo   *procinfo;
        int         flags;

        /*
         * We can use the cached (default) support procs since no cross-type
         * comparison can be needed.
         */
        procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
        flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT);
        ScanKeyEntryInitializeWithInfo(&skey[i],
                                       flags,
                                       (AttrNumber) (i + 1),
                                       InvalidStrategy,
                                       InvalidOid,
                                       procinfo,
                                       (Datum) 0);
    }

    return skey;
}
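/*
 * Hedged sketch of how the "nodata" scankeys can be used by their callers:
 * the prepared 3-way comparator in sk_func is invoked directly on a pair of
 * attribute values that the caller extracts itself, and the DESC indoption
 * bit is honored by negating the result.  The variable names (datum1,
 * datum2, compare) are hypothetical:
 *
 *      int32       compare;
 *
 *      compare = DatumGetInt32(FunctionCall2(&skey[i].sk_func,
 *                                            datum1, datum2));
 *      if (skey[i].sk_flags & SK_BT_DESC)
 *          compare = -compare;
 */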
/*
 * free a scan key made by either _bt_mkscankey or _bt_mkscankey_nodata.
 */
void
_bt_freeskey(ScanKey skey)
{
    pfree(skey);
}

/*
 * free a retracement stack made by _bt_search.
 */
void
_bt_freestack(BTStack stack)
{
    BTStack     ostack;

    while (stack != NULL)
    {
        ostack = stack;
        stack = stack->bts_parent;
        pfree(ostack);
    }
}
/*
 *  _bt_preprocess_keys() -- Preprocess scan keys
 *
 * The caller-supplied search-type keys (in scan->keyData[]) are copied to
 * so->keyData[] with possible transformation.  scan->numberOfKeys is
 * the number of input keys, so->numberOfKeys gets the number of output
 * keys (possibly less, never greater).
 *
 * The output keys are marked with additional sk_flag bits beyond the
 * system-standard bits supplied by the caller.  The DESC and NULLS_FIRST
 * indoption bits for the relevant index attribute are copied into the flags.
 * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
 * so that the index sorts in the desired direction.
 *
 * One key purpose of this routine is to discover how many scan keys
 * must be satisfied to continue the scan.  It also attempts to eliminate
 * redundant keys and detect contradictory keys.  (If the index opfamily
 * provides incomplete sets of cross-type operators, we may fail to detect
 * redundant or contradictory keys, but we can survive that.)
 *
 * The output keys must be sorted by index attribute.  Presently we expect
 * (but verify) that the input keys are already so sorted --- this is done
 * by group_clauses_by_indexkey() in indxpath.c.  Some reordering of the keys
 * within each attribute may be done as a byproduct of the processing here,
 * but no other code depends on that.
 *
 * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
 * if they must be satisfied in order to continue the scan forward or backward
 * respectively.  _bt_checkkeys uses these flags.  For example, if the quals
 * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
 * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
 * But once we reach tuples like (1,4,z) we can stop scanning because no
 * later tuples could match.  This is reflected by marking the x and y keys,
 * but not the z key, with SK_BT_REQFWD.  In general, the keys for leading
 * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
 * For the first attribute without an "=" key, any "<" and "<=" keys are
 * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
 * This can be seen to be correct by considering the above example.  Note
 * in particular that if there are no keys for a given attribute, the keys for
 * subsequent attributes can never be required; for instance "WHERE y = 4"
 * requires a full-index scan.
 *
 * If possible, redundant keys are eliminated: we keep only the tightest
 * >/>= bound and the tightest </<= bound, and if there's an = key then
 * that's the only one returned.  (So, we return either a single = key,
 * or one or two boundary-condition keys for each attr.)  However, if we
 * cannot compare two keys for lack of a suitable cross-type operator,
 * we cannot eliminate either.  If there are two such keys of the same
 * operator strategy, the second one is just pushed into the output array
 * without further processing here.  We may also emit both >/>= or both
 * </<= keys if we can't compare them.  The logic about required keys still
 * works if we don't eliminate redundant keys.
 *
 * As a byproduct of this work, we can detect contradictory quals such
 * as "x = 1 AND x > 2".  If we see that, we return so->qual_ok = FALSE,
 * indicating the scan need not be run at all since no tuples can match.
 * (In this case we do not bother completing the output key array!)
 * Again, missing cross-type operators might cause us to fail to prove the
 * quals contradictory when they really are, but the scan will work correctly.
 *
 * Row comparison keys are currently also treated without any smarts:
 * we just transfer them into the preprocessed array without any
 * editorialization.  We can treat them the same as an ordinary inequality
 * comparison on the row's first index column, for the purposes of the logic
 * about required keys.
 *
 * Note: the reason we have to copy the preprocessed scan keys into private
 * storage is that we are modifying the array based on comparisons of the
 * key argument values, which could change on a rescan.  Therefore we can't
 * overwrite the caller's data structure.
 */
void
_bt_preprocess_keys(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int         numberOfKeys = scan->numberOfKeys;
    int16      *indoption = scan->indexRelation->rd_indoption;
    int         new_numberOfKeys;
    int         numberOfEqualCols;
    ScanKey     inkeys;
    ScanKey     outkeys;
    ScanKey     cur;
    ScanKey     xform[BTMaxStrategyNumber];
    bool        test_result;
    int         i,
                j;
    AttrNumber  attno;

    /* initialize result variables */
    so->qual_ok = true;
    so->numberOfKeys = 0;

    if (numberOfKeys < 1)
        return;                 /* done if qual-less scan */

    inkeys = scan->keyData;
    outkeys = so->keyData;
    cur = &inkeys[0];
    /* we check that input keys are correctly ordered */
    if (cur->sk_attno < 1)
        elog(ERROR, "btree index keys must be ordered by attribute");

    /* We can short-circuit most of the work if there's just one key */
    if (numberOfKeys == 1)
    {
        /*
         * We treat all btree operators as strict (even if they're not so
         * marked in pg_proc). This means that it is impossible for an
         * operator condition with a NULL comparison constant to succeed, and
         * we can reject it right away.
         *
         * However, we now also support "x IS NULL" clauses as search
         * conditions, so in that case keep going. The planner has not filled
         * in any particular strategy in this case, so set it to
         * BTEqualStrategyNumber --- we can treat IS NULL as an equality
         * operator for purposes of search strategy.
         */
        if (cur->sk_flags & SK_ISNULL)
        {
            if (cur->sk_flags & SK_SEARCHNULL)
            {
                cur->sk_strategy = BTEqualStrategyNumber;
                cur->sk_subtype = InvalidOid;
            }
            else
            {
                /* NULL comparison constant: qual can never succeed */
                so->qual_ok = false;
                return;
            }
        }
        _bt_mark_scankey_with_indoption(cur, indoption);
        memcpy(outkeys, cur, sizeof(ScanKeyData));
        so->numberOfKeys = 1;
        /* We can mark the qual as required if it's for first index col */
        if (cur->sk_attno == 1)
            _bt_mark_scankey_required(outkeys);
        return;
    }

    /*
     * Otherwise, do the full set of pushups.
     */
    new_numberOfKeys = 0;
    numberOfEqualCols = 0;

    /*
     * Initialize for processing of keys for attr 1.
     *
     * xform[i] points to the currently best scan key of strategy type i+1; it
     * is NULL if we haven't yet found such a key for this attr.
     */
    attno = 1;
    memset(xform, 0, sizeof(xform));

    /*
     * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
     * handle after-last-key processing.  Actual exit from the loop is at the
     * "break" statement below.
     */
    for (i = 0;; cur++, i++)
    {
        if (i < numberOfKeys)
        {
            /* See comments above about NULLs and IS NULL handling. */
            /* Note: we assume SK_ISNULL is never set in a row header key */
            if (cur->sk_flags & SK_ISNULL)
            {
                if (cur->sk_flags & SK_SEARCHNULL)
                {
                    cur->sk_strategy = BTEqualStrategyNumber;
                    cur->sk_subtype = InvalidOid;
                }
                else
                {
                    /* NULL comparison constant: qual can never succeed */
                    so->qual_ok = false;
                    return;
                }
            }
        }

        /*
         * If we are at the end of the keys for a particular attr, finish up
         * processing and emit the cleaned-up keys.
         */
        if (i == numberOfKeys || cur->sk_attno != attno)
        {
            int         priorNumberOfEqualCols = numberOfEqualCols;

            /* check input keys are correctly ordered */
            if (i < numberOfKeys && cur->sk_attno < attno)
                elog(ERROR, "btree index keys must be ordered by attribute");

            /*
             * If = has been specified, all other keys can be eliminated as
             * redundant.  In case of key > 2 && key == 1 we can set qual_ok
             * to false and abandon further processing.
             */
            if (xform[BTEqualStrategyNumber - 1])
            {
                ScanKey     eq = xform[BTEqualStrategyNumber - 1];

                for (j = BTMaxStrategyNumber; --j >= 0;)
                {
                    ScanKey     chk = xform[j];

                    if (!chk || j == (BTEqualStrategyNumber - 1))
                        continue;

                    /* IS NULL together with any other predicate must fail */
                    if (eq->sk_flags & SK_SEARCHNULL)
                    {
                        so->qual_ok = false;
                        return;
                    }

                    if (_bt_compare_scankey_args(scan, chk, eq, chk,
                                                 &test_result))
                    {
                        if (!test_result)
                        {
                            /* keys proven mutually contradictory */
                            so->qual_ok = false;
                            return;
                        }
                        /* else discard the redundant non-equality key */
                        xform[j] = NULL;
                    }
                    /* else, cannot determine redundancy, keep both keys */
                }
                /* track number of attrs for which we have "=" keys */
                numberOfEqualCols++;
            }

            /* try to keep only one of <, <= */
            if (xform[BTLessStrategyNumber - 1]
                && xform[BTLessEqualStrategyNumber - 1])
            {
                ScanKey     lt = xform[BTLessStrategyNumber - 1];
                ScanKey     le = xform[BTLessEqualStrategyNumber - 1];

                if (_bt_compare_scankey_args(scan, le, lt, le,
                                             &test_result))
                {
                    if (test_result)
                        xform[BTLessEqualStrategyNumber - 1] = NULL;
                    else
                        xform[BTLessStrategyNumber - 1] = NULL;
                }
            }

            /* try to keep only one of >, >= */
            if (xform[BTGreaterStrategyNumber - 1]
                && xform[BTGreaterEqualStrategyNumber - 1])
            {
                ScanKey     gt = xform[BTGreaterStrategyNumber - 1];
                ScanKey     ge = xform[BTGreaterEqualStrategyNumber - 1];

                if (_bt_compare_scankey_args(scan, ge, gt, ge,
                                             &test_result))
                {
                    if (test_result)
                        xform[BTGreaterEqualStrategyNumber - 1] = NULL;
                    else
                        xform[BTGreaterStrategyNumber - 1] = NULL;
                }
            }

            /*
             * Emit the cleaned-up keys into the outkeys[] array, and then
             * mark them if they are required.  They are required (possibly
             * only in one direction) if all attrs before this one had "=".
             */
            for (j = BTMaxStrategyNumber; --j >= 0;)
            {
                if (xform[j])
                {
                    ScanKey     outkey = &outkeys[new_numberOfKeys++];

                    memcpy(outkey, xform[j], sizeof(ScanKeyData));
                    if (priorNumberOfEqualCols == attno - 1)
                        _bt_mark_scankey_required(outkey);
                }
            }

            /*
             * Exit loop here if done.
             */
            if (i == numberOfKeys)
                break;

            /* Re-initialize for new attno */
            attno = cur->sk_attno;
            memset(xform, 0, sizeof(xform));
        }

        /* apply indoption to scankey (might change sk_strategy!) */
        _bt_mark_scankey_with_indoption(cur, indoption);

        /* check strategy this key's operator corresponds to */
        j = cur->sk_strategy - 1;

        /* if row comparison, push it directly to the output array */
        if (cur->sk_flags & SK_ROW_HEADER)
        {
            ScanKey     outkey = &outkeys[new_numberOfKeys++];

            memcpy(outkey, cur, sizeof(ScanKeyData));
            if (numberOfEqualCols == attno - 1)
                _bt_mark_scankey_required(outkey);

            /*
             * We don't support RowCompare using equality; such a qual would
             * mess up the numberOfEqualCols tracking.
             */
            Assert(j != (BTEqualStrategyNumber - 1));
            continue;
        }

        /* have we seen one of these before? */
        if (xform[j] == NULL)
        {
            /* nope, so remember this scankey */
            xform[j] = cur;
        }
        else
        {
            /* yup, keep only the more restrictive key */

            /* if either arg is NULL, don't try to compare */
            if ((cur->sk_flags | xform[j]->sk_flags) & SK_ISNULL)
            {
                /* at least one of them must be an IS NULL clause */
                Assert(j == (BTEqualStrategyNumber - 1));
                Assert((cur->sk_flags | xform[j]->sk_flags) & SK_SEARCHNULL);
                /* if one is and one isn't, the search must fail */
                if ((cur->sk_flags ^ xform[j]->sk_flags) & SK_SEARCHNULL)
                {
                    so->qual_ok = false;
                    return;
                }
                /* we have duplicate IS NULL clauses, ignore the newer one */
                continue;
            }

            if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
                                         &test_result))
            {
                if (test_result)
                    xform[j] = cur;
                else if (j == (BTEqualStrategyNumber - 1))
                {
                    /* key == a && key == b, but a != b */
                    so->qual_ok = false;
                    return;
                }
                /* else old key is more restrictive, keep it */
            }
            else
            {
                /*
                 * We can't determine which key is more restrictive.  Keep the
                 * previous one in xform[j] and push this one directly to the
                 * output array.
                 */
                ScanKey     outkey = &outkeys[new_numberOfKeys++];

                memcpy(outkey, cur, sizeof(ScanKeyData));
                if (numberOfEqualCols == attno - 1)
                    _bt_mark_scankey_required(outkey);
            }
        }
    }

    so->numberOfKeys = new_numberOfKeys;
}
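/*
 * Worked example (informal, not executed code): given the input quals
 *
 *      x > 4 AND x > 9 AND x < 20
 *
 * on a single ASC index column, the two ">" keys are compared via
 * _bt_compare_scankey_args and only the tighter "x > 9" is kept, so the
 * output array ends up with "x > 9" (marked SK_BT_REQBKWD) and "x < 20"
 * (marked SK_BT_REQFWD), both required because x is the first column.
 * Given instead
 *
 *      x = 5 AND x > 10
 *
 * the keys are proven contradictory and so->qual_ok is set to false.
 */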
/*
 * Compare two scankey values using a specified operator.  Both values
 * must be already known non-NULL.
 *
 * The test we want to perform is logically "leftarg op rightarg", where
 * leftarg and rightarg are the sk_argument values in those ScanKeys, and
 * the comparison operator is the one in the op ScanKey.  However, in
 * cross-data-type situations we may need to look up the correct operator in
 * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
 * and amoplefttype/amoprighttype equal to the two argument datatypes.
 *
 * If the opfamily doesn't supply a complete set of cross-type operators we
 * may not be able to make the comparison.  If we can make the comparison
 * we store the operator result in *result and return TRUE.  We return FALSE
 * if the comparison could not be made.
 *
 * Note: op always points at the same ScanKey as either leftarg or rightarg.
 * Since we don't scribble on the scankeys, this aliasing should cause no
 * trouble.
 *
 * Note: this routine needs to be insensitive to any DESC option applied
 * to the index column.  For example, "x < 4" is a tighter constraint than
 * "x < 5" regardless of which way the index is sorted.  We don't worry about
 * NULLS FIRST/LAST either, since the given values are never nulls.
 */
static bool
_bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
                         ScanKey leftarg, ScanKey rightarg,
                         bool *result)
{
    Relation    rel = scan->indexRelation;
    Oid         lefttype,
                righttype,
                optype,
                opcintype,
                cmp_op;
    StrategyNumber strat;

    /*
     * The opfamily we need to worry about is identified by the index column.
     */
    Assert(leftarg->sk_attno == rightarg->sk_attno);

    opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];

    /*
     * Determine the actual datatypes of the ScanKey arguments.  We have to
     * support the convention that sk_subtype == InvalidOid means the opclass
     * input type; this is a hack to simplify life for ScanKeyInit().
     */
    lefttype = leftarg->sk_subtype;
    if (lefttype == InvalidOid)
        lefttype = opcintype;
    righttype = rightarg->sk_subtype;
    if (righttype == InvalidOid)
        righttype = opcintype;
    optype = op->sk_subtype;
    if (optype == InvalidOid)
        optype = opcintype;

    /*
     * If leftarg and rightarg match the types expected for the "op" scankey,
     * we can use its already-looked-up comparison function.
     */
    if (lefttype == opcintype && righttype == optype)
    {
        *result = DatumGetBool(FunctionCall2(&op->sk_func,
                                             leftarg->sk_argument,
                                             rightarg->sk_argument));
        return true;
    }

    /*
     * Otherwise, we need to go to the syscache to find the appropriate
     * operator.  (This cannot result in infinite recursion, since no
     * indexscan initiated by syscache lookup will use cross-data-type
     * operators.)
     *
     * If the sk_strategy was flipped by _bt_mark_scankey_with_indoption, we
     * have to un-flip it to get the correct opfamily member.
     */
    strat = op->sk_strategy;
    if (op->sk_flags & SK_BT_DESC)
        strat = BTCommuteStrategyNumber(strat);

    cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
                                 lefttype,
                                 righttype,
                                 strat);
    if (OidIsValid(cmp_op))
    {
        RegProcedure cmp_proc = get_opcode(cmp_op);

        if (RegProcedureIsValid(cmp_proc))
        {
            *result = DatumGetBool(OidFunctionCall2(cmp_proc,
                                                    leftarg->sk_argument,
                                                    rightarg->sk_argument));
            return true;
        }
    }

    /* Can't make the comparison */
    *result = false;            /* suppress compiler warnings */
    return false;
}
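/*
 * Hedged sketch of the cross-type lookup path above, using hypothetical
 * datatypes int4 (for the scankey that owns the operator) and int8 (for the
 * other argument) and the "<" strategy:
 *
 *      Oid     cmp_op;
 *
 *      cmp_op = get_opfamily_member(rel->rd_opfamily[attno - 1],
 *                                   INT4OID, INT8OID,
 *                                   BTLessStrategyNumber);
 *      ... if OidIsValid(cmp_op), call get_opcode() and OidFunctionCall2()
 *      as in the function body ...
 *
 * If the opfamily lacks such a cross-type member, this routine returns
 * false and the caller keeps both keys rather than eliminating one.
 */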
/*
 * Mark a scankey with info from the index's indoption array.
 *
 * We copy the appropriate indoption value into the scankey sk_flags
 * (shifting to avoid clobbering system-defined flag bits).  Also, if
 * the DESC option is set, commute (flip) the operator strategy number.
 *
 * This function is applied to the *input* scankey structure; therefore
 * on a rescan we will be looking at already-processed scankeys.  Hence
 * we have to be careful not to re-commute the strategy if we already did it.
 * It's a bit ugly to modify the caller's copy of the scankey but in practice
 * there shouldn't be any problem, since the index's indoptions are certainly
 * not going to change while the scankey survives.
 */
static void
_bt_mark_scankey_with_indoption(ScanKey skey, int16 *indoption)
{
    int         addflags;

    addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
    if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
        skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
    skey->sk_flags |= addflags;

    if (skey->sk_flags & SK_ROW_HEADER)
    {
        ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);

        for (;;)
        {
            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
            if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
                subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
            subkey->sk_flags |= addflags;
            if (subkey->sk_flags & SK_ROW_END)
                break;
            subkey++;
        }
    }
}
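/*
 * Informal example of the commutation above, assuming a one-column index
 * declared DESC and a caller-supplied "x < 5" key; ltproc and indoption are
 * assumed to come from the index, not from this file:
 *
 *      ScanKeyData skey;
 *
 *      ScanKeyInit(&skey, 1, BTLessStrategyNumber, ltproc, Int32GetDatum(5));
 *      _bt_mark_scankey_with_indoption(&skey, indoption);
 *      Assert(skey.sk_strategy == BTGreaterStrategyNumber);
 *      Assert(skey.sk_flags & SK_BT_DESC);
 *
 * The flipped strategy lets the rest of the scan code treat a DESC column
 * exactly like an ASC one, while the recorded SK_BT_DESC bit lets
 * _bt_compare_scankey_args recover the original opfamily strategy.
 */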
/*
 * Mark a scankey as "required to continue the scan".
 *
 * Depending on the operator type, the key may be required for both scan
 * directions or just one.  Also, if the key is a row comparison header,
 * we have to mark the appropriate subsidiary ScanKeys as required.  In
 * such cases, the first subsidiary key is required, but subsequent ones
 * are required only as long as they correspond to successive index columns
 * and match the leading column as to sort direction.
 * Otherwise the row comparison ordering is different from the index ordering
 * and so we can't stop the scan on the basis of those lower-order columns.
 *
 * Note: when we set required-key flag bits in a subsidiary scankey, we are
 * scribbling on a data structure belonging to the index AM's caller, not on
 * our private copy.  This should be OK because the marking will not change
 * from scan to scan within a query, and so we'd just re-mark the same way
 * anyway on a rescan.  Something to keep an eye on though.
 */
static void
_bt_mark_scankey_required(ScanKey skey)
{
    int         addflags;

    switch (skey->sk_strategy)
    {
        case BTLessStrategyNumber:
        case BTLessEqualStrategyNumber:
            addflags = SK_BT_REQFWD;
            break;
        case BTEqualStrategyNumber:
            addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
            break;
        case BTGreaterEqualStrategyNumber:
        case BTGreaterStrategyNumber:
            addflags = SK_BT_REQBKWD;
            break;
        default:
            elog(ERROR, "unrecognized StrategyNumber: %d",
                 (int) skey->sk_strategy);
            addflags = 0;       /* keep compiler quiet */
            break;
    }

    skey->sk_flags |= addflags;

    if (skey->sk_flags & SK_ROW_HEADER)
    {
        ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
        AttrNumber  attno = skey->sk_attno;

        /* First subkey should be same as the header says */
        Assert(subkey->sk_attno == attno);

        for (;;)
        {
            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            if (subkey->sk_attno != attno)
                break;          /* non-adjacent key, so not required */
            if (subkey->sk_strategy != skey->sk_strategy)
                break;          /* wrong direction, so not required */
            subkey->sk_flags |= addflags;
            if (subkey->sk_flags & SK_ROW_END)
                break;
            subkey++;
            attno++;
        }
    }
}
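/*
 * Informal example of the row-comparison marking above: for a qual such as
 * (a, b, c) > (1, 2, 3) on an index over (a, b, c) with a uniform sort
 * direction, the header key and all three subsidiary keys receive
 * SK_BT_REQBKWD.  If b were instead indexed DESC, its subsidiary key's
 * strategy (after indoption processing) would no longer match the header's,
 * so only the "a" subkey would be marked; the row ordering diverges from
 * the index ordering starting at b.
 */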
/*
 * Test whether an indextuple satisfies all the scankey conditions.
 *
 * If so, copy its TID into scan->xs_ctup.t_self, and return TRUE.
 * If not, return FALSE (xs_ctup is not changed).
 *
 * If the tuple fails to pass the qual, we also determine whether there's
 * any need to continue the scan beyond this tuple, and set *continuescan
 * accordingly.  See comments for _bt_preprocess_keys(), above, about how
 * this is done.
 *
 * scan: index scan descriptor (containing a search-type scankey)
 * page: buffer page containing index tuple
 * offnum: offset number of index tuple (must be a valid item!)
 * dir: direction we are scanning in
 * continuescan: output parameter (will be set correctly in all cases)
 */
bool
_bt_checkkeys(IndexScanDesc scan,
              Page page, OffsetNumber offnum,
              ScanDirection dir, bool *continuescan)
{
    ItemId      iid = PageGetItemId(page, offnum);
    bool        tuple_valid;
    IndexTuple  tuple;
    TupleDesc   tupdesc;
    BTScanOpaque so;
    int         keysz;
    int         ikey;
    ScanKey     key;

    *continuescan = true;       /* default assumption */

    /*
     * If the scan specifies not to return killed tuples, then we treat a
     * killed tuple as not passing the qual.  Most of the time, it's a win to
     * not bother examining the tuple's index keys, but just return
     * immediately with continuescan = true to proceed to the next tuple.
     * However, if this is the last tuple on the page, we should check the
     * index keys to prevent uselessly advancing to the next page.
     */
    if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    {
        /* return immediately if there are more tuples on the page */
        if (ScanDirectionIsForward(dir))
        {
            if (offnum < PageGetMaxOffsetNumber(page))
                return false;
        }
        else
        {
            BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

            if (offnum > P_FIRSTDATAKEY(opaque))
                return false;
        }

        /*
         * OK, we want to check the keys, but we'll return FALSE even if the
         * tuple passes the key tests.
         */
        tuple_valid = false;
    }
    else
        tuple_valid = true;

    tuple = (IndexTuple) PageGetItem(page, iid);

    IncrIndexProcessed();

    tupdesc = RelationGetDescr(scan->indexRelation);
    so = (BTScanOpaque) scan->opaque;
    keysz = so->numberOfKeys;

    for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
    {
        Datum       datum;
        bool        isNull;
        Datum       test;

        /* row-comparison keys need special processing */
        if (key->sk_flags & SK_ROW_HEADER)
        {
            if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
                continue;
            return false;
        }

        datum = index_getattr(tuple,
                              key->sk_attno,
                              tupdesc,
                              &isNull);

        if (key->sk_flags & SK_ISNULL)
        {
            /* Handle IS NULL tests */
            Assert(key->sk_flags & SK_SEARCHNULL);

            if (isNull)
                continue;       /* tuple satisfies this qual */

            /*
             * Tuple fails this qual.  If it's a required qual for the current
             * scan direction, then we can conclude no further tuples will
             * pass, either.
             */
            if ((key->sk_flags & SK_BT_REQFWD) &&
                ScanDirectionIsForward(dir))
                *continuescan = false;
            else if ((key->sk_flags & SK_BT_REQBKWD) &&
                     ScanDirectionIsBackward(dir))
                *continuescan = false;

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return false;
        }

        if (isNull)
        {
            if (key->sk_flags & SK_BT_NULLS_FIRST)
            {
                /*
                 * Since NULLs are sorted before non-NULLs, we know we have
                 * reached the lower limit of the range of values for this
                 * index attr.  On a backward scan, we can stop if this qual
                 * is one of the "must match" subset.  On a forward scan,
                 * however, we should keep going.
                 */
                if ((key->sk_flags & SK_BT_REQBKWD) &&
                    ScanDirectionIsBackward(dir))
                    *continuescan = false;
            }
            else
            {
                /*
                 * Since NULLs are sorted after non-NULLs, we know we have
                 * reached the upper limit of the range of values for this
                 * index attr.  On a forward scan, we can stop if this qual is
                 * one of the "must match" subset.  On a backward scan,
                 * however, we should keep going.
                 */
                if ((key->sk_flags & SK_BT_REQFWD) &&
                    ScanDirectionIsForward(dir))
                    *continuescan = false;
            }

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return false;
        }

        test = FunctionCall2(&key->sk_func, datum, key->sk_argument);

        if (!DatumGetBool(test))
        {
            /*
             * Tuple fails this qual.  If it's a required qual for the current
             * scan direction, then we can conclude no further tuples will
             * pass, either.
             *
             * Note: because we stop the scan as soon as any required equality
             * qual fails, it is critical that equality quals be used for the
             * initial positioning in _bt_first() when they are available. See
             * comments in _bt_first().
             */
            if ((key->sk_flags & SK_BT_REQFWD) &&
                ScanDirectionIsForward(dir))
                *continuescan = false;
            else if ((key->sk_flags & SK_BT_REQBKWD) &&
                     ScanDirectionIsBackward(dir))
                *continuescan = false;

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return false;
        }
    }

    /* If we get here, the tuple passes all index quals. */
    if (tuple_valid)
        scan->xs_ctup.t_self = tuple->t_tid;

    return tuple_valid;
}
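/*
 * Hedged sketch of a caller's loop (in the spirit of the page-reading code
 * in nbtsearch.c; details assumed rather than copied): every item on the
 * page is run through _bt_checkkeys, and the page scan stops early once a
 * required key can no longer be satisfied in the scan direction:
 *
 *      for (; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
 *      {
 *          if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
 *              ... remember this matching tuple ...
 *          if (!continuescan)
 *              break;      ... no later tuple in this direction can match ...
 *      }
 */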
/*
 * Test whether an indextuple satisfies a row-comparison scan condition.
 *
 * Return true if so, false if not.  If not, also clear *continuescan if
 * it's not possible for any future tuples in the current scan direction
 * to pass the qual.
 *
 * This is a subroutine for _bt_checkkeys, which see for more info.
 */
static bool
_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
                     ScanDirection dir, bool *continuescan)
{
    ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
    int32       cmpresult = 0;
    bool        result;

    /* First subkey should be same as the header says */
    Assert(subkey->sk_attno == skey->sk_attno);

    /* Loop over columns of the row condition */
    for (;;)
    {
        Datum       datum;
        bool        isNull;

        Assert(subkey->sk_flags & SK_ROW_MEMBER);

        datum = index_getattr(tuple,
                              subkey->sk_attno,
                              tupdesc,
                              &isNull);

        if (isNull)
        {
            if (subkey->sk_flags & SK_BT_NULLS_FIRST)
            {
                /*
                 * Since NULLs are sorted before non-NULLs, we know we have
                 * reached the lower limit of the range of values for this
                 * index attr.  On a backward scan, we can stop if this qual is
                 * one of the "must match" subset.  On a forward scan,
                 * however, we should keep going.
                 */
                if ((subkey->sk_flags & SK_BT_REQBKWD) &&
                    ScanDirectionIsBackward(dir))
                    *continuescan = false;
            }
            else
            {
                /*
                 * Since NULLs are sorted after non-NULLs, we know we have
                 * reached the upper limit of the range of values for this
                 * index attr.  On a forward scan, we can stop if this qual is
                 * one of the "must match" subset.  On a backward scan,
                 * however, we should keep going.
                 */
                if ((subkey->sk_flags & SK_BT_REQFWD) &&
                    ScanDirectionIsForward(dir))
                    *continuescan = false;
            }

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return false;
        }

        if (subkey->sk_flags & SK_ISNULL)
        {
            /*
             * Unlike the simple-scankey case, this isn't a disallowed case.
             * But it can never match.  If all the earlier row comparison
             * columns are required for the scan direction, we can stop the
             * scan, because there can't be another tuple that will succeed.
             */
            if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
                subkey--;
            if ((subkey->sk_flags & SK_BT_REQFWD) &&
                ScanDirectionIsForward(dir))
                *continuescan = false;
            else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
                     ScanDirectionIsBackward(dir))
                *continuescan = false;
            return false;
        }

        /* Perform the test --- three-way comparison not bool operator */
        cmpresult = DatumGetInt32(FunctionCall2(&subkey->sk_func,
                                                datum,
                                                subkey->sk_argument));

        if (subkey->sk_flags & SK_BT_DESC)
            cmpresult = -cmpresult;

        /* Done comparing if unequal, else advance to next column */
        if (cmpresult != 0)
            break;

        if (subkey->sk_flags & SK_ROW_END)
            break;
        subkey++;
    }

    /*
     * At this point cmpresult indicates the overall result of the row
     * comparison, and subkey points to the deciding column (or the last
     * column if the result is "=").
     */
    switch (subkey->sk_strategy)
    {
            /* EQ and NE cases aren't allowed here */
        case BTLessStrategyNumber:
            result = (cmpresult < 0);
            break;
        case BTLessEqualStrategyNumber:
            result = (cmpresult <= 0);
            break;
        case BTGreaterEqualStrategyNumber:
            result = (cmpresult >= 0);
            break;
        case BTGreaterStrategyNumber:
            result = (cmpresult > 0);
            break;
        default:
            elog(ERROR, "unrecognized RowCompareType: %d",
                 (int) subkey->sk_strategy);
            result = 0;         /* keep compiler quiet */
            break;
    }

    if (!result)
    {
        /*
         * Tuple fails this qual.  If it's a required qual for the current
         * scan direction, then we can conclude no further tuples will pass,
         * either.  Note we have to look at the deciding column, not
         * necessarily the first or last column of the row condition.
         */
        if ((subkey->sk_flags & SK_BT_REQFWD) &&
            ScanDirectionIsForward(dir))
            *continuescan = false;
        else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
                 ScanDirectionIsBackward(dir))
            *continuescan = false;
    }

    return result;
}
/*
 * _bt_killitems - set LP_DEAD state for items an indexscan caller has
 * told us were killed
 *
 * scan->so contains information about the current page and killed tuples
 * thereon (generally, this should only be called if so->numKilled > 0).
 *
 * The caller must have pin on so->currPos.buf, but may or may not have
 * read-lock, as indicated by haveLock.  Note that we assume read-lock
 * is sufficient for setting LP_DEAD status (which is only a hint).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.  We cope with cases where items have moved right due to insertions.
 * If an item has moved off the current page due to a split, we'll fail to
 * find it and do nothing (this is not an error case --- we assume the item
 * will eventually get marked in a future indexscan).  Note that because we
 * hold pin on the target page continuously from initially reading the items
 * until applying this function, VACUUM cannot have deleted any items from
 * the page, and so there is no need to search left from the recorded offset.
 * (This observation also guarantees that the item is still the right one
 * to delete, which might otherwise be questionable since heap TIDs can get
 * recycled.)
 */
void
_bt_killitems(IndexScanDesc scan, bool haveLock)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber minoff;
    OffsetNumber maxoff;
    int         i;
    bool        killedsomething = false;

    Assert(BufferIsValid(so->currPos.buf));

    if (!haveLock)
        LockBuffer(so->currPos.buf, BT_READ);

    page = BufferGetPage(so->currPos.buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);

    for (i = 0; i < so->numKilled; i++)
    {
        int         itemIndex = so->killedItems[i];
        BTScanPosItem *kitem = &so->currPos.items[itemIndex];
        OffsetNumber offnum = kitem->indexOffset;

        Assert(itemIndex >= so->currPos.firstItem &&
               itemIndex <= so->currPos.lastItem);
        if (offnum < minoff)
            continue;           /* pure paranoia */
        while (offnum <= maxoff)
        {
            ItemId      iid = PageGetItemId(page, offnum);
            IndexTuple  ituple = (IndexTuple) PageGetItem(page, iid);

            if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
            {
                /* found the item */
                ItemIdMarkDead(iid);
                killedsomething = true;
                break;          /* out of inner search loop */
            }
            offnum = OffsetNumberNext(offnum);
        }
    }

    /*
     * Since this can be redone later if needed, it's treated the same as a
     * commit-hint-bit status update for heap tuples: we mark the buffer dirty
     * but don't make a WAL log entry.
     *
     * Whenever we mark anything LP_DEAD, we also set the page's
     * BTP_HAS_GARBAGE flag, which is likewise just a hint.
     */
    if (killedsomething)
    {
        opaque->btpo_flags |= BTP_HAS_GARBAGE;
        SetBufferCommitInfoNeedsSave(so->currPos.buf);
    }

    if (!haveLock)
        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /*
     * Always reset the scan state, so we don't look for same items on other
     * pages.
     */
    so->numKilled = 0;
}
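/*
 * Hedged sketch of the expected calling pattern (roughly what btgettuple
 * and the scan-position code do; details assumed): as the executor reports
 * dead tuples, the caller records their item indexes, then applies them all
 * just before leaving the page:
 *
 *      if (so->killedItems == NULL)
 *          so->killedItems = (int *)
 *              palloc(MaxIndexTuplesPerPage * sizeof(int));
 *      if (so->numKilled < MaxIndexTuplesPerPage)
 *          so->killedItems[so->numKilled++] = so->currPos.itemIndex;
 *      ...
 *      if (so->numKilled > 0)
 *          _bt_killitems(scan, false);
 */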
/*
 * The following routines manage a shared-memory area in which we track
 * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
 * operations.  There is a single counter which increments each time we
 * start a vacuum to assign it a cycle ID.  Since multiple vacuums could
 * be active concurrently, we have to track the cycle ID for each active
 * vacuum; this requires at most MaxBackends entries (usually far fewer).
 * We assume at most one vacuum can be active for a given index.
 *
 * Access to the shared memory area is controlled by BtreeVacuumLock.
 * In principle we could use a separate lmgr locktag for each index,
 * but a single LWLock is much cheaper, and given the short time that
 * the lock is ever held, the concurrency hit should be minimal.
 */
typedef struct BTOneVacInfo
{
    LockRelId   relid;          /* global identifier of an index */
    BTCycleId   cycleid;        /* cycle ID for its active VACUUM */
} BTOneVacInfo;

typedef struct BTVacInfo
{
    BTCycleId   cycle_ctr;      /* cycle ID most recently assigned */
    int         num_vacuums;    /* number of currently active VACUUMs */
    int         max_vacuums;    /* allocated length of vacuums[] array */
    BTOneVacInfo vacuums[1];    /* VARIABLE LENGTH ARRAY */
} BTVacInfo;

static BTVacInfo *btvacinfo;
/*
 * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
 *      or zero if there is no active VACUUM
 *
 * Note: for correct interlocking, the caller must already hold pin and
 * exclusive lock on each buffer it will store the cycle ID into.  This
 * ensures that even if a VACUUM starts immediately afterwards, it cannot
 * process those pages until the page split is complete.
 */
BTCycleId
_bt_vacuum_cycleid(Relation rel)
{
    BTCycleId   result = 0;
    int         i;

    /* Share lock is enough since this is a read-only operation */
    LWLockAcquire(BtreeVacuumLock, LW_SHARED);

    for (i = 0; i < btvacinfo->num_vacuums; i++)
    {
        BTOneVacInfo *vac = &btvacinfo->vacuums[i];

        if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            result = vac->cycleid;
            break;
        }
    }

    LWLockRelease(BtreeVacuumLock);
    return result;
}
/*
 * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
 *
 * Note: the caller must guarantee that it will eventually call
 * _bt_end_vacuum, else we'll permanently leak an array slot.  To ensure
 * that this happens even in elog(FATAL) scenarios, the appropriate coding
 * is not just a PG_TRY, but
 *      PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
 */
BTCycleId
_bt_start_vacuum(Relation rel)
{
    BTCycleId   result;
    int         i;
    BTOneVacInfo *vac;

    LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);

    /*
     * Assign the next cycle ID, being careful to avoid zero as well as the
     * reserved high values.
     */
    result = ++(btvacinfo->cycle_ctr);
    if (result == 0 || result > MAX_BT_CYCLE_ID)
        result = btvacinfo->cycle_ctr = 1;

    /* Let's just make sure there's no entry already for this index */
    for (i = 0; i < btvacinfo->num_vacuums; i++)
    {
        vac = &btvacinfo->vacuums[i];
        if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            /*
             * Unlike most places in the backend, we have to explicitly
             * release our LWLock before throwing an error.  This is because
             * we expect _bt_end_vacuum() to be called before transaction
             * abort cleanup can run to release LWLocks.
             */
            LWLockRelease(BtreeVacuumLock);
            elog(ERROR, "multiple active vacuums for index \"%s\"",
                 RelationGetRelationName(rel));
        }
    }

    /* OK, add an entry */
    if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
    {
        LWLockRelease(BtreeVacuumLock);
        elog(ERROR, "out of btvacinfo slots");
    }
    vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
    vac->relid = rel->rd_lockInfo.lockRelId;
    vac->cycleid = result;
    btvacinfo->num_vacuums++;

    LWLockRelease(BtreeVacuumLock);
    return result;
}
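/*
 * Hedged usage sketch (in the style of btbulkdelete; the exact surrounding
 * code is assumed): the ENSURE block guarantees the shared-memory slot is
 * released even if the vacuum errors out:
 *
 *      PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 *      {
 *          cycleid = _bt_start_vacuum(rel);
 *          ... run the vacuum scan, stamping cycleid into pages being split ...
 *      }
 *      PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 *      _bt_end_vacuum(rel);
 */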
/*
 * _bt_end_vacuum --- mark a btree VACUUM operation as done
 *
 * Note: this is deliberately coded not to complain if no entry is found;
 * this allows the caller to put PG_TRY around the start_vacuum operation.
 */
void
_bt_end_vacuum(Relation rel)
{
    int         i;

    LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);

    /* Find the array entry */
    for (i = 0; i < btvacinfo->num_vacuums; i++)
    {
        BTOneVacInfo *vac = &btvacinfo->vacuums[i];

        if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            /* Remove it by shifting down the last entry */
            *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
            btvacinfo->num_vacuums--;
            break;
        }
    }

    LWLockRelease(BtreeVacuumLock);
}
/*
 * _bt_end_vacuum wrapped as an on_shmem_exit callback function
 */
void
_bt_end_vacuum_callback(int code, Datum arg)
{
    _bt_end_vacuum((Relation) DatumGetPointer(arg));
}
/*
 * BTreeShmemSize --- report amount of shared memory space needed
 */
Size
BTreeShmemSize(void)
{
    Size        size;

    size = offsetof(BTVacInfo, vacuums[0]);
    size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
    return size;
}
/*
 * BTreeShmemInit --- initialize this module's shared memory
 */
void
BTreeShmemInit(void)
{
    bool        found;

    btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
                                              BTreeShmemSize(),
                                              &found);

    if (!IsUnderPostmaster)
    {
        /* Initialize shared memory area */
        Assert(!found);

        /*
         * It doesn't really matter what the cycle counter starts at, but
         * having it always start the same doesn't seem good.  Seed with
         * low-order bits of time() instead.
         */
        btvacinfo->cycle_ctr = (BTCycleId) time(NULL);

        btvacinfo->num_vacuums = 0;
        btvacinfo->max_vacuums = MaxBackends;
    }
    else
        Assert(found);
}
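/*
 * Hedged sketch of how these hooks are wired up at postmaster startup (in
 * the style of ipci.c; the exact call site is assumed): the size estimate
 * feeds the shared-memory sizing pass, and the init function runs once the
 * segment exists:
 *
 *      size = add_size(size, BTreeShmemSize());
 *      ...
 *      BTreeShmemInit();
 */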
Datum
btoptions(PG_FUNCTION_ARGS)
{
    Datum       reloptions = PG_GETARG_DATUM(0);
    bool        validate = PG_GETARG_BOOL(1);
    bytea      *result;

    result = default_reloptions(reloptions, validate, RELOPT_KIND_BTREE);

    if (result)
        PG_RETURN_BYTEA_P(result);
    PG_RETURN_NULL();
}