/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *    Implementation of Lehman and Yao's btree management algorithm for
 *    Postgres.
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"

/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
 * scan to advance it via another call to _bt_first.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 */
typedef enum
{
    BTPARALLEL_NOT_INITIALIZED,
    BTPARALLEL_NEED_PRIMSCAN,
    BTPARALLEL_ADVANCING,
    BTPARALLEL_IDLE,
    BTPARALLEL_DONE,
} BTPS_State;
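
/*
 * Informal sketch of the btps_pageStatus lifecycle, as driven by the
 * functions further down in this file: a scan starts out
 * BTPARALLEL_NOT_INITIALIZED; the first backend to call _bt_parallel_seize()
 * moves it to BTPARALLEL_ADVANCING and reads a page; _bt_parallel_release()
 * then moves it to BTPARALLEL_IDLE so another backend can advance the scan.
 * Scans with array keys may pass through BTPARALLEL_NEED_PRIMSCAN (set by
 * _bt_parallel_primscan_schedule) before a backend calling from _bt_first
 * starts the next primitive index scan.  _bt_parallel_done() moves the scan
 * to BTPARALLEL_DONE once no pages remain.
 */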

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
    BlockNumber btps_nextScanPage;  /* next page to be scanned */
    BlockNumber btps_lastCurrPage;  /* page whose sibling link was copied into
                                     * btps_nextScanPage */
    BTPS_State  btps_pageStatus;    /* indicates whether next page is
                                     * available for scan. see above for
                                     * possible states of parallel scan. */
    slock_t     btps_mutex;         /* protects above variables, btps_arrElems */
    ConditionVariable btps_cv;      /* used to synchronize parallel scan */

    /*
     * btps_arrElems is used when scans need to schedule another primitive
     * index scan.  Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
     */
    int         btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;
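
/*
 * Note: a BTParallelScanDescData lives in the shared ParallelIndexScanDesc
 * area; backends locate it with OffsetToPointer(parallel_scan,
 * parallel_scan->ps_offset), as the parallel-scan routines below do.  Its
 * trailing btps_arrElems[] array is sized by btestimateparallelscan().
 */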

static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
                         IndexBulkDeleteCallback callback, void *callback_state,
                         BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
                                          IndexTuple posting,
                                          OffsetNumber updatedoffset,
                                          int *nremaining);

/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = BTMaxStrategyNumber;
    amroutine->amsupport = BTNProcs;
    amroutine->amoptsprocnum = BTOPTIONS_PROC;
    amroutine->amcanorder = true;
    amroutine->amcanorderbyop = false;
    amroutine->amcanbackward = true;
    amroutine->amcanunique = true;
    amroutine->amcanmulticol = true;
    amroutine->amoptionalkey = true;
    amroutine->amsearcharray = true;
    amroutine->amsearchnulls = true;
    amroutine->amstorage = false;
    amroutine->amclusterable = true;
    amroutine->ampredlocks = true;
    amroutine->amcanparallel = true;
    amroutine->amcanbuildparallel = true;
    amroutine->amcaninclude = true;
    amroutine->amusemaintenanceworkmem = false;
    amroutine->amsummarizing = false;
    amroutine->amparallelvacuumoptions =
        VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
    amroutine->amkeytype = InvalidOid;

    amroutine->ambuild = btbuild;
    amroutine->ambuildempty = btbuildempty;
    amroutine->aminsert = btinsert;
    amroutine->aminsertcleanup = NULL;
    amroutine->ambulkdelete = btbulkdelete;
    amroutine->amvacuumcleanup = btvacuumcleanup;
    amroutine->amcanreturn = btcanreturn;
    amroutine->amcostestimate = btcostestimate;
    amroutine->amgettreeheight = btgettreeheight;
    amroutine->amoptions = btoptions;
    amroutine->amproperty = btproperty;
    amroutine->ambuildphasename = btbuildphasename;
    amroutine->amvalidate = btvalidate;
    amroutine->amadjustmembers = btadjustmembers;
    amroutine->ambeginscan = btbeginscan;
    amroutine->amrescan = btrescan;
    amroutine->amgettuple = btgettuple;
    amroutine->amgetbitmap = btgetbitmap;
    amroutine->amendscan = btendscan;
    amroutine->ammarkpos = btmarkpos;
    amroutine->amrestrpos = btrestrpos;
    amroutine->amestimateparallelscan = btestimateparallelscan;
    amroutine->aminitparallelscan = btinitparallelscan;
    amroutine->amparallelrescan = btparallelrescan;

    PG_RETURN_POINTER(amroutine);
}

/*
 *  btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
    bool        allequalimage = _bt_allequalimage(index, false);
    BulkWriteState *bulkstate;
    BulkWriteBuffer metabuf;

    bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);

    /* Construct metapage. */
    metabuf = smgr_bulk_get_buf(bulkstate);
    _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
    smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);

    smgr_bulk_finish(bulkstate);
}

/*
 *  btinsert() -- insert an index tuple into a btree.
 *
 *      Descend the tree recursively, find the appropriate location for our
 *      new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
         ItemPointer ht_ctid, Relation heapRel,
         IndexUniqueCheck checkUnique,
         bool indexUnchanged,
         IndexInfo *indexInfo)
{
    bool        result;
    IndexTuple  itup;

    /* generate an index tuple */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

    pfree(itup);

    return result;
}

/*
 *  btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        res;

    /* btree indexes are never lossy */
    scan->xs_recheck = false;

    /* Each loop iteration performs another primitive index scan */
    do
    {
        /*
         * If we've already initialized this scan, we can just advance it in
         * the appropriate direction.  If we haven't done so yet, we call
         * _bt_first() to get the first item in the scan.
         */
        if (!BTScanPosIsValid(so->currPos))
            res = _bt_first(scan, dir);
        else
        {
            /*
             * Check to see if we should kill the previously-fetched tuple.
             */
            if (scan->kill_prior_tuple)
            {
                /*
                 * Yes, remember it for later.  (We'll deal with all such
                 * tuples at once right before leaving the index page.)  The
                 * test for numKilled overrun is not just paranoia: if the
                 * caller reverses direction in the indexscan then the same
                 * item might get entered multiple times.  It's not worth
                 * trying to optimize that, so we don't detect it, but instead
                 * just forget any excess entries.
                 */
                if (so->killedItems == NULL)
                    so->killedItems = (int *)
                        palloc(MaxTIDsPerBTreePage * sizeof(int));
                if (so->numKilled < MaxTIDsPerBTreePage)
                    so->killedItems[so->numKilled++] = so->currPos.itemIndex;
            }

            /*
             * Now continue the scan.
             */
            res = _bt_next(scan, dir);
        }

        /* If we have a tuple, return it ... */
        if (res)
            break;
        /* ... otherwise see if we need another primitive index scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));

    return res;
}

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int64       ntids = 0;
    ItemPointer heapTid;

    /* Each loop iteration performs another primitive index scan */
    do
    {
        /* Fetch the first page & tuple */
        if (_bt_first(scan, ForwardScanDirection))
        {
            /* Save tuple ID, and continue scanning */
            heapTid = &scan->xs_heaptid;
            tbm_add_tuples(tbm, heapTid, 1, false);
            ntids++;

            for (;;)
            {
                /*
                 * Advance to next tuple within page.  This is the same as the
                 * easy case in _bt_next().
                 */
                if (++so->currPos.itemIndex > so->currPos.lastItem)
                {
                    /* let _bt_next do the heavy lifting */
                    if (!_bt_next(scan, ForwardScanDirection))
                        break;
                }

                /* Save tuple ID, and continue scanning */
                heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
                tbm_add_tuples(tbm, heapTid, 1, false);
                ntids++;
            }
        }
        /* Now see if we need another primitive index scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));

    return ntids;
}

/*
 *  btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
    IndexScanDesc scan;
    BTScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    /* get the scan */
    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    /* allocate private workspace */
    so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
    BTScanPosInvalidate(so->currPos);
    BTScanPosInvalidate(so->markPos);
    if (scan->numberOfKeys > 0)
        so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
    else
        so->keyData = NULL;

    so->needPrimScan = false;
    so->scanBehind = false;
    so->oppositeDirCheck = false;
    so->arrayKeys = NULL;
    so->orderProcs = NULL;
    so->arrayContext = NULL;

    so->killedItems = NULL;     /* until needed */
    so->numKilled = 0;

    /*
     * We don't know yet whether the scan will be index-only, so we do not
     * allocate the tuple workspace arrays until btrescan.  However, we set up
     * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     */
    so->currTuples = so->markTuples = NULL;

    scan->xs_itupdesc = RelationGetDescr(rel);

    scan->opaque = so;

    return scan;
}

/*
 *  btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
         ScanKey orderbys, int norderbys)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan);
        BTScanPosUnpinIfPinned(so->currPos);
        BTScanPosInvalidate(so->currPos);
    }

    so->markItemIndex = -1;
    so->needPrimScan = false;
    so->scanBehind = false;
    so->oppositeDirCheck = false;
    BTScanPosUnpinIfPinned(so->markPos);
    BTScanPosInvalidate(so->markPos);

    /*
     * Allocate tuple workspace arrays, if needed for an index-only scan and
     * not already done in a previous rescan call.  To save on palloc
     * overhead, both workspaces are allocated as one palloc block; only this
     * function and btendscan know that.
     *
     * NOTE: this data structure also makes it safe to return data from a
     * "name" column, even though btree name_ops uses an underlying storage
     * datatype of cstring.  The risk there is that "name" is supposed to be
     * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
     * However, since we only return data out of tuples sitting in the
     * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
     * data out of the markTuples array --- running off the end of memory for
     * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
     * adding special-case treatment for name_ops elsewhere.
     */
    if (scan->xs_want_itup && so->currTuples == NULL)
    {
        so->currTuples = (char *) palloc(BLCKSZ * 2);
        so->markTuples = so->currTuples + BLCKSZ;
    }

    /*
     * Reset the scan keys
     */
    if (scankey && scan->numberOfKeys > 0)
        memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
    so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */
    so->numArrayKeys = 0;       /* ditto */
}

/*
 *  btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan);
        BTScanPosUnpinIfPinned(so->currPos);
    }

    so->markItemIndex = -1;
    BTScanPosUnpinIfPinned(so->markPos);

    /* No need to invalidate positions, the RAM is about to be freed. */

    /* Release storage */
    if (so->keyData != NULL)
        pfree(so->keyData);
    /* so->arrayKeys and so->orderProcs are in arrayContext */
    if (so->arrayContext != NULL)
        MemoryContextDelete(so->arrayContext);
    if (so->killedItems != NULL)
        pfree(so->killedItems);
    if (so->currTuples != NULL)
        pfree(so->currTuples);
    /* so->markTuples should not be pfree'd, see btrescan */
    pfree(so);
}

/*
 *  btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* There may be an old mark with a pin (but no lock). */
    BTScanPosUnpinIfPinned(so->markPos);

    /*
     * Just record the current itemIndex.  If we later step to next page
     * before releasing the marked position, _bt_steppage makes a full copy of
     * the currPos struct in markPos.  If (as often happens) the mark is moved
     * before we leave the page, we don't have to do that work.
     */
    if (BTScanPosIsValid(so->currPos))
        so->markItemIndex = so->currPos.itemIndex;
    else
    {
        BTScanPosInvalidate(so->markPos);
        so->markItemIndex = -1;
    }
}

/*
 *  btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    if (so->markItemIndex >= 0)
    {
        /*
         * The scan has never moved to a new page since the last mark.  Just
         * restore the itemIndex.
         *
         * NB: In this case we can't count on anything in so->markPos to be
         * accurate.
         */
        so->currPos.itemIndex = so->markItemIndex;
    }
    else
    {
        /*
         * The scan moved to a new page after last mark or restore, and we are
         * now restoring to the marked page.  We aren't holding any read
         * locks, but if we're still holding the pin for the current position,
         * we must drop it.
         */
        if (BTScanPosIsValid(so->currPos))
        {
            /* Before leaving current page, deal with any killed items */
            if (so->numKilled > 0)
                _bt_killitems(scan);
            BTScanPosUnpinIfPinned(so->currPos);
        }

        if (BTScanPosIsValid(so->markPos))
        {
            /* bump pin on mark buffer for assignment to current buffer */
            if (BTScanPosIsPinned(so->markPos))
                IncrBufferRefCount(so->markPos.buf);
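            /*
             * Restore the marked position into currPos.  The memcpy below
             * copies only the fixed-size part of BTScanPosData plus the
             * items[] entries actually in use: lastItem is a zero-based
             * index, which is why the offsetof() starts at items[1].
             */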
            memcpy(&so->currPos, &so->markPos,
                   offsetof(BTScanPosData, items[1]) +
                   so->markPos.lastItem * sizeof(BTScanPosItem));
            if (so->currTuples)
                memcpy(so->currTuples, so->markTuples,
                       so->markPos.nextTupleOffset);
            /* Reset the scan's array keys (see _bt_steppage for why) */
            if (so->numArrayKeys)
            {
                _bt_start_array_keys(scan, so->currPos.dir);
                so->needPrimScan = false;
            }
        }
        else
            BTScanPosInvalidate(so->currPos);
    }
}

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(int nkeys, int norderbys)
{
    /* Pessimistically assume all input scankeys will be output with arrays */
    return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
}
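
/*
 * The estimate above reserves one btps_arrElems[] slot per input scan key,
 * which is enough for _bt_parallel_primscan_schedule() to serialize a
 * cur_elem value for every array scan key the scan could end up with (there
 * can be at most one array per scan key).
 */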

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
    BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

    SpinLockInit(&bt_target->btps_mutex);
    bt_target->btps_nextScanPage = InvalidBlockNumber;
    bt_target->btps_lastCurrPage = InvalidBlockNumber;
    bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
    ConditionVariableInit(&bt_target->btps_cv);
}

/*
 *  btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
    BTParallelScanDesc btscan;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

    Assert(parallel_scan);

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    /*
     * In theory, we don't need to acquire the spinlock here, because there
     * shouldn't be any other workers running at this point, but we do so for
     * consistency.
     */
    SpinLockAcquire(&btscan->btps_mutex);
    btscan->btps_nextScanPage = InvalidBlockNumber;
    btscan->btps_lastCurrPage = InvalidBlockNumber;
    btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
    SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *      page.  Other scans must wait until we call _bt_parallel_release()
 *      or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs when no pages remain, or when
 * another primitive index scan is scheduled that caller's backend cannot
 * start just yet (only backends that call from _bt_first are capable of
 * starting primitive index scans, which they indicate by passing first=true).
 *
 * If the return value is true, *next_scan_page returns the next page of the
 * scan, and *last_curr_page returns the page that *next_scan_page came from.
 * An invalid *next_scan_page means the scan hasn't yet started, or that
 * caller needs to start the next primitive index scan (if it's the latter
 * case we'll set so.needPrimScan).
 *
 * Callers should ignore the value of *next_scan_page and *last_curr_page if
 * the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
                   BlockNumber *last_curr_page, bool first)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        exit_loop = false,
                status = true,
                endscan = false;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    *next_scan_page = InvalidBlockNumber;
    *last_curr_page = InvalidBlockNumber;

    /*
     * Reset so->currPos, and initialize moreLeft/moreRight such that the next
     * call to _bt_readnextpage treats this backend similarly to a serial
     * backend that steps from *last_curr_page to *next_scan_page (unless this
     * backend's so->currPos is initialized by _bt_readfirstpage before then).
     */
    BTScanPosInvalidate(so->currPos);
    so->currPos.moreLeft = so->currPos.moreRight = true;

    if (first)
    {
        /*
         * Initialize array related state when called from _bt_first, assuming
         * that this will be the first primitive index scan for the scan
         */
        so->needPrimScan = false;
        so->scanBehind = false;
        so->oppositeDirCheck = false;
    }
    else
    {
        /*
         * Don't attempt to seize the scan when it requires another primitive
         * index scan, since caller's backend cannot start it right now
         */
        if (so->needPrimScan)
            return false;
    }

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    while (1)
    {
        SpinLockAcquire(&btscan->btps_mutex);

        if (btscan->btps_pageStatus == BTPARALLEL_DONE)
        {
            /* We're done with this parallel index scan */
            status = false;
        }
        else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
                 btscan->btps_nextScanPage == P_NONE)
        {
            /* End this parallel index scan */
            status = false;
            endscan = true;
        }
        else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
        {
            Assert(so->numArrayKeys);

            if (first)
            {
                /* Can start scheduled primitive scan right away, so do so */
                btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
                for (int i = 0; i < so->numArrayKeys; i++)
                {
                    BTArrayKeyInfo *array = &so->arrayKeys[i];
                    ScanKey     skey = &so->keyData[array->scan_key];

                    array->cur_elem = btscan->btps_arrElems[i];
                    skey->sk_argument = array->elem_values[array->cur_elem];
                }
                exit_loop = true;
            }
            else
            {
                /*
                 * Don't attempt to seize the scan when it requires another
                 * primitive index scan, since caller's backend cannot start
                 * it right now
                 */
                status = false;
            }

            /*
             * Either way, update backend local state to indicate that a
             * pending primitive scan is required
             */
            so->needPrimScan = true;
            so->scanBehind = false;
            so->oppositeDirCheck = false;
        }
        else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
        {
            /*
             * We have successfully seized control of the scan for the purpose
             * of advancing it to a new page!
             */
            btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
            Assert(btscan->btps_nextScanPage != P_NONE);
            *next_scan_page = btscan->btps_nextScanPage;
            *last_curr_page = btscan->btps_lastCurrPage;
            exit_loop = true;
        }
        SpinLockRelease(&btscan->btps_mutex);
        if (exit_loop || !status)
            break;
        ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
    }
    ConditionVariableCancelSleep();

    /* When the scan has reached the rightmost (or leftmost) page, end it */
    if (endscan)
        _bt_parallel_done(scan);

    return status;
}
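
/*
 * Rough sketch of the protocol expected of callers (currently _bt_first and
 * _bt_readnextpage in nbtsearch.c): seize the scan with _bt_parallel_seize(),
 * read the page returned in *next_scan_page, then promptly hand the scan off
 * again with _bt_parallel_release() (passing that page's sibling link), or
 * end it with _bt_parallel_done() if there is nothing left to read.
 */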

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *      new page.  We now have the new value btps_nextScanPage; another backend
 *      can now begin advancing the scan.
 *
 * Callers whose scan uses array keys must save their curr_page argument so
 * that it can be passed to _bt_parallel_primscan_schedule, should caller
 * determine that another primitive index scan is required.
 *
 * If caller's next_scan_page is P_NONE, the scan has reached the index's
 * rightmost/leftmost page.  This is treated as reaching the end of the scan
 * within _bt_parallel_seize.
 *
 * Note: unlike the serial case, parallel scans don't need to remember both
 * sibling links.  next_scan_page is whichever link is next given the scan's
 * direction.  That's all we'll ever need, since the direction of a parallel
 * scan can never change.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
                     BlockNumber curr_page)
{
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    Assert(BlockNumberIsValid(next_scan_page));

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    SpinLockAcquire(&btscan->btps_mutex);
    btscan->btps_nextScanPage = next_scan_page;
    btscan->btps_lastCurrPage = curr_page;
    btscan->btps_pageStatus = BTPARALLEL_IDLE;
    SpinLockRelease(&btscan->btps_mutex);
    ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;
    bool        status_changed = false;

    Assert(!BTScanPosIsValid(so->currPos));

    /* Do nothing, for non-parallel scans */
    if (parallel_scan == NULL)
        return;

    /*
     * Should not mark parallel scan done when there's still a pending
     * primitive index scan
     */
    if (so->needPrimScan)
        return;

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    /*
     * Mark the parallel scan as done, unless some other process did so
     * already
     */
    SpinLockAcquire(&btscan->btps_mutex);
    Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
    if (btscan->btps_pageStatus != BTPARALLEL_DONE)
    {
        btscan->btps_pageStatus = BTPARALLEL_DONE;
        status_changed = true;
    }
    SpinLockRelease(&btscan->btps_mutex);

    /* wake up all the workers associated with this parallel scan */
    if (status_changed)
        ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
 *
 * Caller passes the curr_page most recently passed to _bt_parallel_release
 * by its backend.  Caller successfully schedules the next primitive index scan
 * if the shared parallel state hasn't been seized since caller's backend last
 * advanced the scan.
 */
void
_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    Assert(so->numArrayKeys);

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    SpinLockAcquire(&btscan->btps_mutex);
    if (btscan->btps_lastCurrPage == curr_page &&
        btscan->btps_pageStatus == BTPARALLEL_IDLE)
    {
        btscan->btps_nextScanPage = InvalidBlockNumber;
        btscan->btps_lastCurrPage = InvalidBlockNumber;
        btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;

        /* Serialize scan's current array keys */
        for (int i = 0; i < so->numArrayKeys; i++)
        {
            BTArrayKeyInfo *array = &so->arrayKeys[i];

            btscan->btps_arrElems[i] = array->cur_elem;
        }
    }
    SpinLockRelease(&btscan->btps_mutex);
}
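
/*
 * Note that the scheduling attempt above is deliberately a no-op when
 * btps_lastCurrPage no longer matches caller's curr_page (or the scan isn't
 * idle): some other backend has already advanced the scan past the point
 * where caller noticed the need for another primitive scan, so caller's
 * request is stale and is simply dropped.
 */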

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation    rel = info->index;
    BTCycleId   cycleid;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

    /* Establish the vacuum cycle ID to use for this scan */
    /* The ENSURE stuff ensures we clean up shared memory on failure */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);

    return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    BlockNumber num_delpages;

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        return stats;

    /*
     * If btbulkdelete was called, we need not do anything (we just maintain
     * the information used within _bt_vacuum_needs_cleanup() by calling
     * _bt_set_cleanup_info() below).
     *
     * If btbulkdelete was _not_ called, then we have a choice to make: we
     * must decide whether or not a btvacuumscan() call is needed now (i.e.
     * whether the ongoing VACUUM operation can entirely avoid a physical scan
     * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
     * now.
     */
    if (stats == NULL)
    {
        /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
        if (!_bt_vacuum_needs_cleanup(info->index))
            return NULL;

        /*
         * Since we aren't going to actually delete any leaf items, there's no
         * need to go through all the vacuum-cycle-ID pushups here.
         *
         * Posting list tuples are a source of inaccuracy for cleanup-only
         * scans.  btvacuumscan() will assume that the number of index tuples
         * from each page can be used as num_index_tuples, even though
         * num_index_tuples is supposed to represent the number of TIDs in the
         * index.  This naive approach can underestimate the number of tuples
         * in the index significantly.
         *
         * We handle the problem by making num_index_tuples an estimate in
         * cleanup-only case.
         */
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        btvacuumscan(info, stats, NULL, NULL, 0);
        stats->estimated_count = true;
    }

    /*
     * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
     *
     * num_delpages is the number of deleted pages now in the index that were
     * not safe to place in the FSM to be recycled just yet.  num_delpages is
     * greater than 0 only when _bt_pagedel() actually deleted pages during
     * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
     * have failed to place any newly deleted pages in the FSM just moments
     * ago.  (Actually, there are edge cases where recycling of the current
     * VACUUM's newly deleted pages does not even become safe by the time the
     * next VACUUM comes around.  See nbtree/README.)
     */
    Assert(stats->pages_deleted >= stats->pages_free);
    num_delpages = stats->pages_deleted - stats->pages_free;
    _bt_set_cleanup_info(info->index, num_delpages);

    /*
     * It's quite possible for us to be fooled by concurrent page splits into
     * double-counting some index tuples, so disbelieve any total that exceeds
     * the underlying heap's count ... if we know that accurately.  Otherwise
     * this might just make matters worse.
     */
    if (!info->estimated_count)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    return stats;
}

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state,
             BTCycleId cycleid)
{
    Relation    rel = info->index;
    BTVacState  vstate;
    BlockNumber num_pages;
    BlockNumber scanblkno;
    bool        needLock;

    /*
     * Reset fields that track information about the entire index now.  This
     * avoids double-counting in the case where a single VACUUM command
     * requires multiple scans of the index.
     *
     * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
     * since they track information about the VACUUM command, and so must last
     * across each call to btvacuumscan().
     *
     * (Note that pages_free is treated as state about the whole index, not
     * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
     * calls are idempotent, and get repeated for the same deleted pages in
     * some scenarios.  The point for us is to track the number of recyclable
     * pages in the index at the end of the VACUUM command.)
     */
    stats->num_pages = 0;
    stats->num_index_tuples = 0;
    stats->pages_deleted = 0;
    stats->pages_free = 0;

    /* Set up info to pass down to btvacuumpage */
    vstate.info = info;
    vstate.stats = stats;
    vstate.callback = callback;
    vstate.callback_state = callback_state;
    vstate.cycleid = cycleid;

    /* Create a temporary memory context to run _bt_pagedel in */
    vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
                                                  "_bt_pagedel",
                                                  ALLOCSET_DEFAULT_SIZES);

    /* Initialize vstate fields used by _bt_pendingfsm_finalize */
    vstate.bufsize = 0;
    vstate.maxbufsize = 0;
    vstate.pendingpages = NULL;
    vstate.npendingpages = 0;
    /* Consider applying _bt_pendingfsm_finalize optimization */
    _bt_pendingfsm_init(rel, &vstate, (callback == NULL));

    /*
     * The outer loop iterates over all index pages except the metapage, in
     * physical order (we hope the kernel will cooperate in providing
     * read-ahead for speed).  It is critical that we visit all leaf pages,
     * including ones added after we start the scan, else we might fail to
     * delete some deletable tuples.  Hence, we must repeatedly check the
     * relation length.  We must acquire the relation-extension lock while
     * doing so to avoid a race condition: if someone else is extending the
     * relation, there is a window where bufmgr/smgr have created a new
     * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
     * we manage to scan such a page here, we'll improperly assume it can be
     * recycled.  Taking the lock synchronizes things enough to prevent a
     * problem: either num_pages won't include the new page, or _bt_getbuf
     * already has write lock on the buffer and it will be fully initialized
     * before we can examine it.  Also, we need not worry if a page is added
     * immediately after we look; the page splitting code already has
     * write-lock on the left page before it adds a right page, so we must
     * already have processed any tuples due to be moved into such a page.
     *
     * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
     * think the use of the extension lock is still required.
     *
     * We can skip locking for new or temp relations, however, since no one
     * else could be accessing them.
     */
    needLock = !RELATION_IS_LOCAL(rel);

    scanblkno = BTREE_METAPAGE + 1;
    for (;;)
    {
        /* Get the current relation length */
        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);
        num_pages = RelationGetNumberOfBlocks(rel);
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        if (info->report_progress)
            pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
                                         num_pages);

        /* Quit if we've scanned the whole relation */
        if (scanblkno >= num_pages)
            break;
        /* Iterate over pages, then loop back to recheck length */
        for (; scanblkno < num_pages; scanblkno++)
        {
            btvacuumpage(&vstate, scanblkno);
            if (info->report_progress)
                pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
                                             scanblkno);
        }
    }

    /* Set statistics num_pages field to final size of index */
    stats->num_pages = num_pages;

    MemoryContextDelete(vstate.pagedelcontext);

    /*
     * If there were any calls to _bt_pagedel() during scan of the index then
     * see if any of the resulting pages can be placed in the FSM now.  When
     * it's not safe we'll have to leave it up to a future VACUUM operation.
     *
     * Finally, if we placed any pages in the FSM (either just now or during
     * the scan), forcibly update the upper-level FSM pages to ensure that
     * searchers can find them.
     */
    _bt_pendingfsm_finalize(rel, &vstate);
    if (stats->pages_free > 0)
        IndexFreeSpaceMapVacuum(rel);
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we must
 * backtrack to re-examine and VACUUM pages that were the scanblkno during
 * a previous call here.  This is how we handle page splits (that happened
 * after our cycleid was acquired) whose right half page happened to reuse
 * a block that we might have processed at some point before it was
 * recycled (i.e. before the page split).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
{
    IndexVacuumInfo *info = vstate->info;
    IndexBulkDeleteResult *stats = vstate->stats;
    IndexBulkDeleteCallback callback = vstate->callback;
    void       *callback_state = vstate->callback_state;
    Relation    rel = info->index;
    Relation    heaprel = info->heaprel;
    bool        attempt_pagedel;
    BlockNumber blkno,
                backtrack_to;
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;

    blkno = scanblkno;

backtrack:

    attempt_pagedel = false;
    backtrack_to = P_NONE;

    /* call vacuum_delay_point while not holding any buffer lock */
    vacuum_delay_point();

    /*
     * We can't use _bt_getbuf() here because it always applies
     * _bt_checkpage(), which will barf on an all-zero page.  We want to
     * recycle all-zero pages, not fail.  Also, we want to use a nondefault
     * buffer access strategy.
     */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                             info->strategy);
    _bt_lockbuf(rel, buf, BT_READ);
    page = BufferGetPage(buf);
    opaque = NULL;
    if (!PageIsNew(page))
    {
        _bt_checkpage(rel, buf);
        opaque = BTPageGetOpaque(page);
    }

    Assert(blkno <= scanblkno);
    if (blkno != scanblkno)
    {
        /*
         * We're backtracking.
         *
         * We followed a right link to a sibling leaf page (a page that
         * happens to be from a block located before scanblkno).  The only
         * case we want to do anything with is a live leaf page having the
         * current vacuum cycle ID.
         *
         * The page had better be in a state that's consistent with what we
         * expect.  Check for conditions that imply corruption in passing.  It
         * can't be half-dead because only an interrupted VACUUM process can
         * leave pages in that state, so we'd definitely have dealt with it
         * back when the page was the scanblkno page (half-dead pages are
         * always marked fully deleted by _bt_pagedel(), barring corruption).
         */
        if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
        {
            Assert(false);
            ereport(LOG,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
                                     blkno, scanblkno, RelationGetRelationName(rel))));
            _bt_relbuf(rel, buf);
            return;
        }

        /*
         * We may have already processed the page in an earlier call, when the
         * page was scanblkno.  This happens when the leaf page split occurred
         * after the scan began, but before the right sibling page became the
         * scanblkno.
         *
         * Page may also have been deleted by current btvacuumpage() call,
         * since _bt_pagedel() sometimes deletes the right sibling page of
         * scanblkno in passing (it does so after we decided where to
         * backtrack to).  We don't need to process this page as a deleted
         * page a second time now (in fact, it would be wrong to count it as a
         * deleted page in the bulk delete statistics a second time).
         */
        if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
        {
            /* Done with current scanblkno (and all lower split pages) */
            _bt_relbuf(rel, buf);
            return;
        }
    }

    if (!opaque || BTPageIsRecyclable(page, heaprel))
    {
        /* Okay to recycle this page (which could be leaf or internal) */
        RecordFreeIndexPage(rel, blkno);
        stats->pages_deleted++;
        stats->pages_free++;
    }
    else if (P_ISDELETED(opaque))
    {
        /*
         * Already deleted page (which could be leaf or internal).  Can't
         * recycle yet.
         */
        stats->pages_deleted++;
    }
    else if (P_ISHALFDEAD(opaque))
    {
        /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
        attempt_pagedel = true;

        /*
         * _bt_pagedel() will increment both pages_newly_deleted and
         * pages_deleted stats in all cases (barring corruption)
         */
    }
    else if (P_ISLEAF(opaque))
    {
        OffsetNumber deletable[MaxIndexTuplesPerPage];
        int         ndeletable;
        BTVacuumPosting updatable[MaxIndexTuplesPerPage];
        int         nupdatable;
        OffsetNumber offnum,
                    minoff,
                    maxoff;
        int         nhtidsdead,
                    nhtidslive;

        /*
         * Trade in the initial read lock for a full cleanup lock on this
         * page.  We must get such a lock on every leaf page over the course
         * of the vacuum scan, whether or not it actually contains any
         * deletable tuples --- see nbtree/README.
         */
        _bt_upgradelockbufcleanup(rel, buf);

        /*
         * Check whether we need to backtrack to earlier pages.  What we are
         * concerned about is a page split that happened since we started the
         * vacuum scan.  If the split moved tuples on the right half of the
         * split (i.e. the tuples that sort high) to a block that we already
         * passed over, then we might have missed the tuples.  We need to
         * backtrack now.  (Must do this before possibly clearing btpo_cycleid
         * or deleting scanblkno page below!)
         */
        if (vstate->cycleid != 0 &&
            opaque->btpo_cycleid == vstate->cycleid &&
            !(opaque->btpo_flags & BTP_SPLIT_END) &&
            !P_RIGHTMOST(opaque) &&
            opaque->btpo_next < scanblkno)
            backtrack_to = opaque->btpo_next;

        ndeletable = 0;
        nupdatable = 0;
        minoff = P_FIRSTDATAKEY(opaque);
        maxoff = PageGetMaxOffsetNumber(page);
        nhtidsdead = 0;
        nhtidslive = 0;
        if (callback)
        {
            /* btbulkdelete callback tells us what to delete (or update) */
            for (offnum = minoff;
                 offnum <= maxoff;
                 offnum = OffsetNumberNext(offnum))
            {
                IndexTuple  itup;

                itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offnum));

                Assert(!BTreeTupleIsPivot(itup));
                if (!BTreeTupleIsPosting(itup))
                {
                    /* Regular tuple, standard table TID representation */
                    if (callback(&itup->t_tid, callback_state))
                    {
                        deletable[ndeletable++] = offnum;
                        nhtidsdead++;
                    }
                    else
                        nhtidslive++;
                }
                else
                {
                    BTVacuumPosting vacposting;
                    int         nremaining;

                    /* Posting list tuple */
                    vacposting = btreevacuumposting(vstate, itup, offnum,
                                                    &nremaining);
                    if (vacposting == NULL)
                    {
                        /*
                         * All table TIDs from the posting tuple remain, so no
                         * delete or update required
                         */
                        Assert(nremaining == BTreeTupleGetNPosting(itup));
                    }
                    else if (nremaining > 0)
                    {
                        /*
                         * Store metadata about posting list tuple in
                         * updatable array for entire page.  Existing tuple
                         * will be updated during the later call to
                         * _bt_delitems_vacuum().
                         */
                        Assert(nremaining < BTreeTupleGetNPosting(itup));
                        updatable[nupdatable++] = vacposting;
                        nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
                    }
                    else
                    {
                        /*
                         * All table TIDs from the posting list must be
                         * deleted.  We'll delete the index tuple completely
                         * (no update required).
                         */
                        Assert(nremaining == 0);
                        deletable[ndeletable++] = offnum;
                        nhtidsdead += BTreeTupleGetNPosting(itup);
                        pfree(vacposting);
                    }

                    nhtidslive += nremaining;
                }
            }
        }

        /*
         * Apply any needed deletes or updates.  We issue just one
         * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
         */
        if (ndeletable > 0 || nupdatable > 0)
        {
            Assert(nhtidsdead >= ndeletable + nupdatable);
            _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
                                nupdatable);

            stats->tuples_removed += nhtidsdead;
            /* must recompute maxoff */
            maxoff = PageGetMaxOffsetNumber(page);

            /* can't leak memory here */
            for (int i = 0; i < nupdatable; i++)
                pfree(updatable[i]);
        }
        else
        {
            /*
             * If the leaf page has been split during this vacuum cycle, it
             * seems worth expending a write to clear btpo_cycleid even if we
             * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
             * takes care of this.)  This ensures we won't process the page
             * again.
             *
             * We treat this like a hint-bit update because there's no need to
             * WAL-log it.
             */
            Assert(nhtidsdead == 0);
            if (vstate->cycleid != 0 &&
                opaque->btpo_cycleid == vstate->cycleid)
            {
                opaque->btpo_cycleid = 0;
                MarkBufferDirtyHint(buf, true);
            }
        }

        /*
         * If the leaf page is now empty, try to delete it; else count the
         * live tuples (live table TIDs in posting lists are counted as
         * separate live tuples).  We don't delete when backtracking, though,
         * since that would require teaching _bt_pagedel() about backtracking
         * (doesn't seem worth adding more complexity to deal with that).
         *
         * We don't count the number of live TIDs during cleanup-only calls to
         * btvacuumscan (i.e. when callback is not set).  We count the number
         * of index tuples directly instead.  This avoids the expense of
         * directly examining all of the tuples on each page.  VACUUM will
         * treat num_index_tuples as an estimate in cleanup-only case, so it
         * doesn't matter that this underestimates num_index_tuples
         * significantly in some cases.
         */
        if (minoff > maxoff)
            attempt_pagedel = (blkno == scanblkno);
        else if (callback)
            stats->num_index_tuples += nhtidslive;
        else
            stats->num_index_tuples += maxoff - minoff + 1;

        Assert(!attempt_pagedel || nhtidslive == 0);
    }

    if (attempt_pagedel)
    {
        MemoryContext oldcontext;

        /* Run pagedel in a temp context to avoid memory leakage */
        MemoryContextReset(vstate->pagedelcontext);
        oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

        /*
         * _bt_pagedel maintains the bulk delete stats on our behalf;
         * pages_newly_deleted and pages_deleted are likely to be incremented
         * during call
         */
        Assert(blkno == scanblkno);
        _bt_pagedel(rel, buf, vstate);

        MemoryContextSwitchTo(oldcontext);
        /* pagedel released buffer, so we shouldn't */
    }
    else
        _bt_relbuf(rel, buf);

    if (backtrack_to != P_NONE)
    {
        blkno = backtrack_to;
        goto backtrack;
    }
}

/*
 * btreevacuumposting --- determine TIDs still needed in posting list
 *
 * Returns metadata describing how to build replacement tuple without the TIDs
 * that VACUUM needs to delete.  Returned value is NULL in the common case
 * where no changes are needed to caller's posting list tuple (we avoid
 * allocating memory here as an optimization).
 *
 * The number of TIDs that should remain in the posting list tuple is set for
 * caller in *nremaining.
 */
static BTVacuumPosting
btreevacuumposting(BTVacState *vstate, IndexTuple posting,
                   OffsetNumber updatedoffset, int *nremaining)
{
    int         live = 0;
    int         nitem = BTreeTupleGetNPosting(posting);
    ItemPointer items = BTreeTupleGetPosting(posting);
    BTVacuumPosting vacposting = NULL;

    for (int i = 0; i < nitem; i++)
    {
        if (!vstate->callback(items + i, vstate->callback_state))
        {
            /* Live table TID */
            live++;
        }
        else if (vacposting == NULL)
        {
            /*
             * First dead table TID encountered.
             *
             * It's now clear that we need to delete one or more dead table
             * TIDs, so start maintaining metadata describing how to update
             * existing posting list tuple.
             */
            vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                                nitem * sizeof(uint16));

            vacposting->itup = posting;
            vacposting->updatedoffset = updatedoffset;
            vacposting->ndeletedtids = 0;
            vacposting->deletetids[vacposting->ndeletedtids++] = i;
        }
        else
        {
            /* Second or subsequent dead table TID */
            vacposting->deletetids[vacposting->ndeletedtids++] = i;
        }
    }

    *nremaining = live;
    return vacposting;
}
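
/*
 * How the result is consumed: btvacuumpage() collects any non-NULL return
 * values in its updatable[] array and hands them to _bt_delitems_vacuum(),
 * which updates each such posting list tuple in place so that the TIDs whose
 * indexes are listed in vacposting->deletetids[] are removed.  The
 * deletetids[] array is pessimistically sized for all nitem TIDs being dead.
 */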

/*
 *  btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
    return true;
}

/*
 *  btgettreeheight() -- Compute tree height for use by btcostestimate().
 */
int
btgettreeheight(Relation rel)
{
    return _bt_getrootheight(rel);
}