/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *    Implementation of Lehman and Yao's btree management algorithm for
 *    Postgres.
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "catalog/storage.h"
#include "commands/vacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "utils/memutils.h"
/* Working state for btbuild and its callback */
typedef struct
{
    bool        isUnique;
    bool        haveDead;
    Relation    heapRel;
    BTSpool    *spool;

    /*
     * spool2 is needed only when the index is an unique index. Dead tuples
     * are put into spool2 instead of spool in order to avoid uniqueness
     * check.
     */
    BTSpool    *spool2;
    double      indtuples;
} BTBuildState;

/* Working state needed by btvacuumpage */
typedef struct
{
    IndexVacuumInfo *info;
    IndexBulkDeleteResult *stats;
    IndexBulkDeleteCallback callback;
    void       *callback_state;
    BTCycleId   cycleid;
    BlockNumber lastUsedPage;
    BlockNumber totFreePages;   /* true total # of free pages */
    MemoryContext pagedelcontext;
} BTVacState;
static void btbuildCallback(Relation index,
                HeapTuple htup,
                Datum *values,
                bool *isnull,
                bool tupleIsAlive,
                void *state);
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state,
             BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
             BlockNumber orig_blkno);
/*
 *  btbuild() -- build a new btree index.
 */
Datum
btbuild(PG_FUNCTION_ARGS)
{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    double      reltuples;
    BTBuildState buildstate;

    buildstate.isUnique = indexInfo->ii_Unique;
    buildstate.haveDead = false;
    buildstate.heapRel = heap;
    buildstate.spool = NULL;
    buildstate.spool2 = NULL;
    buildstate.indtuples = 0;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
        ResetUsage();
#endif   /* BTREE_BUILD_STATS */

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);

    /*
     * If building a unique index, put dead tuples in a second spool to keep
     * them out of the uniqueness check.
     */
    if (indexInfo->ii_Unique)
        buildstate.spool2 = _bt_spoolinit(index, false, true);

    /* do the heap scan */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   btbuildCallback, (void *) &buildstate);

    /* okay, all heap tuples are indexed */
    if (buildstate.spool2 && !buildstate.haveDead)
    {
        /* spool2 turns out to be unnecessary */
        _bt_spooldestroy(buildstate.spool2);
        buildstate.spool2 = NULL;
    }

    /*
     * Finish the build by (1) completing the sort of the spool file, (2)
     * inserting the sorted tuples into btree pages and (3) building the upper
     * levels.
     */
    _bt_leafbuild(buildstate.spool, buildstate.spool2);
    _bt_spooldestroy(buildstate.spool);
    if (buildstate.spool2)
        _bt_spooldestroy(buildstate.spool2);

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD STATS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    /*
     * If we are reindexing a pre-existing index, it is critical to send out a
     * relcache invalidation SI message to ensure all backends re-read the
     * index metapage.  We expect that the caller will ensure that happens
     * (typically as a side effect of updating index stats, but it must happen
     * even if the stats don't change!)
     */

    /* return statistics */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    PG_RETURN_POINTER(result);
}
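/*
 * Note: BTREE_BUILD_STATS is a compile-time option; when the server is built
 * with it and the developer setting log_btree_build_stats is enabled, the
 * ResetUsage()/ShowUsage() calls above bracket the build and log its resource
 * usage.
 */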
/*
 * Per-tuple callback from IndexBuildHeapScan
 */
static void
btbuildCallback(Relation index,
                HeapTuple htup,
                Datum *values,
                bool *isnull,
                bool tupleIsAlive,
                void *state)
{
    BTBuildState *buildstate = (BTBuildState *) state;
    IndexTuple  itup;

    /* form an index tuple and point it at the heap tuple */
    itup = index_form_tuple(RelationGetDescr(index), values, isnull);
    itup->t_tid = htup->t_self;

    /*
     * insert the index tuple into the appropriate spool file for subsequent
     * processing
     */
    if (tupleIsAlive || buildstate->spool2 == NULL)
        _bt_spool(itup, buildstate->spool);
    else
    {
        /* dead tuples are put into spool2 */
        buildstate->haveDead = true;
        _bt_spool(itup, buildstate->spool2);
    }

    buildstate->indtuples += 1;

    pfree(itup);
}
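/*
 * Illustration of the spool2 case: when building a unique index over a table
 * in which a row was recently updated, the heap still contains the dead
 * pre-update version carrying the same key as the live version.  Routing the
 * dead version into spool2 keeps _bt_leafbuild from raising a spurious
 * uniqueness violation, while live versions go through spool and the normal
 * uniqueness check.
 */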
/*
 *  btinsert() -- insert an index tuple into a btree.
 *
 *      Descend the tree recursively, find the appropriate location for our
 *      new tuple, and put it there.
 */
Datum
btinsert(PG_FUNCTION_ARGS)
{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    Datum      *values = (Datum *) PG_GETARG_POINTER(1);
    bool       *isnull = (bool *) PG_GETARG_POINTER(2);
    ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
    Relation    heapRel = (Relation) PG_GETARG_POINTER(4);
    bool        checkUnique = PG_GETARG_BOOL(5);
    IndexTuple  itup;

    /* generate an index tuple */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    _bt_doinsert(rel, itup, checkUnique, heapRel);

    pfree(itup);

    PG_RETURN_BOOL(true);
}
/*
 *  btgettuple() -- Get the next tuple in the scan.
 */
Datum
btgettuple(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        res;

    /* btree indexes are never lossy */
    scan->xs_recheck = false;

    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction. If we haven't done so yet, we call a routine to
     * get the first item in the scan.
     */
    if (BTScanPosIsValid(so->currPos))
    {
        /*
         * Check to see if we should kill the previously-fetched tuple.
         */
        if (scan->kill_prior_tuple)
        {
            /*
             * Yes, remember it for later. (We'll deal with all such tuples
             * at once right before leaving the index page.)  The test for
             * numKilled overrun is not just paranoia: if the caller reverses
             * direction in the indexscan then the same item might get entered
             * multiple times. It's not worth trying to optimize that, so we
             * don't detect it, but instead just forget any excess entries.
             */
            if (so->killedItems == NULL)
                so->killedItems = (int *)
                    palloc(MaxIndexTuplesPerPage * sizeof(int));
            if (so->numKilled < MaxIndexTuplesPerPage)
                so->killedItems[so->numKilled++] = so->currPos.itemIndex;
        }

        /*
         * Now continue the scan.
         */
        res = _bt_next(scan, dir);
    }
    else
        res = _bt_first(scan, dir);

    PG_RETURN_BOOL(res);
}
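/*
 * Note on the killedItems mechanism: only the itemIndex of each known-dead
 * tuple is recorded here; the index page itself is not touched yet.  The
 * accumulated entries are applied in a single _bt_killitems() pass when the
 * scan is about to release its pin on the current page (see btrescan,
 * btendscan and btrestrpos below).
 */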
/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
Datum
btgetbitmap(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int64       ntids = 0;
    ItemPointer heapTid;

    /* Fetch the first page & tuple. */
    if (!_bt_first(scan, ForwardScanDirection))
    {
        /* empty scan */
        PG_RETURN_INT64(ntids);
    }
    /* Save tuple ID, and continue scanning */
    heapTid = &scan->xs_ctup.t_self;
    tbm_add_tuples(tbm, heapTid, 1, false);
    ntids++;

    for (;;)
    {
        /*
         * Advance to next tuple within page.  This is the same as the easy
         * case in _bt_next().
         */
        if (++so->currPos.itemIndex > so->currPos.lastItem)
        {
            /* let _bt_next do the heavy lifting */
            if (!_bt_next(scan, ForwardScanDirection))
                break;
        }

        /* Save tuple ID, and continue scanning */
        heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
        tbm_add_tuples(tbm, heapTid, 1, false);
        ntids++;
    }

    PG_RETURN_INT64(ntids);
}
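/*
 * As in btgettuple, heap TIDs are added with recheck = false, since a btree
 * scan never requires the heap tuple to be rechecked against the scan keys;
 * ntids simply counts the TIDs handed to the bitmap.
 */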
/*
 *  btbeginscan() -- start a scan on a btree index
 */
Datum
btbeginscan(PG_FUNCTION_ARGS)
{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    int         keysz = PG_GETARG_INT32(1);
    ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(2);
    IndexScanDesc scan;

    /* get the scan */
    scan = RelationGetIndexScan(rel, keysz, scankey);

    PG_RETURN_POINTER(scan);
}
/*
 *  btrescan() -- rescan an index relation
 */
Datum
btrescan(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(1);
    BTScanOpaque so;

    so = (BTScanOpaque) scan->opaque;

    if (so == NULL)             /* if called from btbeginscan */
    {
        so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
        so->currPos.buf = so->markPos.buf = InvalidBuffer;
        if (scan->numberOfKeys > 0)
            so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
        else
            so->keyData = NULL;
        so->killedItems = NULL; /* until needed */
        so->numKilled = 0;
        scan->opaque = so;
    }

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan, false);
        ReleaseBuffer(so->currPos.buf);
        so->currPos.buf = InvalidBuffer;
    }

    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }
    so->markItemIndex = -1;

    /*
     * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
     */
    if (scankey && scan->numberOfKeys > 0)
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
    so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */

    PG_RETURN_VOID();
}
/*
 *  btendscan() -- close down a scan
 */
Datum
btendscan(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan, false);
        ReleaseBuffer(so->currPos.buf);
        so->currPos.buf = InvalidBuffer;
    }

    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }
    so->markItemIndex = -1;

    if (so->killedItems != NULL)
        pfree(so->killedItems);
    if (so->keyData != NULL)
        pfree(so->keyData);
    pfree(so);

    PG_RETURN_VOID();
}
/*
 *  btmarkpos() -- save current scan position
 */
Datum
btmarkpos(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pin */
    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }

    /*
     * Just record the current itemIndex.  If we later step to next page
     * before releasing the marked position, _bt_steppage makes a full copy of
     * the currPos struct in markPos.  If (as often happens) the mark is moved
     * before we leave the page, we don't have to do that work.
     */
    if (BTScanPosIsValid(so->currPos))
        so->markItemIndex = so->currPos.itemIndex;
    else
        so->markItemIndex = -1;

    PG_RETURN_VOID();
}
/*
 *  btrestrpos() -- restore scan to last saved position
 */
Datum
btrestrpos(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    if (so->markItemIndex >= 0)
    {
        /*
         * The mark position is on the same page we are currently on. Just
         * restore the itemIndex.
         */
        so->currPos.itemIndex = so->markItemIndex;
    }
    else
    {
        /* we aren't holding any read locks, but gotta drop the pin */
        if (BTScanPosIsValid(so->currPos))
        {
            /* Before leaving current page, deal with any killed items */
            if (so->numKilled > 0 &&
                so->currPos.buf != so->markPos.buf)
                _bt_killitems(scan, false);
            ReleaseBuffer(so->currPos.buf);
            so->currPos.buf = InvalidBuffer;
        }

        if (BTScanPosIsValid(so->markPos))
        {
            /* bump pin on mark buffer for assignment to current buffer */
            IncrBufferRefCount(so->markPos.buf);
            memcpy(&so->currPos, &so->markPos,
                   offsetof(BTScanPosData, items[1]) +
                   so->markPos.lastItem * sizeof(BTScanPosItem));
        }
    }

    PG_RETURN_VOID();
}
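/*
 * Note on the memcpy length: offsetof(BTScanPosData, items[1]) covers the
 * fixed-size header of the struct plus items[0], and adding
 * markPos.lastItem * sizeof(BTScanPosItem) extends the copy through
 * items[lastItem].  Hence exactly lastItem + 1 valid entries are copied,
 * rather than the full MaxIndexTuplesPerPage-sized items array.
 */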
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
btbulkdelete(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
    void       *callback_state = (void *) PG_GETARG_POINTER(3);
    Relation    rel = info->index;
    BTCycleId   cycleid;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

    /* Establish the vacuum cycle ID to use for this scan */
    /* The ENSURE stuff ensures we clean up shared memory on failure */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);

    PG_RETURN_POINTER(stats);
}
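/*
 * Note: the PG_ENSURE_ERROR_CLEANUP block guarantees that
 * _bt_end_vacuum_callback runs if an error escapes btvacuumscan, so the
 * vacuum cycle ID registered in shared memory by _bt_start_vacuum is
 * released on the error path as well as the normal one.
 */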
/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
btvacuumcleanup(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        PG_RETURN_POINTER(stats);

    /*
     * If btbulkdelete was called, we need not do anything, just return the
     * stats from the latest btbulkdelete call.  If it wasn't called, we must
     * still do a pass over the index, to recycle any newly-recyclable pages
     * and to obtain index statistics.
     *
     * Since we aren't going to actually delete any leaf items, there's no
     * need to go through all the vacuum-cycle-ID pushups.
     */
    if (stats == NULL)
    {
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        btvacuumscan(info, stats, NULL, NULL, 0);
    }

    /* Finally, vacuum the FSM */
    IndexFreeSpaceMapVacuum(info->index);

    /*
     * During a non-FULL vacuum it's quite possible for us to be fooled by
     * concurrent page splits into double-counting some index tuples, so
     * disbelieve any total that exceeds the underlying heap's count. (We
     * can't check this during btbulkdelete.)
     */
    if (!info->vacuum_full)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    PG_RETURN_POINTER(stats);
}
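/*
 * Example of the clamp above: if concurrent page splits cause the scan to
 * count 10050 index tuples while info->num_heap_tuples reports 10000, the
 * reported num_index_tuples is limited to 10000.
 */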
/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state,
             BTCycleId cycleid)
{
    Relation    rel = info->index;
    BTVacState  vstate;
    BlockNumber num_pages;
    BlockNumber blkno;
    bool        needLock;

    /*
     * Reset counts that will be incremented during the scan; needed in case
     * of multiple scans during a single VACUUM command
     */
    stats->num_index_tuples = 0;
    stats->pages_deleted = 0;

    /* Set up info to pass down to btvacuumpage */
    vstate.info = info;
    vstate.stats = stats;
    vstate.callback = callback;
    vstate.callback_state = callback_state;
    vstate.cycleid = cycleid;
    vstate.lastUsedPage = BTREE_METAPAGE;
    vstate.totFreePages = 0;

    /* Create a temporary memory context to run _bt_pagedel in */
    vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
                                                  "_bt_pagedel",
                                                  ALLOCSET_DEFAULT_MINSIZE,
                                                  ALLOCSET_DEFAULT_INITSIZE,
                                                  ALLOCSET_DEFAULT_MAXSIZE);

    /*
     * The outer loop iterates over all index pages except the metapage, in
     * physical order (we hope the kernel will cooperate in providing
     * read-ahead for speed).  It is critical that we visit all leaf pages,
     * including ones added after we start the scan, else we might fail to
     * delete some deletable tuples.  Hence, we must repeatedly check the
     * relation length.  We must acquire the relation-extension lock while
     * doing so to avoid a race condition: if someone else is extending the
     * relation, there is a window where bufmgr/smgr have created a new
     * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
     * we manage to scan such a page here, we'll improperly assume it can be
     * recycled.  Taking the lock synchronizes things enough to prevent a
     * problem: either num_pages won't include the new page, or _bt_getbuf
     * already has write lock on the buffer and it will be fully initialized
     * before we can examine it.  (See also vacuumlazy.c, which has the same
     * issue.)  Also, we need not worry if a page is added immediately after
     * we look; the page splitting code already has write-lock on the left
     * page before it adds a right page, so we must already have processed any
     * tuples due to be moved into such a page.
     *
     * We can skip locking for new or temp relations, however, since no one
     * else could be accessing them.
     */
    needLock = !RELATION_IS_LOCAL(rel);

    blkno = BTREE_METAPAGE + 1;
    for (;;)
    {
        /* Get the current relation length */
        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);
        num_pages = RelationGetNumberOfBlocks(rel);
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Quit if we've scanned the whole relation */
        if (blkno >= num_pages)
            break;
        /* Iterate over pages, then loop back to recheck length */
        for (; blkno < num_pages; blkno++)
        {
            btvacuumpage(&vstate, blkno, blkno);
        }
    }

    /*
     * During VACUUM FULL, we truncate off any recyclable pages at the end of
     * the index.  In a normal vacuum it'd be unsafe to do this except by
     * acquiring exclusive lock on the index and then rechecking all the
     * pages; doesn't seem worth it.
     */
    if (info->vacuum_full && vstate.lastUsedPage < num_pages - 1)
    {
        BlockNumber new_pages = vstate.lastUsedPage + 1;

        /*
         * Okay to truncate.
         */
        RelationTruncate(rel, new_pages);

        /* update statistics */
        stats->pages_removed += num_pages - new_pages;
        vstate.totFreePages -= (num_pages - new_pages);
        num_pages = new_pages;
    }

    MemoryContextDelete(vstate.pagedelcontext);

    /* update statistics */
    stats->num_pages = num_pages;
    stats->pages_free = vstate.totFreePages;
}
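/*
 * Example of the length recheck above: if the index had 100 pages when the
 * scan began and a concurrent split extends it to 105 pages while blocks
 * 1-99 are being processed, the next trip through the outer loop picks up
 * blocks 100-104 before the scan finishes.
 */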
/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
    IndexVacuumInfo *info = vstate->info;
    IndexBulkDeleteResult *stats = vstate->stats;
    IndexBulkDeleteCallback callback = vstate->callback;
    void       *callback_state = vstate->callback_state;
    Relation    rel = info->index;
    bool        delete_now;
    BlockNumber recurse_to;
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;

restart:
    delete_now = false;
    recurse_to = P_NONE;

    /* call vacuum_delay_point while not holding any buffer lock */
    vacuum_delay_point();

    /*
     * We can't use _bt_getbuf() here because it always applies
     * _bt_checkpage(), which will barf on an all-zero page. We want to
     * recycle all-zero pages, not fail.  Also, we want to use a nondefault
     * buffer access strategy.
     */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                             info->strategy);
    LockBuffer(buf, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (!PageIsNew(page))
        _bt_checkpage(rel, buf);

    /*
     * If we are recursing, the only case we want to do anything with is a
     * live leaf page having the current vacuum cycle ID.  Any other state
     * implies we already saw the page (eg, deleted it as being empty).
     */
    if (blkno != orig_blkno)
    {
        if (_bt_page_recyclable(page) ||
            P_IGNORE(opaque) ||
            !P_ISLEAF(opaque) ||
            opaque->btpo_cycleid != vstate->cycleid)
        {
            _bt_relbuf(rel, buf);
            return;
        }
    }

    /* If the page is in use, update lastUsedPage */
    if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
        vstate->lastUsedPage = blkno;

    /* Page is valid, see what to do with it */
    if (_bt_page_recyclable(page))
    {
        /* Okay to recycle this page */
        RecordFreeIndexPage(rel, blkno);
        vstate->totFreePages++;
        stats->pages_deleted++;
    }
    else if (P_ISDELETED(opaque))
    {
        /* Already deleted, but can't recycle yet */
        stats->pages_deleted++;
    }
    else if (P_ISHALFDEAD(opaque))
    {
        /* Half-dead, try to delete */
        delete_now = true;
    }
    else if (P_ISLEAF(opaque))
    {
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable;
        OffsetNumber offnum,
                    minoff,
                    maxoff;

        /*
         * Trade in the initial read lock for a super-exclusive write lock on
         * this page.  We must get such a lock on every leaf page over the
         * course of the vacuum scan, whether or not it actually contains any
         * deletable tuples --- see nbtree/README.
         */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        LockBufferForCleanup(buf);

        /*
         * Check whether we need to recurse back to earlier pages.  What we
         * are concerned about is a page split that happened since we started
         * the vacuum scan.  If the split moved some tuples to a lower page
         * then we might have missed 'em.  If so, set up for tail recursion.
         * (Must do this before possibly clearing btpo_cycleid below!)
         */
        if (vstate->cycleid != 0 &&
            opaque->btpo_cycleid == vstate->cycleid &&
            !(opaque->btpo_flags & BTP_SPLIT_END) &&
            !P_RIGHTMOST(opaque) &&
            opaque->btpo_next < orig_blkno)
            recurse_to = opaque->btpo_next;

        /*
         * Scan over all items to see which ones need deleted according to the
         * callback function.
         */
        ndeletable = 0;
        minoff = P_FIRSTDATAKEY(opaque);
        maxoff = PageGetMaxOffsetNumber(page);
        if (callback)
        {
            for (offnum = minoff;
                 offnum <= maxoff;
                 offnum = OffsetNumberNext(offnum))
            {
                IndexTuple  itup;
                ItemPointer htup;

                itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offnum));
                htup = &(itup->t_tid);
                if (callback(htup, callback_state))
                    deletable[ndeletable++] = offnum;
            }
        }

        /*
         * Apply any needed deletes.  We issue just one _bt_delitems() call
         * per page, so as to minimize WAL traffic.
         */
        if (ndeletable > 0)
        {
            _bt_delitems(rel, buf, deletable, ndeletable);
            stats->tuples_removed += ndeletable;
            /* must recompute maxoff */
            maxoff = PageGetMaxOffsetNumber(page);
        }
        else
        {
            /*
             * If the page has been split during this vacuum cycle, it seems
             * worth expending a write to clear btpo_cycleid even if we don't
             * have any deletions to do.  (If we do, _bt_delitems takes care
             * of this.)  This ensures we won't process the page again.
             *
             * We treat this like a hint-bit update because there's no need to
             * WAL-log it.
             */
            if (vstate->cycleid != 0 &&
                opaque->btpo_cycleid == vstate->cycleid)
            {
                opaque->btpo_cycleid = 0;
                SetBufferCommitInfoNeedsSave(buf);
            }
        }

        /*
         * If it's now empty, try to delete; else count the live tuples. We
         * don't delete when recursing, though, to avoid putting entries into
         * freePages out-of-order (doesn't seem worth any extra code to handle
         * the case).
         */
        if (minoff > maxoff)
            delete_now = (blkno == orig_blkno);
        else
            stats->num_index_tuples += maxoff - minoff + 1;
    }

    if (delete_now)
    {
        MemoryContext oldcontext;
        int         ndel;

        /* Run pagedel in a temp context to avoid memory leakage */
        MemoryContextReset(vstate->pagedelcontext);
        oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

        ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full);

        /* count only this page, else may double-count parent */
        if (ndel)
            stats->pages_deleted++;

        /*
         * During VACUUM FULL it's okay to recycle deleted pages immediately,
         * since there can be no other transactions scanning the index.  Note
         * that we will only recycle the current page and not any parent pages
         * that _bt_pagedel might have recursed to; this seems reasonable in
         * the name of simplicity.  (Trying to do otherwise would mean we'd
         * have to sort the list of recyclable pages we're building.)
         */
        if (ndel && info->vacuum_full)
        {
            RecordFreeIndexPage(rel, blkno);
            vstate->totFreePages++;
        }

        MemoryContextSwitchTo(oldcontext);
        /* pagedel released buffer, so we shouldn't */
    }
    else
        _bt_relbuf(rel, buf);

    /*
     * This is really tail recursion, but if the compiler is too stupid to
     * optimize it as such, we'd eat an uncomfortably large amount of stack
     * space per recursion level (due to the deletable[] array). A failure is
     * improbable since the number of levels isn't likely to be large ... but
     * just in case, let's hand-optimize into a loop.
     */
    if (recurse_to != P_NONE)
    {
        blkno = recurse_to;
        goto restart;
    }
}