src/backend/access/heap/vacuumlazy.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * vacuumlazy.c
   4  *        Concurrent ("lazy") vacuuming.
   5  *
   6  * The major space usage for vacuuming is storage for the array of dead TIDs
   7  * that are to be removed from indexes.  We want to ensure we can vacuum even
   8  * the very largest relations with finite memory space usage.  To do that, we
   9  * set upper bounds on the number of TIDs we can keep track of at once.
  10  *
  11  * We are willing to use at most maintenance_work_mem (or perhaps
  12  * autovacuum_work_mem) memory space to keep track of dead TIDs.  We initially
  13  * allocate an array of TIDs of that size, with an upper limit that depends on
  14  * table size (this limit ensures we don't allocate a huge area uselessly for
  15  * vacuuming small tables).  If the array threatens to overflow, we must call
  16  * lazy_vacuum to vacuum indexes (and to vacuum the pages that we've pruned).
  17  * This frees up the memory space dedicated to storing dead TIDs.
  18  *
  19  * In practice VACUUM will often complete its initial pass over the target
  20  * heap relation without ever running out of space to store TIDs.  This means
  21  * that there only needs to be one call to lazy_vacuum, after the initial pass
  22  * completes.
  23  *
  24  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
  25  * Portions Copyright (c) 1994, Regents of the University of California
  26  *
  27  *
  28  * IDENTIFICATION
  29  *        src/backend/access/heap/vacuumlazy.c
  30  *
  31  *-------------------------------------------------------------------------
  32  */
  33 #include "postgres.h"
  34
  35 #include <math.h>
  36
  37 #include "access/amapi.h"
  38 #include "access/genam.h"
  39 #include "access/heapam.h"
  40 #include "access/heapam_xlog.h"
  41 #include "access/htup_details.h"
  42 #include "access/multixact.h"
  43 #include "access/transam.h"
  44 #include "access/visibilitymap.h"
  45 #include "access/xact.h"
  46 #include "access/xlog.h"
  47 #include "access/xloginsert.h"
  48 #include "catalog/index.h"
  49 #include "catalog/storage.h"
  50 #include "commands/dbcommands.h"
  51 #include "commands/progress.h"
  52 #include "commands/vacuum.h"
  53 #include "executor/instrument.h"
  54 #include "miscadmin.h"
  55 #include "optimizer/paths.h"
  56 #include "pgstat.h"
  57 #include "portability/instr_time.h"
  58 #include "postmaster/autovacuum.h"
  59 #include "storage/bufmgr.h"
  60 #include "storage/freespace.h"
  61 #include "storage/lmgr.h"
  62 #include "tcop/tcopprot.h"
  63 #include "utils/lsyscache.h"
  64 #include "utils/memutils.h"
  65 #include "utils/pg_rusage.h"
  66 #include "utils/timestamp.h"
  67
  68
  69 /*
  70  * Space/time tradeoff parameters: do these need to be user-tunable?
  71  *
  72  * To consider truncating the relation, we want there to be at least
  73  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
  74  * is less) potentially-freeable pages.
  75  */
  76 #define REL_TRUNCATE_MINIMUM    1000    /* pages */
  77 #define REL_TRUNCATE_FRACTION   16      /* divisor applied to relsize */
  78
  79 /*
  80  * Timing parameters for truncate locking heuristics (all in milliseconds).
  81  *
  82  * These were not exposed as user-tunable GUC values because it didn't seem
  83  * that the potential for improvement was great enough to merit the cost of
  84  * supporting them.
  85  */
  86 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL             20      /* ms */
  87 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL              50      /* ms */
  88 #define VACUUM_TRUNCATE_LOCK_TIMEOUT                    5000    /* ms */
  89
  90 /*
  91  * Threshold that controls whether we bypass index vacuuming and heap
  92  * vacuuming as an optimization, expressed as a fraction of rel_pages.
  93  */
  94 #define BYPASS_THRESHOLD_PAGES  0.02    /* i.e. 2% of rel_pages */
  95
  96 /*
  97  * Perform a failsafe check each time we scan another 4GB of pages (computed in
  98  * uint64 to avoid 32-bit overflow); deliberately a power-of-two, usually 2^19.
  99  */
 100 #define FAILSAFE_EVERY_PAGES \
 101         ((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ))
 102
 103 /*
 104  * When a table has no indexes, vacuum the free space map (FSM) after every
 105  * 8GB, approximately (it won't be exact because we only vacuum FSM after
 106  * processing a heap page that has some removable tuples).  When there are
 107  * indexes, this is ignored, and we vacuum FSM after each index/heap cleaning pass.
 108  */
 109 #define VACUUM_FSM_EVERY_PAGES \
 110         ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
 111
 112 /*
 113  * Before we consider skipping a page that's marked as clean in the
 114  * visibility map, we must have seen at least this many clean pages.
 115  */
 116 #define SKIP_PAGES_THRESHOLD    ((BlockNumber) 32)
 117
 118 /*
 119  * Size (in blocks) of the prefetch window for lazy vacuum's backwards
 120  * truncation scan.  Needs to be a power of 2.
 121  */
 122 #define PREFETCH_SIZE                   ((BlockNumber) 32)
 123
 124 /*
 125  * Macro to check if we are in a parallel vacuum (vacrel->pvs is set).  If true,
 126  * we are in the parallel mode and the DSM segment is initialized.
 127  */
 128 #define ParallelVacuumIsActive(vacrel) ((vacrel)->pvs != NULL)
 129
 130 /* Phases of vacuum during which we report error context (see LVRelState.phase). */
 131 typedef enum
 132 {
 133         VACUUM_ERRCB_PHASE_UNKNOWN, /* initial value set by heap_vacuum_rel */
 134         VACUUM_ERRCB_PHASE_SCAN_HEAP,
 135         VACUUM_ERRCB_PHASE_VACUUM_INDEX,
 136         VACUUM_ERRCB_PHASE_VACUUM_HEAP,
 137         VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
 138         VACUUM_ERRCB_PHASE_TRUNCATE
 139 } VacErrPhase;
 140 /* Working state for VACUUM of one heap rel; allocated in heap_vacuum_rel(). */
 141 typedef struct LVRelState
 142 {
 143         /* Target heap relation and its indexes */
 144         Relation        rel;
 145         Relation   *indrels;
 146         int                     nindexes;       /* length of indrels (and indstats) */
 147
 148         /* Buffer access strategy and parallel vacuum state */
 149         BufferAccessStrategy bstrategy;
 150         ParallelVacuumState *pvs;
 151
 152         /* Aggressive VACUUM? (must set relfrozenxid >= FreezeLimit) */
 153         bool            aggressive;
 154         /* Use visibility map to skip? (disabled by DISABLE_PAGE_SKIPPING) */
 155         bool            skipwithvm;
 156         /* Wraparound failsafe has been triggered? */
 157         bool            failsafe_active;
 158         /* Consider index vacuuming bypass optimization? */
 159         bool            consider_bypass_optimization;
 160
 161         /* Doing index vacuuming, index cleanup, rel truncation? */
 162         bool            do_index_vacuuming;
 163         bool            do_index_cleanup;
 164         bool            do_rel_truncate;
 165
 166         /* VACUUM operation's cutoffs for freezing and pruning */
 167         struct VacuumCutoffs cutoffs;
 168         GlobalVisState *vistest;
 169         /* Tracks oldest extant XID/MXID for setting relfrozenxid/relminmxid */
 170         TransactionId NewRelfrozenXid;
 171         MultiXactId NewRelminMxid;
 172         bool            skippedallvis;  /* skipped an all-visible page range? */
 173
 174         /* Error reporting state (vacrel is the arg of vacuum_error_callback) */
 175         char       *relnamespace;
 176         char       *relname;
 177         char       *indname;            /* Current index name */
 178         BlockNumber blkno;                      /* used only for heap operations */
 179         OffsetNumber offnum;            /* used only for heap operations */
 180         VacErrPhase phase;
 181         bool            verbose;                /* VACUUM VERBOSE? */
 182
 183         /*
 184          * dead_items stores TIDs whose index tuples are deleted by index
 185          * vacuuming. Each TID points to an LP_DEAD line pointer from a heap page
 186          * that has been processed by lazy_scan_prune.  Also needed by
 187          * lazy_vacuum_heap_rel, which marks the same LP_DEAD line pointers as
 188          * LP_UNUSED during second heap pass.
 189          */
 190         VacDeadItems *dead_items;       /* TIDs whose index tuples we'll delete */
 191         BlockNumber rel_pages;          /* total number of pages */
 192         BlockNumber scanned_pages;      /* # pages examined (not skipped via VM) */
 193         BlockNumber removed_pages;      /* # pages removed by relation truncation */
 194         BlockNumber frozen_pages;       /* # pages with newly frozen tuples */
 195         BlockNumber lpdead_item_pages;  /* # pages with LP_DEAD items */
 196         BlockNumber missed_dead_pages;  /* # pages with missed dead tuples */
 197         BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
 198
 199         /* Statistics output by us, for table */
 200         double          new_rel_tuples; /* new estimated total # of tuples */
 201         double          new_live_tuples;        /* new estimated total # of live tuples */
 202         /* Statistics output by index AMs */
 203         IndexBulkDeleteResult **indstats;
 204
 205         /* Instrumentation counters */
 206         int                     num_index_scans;
 207         /* Counters that follow are only for scanned_pages */
 208         int64           tuples_deleted; /* # deleted from table */
 209         int64           tuples_frozen;  /* # newly frozen */
 210         int64           lpdead_items;   /* # deleted from indexes */
 211         int64           live_tuples;    /* # live tuples remaining */
 212         int64           recently_dead_tuples;   /* # dead, but not yet removable */
 213         int64           missed_dead_tuples; /* # removable, but not removed */
 214 } LVRelState;
 215
 216 /*
 217  * State returned by lazy_scan_prune() through its prunestate argument
 218  */
 219 typedef struct LVPagePruneState
 220 {
 221         bool            hastup;                 /* Page prevents rel truncation? */
 222         bool            has_lpdead_items;       /* includes existing LP_DEAD items */
 223
 224         /*
 225          * State describes the proper VM bit states to set for the page following
 226          * pruning and freezing.  all_visible implies !has_lpdead_items, but don't
 227          * trust all_frozen result unless all_visible is also set to true.
 228          */
 229         bool            all_visible;    /* Every item visible to all? */
 230         bool            all_frozen;             /* provided all_visible is also true */
 231         TransactionId visibility_cutoff_xid;    /* For recovery conflicts */
 232 } LVPagePruneState;
 233
 234 /* Struct for saving and restoring vacuum error information (cf. LVRelState's blkno, offnum, phase). */
 235 typedef struct LVSavedErrInfo
 236 {
 237         BlockNumber blkno;
 238         OffsetNumber offnum;
 239         VacErrPhase phase;
 240 } LVSavedErrInfo;
 241
 242
 243 /* non-export function prototypes (all static, i.e. private to this file) */
 244 static void lazy_scan_heap(LVRelState *vacrel);
 245 static BlockNumber lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer,
 246                                                                   BlockNumber next_block,
 247                                                                   bool *next_unskippable_allvis,
 248                                                                   bool *skipping_current_range);
 249 static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf,
 250                                                                    BlockNumber blkno, Page page,
 251                                                                    bool sharelock, Buffer vmbuffer);
 252 static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
 253                                                         BlockNumber blkno, Page page,
 254                                                         LVPagePruneState *prunestate);
 255 static bool lazy_scan_noprune(LVRelState *vacrel, Buffer buf,
 256                                                           BlockNumber blkno, Page page,
 257                                                           bool *hastup, bool *recordfreespace);
 258 static void lazy_vacuum(LVRelState *vacrel);
 259 static bool lazy_vacuum_all_indexes(LVRelState *vacrel);
 260 static void lazy_vacuum_heap_rel(LVRelState *vacrel);
 261 static int      lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
 262                                                                   Buffer buffer, int index, Buffer *vmbuffer);
 263 static bool lazy_check_wraparound_failsafe(LVRelState *vacrel);
 264 static void lazy_cleanup_all_indexes(LVRelState *vacrel);
 265 static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
 266                                                                                                         IndexBulkDeleteResult *istat,
 267                                                                                                         double reltuples,
 268                                                                                                         LVRelState *vacrel);
 269 static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel,
 270                                                                                                          IndexBulkDeleteResult *istat,
 271                                                                                                          double reltuples,
 272                                                                                                          bool estimated_count,
 273                                                                                                          LVRelState *vacrel);
 274 static bool should_attempt_truncation(LVRelState *vacrel);
 275 static void lazy_truncate_heap(LVRelState *vacrel);
 276 static BlockNumber count_nondeletable_pages(LVRelState *vacrel,
 277                                                                                         bool *lock_waiter_detected);
 278 static void dead_items_alloc(LVRelState *vacrel, int nworkers);
 279 static void dead_items_cleanup(LVRelState *vacrel);
 280 static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
 281                                                                          TransactionId *visibility_cutoff_xid, bool *all_frozen);
 282 static void update_relstats_all_indexes(LVRelState *vacrel);
 283 static void vacuum_error_callback(void *arg);
 284 static void update_vacuum_error_info(LVRelState *vacrel,
 285                                                                          LVSavedErrInfo *saved_vacrel,
 286                                                                          int phase, BlockNumber blkno,  /* NOTE(review): phase is int here, VacErrPhase in LVRelState */
 287                                                                          OffsetNumber offnum);
 288 static void restore_vacuum_error_info(LVRelState *vacrel,
 289                                                                           const LVSavedErrInfo *saved_vacrel);
 290
 291
 292 /*
 293  *      heap_vacuum_rel() -- perform VACUUM for one heap relation
 294  *
 295  *              This routine sets things up for and then calls lazy_scan_heap, where
 296  *              almost all work actually takes place.  Finalizes everything after call
 297  *              returns by managing relation truncation and updating rel's pg_class
 298  *              entry. (Also updates pg_class entries for any indexes that need it.)
 299  *
 300  *              At entry, we have already established a transaction and opened
 301  *              and locked the relation.
 302  */
 303 void
 304 heap_vacuum_rel(Relation rel, VacuumParams *params,
 305                                 BufferAccessStrategy bstrategy)
 306 {
 307         LVRelState *vacrel;
 308         bool            verbose,
 309                                 instrument,
 310                                 skipwithvm,
 311                                 frozenxid_updated,
 312                                 minmulti_updated;
 313         BlockNumber orig_rel_pages,
 314                                 new_rel_pages,
 315                                 new_rel_allvisible;
 316         PGRUsage        ru0;
 317         TimestampTz starttime = 0;
 318         PgStat_Counter startreadtime = 0,
 319                                 startwritetime = 0;
 320         WalUsage        startwalusage = pgWalUsage;
 321         int64           StartPageHit = VacuumPageHit,
 322                                 StartPageMiss = VacuumPageMiss,
 323                                 StartPageDirty = VacuumPageDirty;
 324         ErrorContextCallback errcallback;
 325         char      **indnames = NULL;
 326
 327         verbose = (params->options & VACOPT_VERBOSE) != 0;
 328         instrument = (verbose || (IsAutoVacuumWorkerProcess() &&
 329                                                           params->log_min_duration >= 0));
 330         if (instrument)
 331         {
 332                 pg_rusage_init(&ru0);
 333                 starttime = GetCurrentTimestamp();
 334                 if (track_io_timing)
 335                 {
 336                         startreadtime = pgStatBlockReadTime;
 337                         startwritetime = pgStatBlockWriteTime;
 338                 }
 339         }
 340
 341         pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
 342                                                                   RelationGetRelid(rel));
 343
 344         /*
 345          * Setup error traceback support for ereport() first.  The idea is to set
 346          * up an error context callback to display additional information on any
 347          * error during a vacuum.  During different phases of vacuum, we update
 348          * the state so that the error context callback always displays current
 349          * information.
 350          *
 351          * Copy the names of heap rel into local memory for error reporting
 352          * purposes, too.  It isn't always safe to assume that we can get the name
 353          * of each rel.  It's convenient for code in lazy_scan_heap to always use
 354          * these temp copies.
 355          */
 356         vacrel = (LVRelState *) palloc0(sizeof(LVRelState));
 357         vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel));
 358         vacrel->relname = pstrdup(RelationGetRelationName(rel));
 359         vacrel->indname = NULL;
 360         vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN;
 361         vacrel->verbose = verbose;
 362         errcallback.callback = vacuum_error_callback;
 363         errcallback.arg = vacrel;
 364         errcallback.previous = error_context_stack;
 365         error_context_stack = &errcallback;
 366
 367         /* Set up high level stuff about rel and its indexes */
 368         vacrel->rel = rel;
 369         vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes,
 370                                          &vacrel->indrels);
 371         vacrel->bstrategy = bstrategy;
 372         if (instrument && vacrel->nindexes > 0)
 373         {
 374                 /* Copy index names used by instrumentation (not error reporting) */
 375                 indnames = palloc(sizeof(char *) * vacrel->nindexes);
 376                 for (int i = 0; i < vacrel->nindexes; i++)
 377                         indnames[i] = pstrdup(RelationGetRelationName(vacrel->indrels[i]));
 378         }
 379
 380         /*
 381          * The index_cleanup param either disables index vacuuming and cleanup or
 382          * forces it to go ahead when we would otherwise apply the index bypass
 383          * optimization.  The default is 'auto', which leaves the final decision
 384          * up to lazy_vacuum().
 385          *
 386          * The truncate param allows user to avoid attempting relation truncation,
 387          * though it can't force truncation to happen.
 388          */
 389         Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED);
 390         Assert(params->truncate != VACOPTVALUE_UNSPECIFIED &&
 391                    params->truncate != VACOPTVALUE_AUTO);
 392         vacrel->failsafe_active = false;
 393         vacrel->consider_bypass_optimization = true;
 394         vacrel->do_index_vacuuming = true;
 395         vacrel->do_index_cleanup = true;
 396         vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED);
 397         if (params->index_cleanup == VACOPTVALUE_DISABLED)
 398         {
 399                 /* Force disable index vacuuming up-front */
 400                 vacrel->do_index_vacuuming = false;
 401                 vacrel->do_index_cleanup = false;
 402         }
 403         else if (params->index_cleanup == VACOPTVALUE_ENABLED)
 404         {
 405                 /* Force index vacuuming.  Note that failsafe can still bypass. */
 406                 vacrel->consider_bypass_optimization = false;
 407         }
 408         else
 409         {
 410                 /* Default/auto, make all decisions dynamically */
 411                 Assert(params->index_cleanup == VACOPTVALUE_AUTO);
 412         }
 413
 414         /* Initialize page counters explicitly (be tidy) */
 415         vacrel->scanned_pages = 0;
 416         vacrel->removed_pages = 0;
 417         vacrel->frozen_pages = 0;
 418         vacrel->lpdead_item_pages = 0;
 419         vacrel->missed_dead_pages = 0;
 420         vacrel->nonempty_pages = 0;
 421         /* dead_items_alloc allocates vacrel->dead_items later on */
 422
 423         /* Allocate/initialize output statistics state */
 424         vacrel->new_rel_tuples = 0;
 425         vacrel->new_live_tuples = 0;
 426         vacrel->indstats = (IndexBulkDeleteResult **)
 427                 palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *));
 428
 429         /* Initialize remaining counters (be tidy) */
 430         vacrel->num_index_scans = 0;
 431         vacrel->tuples_deleted = 0;
 432         vacrel->tuples_frozen = 0;
 433         vacrel->lpdead_items = 0;
 434         vacrel->live_tuples = 0;
 435         vacrel->recently_dead_tuples = 0;
 436         vacrel->missed_dead_tuples = 0;
 437
 438         /*
 439          * Get cutoffs that determine which deleted tuples are considered DEAD,
 440          * not just RECENTLY_DEAD, and which XIDs/MXIDs to freeze.  Then determine
 441          * the extent of the blocks that we'll scan in lazy_scan_heap.  It has to
 442          * happen in this order to ensure that the OldestXmin cutoff field works
 443          * as an upper bound on the XIDs stored in the pages we'll actually scan
 444          * (NewRelfrozenXid tracking must never be allowed to miss unfrozen XIDs).
 445          *
 446          * Next acquire vistest, a related cutoff that's used in heap_page_prune.
 447          * We expect vistest will always make heap_page_prune remove any deleted
 448          * tuple whose xmax is < OldestXmin.  lazy_scan_prune must never become
 449          * confused about whether a tuple should be frozen or removed.  (In the
 450          * future we might want to teach lazy_scan_prune to recompute vistest from
 451          * time to time, to increase the number of dead tuples it can prune away.)
 452          */
 453         vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs);
 454         vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
 455         vacrel->vistest = GlobalVisTestFor(rel);
 456         /* Initialize state used to track oldest extant XID/MXID */
 457         vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin;
 458         vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact;
 459         vacrel->skippedallvis = false;
 460         skipwithvm = true;
 461         if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
 462         {
 463                 /*
 464                  * Force aggressive mode, and disable skipping blocks using the
 465                  * visibility map (even those set all-frozen)
 466                  */
 467                 vacrel->aggressive = true;
 468                 skipwithvm = false;
 469         }
 470
 471         vacrel->skipwithvm = skipwithvm;
 472
 473         if (verbose)
 474         {
 475                 if (vacrel->aggressive)
 476                         ereport(INFO,
 477                                         (errmsg("aggressively vacuuming \"%s.%s.%s\"",
 478                                                         get_database_name(MyDatabaseId),
 479                                                         vacrel->relnamespace, vacrel->relname)));
 480                 else
 481                         ereport(INFO,
 482                                         (errmsg("vacuuming \"%s.%s.%s\"",
 483                                                         get_database_name(MyDatabaseId),
 484                                                         vacrel->relnamespace, vacrel->relname)));
 485         }
 486
 487         /*
 488          * Allocate dead_items array memory using dead_items_alloc.  This handles
 489          * parallel VACUUM initialization as part of allocating shared memory
 490          * space used for dead_items.  (But do a failsafe precheck first, to
 491          * ensure that parallel VACUUM won't be attempted at all when relfrozenxid
 492          * is already dangerously old.)
 493          */
 494         lazy_check_wraparound_failsafe(vacrel);
 495         dead_items_alloc(vacrel, params->nworkers);
 496
 497         /*
 498          * Call lazy_scan_heap to perform all required heap pruning, index
 499          * vacuuming, and heap vacuuming (plus related processing)
 500          */
 501         lazy_scan_heap(vacrel);
 502
 503         /*
 504          * Free resources managed by dead_items_alloc.  This ends parallel mode in
 505          * passing when necessary.
 506          */
 507         dead_items_cleanup(vacrel);
 508         Assert(!IsInParallelMode());
 509
 510         /*
 511          * Update pg_class entries for each of rel's indexes where appropriate.
 512          *
 513          * Unlike the later update to rel's pg_class entry, this is not critical.
 514          * Maintains relpages/reltuples statistics used by the planner only.
 515          */
 516         if (vacrel->do_index_cleanup)
 517                 update_relstats_all_indexes(vacrel);
 518
 519         /* Done with rel's indexes */
 520         vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock);
 521
 522         /* Optionally truncate rel */
 523         if (should_attempt_truncation(vacrel))
 524                 lazy_truncate_heap(vacrel);
 525
 526         /* Pop the error context stack */
 527         error_context_stack = errcallback.previous;
 528
 529         /* Report that we are now doing final cleanup */
 530         pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
 531                                                                  PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
 532
 533         /*
 534          * Prepare to update rel's pg_class entry.
 535          *
 536          * Aggressive VACUUMs must always be able to advance relfrozenxid to a
 537          * value >= FreezeLimit, and relminmxid to a value >= MultiXactCutoff.
 538          * Non-aggressive VACUUMs may advance them by any amount, or not at all.
 539          */
 540         Assert(vacrel->NewRelfrozenXid == vacrel->cutoffs.OldestXmin ||
 541                    TransactionIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.FreezeLimit :
 542                                                                                  vacrel->cutoffs.relfrozenxid,
 543                                                                                  vacrel->NewRelfrozenXid));
 544         Assert(vacrel->NewRelminMxid == vacrel->cutoffs.OldestMxact ||
 545                    MultiXactIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.MultiXactCutoff :
 546                                                                            vacrel->cutoffs.relminmxid,
 547                                                                            vacrel->NewRelminMxid));
 548         if (vacrel->skippedallvis)
 549         {
 550                 /*
 551                  * Must keep original relfrozenxid in a non-aggressive VACUUM that
 552                  * chose to skip an all-visible page range.  The state that tracks new
 553                  * values will have missed unfrozen XIDs from the pages we skipped.
 554                  */
 555                 Assert(!vacrel->aggressive);
 556                 vacrel->NewRelfrozenXid = InvalidTransactionId;
 557                 vacrel->NewRelminMxid = InvalidMultiXactId;
 558         }
 559
 560         /*
 561          * For safety, clamp relallvisible to be not more than what we're setting
 562          * pg_class.relpages to
 563          */
 564         new_rel_pages = vacrel->rel_pages;      /* After possible rel truncation */
 565         visibilitymap_count(rel, &new_rel_allvisible, NULL);
 566         if (new_rel_allvisible > new_rel_pages)
 567                 new_rel_allvisible = new_rel_pages;
 568
 569         /*
 570          * Now actually update rel's pg_class entry.
 571          *
 572          * In principle new_live_tuples could be -1 indicating that we (still)
 573          * don't know the tuple count.  In practice that can't happen, since we
 574          * scan every page that isn't skipped using the visibility map.
 575          */
 576         vac_update_relstats(rel, new_rel_pages, vacrel->new_live_tuples,
 577                                                 new_rel_allvisible, vacrel->nindexes > 0,
 578                                                 vacrel->NewRelfrozenXid, vacrel->NewRelminMxid,
 579                                                 &frozenxid_updated, &minmulti_updated, false);
 580
 581         /*
 582          * Report results to the cumulative stats system, too.
 583          *
 584          * Deliberately avoid telling the stats system about LP_DEAD items that
 585          * remain in the table due to VACUUM bypassing index and heap vacuuming.
 586          * ANALYZE will consider the remaining LP_DEAD items to be dead "tuples".
 587          * It seems like a good idea to err on the side of not vacuuming again too
 588          * soon in cases where the failsafe prevented significant amounts of heap
 589          * vacuuming.
 590          */
 591         pgstat_report_vacuum(RelationGetRelid(rel),
 592                                                  rel->rd_rel->relisshared,
 593                                                  Max(vacrel->new_live_tuples, 0),
 594                                                  vacrel->recently_dead_tuples +
 595                                                  vacrel->missed_dead_tuples);
 596         pgstat_progress_end_command();
 597
 598         if (instrument)
 599         {
 600                 TimestampTz endtime = GetCurrentTimestamp();
 601
 602                 if (verbose || params->log_min_duration == 0 ||
 603                         TimestampDifferenceExceeds(starttime, endtime,
 604                                                                            params->log_min_duration))
 605                 {
 606                         long            secs_dur;
 607                         int                     usecs_dur;
 608                         WalUsage        walusage;
 609                         StringInfoData buf;
 610                         char       *msgfmt;
 611                         int32           diff;
 612                         int64           PageHitOp = VacuumPageHit - StartPageHit,
 613                                                 PageMissOp = VacuumPageMiss - StartPageMiss,
 614                                                 PageDirtyOp = VacuumPageDirty - StartPageDirty;
 615                         double          read_rate = 0,
 616                                                 write_rate = 0;
 617
 618                         TimestampDifference(starttime, endtime, &secs_dur, &usecs_dur);
 619                         memset(&walusage, 0, sizeof(WalUsage));
 620                         WalUsageAccumDiff(&walusage, &pgWalUsage, &startwalusage);
 621
 622                         initStringInfo(&buf);
 623                         if (verbose)
 624                         {
 625                                 /*
 626                                  * Aggressiveness already reported earlier, in dedicated
 627                                  * VACUUM VERBOSE ereport
 628                                  */
 629                                 Assert(!params->is_wraparound);
 630                                 msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n");
 631                         }
 632                         else if (params->is_wraparound)
 633                         {
 634                                 /*
 635                                  * While it's possible for a VACUUM to be both is_wraparound
 636                                  * and !aggressive, that's just a corner-case -- is_wraparound
 637                                  * implies aggressive.  Produce distinct output for the corner
 638                                  * case all the same, just in case.
 639                                  */
 640                                 if (vacrel->aggressive)
 641                                         msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
 642                                 else
 643                                         msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
 644                         }
 645                         else
 646                         {
 647                                 if (vacrel->aggressive)
 648                                         msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
 649                                 else
 650                                         msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
 651                         }
 652                         appendStringInfo(&buf, msgfmt,
 653                                                          get_database_name(MyDatabaseId),
 654                                                          vacrel->relnamespace,
 655                                                          vacrel->relname,
 656                                                          vacrel->num_index_scans);
 657                         appendStringInfo(&buf, _("pages: %u removed, %u remain, %u scanned (%.2f%% of total)\n"),
 658                                                          vacrel->removed_pages,
 659                                                          new_rel_pages,
 660                                                          vacrel->scanned_pages,
 661                                                          orig_rel_pages == 0 ? 100.0 :
 662                                                          100.0 * vacrel->scanned_pages / orig_rel_pages);
 663                         appendStringInfo(&buf,
 664                                                          _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable\n"),
 665                                                          (long long) vacrel->tuples_deleted,
 666                                                          (long long) vacrel->new_rel_tuples,
 667                                                          (long long) vacrel->recently_dead_tuples);
 668                         if (vacrel->missed_dead_tuples > 0)
 669                                 appendStringInfo(&buf,
 670                                                                  _("tuples missed: %lld dead from %u pages not removed due to cleanup lock contention\n"),
 671                                                                  (long long) vacrel->missed_dead_tuples,
 672                                                                  vacrel->missed_dead_pages);
 673                         diff = (int32) (ReadNextTransactionId() -
 674                                                         vacrel->cutoffs.OldestXmin);
 675                         appendStringInfo(&buf,
 676                                                          _("removable cutoff: %u, which was %d XIDs old when operation ended\n"),
 677                                                          vacrel->cutoffs.OldestXmin, diff);
 678                         if (frozenxid_updated)
 679                         {
 680                                 diff = (int32) (vacrel->NewRelfrozenXid -
 681                                                                 vacrel->cutoffs.relfrozenxid);
 682                                 appendStringInfo(&buf,
 683                                                                  _("new relfrozenxid: %u, which is %d XIDs ahead of previous value\n"),
 684                                                                  vacrel->NewRelfrozenXid, diff);
 685                         }
 686                         if (minmulti_updated)
 687                         {
 688                                 diff = (int32) (vacrel->NewRelminMxid -
 689                                                                 vacrel->cutoffs.relminmxid);
 690                                 appendStringInfo(&buf,
 691                                                                  _("new relminmxid: %u, which is %d MXIDs ahead of previous value\n"),
 692                                                                  vacrel->NewRelminMxid, diff);
 693                         }
 694                         appendStringInfo(&buf, _("frozen: %u pages from table (%.2f%% of total) had %lld tuples frozen\n"),
 695                                                          vacrel->frozen_pages,
 696                                                          orig_rel_pages == 0 ? 100.0 :
 697                                                          100.0 * vacrel->frozen_pages / orig_rel_pages,
 698                                                          (long long) vacrel->tuples_frozen);
 699                         if (vacrel->do_index_vacuuming)
 700                         {
 701                                 if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0)
 702                                         appendStringInfoString(&buf, _("index scan not needed: "));
 703                                 else
 704                                         appendStringInfoString(&buf, _("index scan needed: "));
 705
 706                                 msgfmt = _("%u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n");
 707                         }
 708                         else
 709                         {
 710                                 if (!vacrel->failsafe_active)
 711                                         appendStringInfoString(&buf, _("index scan bypassed: "));
 712                                 else
 713                                         appendStringInfoString(&buf, _("index scan bypassed by failsafe: "));
 714
 715                                 msgfmt = _("%u pages from table (%.2f%% of total) have %lld dead item identifiers\n");
 716                         }
 717                         appendStringInfo(&buf, msgfmt,
 718                                                          vacrel->lpdead_item_pages,
 719                                                          orig_rel_pages == 0 ? 100.0 :
 720                                                          100.0 * vacrel->lpdead_item_pages / orig_rel_pages,
 721                                                          (long long) vacrel->lpdead_items);
 722                         for (int i = 0; i < vacrel->nindexes; i++)
 723                         {
 724                                 IndexBulkDeleteResult *istat = vacrel->indstats[i];
 725
 726                                 if (!istat)
 727                                         continue;
 728
 729                                 appendStringInfo(&buf,
 730                                                                  _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
 731                                                                  indnames[i],
 732                                                                  istat->num_pages,
 733                                                                  istat->pages_newly_deleted,
 734                                                                  istat->pages_deleted,
 735                                                                  istat->pages_free);
 736                         }
 737                         if (track_io_timing)
 738                         {
 739                                 double          read_ms = (double) (pgStatBlockReadTime - startreadtime) / 1000;
 740                                 double          write_ms = (double) (pgStatBlockWriteTime - startwritetime) / 1000;
 741
 742                                 appendStringInfo(&buf, _("I/O timings: read: %.3f ms, write: %.3f ms\n"),
 743                                                                  read_ms, write_ms);
 744                         }
 745                         if (secs_dur > 0 || usecs_dur > 0)
 746                         {
 747                                 read_rate = (double) BLCKSZ * PageMissOp / (1024 * 1024) /
 748                                         (secs_dur + usecs_dur / 1000000.0);
 749                                 write_rate = (double) BLCKSZ * PageDirtyOp / (1024 * 1024) /
 750                                         (secs_dur + usecs_dur / 1000000.0);
 751                         }
 752                         appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
 753                                                          read_rate, write_rate);
 754                         appendStringInfo(&buf,
 755                                                          _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
 756                                                          (long long) PageHitOp,
 757                                                          (long long) PageMissOp,
 758                                                          (long long) PageDirtyOp);
 759                         appendStringInfo(&buf,
 760                                                          _("WAL usage: %lld records, %lld full page images, %llu bytes\n"),
 761                                                          (long long) walusage.wal_records,
 762                                                          (long long) walusage.wal_fpi,
 763                                                          (unsigned long long) walusage.wal_bytes);
 764                         appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
 765
 766                         ereport(verbose ? INFO : LOG,
 767                                         (errmsg_internal("%s", buf.data)));
 768                         pfree(buf.data);
 769                 }
 770         }
 771
 772         /* Cleanup index statistics and index names */
 773         for (int i = 0; i < vacrel->nindexes; i++)
 774         {
 775                 if (vacrel->indstats[i])
 776                         pfree(vacrel->indstats[i]);
 777
 778                 if (instrument)
 779                         pfree(indnames[i]);
 780         }
 781 }
 782
 783 /*
 784  *      lazy_scan_heap() -- workhorse function for VACUUM
 785  *
 786  *              This routine prunes each page in the heap, and considers the need to
 787  *              freeze remaining tuples with storage (not including pages that can be
 788  *              skipped using the visibility map).  Also performs related maintenance
 789  *              of the FSM and visibility map.  These steps all take place during an
 790  *              initial pass over the target heap relation.
 791  *
 792  *              Also invokes lazy_vacuum_all_indexes to vacuum indexes, which largely
 793  *              consists of deleting index tuples that point to LP_DEAD items left in
 794  *              heap pages following pruning.  The earlier initial pass over the heap
 795  *              will have collected the TIDs whose index tuples need to be removed.
 796  *
 797  *              Finally, invokes lazy_vacuum_heap_rel to vacuum heap pages, which
 798  *              largely consists of marking LP_DEAD items (from collected TID array)
 799  *              as LP_UNUSED.  This has to happen in a second, final pass over the
 800  *              heap, to preserve a basic invariant that all index AMs rely on: no
 801  *              extant index tuple can ever be allowed to contain a TID that points to
 802  *              an LP_UNUSED line pointer in the heap.  We must disallow premature
 803  *              recycling of line pointers to avoid index scans that get confused
 804  *              about which TID points to which tuple immediately after recycling.
 805  *              (Actually, this isn't a concern when the target heap relation happens
 806  *              to have no indexes, which allows us to safely apply the one-pass
 807  *              strategy as an optimization).
 808  *
 809  *              In practice we often have enough space to fit all TIDs, and so won't
 810  *              need to call lazy_vacuum more than once, after our initial pass over
 811  *              the heap has totally finished.  Otherwise things are slightly more
 812  *              complicated: our "initial pass" over the heap applies only to those
 813  *              pages that were pruned before we needed to call lazy_vacuum, and our
 814  *              "final pass" over the heap only vacuums these same heap pages.
 815  *              However, we process indexes in full every time lazy_vacuum is called,
 816  *              which makes index processing very inefficient when memory is in short
 817  *              supply.
 818  */
 819 static void
 820 lazy_scan_heap(LVRelState *vacrel)
 821 {
 822         BlockNumber rel_pages = vacrel->rel_pages,
 823                                 blkno,
 824                                 next_unskippable_block,
 825                                 next_fsm_block_to_vacuum = 0;
 826         VacDeadItems *dead_items = vacrel->dead_items;
 827         Buffer          vmbuffer = InvalidBuffer;
 828         bool            next_unskippable_allvis,
 829                                 skipping_current_range;
 830         const int       initprog_index[] = {
 831                 PROGRESS_VACUUM_PHASE,
 832                 PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
 833                 PROGRESS_VACUUM_MAX_DEAD_TUPLES
 834         };
 835         int64           initprog_val[3];
 836
 837         /* Report that we're scanning the heap, advertising total # of blocks */
 838         initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
 839         initprog_val[1] = rel_pages;
 840         initprog_val[2] = dead_items->max_items;
 841         pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
 842
 843         /* Set up an initial range of skippable blocks using the visibility map */
 844         next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, 0,
 845                                                                                         &next_unskippable_allvis,
 846                                                                                         &skipping_current_range);
 847         for (blkno = 0; blkno < rel_pages; blkno++)
 848         {
 849                 Buffer          buf;
 850                 Page            page;
 851                 bool            all_visible_according_to_vm;
 852                 LVPagePruneState prunestate;
 853
 854                 if (blkno == next_unskippable_block)
 855                 {
 856                         /*
 857                          * Can't skip this page safely.  Must scan the page.  But
 858                          * determine the next skippable range after the page first.
 859                          */
 860                         all_visible_according_to_vm = next_unskippable_allvis;
 861                         next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer,
 862                                                                                                         blkno + 1,
 863                                                                                                         &next_unskippable_allvis,
 864                                                                                                         &skipping_current_range);
 865
 866                         Assert(next_unskippable_block >= blkno + 1);
 867                 }
 868                 else
 869                 {
 870                         /* Last page always scanned (may need to set nonempty_pages) */
 871                         Assert(blkno < rel_pages - 1);
 872
 873                         if (skipping_current_range)
 874                                 continue;
 875
 876                         /* Current range is too small to skip -- just scan the page */
 877                         all_visible_according_to_vm = true;
 878                 }
 879
 880                 vacrel->scanned_pages++;
 881
 882                 /* Report as block scanned, update error traceback information */
 883                 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
 884                 update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
 885                                                                  blkno, InvalidOffsetNumber);
 886
 887                 vacuum_delay_point();
 888
 889                 /*
 890                  * Regularly check if wraparound failsafe should trigger.
 891                  *
 892                  * There is a similar check inside lazy_vacuum_all_indexes(), but
 893                  * relfrozenxid might start to look dangerously old before we reach
 894                  * that point.  This check also provides failsafe coverage for the
 895                  * one-pass strategy, and the two-pass strategy with the index_cleanup
 896                  * param set to 'off'.
 897                  */
 898                 if (vacrel->scanned_pages % FAILSAFE_EVERY_PAGES == 0)
 899                         lazy_check_wraparound_failsafe(vacrel);
 900
 901                 /*
 902                  * Consider if we definitely have enough space to process TIDs on page
 903                  * already.  If we are close to overrunning the available space for
 904                  * dead_items TIDs, pause and do a cycle of vacuuming before we tackle
 905                  * this page.
 906                  */
 907                 Assert(dead_items->max_items >= MaxHeapTuplesPerPage);
 908                 if (dead_items->max_items - dead_items->num_items < MaxHeapTuplesPerPage)
 909                 {
 910                         /*
 911                          * Before beginning index vacuuming, we release any pin we may
 912                          * hold on the visibility map page.  This isn't necessary for
 913                          * correctness, but we do it anyway to avoid holding the pin
 914                          * across a lengthy, unrelated operation.
 915                          */
 916                         if (BufferIsValid(vmbuffer))
 917                         {
 918                                 ReleaseBuffer(vmbuffer);
 919                                 vmbuffer = InvalidBuffer;
 920                         }
 921
 922                         /* Perform a round of index and heap vacuuming */
 923                         vacrel->consider_bypass_optimization = false;
 924                         lazy_vacuum(vacrel);
 925
 926                         /*
 927                          * Vacuum the Free Space Map to make newly-freed space visible on
 928                          * upper-level FSM pages.  Note we have not yet processed blkno.
 929                          */
 930                         FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
 931                                                                         blkno);
 932                         next_fsm_block_to_vacuum = blkno;
 933
 934                         /* Report that we are once again scanning the heap */
 935                         pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
 936                                                                                  PROGRESS_VACUUM_PHASE_SCAN_HEAP);
 937                 }
 938
 939                 /*
 940                  * Pin the visibility map page in case we need to mark the page
 941                  * all-visible.  In most cases this will be very cheap, because we'll
 942                  * already have the correct page pinned anyway.
 943                  */
 944                 visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
 945
 946                 /* Finished preparatory checks.  Actually scan the page. */
 947                 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno,
 948                                                                  RBM_NORMAL, vacrel->bstrategy);
 949                 page = BufferGetPage(buf);
 950
 951                 /*
 952                  * We need a buffer cleanup lock to prune HOT chains and defragment
 953                  * the page in lazy_scan_prune.  But when it's not possible to acquire
 954                  * a cleanup lock right away, we may be able to settle for reduced
 955                  * processing using lazy_scan_noprune.
 956                  */
 957                 if (!ConditionalLockBufferForCleanup(buf))
 958                 {
 959                         bool            hastup,
 960                                                 recordfreespace;
 961
 962                         LockBuffer(buf, BUFFER_LOCK_SHARE);
 963
 964                         /* Check for new or empty pages before lazy_scan_noprune call */
 965                         if (lazy_scan_new_or_empty(vacrel, buf, blkno, page, true,
 966                                                                            vmbuffer))
 967                         {
 968                                 /* Processed as new/empty page (lock and pin released) */
 969                                 continue;
 970                         }
 971
 972                         /* Collect LP_DEAD items in dead_items array, count tuples */
 973                         if (lazy_scan_noprune(vacrel, buf, blkno, page, &hastup,
 974                                                                   &recordfreespace))
 975                         {
 976                                 Size            freespace = 0;
 977
 978                                 /*
 979                                  * Processed page successfully (without cleanup lock) -- just
 980                                  * need to perform rel truncation and FSM steps, much like the
 981                                  * lazy_scan_prune case.  Don't bother trying to match its
 982                                  * visibility map setting steps, though.
 983                                  */
 984                                 if (hastup)
 985                                         vacrel->nonempty_pages = blkno + 1;
 986                                 if (recordfreespace)
 987                                         freespace = PageGetHeapFreeSpace(page);
 988                                 UnlockReleaseBuffer(buf);
 989                                 if (recordfreespace)
 990                                         RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
 991                                 continue;
 992                         }
 993
 994                         /*
 995                          * lazy_scan_noprune could not do all required processing.  Wait
 996                          * for a cleanup lock, and call lazy_scan_prune in the usual way.
 997                          */
 998                         Assert(vacrel->aggressive);
 999                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1000                         LockBufferForCleanup(buf);
1001                 }
1002
1003                 /* Check for new or empty pages before lazy_scan_prune call */
1004                 if (lazy_scan_new_or_empty(vacrel, buf, blkno, page, false, vmbuffer))
1005                 {
1006                         /* Processed as new/empty page (lock and pin released) */
1007                         continue;
1008                 }
1009
1010                 /*
1011                  * Prune, freeze, and count tuples.
1012                  *
1013                  * Accumulates details of remaining LP_DEAD line pointers on page in
1014                  * dead_items array.  This includes LP_DEAD line pointers that we
1015                  * pruned ourselves, as well as existing LP_DEAD line pointers that
1016                  * were pruned some time earlier.  Also considers freezing XIDs in the
1017                  * tuple headers of remaining items with storage.
1018                  */
1019                 lazy_scan_prune(vacrel, buf, blkno, page, &prunestate);
1020
1021                 Assert(!prunestate.all_visible || !prunestate.has_lpdead_items);
1022
1023                 /* Remember the location of the last page with nonremovable tuples */
1024                 if (prunestate.hastup)
1025                         vacrel->nonempty_pages = blkno + 1;
1026
1027                 if (vacrel->nindexes == 0)
1028                 {
1029                         /*
1030                          * Consider the need to do page-at-a-time heap vacuuming when
1031                          * using the one-pass strategy now.
1032                          *
1033                          * The one-pass strategy will never call lazy_vacuum().  The steps
1034                          * performed here can be thought of as the one-pass equivalent of
1035                          * a call to lazy_vacuum().
1036                          */
1037                         if (prunestate.has_lpdead_items)
1038                         {
1039                                 Size            freespace;
1040
1041                                 lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer);
1042
1043                                 /* Forget the LP_DEAD items that we just vacuumed */
1044                                 dead_items->num_items = 0;
1045
1046                                 /*
1047                                  * Periodically perform FSM vacuuming to make newly-freed
1048                                  * space visible on upper FSM pages.  Note we have not yet
1049                                  * performed FSM processing for blkno.
1050                                  */
1051                                 if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
1052                                 {
1053                                         FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1054                                                                                         blkno);
1055                                         next_fsm_block_to_vacuum = blkno;
1056                                 }
1057
1058                                 /*
1059                                  * Now perform FSM processing for blkno, and move on to next
1060                                  * page.
1061                                  *
1062                                  * Our call to lazy_vacuum_heap_page() will have considered if
1063                                  * it's possible to set all_visible/all_frozen independently
1064                                  * of lazy_scan_prune().  Note that prunestate was invalidated
1065                                  * by lazy_vacuum_heap_page() call.
1066                                  */
1067                                 freespace = PageGetHeapFreeSpace(page);
1068
1069                                 UnlockReleaseBuffer(buf);
1070                                 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1071                                 continue;
1072                         }
1073
1074                         /*
1075                          * There was no call to lazy_vacuum_heap_page() because pruning
1076                          * didn't encounter/create any LP_DEAD items that needed to be
1077                          * vacuumed.  Prune state has not been invalidated, so proceed
1078                          * with prunestate-driven visibility map and FSM steps (just like
1079                          * the two-pass strategy).
1080                          */
1081                         Assert(dead_items->num_items == 0);
1082                 }
1083
1084                 /*
1085                  * Handle setting visibility map bit based on information from the VM
1086                  * (as of last lazy_scan_skip() call), and from prunestate
1087                  */
1088                 if (!all_visible_according_to_vm && prunestate.all_visible)
1089                 {
1090                         uint8           flags = VISIBILITYMAP_ALL_VISIBLE;
1091
1092                         if (prunestate.all_frozen)
1093                                 flags |= VISIBILITYMAP_ALL_FROZEN;
1094
1095                         /*
1096                          * It should never be the case that the visibility map page is set
1097                          * while the page-level bit is clear, but the reverse is allowed
1098                          * (if checksums are not enabled).  Regardless, set both bits so
1099                          * that we get back in sync.
1100                          *
1101                          * NB: If the heap page is all-visible but the VM bit is not set,
1102                          * we don't need to dirty the heap page.  However, if checksums
1103                          * are enabled, we do need to make sure that the heap page is
1104                          * dirtied before passing it to visibilitymap_set(), because it
1105                          * may be logged.  Given that this situation should only happen in
1106                          * rare cases after a crash, it is not worth optimizing.
1107                          */
1108                         PageSetAllVisible(page);
1109                         MarkBufferDirty(buf);
1110                         visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1111                                                           vmbuffer, prunestate.visibility_cutoff_xid,
1112                                                           flags);
1113                 }
1114
1115                 /*
1116                  * As of PostgreSQL 9.2, the visibility map bit should never be set if
1117                  * the page-level bit is clear.  However, it's possible that the bit
1118                  * got cleared after lazy_scan_skip() was called, so we must recheck
1119                  * with buffer lock before concluding that the VM is corrupt.
1120                  */
1121                 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1122                                  && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1123                 {
1124                         elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1125                                  vacrel->relname, blkno);
1126                         visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1127                                                                 VISIBILITYMAP_VALID_BITS);
1128                 }
1129
1130                 /*
1131                  * It's possible for the value returned by
1132                  * GetOldestNonRemovableTransactionId() to move backwards, so it's not
1133                  * wrong for us to see tuples that appear to not be visible to
1134                  * everyone yet, while PD_ALL_VISIBLE is already set. The real safe
1135                  * xmin value never moves backwards, but
1136                  * GetOldestNonRemovableTransactionId() is conservative and sometimes
1137                  * returns a value that's unnecessarily small, so if we see that
1138                  * contradiction it just means that the tuples that we think are not
1139                  * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
1140                  * is correct.
1141                  *
1142                  * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE
1143                  * set, however.
1144                  */
1145                 else if (prunestate.has_lpdead_items && PageIsAllVisible(page))
1146                 {
1147                         elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u",
1148                                  vacrel->relname, blkno);
1149                         PageClearAllVisible(page);
1150                         MarkBufferDirty(buf);
1151                         visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1152                                                                 VISIBILITYMAP_VALID_BITS);
1153                 }
1154
1155                 /*
1156                  * If the all-visible page is all-frozen but not marked as such yet,
1157                  * mark it as all-frozen.  Note that all_frozen is only valid if
1158                  * all_visible is true, so we must check both prunestate fields.
1159                  */
1160                 else if (all_visible_according_to_vm && prunestate.all_visible &&
1161                                  prunestate.all_frozen &&
1162                                  !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1163                 {
1164                         /*
1165                          * We can pass InvalidTransactionId as the cutoff XID here,
1166                          * because setting the all-frozen bit doesn't cause recovery
1167                          * conflicts.
1168                          */
1169                         visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1170                                                           vmbuffer, InvalidTransactionId,
1171                                                           VISIBILITYMAP_ALL_FROZEN);
1172                 }
1173
1174                 /*
1175                  * Final steps for block: drop cleanup lock, record free space in the
1176                  * FSM
1177                  */
1178                 if (prunestate.has_lpdead_items && vacrel->do_index_vacuuming)
1179                 {
1180                         /*
1181                          * Wait until lazy_vacuum_heap_rel() to save free space.  This
1182                          * doesn't just save us some cycles; it also allows us to record
1183                          * any additional free space that lazy_vacuum_heap_page() will
1184                          * make available in cases where it's possible to truncate the
1185                          * page's line pointer array.
1186                          *
1187                          * Note: It's not in fact 100% certain that we really will call
1188                          * lazy_vacuum_heap_rel() -- lazy_vacuum() might yet opt to skip
1189                          * index vacuuming (and so must skip heap vacuuming).  This is
1190                          * deemed okay because it only happens in emergencies, or when
1191                          * there is very little free space anyway. (Besides, we start
1192                          * recording free space in the FSM once index vacuuming has been
1193                          * abandoned.)
1194                          *
1195                          * Note: The one-pass (no indexes) case is only supposed to make
1196                          * it this far when there were no LP_DEAD items during pruning.
1197                          */
1198                         Assert(vacrel->nindexes > 0);
1199                         UnlockReleaseBuffer(buf);
1200                 }
1201                 else
1202                 {
1203                         Size            freespace = PageGetHeapFreeSpace(page);
1204
1205                         UnlockReleaseBuffer(buf);
1206                         RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1207                 }
1208         }
1209
1210         vacrel->blkno = InvalidBlockNumber;
1211         if (BufferIsValid(vmbuffer))
1212                 ReleaseBuffer(vmbuffer);
1213
1214         /* report that everything is now scanned */
1215         pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1216
1217         /* now we can compute the new value for pg_class.reltuples */
1218         vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, rel_pages,
1219                                                                                                          vacrel->scanned_pages,
1220                                                                                                          vacrel->live_tuples);
1221
1222         /*
1223          * Also compute the total number of surviving heap entries.  In the
1224          * (unlikely) scenario that new_live_tuples is -1, take it as zero.
1225          */
1226         vacrel->new_rel_tuples =
1227                 Max(vacrel->new_live_tuples, 0) + vacrel->recently_dead_tuples +
1228                 vacrel->missed_dead_tuples;
1229
1230         /*
1231          * Do index vacuuming (call each index's ambulkdelete routine), then do
1232          * related heap vacuuming
1233          */
1234         if (dead_items->num_items > 0)
1235                 lazy_vacuum(vacrel);
1236
1237         /*
1238          * Vacuum the remainder of the Free Space Map.  We must do this whether or
1239          * not there were indexes, and whether or not we bypassed index vacuuming.
1240          */
1241         if (blkno > next_fsm_block_to_vacuum)
1242                 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);
1243
1244         /* report all blocks vacuumed */
1245         pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1246
1247         /* Do final index cleanup (call each index's amvacuumcleanup routine) */
1248         if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1249                 lazy_cleanup_all_indexes(vacrel);
1250 }
1251
1252 /*
1253  *      lazy_scan_skip() -- set up range of skippable blocks using visibility map.
1254  *
1255  * lazy_scan_heap() calls here every time it needs to set up a new range of
1256  * blocks to skip via the visibility map.  Caller passes the next block in
1257  * line.  We return a next_unskippable_block for this range.  When there are
1258  * no skippable blocks we just return caller's next_block.  The all-visible
1259  * status of the returned block is set in *next_unskippable_allvis for caller,
1260  * too.  Block usually won't be all-visible (since it's unskippable), but it
1261  * can be during aggressive VACUUMs (as well as in certain edge cases).
1262  *
1263  * Sets *skipping_current_range to indicate if caller should skip this range.
1264  * Costs and benefits drive our decision.  Very small ranges won't be skipped.
1265  *
1266  * Note: our opinion of which blocks can be skipped can go stale immediately.
1267  * It's okay if caller "misses" a page whose all-visible or all-frozen marking
1268  * was concurrently cleared, though.  All that matters is that caller scan all
1269  * pages whose tuples might contain XIDs < OldestXmin, or MXIDs < OldestMxact.
1270  * (Actually, non-aggressive VACUUMs can choose to skip all-visible pages with
1271  * older XIDs/MXIDs.  The vacrel->skippedallvis flag will be set here when the
1272  * choice to skip such a range is actually made, making everything safe.)
1273  */
1274 static BlockNumber
1275 lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, BlockNumber next_block,
1276                            bool *next_unskippable_allvis, bool *skipping_current_range)
1277 {
1278         BlockNumber rel_pages = vacrel->rel_pages,
1279                                 next_unskippable_block = next_block,
1280                                 nskippable_blocks = 0;
1281         bool            skipsallvis = false;
1282
1283         *next_unskippable_allvis = true;
1284         while (next_unskippable_block < rel_pages)
1285         {
1286                 uint8           mapbits = visibilitymap_get_status(vacrel->rel,
1287                                                                                                            next_unskippable_block,
1288                                                                                                            vmbuffer);
1289
1290                 if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
1291                 {
1292                         Assert((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0);
1293                         *next_unskippable_allvis = false;
1294                         break;
1295                 }
1296
1297                 /*
1298                  * Caller must scan the last page to determine whether it has tuples
1299                  * (caller must have the opportunity to set vacrel->nonempty_pages).
1300                  * This rule avoids having lazy_truncate_heap() take access-exclusive
1301                  * lock on rel to attempt a truncation that fails anyway, just because
1302                  * there are tuples on the last page (it is likely that there will be
1303                  * tuples on other nearby pages as well, but those can be skipped).
1304                  *
1305                  * Implement this by always treating the last block as unsafe to skip.
1306                  */
1307                 if (next_unskippable_block == rel_pages - 1)
1308                         break;
1309
1310                 /* DISABLE_PAGE_SKIPPING makes all skipping unsafe */
1311                 if (!vacrel->skipwithvm)
1312                         break;
1313
1314                 /*
1315                  * Aggressive VACUUM caller can't skip pages just because they are
1316                  * all-visible.  They may still skip all-frozen pages, which can't
1317                  * contain XIDs < OldestXmin (XIDs that aren't already frozen by now).
1318                  */
1319                 if ((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0)
1320                 {
1321                         if (vacrel->aggressive)
1322                                 break;
1323
1324                         /*
1325                          * All-visible block is safe to skip in non-aggressive case.  But
1326                          * remember that the final range contains such a block for later.
1327                          */
1328                         skipsallvis = true;
1329                 }
1330
1331                 vacuum_delay_point();
1332                 next_unskippable_block++;
1333                 nskippable_blocks++;
1334         }
1335
1336         /*
1337          * We only skip a range with at least SKIP_PAGES_THRESHOLD consecutive
1338          * pages.  Since we're reading sequentially, the OS should be doing
1339          * readahead for us, so there's no gain in skipping a page now and then.
1340          * Skipping such a range might even discourage sequential detection.
1341          *
1342          * This test also enables more frequent relfrozenxid advancement during
1343          * non-aggressive VACUUMs.  If the range has any all-visible pages then
1344          * skipping makes updating relfrozenxid unsafe, which is a real downside.
1345          */
1346         if (nskippable_blocks < SKIP_PAGES_THRESHOLD)
1347                 *skipping_current_range = false;
1348         else
1349         {
1350                 *skipping_current_range = true;
1351                 if (skipsallvis)
1352                         vacrel->skippedallvis = true;
1353         }
1354
1355         return next_unskippable_block;
1356 }
1357
1358 /*
1359  *      lazy_scan_new_or_empty() -- lazy_scan_heap() new/empty page handling.
1360  *
1361  * Must call here to handle both new and empty pages before calling
1362  * lazy_scan_prune or lazy_scan_noprune, since they're not prepared to deal
1363  * with new or empty pages.
1364  *
1365  * It's necessary to consider new pages as a special case, since the rules for
1366  * maintaining the visibility map and FSM with empty pages are a little
1367  * different (though new pages can be truncated away during rel truncation).
1368  *
1369  * Empty pages are not really a special case -- they're just heap pages that
1370  * have no allocated tuples (including even LP_UNUSED items).  You might
1371  * wonder why we need to handle them here all the same.  It's only necessary
1372  * because of a corner-case involving a hard crash during heap relation
1373  * extension.  If we ever make relation-extension crash safe, then it should
1374  * no longer be necessary to deal with empty pages here (or new pages, for
1375  * that matter).
1376  *
1377  * Caller must hold at least a shared lock.  We might need to escalate the
1378  * lock in that case, so the type of lock caller holds needs to be specified
1379  * using 'sharelock' argument.
1380  *
1381  * Returns false in common case where caller should go on to call
1382  * lazy_scan_prune (or lazy_scan_noprune).  Otherwise returns true, indicating
1383  * that lazy_scan_heap is done processing the page, releasing lock on caller's
1384  * behalf.
1385  */
1386 static bool
1387 lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
1388                                            Page page, bool sharelock, Buffer vmbuffer)
1389 {
1390         Size            freespace;
1391
1392         if (PageIsNew(page))
1393         {
1394                 /*
1395                  * All-zeroes pages can be left over if either a backend extends the
1396                  * relation by a single page, but crashes before the newly initialized
1397                  * page has been written out, or when bulk-extending the relation
1398                  * (which creates a number of empty pages at the tail end of the
1399                  * relation), and then enters them into the FSM.
1400                  *
1401                  * Note we do not enter the page into the visibilitymap. That has the
1402                  * downside that we repeatedly visit this page in subsequent vacuums,
1403                  * but otherwise we'll never discover the space on a promoted standby.
1404                  * The harm of repeated checking ought to normally not be too bad. The
1405                  * space usually should be used at some point, otherwise there
1406                  * wouldn't be any regular vacuums.
1407                  *
1408                  * Make sure these pages are in the FSM, to ensure they can be reused.
1409                  * Do that by testing if there's any space recorded for the page. If
1410                  * not, enter it. We do so after releasing the lock on the heap page,
1411                  * the FSM is approximate, after all.
1412                  */
1413                 UnlockReleaseBuffer(buf);
1414
1415                 if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0)
1416                 {
1417                         freespace = BLCKSZ - SizeOfPageHeaderData;
1418
1419                         RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1420                 }
1421
1422                 return true;
1423         }
1424
1425         if (PageIsEmpty(page))
1426         {
1427                 /*
1428                  * It seems likely that caller will always be able to get a cleanup
1429                  * lock on an empty page.  But don't take any chances -- escalate to
1430                  * an exclusive lock (still don't need a cleanup lock, though).
1431                  */
1432                 if (sharelock)
1433                 {
1434                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1435                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1436
1437                         if (!PageIsEmpty(page))
1438                         {
1439                                 /* page isn't new or empty -- keep lock and pin for now */
1440                                 return false;
1441                         }
1442                 }
1443                 else
1444                 {
1445                         /* Already have a full cleanup lock (which is more than enough) */
1446                 }
1447
1448                 /*
1449                  * Unlike new pages, empty pages are always set all-visible and
1450                  * all-frozen.
1451                  */
1452                 if (!PageIsAllVisible(page))
1453                 {
1454                         START_CRIT_SECTION();
1455
1456                         /* mark buffer dirty before writing a WAL record */
1457                         MarkBufferDirty(buf);
1458
1459                         /*
1460                          * It's possible that another backend has extended the heap,
1461                          * initialized the page, and then failed to WAL-log the page due
1462                          * to an ERROR.  Since heap extension is not WAL-logged, recovery
1463                          * might try to replay our record setting the page all-visible and
1464                          * find that the page isn't initialized, which will cause a PANIC.
1465                          * To prevent that, check whether the page has been previously
1466                          * WAL-logged, and if not, do that now.
1467                          */
1468                         if (RelationNeedsWAL(vacrel->rel) &&
1469                                 PageGetLSN(page) == InvalidXLogRecPtr)
1470                                 log_newpage_buffer(buf, true);
1471
1472                         PageSetAllVisible(page);
1473                         visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1474                                                           vmbuffer, InvalidTransactionId,
1475                                                           VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
1476                         END_CRIT_SECTION();
1477                 }
1478
1479                 freespace = PageGetHeapFreeSpace(page);
1480                 UnlockReleaseBuffer(buf);
1481                 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1482                 return true;
1483         }
1484
1485         /* page isn't new or empty -- keep lock and pin */
1486         return false;
1487 }
1488
1489 /*
1490  *      lazy_scan_prune() -- lazy_scan_heap() pruning and freezing.
1491  *
1492  * Caller must hold pin and buffer cleanup lock on the buffer.
1493  *
1494  * Prior to PostgreSQL 14 there were very rare cases where heap_page_prune()
1495  * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about
1496  * whether or not a tuple should be considered DEAD.  This happened when an
1497  * inserting transaction concurrently aborted (after our heap_page_prune()
1498  * call, before our HeapTupleSatisfiesVacuum() call).  There was rather a lot
1499  * of complexity just so we could deal with tuples that were DEAD to VACUUM,
1500  * but nevertheless were left with storage after pruning.
1501  *
1502  * The approach we take now is to restart pruning when the race condition is
1503  * detected.  This allows heap_page_prune() to prune the tuples inserted by
1504  * the now-aborted transaction.  This is a little crude, but it guarantees
1505  * that any items that make it into the dead_items array are simple LP_DEAD
1506  * line pointers, and that every remaining item with tuple storage is
1507  * considered as a candidate for freezing.
1508  */
1509 static void
1510 lazy_scan_prune(LVRelState *vacrel,
1511                                 Buffer buf,
1512                                 BlockNumber blkno,
1513                                 Page page,
1514                                 LVPagePruneState *prunestate)
1515 {
1516         Relation        rel = vacrel->rel;
1517         OffsetNumber offnum,
1518                                 maxoff;
1519         ItemId          itemid;
1520         HeapTupleData tuple;
1521         HTSV_Result res;
1522         int                     tuples_deleted,
1523                                 tuples_frozen,
1524                                 lpdead_items,
1525                                 live_tuples,
1526                                 recently_dead_tuples;
1527         int                     nnewlpdead;
1528         TransactionId NewRelfrozenXid;
1529         MultiXactId NewRelminMxid;
1530         OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
1531         HeapTupleFreeze frozen[MaxHeapTuplesPerPage];
1532
1533         Assert(BufferGetBlockNumber(buf) == blkno);
1534
1535         /*
1536          * maxoff might be reduced following line pointer array truncation in
1537          * heap_page_prune.  That's safe for us to ignore, since the reclaimed
1538          * space will continue to look like LP_UNUSED items below.
1539          */
1540         maxoff = PageGetMaxOffsetNumber(page);
1541
1542 retry:
1543
1544         /* Initialize (or reset) page-level state */
1545         NewRelfrozenXid = vacrel->NewRelfrozenXid;
1546         NewRelminMxid = vacrel->NewRelminMxid;
1547         tuples_deleted = 0;
1548         tuples_frozen = 0;
1549         lpdead_items = 0;
1550         live_tuples = 0;
1551         recently_dead_tuples = 0;
1552
1553         /*
1554          * Prune all HOT-update chains in this page.
1555          *
1556          * We count tuples removed by the pruning step as tuples_deleted.  Its
1557          * final value can be thought of as the number of tuples that have been
1558          * deleted from the table.  It should not be confused with lpdead_items;
1559          * lpdead_items's final value can be thought of as the number of tuples
1560          * that were deleted from indexes.
1561          */
1562         tuples_deleted = heap_page_prune(rel, buf, vacrel->vistest,
1563                                                                          InvalidTransactionId, 0, &nnewlpdead,
1564                                                                          &vacrel->offnum);
1565
1566         /*
1567          * Now scan the page to collect LP_DEAD items and check for tuples
1568          * requiring freezing among remaining tuples with storage
1569          */
1570         prunestate->hastup = false;
1571         prunestate->has_lpdead_items = false;
1572         prunestate->all_visible = true;
1573         prunestate->all_frozen = true;
1574         prunestate->visibility_cutoff_xid = InvalidTransactionId;
1575
1576         for (offnum = FirstOffsetNumber;
1577                  offnum <= maxoff;
1578                  offnum = OffsetNumberNext(offnum))
1579         {
1580                 bool            totally_frozen;
1581
1582                 /*
1583                  * Set the offset number so that we can display it along with any
1584                  * error that occurred while processing this tuple.
1585                  */
1586                 vacrel->offnum = offnum;
1587                 itemid = PageGetItemId(page, offnum);
1588
1589                 if (!ItemIdIsUsed(itemid))
1590                         continue;
1591
1592                 /* Redirect items mustn't be touched */
1593                 if (ItemIdIsRedirected(itemid))
1594                 {
1595                         prunestate->hastup = true;      /* page won't be truncatable */
1596                         continue;
1597                 }
1598
1599                 /*
1600                  * LP_DEAD items are processed outside of the loop.
1601                  *
1602                  * Note that we deliberately don't set hastup=true in the case of an
1603                  * LP_DEAD item here, which is not how count_nondeletable_pages() does
1604                  * it -- it only considers pages empty/truncatable when they have no
1605                  * items at all (except LP_UNUSED items).
1606                  *
1607                  * Our assumption is that any LP_DEAD items we encounter here will
1608                  * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually
1609                  * call count_nondeletable_pages().  In any case our opinion of
1610                  * whether or not a page 'hastup' (which is how our caller sets its
1611                  * vacrel->nonempty_pages value) is inherently race-prone.  It must be
1612                  * treated as advisory/unreliable, so we might as well be slightly
1613                  * optimistic.
1614                  */
1615                 if (ItemIdIsDead(itemid))
1616                 {
1617                         deadoffsets[lpdead_items++] = offnum;
1618                         prunestate->all_visible = false;
1619                         prunestate->has_lpdead_items = true;
1620                         continue;
1621                 }
1622
1623                 Assert(ItemIdIsNormal(itemid));
1624
1625                 ItemPointerSet(&(tuple.t_self), blkno, offnum);
1626                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1627                 tuple.t_len = ItemIdGetLength(itemid);
1628                 tuple.t_tableOid = RelationGetRelid(rel);
1629
1630                 /*
1631                  * DEAD tuples are almost always pruned into LP_DEAD line pointers by
1632                  * heap_page_prune(), but it's possible that the tuple state changed
1633                  * since heap_page_prune() looked.  Handle that here by restarting.
1634                  * (See comments at the top of function for a full explanation.)
1635                  */
1636                 res = HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin,
1637                                                                            buf);
1638
1639                 if (unlikely(res == HEAPTUPLE_DEAD))
1640                         goto retry;
1641
1642                 /*
1643                  * The criteria for counting a tuple as live in this block need to
1644                  * match what analyze.c's acquire_sample_rows() does, otherwise VACUUM
1645                  * and ANALYZE may produce wildly different reltuples values, e.g.
1646                  * when there are many recently-dead tuples.
1647                  *
1648                  * The logic here is a bit simpler than acquire_sample_rows(), as
1649                  * VACUUM can't run inside a transaction block, which makes some cases
1650                  * impossible (e.g. in-progress insert from the same transaction).
1651                  *
1652                  * We treat LP_DEAD items (which are the closest thing to DEAD tuples
1653                  * that might be seen here) differently, too: we assume that they'll
1654                  * become LP_UNUSED before VACUUM finishes.  This difference is only
1655                  * superficial.  VACUUM effectively agrees with ANALYZE about DEAD
1656                  * items, in the end.  VACUUM won't remember LP_DEAD items, but only
1657                  * because they're not supposed to be left behind when it is done.
1658                  * (Cases where we bypass index vacuuming will violate this optimistic
1659                  * assumption, but the overall impact of that should be negligible.)
1660                  */
1661                 switch (res)
1662                 {
1663                         case HEAPTUPLE_LIVE:
1664
1665                                 /*
1666                                  * Count it as live.  Not only is this natural, but it's also
1667                                  * what acquire_sample_rows() does.
1668                                  */
1669                                 live_tuples++;
1670
1671                                 /*
1672                                  * Is the tuple definitely visible to all transactions?
1673                                  *
1674                                  * NB: Like with per-tuple hint bits, we can't set the
1675                                  * PD_ALL_VISIBLE flag if the inserter committed
1676                                  * asynchronously. See SetHintBits for more info. Check that
1677                                  * the tuple is hinted xmin-committed because of that.
1678                                  */
1679                                 if (prunestate->all_visible)
1680                                 {
1681                                         TransactionId xmin;
1682
1683                                         if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1684                                         {
1685                                                 prunestate->all_visible = false;
1686                                                 break;
1687                                         }
1688
1689                                         /*
1690                                          * The inserter definitely committed. But is it old enough
1691                                          * that everyone sees it as committed?
1692                                          */
1693                                         xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1694                                         if (!TransactionIdPrecedes(xmin,
1695                                                                                            vacrel->cutoffs.OldestXmin))
1696                                         {
1697                                                 prunestate->all_visible = false;
1698                                                 break;
1699                                         }
1700
1701                                         /* Track newest xmin on page. */
1702                                         if (TransactionIdFollows(xmin, prunestate->visibility_cutoff_xid))
1703                                                 prunestate->visibility_cutoff_xid = xmin;
1704                                 }
1705                                 break;
1706                         case HEAPTUPLE_RECENTLY_DEAD:
1707
1708                                 /*
1709                                  * If tuple is recently dead then we must not remove it from
1710                                  * the relation.  (We only remove items that are LP_DEAD from
1711                                  * pruning.)
1712                                  */
1713                                 recently_dead_tuples++;
1714                                 prunestate->all_visible = false;
1715                                 break;
1716                         case HEAPTUPLE_INSERT_IN_PROGRESS:
1717
1718                                 /*
1719                                  * We do not count these rows as live, because we expect the
1720                                  * inserting transaction to update the counters at commit, and
1721                                  * we assume that will happen only after we report our
1722                                  * results.  This assumption is a bit shaky, but it is what
1723                                  * acquire_sample_rows() does, so be consistent.
1724                                  */
1725                                 prunestate->all_visible = false;
1726                                 break;
1727                         case HEAPTUPLE_DELETE_IN_PROGRESS:
1728                                 /* This is an expected case during concurrent vacuum */
1729                                 prunestate->all_visible = false;
1730
1731                                 /*
1732                                  * Count such rows as live.  As above, we assume the deleting
1733                                  * transaction will commit and update the counters after we
1734                                  * report.
1735                                  */
1736                                 live_tuples++;
1737                                 break;
1738                         default:
1739                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1740                                 break;
1741                 }
1742
1743                 prunestate->hastup = true;      /* page makes rel truncation unsafe */
1744
1745                 /* Tuple with storage -- consider need to freeze */
1746                 if (heap_prepare_freeze_tuple(tuple.t_data, &vacrel->cutoffs,
1747                                                                           &frozen[tuples_frozen], &totally_frozen,
1748                                                                           &NewRelfrozenXid, &NewRelminMxid))
1749                 {
1750                         /* Save prepared freeze plan for later */
1751                         frozen[tuples_frozen++].offset = offnum;
1752                 }
1753
1754                 /*
1755                  * If tuple is not frozen (and not about to become frozen) then caller
1756                  * had better not go on to set this page's VM bit
1757                  */
1758                 if (!totally_frozen)
1759                         prunestate->all_frozen = false;
1760         }
1761
1762         vacrel->offnum = InvalidOffsetNumber;
1763
1764         /*
1765          * We have now divided every item on the page into either an LP_DEAD item
1766          * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple
1767          * that remains and needs to be considered for freezing now (LP_UNUSED and
1768          * LP_REDIRECT items also remain, but are of no further interest to us).
1769          */
1770         vacrel->NewRelfrozenXid = NewRelfrozenXid;
1771         vacrel->NewRelminMxid = NewRelminMxid;
1772
1773         /*
1774          * Consider the need to freeze any items with tuple storage from the page
1775          * first (arbitrary)
1776          */
1777         if (tuples_frozen > 0)
1778         {
1779                 Assert(prunestate->hastup);
1780
1781                 vacrel->frozen_pages++;
1782
1783                 /* Execute all freeze plans for page as a single atomic action */
1784                 heap_freeze_execute_prepared(vacrel->rel, buf,
1785                                                                          vacrel->cutoffs.FreezeLimit,
1786                                                                          frozen, tuples_frozen);
1787         }
1788
1789         /*
1790          * The second pass over the heap can also set visibility map bits, using
1791          * the same approach.  This is important when the table frequently has a
1792          * few old LP_DEAD items on each page by the time we get to it (typically
1793          * because past opportunistic pruning operations freed some non-HOT
1794          * tuples).
1795          *
1796          * VACUUM will call heap_page_is_all_visible() during the second pass over
1797          * the heap to determine all_visible and all_frozen for the page -- this
1798          * is a specialized version of the logic from this function.  Now that
1799          * we've finished pruning and freezing, make sure that we're in total
1800          * agreement with heap_page_is_all_visible() using an assertion.
1801          */
1802 #ifdef USE_ASSERT_CHECKING
1803         /* Note that all_frozen value does not matter when !all_visible */
1804         if (prunestate->all_visible)
1805         {
1806                 TransactionId cutoff;
1807                 bool            all_frozen;
1808
1809                 if (!heap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen))
1810                         Assert(false);
1811
1812                 Assert(lpdead_items == 0);
1813                 Assert(prunestate->all_frozen == all_frozen);
1814
1815                 /*
1816                  * It's possible that we froze tuples and made the page's XID cutoff
1817                  * (for recovery conflict purposes) FrozenTransactionId.  This is okay
1818                  * because visibility_cutoff_xid will be logged by our caller in a
1819                  * moment.
1820                  */
1821                 Assert(cutoff == FrozenTransactionId ||
1822                            cutoff == prunestate->visibility_cutoff_xid);
1823         }
1824 #endif
1825
1826         /*
1827          * Now save details of the LP_DEAD items from the page in vacrel
1828          */
1829         if (lpdead_items > 0)
1830         {
1831                 VacDeadItems *dead_items = vacrel->dead_items;
1832                 ItemPointerData tmp;
1833
1834                 Assert(!prunestate->all_visible);
1835                 Assert(prunestate->has_lpdead_items);
1836
1837                 vacrel->lpdead_item_pages++;
1838
1839                 ItemPointerSetBlockNumber(&tmp, blkno);
1840
1841                 for (int i = 0; i < lpdead_items; i++)
1842                 {
1843                         ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]);
1844                         dead_items->items[dead_items->num_items++] = tmp;
1845                 }
1846
1847                 Assert(dead_items->num_items <= dead_items->max_items);
1848                 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
1849                                                                          dead_items->num_items);
1850         }
1851
1852         /* Finally, add page-local counts to whole-VACUUM counts */
1853         vacrel->tuples_deleted += tuples_deleted;
1854         vacrel->tuples_frozen += tuples_frozen;
1855         vacrel->lpdead_items += lpdead_items;
1856         vacrel->live_tuples += live_tuples;
1857         vacrel->recently_dead_tuples += recently_dead_tuples;
1858 }
1859
1860 /*
1861  *      lazy_scan_noprune() -- lazy_scan_prune() without pruning or freezing
1862  *
1863  * Caller need only hold a pin and share lock on the buffer, unlike
1864  * lazy_scan_prune, which requires a full cleanup lock.  While pruning isn't
1865  * performed here, it's quite possible that an earlier opportunistic pruning
1866  * operation left LP_DEAD items behind.  We'll at least collect any such items
1867  * in the dead_items array for removal from indexes.
1868  *
1869  * For aggressive VACUUM callers, we may return false to indicate that a full
1870  * cleanup lock is required for processing by lazy_scan_prune.  This is only
1871  * necessary when the aggressive VACUUM needs to freeze some tuple XIDs from
1872  * one or more tuples on the page.  We always return true for non-aggressive
1873  * callers.
1874  *
1875  * See lazy_scan_prune for an explanation of hastup return flag.
1876  * recordfreespace flag instructs caller on whether or not it should do
1877  * generic FSM processing for page.
1878  */
1879 static bool
1880 lazy_scan_noprune(LVRelState *vacrel,
1881                                   Buffer buf,
1882                                   BlockNumber blkno,
1883                                   Page page,
1884                                   bool *hastup,
1885                                   bool *recordfreespace)
1886 {
1887         OffsetNumber offnum,
1888                                 maxoff;
1889         int                     lpdead_items,
1890                                 live_tuples,
1891                                 recently_dead_tuples,
1892                                 missed_dead_tuples;
1893         HeapTupleHeader tupleheader;
1894         TransactionId NewRelfrozenXid = vacrel->NewRelfrozenXid;
1895         MultiXactId NewRelminMxid = vacrel->NewRelminMxid;
1896         OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
1897
1898         Assert(BufferGetBlockNumber(buf) == blkno);
1899
1900         *hastup = false;                        /* for now */
1901         *recordfreespace = false;       /* for now */
1902
1903         lpdead_items = 0;
1904         live_tuples = 0;
1905         recently_dead_tuples = 0;
1906         missed_dead_tuples = 0;
1907
1908         maxoff = PageGetMaxOffsetNumber(page);
1909         for (offnum = FirstOffsetNumber;
1910                  offnum <= maxoff;
1911                  offnum = OffsetNumberNext(offnum))
1912         {
1913                 ItemId          itemid;
1914                 HeapTupleData tuple;
1915
1916                 vacrel->offnum = offnum;
1917                 itemid = PageGetItemId(page, offnum);
1918
1919                 if (!ItemIdIsUsed(itemid))
1920                         continue;
1921
1922                 if (ItemIdIsRedirected(itemid))
1923                 {
1924                         *hastup = true;
1925                         continue;
1926                 }
1927
1928                 if (ItemIdIsDead(itemid))
1929                 {
1930                         /*
1931                          * Deliberately don't set hastup=true here.  See same point in
1932                          * lazy_scan_prune for an explanation.
1933                          */
1934                         deadoffsets[lpdead_items++] = offnum;
1935                         continue;
1936                 }
1937
1938                 *hastup = true;                 /* page prevents rel truncation */
1939                 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
1940                 if (heap_tuple_would_freeze(tupleheader, &vacrel->cutoffs,
1941                                                                         &NewRelfrozenXid, &NewRelminMxid))
1942                 {
1943                         /* Tuple with XID < FreezeLimit (or MXID < MultiXactCutoff) */
1944                         if (vacrel->aggressive)
1945                         {
1946                                 /*
1947                                  * Aggressive VACUUMs must always be able to advance rel's
1948                                  * relfrozenxid to a value >= FreezeLimit (and be able to
1949                                  * advance rel's relminmxid to a value >= MultiXactCutoff).
1950                                  * The ongoing aggressive VACUUM won't be able to do that
1951                                  * unless it can freeze an XID (or MXID) from this tuple now.
1952                                  *
1953                                  * The only safe option is to have caller perform processing
1954                                  * of this page using lazy_scan_prune.  Caller might have to
1955                                  * wait a while for a cleanup lock, but it can't be helped.
1956                                  */
1957                                 vacrel->offnum = InvalidOffsetNumber;
1958                                 return false;
1959                         }
1960
1961                         /*
1962                          * Non-aggressive VACUUMs are under no obligation to advance
1963                          * relfrozenxid (even by one XID).  We can be much laxer here.
1964                          *
1965                          * Currently we always just accept an older final relfrozenxid
1966                          * and/or relminmxid value.  We never make caller wait or work a
1967                          * little harder, even when it likely makes sense to do so.
1968                          */
1969                 }
1970
1971                 ItemPointerSet(&(tuple.t_self), blkno, offnum);
1972                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1973                 tuple.t_len = ItemIdGetLength(itemid);
1974                 tuple.t_tableOid = RelationGetRelid(vacrel->rel);
1975
1976                 switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin,
1977                                                                                  buf))
1978                 {
1979                         case HEAPTUPLE_DELETE_IN_PROGRESS:
1980                         case HEAPTUPLE_LIVE:
1981
1982                                 /*
1983                                  * Count both cases as live, just like lazy_scan_prune
1984                                  */
1985                                 live_tuples++;
1986
1987                                 break;
1988                         case HEAPTUPLE_DEAD:
1989
1990                                 /*
1991                                  * There is some useful work for pruning to do, that won't be
1992                                  * done due to failure to get a cleanup lock.
1993                                  */
1994                                 missed_dead_tuples++;
1995                                 break;
1996                         case HEAPTUPLE_RECENTLY_DEAD:
1997
1998                                 /*
1999                                  * Count in recently_dead_tuples, just like lazy_scan_prune
2000                                  */
2001                                 recently_dead_tuples++;
2002                                 break;
2003                         case HEAPTUPLE_INSERT_IN_PROGRESS:
2004
2005                                 /*
2006                                  * Do not count these rows as live, just like lazy_scan_prune
2007                                  */
2008                                 break;
2009                         default:
2010                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
2011                                 break;
2012                 }
2013         }
2014
2015         vacrel->offnum = InvalidOffsetNumber;
2016
2017         /*
2018          * By here we know for sure that caller can put off freezing and pruning
2019          * this particular page until the next VACUUM.  Remember its details now.
2020          * (lazy_scan_prune expects a clean slate, so we have to do this last.)
2021          */
2022         vacrel->NewRelfrozenXid = NewRelfrozenXid;
2023         vacrel->NewRelminMxid = NewRelminMxid;
2024
2025         /* Save any LP_DEAD items found on the page in dead_items array */
2026         if (vacrel->nindexes == 0)
2027         {
2028                 /* Using one-pass strategy (since table has no indexes) */
2029                 if (lpdead_items > 0)
2030                 {
2031                         /*
2032                          * Perfunctory handling for the corner case where a single pass
2033                          * strategy VACUUM cannot get a cleanup lock, and it turns out
2034                          * that there is one or more LP_DEAD items: just count the LP_DEAD
2035                          * items as missed_dead_tuples instead. (This is a bit dishonest,
2036                          * but it beats having to maintain specialized heap vacuuming code
2037                          * forever, for vanishingly little benefit.)
2038                          */
2039                         *hastup = true;
2040                         missed_dead_tuples += lpdead_items;
2041                 }
2042
2043                 *recordfreespace = true;
2044         }
2045         else if (lpdead_items == 0)
2046         {
2047                 /*
2048                  * Won't be vacuuming this page later, so record page's freespace in
2049                  * the FSM now
2050                  */
2051                 *recordfreespace = true;
2052         }
2053         else
2054         {
2055                 VacDeadItems *dead_items = vacrel->dead_items;
2056                 ItemPointerData tmp;
2057
2058                 /*
2059                  * Page has LP_DEAD items, and so any references/TIDs that remain in
2060                  * indexes will be deleted during index vacuuming (and then marked
2061                  * LP_UNUSED in the heap)
2062                  */
2063                 vacrel->lpdead_item_pages++;
2064
2065                 ItemPointerSetBlockNumber(&tmp, blkno);
2066
2067                 for (int i = 0; i < lpdead_items; i++)
2068                 {
2069                         ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]);
2070                         dead_items->items[dead_items->num_items++] = tmp;
2071                 }
2072
2073                 Assert(dead_items->num_items <= dead_items->max_items);
2074                 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
2075                                                                          dead_items->num_items);
2076
2077                 vacrel->lpdead_items += lpdead_items;
2078
2079                 /*
2080                  * Assume that we'll go on to vacuum this heap page during final pass
2081                  * over the heap.  Don't record free space until then.
2082                  */
2083                 *recordfreespace = false;
2084         }
2085
2086         /*
2087          * Finally, add relevant page-local counts to whole-VACUUM counts
2088          */
2089         vacrel->live_tuples += live_tuples;
2090         vacrel->recently_dead_tuples += recently_dead_tuples;
2091         vacrel->missed_dead_tuples += missed_dead_tuples;
2092         if (missed_dead_tuples > 0)
2093                 vacrel->missed_dead_pages++;
2094
2095         /* Caller won't need to call lazy_scan_prune with same page */
2096         return true;
2097 }
2098
2099 /*
2100  * Main entry point for index vacuuming and heap vacuuming.
2101  *
2102  * Removes items collected in dead_items from table's indexes, then marks the
2103  * same items LP_UNUSED in the heap.  See the comments above lazy_scan_heap
2104  * for full details.
2105  *
2106  * Also empties dead_items, freeing up space for later TIDs.
2107  *
2108  * We may choose to bypass index vacuuming at this point, though only when the
2109  * ongoing VACUUM operation will definitely only have one index scan/round of
2110  * index vacuuming.
2111  */
2112 static void
2113 lazy_vacuum(LVRelState *vacrel)
2114 {
2115         bool            bypass;
2116
2117         /* Should not end up here with no indexes */
2118         Assert(vacrel->nindexes > 0);
2119         Assert(vacrel->lpdead_item_pages > 0);
2120
2121         if (!vacrel->do_index_vacuuming)
2122         {
2123                 Assert(!vacrel->do_index_cleanup);
2124                 vacrel->dead_items->num_items = 0;
2125                 return;
2126         }
2127
2128         /*
2129          * Consider bypassing index vacuuming (and heap vacuuming) entirely.
2130          *
2131          * We currently only do this in cases where the number of LP_DEAD items
2132          * for the entire VACUUM operation is close to zero.  This avoids sharp
2133          * discontinuities in the duration and overhead of successive VACUUM
2134          * operations that run against the same table with a fixed workload.
2135          * Ideally, successive VACUUM operations will behave as if there are
2136          * exactly zero LP_DEAD items in cases where there are close to zero.
2137          *
2138          * This is likely to be helpful with a table that is continually affected
2139          * by UPDATEs that can mostly apply the HOT optimization, but occasionally
2140          * have small aberrations that lead to just a few heap pages retaining
2141          * only one or two LP_DEAD items.  This is pretty common; even when the
2142          * DBA goes out of their way to make UPDATEs use HOT, it is practically
2143          * impossible to predict whether HOT will be applied in 100% of cases.
2144          * It's far easier to ensure that 99%+ of all UPDATEs against a table use
2145          * HOT through careful tuning.
2146          */
2147         bypass = false;
2148         if (vacrel->consider_bypass_optimization && vacrel->rel_pages > 0)
2149         {
2150                 BlockNumber threshold;
2151
2152                 Assert(vacrel->num_index_scans == 0);
2153                 Assert(vacrel->lpdead_items == vacrel->dead_items->num_items);
2154                 Assert(vacrel->do_index_vacuuming);
2155                 Assert(vacrel->do_index_cleanup);
2156
2157                 /*
2158                  * This crossover point at which we'll start to do index vacuuming is
2159                  * expressed as a percentage of the total number of heap pages in the
2160                  * table that are known to have at least one LP_DEAD item.  This is
2161                  * much more important than the total number of LP_DEAD items, since
2162                  * it's a proxy for the number of heap pages whose visibility map bits
2163                  * cannot be set on account of bypassing index and heap vacuuming.
2164                  *
2165                  * We apply one further precautionary test: the space currently used
2166                  * to store the TIDs (TIDs that now all point to LP_DEAD items) must
2167                  * not exceed 32MB.  This limits the risk that we will bypass index
2168                  * vacuuming again and again until eventually there is a VACUUM whose
2169                  * dead_items space is not CPU cache resident.
2170                  *
2171                  * We don't take any special steps to remember the LP_DEAD items (such
2172                  * as counting them in our final update to the stats system) when the
2173                  * optimization is applied.  Though the accounting used in analyze.c's
2174                  * acquire_sample_rows() will recognize the same LP_DEAD items as dead
2175                  * rows in its own stats report, that's okay. The discrepancy should
2176                  * be negligible.  If this optimization is ever expanded to cover more
2177                  * cases then this may need to be reconsidered.
2178                  */
2179                 threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES;
2180                 bypass = (vacrel->lpdead_item_pages < threshold &&
2181                                   vacrel->lpdead_items < MAXDEADITEMS(32L * 1024L * 1024L));
2182         }
2183
2184         if (bypass)
2185         {
2186                 /*
2187                  * There are almost zero TIDs.  Behave as if there were precisely
2188                  * zero: bypass index vacuuming, but do index cleanup.
2189                  *
2190                  * We expect that the ongoing VACUUM operation will finish very
2191                  * quickly, so there is no point in considering speeding up as a
2192                  * failsafe against wraparound failure. (Index cleanup is expected to
2193                  * finish very quickly in cases where there were no ambulkdelete()
2194                  * calls.)
2195                  */
2196                 vacrel->do_index_vacuuming = false;
2197         }
2198         else if (lazy_vacuum_all_indexes(vacrel))
2199         {
2200                 /*
2201                  * We successfully completed a round of index vacuuming.  Do related
2202                  * heap vacuuming now.
2203                  */
2204                 lazy_vacuum_heap_rel(vacrel);
2205         }
2206         else
2207         {
2208                 /*
2209                  * Failsafe case.
2210                  *
2211                  * We attempted index vacuuming, but didn't finish a full round/full
2212                  * index scan.  This happens when relfrozenxid or relminmxid is too
2213                  * far in the past.
2214                  *
2215                  * From this point on the VACUUM operation will do no further index
2216                  * vacuuming or heap vacuuming.  This VACUUM operation won't end up
2217                  * back here again.
2218                  */
2219                 Assert(vacrel->failsafe_active);
2220         }
2221
2222         /*
2223          * Forget the LP_DEAD items that we just vacuumed (or just decided to not
2224          * vacuum)
2225          */
2226         vacrel->dead_items->num_items = 0;
2227 }
2228
2229 /*
2230  *      lazy_vacuum_all_indexes() -- Main entry for index vacuuming
2231  *
2232  * Returns true in the common case when all indexes were successfully
2233  * vacuumed.  Returns false in rare cases where we determined that the ongoing
2234  * VACUUM operation is at risk of taking too long to finish, leading to
2235  * wraparound failure.
2236  */
2237 static bool
2238 lazy_vacuum_all_indexes(LVRelState *vacrel)
2239 {
2240         bool            allindexes = true;
2241         double          old_live_tuples = vacrel->rel->rd_rel->reltuples;
2242
2243         Assert(vacrel->nindexes > 0);
2244         Assert(vacrel->do_index_vacuuming);
2245         Assert(vacrel->do_index_cleanup);
2246
2247         /* Precheck for XID wraparound emergencies */
2248         if (lazy_check_wraparound_failsafe(vacrel))
2249         {
2250                 /* Wraparound emergency -- don't even start an index scan */
2251                 return false;
2252         }
2253
2254         /* Report that we are now vacuuming indexes */
2255         pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2256                                                                  PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
2257
2258         if (!ParallelVacuumIsActive(vacrel))
2259         {
2260                 for (int idx = 0; idx < vacrel->nindexes; idx++)
2261                 {
2262                         Relation        indrel = vacrel->indrels[idx];
2263                         IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2264
2265                         vacrel->indstats[idx] = lazy_vacuum_one_index(indrel, istat,
2266                                                                                                                   old_live_tuples,
2267                                                                                                                   vacrel);
2268
2269                         if (lazy_check_wraparound_failsafe(vacrel))
2270                         {
2271                                 /* Wraparound emergency -- end current index scan */
2272                                 allindexes = false;
2273                                 break;
2274                         }
2275                 }
2276         }
2277         else
2278         {
2279                 /* Outsource everything to parallel variant */
2280                 parallel_vacuum_bulkdel_all_indexes(vacrel->pvs, old_live_tuples,
2281                                                                                         vacrel->num_index_scans);
2282
2283                 /*
2284                  * Do a postcheck to consider applying wraparound failsafe now.  Note
2285                  * that parallel VACUUM only gets the precheck and this postcheck.
2286                  */
2287                 if (lazy_check_wraparound_failsafe(vacrel))
2288                         allindexes = false;
2289         }
2290
2291         /*
2292          * We delete all LP_DEAD items from the first heap pass in all indexes on
2293          * each call here (except calls where we choose to do the failsafe). This
2294          * makes the next call to lazy_vacuum_heap_rel() safe (except in the event
2295          * of the failsafe triggering, which prevents the next call from taking
2296          * place).
2297          */
2298         Assert(vacrel->num_index_scans > 0 ||
2299                    vacrel->dead_items->num_items == vacrel->lpdead_items);
2300         Assert(allindexes || vacrel->failsafe_active);
2301
2302         /*
2303          * Increase and report the number of index scans.
2304          *
2305          * We deliberately include the case where we started a round of bulk
2306          * deletes that we weren't able to finish due to the failsafe triggering.
2307          */
2308         vacrel->num_index_scans++;
2309         pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS,
2310                                                                  vacrel->num_index_scans);
2311
2312         return allindexes;
2313 }
2314
2315 /*
2316  *      lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy
2317  *
2318  * This routine marks LP_DEAD items in vacrel->dead_items array as LP_UNUSED.
2319  * Pages that never had lazy_scan_prune record LP_DEAD items are not visited
2320  * at all.
2321  *
2322  * We may also be able to truncate the line pointer array of the heap pages we
2323  * visit.  If there is a contiguous group of LP_UNUSED items at the end of the
2324  * array, it can be reclaimed as free space.  These LP_UNUSED items usually
2325  * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from
2326  * each page to LP_UNUSED, and then consider if it's possible to truncate the
2327  * page's line pointer array).
2328  *
2329  * Note: the reason for doing this as a second pass is we cannot remove the
2330  * tuples until we've removed their index entries, and we want to process
2331  * index entry removal in batches as large as possible.
2332  */
2333 static void
2334 lazy_vacuum_heap_rel(LVRelState *vacrel)
2335 {
2336         int                     index;
2337         BlockNumber vacuumed_pages;
2338         Buffer          vmbuffer = InvalidBuffer;
2339         LVSavedErrInfo saved_err_info;
2340
2341         Assert(vacrel->do_index_vacuuming);
2342         Assert(vacrel->do_index_cleanup);
2343         Assert(vacrel->num_index_scans > 0);
2344
2345         /* Report that we are now vacuuming the heap */
2346         pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2347                                                                  PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
2348
2349         /* Update error traceback information */
2350         update_vacuum_error_info(vacrel, &saved_err_info,
2351                                                          VACUUM_ERRCB_PHASE_VACUUM_HEAP,
2352                                                          InvalidBlockNumber, InvalidOffsetNumber);
2353
2354         vacuumed_pages = 0;
2355
2356         index = 0;
2357         while (index < vacrel->dead_items->num_items)
2358         {
2359                 BlockNumber tblk;
2360                 Buffer          buf;
2361                 Page            page;
2362                 Size            freespace;
2363
2364                 vacuum_delay_point();
2365
2366                 tblk = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]);
2367                 vacrel->blkno = tblk;
2368                 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
2369                                                                  vacrel->bstrategy);
2370                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2371                 index = lazy_vacuum_heap_page(vacrel, tblk, buf, index, &vmbuffer);
2372
2373                 /* Now that we've vacuumed the page, record its available space */
2374                 page = BufferGetPage(buf);
2375                 freespace = PageGetHeapFreeSpace(page);
2376
2377                 UnlockReleaseBuffer(buf);
2378                 RecordPageWithFreeSpace(vacrel->rel, tblk, freespace);
2379                 vacuumed_pages++;
2380         }
2381
2382         /* Clear the block number information */
2383         vacrel->blkno = InvalidBlockNumber;
2384
2385         if (BufferIsValid(vmbuffer))
2386         {
2387                 ReleaseBuffer(vmbuffer);
2388                 vmbuffer = InvalidBuffer;
2389         }
2390
2391         /*
2392          * We set all LP_DEAD items from the first heap pass to LP_UNUSED during
2393          * the second heap pass.  No more, no less.
2394          */
2395         Assert(index > 0);
2396         Assert(vacrel->num_index_scans > 1 ||
2397                    (index == vacrel->lpdead_items &&
2398                         vacuumed_pages == vacrel->lpdead_item_pages));
2399
2400         ereport(DEBUG2,
2401                         (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages",
2402                                         vacrel->relname, (long long) index, vacuumed_pages)));
2403
2404         /* Revert to the previous phase information for error traceback */
2405         restore_vacuum_error_info(vacrel, &saved_err_info);
2406 }
2407
2408 /*
2409  *      lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the
2410  *                                                vacrel->dead_items array.
2411  *
2412  * Caller must have an exclusive buffer lock on the buffer (though a full
2413  * cleanup lock is also acceptable).
2414  *
2415  * index is an offset into the vacrel->dead_items array for the first listed
2416  * LP_DEAD item on the page.  The return value is the first index immediately
2417  * after all LP_DEAD items for the same page in the array.
2418  */
2419 static int
2420 lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
2421                                           int index, Buffer *vmbuffer)
2422 {
2423         VacDeadItems *dead_items = vacrel->dead_items;
2424         Page            page = BufferGetPage(buffer);
2425         OffsetNumber unused[MaxHeapTuplesPerPage];
2426         int                     uncnt = 0;
2427         TransactionId visibility_cutoff_xid;
2428         bool            all_frozen;
2429         LVSavedErrInfo saved_err_info;
2430
2431         Assert(vacrel->nindexes == 0 || vacrel->do_index_vacuuming);
2432
2433         pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
2434
2435         /* Update error traceback information */
2436         update_vacuum_error_info(vacrel, &saved_err_info,
2437                                                          VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno,
2438                                                          InvalidOffsetNumber);
2439
2440         START_CRIT_SECTION();
2441
2442         for (; index < dead_items->num_items; index++)
2443         {
2444                 BlockNumber tblk;
2445                 OffsetNumber toff;
2446                 ItemId          itemid;
2447
2448                 tblk = ItemPointerGetBlockNumber(&dead_items->items[index]);
2449                 if (tblk != blkno)
2450                         break;                          /* past end of tuples for this block */
2451                 toff = ItemPointerGetOffsetNumber(&dead_items->items[index]);
2452                 itemid = PageGetItemId(page, toff);
2453
2454                 Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid));
2455                 ItemIdSetUnused(itemid);
2456                 unused[uncnt++] = toff;
2457         }
2458
2459         Assert(uncnt > 0);
2460
2461         /* Attempt to truncate line pointer array now */
2462         PageTruncateLinePointerArray(page);
2463
2464         /*
2465          * Mark buffer dirty before we write WAL.
2466          */
2467         MarkBufferDirty(buffer);
2468
2469         /* XLOG stuff */
2470         if (RelationNeedsWAL(vacrel->rel))
2471         {
2472                 xl_heap_vacuum xlrec;
2473                 XLogRecPtr      recptr;
2474
2475                 xlrec.nunused = uncnt;
2476
2477                 XLogBeginInsert();
2478                 XLogRegisterData((char *) &xlrec, SizeOfHeapVacuum);
2479
2480                 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2481                 XLogRegisterBufData(0, (char *) unused, uncnt * sizeof(OffsetNumber));
2482
2483                 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VACUUM);
2484
2485                 PageSetLSN(page, recptr);
2486         }
2487
2488         /*
2489          * End critical section, so we safely can do visibility tests (which
2490          * possibly need to perform IO and allocate memory!). If we crash now the
2491          * page (including the corresponding vm bit) might not be marked all
2492          * visible, but that's fine. A later vacuum will fix that.
2493          */
2494         END_CRIT_SECTION();
2495
2496         /*
2497          * Now that we have removed the LD_DEAD items from the page, once again
2498          * check if the page has become all-visible.  The page is already marked
2499          * dirty, exclusively locked, and, if needed, a full page image has been
2500          * emitted.
2501          */
2502         if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
2503                                                                  &all_frozen))
2504                 PageSetAllVisible(page);
2505
2506         /*
2507          * All the changes to the heap page have been done. If the all-visible
2508          * flag is now set, also set the VM all-visible bit (and, if possible, the
2509          * all-frozen bit) unless this has already been done previously.
2510          */
2511         if (PageIsAllVisible(page))
2512         {
2513                 uint8           flags = 0;
2514                 uint8           vm_status = visibilitymap_get_status(vacrel->rel,
2515                                                                                                                  blkno, vmbuffer);
2516
2517                 /* Set the VM all-frozen bit to flag, if needed */
2518                 if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
2519                         flags |= VISIBILITYMAP_ALL_VISIBLE;
2520                 if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
2521                         flags |= VISIBILITYMAP_ALL_FROZEN;
2522
2523                 Assert(BufferIsValid(*vmbuffer));
2524                 if (flags != 0)
2525                         visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
2526                                                           *vmbuffer, visibility_cutoff_xid, flags);
2527         }
2528
2529         /* Revert to the previous phase information for error traceback */
2530         restore_vacuum_error_info(vacrel, &saved_err_info);
2531         return index;
2532 }
2533
2534 /*
2535  * Trigger the failsafe to avoid wraparound failure when vacrel table has a
2536  * relfrozenxid and/or relminmxid that is dangerously far in the past.
2537  * Triggering the failsafe makes the ongoing VACUUM bypass any further index
2538  * vacuuming and heap vacuuming.  Truncating the heap is also bypassed.
2539  *
2540  * Any remaining work (work that VACUUM cannot just bypass) is typically sped
2541  * up when the failsafe triggers.  VACUUM stops applying any cost-based delay
2542  * that it started out with.
2543  *
2544  * Returns true when failsafe has been triggered.
2545  */
2546 static bool
2547 lazy_check_wraparound_failsafe(LVRelState *vacrel)
2548 {
2549         /* Don't warn more than once per VACUUM */
2550         if (vacrel->failsafe_active)
2551                 return true;
2552
2553         if (unlikely(vacuum_xid_failsafe_check(&vacrel->cutoffs)))
2554         {
2555                 vacrel->failsafe_active = true;
2556
2557                 /* Disable index vacuuming, index cleanup, and heap rel truncation */
2558                 vacrel->do_index_vacuuming = false;
2559                 vacrel->do_index_cleanup = false;
2560                 vacrel->do_rel_truncate = false;
2561
2562                 ereport(WARNING,
2563                                 (errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans",
2564                                                 get_database_name(MyDatabaseId),
2565                                                 vacrel->relnamespace,
2566                                                 vacrel->relname,
2567                                                 vacrel->num_index_scans),
2568                                  errdetail("The table's relfrozenxid or relminmxid is too far in the past."),
2569                                  errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n"
2570                                                  "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs.")));
2571
2572                 /* Stop applying cost limits from this point on */
2573                 VacuumCostActive = false;
2574                 VacuumCostBalance = 0;
2575
2576                 return true;
2577         }
2578
2579         return false;
2580 }
2581
2582 /*
2583  *      lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
2584  */
2585 static void
2586 lazy_cleanup_all_indexes(LVRelState *vacrel)
2587 {
2588         double          reltuples = vacrel->new_rel_tuples;
2589         bool            estimated_count = vacrel->scanned_pages < vacrel->rel_pages;
2590
2591         Assert(vacrel->do_index_cleanup);
2592         Assert(vacrel->nindexes > 0);
2593
2594         /* Report that we are now cleaning up indexes */
2595         pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2596                                                                  PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
2597
2598         if (!ParallelVacuumIsActive(vacrel))
2599         {
2600                 for (int idx = 0; idx < vacrel->nindexes; idx++)
2601                 {
2602                         Relation        indrel = vacrel->indrels[idx];
2603                         IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2604
2605                         vacrel->indstats[idx] =
2606                                 lazy_cleanup_one_index(indrel, istat, reltuples,
2607                                                                            estimated_count, vacrel);
2608                 }
2609         }
2610         else
2611         {
2612                 /* Outsource everything to parallel variant */
2613                 parallel_vacuum_cleanup_all_indexes(vacrel->pvs, reltuples,
2614                                                                                         vacrel->num_index_scans,
2615                                                                                         estimated_count);
2616         }
2617 }
2618
2619 /*
2620  *      lazy_vacuum_one_index() -- vacuum index relation.
2621  *
2622  *              Delete all the index tuples containing a TID collected in
2623  *              vacrel->dead_items array.  Also update running statistics.
2624  *              Exact details depend on index AM's ambulkdelete routine.
2625  *
2626  *              reltuples is the number of heap tuples to be passed to the
2627  *              bulkdelete callback.  It's always assumed to be estimated.
2628  *              See indexam.sgml for more info.
2629  *
2630  * Returns bulk delete stats derived from input stats
2631  */
2632 static IndexBulkDeleteResult *
2633 lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
2634                                           double reltuples, LVRelState *vacrel)
2635 {
2636         IndexVacuumInfo ivinfo;
2637         LVSavedErrInfo saved_err_info;
2638
2639         ivinfo.index = indrel;
2640         ivinfo.analyze_only = false;
2641         ivinfo.report_progress = false;
2642         ivinfo.estimated_count = true;
2643         ivinfo.message_level = DEBUG2;
2644         ivinfo.num_heap_tuples = reltuples;
2645         ivinfo.strategy = vacrel->bstrategy;
2646
2647         /*
2648          * Update error traceback information.
2649          *
2650          * The index name is saved during this phase and restored immediately
2651          * after this phase.  See vacuum_error_callback.
2652          */
2653         Assert(vacrel->indname == NULL);
2654         vacrel->indname = pstrdup(RelationGetRelationName(indrel));
2655         update_vacuum_error_info(vacrel, &saved_err_info,
2656                                                          VACUUM_ERRCB_PHASE_VACUUM_INDEX,
2657                                                          InvalidBlockNumber, InvalidOffsetNumber);
2658
2659         /* Do bulk deletion */
2660         istat = vac_bulkdel_one_index(&ivinfo, istat, (void *) vacrel->dead_items);
2661
2662         /* Revert to the previous phase information for error traceback */
2663         restore_vacuum_error_info(vacrel, &saved_err_info);
2664         pfree(vacrel->indname);
2665         vacrel->indname = NULL;
2666
2667         return istat;
2668 }
2669
2670 /*
2671  *      lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
2672  *
2673  *              Calls index AM's amvacuumcleanup routine.  reltuples is the number
2674  *              of heap tuples and estimated_count is true if reltuples is an
2675  *              estimated value.  See indexam.sgml for more info.
2676  *
2677  * Returns bulk delete stats derived from input stats
2678  */
2679 static IndexBulkDeleteResult *
2680 lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
2681                                            double reltuples, bool estimated_count,
2682                                            LVRelState *vacrel)
2683 {
2684         IndexVacuumInfo ivinfo;
2685         LVSavedErrInfo saved_err_info;
2686
2687         ivinfo.index = indrel;
2688         ivinfo.analyze_only = false;
2689         ivinfo.report_progress = false;
2690         ivinfo.estimated_count = estimated_count;
2691         ivinfo.message_level = DEBUG2;
2692
2693         ivinfo.num_heap_tuples = reltuples;
2694         ivinfo.strategy = vacrel->bstrategy;
2695
2696         /*
2697          * Update error traceback information.
2698          *
2699          * The index name is saved during this phase and restored immediately
2700          * after this phase.  See vacuum_error_callback.
2701          */
2702         Assert(vacrel->indname == NULL);
2703         vacrel->indname = pstrdup(RelationGetRelationName(indrel));
2704         update_vacuum_error_info(vacrel, &saved_err_info,
2705                                                          VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
2706                                                          InvalidBlockNumber, InvalidOffsetNumber);
2707
2708         istat = vac_cleanup_one_index(&ivinfo, istat);
2709
2710         /* Revert to the previous phase information for error traceback */
2711         restore_vacuum_error_info(vacrel, &saved_err_info);
2712         pfree(vacrel->indname);
2713         vacrel->indname = NULL;
2714
2715         return istat;
2716 }
2717
2718 /*
2719  * should_attempt_truncation - should we attempt to truncate the heap?
2720  *
2721  * Don't even think about it unless we have a shot at releasing a goodly
2722  * number of pages.  Otherwise, the time taken isn't worth it, mainly because
2723  * an AccessExclusive lock must be replayed on any hot standby, where it can
2724  * be particularly disruptive.
2725  *
2726  * Also don't attempt it if wraparound failsafe is in effect.  The entire
2727  * system might be refusing to allocate new XIDs at this point.  The system
2728  * definitely won't return to normal unless and until VACUUM actually advances
2729  * the oldest relfrozenxid -- which hasn't happened for target rel just yet.
2730  * If lazy_truncate_heap attempted to acquire an AccessExclusiveLock to
2731  * truncate the table under these circumstances, an XID exhaustion error might
2732  * make it impossible for VACUUM to fix the underlying XID exhaustion problem.
2733  * There is very little chance of truncation working out when the failsafe is
2734  * in effect in any case.  lazy_scan_prune makes the optimistic assumption
2735  * that any LP_DEAD items it encounters will always be LP_UNUSED by the time
2736  * we're called.
2737  *
2738  * Also don't attempt it if we are doing early pruning/vacuuming, because a
2739  * scan which cannot find a truncated heap page cannot determine that the
2740  * snapshot is too old to read that page.
2741  */
2742 static bool
2743 should_attempt_truncation(LVRelState *vacrel)
2744 {
2745         BlockNumber possibly_freeable;
2746
2747         if (!vacrel->do_rel_truncate || vacrel->failsafe_active ||
2748                 old_snapshot_threshold >= 0)
2749                 return false;
2750
2751         possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
2752         if (possibly_freeable > 0 &&
2753                 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
2754                  possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION))
2755                 return true;
2756
2757         return false;
2758 }
2759
2760 /*
2761  * lazy_truncate_heap - try to truncate off any empty pages at the end
2762  */
2763 static void
2764 lazy_truncate_heap(LVRelState *vacrel)
2765 {
2766         BlockNumber orig_rel_pages = vacrel->rel_pages;
2767         BlockNumber new_rel_pages;
2768         bool            lock_waiter_detected;
2769         int                     lock_retry;
2770
2771         /* Report that we are now truncating */
2772         pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2773                                                                  PROGRESS_VACUUM_PHASE_TRUNCATE);
2774
2775         /* Update error traceback information one last time */
2776         update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE,
2777                                                          vacrel->nonempty_pages, InvalidOffsetNumber);
2778
2779         /*
2780          * Loop until no more truncating can be done.
2781          */
2782         do
2783         {
2784                 /*
2785                  * We need full exclusive lock on the relation in order to do
2786                  * truncation. If we can't get it, give up rather than waiting --- we
2787                  * don't want to block other backends, and we don't want to deadlock
2788                  * (which is quite possible considering we already hold a lower-grade
2789                  * lock).
2790                  */
2791                 lock_waiter_detected = false;
2792                 lock_retry = 0;
2793                 while (true)
2794                 {
2795                         if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock))
2796                                 break;
2797
2798                         /*
2799                          * Check for interrupts while trying to (re-)acquire the exclusive
2800                          * lock.
2801                          */
2802                         CHECK_FOR_INTERRUPTS();
2803
2804                         if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
2805                                                                 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
2806                         {
2807                                 /*
2808                                  * We failed to establish the lock in the specified number of
2809                                  * retries. This means we give up truncating.
2810                                  */
2811                                 ereport(vacrel->verbose ? INFO : DEBUG2,
2812                                                 (errmsg("\"%s\": stopping truncate due to conflicting lock request",
2813                                                                 vacrel->relname)));
2814                                 return;
2815                         }
2816
2817                         (void) WaitLatch(MyLatch,
2818                                                          WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
2819                                                          VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL,
2820                                                          WAIT_EVENT_VACUUM_TRUNCATE);
2821                         ResetLatch(MyLatch);
2822                 }
2823
2824                 /*
2825                  * Now that we have exclusive lock, look to see if the rel has grown
2826                  * whilst we were vacuuming with non-exclusive lock.  If so, give up;
2827                  * the newly added pages presumably contain non-deletable tuples.
2828                  */
2829                 new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
2830                 if (new_rel_pages != orig_rel_pages)
2831                 {
2832                         /*
2833                          * Note: we intentionally don't update vacrel->rel_pages with the
2834                          * new rel size here.  If we did, it would amount to assuming that
2835                          * the new pages are empty, which is unlikely. Leaving the numbers
2836                          * alone amounts to assuming that the new pages have the same
2837                          * tuple density as existing ones, which is less unlikely.
2838                          */
2839                         UnlockRelation(vacrel->rel, AccessExclusiveLock);
2840                         return;
2841                 }
2842
2843                 /*
2844                  * Scan backwards from the end to verify that the end pages actually
2845                  * contain no tuples.  This is *necessary*, not optional, because
2846                  * other backends could have added tuples to these pages whilst we
2847                  * were vacuuming.
2848                  */
2849                 new_rel_pages = count_nondeletable_pages(vacrel, &lock_waiter_detected);
2850                 vacrel->blkno = new_rel_pages;
2851
2852                 if (new_rel_pages >= orig_rel_pages)
2853                 {
2854                         /* can't do anything after all */
2855                         UnlockRelation(vacrel->rel, AccessExclusiveLock);
2856                         return;
2857                 }
2858
2859                 /*
2860                  * Okay to truncate.
2861                  */
2862                 RelationTruncate(vacrel->rel, new_rel_pages);
2863
2864                 /*
2865                  * We can release the exclusive lock as soon as we have truncated.
2866                  * Other backends can't safely access the relation until they have
2867                  * processed the smgr invalidation that smgrtruncate sent out ... but
2868                  * that should happen as part of standard invalidation processing once
2869                  * they acquire lock on the relation.
2870                  */
2871                 UnlockRelation(vacrel->rel, AccessExclusiveLock);
2872
2873                 /*
2874                  * Update statistics.  Here, it *is* correct to adjust rel_pages
2875                  * without also touching reltuples, since the tuple count wasn't
2876                  * changed by the truncation.
2877                  */
2878                 vacrel->removed_pages += orig_rel_pages - new_rel_pages;
2879                 vacrel->rel_pages = new_rel_pages;
2880
2881                 ereport(vacrel->verbose ? INFO : DEBUG2,
2882                                 (errmsg("table \"%s\": truncated %u to %u pages",
2883                                                 vacrel->relname,
2884                                                 orig_rel_pages, new_rel_pages)));
2885                 orig_rel_pages = new_rel_pages;
2886         } while (new_rel_pages > vacrel->nonempty_pages && lock_waiter_detected);
2887 }
2888
2889 /*
2890  * Rescan end pages to verify that they are (still) empty of tuples.
2891  *
2892  * Returns number of nondeletable pages (last nonempty page + 1).
2893  */
2894 static BlockNumber
2895 count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
2896 {
2897         BlockNumber blkno;
2898         BlockNumber prefetchedUntil;
2899         instr_time      starttime;
2900
2901         /* Initialize the starttime if we check for conflicting lock requests */
2902         INSTR_TIME_SET_CURRENT(starttime);
2903
2904         /*
2905          * Start checking blocks at what we believe relation end to be and move
2906          * backwards.  (Strange coding of loop control is needed because blkno is
2907          * unsigned.)  To make the scan faster, we prefetch a few blocks at a time
2908          * in forward direction, so that OS-level readahead can kick in.
2909          */
2910         blkno = vacrel->rel_pages;
2911         StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
2912                                          "prefetch size must be power of 2");
2913         prefetchedUntil = InvalidBlockNumber;
2914         while (blkno > vacrel->nonempty_pages)
2915         {
2916                 Buffer          buf;
2917                 Page            page;
2918                 OffsetNumber offnum,
2919                                         maxoff;
2920                 bool            hastup;
2921
2922                 /*
2923                  * Check if another process requests a lock on our relation. We are
2924                  * holding an AccessExclusiveLock here, so they will be waiting. We
2925                  * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
2926                  * only check if that interval has elapsed once every 32 blocks to
2927                  * keep the number of system calls and actual shared lock table
2928                  * lookups to a minimum.
2929                  */
2930                 if ((blkno % 32) == 0)
2931                 {
2932                         instr_time      currenttime;
2933                         instr_time      elapsed;
2934
2935                         INSTR_TIME_SET_CURRENT(currenttime);
2936                         elapsed = currenttime;
2937                         INSTR_TIME_SUBTRACT(elapsed, starttime);
2938                         if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
2939                                 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
2940                         {
2941                                 if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
2942                                 {
2943                                         ereport(vacrel->verbose ? INFO : DEBUG2,
2944                                                         (errmsg("table \"%s\": suspending truncate due to conflicting lock request",
2945                                                                         vacrel->relname)));
2946
2947                                         *lock_waiter_detected = true;
2948                                         return blkno;
2949                                 }
2950                                 starttime = currenttime;
2951                         }
2952                 }
2953
2954                 /*
2955                  * We don't insert a vacuum delay point here, because we have an
2956                  * exclusive lock on the table which we want to hold for as short a
2957                  * time as possible.  We still need to check for interrupts however.
2958                  */
2959                 CHECK_FOR_INTERRUPTS();
2960
2961                 blkno--;
2962
2963                 /* If we haven't prefetched this lot yet, do so now. */
2964                 if (prefetchedUntil > blkno)
2965                 {
2966                         BlockNumber prefetchStart;
2967                         BlockNumber pblkno;
2968
2969                         prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
2970                         for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
2971                         {
2972                                 PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
2973                                 CHECK_FOR_INTERRUPTS();
2974                         }
2975                         prefetchedUntil = prefetchStart;
2976                 }
2977
2978                 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
2979                                                                  vacrel->bstrategy);
2980
2981                 /* In this phase we only need shared access to the buffer */
2982                 LockBuffer(buf, BUFFER_LOCK_SHARE);
2983
2984                 page = BufferGetPage(buf);
2985
2986                 if (PageIsNew(page) || PageIsEmpty(page))
2987                 {
2988                         UnlockReleaseBuffer(buf);
2989                         continue;
2990                 }
2991
2992                 hastup = false;
2993                 maxoff = PageGetMaxOffsetNumber(page);
2994                 for (offnum = FirstOffsetNumber;
2995                          offnum <= maxoff;
2996                          offnum = OffsetNumberNext(offnum))
2997                 {
2998                         ItemId          itemid;
2999
3000                         itemid = PageGetItemId(page, offnum);
3001
3002                         /*
3003                          * Note: any non-unused item should be taken as a reason to keep
3004                          * this page.  Even an LP_DEAD item makes truncation unsafe, since
3005                          * we must not have cleaned out its index entries.
3006                          */
3007                         if (ItemIdIsUsed(itemid))
3008                         {
3009                                 hastup = true;
3010                                 break;                  /* can stop scanning */
3011                         }
3012                 }                                               /* scan along page */
3013
3014                 UnlockReleaseBuffer(buf);
3015
3016                 /* Done scanning if we found a tuple here */
3017                 if (hastup)
3018                         return blkno + 1;
3019         }
3020
3021         /*
3022          * If we fall out of the loop, all the previously-thought-to-be-empty
3023          * pages still are; we need not bother to look at the last known-nonempty
3024          * page.
3025          */
3026         return vacrel->nonempty_pages;
3027 }
3028
3029 /*
3030  * Returns the number of dead TIDs that VACUUM should allocate space to
3031  * store, given a heap rel of size vacrel->rel_pages, and given current
3032  * maintenance_work_mem setting (or current autovacuum_work_mem setting,
3033  * when applicable).
3034  *
3035  * See the comments at the head of this file for rationale.
3036  */
3037 static int
3038 dead_items_max_items(LVRelState *vacrel)
3039 {
3040         int64           max_items;
3041         int                     vac_work_mem = IsAutoVacuumWorkerProcess() &&
3042         autovacuum_work_mem != -1 ?
3043         autovacuum_work_mem : maintenance_work_mem;
3044
3045         if (vacrel->nindexes > 0)
3046         {
3047                 BlockNumber rel_pages = vacrel->rel_pages;
3048
3049                 max_items = MAXDEADITEMS(vac_work_mem * 1024L);
3050                 max_items = Min(max_items, INT_MAX);
3051                 max_items = Min(max_items, MAXDEADITEMS(MaxAllocSize));
3052
3053                 /* curious coding here to ensure the multiplication can't overflow */
3054                 if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > rel_pages)
3055                         max_items = rel_pages * MaxHeapTuplesPerPage;
3056
3057                 /* stay sane if small maintenance_work_mem */
3058                 max_items = Max(max_items, MaxHeapTuplesPerPage);
3059         }
3060         else
3061         {
3062                 /* One-pass case only stores a single heap page's TIDs at a time */
3063                 max_items = MaxHeapTuplesPerPage;
3064         }
3065
3066         return (int) max_items;
3067 }
3068
3069 /*
3070  * Allocate dead_items (either using palloc, or in dynamic shared memory).
3071  * Sets dead_items in vacrel for caller.
3072  *
3073  * Also handles parallel initialization as part of allocating dead_items in
3074  * DSM when required.
3075  */
3076 static void
3077 dead_items_alloc(LVRelState *vacrel, int nworkers)
3078 {
3079         VacDeadItems *dead_items;
3080         int                     max_items;
3081
3082         max_items = dead_items_max_items(vacrel);
3083         Assert(max_items >= MaxHeapTuplesPerPage);
3084
3085         /*
3086          * Initialize state for a parallel vacuum.  As of now, only one worker can
3087          * be used for an index, so we invoke parallelism only if there are at
3088          * least two indexes on a table.
3089          */
3090         if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming)
3091         {
3092                 /*
3093                  * Since parallel workers cannot access data in temporary tables, we
3094                  * can't perform parallel vacuum on them.
3095                  */
3096                 if (RelationUsesLocalBuffers(vacrel->rel))
3097                 {
3098                         /*
3099                          * Give warning only if the user explicitly tries to perform a
3100                          * parallel vacuum on the temporary table.
3101                          */
3102                         if (nworkers > 0)
3103                                 ereport(WARNING,
3104                                                 (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
3105                                                                 vacrel->relname)));
3106                 }
3107                 else
3108                         vacrel->pvs = parallel_vacuum_init(vacrel->rel, vacrel->indrels,
3109                                                                                            vacrel->nindexes, nworkers,
3110                                                                                            max_items,
3111                                                                                            vacrel->verbose ? INFO : DEBUG2,
3112                                                                                            vacrel->bstrategy);
3113
3114                 /* If parallel mode started, dead_items space is allocated in DSM */
3115                 if (ParallelVacuumIsActive(vacrel))
3116                 {
3117                         vacrel->dead_items = parallel_vacuum_get_dead_items(vacrel->pvs);
3118                         return;
3119                 }
3120         }
3121
3122         /* Serial VACUUM case */
3123         dead_items = (VacDeadItems *) palloc(vac_max_items_to_alloc_size(max_items));
3124         dead_items->max_items = max_items;
3125         dead_items->num_items = 0;
3126
3127         vacrel->dead_items = dead_items;
3128 }
3129
3130 /*
3131  * Perform cleanup for resources allocated in dead_items_alloc
3132  */
3133 static void
3134 dead_items_cleanup(LVRelState *vacrel)
3135 {
3136         if (!ParallelVacuumIsActive(vacrel))
3137         {
3138                 /* Don't bother with pfree here */
3139                 return;
3140         }
3141
3142         /* End parallel mode */
3143         parallel_vacuum_end(vacrel->pvs, vacrel->indstats);
3144         vacrel->pvs = NULL;
3145 }
3146
3147 /*
3148  * Check if every tuple in the given page is visible to all current and future
3149  * transactions. Also return the visibility_cutoff_xid which is the highest
3150  * xmin amongst the visible tuples.  Set *all_frozen to true if every tuple
3151  * on this page is frozen.
3152  *
3153  * This is a stripped down version of lazy_scan_prune().  If you change
3154  * anything here, make sure that everything stays in sync.  Note that an
3155  * assertion calls us to verify that everybody still agrees.  Be sure to avoid
3156  * introducing new side-effects here.
3157  */
3158 static bool
3159 heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
3160                                                  TransactionId *visibility_cutoff_xid,
3161                                                  bool *all_frozen)
3162 {
3163         Page            page = BufferGetPage(buf);
3164         BlockNumber blockno = BufferGetBlockNumber(buf);
3165         OffsetNumber offnum,
3166                                 maxoff;
3167         bool            all_visible = true;
3168
3169         *visibility_cutoff_xid = InvalidTransactionId;
3170         *all_frozen = true;
3171
3172         maxoff = PageGetMaxOffsetNumber(page);
3173         for (offnum = FirstOffsetNumber;
3174                  offnum <= maxoff && all_visible;
3175                  offnum = OffsetNumberNext(offnum))
3176         {
3177                 ItemId          itemid;
3178                 HeapTupleData tuple;
3179
3180                 /*
3181                  * Set the offset number so that we can display it along with any
3182                  * error that occurred while processing this tuple.
3183                  */
3184                 vacrel->offnum = offnum;
3185                 itemid = PageGetItemId(page, offnum);
3186
3187                 /* Unused or redirect line pointers are of no interest */
3188                 if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
3189                         continue;
3190
3191                 ItemPointerSet(&(tuple.t_self), blockno, offnum);
3192
3193                 /*
3194                  * Dead line pointers can have index pointers pointing to them. So
3195                  * they can't be treated as visible
3196                  */
3197                 if (ItemIdIsDead(itemid))
3198                 {
3199                         all_visible = false;
3200                         *all_frozen = false;
3201                         break;
3202                 }
3203
3204                 Assert(ItemIdIsNormal(itemid));
3205
3206                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
3207                 tuple.t_len = ItemIdGetLength(itemid);
3208                 tuple.t_tableOid = RelationGetRelid(vacrel->rel);
3209
3210                 switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin,
3211                                                                                  buf))
3212                 {
3213                         case HEAPTUPLE_LIVE:
3214                                 {
3215                                         TransactionId xmin;
3216
3217                                         /* Check comments in lazy_scan_prune. */
3218                                         if (!HeapTupleHeaderXminCommitted(tuple.t_data))
3219                                         {
3220                                                 all_visible = false;
3221                                                 *all_frozen = false;
3222                                                 break;
3223                                         }
3224
3225                                         /*
3226                                          * The inserter definitely committed. But is it old enough
3227                                          * that everyone sees it as committed?
3228                                          */
3229                                         xmin = HeapTupleHeaderGetXmin(tuple.t_data);
3230                                         if (!TransactionIdPrecedes(xmin,
3231                                                                                            vacrel->cutoffs.OldestXmin))
3232                                         {
3233                                                 all_visible = false;
3234                                                 *all_frozen = false;
3235                                                 break;
3236                                         }
3237
3238                                         /* Track newest xmin on page. */
3239                                         if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
3240                                                 *visibility_cutoff_xid = xmin;
3241
3242                                         /* Check whether this tuple is already frozen or not */
3243                                         if (all_visible && *all_frozen &&
3244                                                 heap_tuple_needs_eventual_freeze(tuple.t_data))
3245                                                 *all_frozen = false;
3246                                 }
3247                                 break;
3248
3249                         case HEAPTUPLE_DEAD:
3250                         case HEAPTUPLE_RECENTLY_DEAD:
3251                         case HEAPTUPLE_INSERT_IN_PROGRESS:
3252                         case HEAPTUPLE_DELETE_IN_PROGRESS:
3253                                 {
3254                                         all_visible = false;
3255                                         *all_frozen = false;
3256                                         break;
3257                                 }
3258                         default:
3259                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
3260                                 break;
3261                 }
3262         }                                                       /* scan along page */
3263
3264         /* Clear the offset information once we have processed the given page. */
3265         vacrel->offnum = InvalidOffsetNumber;
3266
3267         return all_visible;
3268 }
3269
3270 /*
3271  * Update index statistics in pg_class if the statistics are accurate.
3272  */
3273 static void
3274 update_relstats_all_indexes(LVRelState *vacrel)
3275 {
3276         Relation   *indrels = vacrel->indrels;
3277         int                     nindexes = vacrel->nindexes;
3278         IndexBulkDeleteResult **indstats = vacrel->indstats;
3279
3280         Assert(vacrel->do_index_cleanup);
3281
3282         for (int idx = 0; idx < nindexes; idx++)
3283         {
3284                 Relation        indrel = indrels[idx];
3285                 IndexBulkDeleteResult *istat = indstats[idx];
3286
3287                 if (istat == NULL || istat->estimated_count)
3288                         continue;
3289
3290                 /* Update index statistics */
3291                 vac_update_relstats(indrel,
3292                                                         istat->num_pages,
3293                                                         istat->num_index_tuples,
3294                                                         0,
3295                                                         false,
3296                                                         InvalidTransactionId,
3297                                                         InvalidMultiXactId,
3298                                                         NULL, NULL, false);
3299         }
3300 }
3301
3302 /*
3303  * Error context callback for errors occurring during vacuum.  The error
3304  * context messages for index phases should match the messages set in parallel
3305  * vacuum.  If you change this function for those phases, change
3306  * parallel_vacuum_error_callback() as well.
3307  */
3308 static void
3309 vacuum_error_callback(void *arg)
3310 {
3311         LVRelState *errinfo = arg;
3312
3313         switch (errinfo->phase)
3314         {
3315                 case VACUUM_ERRCB_PHASE_SCAN_HEAP:
3316                         if (BlockNumberIsValid(errinfo->blkno))
3317                         {
3318                                 if (OffsetNumberIsValid(errinfo->offnum))
3319                                         errcontext("while scanning block %u offset %u of relation \"%s.%s\"",
3320                                                            errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
3321                                 else
3322                                         errcontext("while scanning block %u of relation \"%s.%s\"",
3323                                                            errinfo->blkno, errinfo->relnamespace, errinfo->relname);
3324                         }
3325                         else
3326                                 errcontext("while scanning relation \"%s.%s\"",
3327                                                    errinfo->relnamespace, errinfo->relname);
3328                         break;
3329
3330                 case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
3331                         if (BlockNumberIsValid(errinfo->blkno))
3332                         {
3333                                 if (OffsetNumberIsValid(errinfo->offnum))
3334                                         errcontext("while vacuuming block %u offset %u of relation \"%s.%s\"",
3335                                                            errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
3336                                 else
3337                                         errcontext("while vacuuming block %u of relation \"%s.%s\"",
3338                                                            errinfo->blkno, errinfo->relnamespace, errinfo->relname);
3339                         }
3340                         else
3341                                 errcontext("while vacuuming relation \"%s.%s\"",
3342                                                    errinfo->relnamespace, errinfo->relname);
3343                         break;
3344
3345                 case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
3346                         errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
3347                                            errinfo->indname, errinfo->relnamespace, errinfo->relname);
3348                         break;
3349
3350                 case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
3351                         errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
3352                                            errinfo->indname, errinfo->relnamespace, errinfo->relname);
3353                         break;
3354
3355                 case VACUUM_ERRCB_PHASE_TRUNCATE:
3356                         if (BlockNumberIsValid(errinfo->blkno))
3357                                 errcontext("while truncating relation \"%s.%s\" to %u blocks",
3358                                                    errinfo->relnamespace, errinfo->relname, errinfo->blkno);
3359                         break;
3360
3361                 case VACUUM_ERRCB_PHASE_UNKNOWN:
3362                 default:
3363                         return;                         /* do nothing; the errinfo may not be
3364                                                                  * initialized */
3365         }
3366 }
3367
3368 /*
3369  * Updates the information required for vacuum error callback.  This also saves
3370  * the current information which can be later restored via restore_vacuum_error_info.
3371  */
3372 static void
3373 update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel,
3374                                                  int phase, BlockNumber blkno, OffsetNumber offnum)
3375 {
3376         if (saved_vacrel)
3377         {
3378                 saved_vacrel->offnum = vacrel->offnum;
3379                 saved_vacrel->blkno = vacrel->blkno;
3380                 saved_vacrel->phase = vacrel->phase;
3381         }
3382
3383         vacrel->blkno = blkno;
3384         vacrel->offnum = offnum;
3385         vacrel->phase = phase;
3386 }
3387
3388 /*
3389  * Restores the vacuum information saved via a prior call to update_vacuum_error_info.
3390  */
3391 static void
3392 restore_vacuum_error_info(LVRelState *vacrel,
3393                                                   const LVSavedErrInfo *saved_vacrel)
3394 {
3395         vacrel->blkno = saved_vacrel->blkno;
3396         vacrel->offnum = saved_vacrel->offnum;
3397         vacrel->phase = saved_vacrel->phase;
3398 }