Fix VACUUM VERBOSE's LP_DEAD item pages output.
[pgsql.git] / src / backend / access / heap / vacuumlazy.c
blob ad3feb88b3be994507bcdd1c8f69d0aaa5e4b257
1 /*-------------------------------------------------------------------------
3 * vacuumlazy.c
4 * Concurrent ("lazy") vacuuming.
7 * The major space usage for LAZY VACUUM is storage for the array of dead tuple
8 * TIDs. We want to ensure we can vacuum even the very largest relations with
9 * finite memory space usage. To do that, we set upper bounds on the number of
10 * tuples we will keep track of at once.
12 * We are willing to use at most maintenance_work_mem (or perhaps
13 * autovacuum_work_mem) memory space to keep track of dead tuples. We
14 * initially allocate an array of TIDs of that size, with an upper limit that
15 * depends on table size (this limit ensures we don't allocate a huge area
16 * uselessly for vacuuming small tables). If the array threatens to overflow,
17 * we suspend the heap scan phase and perform a pass of index cleanup and page
18 * compaction, then resume the heap scan with an empty TID array.
20 * If we're processing a table with no indexes, we can just vacuum each page
21 * as we go; there's no need to save up multiple tuples to minimize the number
22 * of index scans performed. So we don't use maintenance_work_mem memory for
23 * the TID array, just enough to hold as many heap tuples as fit on one page.
25 * Lazy vacuum supports parallel execution with parallel worker processes. In
26 * a parallel vacuum, we perform both index vacuum and index cleanup with
27 * parallel worker processes. Individual indexes are processed by one vacuum
28 * process. At the beginning of a lazy vacuum (at lazy_scan_heap) we prepare
29 * the parallel context and initialize the DSM segment that contains shared
30 * information as well as the memory space for storing dead tuples. When
31 * starting either index vacuum or index cleanup, we launch parallel worker
32 * processes. Once all indexes are processed the parallel worker processes
33 * exit. After that, the leader process re-initializes the parallel context
34 * so that it can use the same DSM for multiple passes of index vacuum and
35 * for performing index cleanup.  For updating the index statistics, we
36 * need to update the system table; since such updates are not allowed
37 * while in parallel mode, we update the index statistics only after
38 * exiting from the parallel mode.
40 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
41 * Portions Copyright (c) 1994, Regents of the University of California
44 * IDENTIFICATION
45 * src/backend/access/heap/vacuumlazy.c
47 *-------------------------------------------------------------------------
49 #include "postgres.h"
51 #include <math.h>
53 #include "access/amapi.h"
54 #include "access/genam.h"
55 #include "access/heapam.h"
56 #include "access/heapam_xlog.h"
57 #include "access/htup_details.h"
58 #include "access/multixact.h"
59 #include "access/parallel.h"
60 #include "access/transam.h"
61 #include "access/visibilitymap.h"
62 #include "access/xact.h"
63 #include "access/xlog.h"
64 #include "catalog/index.h"
65 #include "catalog/storage.h"
66 #include "commands/dbcommands.h"
67 #include "commands/progress.h"
68 #include "commands/vacuum.h"
69 #include "executor/instrument.h"
70 #include "miscadmin.h"
71 #include "optimizer/paths.h"
72 #include "pgstat.h"
73 #include "portability/instr_time.h"
74 #include "postmaster/autovacuum.h"
75 #include "storage/bufmgr.h"
76 #include "storage/freespace.h"
77 #include "storage/lmgr.h"
78 #include "tcop/tcopprot.h"
79 #include "utils/lsyscache.h"
80 #include "utils/memutils.h"
81 #include "utils/pg_rusage.h"
82 #include "utils/timestamp.h"
86 * Space/time tradeoff parameters: do these need to be user-tunable?
88 * To consider truncating the relation, we want there to be at least
89 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
90 * is less) potentially-freeable pages.
92 #define REL_TRUNCATE_MINIMUM 1000
93 #define REL_TRUNCATE_FRACTION 16
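/*
 * As an illustration: for a 64,000-page table the cutoff is
 * Min(REL_TRUNCATE_MINIMUM, 64000 / REL_TRUNCATE_FRACTION) = 1000 pages,
 * while for an 8,000-page table it is 8000 / 16 = 500 pages.  Smaller
 * tables therefore need proportionally fewer freeable tail pages before
 * should_attempt_truncation() considers truncation worthwhile.
 */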
96 * Timing parameters for truncate locking heuristics.
98 * These were not exposed as user tunable GUC values because it didn't seem
99 * that the potential for improvement was great enough to merit the cost of
100 * supporting them.
102 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
103 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
104 #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
107 * Threshold that controls whether we bypass index vacuuming and heap
108 * vacuuming as an optimization
110 #define BYPASS_THRESHOLD_PAGES 0.02 /* i.e. 2% of rel_pages */
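/*
 * For example, with rel_pages = 100,000 the bypass optimization can only
 * apply while LP_DEAD items have been found on fewer than 2,000 pages (2%
 * of the table); lazy_vacuum() applies further conditions on top of this
 * threshold.
 */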
113 * Perform a failsafe check every 4GB during the heap scan, approximately
115 #define FAILSAFE_EVERY_PAGES \
116 ((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ))
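/* With the default 8 kB BLCKSZ this is 4 GB / 8 kB = 524,288 heap blocks. */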
119 * When a table has no indexes, vacuum the FSM after every 8GB, approximately
120 * (it won't be exact because we only vacuum FSM after processing a heap page
121 * that has some removable tuples). When there are indexes, this is ignored,
122 * and we vacuum FSM after each index/heap cleaning pass.
124 #define VACUUM_FSM_EVERY_PAGES \
125 ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
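/* Likewise, 8 GB / 8 kB = 1,048,576 heap blocks at the default BLCKSZ. */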
128 * Guesstimation of number of dead tuples per page. This is used to
129 * provide an upper limit to memory allocated when vacuuming small
130 * tables.
132 #define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage
135 * Before we consider skipping a page that's marked as clean in the
136 * visibility map, we must've seen at least this many clean pages.
138 #define SKIP_PAGES_THRESHOLD ((BlockNumber) 32)
141 * Size of the prefetch window for lazy vacuum backwards truncation scan.
142 * Needs to be a power of 2.
144 #define PREFETCH_SIZE ((BlockNumber) 32)
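/*
 * The power-of-2 requirement lets the truncation scan align its prefetch
 * window by masking low-order bits of a block number (along the lines of
 * blkno & ~(PREFETCH_SIZE - 1)) rather than dividing; see
 * count_nondeletable_pages().
 */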
147 * DSM keys for parallel vacuum. Unlike other parallel execution code, since
148 * we don't need to worry about DSM keys conflicting with plan_node_id we can
149 * use small integers.
151 #define PARALLEL_VACUUM_KEY_SHARED 1
152 #define PARALLEL_VACUUM_KEY_DEAD_TUPLES 2
153 #define PARALLEL_VACUUM_KEY_QUERY_TEXT 3
154 #define PARALLEL_VACUUM_KEY_BUFFER_USAGE 4
155 #define PARALLEL_VACUUM_KEY_WAL_USAGE 5
158 * Macro to check if we are in a parallel vacuum. If true, we are in the
159 * parallel mode and the DSM segment is initialized.
161 #define ParallelVacuumIsActive(vacrel) ((vacrel)->lps != NULL)
163 /* Phases of vacuum during which we report error context. */
164 typedef enum
166 VACUUM_ERRCB_PHASE_UNKNOWN,
167 VACUUM_ERRCB_PHASE_SCAN_HEAP,
168 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
169 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
170 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
171 VACUUM_ERRCB_PHASE_TRUNCATE
172 } VacErrPhase;
175 * LVDeadTuples stores the dead tuple TIDs collected during the heap scan.
176 * This is allocated in the DSM segment in parallel mode and in local memory
177 * in non-parallel mode.
179 typedef struct LVDeadTuples
181 int max_tuples; /* # slots allocated in array */
182 int num_tuples; /* current # of entries */
183 /* List of TIDs of tuples we intend to delete */
184 /* NB: this list is ordered by TID address */
185 ItemPointerData itemptrs[FLEXIBLE_ARRAY_MEMBER]; /* array of
186 * ItemPointerData */
187 } LVDeadTuples;
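/*
 * Because itemptrs is kept in TID order, lazy_tid_reaped() -- the callback
 * handed to the index AMs during index bulk deletion -- can test whether an
 * index entry points at a dead tuple with a binary search of this array
 * (see lazy_tid_reaped() and vac_cmp_itemptr() below).  Each ItemPointerData
 * is 6 bytes, which is what the SizeOfDeadTuples/MAXDEADTUPLES arithmetic
 * below is based on.
 */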
189 /* The dead tuple space consists of LVDeadTuples and dead tuple TIDs */
190 #define SizeOfDeadTuples(cnt) \
191 add_size(offsetof(LVDeadTuples, itemptrs), \
192 mul_size(sizeof(ItemPointerData), cnt))
193 #define MAXDEADTUPLES(max_size) \
194 (((max_size) - offsetof(LVDeadTuples, itemptrs)) / sizeof(ItemPointerData))
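/*
 * As a rough illustration (assuming the usual 6-byte ItemPointerData and
 * 8 bytes of struct header): with maintenance_work_mem set to 64MB,
 * MAXDEADTUPLES(64 * 1024 * 1024) comes out to about 11.2 million TIDs per
 * index-vacuuming cycle.  compute_max_dead_tuples() further clamps the
 * allocation for small tables, using LAZY_ALLOC_TUPLES per heap page.
 */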
197 * Shared information among parallel workers. So this is allocated in the DSM
198 * segment.
200 typedef struct LVShared
203 * Target table relid and log level. These fields are not modified during
204 * the lazy vacuum.
206 Oid relid;
207 int elevel;
210 * An indication for vacuum workers to perform either index vacuum or
211 * index cleanup.  first_time is true only if for_cleanup is true and
212 * bulk-deletion has not been performed yet.
214 bool for_cleanup;
215 bool first_time;
218 * Fields for both index vacuum and cleanup.
220 * reltuples is the total number of input heap tuples.  It is set to the
221 * old live tuple count in the index vacuum case, or to the new live
222 * tuple count in the index cleanup case.
224 * estimated_count is true if reltuples is an estimated value. (Note that
225 * reltuples could be -1 in this case, indicating we have no idea.)
227 double reltuples;
228 bool estimated_count;
231 * In a single-process lazy vacuum, we could consume more memory during
232 * index vacuuming or cleanup than the heap scan itself needs.  In a
233 * parallel vacuum, individual vacuum workers could each consume memory
234 * equal to maintenance_work_mem, so the maintenance_work_mem for each
235 * worker is set such that the parallel operation doesn't consume more
236 * memory than a single-process lazy vacuum does.
238 int maintenance_work_mem_worker;
241 * Shared vacuum cost balance. During parallel vacuum,
242 * VacuumSharedCostBalance points to this value and it accumulates the
243 * balance of each parallel vacuum worker.
245 pg_atomic_uint32 cost_balance;
248 * Number of active parallel workers. This is used for computing the
249 * minimum threshold of the vacuum cost balance before a worker sleeps for
250 * cost-based delay.
252 pg_atomic_uint32 active_nworkers;
255 * Variables to control parallel vacuum.  We have a bitmap to indicate
256 * which indexes have stats in shared memory: a set bit means that the
257 * particular index supports parallel vacuum and has an LVSharedIndStats
258 * slot in the area that follows this struct.
259 pg_atomic_uint32 idx; /* counter for vacuuming and clean up */
260 uint32 offset; /* sizeof header incl. bitmap */
261 bits8 bitmap[FLEXIBLE_ARRAY_MEMBER]; /* bit map of NULLs */
263 /* Shared index statistics data follows at end of struct */
264 } LVShared;
266 #define SizeOfLVShared (offsetof(LVShared, bitmap) + sizeof(bits8))
267 #define GetSharedIndStats(s) \
268 ((LVSharedIndStats *)((char *)(s) + ((LVShared *)(s))->offset))
269 #define IndStatsIsNull(s, i) \
270 (!(((LVShared *)(s))->bitmap[(i) >> 3] & (1 << ((i) & 0x07))))
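/*
 * For instance, the bit for index number 10 lives in bitmap[1] (10 >> 3),
 * at bit position 2 (10 & 0x07).  IndStatsIsNull() reports true when that
 * bit is clear, i.e. when no LVSharedIndStats slot was set up for the index.
 */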
273 * Struct for an index bulk-deletion statistic used for parallel vacuum. This
274 * is allocated in the DSM segment.
276 typedef struct LVSharedIndStats
278 bool updated; /* are the stats updated? */
279 IndexBulkDeleteResult istat;
280 } LVSharedIndStats;
282 /* Struct for maintaining a parallel vacuum state. */
283 typedef struct LVParallelState
285 ParallelContext *pcxt;
287 /* Shared information among parallel vacuum workers */
288 LVShared *lvshared;
290 /* Points to buffer usage area in DSM */
291 BufferUsage *buffer_usage;
293 /* Points to WAL usage area in DSM */
294 WalUsage *wal_usage;
297 * The number of indexes that support parallel index bulk-deletion,
298 * parallel index cleanup, and conditional parallel index cleanup,
299 * respectively.
300 int nindexes_parallel_bulkdel;
301 int nindexes_parallel_cleanup;
302 int nindexes_parallel_condcleanup;
303 } LVParallelState;
305 typedef struct LVRelState
307 /* Target heap relation and its indexes */
308 Relation rel;
309 Relation *indrels;
310 int nindexes;
311 /* Do index vacuuming/cleanup? */
312 bool do_index_vacuuming;
313 bool do_index_cleanup;
314 /* Wraparound failsafe in effect? (implies !do_index_vacuuming) */
315 bool do_failsafe;
317 /* Buffer access strategy and parallel state */
318 BufferAccessStrategy bstrategy;
319 LVParallelState *lps;
321 /* Statistics from pg_class when we start out */
322 BlockNumber old_rel_pages; /* previous value of pg_class.relpages */
323 double old_live_tuples; /* previous value of pg_class.reltuples */
324 /* rel's initial relfrozenxid and relminmxid */
325 TransactionId relfrozenxid;
326 MultiXactId relminmxid;
328 /* VACUUM operation's cutoff for pruning */
329 TransactionId OldestXmin;
330 /* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */
331 TransactionId FreezeLimit;
332 MultiXactId MultiXactCutoff;
334 /* Error reporting state */
335 char *relnamespace;
336 char *relname;
337 char *indname;
338 BlockNumber blkno; /* used only for heap operations */
339 OffsetNumber offnum; /* used only for heap operations */
340 VacErrPhase phase;
343 * State managed by lazy_scan_heap() follows
345 LVDeadTuples *dead_tuples; /* items to vacuum from indexes */
346 BlockNumber rel_pages; /* total number of pages */
347 BlockNumber scanned_pages; /* number of pages we examined */
348 BlockNumber pinskipped_pages; /* # of pages skipped due to a pin */
349 BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */
350 BlockNumber tupcount_pages; /* pages whose tuples we counted */
351 BlockNumber pages_removed; /* pages removed by truncation */
352 BlockNumber lpdead_item_pages; /* # pages with LP_DEAD items */
353 BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
354 bool lock_waiter_detected;
356 /* Statistics output by us, for table */
357 double new_rel_tuples; /* new estimated total # of tuples */
358 double new_live_tuples; /* new estimated total # of live tuples */
359 /* Statistics output by index AMs */
360 IndexBulkDeleteResult **indstats;
362 /* Instrumentation counters */
363 int num_index_scans;
364 int64 tuples_deleted; /* # deleted from table */
365 int64 lpdead_items; /* # deleted from indexes */
366 int64 new_dead_tuples; /* new estimated total # of dead items in
367 * table */
368 int64 num_tuples; /* total number of nonremovable tuples */
369 int64 live_tuples; /* live tuples (reltuples estimate) */
370 } LVRelState;
373 * State returned by lazy_scan_prune()
375 typedef struct LVPagePruneState
377 bool hastup; /* Page prevents relation truncation? */
378 bool has_lpdead_items; /* includes existing LP_DEAD items */
381 * State describes the proper VM bit states to set for the page following
382 * pruning and freezing. all_visible implies !has_lpdead_items, but don't
383 * trust all_frozen result unless all_visible is also set to true.
385 bool all_visible; /* Every item visible to all? */
386 bool all_frozen; /* provided all_visible is also true */
387 TransactionId visibility_cutoff_xid; /* For recovery conflicts */
388 } LVPagePruneState;
390 /* Struct for saving and restoring vacuum error information. */
391 typedef struct LVSavedErrInfo
393 BlockNumber blkno;
394 OffsetNumber offnum;
395 VacErrPhase phase;
396 } LVSavedErrInfo;
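/*
 * Sketch of the intended save/restore pattern (not copied verbatim from any
 * single caller): a phase that is entered from the middle of another phase
 * saves the current error info, advertises the new phase, and restores the
 * old info once it is done.  For example:
 *
 *		LVSavedErrInfo saved_err_info;
 *
 *		update_vacuum_error_info(vacrel, &saved_err_info,
 *								 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
 *								 InvalidBlockNumber, InvalidOffsetNumber);
 *		...perform the index vacuuming...
 *		restore_vacuum_error_info(vacrel, &saved_err_info);
 */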
398 /* elevel controls whole VACUUM's verbosity */
399 static int elevel = -1;
402 /* non-export function prototypes */
403 static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params,
404 bool aggressive);
405 static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
406 BlockNumber blkno, Page page,
407 GlobalVisState *vistest,
408 LVPagePruneState *prunestate);
409 static void lazy_vacuum(LVRelState *vacrel, bool onecall);
410 static bool lazy_vacuum_all_indexes(LVRelState *vacrel);
411 static void lazy_vacuum_heap_rel(LVRelState *vacrel);
412 static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
413 Buffer buffer, int tupindex, Buffer *vmbuffer);
414 static bool lazy_check_needs_freeze(Buffer buf, bool *hastup,
415 LVRelState *vacrel);
416 static bool lazy_check_wraparound_failsafe(LVRelState *vacrel);
417 static void do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel);
418 static void do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel);
419 static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers);
420 static void do_parallel_processing(LVRelState *vacrel,
421 LVShared *lvshared);
422 static void do_serial_processing_for_unsafe_indexes(LVRelState *vacrel,
423 LVShared *lvshared);
424 static IndexBulkDeleteResult *parallel_process_one_index(Relation indrel,
425 IndexBulkDeleteResult *istat,
426 LVShared *lvshared,
427 LVSharedIndStats *shared_indstats,
428 LVRelState *vacrel);
429 static void lazy_cleanup_all_indexes(LVRelState *vacrel);
430 static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
431 IndexBulkDeleteResult *istat,
432 double reltuples,
433 LVRelState *vacrel);
434 static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel,
435 IndexBulkDeleteResult *istat,
436 double reltuples,
437 bool estimated_count,
438 LVRelState *vacrel);
439 static bool should_attempt_truncation(LVRelState *vacrel,
440 VacuumParams *params);
441 static void lazy_truncate_heap(LVRelState *vacrel);
442 static BlockNumber count_nondeletable_pages(LVRelState *vacrel);
443 static long compute_max_dead_tuples(BlockNumber relblocks, bool hasindex);
444 static void lazy_space_alloc(LVRelState *vacrel, int nworkers,
445 BlockNumber relblocks);
446 static void lazy_space_free(LVRelState *vacrel);
447 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
448 static int vac_cmp_itemptr(const void *left, const void *right);
449 static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
450 TransactionId *visibility_cutoff_xid, bool *all_frozen);
451 static int compute_parallel_vacuum_workers(LVRelState *vacrel,
452 int nrequested,
453 bool *can_parallel_vacuum);
454 static void update_index_statistics(LVRelState *vacrel);
455 static LVParallelState *begin_parallel_vacuum(LVRelState *vacrel,
456 BlockNumber nblocks,
457 int nrequested);
458 static void end_parallel_vacuum(LVRelState *vacrel);
459 static LVSharedIndStats *parallel_stats_for_idx(LVShared *lvshared, int getidx);
460 static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared);
461 static void vacuum_error_callback(void *arg);
462 static void update_vacuum_error_info(LVRelState *vacrel,
463 LVSavedErrInfo *saved_vacrel,
464 int phase, BlockNumber blkno,
465 OffsetNumber offnum);
466 static void restore_vacuum_error_info(LVRelState *vacrel,
467 const LVSavedErrInfo *saved_vacrel);
471 * heap_vacuum_rel() -- perform VACUUM for one heap relation
473 * This routine vacuums a single heap, cleans out its indexes, and
474 * updates its relpages and reltuples statistics.
476 * At entry, we have already established a transaction and opened
477 * and locked the relation.
479 void
480 heap_vacuum_rel(Relation rel, VacuumParams *params,
481 BufferAccessStrategy bstrategy)
483 LVRelState *vacrel;
484 PGRUsage ru0;
485 TimestampTz starttime = 0;
486 WalUsage walusage_start = pgWalUsage;
487 WalUsage walusage = {0, 0, 0};
488 long secs;
489 int usecs;
490 double read_rate,
491 write_rate;
492 bool aggressive; /* should we scan all unfrozen pages? */
493 bool scanned_all_unfrozen; /* actually scanned all such pages? */
494 char **indnames = NULL;
495 TransactionId xidFullScanLimit;
496 MultiXactId mxactFullScanLimit;
497 BlockNumber new_rel_pages;
498 BlockNumber new_rel_allvisible;
499 double new_live_tuples;
500 TransactionId new_frozen_xid;
501 MultiXactId new_min_multi;
502 ErrorContextCallback errcallback;
503 PgStat_Counter startreadtime = 0;
504 PgStat_Counter startwritetime = 0;
505 TransactionId OldestXmin;
506 TransactionId FreezeLimit;
507 MultiXactId MultiXactCutoff;
509 Assert(params != NULL);
510 Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT);
511 Assert(params->truncate != VACOPT_TERNARY_DEFAULT);
513 /* measure elapsed time iff autovacuum logging requires it */
514 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
516 pg_rusage_init(&ru0);
517 starttime = GetCurrentTimestamp();
518 if (track_io_timing)
520 startreadtime = pgStatBlockReadTime;
521 startwritetime = pgStatBlockWriteTime;
525 if (params->options & VACOPT_VERBOSE)
526 elevel = INFO;
527 else
528 elevel = DEBUG2;
530 pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
531 RelationGetRelid(rel));
533 vacuum_set_xid_limits(rel,
534 params->freeze_min_age,
535 params->freeze_table_age,
536 params->multixact_freeze_min_age,
537 params->multixact_freeze_table_age,
538 &OldestXmin, &FreezeLimit, &xidFullScanLimit,
539 &MultiXactCutoff, &mxactFullScanLimit);
542 * We request an aggressive scan if the table's frozen Xid is now older
543 * than or equal to the requested Xid full-table scan limit; or if the
544 * table's minimum MultiXactId is older than or equal to the requested
545 * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
547 aggressive = TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid,
548 xidFullScanLimit);
549 aggressive |= MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid,
550 mxactFullScanLimit);
551 if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
552 aggressive = true;
554 vacrel = (LVRelState *) palloc0(sizeof(LVRelState));
556 /* Set up high level stuff about rel */
557 vacrel->rel = rel;
558 vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes,
559 &vacrel->indrels);
560 vacrel->do_index_vacuuming = true;
561 vacrel->do_index_cleanup = true;
562 vacrel->do_failsafe = false;
563 if (params->index_cleanup == VACOPT_TERNARY_DISABLED)
565 vacrel->do_index_vacuuming = false;
566 vacrel->do_index_cleanup = false;
568 vacrel->bstrategy = bstrategy;
569 vacrel->old_rel_pages = rel->rd_rel->relpages;
570 vacrel->old_live_tuples = rel->rd_rel->reltuples;
571 vacrel->relfrozenxid = rel->rd_rel->relfrozenxid;
572 vacrel->relminmxid = rel->rd_rel->relminmxid;
574 /* Set cutoffs for entire VACUUM */
575 vacrel->OldestXmin = OldestXmin;
576 vacrel->FreezeLimit = FreezeLimit;
577 vacrel->MultiXactCutoff = MultiXactCutoff;
579 vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel));
580 vacrel->relname = pstrdup(RelationGetRelationName(rel));
581 vacrel->indname = NULL;
582 vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN;
584 /* Save index names iff autovacuum logging requires it */
585 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0 &&
586 vacrel->nindexes > 0)
588 indnames = palloc(sizeof(char *) * vacrel->nindexes);
589 for (int i = 0; i < vacrel->nindexes; i++)
590 indnames[i] =
591 pstrdup(RelationGetRelationName(vacrel->indrels[i]));
595 * Setup error traceback support for ereport(). The idea is to set up an
596 * error context callback to display additional information on any error
597 * during a vacuum. During different phases of vacuum (heap scan, heap
598 * vacuum, index vacuum, index clean up, heap truncate), we update the
599 * error context callback to display appropriate information.
601 * Note that the index vacuum and heap vacuum phases may be called
602 * multiple times in the middle of the heap scan phase. So the old phase
603 * information is restored at the end of those phases.
605 errcallback.callback = vacuum_error_callback;
606 errcallback.arg = vacrel;
607 errcallback.previous = error_context_stack;
608 error_context_stack = &errcallback;
610 /* Do the vacuuming */
611 lazy_scan_heap(vacrel, params, aggressive);
613 /* Done with indexes */
614 vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock);
617 * Compute whether we actually scanned all of the unfrozen pages. If we did,
618 * we can adjust relfrozenxid and relminmxid.
620 * NB: We need to check this before truncating the relation, because that
621 * will change ->rel_pages.
623 if ((vacrel->scanned_pages + vacrel->frozenskipped_pages)
624 < vacrel->rel_pages)
626 Assert(!aggressive);
627 scanned_all_unfrozen = false;
629 else
630 scanned_all_unfrozen = true;
633 * Optionally truncate the relation.
635 if (should_attempt_truncation(vacrel, params))
638 * Update error traceback information. This is the last phase during
639 * which we add context information to errors, so we don't need to
640 * revert to the previous phase.
642 update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE,
643 vacrel->nonempty_pages,
644 InvalidOffsetNumber);
645 lazy_truncate_heap(vacrel);
648 /* Pop the error context stack */
649 error_context_stack = errcallback.previous;
651 /* Report that we are now doing final cleanup */
652 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
653 PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
656 * Update statistics in pg_class.
658 * In principle new_live_tuples could be -1 indicating that we (still)
659 * don't know the tuple count. In practice that probably can't happen,
660 * since we'd surely have scanned some pages if the table is new and
661 * nonempty.
663 * For safety, clamp relallvisible to be not more than what we're setting
664 * relpages to.
666 * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
667 * since then we don't know for certain that all tuples have a newer xmin.
669 new_rel_pages = vacrel->rel_pages;
670 new_live_tuples = vacrel->new_live_tuples;
672 visibilitymap_count(rel, &new_rel_allvisible, NULL);
673 if (new_rel_allvisible > new_rel_pages)
674 new_rel_allvisible = new_rel_pages;
676 new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
677 new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;
679 vac_update_relstats(rel,
680 new_rel_pages,
681 new_live_tuples,
682 new_rel_allvisible,
683 vacrel->nindexes > 0,
684 new_frozen_xid,
685 new_min_multi,
686 false);
689 * Report results to the stats collector, too.
691 * Deliberately avoid telling the stats collector about LP_DEAD items that
692 * remain in the table due to VACUUM bypassing index and heap vacuuming.
693 * ANALYZE will consider the remaining LP_DEAD items to be dead tuples. It
694 * seems like a good idea to err on the side of not vacuuming again too
695 * soon in cases where the failsafe prevented significant amounts of heap
696 * vacuuming.
698 pgstat_report_vacuum(RelationGetRelid(rel),
699 rel->rd_rel->relisshared,
700 Max(new_live_tuples, 0),
701 vacrel->new_dead_tuples);
702 pgstat_progress_end_command();
704 /* and log the action if appropriate */
705 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
707 TimestampTz endtime = GetCurrentTimestamp();
709 if (params->log_min_duration == 0 ||
710 TimestampDifferenceExceeds(starttime, endtime,
711 params->log_min_duration))
713 StringInfoData buf;
714 char *msgfmt;
716 TimestampDifference(starttime, endtime, &secs, &usecs);
718 memset(&walusage, 0, sizeof(WalUsage));
719 WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
721 read_rate = 0;
722 write_rate = 0;
723 if ((secs > 0) || (usecs > 0))
725 read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
726 (secs + usecs / 1000000.0);
727 write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
728 (secs + usecs / 1000000.0);
732 * This is pretty messy, but we split it up so that we can skip
733 * emitting individual parts of the message when not applicable.
735 initStringInfo(&buf);
736 if (params->is_wraparound)
739 * While it's possible for a VACUUM to be both is_wraparound
740 * and !aggressive, that's just a corner-case -- is_wraparound
741 * implies aggressive. Produce distinct output for the corner
742 * case all the same, just in case.
744 if (aggressive)
745 msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
746 else
747 msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
749 else
751 if (aggressive)
752 msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
753 else
754 msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
756 appendStringInfo(&buf, msgfmt,
757 get_database_name(MyDatabaseId),
758 vacrel->relnamespace,
759 vacrel->relname,
760 vacrel->num_index_scans);
761 appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
762 vacrel->pages_removed,
763 vacrel->rel_pages,
764 vacrel->pinskipped_pages,
765 vacrel->frozenskipped_pages);
766 appendStringInfo(&buf,
767 _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"),
768 (long long) vacrel->tuples_deleted,
769 (long long) vacrel->new_rel_tuples,
770 (long long) vacrel->new_dead_tuples,
771 OldestXmin);
772 appendStringInfo(&buf,
773 _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
774 (long long) VacuumPageHit,
775 (long long) VacuumPageMiss,
776 (long long) VacuumPageDirty);
777 if (vacrel->rel_pages > 0)
779 BlockNumber orig_rel_pages;
781 if (vacrel->do_index_vacuuming)
783 msgfmt = _(" %u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n");
785 if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0)
786 appendStringInfo(&buf, _("index scan not needed:"));
787 else
788 appendStringInfo(&buf, _("index scan needed:"));
790 else
792 msgfmt = _(" %u pages from table (%.2f%% of total) have %lld dead item identifiers\n");
794 if (!vacrel->do_failsafe)
795 appendStringInfo(&buf, _("index scan bypassed:"));
796 else
797 appendStringInfo(&buf, _("index scan bypassed by failsafe:"));
799 orig_rel_pages = vacrel->rel_pages + vacrel->pages_removed;
800 appendStringInfo(&buf, msgfmt,
801 vacrel->lpdead_item_pages,
802 100.0 * vacrel->lpdead_item_pages / orig_rel_pages,
803 (long long) vacrel->lpdead_items);
805 for (int i = 0; i < vacrel->nindexes; i++)
807 IndexBulkDeleteResult *istat = vacrel->indstats[i];
809 if (!istat)
810 continue;
812 appendStringInfo(&buf,
813 _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
814 indnames[i],
815 istat->num_pages,
816 istat->pages_newly_deleted,
817 istat->pages_deleted,
818 istat->pages_free);
820 appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
821 read_rate, write_rate);
822 if (track_io_timing)
824 appendStringInfoString(&buf, _("I/O Timings:"));
825 if (pgStatBlockReadTime - startreadtime > 0)
826 appendStringInfo(&buf, _(" read=%.3f"),
827 (double) (pgStatBlockReadTime - startreadtime) / 1000);
828 if (pgStatBlockWriteTime - startwritetime > 0)
829 appendStringInfo(&buf, _(" write=%.3f"),
830 (double) (pgStatBlockWriteTime - startwritetime) / 1000);
831 appendStringInfoChar(&buf, '\n');
833 appendStringInfo(&buf, _("system usage: %s\n"), pg_rusage_show(&ru0));
834 appendStringInfo(&buf,
835 _("WAL usage: %lld records, %lld full page images, %llu bytes"),
836 (long long) walusage.wal_records,
837 (long long) walusage.wal_fpi,
838 (unsigned long long) walusage.wal_bytes);
840 ereport(LOG,
841 (errmsg_internal("%s", buf.data)));
842 pfree(buf.data);
846 /* Cleanup index statistics and index names */
847 for (int i = 0; i < vacrel->nindexes; i++)
849 if (vacrel->indstats[i])
850 pfree(vacrel->indstats[i]);
852 if (indnames && indnames[i])
853 pfree(indnames[i]);
858 * lazy_scan_heap() -- scan an open heap relation
860 * This routine prunes each page in the heap, which will among other
861 * things truncate dead tuples to dead line pointers, defragment the
862 * page, and set commit status bits (see heap_page_prune). It also builds
863 * lists of dead tuples and pages with free space, calculates statistics
864 * on the number of live tuples in the heap, and marks pages as
865 * all-visible if appropriate. When done, or when we run low on space
866 * for dead-tuple TIDs, invoke lazy_vacuum to vacuum indexes and vacuum
867 * heap relation during its own second pass over the heap.
869 * If the table has at least two indexes, we execute both index vacuum
870 * and index cleanup with parallel workers unless parallel vacuum is
871 * disabled. In a parallel vacuum, we enter parallel mode and then
872 * create both the parallel context and the DSM segment before starting
873 * heap scan so that we can record dead tuples to the DSM segment. All
874 * parallel workers are launched at the beginning of index vacuuming and
875 * index cleanup and they exit once done with all indexes. At the end of
876 * this function we exit from parallel mode. Index bulk-deletion results
877 * are stored in the DSM segment and we update index statistics for all
878 * the indexes after exiting from parallel mode since writes are not
879 * allowed during parallel mode.
881 * If there are no indexes then we can reclaim line pointers on the fly;
882 * dead line pointers need only be retained until all index pointers that
883 * reference them have been killed.
885 static void
886 lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
888 LVDeadTuples *dead_tuples;
889 BlockNumber nblocks,
890 blkno,
891 next_unskippable_block,
892 next_failsafe_block,
893 next_fsm_block_to_vacuum;
894 PGRUsage ru0;
895 Buffer vmbuffer = InvalidBuffer;
896 bool skipping_blocks,
897 have_vacuumed_indexes = false;
898 StringInfoData buf;
899 const int initprog_index[] = {
900 PROGRESS_VACUUM_PHASE,
901 PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
902 PROGRESS_VACUUM_MAX_DEAD_TUPLES
904 int64 initprog_val[3];
905 GlobalVisState *vistest;
907 pg_rusage_init(&ru0);
909 if (aggressive)
910 ereport(elevel,
911 (errmsg("aggressively vacuuming \"%s.%s\"",
912 vacrel->relnamespace,
913 vacrel->relname)));
914 else
915 ereport(elevel,
916 (errmsg("vacuuming \"%s.%s\"",
917 vacrel->relnamespace,
918 vacrel->relname)));
920 nblocks = RelationGetNumberOfBlocks(vacrel->rel);
921 next_unskippable_block = 0;
922 next_failsafe_block = 0;
923 next_fsm_block_to_vacuum = 0;
924 vacrel->rel_pages = nblocks;
925 vacrel->scanned_pages = 0;
926 vacrel->pinskipped_pages = 0;
927 vacrel->frozenskipped_pages = 0;
928 vacrel->tupcount_pages = 0;
929 vacrel->pages_removed = 0;
930 vacrel->lpdead_item_pages = 0;
931 vacrel->nonempty_pages = 0;
932 vacrel->lock_waiter_detected = false;
934 /* Initialize instrumentation counters */
935 vacrel->num_index_scans = 0;
936 vacrel->tuples_deleted = 0;
937 vacrel->lpdead_items = 0;
938 vacrel->new_dead_tuples = 0;
939 vacrel->num_tuples = 0;
940 vacrel->live_tuples = 0;
942 vistest = GlobalVisTestFor(vacrel->rel);
944 vacrel->indstats = (IndexBulkDeleteResult **)
945 palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *));
948 * Before beginning the scan, check if it's already necessary to apply
949 * the failsafe
951 lazy_check_wraparound_failsafe(vacrel);
954 * Allocate the space for dead tuples. Note that this handles parallel
955 * VACUUM initialization as part of allocating shared memory space used
956 * for dead_tuples.
958 lazy_space_alloc(vacrel, params->nworkers, nblocks);
959 dead_tuples = vacrel->dead_tuples;
961 /* Report that we're scanning the heap, advertising total # of blocks */
962 initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
963 initprog_val[1] = nblocks;
964 initprog_val[2] = dead_tuples->max_tuples;
965 pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
968 * Except when aggressive is set, we want to skip pages that are
969 * all-visible according to the visibility map, but only when we can skip
970 * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading
971 * sequentially, the OS should be doing readahead for us, so there's no
972 * gain in skipping a page now and then; that's likely to disable
973 * readahead and so be counterproductive. Also, skipping even a single
974 * page means that we can't update relfrozenxid, so we only want to do it
975 * if we can skip a goodly number of pages.
977 * When aggressive is set, we can't skip pages just because they are
978 * all-visible, but we can still skip pages that are all-frozen, since
979 * such pages do not need freezing and do not affect the value that we can
980 * safely set for relfrozenxid or relminmxid.
982 * Before entering the main loop, establish the invariant that
983 * next_unskippable_block is the next block number >= blkno that we can't
984 * skip based on the visibility map, either all-visible for a regular scan
985 * or all-frozen for an aggressive scan. We set it to nblocks if there's
986 * no such block. We also set up the skipping_blocks flag correctly at
987 * this stage.
989 * Note: The value returned by visibilitymap_get_status could be slightly
990 * out-of-date, since we make this test before reading the corresponding
991 * heap page or locking the buffer. This is OK. If we mistakenly think
992 * that the page is all-visible or all-frozen when in fact the flag's just
993 * been cleared, we might fail to vacuum the page. It's easy to see that
994 * skipping a page when aggressive is not set is not a very big deal; we
995 * might leave some dead tuples lying around, but the next vacuum will
996 * find them. But even when aggressive *is* set, it's still OK if we miss
997 * a page whose all-frozen marking has just been cleared. Any new XIDs
998 * just added to that page are necessarily newer than the GlobalXmin we
999 * computed, so they'll have no effect on the value to which we can safely
1000 * set relfrozenxid. A similar argument applies for MXIDs and relminmxid.
1002 * We will scan the table's last page, at least to the extent of
1003 * determining whether it has tuples or not, even if it should be skipped
1004 * according to the above rules; except when we've already determined that
1005 * it's not worth trying to truncate the table. This avoids having
1006 * lazy_truncate_heap() take access-exclusive lock on the table to attempt
1007 * a truncation that just fails immediately because there are tuples in
1008 * the last page. This is worth avoiding mainly because such a lock must
1009 * be replayed on any hot standby, where it can be disruptive.
1011 if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
1013 while (next_unskippable_block < nblocks)
1015 uint8 vmstatus;
1017 vmstatus = visibilitymap_get_status(vacrel->rel,
1018 next_unskippable_block,
1019 &vmbuffer);
1020 if (aggressive)
1022 if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
1023 break;
1025 else
1027 if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
1028 break;
1030 vacuum_delay_point();
1031 next_unskippable_block++;
1035 if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
1036 skipping_blocks = true;
1037 else
1038 skipping_blocks = false;
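/*
 * As an example, if the first 40 heap pages are all-visible in the VM
 * (and this isn't an aggressive vacuum), the loop above stops with
 * next_unskippable_block = 40.  Since 40 >= SKIP_PAGES_THRESHOLD, we set
 * skipping_blocks, and the main loop below can skip blocks 0..39 outright
 * (subject to the forced check of the table's last page).
 */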
1040 for (blkno = 0; blkno < nblocks; blkno++)
1042 Buffer buf;
1043 Page page;
1044 bool all_visible_according_to_vm = false;
1045 LVPagePruneState prunestate;
1048 * Consider need to skip blocks. See note above about forcing
1049 * scanning of last page.
1051 #define FORCE_CHECK_PAGE() \
1052 (blkno == nblocks - 1 && should_attempt_truncation(vacrel, params))
1054 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1056 update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
1057 blkno, InvalidOffsetNumber);
1059 if (blkno == next_unskippable_block)
1061 /* Time to advance next_unskippable_block */
1062 next_unskippable_block++;
1063 if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
1065 while (next_unskippable_block < nblocks)
1067 uint8 vmskipflags;
1069 vmskipflags = visibilitymap_get_status(vacrel->rel,
1070 next_unskippable_block,
1071 &vmbuffer);
1072 if (aggressive)
1074 if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
1075 break;
1077 else
1079 if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
1080 break;
1082 vacuum_delay_point();
1083 next_unskippable_block++;
1088 * We know we can't skip the current block. But set up
1089 * skipping_blocks to do the right thing at the following blocks.
1091 if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
1092 skipping_blocks = true;
1093 else
1094 skipping_blocks = false;
1097 * Normally, the fact that we can't skip this block must mean that
1098 * it's not all-visible. But in an aggressive vacuum we know only
1099 * that it's not all-frozen, so it might still be all-visible.
1101 if (aggressive && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1102 all_visible_according_to_vm = true;
1104 else
1107 * The current block is potentially skippable; if we've seen a
1108 * long enough run of skippable blocks to justify skipping it, and
1109 * we're not forced to check it, then go ahead and skip.
1110 * Otherwise, the page must be at least all-visible if not
1111 * all-frozen, so we can set all_visible_according_to_vm = true.
1113 if (skipping_blocks && !FORCE_CHECK_PAGE())
1116 * Tricky, tricky. If this is in aggressive vacuum, the page
1117 * must have been all-frozen at the time we checked whether it
1118 * was skippable, but it might not be any more. We must be
1119 * careful to count it as a skipped all-frozen page in that
1120 * case, or else we'll think we can't update relfrozenxid and
1121 * relminmxid. If it's not an aggressive vacuum, we don't
1122 * know whether it was all-frozen, so we have to recheck; but
1123 * in this case an approximate answer is OK.
1125 if (aggressive || VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1126 vacrel->frozenskipped_pages++;
1127 continue;
1129 all_visible_according_to_vm = true;
1132 vacuum_delay_point();
1135 * Regularly check if wraparound failsafe should trigger.
1137 * There is a similar check inside lazy_vacuum_all_indexes(), but
1138 * relfrozenxid might start to look dangerously old before we reach
1139 * that point. This check also provides failsafe coverage for the
1140 * one-pass strategy case.
1142 if (blkno - next_failsafe_block >= FAILSAFE_EVERY_PAGES)
1144 lazy_check_wraparound_failsafe(vacrel);
1145 next_failsafe_block = blkno;
1149 * Consider if we definitely have enough space to process TIDs on page
1150 * already. If we are close to overrunning the available space for
1151 * dead-tuple TIDs, pause and do a cycle of vacuuming before we tackle
1152 * this page.
1154 if ((dead_tuples->max_tuples - dead_tuples->num_tuples) < MaxHeapTuplesPerPage &&
1155 dead_tuples->num_tuples > 0)
1158 * Before beginning index vacuuming, we release any pin we may
1159 * hold on the visibility map page. This isn't necessary for
1160 * correctness, but we do it anyway to avoid holding the pin
1161 * across a lengthy, unrelated operation.
1163 if (BufferIsValid(vmbuffer))
1165 ReleaseBuffer(vmbuffer);
1166 vmbuffer = InvalidBuffer;
1169 /* Remove the collected garbage tuples from table and indexes */
1170 lazy_vacuum(vacrel, false);
1171 have_vacuumed_indexes = true;
1174 * Vacuum the Free Space Map to make newly-freed space visible on
1175 * upper-level FSM pages. Note we have not yet processed blkno.
1177 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1178 blkno);
1179 next_fsm_block_to_vacuum = blkno;
1181 /* Report that we are once again scanning the heap */
1182 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1183 PROGRESS_VACUUM_PHASE_SCAN_HEAP);
1187 * Set up visibility map page as needed.
1189 * Pin the visibility map page in case we need to mark the page
1190 * all-visible. In most cases this will be very cheap, because we'll
1191 * already have the correct page pinned anyway. However, it's
1192 * possible that (a) next_unskippable_block is covered by a different
1193 * VM page than the current block or (b) we released our pin and did a
1194 * cycle of index vacuuming.
1196 visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
1198 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno,
1199 RBM_NORMAL, vacrel->bstrategy);
1202 * We need buffer cleanup lock so that we can prune HOT chains and
1203 * defragment the page.
1205 if (!ConditionalLockBufferForCleanup(buf))
1207 bool hastup;
1210 * If we're not performing an aggressive scan to guard against XID
1211 * wraparound, and we don't want to forcibly check the page, then
1212 * it's OK to skip vacuuming pages we get a lock conflict on. They
1213 * will be dealt with in some future vacuum.
1215 if (!aggressive && !FORCE_CHECK_PAGE())
1217 ReleaseBuffer(buf);
1218 vacrel->pinskipped_pages++;
1219 continue;
1223 * Read the page with share lock to see if any xids on it need to
1224 * be frozen. If not we just skip the page, after updating our
1225 * scan statistics. If there are some, we wait for cleanup lock.
1227 * We could defer the lock request further by remembering the page
1228 * and coming back to it later, or we could even register
1229 * ourselves for multiple buffers and then service whichever one
1230 * is received first. For now, this seems good enough.
1232 * If we get here with aggressive false, then we're just forcibly
1233 * checking the page, and so we don't want to insist on getting
1234 * the lock; we only need to know if the page contains tuples, so
1235 * that we can update nonempty_pages correctly. It's convenient
1236 * to use lazy_check_needs_freeze() for both situations, though.
1238 LockBuffer(buf, BUFFER_LOCK_SHARE);
1239 if (!lazy_check_needs_freeze(buf, &hastup, vacrel))
1241 UnlockReleaseBuffer(buf);
1242 vacrel->scanned_pages++;
1243 vacrel->pinskipped_pages++;
1244 if (hastup)
1245 vacrel->nonempty_pages = blkno + 1;
1246 continue;
1248 if (!aggressive)
1251 * Here, we must not advance scanned_pages; that would amount
1252 * to claiming that the page contains no freezable tuples.
1254 UnlockReleaseBuffer(buf);
1255 vacrel->pinskipped_pages++;
1256 if (hastup)
1257 vacrel->nonempty_pages = blkno + 1;
1258 continue;
1260 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1261 LockBufferForCleanup(buf);
1262 /* drop through to normal processing */
1266 * By here we definitely have enough dead_tuples space for whatever
1267 * LP_DEAD tids are on this page, we have the visibility map page set
1268 * up in case we need to set this page's all_visible/all_frozen bit,
1269 * and we have a super-exclusive lock. Any tuples on this page are
1270 * now sure to be "counted" by this VACUUM.
1272 * One last piece of preamble needs to take place before we can prune:
1273 * we need to consider new and empty pages.
1275 vacrel->scanned_pages++;
1276 vacrel->tupcount_pages++;
1278 page = BufferGetPage(buf);
1280 if (PageIsNew(page))
1283 * All-zeroes pages can be left over if either a backend extends
1284 * the relation by a single page, but crashes before the newly
1285 * initialized page has been written out, or when bulk-extending
1286 * the relation (which creates a number of empty pages at the tail
1287 * end of the relation, but enters them into the FSM).
1289 * Note we do not enter the page into the visibilitymap. That has
1290 * the downside that we repeatedly visit this page in subsequent
1291 * vacuums, but otherwise we'll never discover the space on a
1292 * promoted standby. The harm of repeated checking ought to
1293 * normally not be too bad - the space usually should be used at
1294 * some point, otherwise there wouldn't be any regular vacuums.
1296 * Make sure these pages are in the FSM, to ensure they can be
1297 * reused. Do that by testing if there's any space recorded for
1298 * the page.  If not, enter it.  We do so after releasing the lock
1299 * on the heap page; the FSM is approximate, after all.
1301 UnlockReleaseBuffer(buf);
1303 if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0)
1305 Size freespace = BLCKSZ - SizeOfPageHeaderData;
1307 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1309 continue;
1312 if (PageIsEmpty(page))
1314 Size freespace = PageGetHeapFreeSpace(page);
1317 * Empty pages are always all-visible and all-frozen (note that
1318 * the same is currently not true for new pages, see above).
1320 if (!PageIsAllVisible(page))
1322 START_CRIT_SECTION();
1324 /* mark buffer dirty before writing a WAL record */
1325 MarkBufferDirty(buf);
1328 * It's possible that another backend has extended the heap,
1329 * initialized the page, and then failed to WAL-log the page
1330 * due to an ERROR. Since heap extension is not WAL-logged,
1331 * recovery might try to replay our record setting the page
1332 * all-visible and find that the page isn't initialized, which
1333 * will cause a PANIC. To prevent that, check whether the
1334 * page has been previously WAL-logged, and if not, do that
1335 * now.
1337 if (RelationNeedsWAL(vacrel->rel) &&
1338 PageGetLSN(page) == InvalidXLogRecPtr)
1339 log_newpage_buffer(buf, true);
1341 PageSetAllVisible(page);
1342 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1343 vmbuffer, InvalidTransactionId,
1344 VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
1345 END_CRIT_SECTION();
1348 UnlockReleaseBuffer(buf);
1349 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1350 continue;
1354 * Prune and freeze tuples.
1356 * Accumulates details of remaining LP_DEAD line pointers on the page in
1357 * the dead tuple list.  This includes LP_DEAD line pointers that we
1358 * pruned ourselves, as well as existing LP_DEAD line pointers that
1359 * were pruned some time earlier. Also considers freezing XIDs in the
1360 * tuple headers of remaining items with storage.
1362 lazy_scan_prune(vacrel, buf, blkno, page, vistest, &prunestate);
1364 Assert(!prunestate.all_visible || !prunestate.has_lpdead_items);
1366 /* Remember the location of the last page with nonremovable tuples */
1367 if (prunestate.hastup)
1368 vacrel->nonempty_pages = blkno + 1;
1370 if (vacrel->nindexes == 0)
1373 * Consider the need to do page-at-a-time heap vacuuming when
1374 * using the one-pass strategy now.
1376 * The one-pass strategy will never call lazy_vacuum(). The steps
1377 * performed here can be thought of as the one-pass equivalent of
1378 * a call to lazy_vacuum().
1380 if (prunestate.has_lpdead_items)
1382 Size freespace;
1384 lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer);
1386 /* Forget the now-vacuumed tuples */
1387 dead_tuples->num_tuples = 0;
1390 * Periodically perform FSM vacuuming to make newly-freed
1391 * space visible on upper FSM pages. Note we have not yet
1392 * performed FSM processing for blkno.
1394 if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
1396 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1397 blkno);
1398 next_fsm_block_to_vacuum = blkno;
1402 * Now perform FSM processing for blkno, and move on to next
1403 * page.
1405 * Our call to lazy_vacuum_heap_page() will have considered if
1406 * it's possible to set all_visible/all_frozen independently
1407 * of lazy_scan_prune(). Note that prunestate was invalidated
1408 * by lazy_vacuum_heap_page() call.
1410 freespace = PageGetHeapFreeSpace(page);
1412 UnlockReleaseBuffer(buf);
1413 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1414 continue;
1418 * There was no call to lazy_vacuum_heap_page() because pruning
1419 * didn't encounter/create any LP_DEAD items that needed to be
1420 * vacuumed. Prune state has not been invalidated, so proceed
1421 * with prunestate-driven visibility map and FSM steps (just like
1422 * the two-pass strategy).
1424 Assert(dead_tuples->num_tuples == 0);
1428 * Handle setting visibility map bit based on what the VM said about
1429 * the page before pruning started, and using prunestate
1431 if (!all_visible_according_to_vm && prunestate.all_visible)
1433 uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
1435 if (prunestate.all_frozen)
1436 flags |= VISIBILITYMAP_ALL_FROZEN;
1439 * It should never be the case that the visibility map page is set
1440 * while the page-level bit is clear, but the reverse is allowed
1441 * (if checksums are not enabled). Regardless, set both bits so
1442 * that we get back in sync.
1444 * NB: If the heap page is all-visible but the VM bit is not set,
1445 * we don't need to dirty the heap page. However, if checksums
1446 * are enabled, we do need to make sure that the heap page is
1447 * dirtied before passing it to visibilitymap_set(), because it
1448 * may be logged. Given that this situation should only happen in
1449 * rare cases after a crash, it is not worth optimizing.
1451 PageSetAllVisible(page);
1452 MarkBufferDirty(buf);
1453 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1454 vmbuffer, prunestate.visibility_cutoff_xid,
1455 flags);
1459 * As of PostgreSQL 9.2, the visibility map bit should never be set if
1460 * the page-level bit is clear. However, it's possible that the bit
1461 * got cleared after we checked it and before we took the buffer
1462 * content lock, so we must recheck before jumping to the conclusion
1463 * that something bad has happened.
1465 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1466 && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1468 elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1469 vacrel->relname, blkno);
1470 visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1471 VISIBILITYMAP_VALID_BITS);
1475 * It's possible for the value returned by
1476 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
1477 * wrong for us to see tuples that appear to not be visible to
1478 * everyone yet, while PD_ALL_VISIBLE is already set. The real safe
1479 * xmin value never moves backwards, but
1480 * GetOldestNonRemovableTransactionId() is conservative and sometimes
1481 * returns a value that's unnecessarily small, so if we see that
1482 * contradiction it just means that the tuples that we think are not
1483 * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
1484 * is correct.
1486 * There should never be dead tuples on a page with PD_ALL_VISIBLE
1487 * set, however.
1489 else if (prunestate.has_lpdead_items && PageIsAllVisible(page))
1491 elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1492 vacrel->relname, blkno);
1493 PageClearAllVisible(page);
1494 MarkBufferDirty(buf);
1495 visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1496 VISIBILITYMAP_VALID_BITS);
1500 * If the all-visible page is all-frozen but not marked as such yet,
1501 * mark it as all-frozen. Note that all_frozen is only valid if
1502 * all_visible is true, so we must check both.
1504 else if (all_visible_according_to_vm && prunestate.all_visible &&
1505 prunestate.all_frozen &&
1506 !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1509 * We can pass InvalidTransactionId as the cutoff XID here,
1510 * because setting the all-frozen bit doesn't cause recovery
1511 * conflicts.
1513 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1514 vmbuffer, InvalidTransactionId,
1515 VISIBILITYMAP_ALL_FROZEN);
1519 * Final steps for block: drop super-exclusive lock, record free space
1520 * in the FSM
1522 if (prunestate.has_lpdead_items && vacrel->do_index_vacuuming)
1525 * Wait until lazy_vacuum_heap_rel() to save free space. This
1526 * doesn't just save us some cycles; it also allows us to record
1527 * any additional free space that lazy_vacuum_heap_page() will
1528 * make available in cases where it's possible to truncate the
1529 * page's line pointer array.
1531 * Note: It's not in fact 100% certain that we really will call
1532 * lazy_vacuum_heap_rel() -- lazy_vacuum() might yet opt to skip
1533 * index vacuuming (and so must skip heap vacuuming). This is
1534 * deemed okay because it only happens in emergencies, or when
1535 * there is very little free space anyway. (Besides, we start
1536 * recording free space in the FSM once index vacuuming has been
1537 * abandoned.)
1539 * Note: The one-pass (no indexes) case is only supposed to make
1540 * it this far when there were no LP_DEAD items during pruning.
1542 Assert(vacrel->nindexes > 0);
1543 UnlockReleaseBuffer(buf);
1545 else
1547 Size freespace = PageGetHeapFreeSpace(page);
1549 UnlockReleaseBuffer(buf);
1550 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1554 /* report that everything is now scanned */
1555 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1557 /* Clear the block number information */
1558 vacrel->blkno = InvalidBlockNumber;
1560 /* now we can compute the new value for pg_class.reltuples */
1561 vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, nblocks,
1562 vacrel->tupcount_pages,
1563 vacrel->live_tuples);
1566 * Also compute the total number of surviving heap entries. In the
1567 * (unlikely) scenario that new_live_tuples is -1, take it as zero.
1569 vacrel->new_rel_tuples =
1570 Max(vacrel->new_live_tuples, 0) + vacrel->new_dead_tuples;
1573 * Release any remaining pin on visibility map page.
1575 if (BufferIsValid(vmbuffer))
1577 ReleaseBuffer(vmbuffer);
1578 vmbuffer = InvalidBuffer;
1581 /* If any tuples need to be deleted, perform final vacuum cycle */
1582 if (dead_tuples->num_tuples > 0)
1583 lazy_vacuum(vacrel, !have_vacuumed_indexes);
1586 * Vacuum the remainder of the Free Space Map. We must do this whether or
1587 * not there were indexes, and whether or not we bypassed index vacuuming.
1589 if (blkno > next_fsm_block_to_vacuum)
1590 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);
1592 /* report all blocks vacuumed */
1593 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1595 /* Do post-vacuum cleanup */
1596 if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1597 lazy_cleanup_all_indexes(vacrel);
1600 * Free resources managed by lazy_space_alloc(). (We must end parallel
1601 * mode/free shared memory before updating index statistics. We cannot
1602 * write while in parallel mode.)
1604 lazy_space_free(vacrel);
1606 /* Update index statistics */
1607 if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1608 update_index_statistics(vacrel);
1611 * If the table has no indexes and at least one heap page was vacuumed,
1612 * make the log report that lazy_vacuum_heap_rel would've made had there
1613 * been indexes (having indexes implies using the two pass strategy).
1615 * We deliberately don't do this in the case where there are indexes but
1616 * index vacuuming was bypassed. We make a similar report at the point
1617 * that index vacuuming is bypassed, but that's actually quite different
1618 * in one important sense: it shows information about work we _haven't_
1619 * done.
1621 * log_autovacuum output does things differently; it consistently presents
1622 * information about LP_DEAD items for the VACUUM as a whole. We always
1623 * report on each round of index and heap vacuuming separately, though.
1625 if (vacrel->nindexes == 0 && vacrel->lpdead_item_pages > 0)
1626 ereport(elevel,
1627 (errmsg("\"%s\": removed %lld dead item identifiers in %u pages",
1628 vacrel->relname, (long long) vacrel->lpdead_items,
1629 vacrel->lpdead_item_pages)));
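/*
 * A hypothetical example of the line this produces under VACUUM VERBOSE
 * for a table with no indexes (relation name and counts invented):
 *
 *   INFO:  "foo": removed 1280 dead item identifiers in 32 pages
 */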
1631 initStringInfo(&buf);
1632 appendStringInfo(&buf,
1633 _("%lld dead row versions cannot be removed yet, oldest xmin: %u\n"),
1634 (long long) vacrel->new_dead_tuples, vacrel->OldestXmin);
1635 appendStringInfo(&buf, ngettext("%u page removed.\n",
1636 "%u pages removed.\n",
1637 vacrel->pages_removed),
1638 vacrel->pages_removed);
1639 appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
1640 "Skipped %u pages due to buffer pins, ",
1641 vacrel->pinskipped_pages),
1642 vacrel->pinskipped_pages);
1643 appendStringInfo(&buf, ngettext("%u frozen page.\n",
1644 "%u frozen pages.\n",
1645 vacrel->frozenskipped_pages),
1646 vacrel->frozenskipped_pages);
1647 appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
1649 ereport(elevel,
1650 (errmsg("\"%s\": found %lld removable, %lld nonremovable row versions in %u out of %u pages",
1651 vacrel->relname,
1652 (long long) vacrel->tuples_deleted,
1653 (long long) vacrel->num_tuples, vacrel->scanned_pages,
1654 nblocks),
1655 errdetail_internal("%s", buf.data)));
1656 pfree(buf.data);
1660 * lazy_scan_prune() -- lazy_scan_heap() pruning and freezing.
1662 * Caller must hold pin and buffer cleanup lock on the buffer.
1664 * Prior to PostgreSQL 14 there were very rare cases where heap_page_prune()
1665 * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about
1666 * whether or not a tuple should be considered DEAD. This happened when an
1667 * inserting transaction concurrently aborted (after our heap_page_prune()
1668 * call, before our HeapTupleSatisfiesVacuum() call). There was rather a lot
1669 * of complexity just so we could deal with tuples that were DEAD to VACUUM,
1670 * but nevertheless were left with storage after pruning.
1672 * The approach we take now is to restart pruning when the race condition is
1673 * detected. This allows heap_page_prune() to prune the tuples inserted by
1674 * the now-aborted transaction. This is a little crude, but it guarantees
1675 * that any items that make it into the dead_tuples array are simple LP_DEAD
1676 * line pointers, and that every remaining item with tuple storage is
1677 * considered as a candidate for freezing.
1679 static void
1680 lazy_scan_prune(LVRelState *vacrel,
1681 Buffer buf,
1682 BlockNumber blkno,
1683 Page page,
1684 GlobalVisState *vistest,
1685 LVPagePruneState *prunestate)
1687 Relation rel = vacrel->rel;
1688 OffsetNumber offnum,
1689 maxoff;
1690 ItemId itemid;
1691 HeapTupleData tuple;
1692 HTSV_Result res;
1693 int tuples_deleted,
1694 lpdead_items,
1695 new_dead_tuples,
1696 num_tuples,
1697 live_tuples;
1698 int nfrozen;
1699 OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
1700 xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage];
1702 maxoff = PageGetMaxOffsetNumber(page);
1704 retry:
1706 /* Initialize (or reset) page-level counters */
1707 tuples_deleted = 0;
1708 lpdead_items = 0;
1709 new_dead_tuples = 0;
1710 num_tuples = 0;
1711 live_tuples = 0;
1714 * Prune all HOT-update chains in this page.
1716 * We count tuples removed by the pruning step as tuples_deleted. Its
1717 * final value can be thought of as the number of tuples that have been
1718 * deleted from the table. It should not be confused with lpdead_items;
1719 * lpdead_items's final value can be thought of as the number of tuples
1720 * that were deleted from indexes.
1722 tuples_deleted = heap_page_prune(rel, buf, vistest,
1723 InvalidTransactionId, 0, false,
1724 &vacrel->offnum);
1727 * Now scan the page to collect LP_DEAD items and check for tuples
1728 * requiring freezing among remaining tuples with storage
1730 prunestate->hastup = false;
1731 prunestate->has_lpdead_items = false;
1732 prunestate->all_visible = true;
1733 prunestate->all_frozen = true;
1734 prunestate->visibility_cutoff_xid = InvalidTransactionId;
1735 nfrozen = 0;
1737 for (offnum = FirstOffsetNumber;
1738 offnum <= maxoff;
1739 offnum = OffsetNumberNext(offnum))
1741 bool tuple_totally_frozen;
1744 * Set the offset number so that we can display it along with any
1745 * error that occurred while processing this tuple.
1747 vacrel->offnum = offnum;
1748 itemid = PageGetItemId(page, offnum);
1750 if (!ItemIdIsUsed(itemid))
1751 continue;
1753 /* Redirect items mustn't be touched */
1754 if (ItemIdIsRedirected(itemid))
1756 prunestate->hastup = true; /* page won't be truncatable */
1757 continue;
1761 * LP_DEAD items are processed outside of the loop.
1763 * Note that we deliberately don't set hastup=true in the case of an
1764 * LP_DEAD item here, which is not how lazy_check_needs_freeze() or
1765 * count_nondeletable_pages() do it -- they only consider pages empty
1766 * when they only have LP_UNUSED items, which is important for
1767 * correctness.
1769 * Our assumption is that any LP_DEAD items we encounter here will
1770 * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually
1771 * call count_nondeletable_pages(). In any case our opinion of
1772 * whether or not a page 'hastup' (which is how our caller sets its
1773 * vacrel->nonempty_pages value) is inherently race-prone. It must be
1774 * treated as advisory/unreliable, so we might as well be slightly
1775 * optimistic.
1777 if (ItemIdIsDead(itemid))
1779 deadoffsets[lpdead_items++] = offnum;
1780 prunestate->all_visible = false;
1781 prunestate->has_lpdead_items = true;
1782 continue;
1785 Assert(ItemIdIsNormal(itemid));
1787 ItemPointerSet(&(tuple.t_self), blkno, offnum);
1788 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1789 tuple.t_len = ItemIdGetLength(itemid);
1790 tuple.t_tableOid = RelationGetRelid(rel);
1793 * DEAD tuples are almost always pruned into LP_DEAD line pointers by
1794 * heap_page_prune(), but it's possible that the tuple state changed
1795 * since heap_page_prune() looked. Handle that here by restarting.
1796 * (See comments at the top of function for a full explanation.)
1798 res = HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf);
1800 if (unlikely(res == HEAPTUPLE_DEAD))
1801 goto retry;
1804 * The criteria for counting a tuple as live in this block need to
1805 * match what analyze.c's acquire_sample_rows() does, otherwise VACUUM
1806 * and ANALYZE may produce wildly different reltuples values, e.g.
1807 * when there are many recently-dead tuples.
1809 * The logic here is a bit simpler than acquire_sample_rows(), as
1810 * VACUUM can't run inside a transaction block, which makes some cases
1811 * impossible (e.g. in-progress insert from the same transaction).
1813 * We treat LP_DEAD items a little differently, too -- we don't count
1814 * them as dead_tuples at all (we only consider new_dead_tuples). The
1815 * outcome is no different because we assume that any LP_DEAD items we
1816 * encounter here will become LP_UNUSED inside lazy_vacuum_heap_page()
1817 * before we report anything to the stats collector. (Cases where we
1818 * bypass index vacuuming will violate our assumption, but the overall
1819 * impact of that should be negligible.)
1821 switch (res)
1823 case HEAPTUPLE_LIVE:
1826 * Count it as live. Not only is this natural, but it's also
1827 * what acquire_sample_rows() does.
1829 live_tuples++;
1832 * Is the tuple definitely visible to all transactions?
1834 * NB: Like with per-tuple hint bits, we can't set the
1835 * PD_ALL_VISIBLE flag if the inserter committed
1836 * asynchronously. See SetHintBits for more info. Check that
1837 * the tuple is hinted xmin-committed because of that.
1839 if (prunestate->all_visible)
1841 TransactionId xmin;
1843 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1845 prunestate->all_visible = false;
1846 break;
1850 * The inserter definitely committed. But is it old enough
1851 * that everyone sees it as committed?
1853 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1854 if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
1856 prunestate->all_visible = false;
1857 break;
1860 /* Track newest xmin on page. */
1861 if (TransactionIdFollows(xmin, prunestate->visibility_cutoff_xid))
1862 prunestate->visibility_cutoff_xid = xmin;
1864 break;
1865 case HEAPTUPLE_RECENTLY_DEAD:
1868 * If tuple is recently deleted then we must not remove it
1869 * from relation. (We only remove items that are LP_DEAD from
1870 * pruning.)
1872 new_dead_tuples++;
1873 prunestate->all_visible = false;
1874 break;
1875 case HEAPTUPLE_INSERT_IN_PROGRESS:
1878 * We do not count these rows as live, because we expect the
1879 * inserting transaction to update the counters at commit, and
1880 * we assume that will happen only after we report our
1881 * results. This assumption is a bit shaky, but it is what
1882 * acquire_sample_rows() does, so be consistent.
1884 prunestate->all_visible = false;
1885 break;
1886 case HEAPTUPLE_DELETE_IN_PROGRESS:
1887 /* This is an expected case during concurrent vacuum */
1888 prunestate->all_visible = false;
1891 * Count such rows as live. As above, we assume the deleting
1892 * transaction will commit and update the counters after we
1893 * report.
1895 live_tuples++;
1896 break;
1897 default:
1898 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1899 break;
1903 * Non-removable tuple (i.e. tuple with storage).
1905 * Check tuple left behind after pruning to see if it needs to be frozen
1906 * now.
1908 num_tuples++;
1909 prunestate->hastup = true;
1910 if (heap_prepare_freeze_tuple(tuple.t_data,
1911 vacrel->relfrozenxid,
1912 vacrel->relminmxid,
1913 vacrel->FreezeLimit,
1914 vacrel->MultiXactCutoff,
1915 &frozen[nfrozen],
1916 &tuple_totally_frozen))
1918 /* Will execute freeze below */
1919 frozen[nfrozen++].offset = offnum;
1923 * If tuple is not frozen (and not about to become frozen) then caller
1924 * had better not go on to set this page's VM bit
1926 if (!tuple_totally_frozen)
1927 prunestate->all_frozen = false;
1931 * We have now divided every item on the page into either an LP_DEAD item
1932 * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple
1933 * that remains and needs to be considered for freezing now (LP_UNUSED and
1934 * LP_REDIRECT items also remain, but are of no further interest to us).
1936 vacrel->offnum = InvalidOffsetNumber;
1939 * Consider the need to freeze any items with tuple storage from the page
1940 * first (arbitrary)
1942 if (nfrozen > 0)
1944 Assert(prunestate->hastup);
1947 * At least one tuple with storage needs to be frozen -- execute that
1948 * now.
1950 * If we need to freeze any tuples we'll mark the buffer dirty, and
1951 * write a WAL record recording the changes. We must log the changes
1952 * to be crash-safe against future truncation of CLOG.
1954 START_CRIT_SECTION();
1956 MarkBufferDirty(buf);
1958 /* execute collected freezes */
1959 for (int i = 0; i < nfrozen; i++)
1961 HeapTupleHeader htup;
1963 itemid = PageGetItemId(page, frozen[i].offset);
1964 htup = (HeapTupleHeader) PageGetItem(page, itemid);
1966 heap_execute_freeze_tuple(htup, &frozen[i]);
1969 /* Now WAL-log freezing if necessary */
1970 if (RelationNeedsWAL(vacrel->rel))
1972 XLogRecPtr recptr;
1974 recptr = log_heap_freeze(vacrel->rel, buf, vacrel->FreezeLimit,
1975 frozen, nfrozen);
1976 PageSetLSN(page, recptr);
1979 END_CRIT_SECTION();
1983 * The second pass over the heap can also set visibility map bits, using
1984 * the same approach. This is important when the table frequently has a
1985 * few old LP_DEAD items on each page by the time we get to it (typically
1986 * because past opportunistic pruning operations freed some non-HOT
1987 * tuples).
1989 * VACUUM will call heap_page_is_all_visible() during the second pass over
1990 * the heap to determine all_visible and all_frozen for the page -- this
1991 * is a specialized version of the logic from this function. Now that
1992 * we've finished pruning and freezing, make sure that we're in total
1993 * agreement with heap_page_is_all_visible() using an assertion.
1995 #ifdef USE_ASSERT_CHECKING
1996 /* Note that all_frozen value does not matter when !all_visible */
1997 if (prunestate->all_visible)
1999 TransactionId cutoff;
2000 bool all_frozen;
2002 if (!heap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen))
2003 Assert(false);
2005 Assert(lpdead_items == 0);
2006 Assert(prunestate->all_frozen == all_frozen);
2009 * It's possible that we froze tuples and made the page's XID cutoff
2010 * (for recovery conflict purposes) FrozenTransactionId. This is okay
2011 * because visibility_cutoff_xid will be logged by our caller in a
2012 * moment.
2014 Assert(cutoff == FrozenTransactionId ||
2015 cutoff == prunestate->visibility_cutoff_xid);
2017 #endif
2020 * Now save details of the LP_DEAD items from the page in the dead_tuples
2021 * array. Also record that page has dead items in per-page prunestate.
2023 if (lpdead_items > 0)
2025 LVDeadTuples *dead_tuples = vacrel->dead_tuples;
2026 ItemPointerData tmp;
2028 Assert(!prunestate->all_visible);
2029 Assert(prunestate->has_lpdead_items);
2031 vacrel->lpdead_item_pages++;
2033 ItemPointerSetBlockNumber(&tmp, blkno);
2035 for (int i = 0; i < lpdead_items; i++)
2037 ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]);
2038 dead_tuples->itemptrs[dead_tuples->num_tuples++] = tmp;
2041 Assert(dead_tuples->num_tuples <= dead_tuples->max_tuples);
2042 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
2043 dead_tuples->num_tuples);
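/*
 * Because heap blocks are visited in ascending order and offsets are
 * appended in ascending order within each block, dead_tuples->itemptrs
 * stays sorted in TID order -- something the lazy_tid_reaped() callback
 * used during index vacuuming takes advantage of when searching the array.
 */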
2046 /* Finally, add page-local counts to whole-VACUUM counts */
2047 vacrel->tuples_deleted += tuples_deleted;
2048 vacrel->lpdead_items += lpdead_items;
2049 vacrel->new_dead_tuples += new_dead_tuples;
2050 vacrel->num_tuples += num_tuples;
2051 vacrel->live_tuples += live_tuples;
2055 * Remove the collected garbage tuples from the table and its indexes.
2057 * We may choose to bypass index vacuuming at this point, though only when the
2058 * ongoing VACUUM operation will definitely only have one index scan/round of
2059 * index vacuuming. Caller indicates whether or not this is such a VACUUM
2060 * operation using 'onecall' argument.
2062 * In rare emergencies, the ongoing VACUUM operation can be made to skip both
2063 * index vacuuming and index cleanup at the point we're called. This avoids
2064 * having the whole system refuse to allocate further XIDs/MultiXactIds due to
2065 * wraparound.
2067 static void
2068 lazy_vacuum(LVRelState *vacrel, bool onecall)
2070 bool do_bypass_optimization;
2072 /* Should not end up here with no indexes */
2073 Assert(vacrel->nindexes > 0);
2074 Assert(!IsParallelWorker());
2075 Assert(vacrel->lpdead_item_pages > 0);
2077 if (!vacrel->do_index_vacuuming)
2079 Assert(!vacrel->do_index_cleanup);
2080 vacrel->dead_tuples->num_tuples = 0;
2081 return;
2085 * Consider bypassing index vacuuming (and heap vacuuming) entirely.
2087 * We currently only do this in cases where the number of LP_DEAD items
2088 * for the entire VACUUM operation is close to zero. This avoids sharp
2089 * discontinuities in the duration and overhead of successive VACUUM
2090 * operations that run against the same table with a fixed workload.
2091 * Ideally, successive VACUUM operations will behave as if there are
2092 * exactly zero LP_DEAD items in cases where there are close to zero.
2094 * This is likely to be helpful with a table that is continually affected
2095 * by UPDATEs that can mostly apply the HOT optimization, but occasionally
2096 * have small aberrations that lead to just a few heap pages retaining
2097 * only one or two LP_DEAD items. This is pretty common; even when the
2098 * DBA goes out of their way to make UPDATEs use HOT, it is practically
2099 * impossible to predict whether HOT will be applied in 100% of cases.
2100 * It's far easier to ensure that 99%+ of all UPDATEs against a table use
2101 * HOT through careful tuning.
2103 do_bypass_optimization = false;
2104 if (onecall && vacrel->rel_pages > 0)
2106 BlockNumber threshold;
2108 Assert(vacrel->num_index_scans == 0);
2109 Assert(vacrel->lpdead_items == vacrel->dead_tuples->num_tuples);
2110 Assert(vacrel->do_index_vacuuming);
2111 Assert(vacrel->do_index_cleanup);
2114 * This crossover point at which we'll start to do index vacuuming is
2115 * expressed as a percentage of the total number of heap pages in the
2116 * table that are known to have at least one LP_DEAD item. This is
2117 * much more important than the total number of LP_DEAD items, since
2118 * it's a proxy for the number of heap pages whose visibility map bits
2119 * cannot be set on account of bypassing index and heap vacuuming.
2121 * We apply one further precautionary test: the space currently used
2122 * to store the TIDs (TIDs that now all point to LP_DEAD items) must
2123 * not exceed 32MB. This limits the risk that we will bypass index
2124 * vacuuming again and again until eventually there is a VACUUM whose
2125 * dead_tuples space is not CPU cache resident.
2127 * We don't take any special steps to remember the LP_DEAD items (such
2128 * as counting them in new_dead_tuples report to the stats collector)
2129 * when the optimization is applied. Though the accounting used in
2130 * analyze.c's acquire_sample_rows() will recognize the same LP_DEAD
2131 * items as dead rows in its own stats collector report, that's okay.
2132 * The discrepancy should be negligible. If this optimization is ever
2133 * expanded to cover more cases then this may need to be reconsidered.
2135 threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES;
2136 do_bypass_optimization =
2137 (vacrel->lpdead_item_pages < threshold &&
2138 vacrel->lpdead_items < MAXDEADTUPLES(32L * 1024L * 1024L));
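/*
 * Worked example, assuming BYPASS_THRESHOLD_PAGES is 0.02 and 6-byte
 * ItemPointerData entries: for a 100,000 page table the bypass is taken
 * only when fewer than 2,000 pages contain LP_DEAD items and the TID
 * array holds fewer than roughly 5.5 million TIDs (32MB worth).
 */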
2141 if (do_bypass_optimization)
2144 * There are almost zero TIDs. Behave as if there were precisely
2145 * zero: bypass index vacuuming, but do index cleanup.
2147 * We expect that the ongoing VACUUM operation will finish very
2148 * quickly, so there is no point in considering speeding up as a
2149 * failsafe against wraparound failure. (Index cleanup is expected to
2150 * finish very quickly in cases where there were no ambulkdelete()
2151 * calls.)
2153 vacrel->do_index_vacuuming = false;
2154 ereport(elevel,
2155 (errmsg("\"%s\": index scan bypassed: %u pages from table (%.2f%% of total) have %lld dead item identifiers",
2156 vacrel->relname, vacrel->lpdead_item_pages,
2157 100.0 * vacrel->lpdead_item_pages / vacrel->rel_pages,
2158 (long long) vacrel->lpdead_items)));
2160 else if (lazy_vacuum_all_indexes(vacrel))
2163 * We successfully completed a round of index vacuuming. Do related
2164 * heap vacuuming now.
2166 lazy_vacuum_heap_rel(vacrel);
2168 else
2171 * Failsafe case.
2173 * We attempted index vacuuming, but didn't finish a full round/full
2174 * index scan. This happens when relfrozenxid or relminmxid is too
2175 * far in the past.
2177 * From this point on the VACUUM operation will do no further index
2178 * vacuuming or heap vacuuming. This VACUUM operation won't end up
2179 * back here again.
2181 Assert(vacrel->do_failsafe);
2185 * Forget the LP_DEAD items that we just vacuumed (or just decided to not
2186 * vacuum)
2188 vacrel->dead_tuples->num_tuples = 0;
2192 * lazy_vacuum_all_indexes() -- Main entry for index vacuuming
2194 * Returns true in the common case when all indexes were successfully
2195 * vacuumed. Returns false in rare cases where we determined that the ongoing
2196 * VACUUM operation is at risk of taking too long to finish, leading to
2197 * wraparound failure.
2199 static bool
2200 lazy_vacuum_all_indexes(LVRelState *vacrel)
2202 bool allindexes = true;
2204 Assert(!IsParallelWorker());
2205 Assert(vacrel->nindexes > 0);
2206 Assert(vacrel->do_index_vacuuming);
2207 Assert(vacrel->do_index_cleanup);
2208 Assert(TransactionIdIsNormal(vacrel->relfrozenxid));
2209 Assert(MultiXactIdIsValid(vacrel->relminmxid));
2211 /* Precheck for XID wraparound emergencies */
2212 if (lazy_check_wraparound_failsafe(vacrel))
2214 /* Wraparound emergency -- don't even start an index scan */
2215 return false;
2218 /* Report that we are now vacuuming indexes */
2219 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2220 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
2222 if (!ParallelVacuumIsActive(vacrel))
2224 for (int idx = 0; idx < vacrel->nindexes; idx++)
2226 Relation indrel = vacrel->indrels[idx];
2227 IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2229 vacrel->indstats[idx] =
2230 lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples,
2231 vacrel);
2233 if (lazy_check_wraparound_failsafe(vacrel))
2235 /* Wraparound emergency -- end current index scan */
2236 allindexes = false;
2237 break;
2241 else
2243 /* Outsource everything to parallel variant */
2244 do_parallel_lazy_vacuum_all_indexes(vacrel);
2247 * Do a postcheck to consider applying wraparound failsafe now. Note
2248 * that parallel VACUUM only gets the precheck and this postcheck.
2250 if (lazy_check_wraparound_failsafe(vacrel))
2251 allindexes = false;
2255 * We delete all LP_DEAD items from the first heap pass in all indexes on
2256 * each call here (except calls where we choose to do the failsafe). This
2257 * makes the next call to lazy_vacuum_heap_rel() safe (except in the event
2258 * of the failsafe triggering, which prevents the next call from taking
2259 * place).
2261 Assert(vacrel->num_index_scans > 0 ||
2262 vacrel->dead_tuples->num_tuples == vacrel->lpdead_items);
2263 Assert(allindexes || vacrel->do_failsafe);
2266 * Increase and report the number of index scans.
2268 * We deliberately include the case where we started a round of bulk
2269 * deletes that we weren't able to finish due to the failsafe triggering.
2271 vacrel->num_index_scans++;
2272 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS,
2273 vacrel->num_index_scans);
2275 return allindexes;
2279 * lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy
2281 * This routine marks LP_DEAD items in vacrel->dead_tuples array as LP_UNUSED.
2282 * Pages where lazy_scan_prune() never recorded any LP_DEAD items are not
2283 * visited at all.
2285 * We may also be able to truncate the line pointer array of the heap pages we
2286 * visit. If there is a contiguous group of LP_UNUSED items at the end of the
2287 * array, it can be reclaimed as free space. These LP_UNUSED items usually
2288 * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from
2289 * each page to LP_UNUSED, and then consider if it's possible to truncate the
2290 * page's line pointer array).
2292 * Note: the reason for doing this as a second pass is we cannot remove the
2293 * tuples until we've removed their index entries, and we want to process
2294 * index entry removal in batches as large as possible.
2296 static void
2297 lazy_vacuum_heap_rel(LVRelState *vacrel)
2299 int tupindex;
2300 BlockNumber vacuumed_pages;
2301 PGRUsage ru0;
2302 Buffer vmbuffer = InvalidBuffer;
2303 LVSavedErrInfo saved_err_info;
2305 Assert(vacrel->do_index_vacuuming);
2306 Assert(vacrel->do_index_cleanup);
2307 Assert(vacrel->num_index_scans > 0);
2309 /* Report that we are now vacuuming the heap */
2310 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2311 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
2313 /* Update error traceback information */
2314 update_vacuum_error_info(vacrel, &saved_err_info,
2315 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
2316 InvalidBlockNumber, InvalidOffsetNumber);
2318 pg_rusage_init(&ru0);
2319 vacuumed_pages = 0;
2321 tupindex = 0;
2322 while (tupindex < vacrel->dead_tuples->num_tuples)
2324 BlockNumber tblk;
2325 Buffer buf;
2326 Page page;
2327 Size freespace;
2329 vacuum_delay_point();
2331 tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]);
2332 vacrel->blkno = tblk;
2333 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
2334 vacrel->bstrategy);
2335 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2336 tupindex = lazy_vacuum_heap_page(vacrel, tblk, buf, tupindex,
2337 &vmbuffer);
2339 /* Now that we've vacuumed the page, record its available space */
2340 page = BufferGetPage(buf);
2341 freespace = PageGetHeapFreeSpace(page);
2343 UnlockReleaseBuffer(buf);
2344 RecordPageWithFreeSpace(vacrel->rel, tblk, freespace);
2345 vacuumed_pages++;
2348 /* Clear the block number information */
2349 vacrel->blkno = InvalidBlockNumber;
2351 if (BufferIsValid(vmbuffer))
2353 ReleaseBuffer(vmbuffer);
2354 vmbuffer = InvalidBuffer;
2358 * We set all LP_DEAD items from the first heap pass to LP_UNUSED during
2359 * the second heap pass. No more, no less.
2361 Assert(vacrel->num_index_scans > 1 ||
2362 (tupindex == vacrel->lpdead_items &&
2363 vacuumed_pages == vacrel->lpdead_item_pages));
2365 ereport(elevel,
2366 (errmsg("\"%s\": removed %d dead item identifiers in %u pages",
2367 vacrel->relname, tupindex, vacuumed_pages),
2368 errdetail_internal("%s", pg_rusage_show(&ru0))));
2370 /* Revert to the previous phase information for error traceback */
2371 restore_vacuum_error_info(vacrel, &saved_err_info);
2375 * lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the
2376 * vacrel->dead_tuples array.
2378 * Caller must have an exclusive buffer lock on the buffer (though a
2379 * super-exclusive lock is also acceptable).
2381 * tupindex is the index in vacrel->dead_tuples of the first dead tuple for
2382 * this page. We assume the rest follow sequentially. The return value is
2383 * the first tupindex after the tuples of this page.
2385 * Prior to PostgreSQL 14 there were rare cases where this routine had to set
2386 * tuples with storage to unused. These days it is strictly responsible for
2387 * marking LP_DEAD stub line pointers as unused. This only happens for those
2388 * LP_DEAD items on the page that were determined to be LP_DEAD items back
2389 * when the same page was visited by lazy_scan_prune() (i.e. those whose TID
2390 * was recorded in the dead_tuples array).
2392 static int
2393 lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
2394 int tupindex, Buffer *vmbuffer)
2396 LVDeadTuples *dead_tuples = vacrel->dead_tuples;
2397 Page page = BufferGetPage(buffer);
2398 OffsetNumber unused[MaxHeapTuplesPerPage];
2399 int uncnt = 0;
2400 TransactionId visibility_cutoff_xid;
2401 bool all_frozen;
2402 LVSavedErrInfo saved_err_info;
2404 Assert(vacrel->nindexes == 0 || vacrel->do_index_vacuuming);
2406 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
2408 /* Update error traceback information */
2409 update_vacuum_error_info(vacrel, &saved_err_info,
2410 VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno,
2411 InvalidOffsetNumber);
2413 START_CRIT_SECTION();
2415 for (; tupindex < dead_tuples->num_tuples; tupindex++)
2417 BlockNumber tblk;
2418 OffsetNumber toff;
2419 ItemId itemid;
2421 tblk = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]);
2422 if (tblk != blkno)
2423 break; /* past end of tuples for this block */
2424 toff = ItemPointerGetOffsetNumber(&dead_tuples->itemptrs[tupindex]);
2425 itemid = PageGetItemId(page, toff);
2427 Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid));
2428 ItemIdSetUnused(itemid);
2429 unused[uncnt++] = toff;
2432 Assert(uncnt > 0);
2434 /* Attempt to truncate line pointer array now */
2435 PageTruncateLinePointerArray(page);
2438 * Mark buffer dirty before we write WAL.
2440 MarkBufferDirty(buffer);
2442 /* XLOG stuff */
2443 if (RelationNeedsWAL(vacrel->rel))
2445 xl_heap_vacuum xlrec;
2446 XLogRecPtr recptr;
2448 xlrec.nunused = uncnt;
2450 XLogBeginInsert();
2451 XLogRegisterData((char *) &xlrec, SizeOfHeapVacuum);
2453 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2454 XLogRegisterBufData(0, (char *) unused, uncnt * sizeof(OffsetNumber));
2456 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VACUUM);
2458 PageSetLSN(page, recptr);
2462 * End critical section, so we safely can do visibility tests (which
2463 * possibly need to perform IO and allocate memory!). If we crash now the
2464 * page (including the corresponding vm bit) might not be marked all
2465 * visible, but that's fine. A later vacuum will fix that.
2467 END_CRIT_SECTION();
2470 * Now that we have removed the LP_DEAD items from the page, once again
2471 * check if the page has become all-visible. The page is already marked
2472 * dirty, exclusively locked, and, if needed, a full page image has been
2473 * emitted.
2475 if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
2476 &all_frozen))
2477 PageSetAllVisible(page);
2480 * All the changes to the heap page have been done. If the all-visible
2481 * flag is now set, also set the VM all-visible bit (and, if possible, the
2482 * all-frozen bit) unless this has already been done previously.
2484 if (PageIsAllVisible(page))
2486 uint8 flags = 0;
2487 uint8 vm_status = visibilitymap_get_status(vacrel->rel,
2488 blkno, vmbuffer);
2490 /* Set the VM all-visible and all-frozen bits in flags, as needed */
2491 if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
2492 flags |= VISIBILITYMAP_ALL_VISIBLE;
2493 if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
2494 flags |= VISIBILITYMAP_ALL_FROZEN;
2496 Assert(BufferIsValid(*vmbuffer));
2497 if (flags != 0)
2498 visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
2499 *vmbuffer, visibility_cutoff_xid, flags);
2502 /* Revert to the previous phase information for error traceback */
2503 restore_vacuum_error_info(vacrel, &saved_err_info);
2504 return tupindex;
2508 * lazy_check_needs_freeze() -- scan page to see if any tuples
2509 * need to be cleaned to avoid wraparound
2511 * Returns true if the page needs to be vacuumed using cleanup lock.
2512 * Also returns a flag indicating whether page contains any tuples at all.
2514 static bool
2515 lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel)
2517 Page page = BufferGetPage(buf);
2518 OffsetNumber offnum,
2519 maxoff;
2520 HeapTupleHeader tupleheader;
2522 *hastup = false;
2525 * New and empty pages, obviously, don't contain tuples. We could make
2526 * sure that the page is registered in the FSM, but it doesn't seem worth
2527 * waiting for a cleanup lock just for that, especially because it's
2528 * likely that the pin holder will do so.
2530 if (PageIsNew(page) || PageIsEmpty(page))
2531 return false;
2533 maxoff = PageGetMaxOffsetNumber(page);
2534 for (offnum = FirstOffsetNumber;
2535 offnum <= maxoff;
2536 offnum = OffsetNumberNext(offnum))
2538 ItemId itemid;
2541 * Set the offset number so that we can display it along with any
2542 * error that occurred while processing this tuple.
2544 vacrel->offnum = offnum;
2545 itemid = PageGetItemId(page, offnum);
2547 /* this should match hastup test in count_nondeletable_pages() */
2548 if (ItemIdIsUsed(itemid))
2549 *hastup = true;
2551 /* dead and redirect items never need freezing */
2552 if (!ItemIdIsNormal(itemid))
2553 continue;
2555 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
2557 if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit,
2558 vacrel->MultiXactCutoff, buf))
2559 break;
2560 } /* scan along page */
2562 /* Clear the offset information once we have processed the given page. */
2563 vacrel->offnum = InvalidOffsetNumber;
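/*
 * If the loop above broke out early, offnum never advanced past maxoff,
 * so returning (offnum <= maxoff) reports that a tuple needing freezing
 * was found; falling off the end of the loop leaves offnum > maxoff.
 */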
2565 return (offnum <= maxoff);
2569 * Trigger the failsafe to avoid wraparound failure when vacrel table has a
2570 * relfrozenxid and/or relminmxid that is dangerously far in the past.
2571 * Triggering the failsafe makes the ongoing VACUUM bypass any further index
2572 * vacuuming and heap vacuuming. Truncating the heap is also bypassed.
2574 * Any remaining work (work that VACUUM cannot just bypass) is typically sped
2575 * up when the failsafe triggers. VACUUM stops applying any cost-based delay
2576 * that it started out with.
2578 * Returns true when failsafe has been triggered.
2580 static bool
2581 lazy_check_wraparound_failsafe(LVRelState *vacrel)
2583 /* Don't warn more than once per VACUUM */
2584 if (vacrel->do_failsafe)
2585 return true;
2587 if (unlikely(vacuum_xid_failsafe_check(vacrel->relfrozenxid,
2588 vacrel->relminmxid)))
2590 Assert(vacrel->do_index_vacuuming);
2591 Assert(vacrel->do_index_cleanup);
2593 vacrel->do_index_vacuuming = false;
2594 vacrel->do_index_cleanup = false;
2595 vacrel->do_failsafe = true;
2597 ereport(WARNING,
2598 (errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans",
2599 get_database_name(MyDatabaseId),
2600 vacrel->relnamespace,
2601 vacrel->relname,
2602 vacrel->num_index_scans),
2603 errdetail("table's relfrozenxid or relminmxid is too far in the past"),
2604 errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n"
2605 "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs.")));
2607 /* Stop applying cost limits from this point on */
2608 VacuumCostActive = false;
2609 VacuumCostBalance = 0;
2611 return true;
2614 return false;
2618 * Perform lazy_vacuum_all_indexes() steps in parallel
2620 static void
2621 do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel)
2623 /* Tell parallel workers to do index vacuuming */
2624 vacrel->lps->lvshared->for_cleanup = false;
2625 vacrel->lps->lvshared->first_time = false;
2628 * We can only provide an approximate value of num_heap_tuples in vacuum
2629 * cases.
2631 vacrel->lps->lvshared->reltuples = vacrel->old_live_tuples;
2632 vacrel->lps->lvshared->estimated_count = true;
2634 do_parallel_vacuum_or_cleanup(vacrel,
2635 vacrel->lps->nindexes_parallel_bulkdel);
2639 * Perform lazy_cleanup_all_indexes() steps in parallel
2641 static void
2642 do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel)
2644 int nworkers;
2647 * If parallel vacuum is active we perform index cleanup with parallel
2648 * workers.
2650 * Tell parallel workers to do index cleanup.
2652 vacrel->lps->lvshared->for_cleanup = true;
2653 vacrel->lps->lvshared->first_time = (vacrel->num_index_scans == 0);
2656 * Now we can provide a better estimate of total number of surviving
2657 * tuples (we assume indexes are more interested in that than in the
2658 * number of nominally live tuples).
2660 vacrel->lps->lvshared->reltuples = vacrel->new_rel_tuples;
2661 vacrel->lps->lvshared->estimated_count =
2662 (vacrel->tupcount_pages < vacrel->rel_pages);
2664 /* Determine the number of parallel workers to launch */
2665 if (vacrel->lps->lvshared->first_time)
2666 nworkers = vacrel->lps->nindexes_parallel_cleanup +
2667 vacrel->lps->nindexes_parallel_condcleanup;
2668 else
2669 nworkers = vacrel->lps->nindexes_parallel_cleanup;
2671 do_parallel_vacuum_or_cleanup(vacrel, nworkers);
2675 * Perform index vacuum or index cleanup with parallel workers. This function
2676 * must be used by the parallel vacuum leader process. The caller must set
2677 * lps->lvshared->for_cleanup to indicate whether to perform vacuum or
2678 * cleanup.
2680 static void
2681 do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers)
2683 LVParallelState *lps = vacrel->lps;
2685 Assert(!IsParallelWorker());
2686 Assert(ParallelVacuumIsActive(vacrel));
2687 Assert(vacrel->nindexes > 0);
2689 /* The leader process will participate */
2690 nworkers--;
2693 * It is possible that parallel context is initialized with fewer workers
2694 * than the number of indexes that need a separate worker in the current
2695 * phase, so we need to consider it. See compute_parallel_vacuum_workers.
2697 nworkers = Min(nworkers, lps->pcxt->nworkers);
2699 /* Setup the shared cost-based vacuum delay and launch workers */
2700 if (nworkers > 0)
2702 if (vacrel->num_index_scans > 0)
2704 /* Reset the parallel index processing counter */
2705 pg_atomic_write_u32(&(lps->lvshared->idx), 0);
2707 /* Reinitialize the parallel context to relaunch parallel workers */
2708 ReinitializeParallelDSM(lps->pcxt);
2712 * Set up shared cost balance and the number of active workers for
2713 * vacuum delay. We need to do this before launching workers as
2714 * otherwise, they might not see the updated values for these
2715 * parameters.
2717 pg_atomic_write_u32(&(lps->lvshared->cost_balance), VacuumCostBalance);
2718 pg_atomic_write_u32(&(lps->lvshared->active_nworkers), 0);
2721 * The number of workers can vary between bulkdelete and cleanup
2722 * phase.
2724 ReinitializeParallelWorkers(lps->pcxt, nworkers);
2726 LaunchParallelWorkers(lps->pcxt);
2728 if (lps->pcxt->nworkers_launched > 0)
2731 * Reset the leader backend's local cost values, since the remaining
2732 * balance from the heap scan was just transferred to the shared balance.
2734 VacuumCostBalance = 0;
2735 VacuumCostBalanceLocal = 0;
2737 /* Enable shared cost balance for leader backend */
2738 VacuumSharedCostBalance = &(lps->lvshared->cost_balance);
2739 VacuumActiveNWorkers = &(lps->lvshared->active_nworkers);
2742 if (lps->lvshared->for_cleanup)
2743 ereport(elevel,
2744 (errmsg(ngettext("launched %d parallel vacuum worker for index cleanup (planned: %d)",
2745 "launched %d parallel vacuum workers for index cleanup (planned: %d)",
2746 lps->pcxt->nworkers_launched),
2747 lps->pcxt->nworkers_launched, nworkers)));
2748 else
2749 ereport(elevel,
2750 (errmsg(ngettext("launched %d parallel vacuum worker for index vacuuming (planned: %d)",
2751 "launched %d parallel vacuum workers for index vacuuming (planned: %d)",
2752 lps->pcxt->nworkers_launched),
2753 lps->pcxt->nworkers_launched, nworkers)));
2756 /* Process the indexes that can be processed by only leader process */
2757 do_serial_processing_for_unsafe_indexes(vacrel, lps->lvshared);
2760 * Join as a parallel worker. The leader process alone processes all the
2761 * indexes in the case where no workers are launched.
2763 do_parallel_processing(vacrel, lps->lvshared);
2766 * Next, accumulate buffer and WAL usage. (This must wait for the workers
2767 * to finish, or we might get incomplete data.)
2769 if (nworkers > 0)
2771 /* Wait for all vacuum workers to finish */
2772 WaitForParallelWorkersToFinish(lps->pcxt);
2774 for (int i = 0; i < lps->pcxt->nworkers_launched; i++)
2775 InstrAccumParallelQuery(&lps->buffer_usage[i], &lps->wal_usage[i]);
2779 * Carry the shared balance value to heap scan and disable shared costing
2781 if (VacuumSharedCostBalance)
2783 VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance);
2784 VacuumSharedCostBalance = NULL;
2785 VacuumActiveNWorkers = NULL;
2790 * Index vacuum/cleanup routine used by the leader process and parallel
2791 * vacuum worker processes to process the indexes in parallel.
2793 static void
2794 do_parallel_processing(LVRelState *vacrel, LVShared *lvshared)
2797 * Increment the active worker count if we are able to launch any worker.
2799 if (VacuumActiveNWorkers)
2800 pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2802 /* Loop until all indexes are vacuumed */
2803 for (;;)
2805 int idx;
2806 LVSharedIndStats *shared_istat;
2807 Relation indrel;
2808 IndexBulkDeleteResult *istat;
2810 /* Get an index number to process */
2811 idx = pg_atomic_fetch_add_u32(&(lvshared->idx), 1);
2813 /* Done for all indexes? */
2814 if (idx >= vacrel->nindexes)
2815 break;
2817 /* Get the index statistics of this index from DSM */
2818 shared_istat = parallel_stats_for_idx(lvshared, idx);
2820 /* Skip indexes not participating in parallelism */
2821 if (shared_istat == NULL)
2822 continue;
2824 indrel = vacrel->indrels[idx];
2827 * Skip processing indexes that are unsafe for workers (these are
2828 * processed in do_serial_processing_for_unsafe_indexes() by leader)
2830 if (!parallel_processing_is_safe(indrel, lvshared))
2831 continue;
2833 /* Do vacuum or cleanup of the index */
2834 istat = (vacrel->indstats[idx]);
2835 vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2836 lvshared,
2837 shared_istat,
2838 vacrel);
2842 * We have completed the index vacuum so decrement the active worker
2843 * count.
2845 if (VacuumActiveNWorkers)
2846 pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
2850 * Vacuum or cleanup indexes that can be processed by only the leader process
2851 * because these indexes don't support parallel operation at that phase.
2853 static void
2854 do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared)
2856 Assert(!IsParallelWorker());
2859 * Increment the active worker count if we are able to launch any worker.
2861 if (VacuumActiveNWorkers)
2862 pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2864 for (int idx = 0; idx < vacrel->nindexes; idx++)
2866 LVSharedIndStats *shared_istat;
2867 Relation indrel;
2868 IndexBulkDeleteResult *istat;
2870 shared_istat = parallel_stats_for_idx(lvshared, idx);
2872 /* Skip indexes that have their stats stored in the DSM segment */
2873 if (shared_istat != NULL)
2874 continue;
2876 indrel = vacrel->indrels[idx];
2879 * We're only here for the unsafe indexes
2881 if (parallel_processing_is_safe(indrel, lvshared))
2882 continue;
2884 /* Do vacuum or cleanup of the index */
2885 istat = (vacrel->indstats[idx]);
2886 vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2887 lvshared,
2888 shared_istat,
2889 vacrel);
2893 * We have completed the index vacuum so decrement the active worker
2894 * count.
2896 if (VacuumActiveNWorkers)
2897 pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
2901 * Vacuum or cleanup index either by the leader process or by one of the
2902 * worker processes. After processing the index, this function copies the index
2903 * statistics returned from ambulkdelete and amvacuumcleanup to the DSM
2904 * segment.
2906 static IndexBulkDeleteResult *
2907 parallel_process_one_index(Relation indrel,
2908 IndexBulkDeleteResult *istat,
2909 LVShared *lvshared,
2910 LVSharedIndStats *shared_istat,
2911 LVRelState *vacrel)
2913 IndexBulkDeleteResult *istat_res;
2916 * Update the pointer to the corresponding bulk-deletion result if someone
2917 * has already updated it
2919 if (shared_istat && shared_istat->updated && istat == NULL)
2920 istat = &shared_istat->istat;
2922 /* Do vacuum or cleanup of the index */
2923 if (lvshared->for_cleanup)
2924 istat_res = lazy_cleanup_one_index(indrel, istat, lvshared->reltuples,
2925 lvshared->estimated_count, vacrel);
2926 else
2927 istat_res = lazy_vacuum_one_index(indrel, istat, lvshared->reltuples,
2928 vacrel);
2931 * Copy the index bulk-deletion result returned from ambulkdelete and
2932 * amvacuumcleanup to the DSM segment if it's the first cycle because they
2933 * allocate locally and it's possible that an index will be vacuumed by a
2934 * different vacuum process the next cycle. Copying the result normally
2935 * happens only the first time an index is vacuumed. For any additional
2936 * vacuum pass, we directly point to the result on the DSM segment and
2937 * pass it to vacuum index APIs so that workers can update it directly.
2939 * Since all vacuum workers write the bulk-deletion result at different
2940 * slots we can write them without locking.
2942 if (shared_istat && !shared_istat->updated && istat_res != NULL)
2944 memcpy(&shared_istat->istat, istat_res, sizeof(IndexBulkDeleteResult));
2945 shared_istat->updated = true;
2947 /* Free the locally-allocated bulk-deletion result */
2948 pfree(istat_res);
2950 /* return the pointer to the result from shared memory */
2951 return &shared_istat->istat;
2954 return istat_res;
2958 * lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
2960 static void
2961 lazy_cleanup_all_indexes(LVRelState *vacrel)
2963 Assert(!IsParallelWorker());
2964 Assert(vacrel->nindexes > 0);
2966 /* Report that we are now cleaning up indexes */
2967 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2968 PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
2970 if (!ParallelVacuumIsActive(vacrel))
2972 double reltuples = vacrel->new_rel_tuples;
2973 bool estimated_count =
2974 vacrel->tupcount_pages < vacrel->rel_pages;
2976 for (int idx = 0; idx < vacrel->nindexes; idx++)
2978 Relation indrel = vacrel->indrels[idx];
2979 IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2981 vacrel->indstats[idx] =
2982 lazy_cleanup_one_index(indrel, istat, reltuples,
2983 estimated_count, vacrel);
2986 else
2988 /* Outsource everything to parallel variant */
2989 do_parallel_lazy_cleanup_all_indexes(vacrel);
2994 * lazy_vacuum_one_index() -- vacuum index relation.
2996 * Delete all the index entries pointing to tuples listed in
2997 * dead_tuples, and update running statistics.
2999 * reltuples is the number of heap tuples to be passed to the
3000 * bulkdelete callback. It's always assumed to be estimated.
3002 * Returns bulk delete stats derived from input stats
3004 static IndexBulkDeleteResult *
3005 lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
3006 double reltuples, LVRelState *vacrel)
3008 IndexVacuumInfo ivinfo;
3009 PGRUsage ru0;
3010 LVSavedErrInfo saved_err_info;
3012 pg_rusage_init(&ru0);
3014 ivinfo.index = indrel;
3015 ivinfo.analyze_only = false;
3016 ivinfo.report_progress = false;
3017 ivinfo.estimated_count = true;
3018 ivinfo.message_level = elevel;
3019 ivinfo.num_heap_tuples = reltuples;
3020 ivinfo.strategy = vacrel->bstrategy;
3023 * Update error traceback information.
3025 * The index name is saved during this phase and restored immediately
3026 * after this phase. See vacuum_error_callback.
3028 Assert(vacrel->indname == NULL);
3029 vacrel->indname = pstrdup(RelationGetRelationName(indrel));
3030 update_vacuum_error_info(vacrel, &saved_err_info,
3031 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
3032 InvalidBlockNumber, InvalidOffsetNumber);
3034 /* Do bulk deletion */
3035 istat = index_bulk_delete(&ivinfo, istat, lazy_tid_reaped,
3036 (void *) vacrel->dead_tuples);
3038 ereport(elevel,
3039 (errmsg("scanned index \"%s\" to remove %d row versions",
3040 vacrel->indname, vacrel->dead_tuples->num_tuples),
3041 errdetail_internal("%s", pg_rusage_show(&ru0))));
3043 /* Revert to the previous phase information for error traceback */
3044 restore_vacuum_error_info(vacrel, &saved_err_info);
3045 pfree(vacrel->indname);
3046 vacrel->indname = NULL;
3048 return istat;
3052 * lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
3054 * reltuples is the number of heap tuples and estimated_count is true
3055 * if reltuples is an estimated value.
3057 * Returns bulk delete stats derived from input stats
3059 static IndexBulkDeleteResult *
3060 lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
3061 double reltuples, bool estimated_count,
3062 LVRelState *vacrel)
3064 IndexVacuumInfo ivinfo;
3065 PGRUsage ru0;
3066 LVSavedErrInfo saved_err_info;
3068 pg_rusage_init(&ru0);
3070 ivinfo.index = indrel;
3071 ivinfo.analyze_only = false;
3072 ivinfo.report_progress = false;
3073 ivinfo.estimated_count = estimated_count;
3074 ivinfo.message_level = elevel;
3076 ivinfo.num_heap_tuples = reltuples;
3077 ivinfo.strategy = vacrel->bstrategy;
3080 * Update error traceback information.
3082 * The index name is saved during this phase and restored immediately
3083 * after this phase. See vacuum_error_callback.
3085 Assert(vacrel->indname == NULL);
3086 vacrel->indname = pstrdup(RelationGetRelationName(indrel));
3087 update_vacuum_error_info(vacrel, &saved_err_info,
3088 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
3089 InvalidBlockNumber, InvalidOffsetNumber);
3091 istat = index_vacuum_cleanup(&ivinfo, istat);
3093 if (istat)
3095 ereport(elevel,
3096 (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
3097 RelationGetRelationName(indrel),
3098 (istat)->num_index_tuples,
3099 (istat)->num_pages),
3100 errdetail("%.0f index row versions were removed.\n"
3101 "%u index pages were newly deleted.\n"
3102 "%u index pages are currently deleted, of which %u are currently reusable.\n"
3103 "%s.",
3104 (istat)->tuples_removed,
3105 (istat)->pages_newly_deleted,
3106 (istat)->pages_deleted, (istat)->pages_free,
3107 pg_rusage_show(&ru0))));
3110 /* Revert to the previous phase information for error traceback */
3111 restore_vacuum_error_info(vacrel, &saved_err_info);
3112 pfree(vacrel->indname);
3113 vacrel->indname = NULL;
3115 return istat;
3119 * should_attempt_truncation - should we attempt to truncate the heap?
3121 * Don't even think about it unless we have a shot at releasing a goodly
3122 * number of pages. Otherwise, the time taken isn't worth it.
3124 * Also don't attempt it if wraparound failsafe is in effect. It's hard to
3125 * predict how long lazy_truncate_heap will take. Don't take any chances.
3126 * There is very little chance of truncation working out when the failsafe is
3127 * in effect in any case. lazy_scan_prune makes the optimistic assumption
3128 * that any LP_DEAD items it encounters will always be LP_UNUSED by the time
3129 * we're called.
3131 * Also don't attempt it if we are doing early pruning/vacuuming, because a
3132 * scan which cannot find a truncated heap page cannot determine that the
3133 * snapshot is too old to read that page.
3135 * This is split out so that we can test whether truncation is going to be
3136 * called for before we actually do it. If you change the logic here, be
3137 * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
3139 static bool
3140 should_attempt_truncation(LVRelState *vacrel, VacuumParams *params)
3142 BlockNumber possibly_freeable;
3144 if (params->truncate == VACOPT_TERNARY_DISABLED)
3145 return false;
3147 if (vacrel->do_failsafe)
3148 return false;
3150 possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
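/*
 * Illustrative numbers, assuming REL_TRUNCATE_MINIMUM is 1000 and
 * REL_TRUNCATE_FRACTION is 16: a 1,600 page table qualifies once 100
 * trailing pages (1/16 of the table) look freeable, while any table
 * qualifies once 1,000 trailing pages look freeable (provided
 * old_snapshot_threshold is disabled).
 */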
3151 if (possibly_freeable > 0 &&
3152 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
3153 possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION) &&
3154 old_snapshot_threshold < 0)
3155 return true;
3156 else
3157 return false;
3161 * lazy_truncate_heap - try to truncate off any empty pages at the end
3163 static void
3164 lazy_truncate_heap(LVRelState *vacrel)
3166 BlockNumber old_rel_pages = vacrel->rel_pages;
3167 BlockNumber new_rel_pages;
3168 int lock_retry;
3170 /* Report that we are now truncating */
3171 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
3172 PROGRESS_VACUUM_PHASE_TRUNCATE);
3175 * Loop until no more truncating can be done.
3179 PGRUsage ru0;
3181 pg_rusage_init(&ru0);
3184 * We need full exclusive lock on the relation in order to do
3185 * truncation. If we can't get it, give up rather than waiting --- we
3186 * don't want to block other backends, and we don't want to deadlock
3187 * (which is quite possible considering we already hold a lower-grade
3188 * lock).
3190 vacrel->lock_waiter_detected = false;
3191 lock_retry = 0;
3192 while (true)
3194 if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock))
3195 break;
3198 * Check for interrupts while trying to (re-)acquire the exclusive
3199 * lock.
3201 CHECK_FOR_INTERRUPTS();
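/*
 * Rough timing sketch, assuming the usual 50ms wait interval and a
 * 5000ms VACUUM_TRUNCATE_LOCK_TIMEOUT: we retry roughly 100 times,
 * i.e. give up after about five seconds of waiting for the lock.
 */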
3203 if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
3204 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
3207 * We failed to establish the lock in the specified number of
3208 * retries. This means we give up truncating.
3210 vacrel->lock_waiter_detected = true;
3211 ereport(elevel,
3212 (errmsg("\"%s\": stopping truncate due to conflicting lock request",
3213 vacrel->relname)));
3214 return;
3217 pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
3221 * Now that we have exclusive lock, look to see if the rel has grown
3222 * whilst we were vacuuming with non-exclusive lock. If so, give up;
3223 * the newly added pages presumably contain non-deletable tuples.
3225 new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
3226 if (new_rel_pages != old_rel_pages)
3229 * Note: we intentionally don't update vacrel->rel_pages with the
3230 * new rel size here. If we did, it would amount to assuming that
3231 * the new pages are empty, which is unlikely. Leaving the numbers
3232 * alone amounts to assuming that the new pages have the same
3233 * tuple density as existing ones, which is less unlikely.
3235 UnlockRelation(vacrel->rel, AccessExclusiveLock);
3236 return;
3240 * Scan backwards from the end to verify that the end pages actually
3241 * contain no tuples. This is *necessary*, not optional, because
3242 * other backends could have added tuples to these pages whilst we
3243 * were vacuuming.
3245 new_rel_pages = count_nondeletable_pages(vacrel);
3246 vacrel->blkno = new_rel_pages;
3248 if (new_rel_pages >= old_rel_pages)
3250 /* can't do anything after all */
3251 UnlockRelation(vacrel->rel, AccessExclusiveLock);
3252 return;
3256 * Okay to truncate.
3258 RelationTruncate(vacrel->rel, new_rel_pages);
3261 * We can release the exclusive lock as soon as we have truncated.
3262 * Other backends can't safely access the relation until they have
3263 * processed the smgr invalidation that smgrtruncate sent out ... but
3264 * that should happen as part of standard invalidation processing once
3265 * they acquire lock on the relation.
3267 UnlockRelation(vacrel->rel, AccessExclusiveLock);
3270 * Update statistics. Here, it *is* correct to adjust rel_pages
3271 * without also touching reltuples, since the tuple count wasn't
3272 * changed by the truncation.
3274 vacrel->pages_removed += old_rel_pages - new_rel_pages;
3275 vacrel->rel_pages = new_rel_pages;
3277 ereport(elevel,
3278 (errmsg("\"%s\": truncated %u to %u pages",
3279 vacrel->relname,
3280 old_rel_pages, new_rel_pages),
3281 errdetail_internal("%s",
3282 pg_rusage_show(&ru0))));
3283 old_rel_pages = new_rel_pages;
3284 } while (new_rel_pages > vacrel->nonempty_pages &&
3285 vacrel->lock_waiter_detected);
3289 * Rescan end pages to verify that they are (still) empty of tuples.
3291 * Returns number of nondeletable pages (last nonempty page + 1).
3293 static BlockNumber
3294 count_nondeletable_pages(LVRelState *vacrel)
3296 BlockNumber blkno;
3297 BlockNumber prefetchedUntil;
3298 instr_time starttime;
3300 /* Initialize the starttime used when checking for conflicting lock requests */
3301 INSTR_TIME_SET_CURRENT(starttime);
3304 * Start checking blocks at what we believe relation end to be and move
3305 * backwards. (Strange coding of loop control is needed because blkno is
3306 * unsigned.) To make the scan faster, we prefetch a few blocks at a time
3307 * in forward direction, so that OS-level readahead can kick in.
3309 blkno = vacrel->rel_pages;
3310 StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
3311 "prefetch size must be power of 2");
3312 prefetchedUntil = InvalidBlockNumber;
3313 while (blkno > vacrel->nonempty_pages)
3315 Buffer buf;
3316 Page page;
3317 OffsetNumber offnum,
3318 maxoff;
3319 bool hastup;
3322 * Check if another process requests a lock on our relation. We are
3323 * holding an AccessExclusiveLock here, so they will be waiting. We
3324 * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
3325 * only check if that interval has elapsed once every 32 blocks to
3326 * keep the number of system calls and actual shared lock table
3327 * lookups to a minimum.
3329 if ((blkno % 32) == 0)
3331 instr_time currenttime;
3332 instr_time elapsed;
3334 INSTR_TIME_SET_CURRENT(currenttime);
3335 elapsed = currenttime;
3336 INSTR_TIME_SUBTRACT(elapsed, starttime);
3337 if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
3338 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
3340 if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
3342 ereport(elevel,
3343 (errmsg("\"%s\": suspending truncate due to conflicting lock request",
3344 vacrel->relname)));
3346 vacrel->lock_waiter_detected = true;
3347 return blkno;
3349 starttime = currenttime;
3354 * We don't insert a vacuum delay point here, because we have an
3355 * exclusive lock on the table which we want to hold for as short a
3356 * time as possible. We still need to check for interrupts however.
3358 CHECK_FOR_INTERRUPTS();
3360 blkno--;
3362 /* If we haven't prefetched this lot yet, do so now. */
3363 if (prefetchedUntil > blkno)
3365 BlockNumber prefetchStart;
3366 BlockNumber pblkno;
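/*
 * Masking with ~(PREFETCH_SIZE - 1) rounds blkno down to the previous
 * PREFETCH_SIZE boundary; this relies on PREFETCH_SIZE being a power of
 * two, as the StaticAssertStmt above verifies.
 */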
3368 prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
3369 for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
3371 PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
3372 CHECK_FOR_INTERRUPTS();
3374 prefetchedUntil = prefetchStart;
3377 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
3378 vacrel->bstrategy);
3380 /* In this phase we only need shared access to the buffer */
3381 LockBuffer(buf, BUFFER_LOCK_SHARE);
3383 page = BufferGetPage(buf);
3385 if (PageIsNew(page) || PageIsEmpty(page))
3387 UnlockReleaseBuffer(buf);
3388 continue;
3391 hastup = false;
3392 maxoff = PageGetMaxOffsetNumber(page);
3393 for (offnum = FirstOffsetNumber;
3394 offnum <= maxoff;
3395 offnum = OffsetNumberNext(offnum))
3397 ItemId itemid;
3399 itemid = PageGetItemId(page, offnum);
3402 * Note: any non-unused item should be taken as a reason to keep
3403 * this page. We formerly thought that DEAD tuples could be
3404 * thrown away, but that's not so, because we'd not have cleaned
3405 * out their index entries.
3407 if (ItemIdIsUsed(itemid))
3409 hastup = true;
3410 break; /* can stop scanning */
3412 } /* scan along page */
3414 UnlockReleaseBuffer(buf);
3416 /* Done scanning if we found a tuple here */
3417 if (hastup)
3418 return blkno + 1;
3422 * If we fall out of the loop, all the previously-thought-to-be-empty
3423 * pages still are; we need not bother to look at the last known-nonempty
3424 * page.
3426 return vacrel->nonempty_pages;
3430 * Return the maximum number of dead tuples we can record.
3432 static long
3433 compute_max_dead_tuples(BlockNumber relblocks, bool hasindex)
3435 long maxtuples;
3436 int vac_work_mem = IsAutoVacuumWorkerProcess() &&
3437 autovacuum_work_mem != -1 ?
3438 autovacuum_work_mem : maintenance_work_mem;
3440 if (hasindex)
3442 maxtuples = MAXDEADTUPLES(vac_work_mem * 1024L);
3443 maxtuples = Min(maxtuples, INT_MAX);
3444 maxtuples = Min(maxtuples, MAXDEADTUPLES(MaxAllocSize));
3446 /* curious coding here to ensure the multiplication can't overflow */
3447 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
3448 maxtuples = relblocks * LAZY_ALLOC_TUPLES;
3450 /* stay sane if small maintenance_work_mem */
3451 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
3453 else
3454 maxtuples = MaxHeapTuplesPerPage;
3456 return maxtuples;
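/*
 * Worked example (illustrative; assumes the usual 8kB block size, 6-byte
 * ItemPointerData entries behind MAXDEADTUPLES, and LAZY_ALLOC_TUPLES
 * defined as MaxHeapTuplesPerPage, i.e. 291): with maintenance_work_mem
 * set to 64MB an indexed table can have roughly 11 million dead TIDs
 * tracked per pass, but a 1000-block table is capped at 1000 * 291 =
 * 291000 entries, and a table without indexes needs only 291.
 */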
3460 * lazy_space_alloc - space allocation decisions for lazy vacuum
3462 * See the comments at the head of this file for rationale.
3464 static void
3465 lazy_space_alloc(LVRelState *vacrel, int nworkers, BlockNumber nblocks)
3467 LVDeadTuples *dead_tuples;
3468 long maxtuples;
3471 * Initialize state for a parallel vacuum. As of now, only one worker can
3472 * be used for an index, so we invoke parallelism only if there are at
3473 * least two indexes on a table.
3475 if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming)
3478 * Since parallel workers cannot access data in temporary tables, we
3479 * can't perform parallel vacuum on them.
3481 if (RelationUsesLocalBuffers(vacrel->rel))
3484 * Give a warning only if the user explicitly tried to perform a
3485 * parallel vacuum on a temporary table.
3487 if (nworkers > 0)
3488 ereport(WARNING,
3489 (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
3490 vacrel->relname)));
3492 else
3493 vacrel->lps = begin_parallel_vacuum(vacrel, nblocks, nworkers);
3495 /* If parallel mode started, we're done */
3496 if (ParallelVacuumIsActive(vacrel))
3497 return;
3500 maxtuples = compute_max_dead_tuples(nblocks, vacrel->nindexes > 0);
3502 dead_tuples = (LVDeadTuples *) palloc(SizeOfDeadTuples(maxtuples));
3503 dead_tuples->num_tuples = 0;
3504 dead_tuples->max_tuples = (int) maxtuples;
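/*
 * The cast to int is safe here: compute_max_dead_tuples() clamps its
 * result to INT_MAX and to what MaxAllocSize allows.
 */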
3506 vacrel->dead_tuples = dead_tuples;
3510 * lazy_space_free - free space allocated in lazy_space_alloc
3512 static void
3513 lazy_space_free(LVRelState *vacrel)
3515 if (!ParallelVacuumIsActive(vacrel))
3516 return;
3519 * End parallel mode before updating index statistics, since we cannot
3520 * write during parallel mode.
3522 end_parallel_vacuum(vacrel);
3526 * lazy_tid_reaped() -- is a particular tid deletable?
3528 * This has the right signature to be an IndexBulkDeleteCallback.
3530 * Assumes dead_tuples array is in sorted order.
3532 static bool
3533 lazy_tid_reaped(ItemPointer itemptr, void *state)
3535 LVDeadTuples *dead_tuples = (LVDeadTuples *) state;
3536 int64 litem,
3537 ritem,
3538 item;
3539 ItemPointer res;
3541 litem = itemptr_encode(&dead_tuples->itemptrs[0]);
3542 ritem = itemptr_encode(&dead_tuples->itemptrs[dead_tuples->num_tuples - 1]);
3543 item = itemptr_encode(itemptr);
3546 * A simple bound check before the bsearch() skips the search when the
3547 * TID falls outside the recorded range, which is common when dead
3548 * tuples are concentrated in a certain range of the heap. Since this
3549 * function is called for every index tuple, it pays to be really fast.
3551 if (item < litem || item > ritem)
3552 return false;
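/*
 * For illustration (itemptr_encode() itself is defined outside this file):
 * a TID is packed with the block number in the high bits and the offset in
 * the low 16 bits, so e.g. (block 10, offset 3) encodes to
 * 10 * 65536 + 3 = 655363. Comparing the encoded int64 keys therefore
 * preserves TID order, which is what makes this cheap range check valid.
 */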
3554 res = (ItemPointer) bsearch((void *) itemptr,
3555 (void *) dead_tuples->itemptrs,
3556 dead_tuples->num_tuples,
3557 sizeof(ItemPointerData),
3558 vac_cmp_itemptr);
3560 return (res != NULL);
3564 * Comparator routines for use with qsort() and bsearch().
3566 static int
3567 vac_cmp_itemptr(const void *left, const void *right)
3569 BlockNumber lblk,
3570 rblk;
3571 OffsetNumber loff,
3572 roff;
3574 lblk = ItemPointerGetBlockNumber((ItemPointer) left);
3575 rblk = ItemPointerGetBlockNumber((ItemPointer) right);
3577 if (lblk < rblk)
3578 return -1;
3579 if (lblk > rblk)
3580 return 1;
3582 loff = ItemPointerGetOffsetNumber((ItemPointer) left);
3583 roff = ItemPointerGetOffsetNumber((ItemPointer) right);
3585 if (loff < roff)
3586 return -1;
3587 if (loff > roff)
3588 return 1;
3590 return 0;
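/*
 * Note that the block-then-offset ordering above matches both the order in
 * which dead tuples are collected during the heap scan (which is why
 * lazy_tid_reaped() may assume a sorted array) and the int64 ordering
 * produced by itemptr_encode().
 */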
3594 * Check if every tuple in the given page is visible to all current and future
3595 * transactions. Also return the visibility_cutoff_xid which is the highest
3596 * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
3597 * on this page is frozen.
3599 static bool
3600 heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
3601 TransactionId *visibility_cutoff_xid,
3602 bool *all_frozen)
3604 Page page = BufferGetPage(buf);
3605 BlockNumber blockno = BufferGetBlockNumber(buf);
3606 OffsetNumber offnum,
3607 maxoff;
3608 bool all_visible = true;
3610 *visibility_cutoff_xid = InvalidTransactionId;
3611 *all_frozen = true;
3614 * This is a stripped down version of the line pointer scan in
3615 * lazy_scan_heap(). So if you change anything here, also check that code.
3617 maxoff = PageGetMaxOffsetNumber(page);
3618 for (offnum = FirstOffsetNumber;
3619 offnum <= maxoff && all_visible;
3620 offnum = OffsetNumberNext(offnum))
3622 ItemId itemid;
3623 HeapTupleData tuple;
3626 * Set the offset number so that we can display it along with any
3627 * error that occurred while processing this tuple.
3629 vacrel->offnum = offnum;
3630 itemid = PageGetItemId(page, offnum);
3632 /* Unused or redirect line pointers are of no interest */
3633 if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
3634 continue;
3636 ItemPointerSet(&(tuple.t_self), blockno, offnum);
3639 * Dead line pointers can still have index entries pointing to them,
3640 * so they can't be treated as visible.
3642 if (ItemIdIsDead(itemid))
3644 all_visible = false;
3645 *all_frozen = false;
3646 break;
3649 Assert(ItemIdIsNormal(itemid));
3651 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
3652 tuple.t_len = ItemIdGetLength(itemid);
3653 tuple.t_tableOid = RelationGetRelid(vacrel->rel);
3655 switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
3657 case HEAPTUPLE_LIVE:
3659 TransactionId xmin;
3661 /* Check comments in lazy_scan_heap. */
3662 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
3664 all_visible = false;
3665 *all_frozen = false;
3666 break;
3670 * The inserter definitely committed. But is it old enough
3671 * that everyone sees it as committed?
3673 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
3674 if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
3676 all_visible = false;
3677 *all_frozen = false;
3678 break;
3681 /* Track newest xmin on page. */
3682 if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
3683 *visibility_cutoff_xid = xmin;
3685 /* Check whether this tuple is already frozen or not */
3686 if (all_visible && *all_frozen &&
3687 heap_tuple_needs_eventual_freeze(tuple.t_data))
3688 *all_frozen = false;
3690 break;
3692 case HEAPTUPLE_DEAD:
3693 case HEAPTUPLE_RECENTLY_DEAD:
3694 case HEAPTUPLE_INSERT_IN_PROGRESS:
3695 case HEAPTUPLE_DELETE_IN_PROGRESS:
3697 all_visible = false;
3698 *all_frozen = false;
3699 break;
3701 default:
3702 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
3703 break;
3705 } /* scan along page */
3707 /* Clear the offset information once we have processed the given page. */
3708 vacrel->offnum = InvalidOffsetNumber;
3710 return all_visible;
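/*
 * A true result, together with *visibility_cutoff_xid and *all_frozen,
 * lets the caller set the page's visibility map bits once its LP_DEAD
 * items have been removed.
 */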
3714 * Compute the number of parallel worker processes to request. Both index
3715 * vacuum and index cleanup can be executed with parallel workers. An index
3716 * is eligible for parallel vacuum iff its size is greater than
3717 * min_parallel_index_scan_size, since launching workers for very small
3718 * indexes can hurt performance.
3720 * nrequested is the number of parallel workers that the user requested. If
3721 * nrequested is 0, we compute the parallel degree based on the number of
3722 * indexes that support parallel vacuum. This function also sets
3723 * can_parallel_vacuum to remember which indexes participate in parallel
3724 * vacuum.
3726 static int
3727 compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested,
3728 bool *can_parallel_vacuum)
3730 int nindexes_parallel = 0;
3731 int nindexes_parallel_bulkdel = 0;
3732 int nindexes_parallel_cleanup = 0;
3733 int parallel_workers;
3736 * We don't allow performing parallel operations in a standalone backend
3737 * or when parallelism is disabled.
3739 if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0)
3740 return 0;
3743 * Compute the number of indexes that can participate in parallel vacuum.
3745 for (int idx = 0; idx < vacrel->nindexes; idx++)
3747 Relation indrel = vacrel->indrels[idx];
3748 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3750 if (vacoptions == VACUUM_OPTION_NO_PARALLEL ||
3751 RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size)
3752 continue;
3754 can_parallel_vacuum[idx] = true;
3756 if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3757 nindexes_parallel_bulkdel++;
3758 if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) ||
3759 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
3760 nindexes_parallel_cleanup++;
3763 nindexes_parallel = Max(nindexes_parallel_bulkdel,
3764 nindexes_parallel_cleanup);
3766 /* The leader process takes one index */
3767 nindexes_parallel--;
3769 /* No index supports parallel vacuum */
3770 if (nindexes_parallel <= 0)
3771 return 0;
3773 /* Compute the parallel degree */
3774 parallel_workers = (nrequested > 0) ?
3775 Min(nrequested, nindexes_parallel) : nindexes_parallel;
3777 /* Cap by max_parallel_maintenance_workers */
3778 parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers);
3780 return parallel_workers;
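/*
 * Illustrative example (assuming the default max_parallel_maintenance_workers
 * of 2): a table with three indexes that all exceed
 * min_parallel_index_scan_size and support parallel bulkdelete gives
 * nindexes_parallel = 3; subtracting the index the leader handles itself
 * leaves 2, so with nrequested = 0 we ask for 2 workers.
 */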
3784 * Update index statistics in pg_class if the statistics are accurate.
3786 static void
3787 update_index_statistics(LVRelState *vacrel)
3789 Relation *indrels = vacrel->indrels;
3790 int nindexes = vacrel->nindexes;
3791 IndexBulkDeleteResult **indstats = vacrel->indstats;
3793 Assert(!IsInParallelMode());
3795 for (int idx = 0; idx < nindexes; idx++)
3797 Relation indrel = indrels[idx];
3798 IndexBulkDeleteResult *istat = indstats[idx];
3800 if (istat == NULL || istat->estimated_count)
3801 continue;
3803 /* Update index statistics */
3804 vac_update_relstats(indrel,
3805 istat->num_pages,
3806 istat->num_index_tuples,
3808 false,
3809 InvalidTransactionId,
3810 InvalidMultiXactId,
3811 false);
3816 * Prepare and return parallel vacuum state if we can launch at least one
3817 * worker. This function is responsible for entering parallel mode, creating
3818 * a parallel context, and then initializing the DSM segment.
3820 static LVParallelState *
3821 begin_parallel_vacuum(LVRelState *vacrel, BlockNumber nblocks,
3822 int nrequested)
3824 LVParallelState *lps = NULL;
3825 Relation *indrels = vacrel->indrels;
3826 int nindexes = vacrel->nindexes;
3827 ParallelContext *pcxt;
3828 LVShared *shared;
3829 LVDeadTuples *dead_tuples;
3830 BufferUsage *buffer_usage;
3831 WalUsage *wal_usage;
3832 bool *can_parallel_vacuum;
3833 long maxtuples;
3834 Size est_shared;
3835 Size est_deadtuples;
3836 int nindexes_mwm = 0;
3837 int parallel_workers = 0;
3838 int querylen;
3841 * A parallel vacuum must have been requested, and the relation must have
3842 * indexes.
3844 Assert(nrequested >= 0);
3845 Assert(nindexes > 0);
3848 * Compute the number of parallel vacuum workers to launch
3850 can_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes);
3851 parallel_workers = compute_parallel_vacuum_workers(vacrel,
3852 nrequested,
3853 can_parallel_vacuum);
3855 /* Can't perform vacuum in parallel */
3856 if (parallel_workers <= 0)
3858 pfree(can_parallel_vacuum);
3859 return lps;
3862 lps = (LVParallelState *) palloc0(sizeof(LVParallelState));
3864 EnterParallelMode();
3865 pcxt = CreateParallelContext("postgres", "parallel_vacuum_main",
3866 parallel_workers);
3867 Assert(pcxt->nworkers > 0);
3868 lps->pcxt = pcxt;
3870 /* Estimate size for shared information -- PARALLEL_VACUUM_KEY_SHARED */
3871 est_shared = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3872 for (int idx = 0; idx < nindexes; idx++)
3874 Relation indrel = indrels[idx];
3875 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3878 * The cleanup option should be either disabled, always performed in
3879 * parallel, or conditionally performed in parallel.
3881 Assert(((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) ||
3882 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0));
3883 Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE);
3885 /* Skip indexes that don't participate in parallel vacuum */
3886 if (!can_parallel_vacuum[idx])
3887 continue;
3889 if (indrel->rd_indam->amusemaintenanceworkmem)
3890 nindexes_mwm++;
3892 est_shared = add_size(est_shared, sizeof(LVSharedIndStats));
3895 * Remember the number of indexes that support parallel operation for
3896 * each phase.
3898 if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3899 lps->nindexes_parallel_bulkdel++;
3900 if ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0)
3901 lps->nindexes_parallel_cleanup++;
3902 if ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)
3903 lps->nindexes_parallel_condcleanup++;
3905 shm_toc_estimate_chunk(&pcxt->estimator, est_shared);
3906 shm_toc_estimate_keys(&pcxt->estimator, 1);
3908 /* Estimate size for dead tuples -- PARALLEL_VACUUM_KEY_DEAD_TUPLES */
3909 maxtuples = compute_max_dead_tuples(nblocks, true);
3910 est_deadtuples = MAXALIGN(SizeOfDeadTuples(maxtuples));
3911 shm_toc_estimate_chunk(&pcxt->estimator, est_deadtuples);
3912 shm_toc_estimate_keys(&pcxt->estimator, 1);
3915 * Estimate space for BufferUsage and WalUsage --
3916 * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
3918 * If there are no extensions loaded that care, we could skip this. We
3919 * have no way of knowing whether anyone's looking at pgBufferUsage or
3920 * pgWalUsage, so do it unconditionally.
3922 shm_toc_estimate_chunk(&pcxt->estimator,
3923 mul_size(sizeof(BufferUsage), pcxt->nworkers));
3924 shm_toc_estimate_keys(&pcxt->estimator, 1);
3925 shm_toc_estimate_chunk(&pcxt->estimator,
3926 mul_size(sizeof(WalUsage), pcxt->nworkers));
3927 shm_toc_estimate_keys(&pcxt->estimator, 1);
3929 /* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
3930 if (debug_query_string)
3932 querylen = strlen(debug_query_string);
3933 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
3934 shm_toc_estimate_keys(&pcxt->estimator, 1);
3936 else
3937 querylen = 0; /* keep compiler quiet */
3939 InitializeParallelDSM(pcxt);
3941 /* Prepare shared information */
3942 shared = (LVShared *) shm_toc_allocate(pcxt->toc, est_shared);
3943 MemSet(shared, 0, est_shared);
3944 shared->relid = RelationGetRelid(vacrel->rel);
3945 shared->elevel = elevel;
3946 shared->maintenance_work_mem_worker =
3947 (nindexes_mwm > 0) ?
3948 maintenance_work_mem / Min(parallel_workers, nindexes_mwm) :
3949 maintenance_work_mem;
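/*
 * For example (illustrative only): with maintenance_work_mem set to 256MB,
 * two workers, and three participating indexes whose AMs use
 * maintenance_work_mem (nindexes_mwm = 3), each worker gets
 * 256MB / Min(2, 3) = 128MB; if no such index participates, every worker
 * keeps the full 256MB.
 */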
3951 pg_atomic_init_u32(&(shared->cost_balance), 0);
3952 pg_atomic_init_u32(&(shared->active_nworkers), 0);
3953 pg_atomic_init_u32(&(shared->idx), 0);
3954 shared->offset = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3957 * Initialize variables for shared index statistics, set NULL bitmap and
3958 * the size of stats for each index.
3960 memset(shared->bitmap, 0x00, BITMAPLEN(nindexes));
3961 for (int idx = 0; idx < nindexes; idx++)
3963 if (!can_parallel_vacuum[idx])
3964 continue;
3966 /* Mark the slot as not-NULL, since this index supports parallelism */
3967 shared->bitmap[idx >> 3] |= 1 << (idx & 0x07);
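/*
 * idx >> 3 selects the byte and idx & 0x07 the bit within it; e.g.
 * index number 10 sets bit 2 of bitmap byte 1.
 */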
3970 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
3971 lps->lvshared = shared;
3973 /* Prepare the dead tuple space */
3974 dead_tuples = (LVDeadTuples *) shm_toc_allocate(pcxt->toc, est_deadtuples);
3975 dead_tuples->max_tuples = maxtuples;
3976 dead_tuples->num_tuples = 0;
3977 MemSet(dead_tuples->itemptrs, 0, sizeof(ItemPointerData) * maxtuples);
3978 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_TUPLES, dead_tuples);
3979 vacrel->dead_tuples = dead_tuples;
3982 * Allocate space for each worker's BufferUsage and WalUsage; no need to
3983 * initialize
3985 buffer_usage = shm_toc_allocate(pcxt->toc,
3986 mul_size(sizeof(BufferUsage), pcxt->nworkers));
3987 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
3988 lps->buffer_usage = buffer_usage;
3989 wal_usage = shm_toc_allocate(pcxt->toc,
3990 mul_size(sizeof(WalUsage), pcxt->nworkers));
3991 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
3992 lps->wal_usage = wal_usage;
3994 /* Store query string for workers */
3995 if (debug_query_string)
3997 char *sharedquery;
3999 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
4000 memcpy(sharedquery, debug_query_string, querylen + 1);
4001 sharedquery[querylen] = '\0';
4002 shm_toc_insert(pcxt->toc,
4003 PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery);
4006 pfree(can_parallel_vacuum);
4007 return lps;
4011 * Destroy the parallel context, and end parallel mode.
4013 * Since writes are not allowed during parallel mode, copy the updated
4014 * index statistics from the DSM segment into local memory, and use that
4015 * copy later to update the index statistics. One might think that we could
4016 * exit from parallel mode, update the index statistics, and then destroy
4017 * the parallel context, but that wouldn't be safe (see ExitParallelMode).
4019 static void
4020 end_parallel_vacuum(LVRelState *vacrel)
4022 IndexBulkDeleteResult **indstats = vacrel->indstats;
4023 LVParallelState *lps = vacrel->lps;
4024 int nindexes = vacrel->nindexes;
4026 Assert(!IsParallelWorker());
4028 /* Copy the updated statistics */
4029 for (int idx = 0; idx < nindexes; idx++)
4031 LVSharedIndStats *shared_istat;
4033 shared_istat = parallel_stats_for_idx(lps->lvshared, idx);
4036 * Skip unused slot. The statistics of this index are already stored
4037 * in local memory.
4039 if (shared_istat == NULL)
4040 continue;
4042 if (shared_istat->updated)
4044 indstats[idx] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
4045 memcpy(indstats[idx], &(shared_istat->istat), sizeof(IndexBulkDeleteResult));
4047 else
4048 indstats[idx] = NULL;
4051 DestroyParallelContext(lps->pcxt);
4052 ExitParallelMode();
4054 /* Deactivate parallel vacuum */
4055 pfree(lps);
4056 vacrel->lps = NULL;
4060 * Return shared memory statistics for index at offset 'getidx', if any
4062 static LVSharedIndStats *
4063 parallel_stats_for_idx(LVShared *lvshared, int getidx)
4065 char *p;
4067 if (IndStatsIsNull(lvshared, getidx))
4068 return NULL;
4070 p = (char *) GetSharedIndStats(lvshared);
4071 for (int idx = 0; idx < getidx; idx++)
4073 if (IndStatsIsNull(lvshared, idx))
4074 continue;
4076 p += sizeof(LVSharedIndStats);
4079 return (LVSharedIndStats *) p;
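/*
 * In other words, the shared stats area is a packed array holding an
 * LVSharedIndStats only for indexes whose bitmap bit is set. As an
 * illustration, if indexes 0 and 2 participate but index 1 does not, the
 * stats for index 2 sit exactly one LVSharedIndStats past those of index 0.
 */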
4083 * Returns false if the given index can't participate in parallel index
4084 * vacuum or parallel index cleanup.
4086 static bool
4087 parallel_processing_is_safe(Relation indrel, LVShared *lvshared)
4089 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
4091 /* first_time must be true only if for_cleanup is true */
4092 Assert(lvshared->for_cleanup || !lvshared->first_time);
4094 if (lvshared->for_cleanup)
4096 /* Skip if the index does not support parallel cleanup */
4097 if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) &&
4098 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0))
4099 return false;
4102 * Skip if the index supports parallel cleanup only conditionally and
4103 * we have already processed it (for bulkdelete). See the comments
4104 * for option VACUUM_OPTION_PARALLEL_COND_CLEANUP to know when
4105 * indexes support parallel cleanup conditionally.
4107 if (!lvshared->first_time &&
4108 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
4109 return false;
4111 else if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) == 0)
4113 /* Skip if the index does not support parallel bulk deletion */
4114 return false;
4117 return true;
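/*
 * To summarize the checks above: for bulkdelete the index must advertise
 * VACUUM_OPTION_PARALLEL_BULKDEL; for cleanup it must advertise
 * VACUUM_OPTION_PARALLEL_CLEANUP or VACUUM_OPTION_PARALLEL_COND_CLEANUP,
 * and a conditional-cleanup index is handled in parallel only on the first
 * cleanup pass, i.e. when no bulkdelete has been performed for it.
 */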
4121 * Perform work within a launched parallel process.
4123 * Since parallel vacuum workers perform only index vacuum or index cleanup,
4124 * we don't need to report progress information.
4126 void
4127 parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
4129 Relation rel;
4130 Relation *indrels;
4131 LVShared *lvshared;
4132 LVDeadTuples *dead_tuples;
4133 BufferUsage *buffer_usage;
4134 WalUsage *wal_usage;
4135 int nindexes;
4136 char *sharedquery;
4137 LVRelState vacrel;
4138 ErrorContextCallback errcallback;
4140 lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
4141 false);
4142 elevel = lvshared->elevel;
4144 if (lvshared->for_cleanup)
4145 elog(DEBUG1, "starting parallel vacuum worker for cleanup");
4146 else
4147 elog(DEBUG1, "starting parallel vacuum worker for bulk delete");
4149 /* Set debug_query_string for individual workers */
4150 sharedquery = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_QUERY_TEXT, true);
4151 debug_query_string = sharedquery;
4152 pgstat_report_activity(STATE_RUNNING, debug_query_string);
4155 * Open the table. The lock mode is the same as that of the leader
4156 * process. This is okay because the lock does not conflict among the
4157 * parallel workers.
4159 rel = table_open(lvshared->relid, ShareUpdateExclusiveLock);
4162 * Open all indexes. indrels is sorted by OID, which should match the
4163 * ordering used by the leader process.
4165 vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels);
4166 Assert(nindexes > 0);
4168 /* Set dead tuple space */
4169 dead_tuples = (LVDeadTuples *) shm_toc_lookup(toc,
4170 PARALLEL_VACUUM_KEY_DEAD_TUPLES,
4171 false);
4173 /* Set cost-based vacuum delay */
4174 VacuumCostActive = (VacuumCostDelay > 0);
4175 VacuumCostBalance = 0;
4176 VacuumPageHit = 0;
4177 VacuumPageMiss = 0;
4178 VacuumPageDirty = 0;
4179 VacuumCostBalanceLocal = 0;
4180 VacuumSharedCostBalance = &(lvshared->cost_balance);
4181 VacuumActiveNWorkers = &(lvshared->active_nworkers);
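/*
 * With VacuumSharedCostBalance and VacuumActiveNWorkers set, subsequent
 * vacuum_delay_point() calls in this worker throttle against the shared
 * cost balance, so the cost-based delay budget is coordinated across the
 * leader and all workers.
 */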
4183 vacrel.rel = rel;
4184 vacrel.indrels = indrels;
4185 vacrel.nindexes = nindexes;
4186 /* Each parallel VACUUM worker gets its own access strategy */
4187 vacrel.bstrategy = GetAccessStrategy(BAS_VACUUM);
4188 vacrel.indstats = (IndexBulkDeleteResult **)
4189 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
4191 if (lvshared->maintenance_work_mem_worker > 0)
4192 maintenance_work_mem = lvshared->maintenance_work_mem_worker;
4195 * Initialize vacrel for use as error callback arg by parallel worker.
4197 vacrel.relnamespace = get_namespace_name(RelationGetNamespace(rel));
4198 vacrel.relname = pstrdup(RelationGetRelationName(rel));
4199 vacrel.indname = NULL;
4200 vacrel.phase = VACUUM_ERRCB_PHASE_UNKNOWN; /* Not yet processing */
4201 vacrel.dead_tuples = dead_tuples;
4203 /* Setup error traceback support for ereport() */
4204 errcallback.callback = vacuum_error_callback;
4205 errcallback.arg = &vacrel;
4206 errcallback.previous = error_context_stack;
4207 error_context_stack = &errcallback;
4209 /* Prepare to track buffer usage during parallel execution */
4210 InstrStartParallelQuery();
4212 /* Process indexes to perform vacuum/cleanup */
4213 do_parallel_processing(&vacrel, lvshared);
4215 /* Report buffer/WAL usage during parallel execution */
4216 buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
4217 wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
4218 InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
4219 &wal_usage[ParallelWorkerNumber]);
4221 /* Pop the error context stack */
4222 error_context_stack = errcallback.previous;
4224 vac_close_indexes(nindexes, indrels, RowExclusiveLock);
4225 table_close(rel, ShareUpdateExclusiveLock);
4226 FreeAccessStrategy(vacrel.bstrategy);
4227 pfree(vacrel.indstats);
4231 * Error context callback for errors occurring during vacuum.
4233 static void
4234 vacuum_error_callback(void *arg)
4236 LVRelState *errinfo = arg;
4238 switch (errinfo->phase)
4240 case VACUUM_ERRCB_PHASE_SCAN_HEAP:
4241 if (BlockNumberIsValid(errinfo->blkno))
4243 if (OffsetNumberIsValid(errinfo->offnum))
4244 errcontext("while scanning block %u and offset %u of relation \"%s.%s\"",
4245 errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
4246 else
4247 errcontext("while scanning block %u of relation \"%s.%s\"",
4248 errinfo->blkno, errinfo->relnamespace, errinfo->relname);
4250 else
4251 errcontext("while scanning relation \"%s.%s\"",
4252 errinfo->relnamespace, errinfo->relname);
4253 break;
4255 case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
4256 if (BlockNumberIsValid(errinfo->blkno))
4258 if (OffsetNumberIsValid(errinfo->offnum))
4259 errcontext("while vacuuming block %u and offset %u of relation \"%s.%s\"",
4260 errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
4261 else
4262 errcontext("while vacuuming block %u of relation \"%s.%s\"",
4263 errinfo->blkno, errinfo->relnamespace, errinfo->relname);
4265 else
4266 errcontext("while vacuuming relation \"%s.%s\"",
4267 errinfo->relnamespace, errinfo->relname);
4268 break;
4270 case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
4271 errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
4272 errinfo->indname, errinfo->relnamespace, errinfo->relname);
4273 break;
4275 case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
4276 errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
4277 errinfo->indname, errinfo->relnamespace, errinfo->relname);
4278 break;
4280 case VACUUM_ERRCB_PHASE_TRUNCATE:
4281 if (BlockNumberIsValid(errinfo->blkno))
4282 errcontext("while truncating relation \"%s.%s\" to %u blocks",
4283 errinfo->relnamespace, errinfo->relname, errinfo->blkno);
4284 break;
4286 case VACUUM_ERRCB_PHASE_UNKNOWN:
4287 default:
4288 return; /* do nothing; the errinfo may not be
4289 * initialized */
4294 * Updates the information required for the vacuum error callback. This also
4295 * saves the current information, which can later be restored via restore_vacuum_error_info().
4297 static void
4298 update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel,
4299 int phase, BlockNumber blkno, OffsetNumber offnum)
4301 if (saved_vacrel)
4303 saved_vacrel->offnum = vacrel->offnum;
4304 saved_vacrel->blkno = vacrel->blkno;
4305 saved_vacrel->phase = vacrel->phase;
4308 vacrel->blkno = blkno;
4309 vacrel->offnum = offnum;
4310 vacrel->phase = phase;
4314 * Restores the vacuum information saved via a prior call to update_vacuum_error_info.
4316 static void
4317 restore_vacuum_error_info(LVRelState *vacrel,
4318 const LVSavedErrInfo *saved_vacrel)
4320 vacrel->blkno = saved_vacrel->blkno;
4321 vacrel->offnum = saved_vacrel->offnum;
4322 vacrel->phase = saved_vacrel->phase;
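/*
 * Typical usage elsewhere in this file (saved_err_info being whatever local
 * LVSavedErrInfo the caller declares): save the current state with
 * update_vacuum_error_info(vacrel, &saved_err_info, new_phase, blkno, offnum),
 * perform the work for the new phase, then call
 * restore_vacuum_error_info(vacrel, &saved_err_info) so the error context
 * reverts to what it was before.
 */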