Simplify state managed by VACUUM.
1 /*-------------------------------------------------------------------------
3 * vacuumlazy.c
4 * Concurrent ("lazy") vacuuming.
7 * The major space usage for LAZY VACUUM is storage for the array of dead tuple
8 * TIDs. We want to ensure we can vacuum even the very largest relations with
9 * finite memory space usage. To do that, we set upper bounds on the number of
10 * tuples we will keep track of at once.
12 * We are willing to use at most maintenance_work_mem (or perhaps
13 * autovacuum_work_mem) memory space to keep track of dead tuples. We
14 * initially allocate an array of TIDs of that size, with an upper limit that
15 * depends on table size (this limit ensures we don't allocate a huge area
16 * uselessly for vacuuming small tables). If the array threatens to overflow,
17 * we suspend the heap scan phase and perform a pass of index cleanup and page
18 * compaction, then resume the heap scan with an empty TID array.
20 * If we're processing a table with no indexes, we can just vacuum each page
21 * as we go; there's no need to save up multiple tuples to minimize the number
22 * of index scans performed. So we don't use maintenance_work_mem memory for
23 * the TID array, just enough to hold as many heap tuples as fit on one page.
25 * Lazy vacuum supports parallel execution with parallel worker processes. In
26 * a parallel vacuum, we perform both index vacuum and index cleanup with
27 * parallel worker processes. Individual indexes are processed by one vacuum
28 * process. At the beginning of a lazy vacuum (at lazy_scan_heap) we prepare
29 * the parallel context and initialize the DSM segment that contains shared
30 * information as well as the memory space for storing dead tuples. When
31 * starting either index vacuum or index cleanup, we launch parallel worker
32 * processes. Once all indexes are processed the parallel worker processes
33 * exit. After that, the leader process re-initializes the parallel context
34 * so that it can use the same DSM for multiple passes of index vacuum and
35 * for performing index cleanup. Updating the index statistics requires
36 * updating the system catalog, and since such updates are not allowed
37 * during parallel mode, we update the index statistics only after
38 * exiting parallel mode.
40 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
41 * Portions Copyright (c) 1994, Regents of the University of California
44 * IDENTIFICATION
45 * src/backend/access/heap/vacuumlazy.c
47 *-------------------------------------------------------------------------
49 #include "postgres.h"
51 #include <math.h>
53 #include "access/amapi.h"
54 #include "access/genam.h"
55 #include "access/heapam.h"
56 #include "access/heapam_xlog.h"
57 #include "access/htup_details.h"
58 #include "access/multixact.h"
59 #include "access/parallel.h"
60 #include "access/transam.h"
61 #include "access/visibilitymap.h"
62 #include "access/xact.h"
63 #include "access/xlog.h"
64 #include "catalog/index.h"
65 #include "catalog/storage.h"
66 #include "commands/dbcommands.h"
67 #include "commands/progress.h"
68 #include "commands/vacuum.h"
69 #include "executor/instrument.h"
70 #include "miscadmin.h"
71 #include "optimizer/paths.h"
72 #include "pgstat.h"
73 #include "portability/instr_time.h"
74 #include "postmaster/autovacuum.h"
75 #include "storage/bufmgr.h"
76 #include "storage/freespace.h"
77 #include "storage/lmgr.h"
78 #include "tcop/tcopprot.h"
79 #include "utils/lsyscache.h"
80 #include "utils/memutils.h"
81 #include "utils/pg_rusage.h"
82 #include "utils/timestamp.h"
86 * Space/time tradeoff parameters: do these need to be user-tunable?
88 * To consider truncating the relation, we want there to be at least
89 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
90 * is less) potentially-freeable pages.
92 #define REL_TRUNCATE_MINIMUM 1000
93 #define REL_TRUNCATE_FRACTION 16
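/*
 * Illustrative sketch, not part of the original file: the threshold described
 * above amounts to the following test, where possibly_freeable is the number
 * of pages past the last page known to contain tuples.  The real decision is
 * made in should_attempt_truncation(), which also considers the VACUUM
 * command's TRUNCATE option.
 */
static inline bool
truncate_threshold_met_sketch(BlockNumber rel_pages, BlockNumber nonempty_pages)
{
	BlockNumber possibly_freeable = rel_pages - nonempty_pages;

	return (possibly_freeable > 0 &&
			(possibly_freeable >= REL_TRUNCATE_MINIMUM ||
			 possibly_freeable >= rel_pages / REL_TRUNCATE_FRACTION));
}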
96 * Timing parameters for truncate locking heuristics.
98 * These were not exposed as user tunable GUC values because it didn't seem
99 * that the potential for improvement was great enough to merit the cost of
100 * supporting them.
102 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
103 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
104 #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
107 * When a table has no indexes, vacuum the FSM after every 8GB, approximately
108 * (it won't be exact because we only vacuum FSM after processing a heap page
109 * that has some removable tuples). When there are indexes, this is ignored,
110 * and we vacuum FSM after each index/heap cleaning pass.
112 #define VACUUM_FSM_EVERY_PAGES \
113 ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
116 * Guesstimation of number of dead tuples per page. This is used to
117 * provide an upper limit to memory allocated when vacuuming small
118 * tables.
120 #define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage
123 * Before we consider skipping a page that's marked as clean in
124 * visibility map, we must've seen at least this many clean pages.
126 #define SKIP_PAGES_THRESHOLD ((BlockNumber) 32)
129 * Size of the prefetch window for lazy vacuum backwards truncation scan.
130 * Needs to be a power of 2.
132 #define PREFETCH_SIZE ((BlockNumber) 32)
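/*
 * Illustrative sketch, not part of the original file: PREFETCH_SIZE must be a
 * power of 2 so that the start of a prefetch window can be found by masking
 * off the low-order bits of a block number, with no division needed:
 */
static inline BlockNumber
prefetch_window_start_sketch(BlockNumber blkno)
{
	return blkno & ~(PREFETCH_SIZE - 1);
}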
135 * DSM keys for parallel vacuum. Unlike other parallel execution code, since
136 * we don't need to worry about DSM keys conflicting with plan_node_id we can
137 * use small integers.
139 #define PARALLEL_VACUUM_KEY_SHARED 1
140 #define PARALLEL_VACUUM_KEY_DEAD_TUPLES 2
141 #define PARALLEL_VACUUM_KEY_QUERY_TEXT 3
142 #define PARALLEL_VACUUM_KEY_BUFFER_USAGE 4
143 #define PARALLEL_VACUUM_KEY_WAL_USAGE 5
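/*
 * Illustrative sketch, not part of the original file: the keys above are used
 * with the parallel context's table of contents in the usual shm_toc style.
 * The real sizing and setup happens in begin_parallel_vacuum(); "est_shared"
 * here is just a placeholder for the estimated size.
 */
static inline void *
parallel_vacuum_toc_insert_sketch(ParallelContext *pcxt, Size est_shared)
{
	void	   *shared = shm_toc_allocate(pcxt->toc, est_shared);

	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
	return shared;				/* workers later shm_toc_lookup() this key */
}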
146 * Macro to check if we are in a parallel vacuum. If true, we are in the
147 * parallel mode and the DSM segment is initialized.
149 #define ParallelVacuumIsActive(vacrel) ((vacrel)->lps != NULL)
151 /* Phases of vacuum during which we report error context. */
152 typedef enum
154 VACUUM_ERRCB_PHASE_UNKNOWN,
155 VACUUM_ERRCB_PHASE_SCAN_HEAP,
156 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
157 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
158 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
159 VACUUM_ERRCB_PHASE_TRUNCATE
160 } VacErrPhase;
163 * LVDeadTuples stores the dead tuple TIDs collected during the heap scan.
164 * This is allocated in the DSM segment in parallel mode and in local memory
165 * in non-parallel mode.
167 typedef struct LVDeadTuples
169 int max_tuples; /* # slots allocated in array */
170 int num_tuples; /* current # of entries */
171 /* List of TIDs of tuples we intend to delete */
172 /* NB: this list is ordered by TID address */
173 ItemPointerData itemptrs[FLEXIBLE_ARRAY_MEMBER]; /* array of
174 * ItemPointerData */
175 } LVDeadTuples;
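/*
 * Illustrative sketch, not part of the original file: keeping itemptrs in TID
 * order lets the index bulk-delete callback test membership with a binary
 * search.  The real code is lazy_tid_reaped()/vac_cmp_itemptr(), declared
 * further down; this is just the underlying idea.
 */
static inline bool
dead_tuples_contain_sketch(LVDeadTuples *dead_tuples, ItemPointer itemptr)
{
	int			lo = 0;
	int			hi = dead_tuples->num_tuples - 1;

	while (lo <= hi)
	{
		int			mid = lo + (hi - lo) / 2;
		int32		cmp = ItemPointerCompare(&dead_tuples->itemptrs[mid],
											 itemptr);

		if (cmp == 0)
			return true;
		if (cmp < 0)
			lo = mid + 1;
		else
			hi = mid - 1;
	}

	return false;
}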
177 /* The dead tuple space consists of LVDeadTuples and dead tuple TIDs */
178 #define SizeOfDeadTuples(cnt) \
179 add_size(offsetof(LVDeadTuples, itemptrs), \
180 mul_size(sizeof(ItemPointerData), cnt))
181 #define MAXDEADTUPLES(max_size) \
182 (((max_size) - offsetof(LVDeadTuples, itemptrs)) / sizeof(ItemPointerData))
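/*
 * Illustrative sketch, not part of the original file: roughly how the macros
 * above bound the size of the dead-tuple array, following the memory-usage
 * rules laid out in the header comment.  The real logic is
 * compute_max_dead_tuples(), declared below; this simplified version ignores
 * autovacuum_work_mem and the overflow/clamping details.
 */
static inline long
max_dead_tuples_sketch(BlockNumber relblocks, bool hasindex)
{
	long		maxtuples;

	if (!hasindex)
		return MaxHeapTuplesPerPage;	/* one heap page's worth suffices */

	/* as many TIDs as fit in maintenance_work_mem ... */
	maxtuples = MAXDEADTUPLES(maintenance_work_mem * 1024L);
	/* ... but no more than the table could plausibly need ... */
	maxtuples = Min(maxtuples, (long) relblocks * LAZY_ALLOC_TUPLES);
	/* ... and never less than one heap page's worth */
	return Max(maxtuples, MaxHeapTuplesPerPage);
}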
185 * Shared information among parallel workers, so this is allocated in the DSM
186 * segment.
188 typedef struct LVShared
191 * Target table relid and log level. These fields are not modified during
192 * the lazy vacuum.
194 Oid relid;
195 int elevel;
198 * An indication for vacuum workers to perform either index vacuum or
199 * index cleanup. first_time is true only if for_cleanup is true and
200 * bulk-deletion has not been performed yet.
202 bool for_cleanup;
203 bool first_time;
206 * Fields for both index vacuum and cleanup.
208 * reltuples is the total number of input heap tuples. We set it to the old
209 * live tuples in the index vacuum case, or to the new live tuples in the
210 * index cleanup case.
212 * estimated_count is true if reltuples is an estimated value. (Note that
213 * reltuples could be -1 in this case, indicating we have no idea.)
215 double reltuples;
216 bool estimated_count;
219 * In single process lazy vacuum we could consume more memory during index
220 * vacuuming or cleanup apart from the memory for heap scanning. In
221 * parallel vacuum, since individual vacuum workers can consume memory
222 * equal to maintenance_work_mem, the new maintenance_work_mem for each
223 * worker is set such that the parallel operation doesn't consume more
224 * memory than single process lazy vacuum.
226 int maintenance_work_mem_worker;
229 * Shared vacuum cost balance. During parallel vacuum,
230 * VacuumSharedCostBalance points to this value and it accumulates the
231 * balance of each parallel vacuum worker.
233 pg_atomic_uint32 cost_balance;
236 * Number of active parallel workers. This is used for computing the
237 * minimum threshold of the vacuum cost balance before a worker sleeps for
238 * cost-based delay.
240 pg_atomic_uint32 active_nworkers;
243 * Variables to control parallel vacuum. We have a bitmap to indicate
244 * which index has stats in shared memory. The set bit in the map
245 * indicates that the particular index supports a parallel vacuum.
247 pg_atomic_uint32 idx; /* counter for vacuuming and clean up */
248 uint32 offset; /* sizeof header incl. bitmap */
249 bits8 bitmap[FLEXIBLE_ARRAY_MEMBER]; /* bit map of indexes with stats */
251 /* Shared index statistics data follows at end of struct */
252 } LVShared;
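/*
 * Illustrative sketch, not part of the original file: the per-worker budget
 * described above is essentially maintenance_work_mem split among the workers
 * that will actually consume it.  The real computation is done in
 * begin_parallel_vacuum() and only counts indexes whose AM uses
 * maintenance_work_mem.
 */
static inline int
maintenance_work_mem_per_worker_sketch(int nworkers, int nindexes_using_mwm)
{
	if (nindexes_using_mwm == 0)
		return maintenance_work_mem;	/* nothing to divide up */

	return maintenance_work_mem / Min(nworkers, nindexes_using_mwm);
}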
254 #define SizeOfLVShared (offsetof(LVShared, bitmap) + sizeof(bits8))
255 #define GetSharedIndStats(s) \
256 ((LVSharedIndStats *)((char *)(s) + ((LVShared *)(s))->offset))
257 #define IndStatsIsNull(s, i) \
258 (!(((LVShared *)(s))->bitmap[(i) >> 3] & (1 << ((i) & 0x07))))
261 * Struct for an index bulk-deletion statistic used for parallel vacuum. This
262 * is allocated in the DSM segment.
264 typedef struct LVSharedIndStats
266 bool updated; /* are the stats updated? */
267 IndexBulkDeleteResult istat;
268 } LVSharedIndStats;
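/*
 * Illustrative sketch, not part of the original file: how the offset and
 * bitmap in LVShared locate the stats slot for one index.  Indexes whose bit
 * is clear have no slot at all, so they are skipped while walking the array.
 * The real lookup is parallel_stats_for_idx(), declared below.
 */
static inline LVSharedIndStats *
shared_ind_stats_sketch(LVShared *lvshared, int getidx)
{
	char	   *p;

	if (IndStatsIsNull(lvshared, getidx))
		return NULL;			/* this index has no shared stats slot */

	p = (char *) GetSharedIndStats(lvshared);
	for (int i = 0; i < getidx; i++)
	{
		if (!IndStatsIsNull(lvshared, i))
			p += sizeof(LVSharedIndStats);
	}

	return (LVSharedIndStats *) p;
}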
270 /* Struct for maintaining a parallel vacuum state. */
271 typedef struct LVParallelState
273 ParallelContext *pcxt;
275 /* Shared information among parallel vacuum workers */
276 LVShared *lvshared;
278 /* Points to buffer usage area in DSM */
279 BufferUsage *buffer_usage;
281 /* Points to WAL usage area in DSM */
282 WalUsage *wal_usage;
285 * The number of indexes that support parallel index bulk-deletion, parallel
286 * index cleanup, and parallel index conditional cleanup, respectively.
288 int nindexes_parallel_bulkdel;
289 int nindexes_parallel_cleanup;
290 int nindexes_parallel_condcleanup;
291 } LVParallelState;
293 typedef struct LVRelState
295 /* Target heap relation and its indexes */
296 Relation rel;
297 Relation *indrels;
298 int nindexes;
299 /* useindex = true means two-pass strategy; false means one-pass */
300 bool useindex;
302 /* Buffer access strategy and parallel state */
303 BufferAccessStrategy bstrategy;
304 LVParallelState *lps;
306 /* Statistics from pg_class when we start out */
307 BlockNumber old_rel_pages; /* previous value of pg_class.relpages */
308 double old_live_tuples; /* previous value of pg_class.reltuples */
309 /* rel's initial relfrozenxid and relminmxid */
310 TransactionId relfrozenxid;
311 MultiXactId relminmxid;
312 TransactionId latestRemovedXid;
314 /* VACUUM operation's cutoff for pruning */
315 TransactionId OldestXmin;
316 /* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */
317 TransactionId FreezeLimit;
318 MultiXactId MultiXactCutoff;
320 /* Error reporting state */
321 char *relnamespace;
322 char *relname;
323 char *indname;
324 BlockNumber blkno; /* used only for heap operations */
325 OffsetNumber offnum; /* used only for heap operations */
326 VacErrPhase phase;
329 * State managed by lazy_scan_heap() follows
331 LVDeadTuples *dead_tuples; /* items to vacuum from indexes */
332 BlockNumber rel_pages; /* total number of pages */
333 BlockNumber scanned_pages; /* number of pages we examined */
334 BlockNumber pinskipped_pages; /* # of pages skipped due to a pin */
335 BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */
336 BlockNumber tupcount_pages; /* pages whose tuples we counted */
337 BlockNumber pages_removed; /* pages removed by truncation */
338 BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
339 bool lock_waiter_detected;
341 /* Statistics output by us, for table */
342 double new_rel_tuples; /* new estimated total # of tuples */
343 double new_live_tuples; /* new estimated total # of live tuples */
344 /* Statistics output by index AMs */
345 IndexBulkDeleteResult **indstats;
347 /* Instrumentation counters */
348 int num_index_scans;
349 int64 tuples_deleted; /* # deleted from table */
350 int64 new_dead_tuples; /* new estimated total # of dead items in
351 * table */
352 int64 num_tuples; /* total number of nonremovable tuples */
353 int64 live_tuples; /* live tuples (reltuples estimate) */
354 } LVRelState;
356 /* Struct for saving and restoring vacuum error information. */
357 typedef struct LVSavedErrInfo
359 BlockNumber blkno;
360 OffsetNumber offnum;
361 VacErrPhase phase;
362 } LVSavedErrInfo;
364 /* elevel controls whole VACUUM's verbosity */
365 static int elevel = -1;
368 /* non-export function prototypes */
369 static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params,
370 bool aggressive);
371 static void lazy_vacuum_all_indexes(LVRelState *vacrel);
372 static void lazy_vacuum_heap_rel(LVRelState *vacrel);
373 static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
374 Buffer buffer, int tupindex, Buffer *vmbuffer);
375 static bool lazy_check_needs_freeze(Buffer buf, bool *hastup,
376 LVRelState *vacrel);
377 static void do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel);
378 static void do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel);
379 static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers);
380 static void do_parallel_processing(LVRelState *vacrel,
381 LVShared *lvshared);
382 static void do_serial_processing_for_unsafe_indexes(LVRelState *vacrel,
383 LVShared *lvshared);
384 static IndexBulkDeleteResult *parallel_process_one_index(Relation indrel,
385 IndexBulkDeleteResult *istat,
386 LVShared *lvshared,
387 LVSharedIndStats *shared_indstats,
388 LVRelState *vacrel);
389 static void lazy_cleanup_all_indexes(LVRelState *vacrel);
390 static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
391 IndexBulkDeleteResult *istat,
392 double reltuples,
393 LVRelState *vacrel);
394 static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel,
395 IndexBulkDeleteResult *istat,
396 double reltuples,
397 bool estimated_count,
398 LVRelState *vacrel);
399 static bool should_attempt_truncation(LVRelState *vacrel,
400 VacuumParams *params);
401 static void lazy_truncate_heap(LVRelState *vacrel);
402 static BlockNumber count_nondeletable_pages(LVRelState *vacrel);
403 static long compute_max_dead_tuples(BlockNumber relblocks, bool hasindex);
404 static void lazy_space_alloc(LVRelState *vacrel, int nworkers,
405 BlockNumber relblocks);
406 static void lazy_space_free(LVRelState *vacrel);
407 static void lazy_record_dead_tuple(LVDeadTuples *dead_tuples,
408 ItemPointer itemptr);
409 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
410 static int vac_cmp_itemptr(const void *left, const void *right);
411 static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
412 TransactionId *visibility_cutoff_xid, bool *all_frozen);
413 static int compute_parallel_vacuum_workers(LVRelState *vacrel,
414 int nrequested,
415 bool *can_parallel_vacuum);
416 static void update_index_statistics(LVRelState *vacrel);
417 static LVParallelState *begin_parallel_vacuum(LVRelState *vacrel,
418 BlockNumber nblocks,
419 int nrequested);
420 static void end_parallel_vacuum(LVRelState *vacrel);
421 static LVSharedIndStats *parallel_stats_for_idx(LVShared *lvshared, int getidx);
422 static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared);
423 static void vacuum_error_callback(void *arg);
424 static void update_vacuum_error_info(LVRelState *vacrel,
425 LVSavedErrInfo *saved_vacrel,
426 int phase, BlockNumber blkno,
427 OffsetNumber offnum);
428 static void restore_vacuum_error_info(LVRelState *vacrel,
429 const LVSavedErrInfo *saved_vacrel);
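/*
 * Illustrative sketch, not part of the original file: the typical way
 * LVSavedErrInfo is paired with update_vacuum_error_info() and
 * restore_vacuum_error_info() around a nested phase, as the index vacuum and
 * cleanup routines below do.  "work" stands in for the nested phase's actual
 * processing.
 */
static inline void
run_phase_with_errcontext_sketch(LVRelState *vacrel, VacErrPhase phase,
								 void (*work) (LVRelState *vacrel))
{
	LVSavedErrInfo saved_err_info;

	update_vacuum_error_info(vacrel, &saved_err_info, phase,
							 InvalidBlockNumber, InvalidOffsetNumber);
	work(vacrel);
	restore_vacuum_error_info(vacrel, &saved_err_info);
}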
433 * heap_vacuum_rel() -- perform VACUUM for one heap relation
435 * This routine vacuums a single heap, cleans out its indexes, and
436 * updates its relpages and reltuples statistics.
438 * At entry, we have already established a transaction and opened
439 * and locked the relation.
441 void
442 heap_vacuum_rel(Relation rel, VacuumParams *params,
443 BufferAccessStrategy bstrategy)
445 LVRelState *vacrel;
446 PGRUsage ru0;
447 TimestampTz starttime = 0;
448 WalUsage walusage_start = pgWalUsage;
449 WalUsage walusage = {0, 0, 0};
450 long secs;
451 int usecs;
452 double read_rate,
453 write_rate;
454 bool aggressive; /* should we scan all unfrozen pages? */
455 bool scanned_all_unfrozen; /* actually scanned all such pages? */
456 char **indnames = NULL;
457 TransactionId xidFullScanLimit;
458 MultiXactId mxactFullScanLimit;
459 BlockNumber new_rel_pages;
460 BlockNumber new_rel_allvisible;
461 double new_live_tuples;
462 TransactionId new_frozen_xid;
463 MultiXactId new_min_multi;
464 ErrorContextCallback errcallback;
465 PgStat_Counter startreadtime = 0;
466 PgStat_Counter startwritetime = 0;
467 TransactionId OldestXmin;
468 TransactionId FreezeLimit;
469 MultiXactId MultiXactCutoff;
471 Assert(params != NULL);
472 Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT);
473 Assert(params->truncate != VACOPT_TERNARY_DEFAULT);
475 /* measure elapsed time iff autovacuum logging requires it */
476 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
478 pg_rusage_init(&ru0);
479 starttime = GetCurrentTimestamp();
480 if (track_io_timing)
482 startreadtime = pgStatBlockReadTime;
483 startwritetime = pgStatBlockWriteTime;
487 if (params->options & VACOPT_VERBOSE)
488 elevel = INFO;
489 else
490 elevel = DEBUG2;
492 pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
493 RelationGetRelid(rel));
495 vacuum_set_xid_limits(rel,
496 params->freeze_min_age,
497 params->freeze_table_age,
498 params->multixact_freeze_min_age,
499 params->multixact_freeze_table_age,
500 &OldestXmin, &FreezeLimit, &xidFullScanLimit,
501 &MultiXactCutoff, &mxactFullScanLimit);
504 * We request an aggressive scan if the table's frozen Xid is now older
505 * than or equal to the requested Xid full-table scan limit; or if the
506 * table's minimum MultiXactId is older than or equal to the requested
507 * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
509 aggressive = TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid,
510 xidFullScanLimit);
511 aggressive |= MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid,
512 mxactFullScanLimit);
513 if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
514 aggressive = true;
516 vacrel = (LVRelState *) palloc0(sizeof(LVRelState));
518 /* Set up high level stuff about rel */
519 vacrel->rel = rel;
520 vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes,
521 &vacrel->indrels);
522 vacrel->useindex = (vacrel->nindexes > 0 &&
523 params->index_cleanup == VACOPT_TERNARY_ENABLED);
524 vacrel->bstrategy = bstrategy;
525 vacrel->old_rel_pages = rel->rd_rel->relpages;
526 vacrel->old_live_tuples = rel->rd_rel->reltuples;
527 vacrel->relfrozenxid = rel->rd_rel->relfrozenxid;
528 vacrel->relminmxid = rel->rd_rel->relminmxid;
529 vacrel->latestRemovedXid = InvalidTransactionId;
531 /* Set cutoffs for entire VACUUM */
532 vacrel->OldestXmin = OldestXmin;
533 vacrel->FreezeLimit = FreezeLimit;
534 vacrel->MultiXactCutoff = MultiXactCutoff;
536 vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel));
537 vacrel->relname = pstrdup(RelationGetRelationName(rel));
538 vacrel->indname = NULL;
539 vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN;
541 /* Save index names iff autovacuum logging requires it */
542 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0 &&
543 vacrel->nindexes > 0)
545 indnames = palloc(sizeof(char *) * vacrel->nindexes);
546 for (int i = 0; i < vacrel->nindexes; i++)
547 indnames[i] =
548 pstrdup(RelationGetRelationName(vacrel->indrels[i]));
552 * Setup error traceback support for ereport(). The idea is to set up an
553 * error context callback to display additional information on any error
554 * during a vacuum. During different phases of vacuum (heap scan, heap
555 * vacuum, index vacuum, index clean up, heap truncate), we update the
556 * error context callback to display appropriate information.
558 * Note that the index vacuum and heap vacuum phases may be called
559 * multiple times in the middle of the heap scan phase. So the old phase
560 * information is restored at the end of those phases.
562 errcallback.callback = vacuum_error_callback;
563 errcallback.arg = vacrel;
564 errcallback.previous = error_context_stack;
565 error_context_stack = &errcallback;
567 /* Do the vacuuming */
568 lazy_scan_heap(vacrel, params, aggressive);
570 /* Done with indexes */
571 vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock);
574 * Compute whether we actually scanned all of the unfrozen pages. If we did,
575 * we can adjust relfrozenxid and relminmxid.
577 * NB: We need to check this before truncating the relation, because that
578 * will change ->rel_pages.
580 if ((vacrel->scanned_pages + vacrel->frozenskipped_pages)
581 < vacrel->rel_pages)
583 Assert(!aggressive);
584 scanned_all_unfrozen = false;
586 else
587 scanned_all_unfrozen = true;
590 * Optionally truncate the relation.
592 if (should_attempt_truncation(vacrel, params))
595 * Update error traceback information. This is the last phase during
596 * which we add context information to errors, so we don't need to
597 * revert to the previous phase.
599 update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE,
600 vacrel->nonempty_pages,
601 InvalidOffsetNumber);
602 lazy_truncate_heap(vacrel);
605 /* Pop the error context stack */
606 error_context_stack = errcallback.previous;
608 /* Report that we are now doing final cleanup */
609 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
610 PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
613 * Update statistics in pg_class.
615 * In principle new_live_tuples could be -1 indicating that we (still)
616 * don't know the tuple count. In practice that probably can't happen,
617 * since we'd surely have scanned some pages if the table is new and
618 * nonempty.
620 * For safety, clamp relallvisible to be not more than what we're setting
621 * relpages to.
623 * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
624 * since then we don't know for certain that all tuples have a newer xmin.
626 new_rel_pages = vacrel->rel_pages;
627 new_live_tuples = vacrel->new_live_tuples;
629 visibilitymap_count(rel, &new_rel_allvisible, NULL);
630 if (new_rel_allvisible > new_rel_pages)
631 new_rel_allvisible = new_rel_pages;
633 new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
634 new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;
636 vac_update_relstats(rel,
637 new_rel_pages,
638 new_live_tuples,
639 new_rel_allvisible,
640 vacrel->nindexes > 0,
641 new_frozen_xid,
642 new_min_multi,
643 false);
645 /* report results to the stats collector, too */
646 pgstat_report_vacuum(RelationGetRelid(rel),
647 rel->rd_rel->relisshared,
648 Max(new_live_tuples, 0),
649 vacrel->new_dead_tuples);
650 pgstat_progress_end_command();
652 /* and log the action if appropriate */
653 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
655 TimestampTz endtime = GetCurrentTimestamp();
657 if (params->log_min_duration == 0 ||
658 TimestampDifferenceExceeds(starttime, endtime,
659 params->log_min_duration))
661 StringInfoData buf;
662 char *msgfmt;
664 TimestampDifference(starttime, endtime, &secs, &usecs);
666 memset(&walusage, 0, sizeof(WalUsage));
667 WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
669 read_rate = 0;
670 write_rate = 0;
671 if ((secs > 0) || (usecs > 0))
673 read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
674 (secs + usecs / 1000000.0);
675 write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
676 (secs + usecs / 1000000.0);
680 * This is pretty messy, but we split it up so that we can skip
681 * emitting individual parts of the message when not applicable.
683 initStringInfo(&buf);
684 if (params->is_wraparound)
687 * While it's possible for a VACUUM to be both is_wraparound
688 * and !aggressive, that's just a corner-case -- is_wraparound
689 * implies aggressive. Produce distinct output for the corner
690 * case all the same, just in case.
692 if (aggressive)
693 msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
694 else
695 msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
697 else
699 if (aggressive)
700 msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
701 else
702 msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
704 appendStringInfo(&buf, msgfmt,
705 get_database_name(MyDatabaseId),
706 vacrel->relnamespace,
707 vacrel->relname,
708 vacrel->num_index_scans);
709 appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
710 vacrel->pages_removed,
711 vacrel->rel_pages,
712 vacrel->pinskipped_pages,
713 vacrel->frozenskipped_pages);
714 appendStringInfo(&buf,
715 _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"),
716 (long long) vacrel->tuples_deleted,
717 (long long) vacrel->new_rel_tuples,
718 (long long) vacrel->new_dead_tuples,
719 OldestXmin);
720 appendStringInfo(&buf,
721 _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
722 (long long) VacuumPageHit,
723 (long long) VacuumPageMiss,
724 (long long) VacuumPageDirty);
725 for (int i = 0; i < vacrel->nindexes; i++)
727 IndexBulkDeleteResult *istat = vacrel->indstats[i];
729 if (!istat)
730 continue;
732 appendStringInfo(&buf,
733 _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
734 indnames[i],
735 istat->num_pages,
736 istat->pages_newly_deleted,
737 istat->pages_deleted,
738 istat->pages_free);
740 appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
741 read_rate, write_rate);
742 if (track_io_timing)
744 appendStringInfoString(&buf, _("I/O Timings:"));
745 if (pgStatBlockReadTime - startreadtime > 0)
746 appendStringInfo(&buf, _(" read=%.3f"),
747 (double) (pgStatBlockReadTime - startreadtime) / 1000);
748 if (pgStatBlockWriteTime - startwritetime > 0)
749 appendStringInfo(&buf, _(" write=%.3f"),
750 (double) (pgStatBlockWriteTime - startwritetime) / 1000);
751 appendStringInfoChar(&buf, '\n');
753 appendStringInfo(&buf, _("system usage: %s\n"), pg_rusage_show(&ru0));
754 appendStringInfo(&buf,
755 _("WAL usage: %ld records, %ld full page images, %llu bytes"),
756 walusage.wal_records,
757 walusage.wal_fpi,
758 (unsigned long long) walusage.wal_bytes);
760 ereport(LOG,
761 (errmsg_internal("%s", buf.data)));
762 pfree(buf.data);
766 /* Cleanup index statistics and index names */
767 for (int i = 0; i < vacrel->nindexes; i++)
769 if (vacrel->indstats[i])
770 pfree(vacrel->indstats[i]);
772 if (indnames && indnames[i])
773 pfree(indnames[i]);
778 * For Hot Standby we need to know the highest transaction id that will
779 * be removed by any change. VACUUM proceeds in a number of passes so
780 * we need to consider how each pass operates. The first phase runs
781 * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
782 * progresses - these will have a latestRemovedXid on each record.
783 * In some cases this removes all of the tuples to be removed, though
784 * often we have dead tuples with index pointers so we must remember them
785 * for removal in phase 3. Index records for those rows are removed
786 * in phase 2 and index blocks do not have MVCC information attached.
787 * So before we can allow removal of any index tuples we need to issue
788 * a WAL record containing the latestRemovedXid of rows that will be
789 * removed in phase three. This allows recovery queries to block at the
790 * correct place, i.e. before phase two, rather than during phase three
791 * which would be after the rows have become inaccessible.
793 static void
794 vacuum_log_cleanup_info(LVRelState *vacrel)
797 * Skip this for relations for which no WAL is to be written, or if we're
798 * not trying to support archive recovery.
800 if (!RelationNeedsWAL(vacrel->rel) || !XLogIsNeeded())
801 return;
804 * No need to write the record at all unless it contains a valid value
806 if (TransactionIdIsValid(vacrel->latestRemovedXid))
807 (void) log_heap_cleanup_info(vacrel->rel->rd_node,
808 vacrel->latestRemovedXid);
812 * lazy_scan_heap() -- scan an open heap relation
814 * This routine prunes each page in the heap, which will among other
815 * things truncate dead tuples to dead line pointers, defragment the
816 * page, and set commit status bits (see heap_page_prune). It also builds
817 * lists of dead tuples and pages with free space, calculates statistics
818 * on the number of live tuples in the heap, and marks pages as
819 * all-visible if appropriate. When done, or when we run low on space
820 * for dead-tuple TIDs, invoke vacuuming of indexes and reclaim dead line
821 * pointers.
823 * If the table has at least two indexes, we execute both index vacuum
824 * and index cleanup with parallel workers unless parallel vacuum is
825 * disabled. In a parallel vacuum, we enter parallel mode and then
826 * create both the parallel context and the DSM segment before starting
827 * heap scan so that we can record dead tuples to the DSM segment. All
828 * parallel workers are launched at beginning of index vacuuming and
829 * index cleanup and they exit once done with all indexes. At the end of
830 * this function we exit from parallel mode. Index bulk-deletion results
831 * are stored in the DSM segment and we update index statistics for all
832 * the indexes after exiting from parallel mode since writes are not
833 * allowed during parallel mode.
835 * If there are no indexes then we can reclaim line pointers on the fly;
836 * dead line pointers need only be retained until all index pointers that
837 * reference them have been killed.
839 static void
840 lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
842 LVDeadTuples *dead_tuples;
843 BlockNumber nblocks,
844 blkno;
845 HeapTupleData tuple;
846 BlockNumber empty_pages,
847 vacuumed_pages,
848 next_fsm_block_to_vacuum;
849 double num_tuples, /* total number of nonremovable tuples */
850 live_tuples, /* live tuples (reltuples estimate) */
851 tups_vacuumed, /* tuples cleaned up by current vacuum */
852 nkeep, /* dead-but-not-removable tuples */
853 nunused; /* # existing unused line pointers */
854 int i;
855 PGRUsage ru0;
856 Buffer vmbuffer = InvalidBuffer;
857 BlockNumber next_unskippable_block;
858 bool skipping_blocks;
859 xl_heap_freeze_tuple *frozen;
860 StringInfoData buf;
861 const int initprog_index[] = {
862 PROGRESS_VACUUM_PHASE,
863 PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
864 PROGRESS_VACUUM_MAX_DEAD_TUPLES
866 int64 initprog_val[3];
867 GlobalVisState *vistest;
869 pg_rusage_init(&ru0);
871 if (aggressive)
872 ereport(elevel,
873 (errmsg("aggressively vacuuming \"%s.%s\"",
874 vacrel->relnamespace,
875 vacrel->relname)));
876 else
877 ereport(elevel,
878 (errmsg("vacuuming \"%s.%s\"",
879 vacrel->relnamespace,
880 vacrel->relname)));
882 empty_pages = vacuumed_pages = 0;
883 next_fsm_block_to_vacuum = (BlockNumber) 0;
884 num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0;
886 nblocks = RelationGetNumberOfBlocks(vacrel->rel);
887 vacrel->rel_pages = nblocks;
888 vacrel->scanned_pages = 0;
889 vacrel->pinskipped_pages = 0;
890 vacrel->frozenskipped_pages = 0;
891 vacrel->tupcount_pages = 0;
892 vacrel->pages_removed = 0;
893 vacrel->nonempty_pages = 0;
894 vacrel->lock_waiter_detected = false;
896 /* Initialize instrumentation counters */
897 vacrel->num_index_scans = 0;
898 vacrel->tuples_deleted = 0;
899 vacrel->new_dead_tuples = 0;
900 vacrel->num_tuples = 0;
901 vacrel->live_tuples = 0;
903 vistest = GlobalVisTestFor(vacrel->rel);
905 vacrel->indstats = (IndexBulkDeleteResult **)
906 palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *));
909 * Allocate the space for dead tuples. Note that this handles parallel
910 * VACUUM initialization as part of allocating shared memory space used
911 * for dead_tuples.
913 lazy_space_alloc(vacrel, params->nworkers, nblocks);
914 dead_tuples = vacrel->dead_tuples;
915 frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
917 /* Report that we're scanning the heap, advertising total # of blocks */
918 initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
919 initprog_val[1] = nblocks;
920 initprog_val[2] = dead_tuples->max_tuples;
921 pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
924 * Except when aggressive is set, we want to skip pages that are
925 * all-visible according to the visibility map, but only when we can skip
926 * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading
927 * sequentially, the OS should be doing readahead for us, so there's no
928 * gain in skipping a page now and then; that's likely to disable
929 * readahead and so be counterproductive. Also, skipping even a single
930 * page means that we can't update relfrozenxid, so we only want to do it
931 * if we can skip a goodly number of pages.
933 * When aggressive is set, we can't skip pages just because they are
934 * all-visible, but we can still skip pages that are all-frozen, since
935 * such pages do not need freezing and do not affect the value that we can
936 * safely set for relfrozenxid or relminmxid.
938 * Before entering the main loop, establish the invariant that
939 * next_unskippable_block is the next block number >= blkno that we can't
940 * skip based on the visibility map, either all-visible for a regular scan
941 * or all-frozen for an aggressive scan. We set it to nblocks if there's
942 * no such block. We also set up the skipping_blocks flag correctly at
943 * this stage.
945 * Note: The value returned by visibilitymap_get_status could be slightly
946 * out-of-date, since we make this test before reading the corresponding
947 * heap page or locking the buffer. This is OK. If we mistakenly think
948 * that the page is all-visible or all-frozen when in fact the flag's just
949 * been cleared, we might fail to vacuum the page. It's easy to see that
950 * skipping a page when aggressive is not set is not a very big deal; we
951 * might leave some dead tuples lying around, but the next vacuum will
952 * find them. But even when aggressive *is* set, it's still OK if we miss
953 * a page whose all-frozen marking has just been cleared. Any new XIDs
954 * just added to that page are necessarily newer than the GlobalXmin we
955 * computed, so they'll have no effect on the value to which we can safely
956 * set relfrozenxid. A similar argument applies for MXIDs and relminmxid.
958 * We will scan the table's last page, at least to the extent of
959 * determining whether it has tuples or not, even if it should be skipped
960 * according to the above rules; except when we've already determined that
961 * it's not worth trying to truncate the table. This avoids having
962 * lazy_truncate_heap() take access-exclusive lock on the table to attempt
963 * a truncation that just fails immediately because there are tuples in
964 * the last page. This is worth avoiding mainly because such a lock must
965 * be replayed on any hot standby, where it can be disruptive.
967 next_unskippable_block = 0;
968 if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
970 while (next_unskippable_block < nblocks)
972 uint8 vmstatus;
974 vmstatus = visibilitymap_get_status(vacrel->rel,
975 next_unskippable_block,
976 &vmbuffer);
977 if (aggressive)
979 if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
980 break;
982 else
984 if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
985 break;
987 vacuum_delay_point();
988 next_unskippable_block++;
992 if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
993 skipping_blocks = true;
994 else
995 skipping_blocks = false;
997 for (blkno = 0; blkno < nblocks; blkno++)
999 Buffer buf;
1000 Page page;
1001 OffsetNumber offnum,
1002 maxoff;
1003 bool tupgone,
1004 hastup;
1005 int prev_dead_count;
1006 int nfrozen;
1007 Size freespace;
1008 bool all_visible_according_to_vm = false;
1009 bool all_visible;
1010 bool all_frozen = true; /* provided all_visible is also true */
1011 bool has_dead_items; /* includes existing LP_DEAD items */
1012 TransactionId visibility_cutoff_xid = InvalidTransactionId;
1014 /* see note above about forcing scanning of last page */
1015 #define FORCE_CHECK_PAGE() \
1016 (blkno == nblocks - 1 && should_attempt_truncation(vacrel, params))
1018 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1020 update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
1021 blkno, InvalidOffsetNumber);
1023 if (blkno == next_unskippable_block)
1025 /* Time to advance next_unskippable_block */
1026 next_unskippable_block++;
1027 if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
1029 while (next_unskippable_block < nblocks)
1031 uint8 vmskipflags;
1033 vmskipflags = visibilitymap_get_status(vacrel->rel,
1034 next_unskippable_block,
1035 &vmbuffer);
1036 if (aggressive)
1038 if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
1039 break;
1041 else
1043 if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
1044 break;
1046 vacuum_delay_point();
1047 next_unskippable_block++;
1052 * We know we can't skip the current block. But set up
1053 * skipping_blocks to do the right thing at the following blocks.
1055 if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
1056 skipping_blocks = true;
1057 else
1058 skipping_blocks = false;
1061 * Normally, the fact that we can't skip this block must mean that
1062 * it's not all-visible. But in an aggressive vacuum we know only
1063 * that it's not all-frozen, so it might still be all-visible.
1065 if (aggressive && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1066 all_visible_according_to_vm = true;
1068 else
1071 * The current block is potentially skippable; if we've seen a
1072 * long enough run of skippable blocks to justify skipping it, and
1073 * we're not forced to check it, then go ahead and skip.
1074 * Otherwise, the page must be at least all-visible if not
1075 * all-frozen, so we can set all_visible_according_to_vm = true.
1077 if (skipping_blocks && !FORCE_CHECK_PAGE())
1080 * Tricky, tricky. If this is in aggressive vacuum, the page
1081 * must have been all-frozen at the time we checked whether it
1082 * was skippable, but it might not be any more. We must be
1083 * careful to count it as a skipped all-frozen page in that
1084 * case, or else we'll think we can't update relfrozenxid and
1085 * relminmxid. If it's not an aggressive vacuum, we don't
1086 * know whether it was all-frozen, so we have to recheck; but
1087 * in this case an approximate answer is OK.
1089 if (aggressive || VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1090 vacrel->frozenskipped_pages++;
1091 continue;
1093 all_visible_according_to_vm = true;
1096 vacuum_delay_point();
1099 * If we are close to overrunning the available space for dead-tuple
1100 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
1102 if ((dead_tuples->max_tuples - dead_tuples->num_tuples) < MaxHeapTuplesPerPage &&
1103 dead_tuples->num_tuples > 0)
1106 * Before beginning index vacuuming, we release any pin we may
1107 * hold on the visibility map page. This isn't necessary for
1108 * correctness, but we do it anyway to avoid holding the pin
1109 * across a lengthy, unrelated operation.
1111 if (BufferIsValid(vmbuffer))
1113 ReleaseBuffer(vmbuffer);
1114 vmbuffer = InvalidBuffer;
1117 /* Work on all the indexes, then the heap */
1118 lazy_vacuum_all_indexes(vacrel);
1120 /* Remove tuples from heap */
1121 lazy_vacuum_heap_rel(vacrel);
1124 * Forget the now-vacuumed tuples, and press on, but be careful
1125 * not to reset latestRemovedXid since we want that value to be
1126 * valid.
1128 dead_tuples->num_tuples = 0;
1131 * Vacuum the Free Space Map to make newly-freed space visible on
1132 * upper-level FSM pages. Note we have not yet processed blkno.
1134 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1135 blkno);
1136 next_fsm_block_to_vacuum = blkno;
1138 /* Report that we are once again scanning the heap */
1139 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1140 PROGRESS_VACUUM_PHASE_SCAN_HEAP);
1144 * Pin the visibility map page in case we need to mark the page
1145 * all-visible. In most cases this will be very cheap, because we'll
1146 * already have the correct page pinned anyway. However, it's
1147 * possible that (a) next_unskippable_block is covered by a different
1148 * VM page than the current block or (b) we released our pin and did a
1149 * cycle of index vacuuming.
1151 visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
1153 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno,
1154 RBM_NORMAL, vacrel->bstrategy);
1156 /* We need buffer cleanup lock so that we can prune HOT chains. */
1157 if (!ConditionalLockBufferForCleanup(buf))
1160 * If we're not performing an aggressive scan to guard against XID
1161 * wraparound, and we don't want to forcibly check the page, then
1162 * it's OK to skip vacuuming pages we get a lock conflict on. They
1163 * will be dealt with in some future vacuum.
1165 if (!aggressive && !FORCE_CHECK_PAGE())
1167 ReleaseBuffer(buf);
1168 vacrel->pinskipped_pages++;
1169 continue;
1173 * Read the page with share lock to see if any xids on it need to
1174 * be frozen. If not we just skip the page, after updating our
1175 * scan statistics. If there are some, we wait for cleanup lock.
1177 * We could defer the lock request further by remembering the page
1178 * and coming back to it later, or we could even register
1179 * ourselves for multiple buffers and then service whichever one
1180 * is received first. For now, this seems good enough.
1182 * If we get here with aggressive false, then we're just forcibly
1183 * checking the page, and so we don't want to insist on getting
1184 * the lock; we only need to know if the page contains tuples, so
1185 * that we can update nonempty_pages correctly. It's convenient
1186 * to use lazy_check_needs_freeze() for both situations, though.
1188 LockBuffer(buf, BUFFER_LOCK_SHARE);
1189 if (!lazy_check_needs_freeze(buf, &hastup, vacrel))
1191 UnlockReleaseBuffer(buf);
1192 vacrel->scanned_pages++;
1193 vacrel->pinskipped_pages++;
1194 if (hastup)
1195 vacrel->nonempty_pages = blkno + 1;
1196 continue;
1198 if (!aggressive)
1201 * Here, we must not advance scanned_pages; that would amount
1202 * to claiming that the page contains no freezable tuples.
1204 UnlockReleaseBuffer(buf);
1205 vacrel->pinskipped_pages++;
1206 if (hastup)
1207 vacrel->nonempty_pages = blkno + 1;
1208 continue;
1210 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1211 LockBufferForCleanup(buf);
1212 /* drop through to normal processing */
1215 vacrel->scanned_pages++;
1216 vacrel->tupcount_pages++;
1218 page = BufferGetPage(buf);
1220 if (PageIsNew(page))
1223 * All-zeroes pages can be left over if either a backend extends
1224 * the relation by a single page, but crashes before the newly
1225 * initialized page has been written out, or when bulk-extending
1226 * the relation (which creates a number of empty pages at the tail
1227 * end of the relation, but enters them into the FSM).
1229 * Note we do not enter the page into the visibilitymap. That has
1230 * the downside that we repeatedly visit this page in subsequent
1231 * vacuums, but otherwise we'll never discover the space on a
1232 * promoted standby. The harm of repeated checking ought to
1233 * normally not be too bad - the space usually should be used at
1234 * some point, otherwise there wouldn't be any regular vacuums.
1236 * Make sure these pages are in the FSM, to ensure they can be
1237 * reused. Do that by testing if there's any space recorded for
1238 * the page. If not, enter it. We do so after releasing the lock
1239 * on the heap page; the FSM is approximate, after all.
1241 UnlockReleaseBuffer(buf);
1243 empty_pages++;
1245 if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0)
1247 Size freespace;
1249 freespace = BufferGetPageSize(buf) - SizeOfPageHeaderData;
1250 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1252 continue;
1255 if (PageIsEmpty(page))
1257 empty_pages++;
1258 freespace = PageGetHeapFreeSpace(page);
1261 * Empty pages are always all-visible and all-frozen (note that
1262 * the same is currently not true for new pages, see above).
1264 if (!PageIsAllVisible(page))
1266 START_CRIT_SECTION();
1268 /* mark buffer dirty before writing a WAL record */
1269 MarkBufferDirty(buf);
1272 * It's possible that another backend has extended the heap,
1273 * initialized the page, and then failed to WAL-log the page
1274 * due to an ERROR. Since heap extension is not WAL-logged,
1275 * recovery might try to replay our record setting the page
1276 * all-visible and find that the page isn't initialized, which
1277 * will cause a PANIC. To prevent that, check whether the
1278 * page has been previously WAL-logged, and if not, do that
1279 * now.
1281 if (RelationNeedsWAL(vacrel->rel) &&
1282 PageGetLSN(page) == InvalidXLogRecPtr)
1283 log_newpage_buffer(buf, true);
1285 PageSetAllVisible(page);
1286 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1287 vmbuffer, InvalidTransactionId,
1288 VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
1289 END_CRIT_SECTION();
1292 UnlockReleaseBuffer(buf);
1293 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1294 continue;
1298 * Prune all HOT-update chains in this page.
1300 * We count tuples removed by the pruning step as removed by VACUUM
1301 * (existing LP_DEAD line pointers don't count).
1303 tups_vacuumed += heap_page_prune(vacrel->rel, buf, vistest,
1304 InvalidTransactionId, 0, false,
1305 &vacrel->latestRemovedXid,
1306 &vacrel->offnum);
1309 * Now scan the page to collect vacuumable items and check for tuples
1310 * requiring freezing.
1312 all_visible = true;
1313 has_dead_items = false;
1314 nfrozen = 0;
1315 hastup = false;
1316 prev_dead_count = dead_tuples->num_tuples;
1317 maxoff = PageGetMaxOffsetNumber(page);
1320 * Note: If you change anything in the loop below, also look at
1321 * heap_page_is_all_visible to see if that needs to be changed.
1323 for (offnum = FirstOffsetNumber;
1324 offnum <= maxoff;
1325 offnum = OffsetNumberNext(offnum))
1327 ItemId itemid;
1330 * Set the offset number so that we can display it along with any
1331 * error that occurred while processing this tuple.
1333 vacrel->offnum = offnum;
1334 itemid = PageGetItemId(page, offnum);
1336 /* Unused items require no processing, but we count 'em */
1337 if (!ItemIdIsUsed(itemid))
1339 nunused += 1;
1340 continue;
1343 /* Redirect items mustn't be touched */
1344 if (ItemIdIsRedirected(itemid))
1346 hastup = true; /* this page won't be truncatable */
1347 continue;
1350 ItemPointerSet(&(tuple.t_self), blkno, offnum);
1353 * LP_DEAD line pointers are to be vacuumed normally; but we don't
1354 * count them in tups_vacuumed, else we'd be double-counting (at
1355 * least in the common case where heap_page_prune() just freed up
1356 * a non-HOT tuple). Note also that the final tups_vacuumed value
1357 * might be very low for tables where opportunistic page pruning
1358 * happens to occur very frequently (via heap_page_prune_opt()
1359 * calls that free up non-HOT tuples).
1361 if (ItemIdIsDead(itemid))
1363 lazy_record_dead_tuple(dead_tuples, &(tuple.t_self));
1364 all_visible = false;
1365 has_dead_items = true;
1366 continue;
1369 Assert(ItemIdIsNormal(itemid));
1371 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1372 tuple.t_len = ItemIdGetLength(itemid);
1373 tuple.t_tableOid = RelationGetRelid(vacrel->rel);
1375 tupgone = false;
1378 * The criteria for counting a tuple as live in this block need to
1379 * match what analyze.c's acquire_sample_rows() does, otherwise
1380 * VACUUM and ANALYZE may produce wildly different reltuples
1381 * values, e.g. when there are many recently-dead tuples.
1383 * The logic here is a bit simpler than acquire_sample_rows(), as
1384 * VACUUM can't run inside a transaction block, which makes some
1385 * cases impossible (e.g. in-progress insert from the same
1386 * transaction).
1388 switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
1390 case HEAPTUPLE_DEAD:
1393 * Ordinarily, DEAD tuples would have been removed by
1394 * heap_page_prune(), but it's possible that the tuple
1395 * state changed since heap_page_prune() looked. In
1396 * particular an INSERT_IN_PROGRESS tuple could have
1397 * changed to DEAD if the inserter aborted. So this
1398 * cannot be considered an error condition.
1400 * If the tuple is HOT-updated then it must only be
1401 * removed by a prune operation; so we keep it just as if
1402 * it were RECENTLY_DEAD. Also, if it's a heap-only
1403 * tuple, we choose to keep it, because it'll be a lot
1404 * cheaper to get rid of it in the next pruning pass than
1405 * to treat it like an indexed tuple. Finally, if index
1406 * cleanup is disabled, the second heap pass will not
1407 * execute, and the tuple will not get removed, so we must
1408 * treat it like any other dead tuple that we choose to
1409 * keep.
1411 * If this were to happen for a tuple that actually needed
1412 * to be deleted, we'd be in trouble, because it'd
1413 * possibly leave a tuple below the relation's xmin
1414 * horizon alive. heap_prepare_freeze_tuple() is prepared
1415 * to detect that case and abort the transaction,
1416 * preventing corruption.
1418 if (HeapTupleIsHotUpdated(&tuple) ||
1419 HeapTupleIsHeapOnly(&tuple) ||
1420 params->index_cleanup == VACOPT_TERNARY_DISABLED)
1421 nkeep += 1;
1422 else
1423 tupgone = true; /* we can delete the tuple */
1424 all_visible = false;
1425 break;
1426 case HEAPTUPLE_LIVE:
1429 * Count it as live. Not only is this natural, but it's
1430 * also what acquire_sample_rows() does.
1432 live_tuples += 1;
1435 * Is the tuple definitely visible to all transactions?
1437 * NB: Like with per-tuple hint bits, we can't set the
1438 * PD_ALL_VISIBLE flag if the inserter committed
1439 * asynchronously. See SetHintBits for more info. Check
1440 * that the tuple is hinted xmin-committed because of
1441 * that.
1443 if (all_visible)
1445 TransactionId xmin;
1447 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1449 all_visible = false;
1450 break;
1454 * The inserter definitely committed. But is it old
1455 * enough that everyone sees it as committed?
1457 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1458 if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
1460 all_visible = false;
1461 break;
1464 /* Track newest xmin on page. */
1465 if (TransactionIdFollows(xmin, visibility_cutoff_xid))
1466 visibility_cutoff_xid = xmin;
1468 break;
1469 case HEAPTUPLE_RECENTLY_DEAD:
1472 * If tuple is recently deleted then we must not remove it
1473 * from relation.
1475 nkeep += 1;
1476 all_visible = false;
1477 break;
1478 case HEAPTUPLE_INSERT_IN_PROGRESS:
1481 * This is an expected case during concurrent vacuum.
1483 * We do not count these rows as live, because we expect
1484 * the inserting transaction to update the counters at
1485 * commit, and we assume that will happen only after we
1486 * report our results. This assumption is a bit shaky,
1487 * but it is what acquire_sample_rows() does, so be
1488 * consistent.
1490 all_visible = false;
1491 break;
1492 case HEAPTUPLE_DELETE_IN_PROGRESS:
1493 /* This is an expected case during concurrent vacuum */
1494 all_visible = false;
1497 * Count such rows as live. As above, we assume the
1498 * deleting transaction will commit and update the
1499 * counters after we report.
1501 live_tuples += 1;
1502 break;
1503 default:
1504 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1505 break;
1508 if (tupgone)
1510 lazy_record_dead_tuple(dead_tuples, &(tuple.t_self));
1511 HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
1512 &vacrel->latestRemovedXid);
1513 tups_vacuumed += 1;
1514 has_dead_items = true;
1516 else
1518 bool tuple_totally_frozen;
1520 num_tuples += 1;
1521 hastup = true;
1524 * Each non-removable tuple must be checked to see if it needs
1525 * freezing. Note we already have exclusive buffer lock.
1527 if (heap_prepare_freeze_tuple(tuple.t_data,
1528 vacrel->relfrozenxid,
1529 vacrel->relminmxid,
1530 vacrel->FreezeLimit,
1531 vacrel->MultiXactCutoff,
1532 &frozen[nfrozen],
1533 &tuple_totally_frozen))
1534 frozen[nfrozen++].offset = offnum;
1536 if (!tuple_totally_frozen)
1537 all_frozen = false;
1539 } /* scan along page */
1542 * Clear the offset information once we have processed all the tuples
1543 * on the page.
1545 vacrel->offnum = InvalidOffsetNumber;
1548 * If we froze any tuples, mark the buffer dirty, and write a WAL
1549 * record recording the changes. We must log the changes to be
1550 * crash-safe against future truncation of CLOG.
1552 if (nfrozen > 0)
1554 START_CRIT_SECTION();
1556 MarkBufferDirty(buf);
1558 /* execute collected freezes */
1559 for (i = 0; i < nfrozen; i++)
1561 ItemId itemid;
1562 HeapTupleHeader htup;
1564 itemid = PageGetItemId(page, frozen[i].offset);
1565 htup = (HeapTupleHeader) PageGetItem(page, itemid);
1567 heap_execute_freeze_tuple(htup, &frozen[i]);
1570 /* Now WAL-log freezing if necessary */
1571 if (RelationNeedsWAL(vacrel->rel))
1573 XLogRecPtr recptr;
1575 recptr = log_heap_freeze(vacrel->rel, buf,
1576 vacrel->FreezeLimit, frozen, nfrozen);
1577 PageSetLSN(page, recptr);
1580 END_CRIT_SECTION();
1584 * If there are no indexes we can vacuum the page right now instead of
1585 * doing a second scan. When index cleanup is disabled, we likewise don't
1586 * vacuum the page here; we just forget the dead tuples.
1588 if (!vacrel->useindex && dead_tuples->num_tuples > 0)
1590 if (vacrel->nindexes == 0)
1592 /* Remove tuples from heap if the table has no index */
1593 lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer);
1594 vacuumed_pages++;
1595 has_dead_items = false;
1597 else
1600 * Here, we have indexes but index cleanup is disabled.
1601 * Instead of vacuuming the dead tuples on the heap, we just
1602 * forget them.
1604 * Note that vacrel->dead_tuples could have tuples which
1605 * became dead after HOT-pruning but are not marked dead yet.
1606 * We do not process them because it's a very rare condition,
1607 * and the next vacuum will process them anyway.
1609 Assert(params->index_cleanup == VACOPT_TERNARY_DISABLED);
1613 * Forget the now-vacuumed tuples, and press on, but be careful
1614 * not to reset latestRemovedXid since we want that value to be
1615 * valid.
1617 dead_tuples->num_tuples = 0;
1620 * Periodically do incremental FSM vacuuming to make newly-freed
1621 * space visible on upper FSM pages. Note: although we've cleaned
1622 * the current block, we haven't yet updated its FSM entry (that
1623 * happens further down), so passing end == blkno is correct.
1625 if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
1627 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1628 blkno);
1629 next_fsm_block_to_vacuum = blkno;
1633 freespace = PageGetHeapFreeSpace(page);
1635 /* mark page all-visible, if appropriate */
1636 if (all_visible && !all_visible_according_to_vm)
1638 uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
1640 if (all_frozen)
1641 flags |= VISIBILITYMAP_ALL_FROZEN;
1644 * It should never be the case that the visibility map page is set
1645 * while the page-level bit is clear, but the reverse is allowed
1646 * (if checksums are not enabled). Regardless, set both bits so
1647 * that we get back in sync.
1649 * NB: If the heap page is all-visible but the VM bit is not set,
1650 * we don't need to dirty the heap page. However, if checksums
1651 * are enabled, we do need to make sure that the heap page is
1652 * dirtied before passing it to visibilitymap_set(), because it
1653 * may be logged. Given that this situation should only happen in
1654 * rare cases after a crash, it is not worth optimizing.
1656 PageSetAllVisible(page);
1657 MarkBufferDirty(buf);
1658 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1659 vmbuffer, visibility_cutoff_xid, flags);
1663 * As of PostgreSQL 9.2, the visibility map bit should never be set if
1664 * the page-level bit is clear. However, it's possible that the bit
1665 * got cleared after we checked it and before we took the buffer
1666 * content lock, so we must recheck before jumping to the conclusion
1667 * that something bad has happened.
1669 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1670 && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1672 elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1673 vacrel->relname, blkno);
1674 visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1675 VISIBILITYMAP_VALID_BITS);
1679 * It's possible for the value returned by
1680 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
1681 * wrong for us to see tuples that appear to not be visible to
1682 * everyone yet, while PD_ALL_VISIBLE is already set. The real safe
1683 * xmin value never moves backwards, but
1684 * GetOldestNonRemovableTransactionId() is conservative and sometimes
1685 * returns a value that's unnecessarily small, so if we see that
1686 * contradiction it just means that the tuples that we think are not
1687 * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
1688 * is correct.
1690 * There should never be dead tuples on a page with PD_ALL_VISIBLE
1691 * set, however.
1693 else if (PageIsAllVisible(page) && has_dead_items)
1695 elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1696 vacrel->relname, blkno);
1697 PageClearAllVisible(page);
1698 MarkBufferDirty(buf);
1699 visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1700 VISIBILITYMAP_VALID_BITS);
1704 * If the all-visible page is all-frozen but not marked as such yet,
1705 * mark it as all-frozen. Note that all_frozen is only valid if
1706 * all_visible is true, so we must check both.
1708 else if (all_visible_according_to_vm && all_visible && all_frozen &&
1709 !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1712 * We can pass InvalidTransactionId as the cutoff XID here,
1713 * because setting the all-frozen bit doesn't cause recovery
1714 * conflicts.
1716 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1717 vmbuffer, InvalidTransactionId,
1718 VISIBILITYMAP_ALL_FROZEN);
1721 UnlockReleaseBuffer(buf);
1723 /* Remember the location of the last page with nonremovable tuples */
1724 if (hastup)
1725 vacrel->nonempty_pages = blkno + 1;
1728 * If we remembered any tuples for deletion, then the page will be
1729 * visited again by lazy_vacuum_heap_rel, which will compute and record
1730 * its post-compaction free space. If not, then we're done with this
1731 * page, so remember its free space as-is. (This path will always be
1732 * taken if there are no indexes.)
1734 if (dead_tuples->num_tuples == prev_dead_count)
1735 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1738 /* report that everything is scanned and vacuumed */
1739 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1741 /* Clear the block number information */
1742 vacrel->blkno = InvalidBlockNumber;
1744 pfree(frozen);
1746 /* save stats for use later */
1747 vacrel->tuples_deleted = tups_vacuumed;
1748 vacrel->new_dead_tuples = nkeep;
1750 /* now we can compute the new value for pg_class.reltuples */
1751 vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, nblocks,
1752 vacrel->tupcount_pages,
1753 live_tuples);
1756 * Also compute the total number of surviving heap entries. In the
1757 * (unlikely) scenario that new_live_tuples is -1, take it as zero.
1759 vacrel->new_rel_tuples =
1760 Max(vacrel->new_live_tuples, 0) + vacrel->new_dead_tuples;
1763 * Release any remaining pin on visibility map page.
1765 if (BufferIsValid(vmbuffer))
1767 ReleaseBuffer(vmbuffer);
1768 vmbuffer = InvalidBuffer;
1771 /* If any tuples need to be deleted, perform final vacuum cycle */
1772 /* XXX put a threshold on min number of tuples here? */
1773 if (dead_tuples->num_tuples > 0)
1775 /* Work on all the indexes, and then the heap */
1776 lazy_vacuum_all_indexes(vacrel);
1778 /* Remove tuples from heap */
1779 lazy_vacuum_heap_rel(vacrel);
1783 * Vacuum the remainder of the Free Space Map. We must do this whether or
1784 * not there were indexes.
1786 if (blkno > next_fsm_block_to_vacuum)
1787 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);
1789 /* report all blocks vacuumed */
1790 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1792 /* Do post-vacuum cleanup */
1793 if (vacrel->useindex)
1794 lazy_cleanup_all_indexes(vacrel);
1797 * Free resources managed by lazy_space_alloc(). (We must end parallel
1798 * mode/free shared memory before updating index statistics. We cannot
1799 * write while in parallel mode.)
1801 lazy_space_free(vacrel);
1803 /* Update index statistics */
1804 if (vacrel->useindex)
1805 update_index_statistics(vacrel);
1807 /* If no indexes, make log report that lazy_vacuum_heap_rel would've made */
1808 if (vacuumed_pages)
1809 ereport(elevel,
1810 (errmsg("\"%s\": removed %.0f row versions in %u pages",
1811 vacrel->relname,
1812 tups_vacuumed, vacuumed_pages)));
1814 initStringInfo(&buf);
1815 appendStringInfo(&buf,
1816 _("%.0f dead row versions cannot be removed yet, oldest xmin: %u\n"),
1817 nkeep, vacrel->OldestXmin);
1818 appendStringInfo(&buf, _("There were %.0f unused item identifiers.\n"),
1819 nunused);
1820 appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
1821 "Skipped %u pages due to buffer pins, ",
1822 vacrel->pinskipped_pages),
1823 vacrel->pinskipped_pages);
1824 appendStringInfo(&buf, ngettext("%u frozen page.\n",
1825 "%u frozen pages.\n",
1826 vacrel->frozenskipped_pages),
1827 vacrel->frozenskipped_pages);
1828 appendStringInfo(&buf, ngettext("%u page is entirely empty.\n",
1829 "%u pages are entirely empty.\n",
1830 empty_pages),
1831 empty_pages);
1832 appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
1834 ereport(elevel,
1835 (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
1836 vacrel->relname,
1837 tups_vacuumed, num_tuples,
1838 vacrel->scanned_pages, nblocks),
1839 errdetail_internal("%s", buf.data)));
1840 pfree(buf.data);
1844 * lazy_vacuum_all_indexes() -- Main entry for index vacuuming
1846 static void
1847 lazy_vacuum_all_indexes(LVRelState *vacrel)
1849 Assert(!IsParallelWorker());
1850 Assert(vacrel->nindexes > 0);
1851 Assert(TransactionIdIsNormal(vacrel->relfrozenxid));
1852 Assert(MultiXactIdIsValid(vacrel->relminmxid));
1854 /* Log cleanup info before we touch indexes */
1855 vacuum_log_cleanup_info(vacrel);
1857 /* Report that we are now vacuuming indexes */
1858 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1859 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
1861 if (!ParallelVacuumIsActive(vacrel))
1863 for (int idx = 0; idx < vacrel->nindexes; idx++)
1865 Relation indrel = vacrel->indrels[idx];
1866 IndexBulkDeleteResult *istat = vacrel->indstats[idx];
1868 vacrel->indstats[idx] =
1869 lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples,
1870 vacrel);
1873 else
1875 /* Outsource everything to parallel variant */
1876 do_parallel_lazy_vacuum_all_indexes(vacrel);
1879 /* Increase and report the number of index scans */
1880 vacrel->num_index_scans++;
1881 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS,
1882 vacrel->num_index_scans);
1886 * lazy_vacuum_heap_rel() -- second pass over the heap for two-pass strategy
1888 * This routine marks dead tuples as unused and compacts out free space on
1889 * their pages. Pages not having dead tuples recorded from lazy_scan_heap are
1890 * not visited at all.
1892 * Note: the reason for doing this as a second pass is we cannot remove the
1893 * tuples until we've removed their index entries, and we want to process
1894 * index entry removal in batches as large as possible.
1896 static void
1897 lazy_vacuum_heap_rel(LVRelState *vacrel)
1899 int tupindex;
1900 int vacuumed_pages;
1901 PGRUsage ru0;
1902 Buffer vmbuffer = InvalidBuffer;
1903 LVSavedErrInfo saved_err_info;
1905 /* Report that we are now vacuuming the heap */
1906 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1907 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
1909 /* Update error traceback information */
1910 update_vacuum_error_info(vacrel, &saved_err_info,
1911 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
1912 InvalidBlockNumber, InvalidOffsetNumber);
1914 pg_rusage_init(&ru0);
1915 vacuumed_pages = 0;
1917 tupindex = 0;
1918 while (tupindex < vacrel->dead_tuples->num_tuples)
1920 BlockNumber tblk;
1921 Buffer buf;
1922 Page page;
1923 Size freespace;
1925 vacuum_delay_point();
1927 tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]);
1928 vacrel->blkno = tblk;
1929 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
1930 vacrel->bstrategy);
1931 if (!ConditionalLockBufferForCleanup(buf))
1933 ReleaseBuffer(buf);
1934 ++tupindex;
1935 continue;
1937 tupindex = lazy_vacuum_heap_page(vacrel, tblk, buf, tupindex,
1938 &vmbuffer);
1940 /* Now that we've compacted the page, record its available space */
1941 page = BufferGetPage(buf);
1942 freespace = PageGetHeapFreeSpace(page);
1944 UnlockReleaseBuffer(buf);
1945 RecordPageWithFreeSpace(vacrel->rel, tblk, freespace);
1946 vacuumed_pages++;
1949 /* Clear the block number information */
1950 vacrel->blkno = InvalidBlockNumber;
1952 if (BufferIsValid(vmbuffer))
1954 ReleaseBuffer(vmbuffer);
1955 vmbuffer = InvalidBuffer;
1958 ereport(elevel,
1959 (errmsg("\"%s\": removed %d dead item identifiers in %u pages",
1960 vacrel->relname, tupindex, vacuumed_pages),
1961 errdetail_internal("%s", pg_rusage_show(&ru0))));
1963 /* Revert to the previous phase information for error traceback */
1964 restore_vacuum_error_info(vacrel, &saved_err_info);
1968 * lazy_vacuum_heap_page() -- free dead tuples on a page
1969 * and repair its fragmentation.
1971 * Caller must hold pin and buffer cleanup lock on the buffer.
1973 * tupindex is the index in vacrel->dead_tuples of the first dead tuple for
1974 * this page. We assume the rest follow sequentially. The return value is
1975 * the first tupindex after the tuples of this page.
1977 static int
1978 lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
1979 int tupindex, Buffer *vmbuffer)
1981 LVDeadTuples *dead_tuples = vacrel->dead_tuples;
1982 Page page = BufferGetPage(buffer);
1983 OffsetNumber unused[MaxHeapTuplesPerPage];
1984 int uncnt = 0;
1985 TransactionId visibility_cutoff_xid;
1986 bool all_frozen;
1987 LVSavedErrInfo saved_err_info;
1989 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1991 /* Update error traceback information */
1992 update_vacuum_error_info(vacrel, &saved_err_info,
1993 VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno,
1994 InvalidOffsetNumber);
1996 START_CRIT_SECTION();
1998 for (; tupindex < dead_tuples->num_tuples; tupindex++)
2000 BlockNumber tblk;
2001 OffsetNumber toff;
2002 ItemId itemid;
2004 tblk = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]);
2005 if (tblk != blkno)
2006 break; /* past end of tuples for this block */
2007 toff = ItemPointerGetOffsetNumber(&dead_tuples->itemptrs[tupindex]);
2008 itemid = PageGetItemId(page, toff);
2009 ItemIdSetUnused(itemid);
2010 unused[uncnt++] = toff;
2013 PageRepairFragmentation(page);
2016 * Mark buffer dirty before we write WAL.
2018 MarkBufferDirty(buffer);
2020 /* XLOG stuff */
2021 if (RelationNeedsWAL(vacrel->rel))
2023 XLogRecPtr recptr;
2025 recptr = log_heap_clean(vacrel->rel, buffer,
2026 NULL, 0, NULL, 0,
2027 unused, uncnt,
2028 vacrel->latestRemovedXid);
2029 PageSetLSN(page, recptr);
2033 * End critical section, so we safely can do visibility tests (which
2034 * possibly need to perform IO and allocate memory!). If we crash now the
2035 * page (including the corresponding vm bit) might not be marked all
2036 * visible, but that's fine. A later vacuum will fix that.
2038 END_CRIT_SECTION();
2041 * Now that we have removed the dead tuples from the page, once again
2042 * check if the page has become all-visible. The page is already marked
2043 * dirty, exclusively locked, and, if needed, a full page image has been
2044 * emitted in the log_heap_clean() above.
2046 if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
2047 &all_frozen))
2048 PageSetAllVisible(page);
2051 * All the changes to the heap page have been done. If the all-visible
2052 * flag is now set, also set the VM all-visible bit (and, if possible, the
2053 * all-frozen bit) unless this has already been done previously.
2055 if (PageIsAllVisible(page))
2057 uint8 flags = 0;
2058 uint8 vm_status = visibilitymap_get_status(vacrel->rel,
2059 blkno, vmbuffer);
2061 /* Work out which VM bits (all-visible, all-frozen) still need to be set */
2062 if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
2063 flags |= VISIBILITYMAP_ALL_VISIBLE;
2064 if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
2065 flags |= VISIBILITYMAP_ALL_FROZEN;
2067 Assert(BufferIsValid(*vmbuffer));
2068 if (flags != 0)
2069 visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
2070 *vmbuffer, visibility_cutoff_xid, flags);
2073 /* Revert to the previous phase information for error traceback */
2074 restore_vacuum_error_info(vacrel, &saved_err_info);
2075 return tupindex;
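/*
 * Illustrative sketch (not part of the build): the return-value contract of
 * lazy_vacuum_heap_page(), expressed as a standalone helper.  Because the
 * dead-TID array is kept in (block, offset) order, "the tuples of this page"
 * are simply the consecutive entries whose block number matches, and the
 * first index past them is what gets handed back to lazy_vacuum_heap_rel().
 * The helper name is hypothetical.
 */
static int
example_skip_past_block(LVDeadTuples *dead_tuples, int tupindex)
{
	BlockNumber blkno;

	Assert(tupindex < dead_tuples->num_tuples);
	blkno = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]);

	while (tupindex < dead_tuples->num_tuples &&
		   ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]) == blkno)
		tupindex++;

	return tupindex;
}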
2079 * lazy_check_needs_freeze() -- scan page to see if any tuples
2080 * need to be cleaned to avoid wraparound
2082 * Returns true if the page needs to be vacuumed using cleanup lock.
2083 * Also returns a flag indicating whether the page contains any tuples at all.
2085 static bool
2086 lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel)
2088 Page page = BufferGetPage(buf);
2089 OffsetNumber offnum,
2090 maxoff;
2091 HeapTupleHeader tupleheader;
2093 *hastup = false;
2096 * New and empty pages, obviously, don't contain tuples. We could make
2097 * sure that the page is registered in the FSM, but it doesn't seem worth
2098 * waiting for a cleanup lock just for that, especially because it's
2099 * likely that the pin holder will do so.
2101 if (PageIsNew(page) || PageIsEmpty(page))
2102 return false;
2104 maxoff = PageGetMaxOffsetNumber(page);
2105 for (offnum = FirstOffsetNumber;
2106 offnum <= maxoff;
2107 offnum = OffsetNumberNext(offnum))
2109 ItemId itemid;
2112 * Set the offset number so that we can display it along with any
2113 * error that occurred while processing this tuple.
2115 vacrel->offnum = offnum;
2116 itemid = PageGetItemId(page, offnum);
2118 /* this should match hastup test in count_nondeletable_pages() */
2119 if (ItemIdIsUsed(itemid))
2120 *hastup = true;
2122 /* dead and redirect items never need freezing */
2123 if (!ItemIdIsNormal(itemid))
2124 continue;
2126 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
2128 if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit,
2129 vacrel->MultiXactCutoff, buf))
2130 break;
2131 } /* scan along page */
2133 /* Clear the offset information once we have processed the given page. */
2134 vacrel->offnum = InvalidOffsetNumber;
2136 return (offnum <= maxoff);
2140 * Perform lazy_vacuum_all_indexes() steps in parallel
2142 static void
2143 do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel)
2145 /* Tell parallel workers to do index vacuuming */
2146 vacrel->lps->lvshared->for_cleanup = false;
2147 vacrel->lps->lvshared->first_time = false;
2150 * We can only provide an approximate value of num_heap_tuples in vacuum
2151 * cases.
2153 vacrel->lps->lvshared->reltuples = vacrel->old_live_tuples;
2154 vacrel->lps->lvshared->estimated_count = true;
2156 do_parallel_vacuum_or_cleanup(vacrel,
2157 vacrel->lps->nindexes_parallel_bulkdel);
2161 * Perform lazy_cleanup_all_indexes() steps in parallel
2163 static void
2164 do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel)
2166 int nworkers;
2169 * If parallel vacuum is active we perform index cleanup with parallel
2170 * workers.
2172 * Tell parallel workers to do index cleanup.
2174 vacrel->lps->lvshared->for_cleanup = true;
2175 vacrel->lps->lvshared->first_time = (vacrel->num_index_scans == 0);
2178 * Now we can provide a better estimate of total number of surviving
2179 * tuples (we assume indexes are more interested in that than in the
2180 * number of nominally live tuples).
2182 vacrel->lps->lvshared->reltuples = vacrel->new_rel_tuples;
2183 vacrel->lps->lvshared->estimated_count =
2184 (vacrel->tupcount_pages < vacrel->rel_pages);
2186 /* Determine the number of parallel workers to launch */
2187 if (vacrel->lps->lvshared->first_time)
2188 nworkers = vacrel->lps->nindexes_parallel_cleanup +
2189 vacrel->lps->nindexes_parallel_condcleanup;
2190 else
2191 nworkers = vacrel->lps->nindexes_parallel_cleanup;
2193 do_parallel_vacuum_or_cleanup(vacrel, nworkers);
2197 * Perform index vacuum or index cleanup with parallel workers. This function
2198 * must be used by the parallel vacuum leader process. The caller must set
2199 * lps->lvshared->for_cleanup to indicate whether to perform vacuum or
2200 * cleanup.
2202 static void
2203 do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers)
2205 LVParallelState *lps = vacrel->lps;
2207 Assert(!IsParallelWorker());
2208 Assert(ParallelVacuumIsActive(vacrel));
2209 Assert(vacrel->nindexes > 0);
2211 /* The leader process will participate */
2212 nworkers--;
2215 * It is possible that parallel context is initialized with fewer workers
2216 * than the number of indexes that need a separate worker in the current
2217 * phase, so we need to consider it. See compute_parallel_vacuum_workers.
2219 nworkers = Min(nworkers, lps->pcxt->nworkers);
2221 /* Setup the shared cost-based vacuum delay and launch workers */
2222 if (nworkers > 0)
2224 if (vacrel->num_index_scans > 0)
2226 /* Reset the parallel index processing counter */
2227 pg_atomic_write_u32(&(lps->lvshared->idx), 0);
2229 /* Reinitialize the parallel context to relaunch parallel workers */
2230 ReinitializeParallelDSM(lps->pcxt);
2234 * Set up shared cost balance and the number of active workers for
2235 * vacuum delay. We need to do this before launching workers as
2236 * otherwise, they might not see the updated values for these
2237 * parameters.
2239 pg_atomic_write_u32(&(lps->lvshared->cost_balance), VacuumCostBalance);
2240 pg_atomic_write_u32(&(lps->lvshared->active_nworkers), 0);
2243 * The number of workers can vary between bulkdelete and cleanup
2244 * phase.
2246 ReinitializeParallelWorkers(lps->pcxt, nworkers);
2248 LaunchParallelWorkers(lps->pcxt);
2250 if (lps->pcxt->nworkers_launched > 0)
2253 * Reset the local cost values for leader backend as we have
2254 * already accumulated the remaining balance of heap.
2256 VacuumCostBalance = 0;
2257 VacuumCostBalanceLocal = 0;
2259 /* Enable shared cost balance for leader backend */
2260 VacuumSharedCostBalance = &(lps->lvshared->cost_balance);
2261 VacuumActiveNWorkers = &(lps->lvshared->active_nworkers);
2264 if (lps->lvshared->for_cleanup)
2265 ereport(elevel,
2266 (errmsg(ngettext("launched %d parallel vacuum worker for index cleanup (planned: %d)",
2267 "launched %d parallel vacuum workers for index cleanup (planned: %d)",
2268 lps->pcxt->nworkers_launched),
2269 lps->pcxt->nworkers_launched, nworkers)));
2270 else
2271 ereport(elevel,
2272 (errmsg(ngettext("launched %d parallel vacuum worker for index vacuuming (planned: %d)",
2273 "launched %d parallel vacuum workers for index vacuuming (planned: %d)",
2274 lps->pcxt->nworkers_launched),
2275 lps->pcxt->nworkers_launched, nworkers)));
2278 /* Process the indexes that can be processed by only leader process */
2279 do_serial_processing_for_unsafe_indexes(vacrel, lps->lvshared);
2282 * Join as a parallel worker. The leader process alone processes all the
2283 * indexes in the case where no workers are launched.
2285 do_parallel_processing(vacrel, lps->lvshared);
2288 * Next, accumulate buffer and WAL usage. (This must wait for the workers
2289 * to finish, or we might get incomplete data.)
2291 if (nworkers > 0)
2293 /* Wait for all vacuum workers to finish */
2294 WaitForParallelWorkersToFinish(lps->pcxt);
2296 for (int i = 0; i < lps->pcxt->nworkers_launched; i++)
2297 InstrAccumParallelQuery(&lps->buffer_usage[i], &lps->wal_usage[i]);
2301 * Carry the shared balance value to heap scan and disable shared costing
2303 if (VacuumSharedCostBalance)
2305 VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance);
2306 VacuumSharedCostBalance = NULL;
2307 VacuumActiveNWorkers = NULL;
2312 * Index vacuum/cleanup routine used by the leader process and parallel
2313 * vacuum worker processes to process the indexes in parallel.
2315 static void
2316 do_parallel_processing(LVRelState *vacrel, LVShared *lvshared)
2319 * Increment the active worker count if we are able to launch any worker.
2321 if (VacuumActiveNWorkers)
2322 pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2324 /* Loop until all indexes are vacuumed */
2325 for (;;)
2327 int idx;
2328 LVSharedIndStats *shared_istat;
2329 Relation indrel;
2330 IndexBulkDeleteResult *istat;
2332 /* Get an index number to process */
2333 idx = pg_atomic_fetch_add_u32(&(lvshared->idx), 1);
2335 /* Done for all indexes? */
2336 if (idx >= vacrel->nindexes)
2337 break;
2339 /* Get the index statistics of this index from DSM */
2340 shared_istat = parallel_stats_for_idx(lvshared, idx);
2342 /* Skip indexes not participating in parallelism */
2343 if (shared_istat == NULL)
2344 continue;
2346 indrel = vacrel->indrels[idx];
2349 * Skip processing indexes that are unsafe for workers (these are
2350 * processed in do_serial_processing_for_unsafe_indexes() by leader)
2352 if (!parallel_processing_is_safe(indrel, lvshared))
2353 continue;
2355 /* Do vacuum or cleanup of the index */
2356 istat = (vacrel->indstats[idx]);
2357 vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2358 lvshared,
2359 shared_istat,
2360 vacrel);
2364 * We have completed the index vacuum so decrement the active worker
2365 * count.
2367 if (VacuumActiveNWorkers)
2368 pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
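/*
 * Minimal sketch (not part of the build) of the work-claiming idiom used in
 * do_parallel_processing() above: each participant atomically increments a
 * shared counter to claim the next index, so no index is ever processed
 * twice.  The counter lives in shared memory; the helper and parameter names
 * here are hypothetical.
 */
static int
example_claim_next_slot(pg_atomic_uint32 *counter, int nslots)
{
	uint32		next = pg_atomic_fetch_add_u32(counter, 1);

	/* returns -1 once every slot has been handed out */
	return (next < (uint32) nslots) ? (int) next : -1;
}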
2372 * Vacuum or cleanup indexes that can be processed by only the leader process
2373 * because these indexes don't support parallel operation at that phase.
2375 static void
2376 do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared)
2378 Assert(!IsParallelWorker());
2381 * Increment the active worker count if we are able to launch any worker.
2383 if (VacuumActiveNWorkers)
2384 pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2386 for (int idx = 0; idx < vacrel->nindexes; idx++)
2388 LVSharedIndStats *shared_istat;
2389 Relation indrel;
2390 IndexBulkDeleteResult *istat;
2392 shared_istat = parallel_stats_for_idx(lvshared, idx);
2394 /* Skip already-complete indexes */
2395 if (shared_istat != NULL)
2396 continue;
2398 indrel = vacrel->indrels[idx];
2401 * We're only here for the unsafe indexes
2403 if (parallel_processing_is_safe(indrel, lvshared))
2404 continue;
2406 /* Do vacuum or cleanup of the index */
2407 istat = (vacrel->indstats[idx]);
2408 vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2409 lvshared,
2410 shared_istat,
2411 vacrel);
2415 * We have completed the index vacuum so decrement the active worker
2416 * count.
2418 if (VacuumActiveNWorkers)
2419 pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
2423 * Vacuum or cleanup one index, either in the leader process or in one of the
2424 * worker processes.  After processing the index this function copies the index
2425 * statistics returned from ambulkdelete and amvacuumcleanup to the DSM
2426 * segment.
2428 static IndexBulkDeleteResult *
2429 parallel_process_one_index(Relation indrel,
2430 IndexBulkDeleteResult *istat,
2431 LVShared *lvshared,
2432 LVSharedIndStats *shared_istat,
2433 LVRelState *vacrel)
2435 IndexBulkDeleteResult *istat_res;
2438 * Update the pointer to the corresponding bulk-deletion result if someone
2439 * has already updated it
2441 if (shared_istat && shared_istat->updated && istat == NULL)
2442 istat = &shared_istat->istat;
2444 /* Do vacuum or cleanup of the index */
2445 if (lvshared->for_cleanup)
2446 istat_res = lazy_cleanup_one_index(indrel, istat, lvshared->reltuples,
2447 lvshared->estimated_count, vacrel);
2448 else
2449 istat_res = lazy_vacuum_one_index(indrel, istat, lvshared->reltuples,
2450 vacrel);
2453 * Copy the index bulk-deletion result returned from ambulkdelete and
2454 * amvacuumcleanup to the DSM segment if it's the first cycle because they
2455 * allocate locally and it's possible that an index will be vacuumed by a
2456 * different vacuum process the next cycle. Copying the result normally
2457 * happens only the first time an index is vacuumed. For any additional
2458 * vacuum pass, we directly point to the result on the DSM segment and
2459 * pass it to vacuum index APIs so that workers can update it directly.
2461 * Since all vacuum workers write the bulk-deletion result at different
2462 * slots we can write them without locking.
2464 if (shared_istat && !shared_istat->updated && istat_res != NULL)
2466 memcpy(&shared_istat->istat, istat_res, sizeof(IndexBulkDeleteResult));
2467 shared_istat->updated = true;
2469 /* Free the locally-allocated bulk-deletion result */
2470 pfree(istat_res);
2472 /* return the pointer to the result from shared memory */
2473 return &shared_istat->istat;
2476 return istat_res;
2480 * lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
2482 static void
2483 lazy_cleanup_all_indexes(LVRelState *vacrel)
2485 Assert(!IsParallelWorker());
2486 Assert(vacrel->nindexes > 0);
2488 /* Report that we are now cleaning up indexes */
2489 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2490 PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
2492 if (!ParallelVacuumIsActive(vacrel))
2494 double reltuples = vacrel->new_rel_tuples;
2495 bool estimated_count =
2496 vacrel->tupcount_pages < vacrel->rel_pages;
2498 for (int idx = 0; idx < vacrel->nindexes; idx++)
2500 Relation indrel = vacrel->indrels[idx];
2501 IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2503 vacrel->indstats[idx] =
2504 lazy_cleanup_one_index(indrel, istat, reltuples,
2505 estimated_count, vacrel);
2508 else
2510 /* Outsource everything to parallel variant */
2511 do_parallel_lazy_cleanup_all_indexes(vacrel);
2516 * lazy_vacuum_one_index() -- vacuum index relation.
2518 * Delete all the index entries pointing to tuples listed in
2519 * dead_tuples, and update running statistics.
2521 * reltuples is the number of heap tuples to be passed to the
2522 * bulkdelete callback. It's always assumed to be estimated.
2524 * Returns bulk delete stats derived from input stats
2526 static IndexBulkDeleteResult *
2527 lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
2528 double reltuples, LVRelState *vacrel)
2530 IndexVacuumInfo ivinfo;
2531 PGRUsage ru0;
2532 LVSavedErrInfo saved_err_info;
2534 pg_rusage_init(&ru0);
2536 ivinfo.index = indrel;
2537 ivinfo.analyze_only = false;
2538 ivinfo.report_progress = false;
2539 ivinfo.estimated_count = true;
2540 ivinfo.message_level = elevel;
2541 ivinfo.num_heap_tuples = reltuples;
2542 ivinfo.strategy = vacrel->bstrategy;
2545 * Update error traceback information.
2547 * The index name is saved during this phase and restored immediately
2548 * after this phase. See vacuum_error_callback.
2550 Assert(vacrel->indname == NULL);
2551 vacrel->indname = pstrdup(RelationGetRelationName(indrel));
2552 update_vacuum_error_info(vacrel, &saved_err_info,
2553 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
2554 InvalidBlockNumber, InvalidOffsetNumber);
2556 /* Do bulk deletion */
2557 istat = index_bulk_delete(&ivinfo, istat, lazy_tid_reaped,
2558 (void *) vacrel->dead_tuples);
2560 ereport(elevel,
2561 (errmsg("scanned index \"%s\" to remove %d row versions",
2562 vacrel->indname, vacrel->dead_tuples->num_tuples),
2563 errdetail_internal("%s", pg_rusage_show(&ru0))));
2565 /* Revert to the previous phase information for error traceback */
2566 restore_vacuum_error_info(vacrel, &saved_err_info);
2567 pfree(vacrel->indname);
2568 vacrel->indname = NULL;
2570 return istat;
2574 * lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
2576 * reltuples is the number of heap tuples and estimated_count is true
2577 * if reltuples is an estimated value.
2579 * Returns bulk delete stats derived from input stats
2581 static IndexBulkDeleteResult *
2582 lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
2583 double reltuples, bool estimated_count,
2584 LVRelState *vacrel)
2586 IndexVacuumInfo ivinfo;
2587 PGRUsage ru0;
2588 LVSavedErrInfo saved_err_info;
2590 pg_rusage_init(&ru0);
2592 ivinfo.index = indrel;
2593 ivinfo.analyze_only = false;
2594 ivinfo.report_progress = false;
2595 ivinfo.estimated_count = estimated_count;
2596 ivinfo.message_level = elevel;
2598 ivinfo.num_heap_tuples = reltuples;
2599 ivinfo.strategy = vacrel->bstrategy;
2602 * Update error traceback information.
2604 * The index name is saved during this phase and restored immediately
2605 * after this phase. See vacuum_error_callback.
2607 Assert(vacrel->indname == NULL);
2608 vacrel->indname = pstrdup(RelationGetRelationName(indrel));
2609 update_vacuum_error_info(vacrel, &saved_err_info,
2610 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
2611 InvalidBlockNumber, InvalidOffsetNumber);
2613 istat = index_vacuum_cleanup(&ivinfo, istat);
2615 if (istat)
2617 ereport(elevel,
2618 (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
2619 RelationGetRelationName(indrel),
2620 (istat)->num_index_tuples,
2621 (istat)->num_pages),
2622 errdetail("%.0f index row versions were removed.\n"
2623 "%u index pages were newly deleted.\n"
2624 "%u index pages are currently deleted, of which %u are currently reusable.\n"
2625 "%s.",
2626 (istat)->tuples_removed,
2627 (istat)->pages_newly_deleted,
2628 (istat)->pages_deleted, (istat)->pages_free,
2629 pg_rusage_show(&ru0))));
2632 /* Revert to the previous phase information for error traceback */
2633 restore_vacuum_error_info(vacrel, &saved_err_info);
2634 pfree(vacrel->indname);
2635 vacrel->indname = NULL;
2637 return istat;
2641 * should_attempt_truncation - should we attempt to truncate the heap?
2643 * Don't even think about it unless we have a shot at releasing a goodly
2644 * number of pages. Otherwise, the time taken isn't worth it.
2646 * Also don't attempt it if we are doing early pruning/vacuuming, because a
2647 * scan which cannot find a truncated heap page cannot determine that the
2648 * snapshot is too old to read that page. We might be able to get away with
2649 * truncating all except one of the pages, setting its LSN to (at least) the
2650 * maximum of the truncated range if we also treated an index leaf tuple
2651 * pointing to a missing heap page as something to trigger the "snapshot too
2652 * old" error, but that seems fragile and seems like it deserves its own patch
2653 * if we consider it.
2655 * This is split out so that we can test whether truncation is going to be
2656 * called for before we actually do it. If you change the logic here, be
2657 * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
2659 static bool
2660 should_attempt_truncation(LVRelState *vacrel, VacuumParams *params)
2662 BlockNumber possibly_freeable;
2664 if (params->truncate == VACOPT_TERNARY_DISABLED)
2665 return false;
2667 possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
2668 if (possibly_freeable > 0 &&
2669 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
2670 possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION) &&
2671 old_snapshot_threshold < 0)
2672 return true;
2673 else
2674 return false;
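/*
 * Worked example (illustrative only): assuming REL_TRUNCATE_MINIMUM is 1000
 * pages and REL_TRUNCATE_FRACTION is 16, as defined earlier in this file, an
 * 8,000-page table becomes a truncation candidate once at least 500 trailing
 * pages (8000 / 16) are empty, whereas a 64,000-page table qualifies once
 * 1,000 trailing pages are empty, since the absolute minimum is reached
 * first.  In both cases old_snapshot_threshold must be disabled (< 0).
 */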
2678 * lazy_truncate_heap - try to truncate off any empty pages at the end
2680 static void
2681 lazy_truncate_heap(LVRelState *vacrel)
2683 BlockNumber old_rel_pages = vacrel->rel_pages;
2684 BlockNumber new_rel_pages;
2685 int lock_retry;
2687 /* Report that we are now truncating */
2688 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2689 PROGRESS_VACUUM_PHASE_TRUNCATE);
2692 * Loop until no more truncating can be done.
2696 PGRUsage ru0;
2698 pg_rusage_init(&ru0);
2701 * We need full exclusive lock on the relation in order to do
2702 * truncation. If we can't get it, give up rather than waiting --- we
2703 * don't want to block other backends, and we don't want to deadlock
2704 * (which is quite possible considering we already hold a lower-grade
2705 * lock).
2707 vacrel->lock_waiter_detected = false;
2708 lock_retry = 0;
2709 while (true)
2711 if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock))
2712 break;
2715 * Check for interrupts while trying to (re-)acquire the exclusive
2716 * lock.
2718 CHECK_FOR_INTERRUPTS();
2720 if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
2721 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
2724 * We failed to establish the lock in the specified number of
2725 * retries. This means we give up truncating.
2727 vacrel->lock_waiter_detected = true;
2728 ereport(elevel,
2729 (errmsg("\"%s\": stopping truncate due to conflicting lock request",
2730 vacrel->relname)));
2731 return;
2734 pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
2738 * Now that we have exclusive lock, look to see if the rel has grown
2739 * whilst we were vacuuming with non-exclusive lock. If so, give up;
2740 * the newly added pages presumably contain non-deletable tuples.
2742 new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
2743 if (new_rel_pages != old_rel_pages)
2746 * Note: we intentionally don't update vacrel->rel_pages with the
2747 * new rel size here. If we did, it would amount to assuming that
2748 * the new pages are empty, which is unlikely. Leaving the numbers
2749 * alone amounts to assuming that the new pages have the same
2750 * tuple density as existing ones, which is less unlikely.
2752 UnlockRelation(vacrel->rel, AccessExclusiveLock);
2753 return;
2757 * Scan backwards from the end to verify that the end pages actually
2758 * contain no tuples. This is *necessary*, not optional, because
2759 * other backends could have added tuples to these pages whilst we
2760 * were vacuuming.
2762 new_rel_pages = count_nondeletable_pages(vacrel);
2763 vacrel->blkno = new_rel_pages;
2765 if (new_rel_pages >= old_rel_pages)
2767 /* can't do anything after all */
2768 UnlockRelation(vacrel->rel, AccessExclusiveLock);
2769 return;
2773 * Okay to truncate.
2775 RelationTruncate(vacrel->rel, new_rel_pages);
2778 * We can release the exclusive lock as soon as we have truncated.
2779 * Other backends can't safely access the relation until they have
2780 * processed the smgr invalidation that smgrtruncate sent out ... but
2781 * that should happen as part of standard invalidation processing once
2782 * they acquire lock on the relation.
2784 UnlockRelation(vacrel->rel, AccessExclusiveLock);
2787 * Update statistics. Here, it *is* correct to adjust rel_pages
2788 * without also touching reltuples, since the tuple count wasn't
2789 * changed by the truncation.
2791 vacrel->pages_removed += old_rel_pages - new_rel_pages;
2792 vacrel->rel_pages = new_rel_pages;
2794 ereport(elevel,
2795 (errmsg("\"%s\": truncated %u to %u pages",
2796 vacrel->relname,
2797 old_rel_pages, new_rel_pages),
2798 errdetail_internal("%s",
2799 pg_rusage_show(&ru0))));
2800 old_rel_pages = new_rel_pages;
2801 } while (new_rel_pages > vacrel->nonempty_pages &&
2802 vacrel->lock_waiter_detected);
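/*
 * Worked example (illustrative only): assuming VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL
 * is 50ms and VACUUM_TRUNCATE_LOCK_TIMEOUT is 5000ms, as defined earlier in
 * this file, the retry loop above attempts ConditionalLockRelation() up to
 * 5000 / 50 = 100 times, i.e. it gives up on truncation after roughly five
 * seconds of waiting for the AccessExclusiveLock.
 */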
2806 * Rescan end pages to verify that they are (still) empty of tuples.
2808 * Returns number of nondeletable pages (last nonempty page + 1).
2810 static BlockNumber
2811 count_nondeletable_pages(LVRelState *vacrel)
2813 BlockNumber blkno;
2814 BlockNumber prefetchedUntil;
2815 instr_time starttime;
2817 /* Initialize the starttime if we check for conflicting lock requests */
2818 INSTR_TIME_SET_CURRENT(starttime);
2821 * Start checking blocks at what we believe relation end to be and move
2822 * backwards. (Strange coding of loop control is needed because blkno is
2823 * unsigned.) To make the scan faster, we prefetch a few blocks at a time
2824 * in forward direction, so that OS-level readahead can kick in.
2826 blkno = vacrel->rel_pages;
2827 StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
2828 "prefetch size must be power of 2");
2829 prefetchedUntil = InvalidBlockNumber;
2830 while (blkno > vacrel->nonempty_pages)
2832 Buffer buf;
2833 Page page;
2834 OffsetNumber offnum,
2835 maxoff;
2836 bool hastup;
2839 * Check if another process requests a lock on our relation. We are
2840 * holding an AccessExclusiveLock here, so they will be waiting. We
2841 * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
2842 * only check if that interval has elapsed once every 32 blocks to
2843 * keep the number of system calls and actual shared lock table
2844 * lookups to a minimum.
2846 if ((blkno % 32) == 0)
2848 instr_time currenttime;
2849 instr_time elapsed;
2851 INSTR_TIME_SET_CURRENT(currenttime);
2852 elapsed = currenttime;
2853 INSTR_TIME_SUBTRACT(elapsed, starttime);
2854 if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
2855 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
2857 if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
2859 ereport(elevel,
2860 (errmsg("\"%s\": suspending truncate due to conflicting lock request",
2861 vacrel->relname)));
2863 vacrel->lock_waiter_detected = true;
2864 return blkno;
2866 starttime = currenttime;
2871 * We don't insert a vacuum delay point here, because we have an
2872 * exclusive lock on the table which we want to hold for as short a
2873 * time as possible. We still need to check for interrupts however.
2875 CHECK_FOR_INTERRUPTS();
2877 blkno--;
2879 /* If we haven't prefetched this lot yet, do so now. */
2880 if (prefetchedUntil > blkno)
2882 BlockNumber prefetchStart;
2883 BlockNumber pblkno;
2885 prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
2886 for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
2888 PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
2889 CHECK_FOR_INTERRUPTS();
2891 prefetchedUntil = prefetchStart;
2894 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
2895 vacrel->bstrategy);
2897 /* In this phase we only need shared access to the buffer */
2898 LockBuffer(buf, BUFFER_LOCK_SHARE);
2900 page = BufferGetPage(buf);
2902 if (PageIsNew(page) || PageIsEmpty(page))
2904 UnlockReleaseBuffer(buf);
2905 continue;
2908 hastup = false;
2909 maxoff = PageGetMaxOffsetNumber(page);
2910 for (offnum = FirstOffsetNumber;
2911 offnum <= maxoff;
2912 offnum = OffsetNumberNext(offnum))
2914 ItemId itemid;
2916 itemid = PageGetItemId(page, offnum);
2919 * Note: any non-unused item should be taken as a reason to keep
2920 * this page. We formerly thought that DEAD tuples could be
2921 * thrown away, but that's not so, because we'd not have cleaned
2922 * out their index entries.
2924 if (ItemIdIsUsed(itemid))
2926 hastup = true;
2927 break; /* can stop scanning */
2929 } /* scan along page */
2931 UnlockReleaseBuffer(buf);
2933 /* Done scanning if we found a tuple here */
2934 if (hastup)
2935 return blkno + 1;
2939 * If we fall out of the loop, all the previously-thought-to-be-empty
2940 * pages still are; we need not bother to look at the last known-nonempty
2941 * page.
2943 return vacrel->nonempty_pages;
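/*
 * Worked example (illustrative only): assuming PREFETCH_SIZE is 32, the mask
 * trick above rounds the current block down to the nearest multiple of 32,
 * e.g. blkno = 1000 gives prefetchStart = 1000 & ~31 = 992, so blocks
 * 992..1000 are prefetched in forward order even though the scan itself walks
 * backwards; no further prefetch is issued until blkno drops below 992.
 */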
2947 * Return the maximum number of dead tuples we can record.
2949 static long
2950 compute_max_dead_tuples(BlockNumber relblocks, bool useindex)
2952 long maxtuples;
2953 int vac_work_mem = IsAutoVacuumWorkerProcess() &&
2954 autovacuum_work_mem != -1 ?
2955 autovacuum_work_mem : maintenance_work_mem;
2957 if (useindex)
2959 maxtuples = MAXDEADTUPLES(vac_work_mem * 1024L);
2960 maxtuples = Min(maxtuples, INT_MAX);
2961 maxtuples = Min(maxtuples, MAXDEADTUPLES(MaxAllocSize));
2963 /* curious coding here to ensure the multiplication can't overflow */
2964 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
2965 maxtuples = relblocks * LAZY_ALLOC_TUPLES;
2967 /* stay sane if small maintenance_work_mem */
2968 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
2970 else
2971 maxtuples = MaxHeapTuplesPerPage;
2973 return maxtuples;
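/*
 * Worked example (illustrative only), assuming 6-byte ItemPointerData entries
 * and 8kB blocks (MaxHeapTuplesPerPage = 291): with maintenance_work_mem set
 * to 64MB the TID array could hold roughly 64 * 1024 * 1024 / 6 ~= 11.1
 * million dead tuples, but a 1,000-block table clamps that to about
 * 1000 * LAZY_ALLOC_TUPLES = 291,000 entries, while a tiny memory setting is
 * still floored at MaxHeapTuplesPerPage so one page's worth always fits.
 */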
2977 * lazy_space_alloc - space allocation decisions for lazy vacuum
2979 * See the comments at the head of this file for rationale.
2981 static void
2982 lazy_space_alloc(LVRelState *vacrel, int nworkers, BlockNumber nblocks)
2984 LVDeadTuples *dead_tuples;
2985 long maxtuples;
2988 * Initialize state for a parallel vacuum. As of now, only one worker can
2989 * be used for an index, so we invoke parallelism only if there are at
2990 * least two indexes on a table.
2992 if (nworkers >= 0 && vacrel->nindexes > 1)
2995 * Since parallel workers cannot access data in temporary tables, we
2996 * can't perform parallel vacuum on them.
2998 if (RelationUsesLocalBuffers(vacrel->rel))
3001 * Give warning only if the user explicitly tries to perform a
3002 * parallel vacuum on the temporary table.
3004 if (nworkers > 0)
3005 ereport(WARNING,
3006 (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
3007 vacrel->relname)));
3009 else
3010 vacrel->lps = begin_parallel_vacuum(vacrel, nblocks, nworkers);
3012 /* If parallel mode started, we're done */
3013 if (ParallelVacuumIsActive(vacrel))
3014 return;
3017 maxtuples = compute_max_dead_tuples(nblocks, vacrel->nindexes > 0);
3019 dead_tuples = (LVDeadTuples *) palloc(SizeOfDeadTuples(maxtuples));
3020 dead_tuples->num_tuples = 0;
3021 dead_tuples->max_tuples = (int) maxtuples;
3023 vacrel->dead_tuples = dead_tuples;
3027 * lazy_space_free - free space allocated in lazy_space_alloc
3029 static void
3030 lazy_space_free(LVRelState *vacrel)
3032 if (!ParallelVacuumIsActive(vacrel))
3033 return;
3036 * End parallel mode before updating index statistics as we cannot write
3037 * during parallel mode.
3039 end_parallel_vacuum(vacrel);
3043 * lazy_record_dead_tuple - remember one deletable tuple
3045 static void
3046 lazy_record_dead_tuple(LVDeadTuples *dead_tuples, ItemPointer itemptr)
3049 * The array shouldn't overflow under normal behavior, but perhaps it
3050 * could if we are given a really small maintenance_work_mem. In that
3051 * case, just forget the last few tuples (we'll get 'em next time).
3053 if (dead_tuples->num_tuples < dead_tuples->max_tuples)
3055 dead_tuples->itemptrs[dead_tuples->num_tuples] = *itemptr;
3056 dead_tuples->num_tuples++;
3057 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
3058 dead_tuples->num_tuples);
3063 * lazy_tid_reaped() -- is a particular tid deletable?
3065 * This has the right signature to be an IndexBulkDeleteCallback.
3067 * Assumes dead_tuples array is in sorted order.
3069 static bool
3070 lazy_tid_reaped(ItemPointer itemptr, void *state)
3072 LVDeadTuples *dead_tuples = (LVDeadTuples *) state;
3073 int64 litem,
3074 ritem,
3075 item;
3076 ItemPointer res;
3078 litem = itemptr_encode(&dead_tuples->itemptrs[0]);
3079 ritem = itemptr_encode(&dead_tuples->itemptrs[dead_tuples->num_tuples - 1]);
3080 item = itemptr_encode(itemptr);
3083 * Doing a simple bound check before bsearch() is useful to avoid the
3084 * extra cost of bsearch(), especially if dead tuples on the heap are
3085 * concentrated in a certain range. Since this function is called for
3086 * every index tuple, it pays to be really fast.
3088 if (item < litem || item > ritem)
3089 return false;
3091 res = (ItemPointer) bsearch((void *) itemptr,
3092 (void *) dead_tuples->itemptrs,
3093 dead_tuples->num_tuples,
3094 sizeof(ItemPointerData),
3095 vac_cmp_itemptr);
3097 return (res != NULL);
3101 * Comparator routines for use with qsort() and bsearch().
3103 static int
3104 vac_cmp_itemptr(const void *left, const void *right)
3106 BlockNumber lblk,
3107 rblk;
3108 OffsetNumber loff,
3109 roff;
3111 lblk = ItemPointerGetBlockNumber((ItemPointer) left);
3112 rblk = ItemPointerGetBlockNumber((ItemPointer) right);
3114 if (lblk < rblk)
3115 return -1;
3116 if (lblk > rblk)
3117 return 1;
3119 loff = ItemPointerGetOffsetNumber((ItemPointer) left);
3120 roff = ItemPointerGetOffsetNumber((ItemPointer) right);
3122 if (loff < roff)
3123 return -1;
3124 if (loff > roff)
3125 return 1;
3127 return 0;
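/*
 * Illustrative sketch (not part of the build): how the comparator above pairs
 * with qsort()/bsearch().  Lazy vacuum never needs the qsort() step because
 * TIDs are collected in heap order during the forward scan, which is already
 * (block, offset) order; it only performs the bsearch() probe, as in
 * lazy_tid_reaped().  The helper and variable names are hypothetical.
 */
static bool
example_tid_lookup(ItemPointerData *tids, int ntids, ItemPointer probe)
{
	qsort(tids, ntids, sizeof(ItemPointerData), vac_cmp_itemptr);

	return bsearch(probe, tids, ntids, sizeof(ItemPointerData),
				   vac_cmp_itemptr) != NULL;
}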
3131 * Check if every tuple in the given page is visible to all current and future
3132 * transactions. Also return the visibility_cutoff_xid which is the highest
3133 * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
3134 * on this page is frozen.
3136 static bool
3137 heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
3138 TransactionId *visibility_cutoff_xid,
3139 bool *all_frozen)
3141 Page page = BufferGetPage(buf);
3142 BlockNumber blockno = BufferGetBlockNumber(buf);
3143 OffsetNumber offnum,
3144 maxoff;
3145 bool all_visible = true;
3147 *visibility_cutoff_xid = InvalidTransactionId;
3148 *all_frozen = true;
3151 * This is a stripped down version of the line pointer scan in
3152 * lazy_scan_heap(). So if you change anything here, also check that code.
3154 maxoff = PageGetMaxOffsetNumber(page);
3155 for (offnum = FirstOffsetNumber;
3156 offnum <= maxoff && all_visible;
3157 offnum = OffsetNumberNext(offnum))
3159 ItemId itemid;
3160 HeapTupleData tuple;
3163 * Set the offset number so that we can display it along with any
3164 * error that occurred while processing this tuple.
3166 vacrel->offnum = offnum;
3167 itemid = PageGetItemId(page, offnum);
3169 /* Unused or redirect line pointers are of no interest */
3170 if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
3171 continue;
3173 ItemPointerSet(&(tuple.t_self), blockno, offnum);
3176 * Dead line pointers can have index pointers pointing to them. So
3177 * they can't be treated as visible
3179 if (ItemIdIsDead(itemid))
3181 all_visible = false;
3182 *all_frozen = false;
3183 break;
3186 Assert(ItemIdIsNormal(itemid));
3188 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
3189 tuple.t_len = ItemIdGetLength(itemid);
3190 tuple.t_tableOid = RelationGetRelid(vacrel->rel);
3192 switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
3194 case HEAPTUPLE_LIVE:
3196 TransactionId xmin;
3198 /* Check comments in lazy_scan_heap. */
3199 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
3201 all_visible = false;
3202 *all_frozen = false;
3203 break;
3207 * The inserter definitely committed. But is it old enough
3208 * that everyone sees it as committed?
3210 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
3211 if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
3213 all_visible = false;
3214 *all_frozen = false;
3215 break;
3218 /* Track newest xmin on page. */
3219 if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
3220 *visibility_cutoff_xid = xmin;
3222 /* Check whether this tuple is already frozen or not */
3223 if (all_visible && *all_frozen &&
3224 heap_tuple_needs_eventual_freeze(tuple.t_data))
3225 *all_frozen = false;
3227 break;
3229 case HEAPTUPLE_DEAD:
3230 case HEAPTUPLE_RECENTLY_DEAD:
3231 case HEAPTUPLE_INSERT_IN_PROGRESS:
3232 case HEAPTUPLE_DELETE_IN_PROGRESS:
3234 all_visible = false;
3235 *all_frozen = false;
3236 break;
3238 default:
3239 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
3240 break;
3242 } /* scan along page */
3244 /* Clear the offset information once we have processed the given page. */
3245 vacrel->offnum = InvalidOffsetNumber;
3247 return all_visible;
3251 * Compute the number of parallel worker processes to request. Both index
3252 * vacuum and index cleanup can be executed with parallel workers. The index
3253 * is eligible for parallel vacuum iff its size is greater than
3254 * min_parallel_index_scan_size as invoking workers for very small indexes
3255 * can hurt performance.
3257 * nrequested is the number of parallel workers that the user requested. If
3258 * nrequested is 0, we compute the parallel degree based on nindexes, that is
3259 * the number of indexes that support parallel vacuum. This function also
3260 * sets can_parallel_vacuum to remember indexes that participate in parallel
3261 * vacuum.
3263 static int
3264 compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested,
3265 bool *can_parallel_vacuum)
3267 int nindexes_parallel = 0;
3268 int nindexes_parallel_bulkdel = 0;
3269 int nindexes_parallel_cleanup = 0;
3270 int parallel_workers;
3273 * We don't allow performing a parallel operation in a standalone backend or
3274 * when parallelism is disabled.
3276 if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0)
3277 return 0;
3280 * Compute the number of indexes that can participate in parallel vacuum.
3282 for (int idx = 0; idx < vacrel->nindexes; idx++)
3284 Relation indrel = vacrel->indrels[idx];
3285 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3287 if (vacoptions == VACUUM_OPTION_NO_PARALLEL ||
3288 RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size)
3289 continue;
3291 can_parallel_vacuum[idx] = true;
3293 if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3294 nindexes_parallel_bulkdel++;
3295 if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) ||
3296 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
3297 nindexes_parallel_cleanup++;
3300 nindexes_parallel = Max(nindexes_parallel_bulkdel,
3301 nindexes_parallel_cleanup);
3303 /* The leader process takes one index */
3304 nindexes_parallel--;
3306 /* No index supports parallel vacuum */
3307 if (nindexes_parallel <= 0)
3308 return 0;
3310 /* Compute the parallel degree */
3311 parallel_workers = (nrequested > 0) ?
3312 Min(nrequested, nindexes_parallel) : nindexes_parallel;
3314 /* Cap by max_parallel_maintenance_workers */
3315 parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers);
3317 return parallel_workers;
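/*
 * Worked example (illustrative only): a table with three indexes that all
 * exceed min_parallel_index_scan_size, where three support parallel
 * bulkdelete and two support parallel cleanup, yields
 * nindexes_parallel = Max(3, 2) - 1 = 2 after the leader takes one index.
 * With nrequested = 0 and the default max_parallel_maintenance_workers of 2,
 * two workers are requested; a PARALLEL 1 option would cap that at one.
 */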
3321 * Update index statistics in pg_class if the statistics are accurate.
3323 static void
3324 update_index_statistics(LVRelState *vacrel)
3326 Relation *indrels = vacrel->indrels;
3327 int nindexes = vacrel->nindexes;
3328 IndexBulkDeleteResult **indstats = vacrel->indstats;
3330 Assert(!IsInParallelMode());
3332 for (int idx = 0; idx < nindexes; idx++)
3334 Relation indrel = indrels[idx];
3335 IndexBulkDeleteResult *istat = indstats[idx];
3337 if (istat == NULL || istat->estimated_count)
3338 continue;
3340 /* Update index statistics */
3341 vac_update_relstats(indrel,
3342 istat->num_pages,
3343 istat->num_index_tuples,
3344 0,
3345 false,
3346 InvalidTransactionId,
3347 InvalidMultiXactId,
3348 false);
3353 * This function prepares and returns parallel vacuum state if we can launch
3354 * even one worker. This function is responsible for entering parallel mode,
3355 * creating a parallel context, and then initializing the DSM segment.
3357 static LVParallelState *
3358 begin_parallel_vacuum(LVRelState *vacrel, BlockNumber nblocks,
3359 int nrequested)
3361 LVParallelState *lps = NULL;
3362 Relation *indrels = vacrel->indrels;
3363 int nindexes = vacrel->nindexes;
3364 ParallelContext *pcxt;
3365 LVShared *shared;
3366 LVDeadTuples *dead_tuples;
3367 BufferUsage *buffer_usage;
3368 WalUsage *wal_usage;
3369 bool *can_parallel_vacuum;
3370 long maxtuples;
3371 Size est_shared;
3372 Size est_deadtuples;
3373 int nindexes_mwm = 0;
3374 int parallel_workers = 0;
3375 int querylen;
3378 * A parallel vacuum must be requested and there must be indexes on the
3379 * relation
3381 Assert(nrequested >= 0);
3382 Assert(nindexes > 0);
3385 * Compute the number of parallel vacuum workers to launch
3387 can_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes);
3388 parallel_workers = compute_parallel_vacuum_workers(vacrel,
3389 nrequested,
3390 can_parallel_vacuum);
3392 /* Can't perform vacuum in parallel */
3393 if (parallel_workers <= 0)
3395 pfree(can_parallel_vacuum);
3396 return lps;
3399 lps = (LVParallelState *) palloc0(sizeof(LVParallelState));
3401 EnterParallelMode();
3402 pcxt = CreateParallelContext("postgres", "parallel_vacuum_main",
3403 parallel_workers);
3404 Assert(pcxt->nworkers > 0);
3405 lps->pcxt = pcxt;
3407 /* Estimate size for shared information -- PARALLEL_VACUUM_KEY_SHARED */
3408 est_shared = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3409 for (int idx = 0; idx < nindexes; idx++)
3411 Relation indrel = indrels[idx];
3412 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3415 * The cleanup option should be either disabled, always performed in
3416 * parallel, or conditionally performed in parallel.
3418 Assert(((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) ||
3419 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0));
3420 Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE);
3422 /* Skip indexes that don't participate in parallel vacuum */
3423 if (!can_parallel_vacuum[idx])
3424 continue;
3426 if (indrel->rd_indam->amusemaintenanceworkmem)
3427 nindexes_mwm++;
3429 est_shared = add_size(est_shared, sizeof(LVSharedIndStats));
3432 * Remember the number of indexes that support parallel operation for
3433 * each phase.
3435 if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3436 lps->nindexes_parallel_bulkdel++;
3437 if ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0)
3438 lps->nindexes_parallel_cleanup++;
3439 if ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)
3440 lps->nindexes_parallel_condcleanup++;
3442 shm_toc_estimate_chunk(&pcxt->estimator, est_shared);
3443 shm_toc_estimate_keys(&pcxt->estimator, 1);
3445 /* Estimate size for dead tuples -- PARALLEL_VACUUM_KEY_DEAD_TUPLES */
3446 maxtuples = compute_max_dead_tuples(nblocks, true);
3447 est_deadtuples = MAXALIGN(SizeOfDeadTuples(maxtuples));
3448 shm_toc_estimate_chunk(&pcxt->estimator, est_deadtuples);
3449 shm_toc_estimate_keys(&pcxt->estimator, 1);
3452 * Estimate space for BufferUsage and WalUsage --
3453 * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
3455 * If there are no extensions loaded that care, we could skip this. We
3456 * have no way of knowing whether anyone's looking at pgBufferUsage or
3457 * pgWalUsage, so do it unconditionally.
3459 shm_toc_estimate_chunk(&pcxt->estimator,
3460 mul_size(sizeof(BufferUsage), pcxt->nworkers));
3461 shm_toc_estimate_keys(&pcxt->estimator, 1);
3462 shm_toc_estimate_chunk(&pcxt->estimator,
3463 mul_size(sizeof(WalUsage), pcxt->nworkers));
3464 shm_toc_estimate_keys(&pcxt->estimator, 1);
3466 /* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
3467 if (debug_query_string)
3469 querylen = strlen(debug_query_string);
3470 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
3471 shm_toc_estimate_keys(&pcxt->estimator, 1);
3473 else
3474 querylen = 0; /* keep compiler quiet */
3476 InitializeParallelDSM(pcxt);
3478 /* Prepare shared information */
3479 shared = (LVShared *) shm_toc_allocate(pcxt->toc, est_shared);
3480 MemSet(shared, 0, est_shared);
3481 shared->relid = RelationGetRelid(vacrel->rel);
3482 shared->elevel = elevel;
3483 shared->maintenance_work_mem_worker =
3484 (nindexes_mwm > 0) ?
3485 maintenance_work_mem / Min(parallel_workers, nindexes_mwm) :
3486 maintenance_work_mem;
3488 pg_atomic_init_u32(&(shared->cost_balance), 0);
3489 pg_atomic_init_u32(&(shared->active_nworkers), 0);
3490 pg_atomic_init_u32(&(shared->idx), 0);
3491 shared->offset = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3494 * Initialize variables for shared index statistics, set NULL bitmap and
3495 * the size of stats for each index.
3497 memset(shared->bitmap, 0x00, BITMAPLEN(nindexes));
3498 for (int idx = 0; idx < nindexes; idx++)
3500 if (!can_parallel_vacuum[idx])
3501 continue;
3503 /* Set the bit to mark the slot as not NULL, since this index supports parallelism */
3504 shared->bitmap[idx >> 3] |= 1 << (idx & 0x07);
3507 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
3508 lps->lvshared = shared;
3510 /* Prepare the dead tuple space */
3511 dead_tuples = (LVDeadTuples *) shm_toc_allocate(pcxt->toc, est_deadtuples);
3512 dead_tuples->max_tuples = maxtuples;
3513 dead_tuples->num_tuples = 0;
3514 MemSet(dead_tuples->itemptrs, 0, sizeof(ItemPointerData) * maxtuples);
3515 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_TUPLES, dead_tuples);
3516 vacrel->dead_tuples = dead_tuples;
3519 * Allocate space for each worker's BufferUsage and WalUsage; no need to
3520 * initialize
3522 buffer_usage = shm_toc_allocate(pcxt->toc,
3523 mul_size(sizeof(BufferUsage), pcxt->nworkers));
3524 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
3525 lps->buffer_usage = buffer_usage;
3526 wal_usage = shm_toc_allocate(pcxt->toc,
3527 mul_size(sizeof(WalUsage), pcxt->nworkers));
3528 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
3529 lps->wal_usage = wal_usage;
3531 /* Store query string for workers */
3532 if (debug_query_string)
3533 {
3534 char *sharedquery;
3536 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
3537 memcpy(sharedquery, debug_query_string, querylen + 1);
3538 sharedquery[querylen] = '\0';
3539 shm_toc_insert(pcxt->toc,
3540 PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery);
3541 }
3543 pfree(can_parallel_vacuum);
3544 return lps;
3545 }
3547 /*
3548 * Destroy the parallel context, and end parallel mode.
3549 *
3550 * Since writes are not allowed during parallel mode, copy the
3551 * updated index statistics from DSM into local memory and then later use that
3552 * to update the index statistics. One might think that we can exit from
3553 * parallel mode, update the index statistics and then destroy parallel
3554 * context, but that won't be safe (see ExitParallelMode).
3555 */
3556 static void
3557 end_parallel_vacuum(LVRelState *vacrel)
3558 {
3559 IndexBulkDeleteResult **indstats = vacrel->indstats;
3560 LVParallelState *lps = vacrel->lps;
3561 int nindexes = vacrel->nindexes;
3563 Assert(!IsParallelWorker());
3565 /* Copy the updated statistics */
3566 for (int idx = 0; idx < nindexes; idx++)
3567 {
3568 LVSharedIndStats *shared_istat;
3570 shared_istat = parallel_stats_for_idx(lps->lvshared, idx);
3572 /*
3573 * Skip unused slot. The statistics of this index are already stored
3574 * in local memory.
3575 */
3576 if (shared_istat == NULL)
3577 continue;
3579 if (shared_istat->updated)
3580 {
3581 indstats[idx] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
3582 memcpy(indstats[idx], &(shared_istat->istat), sizeof(IndexBulkDeleteResult));
3583 }
3584 else
3585 indstats[idx] = NULL;
3586 }
3588 DestroyParallelContext(lps->pcxt);
3589 ExitParallelMode();
3591 /* Deactivate parallel vacuum */
3592 pfree(lps);
3593 vacrel->lps = NULL;
3594 }
3596 /*
3597 * Return shared memory statistics for index at offset 'getidx', if any
3598 */
3599 static LVSharedIndStats *
3600 parallel_stats_for_idx(LVShared *lvshared, int getidx)
3601 {
3602 char *p;
3604 if (IndStatsIsNull(lvshared, getidx))
3605 return NULL;
3607 p = (char *) GetSharedIndStats(lvshared);
3608 for (int idx = 0; idx < getidx; idx++)
3609 {
3610 if (IndStatsIsNull(lvshared, idx))
3611 continue;
3613 p += sizeof(LVSharedIndStats);
3614 }
3616 return (LVSharedIndStats *) p;
3617 }
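/*
 * Stats slots are laid out back to back after the bitmap, and only for
 * indexes whose bit is set, so the slot for 'getidx' is located by counting
 * the non-NULL slots that precede it.  Illustrative example: with four
 * indexes and bits set for indexes 0, 2 and 3, the slot for index 3 is at
 * GetSharedIndStats(lvshared) + 2 * sizeof(LVSharedIndStats).
 */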
3619 /*
3620 * Returns false if the given index can't participate in parallel index
3621 * vacuum or parallel index cleanup.
3622 */
3623 static bool
3624 parallel_processing_is_safe(Relation indrel, LVShared *lvshared)
3625 {
3626 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3628 /* first_time must be true only if for_cleanup is true */
3629 Assert(lvshared->for_cleanup || !lvshared->first_time);
3631 if (lvshared->for_cleanup)
3632 {
3633 /* Skip, if the index does not support parallel cleanup */
3634 if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) &&
3635 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0))
3636 return false;
3638 /*
3639 * Skip, if the index supports parallel cleanup conditionally, but we
3640 * have already processed the index (for bulkdelete). See the
3641 * comments for option VACUUM_OPTION_PARALLEL_COND_CLEANUP to know
3642 * when indexes support parallel cleanup conditionally.
3643 */
3644 if (!lvshared->first_time &&
3645 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
3646 return false;
3647 }
3648 else if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) == 0)
3649 {
3650 /* Skip if the index does not support parallel bulk deletion */
3651 return false;
3652 }
3654 return true;
3655 }
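/*
 * Summary of the checks above: a worker may bulk-delete an index only if its
 * AM advertises VACUUM_OPTION_PARALLEL_BULKDEL; it may clean up an index if
 * the AM advertises VACUUM_OPTION_PARALLEL_CLEANUP, or
 * VACUUM_OPTION_PARALLEL_COND_CLEANUP when no bulk-deletion pass has been
 * performed yet (lvshared->first_time).
 */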
3657 /*
3658 * Perform work within a launched parallel process.
3659 *
3660 * Since parallel vacuum workers perform only index vacuum or index cleanup,
3661 * we don't need to report progress information.
3662 */
3663 void
3664 parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
3665 {
3666 Relation rel;
3667 Relation *indrels;
3668 LVShared *lvshared;
3669 LVDeadTuples *dead_tuples;
3670 BufferUsage *buffer_usage;
3671 WalUsage *wal_usage;
3672 int nindexes;
3673 char *sharedquery;
3674 LVRelState vacrel;
3675 ErrorContextCallback errcallback;
3677 lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
3678 false);
3679 elevel = lvshared->elevel;
3681 if (lvshared->for_cleanup)
3682 elog(DEBUG1, "starting parallel vacuum worker for cleanup");
3683 else
3684 elog(DEBUG1, "starting parallel vacuum worker for bulk delete");
3686 /* Set debug_query_string for individual workers */
3687 sharedquery = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_QUERY_TEXT, true);
3688 debug_query_string = sharedquery;
3689 pgstat_report_activity(STATE_RUNNING, debug_query_string);
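/*
 * Reporting the leader's query string makes this worker's pg_stat_activity
 * entry show the same query as the leader backend.
 */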
3691 /*
3692 * Open the table. We use the same lock mode as the leader process;
3693 * that's okay because this lock mode does not conflict among the
3694 * parallel workers.
3695 */
3696 rel = table_open(lvshared->relid, ShareUpdateExclusiveLock);
3698 /*
3699 * Open all indexes. indrels are sorted in order by OID, which should
3700 * match the leader's ordering.
3701 */
3702 vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels);
3703 Assert(nindexes > 0);
3705 /* Set dead tuple space */
3706 dead_tuples = (LVDeadTuples *) shm_toc_lookup(toc,
3707 PARALLEL_VACUUM_KEY_DEAD_TUPLES,
3708 false);
3710 /* Set cost-based vacuum delay */
3711 VacuumCostActive = (VacuumCostDelay > 0);
3712 VacuumCostBalance = 0;
3713 VacuumPageHit = 0;
3714 VacuumPageMiss = 0;
3715 VacuumPageDirty = 0;
3716 VacuumCostBalanceLocal = 0;
3717 VacuumSharedCostBalance = &(lvshared->cost_balance);
3718 VacuumActiveNWorkers = &(lvshared->active_nworkers);
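/*
 * Pointing VacuumSharedCostBalance and VacuumActiveNWorkers at the atomics
 * in shared memory makes the cost-based delay accounting collective: the
 * leader and all workers draw on one balance, so the cost limit throttles
 * the vacuum as a whole rather than each process independently.
 */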
3720 vacrel.rel = rel;
3721 vacrel.indrels = indrels;
3722 vacrel.nindexes = nindexes;
3723 vacrel.indstats = (IndexBulkDeleteResult **)
3724 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
3726 if (lvshared->maintenance_work_mem_worker > 0)
3727 maintenance_work_mem = lvshared->maintenance_work_mem_worker;
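/*
 * Override the worker's maintenance_work_mem with the per-worker share the
 * leader computed, so index AMs running in this worker see the reduced
 * budget.
 */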
3729 /*
3730 * Initialize vacrel for use as error callback arg by parallel worker.
3731 */
3732 vacrel.relnamespace = get_namespace_name(RelationGetNamespace(rel));
3733 vacrel.relname = pstrdup(RelationGetRelationName(rel));
3734 vacrel.indname = NULL;
3735 vacrel.phase = VACUUM_ERRCB_PHASE_UNKNOWN; /* Not yet processing */
3736 vacrel.dead_tuples = dead_tuples;
3738 /* Setup error traceback support for ereport() */
3739 errcallback.callback = vacuum_error_callback;
3740 errcallback.arg = &vacrel;
3741 errcallback.previous = error_context_stack;
3742 error_context_stack = &errcallback;
3744 /* Prepare to track buffer usage during parallel execution */
3745 InstrStartParallelQuery();
3747 /* Process indexes to perform vacuum/cleanup */
3748 do_parallel_processing(&vacrel, lvshared);
3750 /* Report buffer/WAL usage during parallel execution */
3751 buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
3752 wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
3753 InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
3754 &wal_usage[ParallelWorkerNumber]);
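/*
 * The worker's buffer and WAL counters are written to its
 * ParallelWorkerNumber slot so that the leader can add them to its own
 * totals once the workers have exited.
 */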
3756 /* Pop the error context stack */
3757 error_context_stack = errcallback.previous;
3759 vac_close_indexes(nindexes, indrels, RowExclusiveLock);
3760 table_close(rel, ShareUpdateExclusiveLock);
3761 pfree(vacrel.indstats);
3762 }
3764 /*
3765 * Error context callback for errors occurring during vacuum.
3766 */
3767 static void
3768 vacuum_error_callback(void *arg)
3769 {
3770 LVRelState *errinfo = arg;
3772 switch (errinfo->phase)
3773 {
3774 case VACUUM_ERRCB_PHASE_SCAN_HEAP:
3775 if (BlockNumberIsValid(errinfo->blkno))
3776 {
3777 if (OffsetNumberIsValid(errinfo->offnum))
3778 errcontext("while scanning block %u and offset %u of relation \"%s.%s\"",
3779 errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
3780 else
3781 errcontext("while scanning block %u of relation \"%s.%s\"",
3782 errinfo->blkno, errinfo->relnamespace, errinfo->relname);
3783 }
3784 else
3785 errcontext("while scanning relation \"%s.%s\"",
3786 errinfo->relnamespace, errinfo->relname);
3787 break;
3789 case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
3790 if (BlockNumberIsValid(errinfo->blkno))
3791 {
3792 if (OffsetNumberIsValid(errinfo->offnum))
3793 errcontext("while vacuuming block %u and offset %u of relation \"%s.%s\"",
3794 errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
3795 else
3796 errcontext("while vacuuming block %u of relation \"%s.%s\"",
3797 errinfo->blkno, errinfo->relnamespace, errinfo->relname);
3798 }
3799 else
3800 errcontext("while vacuuming relation \"%s.%s\"",
3801 errinfo->relnamespace, errinfo->relname);
3802 break;
3804 case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
3805 errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
3806 errinfo->indname, errinfo->relnamespace, errinfo->relname);
3807 break;
3809 case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
3810 errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
3811 errinfo->indname, errinfo->relnamespace, errinfo->relname);
3812 break;
3814 case VACUUM_ERRCB_PHASE_TRUNCATE:
3815 if (BlockNumberIsValid(errinfo->blkno))
3816 errcontext("while truncating relation \"%s.%s\" to %u blocks",
3817 errinfo->relnamespace, errinfo->relname, errinfo->blkno);
3818 break;
3820 case VACUUM_ERRCB_PHASE_UNKNOWN:
3821 default:
3822 return; /* do nothing; the errinfo may not be
3823 * initialized */
3824 }
3825 }
3827 /*
3828 * Updates the information required for the vacuum error callback. This also
3829 * saves the current information, which can later be restored via restore_vacuum_error_info.
3830 */
3831 static void
3832 update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel,
3833 int phase, BlockNumber blkno, OffsetNumber offnum)
3834 {
3835 if (saved_vacrel)
3836 {
3837 saved_vacrel->offnum = vacrel->offnum;
3838 saved_vacrel->blkno = vacrel->blkno;
3839 saved_vacrel->phase = vacrel->phase;
3840 }
3842 vacrel->blkno = blkno;
3843 vacrel->offnum = offnum;
3844 vacrel->phase = phase;
3845 }
3847 /*
3848 * Restores the vacuum information saved via a prior call to update_vacuum_error_info.
3849 */
3850 static void
3851 restore_vacuum_error_info(LVRelState *vacrel,
3852 const LVSavedErrInfo *saved_vacrel)
3853 {
3854 vacrel->blkno = saved_vacrel->blkno;
3855 vacrel->offnum = saved_vacrel->offnum;
3856 vacrel->phase = saved_vacrel->phase;
3857 }
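/*
 * Callers of the two helpers above typically bracket a phase change like
 * this (illustrative sketch, not a verbatim call site from this file):
 *
 *     LVSavedErrInfo saved_err_info;
 *
 *     update_vacuum_error_info(vacrel, &saved_err_info,
 *                              VACUUM_ERRCB_PHASE_VACUUM_INDEX,
 *                              InvalidBlockNumber, InvalidOffsetNumber);
 *     ... index vacuuming work ...
 *     restore_vacuum_error_info(vacrel, &saved_err_info);
 */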