Fix VACUUM VERBOSE's LP_DEAD item pages output.
[pgsql.git] / src / backend / access / heap / vacuumlazy.c
blob ad3feb88b3be994507bcdd1c8f69d0aaa5e4b257
1 /*-------------------------------------------------------------------------
3 * vacuumlazy.c
4 * Concurrent ("lazy") vacuuming.
7 * The major space usage for LAZY VACUUM is storage for the array of dead tuple
8 * TIDs. We want to ensure we can vacuum even the very largest relations with
9 * finite memory space usage. To do that, we set upper bounds on the number of
10 * tuples we will keep track of at once.
12 * We are willing to use at most maintenance_work_mem (or perhaps
13 * autovacuum_work_mem) memory space to keep track of dead tuples. We
14 * initially allocate an array of TIDs of that size, with an upper limit that
15 * depends on table size (this limit ensures we don't allocate a huge area
16 * uselessly for vacuuming small tables). If the array threatens to overflow,
17 * we suspend the heap scan phase and perform a pass of index cleanup and page
18 * compaction, then resume the heap scan with an empty TID array.
20 * If we're processing a table with no indexes, we can just vacuum each page
21 * as we go; there's no need to save up multiple tuples to minimize the number
22 * of index scans performed. So we don't use maintenance_work_mem memory for
23 * the TID array, just enough to hold as many heap tuples as fit on one page.
25 * Lazy vacuum supports parallel execution with parallel worker processes. In
26 * a parallel vacuum, we perform both index vacuum and index cleanup with
27 * parallel worker processes. Individual indexes are processed by one vacuum
28 * process. At the beginning of a lazy vacuum (at lazy_scan_heap) we prepare
29 * the parallel context and initialize the DSM segment that contains shared
30 * information as well as the memory space for storing dead tuples. When
31 * starting either index vacuum or index cleanup, we launch parallel worker
32 * processes. Once all indexes are processed the parallel worker processes
33 * exit. After that, the leader process re-initializes the parallel context
34 * so that it can use the same DSM for multiple passes of index vacuum and
35 * for performing index cleanup.  For updating the index statistics, we
36 * need to update the system table; since such updates are not allowed
37 * while in parallel mode, we update the index statistics only after
38 * exiting from the parallel mode.
40 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
41 * Portions Copyright (c) 1994, Regents of the University of California
44 * IDENTIFICATION
45 * src/backend/access/heap/vacuumlazy.c
47 *-------------------------------------------------------------------------
49 #include "postgres.h"
51 #include <math.h>
53 #include "access/amapi.h"
54 #include "access/genam.h"
55 #include "access/heapam.h"
56 #include "access/heapam_xlog.h"
57 #include "access/htup_details.h"
58 #include "access/multixact.h"
59 #include "access/parallel.h"
60 #include "access/transam.h"
61 #include "access/visibilitymap.h"
62 #include "access/xact.h"
63 #include "access/xlog.h"
64 #include "catalog/index.h"
65 #include "catalog/storage.h"
66 #include "commands/dbcommands.h"
67 #include "commands/progress.h"
68 #include "commands/vacuum.h"
69 #include "executor/instrument.h"
70 #include "miscadmin.h"
71 #include "optimizer/paths.h"
72 #include "pgstat.h"
73 #include "portability/instr_time.h"
74 #include "postmaster/autovacuum.h"
75 #include "storage/bufmgr.h"
76 #include "storage/freespace.h"
77 #include "storage/lmgr.h"
78 #include "tcop/tcopprot.h"
79 #include "utils/lsyscache.h"
80 #include "utils/memutils.h"
81 #include "utils/pg_rusage.h"
82 #include "utils/timestamp.h"
86 * Space/time tradeoff parameters: do these need to be user-tunable?
88 * To consider truncating the relation, we want there to be at least
89 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
90 * is less) potentially-freeable pages.
92 #define REL_TRUNCATE_MINIMUM 1000
93 #define REL_TRUNCATE_FRACTION 16
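/*
 * As an illustration: for a 64,000-page table the cutoff is
 * Min(REL_TRUNCATE_MINIMUM, 64000 / REL_TRUNCATE_FRACTION) = 1000 pages,
 * while for an 8,000-page table it is 8000 / 16 = 500 pages.  Smaller
 * tables therefore need proportionally fewer freeable tail pages before
 * should_attempt_truncation() considers truncation worthwhile.
 */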
96 * Timing parameters for truncate locking heuristics.
98 * These were not exposed as user tunable GUC values because it didn't seem
99 * that the potential for improvement was great enough to merit the cost of
100 * supporting them.
102 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
103 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
104 #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
107 * Threshold that controls whether we bypass index vacuuming and heap
108 * vacuuming as an optimization
110 #define BYPASS_THRESHOLD_PAGES 0.02 /* i.e. 2% of rel_pages */
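/*
 * For example, with rel_pages = 100,000 the bypass optimization can only
 * apply while LP_DEAD items have been found on fewer than 2,000 pages (2%
 * of the table); lazy_vacuum() applies further conditions on top of this
 * threshold.
 */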
113 * Perform a failsafe check every 4GB during the heap scan, approximately
115 #define FAILSAFE_EVERY_PAGES \
116 ((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ))
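/* With the default 8 kB BLCKSZ this is 4 GB / 8 kB = 524,288 heap blocks. */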
119 * When a table has no indexes, vacuum the FSM after every 8GB, approximately
120 * (it won't be exact because we only vacuum FSM after processing a heap page
121 * that has some removable tuples). When there are indexes, this is ignored,
122 * and we vacuum FSM after each index/heap cleaning pass.
124 #define VACUUM_FSM_EVERY_PAGES \
125 ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
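/* Likewise, 8 GB / 8 kB = 1,048,576 heap blocks at the default BLCKSZ. */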
128 * Guesstimation of number of dead tuples per page. This is used to
129 * provide an upper limit to memory allocated when vacuuming small
130 * tables.
132 #define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage
135 * Before we consider skipping a page that's marked as clean in the
136 * visibility map, we must've seen at least this many clean pages.
138 #define SKIP_PAGES_THRESHOLD ((BlockNumber) 32)
141 * Size of the prefetch window for lazy vacuum backwards truncation scan.
142 * Needs to be a power of 2.
144 #define PREFETCH_SIZE ((BlockNumber) 32)
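/*
 * The power-of-2 requirement lets the truncation scan align its prefetch
 * window by masking low-order bits of a block number (along the lines of
 * blkno & ~(PREFETCH_SIZE - 1)) rather than dividing; see
 * count_nondeletable_pages().
 */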
147 * DSM keys for parallel vacuum. Unlike other parallel execution code, since
148 * we don't need to worry about DSM keys conflicting with plan_node_id we can
149 * use small integers.
151 #define PARALLEL_VACUUM_KEY_SHARED 1
152 #define PARALLEL_VACUUM_KEY_DEAD_TUPLES 2
153 #define PARALLEL_VACUUM_KEY_QUERY_TEXT 3
154 #define PARALLEL_VACUUM_KEY_BUFFER_USAGE 4
155 #define PARALLEL_VACUUM_KEY_WAL_USAGE 5
158 * Macro to check if we are in a parallel vacuum. If true, we are in the
159 * parallel mode and the DSM segment is initialized.
161 #define ParallelVacuumIsActive(vacrel) ((vacrel)->lps != NULL)
163 /* Phases of vacuum during which we report error context. */
164 typedef enum
166 VACUUM_ERRCB_PHASE_UNKNOWN,
167 VACUUM_ERRCB_PHASE_SCAN_HEAP,
168 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
169 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
170 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
171 VACUUM_ERRCB_PHASE_TRUNCATE
172 } VacErrPhase;
175 * LVDeadTuples stores the dead tuple TIDs collected during the heap scan.
176 * This is allocated in the DSM segment in parallel mode and in local memory
177 * in non-parallel mode.
179 typedef struct LVDeadTuples
181 int max_tuples; /* # slots allocated in array */
182 int num_tuples; /* current # of entries */
183 /* List of TIDs of tuples we intend to delete */
184 /* NB: this list is ordered by TID address */
185 ItemPointerData itemptrs[FLEXIBLE_ARRAY_MEMBER]; /* array of
186 * ItemPointerData */
187 } LVDeadTuples;
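/*
 * Because itemptrs is kept in TID order, lazy_tid_reaped() -- the callback
 * handed to the index AMs during index bulk deletion -- can test whether an
 * index entry points at a dead tuple with a binary search of this array
 * (see lazy_tid_reaped() and vac_cmp_itemptr() below).  Each ItemPointerData
 * is 6 bytes, which is what the SizeOfDeadTuples/MAXDEADTUPLES arithmetic
 * below is based on.
 */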
189 /* The dead tuple space consists of LVDeadTuples and dead tuple TIDs */
190 #define SizeOfDeadTuples(cnt) \
191 add_size(offsetof(LVDeadTuples, itemptrs), \
192 mul_size(sizeof(ItemPointerData), cnt))
193 #define MAXDEADTUPLES(max_size) \
194 (((max_size) - offsetof(LVDeadTuples, itemptrs)) / sizeof(ItemPointerData))
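/*
 * As a rough illustration (assuming the usual 6-byte ItemPointerData and
 * 8 bytes of struct header): with maintenance_work_mem set to 64MB,
 * MAXDEADTUPLES(64 * 1024 * 1024) comes out to about 11.2 million TIDs per
 * index-vacuuming cycle.  compute_max_dead_tuples() further clamps the
 * allocation for small tables, using LAZY_ALLOC_TUPLES per heap page.
 */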
197 * Shared information among parallel workers. So this is allocated in the DSM
198 * segment.
200 typedef struct LVShared
203 * Target table relid and log level. These fields are not modified during
204 * the lazy vacuum.
206 Oid relid;
207 int elevel;
210 * An indication for vacuum workers to perform either index vacuum or
211 * index cleanup.  first_time is true only if for_cleanup is true and
212 * bulk-deletion has not been performed yet.
214 bool for_cleanup;
215 bool first_time;
218 * Fields for both index vacuum and cleanup.
220 * reltuples is the total number of input heap tuples.  It is set to the
221 * old live tuple count in the index vacuum case, or to the new live
222 * tuple count in the index cleanup case.
224 * estimated_count is true if reltuples is an estimated value. (Note that
225 * reltuples could be -1 in this case, indicating we have no idea.)
227 double reltuples;
228 bool estimated_count;
231 * In a single-process lazy vacuum, we could consume more memory during
232 * index vacuuming or cleanup than the heap scan itself needs.  In a
233 * parallel vacuum, individual vacuum workers could each consume memory
234 * equal to maintenance_work_mem, so the maintenance_work_mem for each
235 * worker is set such that the parallel operation doesn't consume more
236 * memory than a single-process lazy vacuum does.
238 int maintenance_work_mem_worker;
241 * Shared vacuum cost balance. During parallel vacuum,
242 * VacuumSharedCostBalance points to this value and it accumulates the
243 * balance of each parallel vacuum worker.
245 pg_atomic_uint32 cost_balance;
248 * Number of active parallel workers. This is used for computing the
249 * minimum threshold of the vacuum cost balance before a worker sleeps for
250 * cost-based delay.
252 pg_atomic_uint32 active_nworkers;
255 * Variables to control parallel vacuum.  We have a bitmap to indicate
256 * which indexes have stats in shared memory: a set bit means that the
257 * particular index supports parallel vacuum and has an LVSharedIndStats
258 * slot in the area that follows this struct.
259 pg_atomic_uint32 idx; /* counter for vacuuming and clean up */
260 uint32 offset; /* sizeof header incl. bitmap */
261 bits8 bitmap[FLEXIBLE_ARRAY_MEMBER]; /* bit map of NULLs */
263 /* Shared index statistics data follows at end of struct */
264 } LVShared;
266 #define SizeOfLVShared (offsetof(LVShared, bitmap) + sizeof(bits8))
267 #define GetSharedIndStats(s) \
268 ((LVSharedIndStats *)((char *)(s) + ((LVShared *)(s))->offset))
269 #define IndStatsIsNull(s, i) \
270 (!(((LVShared *)(s))->bitmap[(i) >> 3] & (1 << ((i) & 0x07))))
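/*
 * For instance, the bit for index number 10 lives in bitmap[1] (10 >> 3),
 * at bit position 2 (10 & 0x07).  IndStatsIsNull() reports true when that
 * bit is clear, i.e. when no LVSharedIndStats slot was set up for the index.
 */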
273 * Struct for an index bulk-deletion statistic used for parallel vacuum. This
274 * is allocated in the DSM segment.
276 typedef struct LVSharedIndStats
278 bool updated; /* are the stats updated? */
279 IndexBulkDeleteResult istat;
280 } LVSharedIndStats;
282 /* Struct for maintaining a parallel vacuum state. */
283 typedef struct LVParallelState
285 ParallelContext *pcxt;
287 /* Shared information among parallel vacuum workers */
288 LVShared *lvshared;
290 /* Points to buffer usage area in DSM */
291 BufferUsage *buffer_usage;
293 /* Points to WAL usage area in DSM */
294 WalUsage *wal_usage;
297 * The number of indexes that support parallel index bulk-deletion,
298 * parallel index cleanup, and conditional parallel index cleanup,
299 * respectively.
300 int nindexes_parallel_bulkdel;
301 int nindexes_parallel_cleanup;
302 int nindexes_parallel_condcleanup;
303 } LVParallelState;
305 typedef struct LVRelState
307 /* Target heap relation and its indexes */
308 Relation rel;
309 Relation *indrels;
310 int nindexes;
311 /* Do index vacuuming/cleanup? */
312 bool do_index_vacuuming;
313 bool do_index_cleanup;
314 /* Wraparound failsafe in effect? (implies !do_index_vacuuming) */
315 bool do_failsafe;
317 /* Buffer access strategy and parallel state */
318 BufferAccessStrategy bstrategy;
319 LVParallelState *lps;
321 /* Statistics from pg_class when we start out */
322 BlockNumber old_rel_pages; /* previous value of pg_class.relpages */
323 double old_live_tuples; /* previous value of pg_class.reltuples */
324 /* rel's initial relfrozenxid and relminmxid */
325 TransactionId relfrozenxid;
326 MultiXactId relminmxid;
328 /* VACUUM operation's cutoff for pruning */
329 TransactionId OldestXmin;
330 /* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */
331 TransactionId FreezeLimit;
332 MultiXactId MultiXactCutoff;
334 /* Error reporting state */
335 char *relnamespace;
336 char *relname;
337 char *indname;
338 BlockNumber blkno; /* used only for heap operations */
339 OffsetNumber offnum; /* used only for heap operations */
340 VacErrPhase phase;
343 * State managed by lazy_scan_heap() follows
345 LVDeadTuples *dead_tuples; /* items to vacuum from indexes */
346 BlockNumber rel_pages; /* total number of pages */
347 BlockNumber scanned_pages; /* number of pages we examined */
348 BlockNumber pinskipped_pages; /* # of pages skipped due to a pin */
349 BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */
350 BlockNumber tupcount_pages; /* pages whose tuples we counted */
351 BlockNumber pages_removed; /* pages removed by truncation */
352 BlockNumber lpdead_item_pages; /* # pages with LP_DEAD items */
353 BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
354 bool lock_waiter_detected;
356 /* Statistics output by us, for table */
357 double new_rel_tuples; /* new estimated total # of tuples */
358 double new_live_tuples; /* new estimated total # of live tuples */
359 /* Statistics output by index AMs */
360 IndexBulkDeleteResult **indstats;
362 /* Instrumentation counters */
363 int num_index_scans;
364 int64 tuples_deleted; /* # deleted from table */
365 int64 lpdead_items; /* # deleted from indexes */
366 int64 new_dead_tuples; /* new estimated total # of dead items in
367 * table */
368 int64 num_tuples; /* total number of nonremovable tuples */
369 int64 live_tuples; /* live tuples (reltuples estimate) */
370 } LVRelState;
373 * State returned by lazy_scan_prune()
375 typedef struct LVPagePruneState
377 bool hastup; /* Page prevents relation truncation? */
378 bool has_lpdead_items; /* includes existing LP_DEAD items */
381 * State describes the proper VM bit states to set for the page following
382 * pruning and freezing. all_visible implies !has_lpdead_items, but don't
383 * trust all_frozen result unless all_visible is also set to true.
385 bool all_visible; /* Every item visible to all? */
386 bool all_frozen; /* provided all_visible is also true */
387 TransactionId visibility_cutoff_xid; /* For recovery conflicts */
388 } LVPagePruneState;
390 /* Struct for saving and restoring vacuum error information. */
391 typedef struct LVSavedErrInfo
393 BlockNumber blkno;
394 OffsetNumber offnum;
395 VacErrPhase phase;
396 } LVSavedErrInfo;
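/*
 * Sketch of the intended save/restore pattern (not copied verbatim from any
 * single caller): a phase that is entered from the middle of another phase
 * saves the current error info, advertises the new phase, and restores the
 * old info once it is done.  For example:
 *
 *		LVSavedErrInfo saved_err_info;
 *
 *		update_vacuum_error_info(vacrel, &saved_err_info,
 *								 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
 *								 InvalidBlockNumber, InvalidOffsetNumber);
 *		...perform the index vacuuming...
 *		restore_vacuum_error_info(vacrel, &saved_err_info);
 */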
398 /* elevel controls whole VACUUM's verbosity */
399 static int elevel = -1;
402 /* non-export function prototypes */
403 static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params,
404 bool aggressive);
405 static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
406 BlockNumber blkno, Page page,
407 GlobalVisState *vistest,
408 LVPagePruneState *prunestate);
409 static void lazy_vacuum(LVRelState *vacrel, bool onecall);
410 static bool lazy_vacuum_all_indexes(LVRelState *vacrel);
411 static void lazy_vacuum_heap_rel(LVRelState *vacrel);
412 static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
413 Buffer buffer, int tupindex, Buffer *vmbuffer);
414 static bool lazy_check_needs_freeze(Buffer buf, bool *hastup,
415 LVRelState *vacrel);
416 static bool lazy_check_wraparound_failsafe(LVRelState *vacrel);
417 static void do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel);
418 static void do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel);
419 static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers);
420 static void do_parallel_processing(LVRelState *vacrel,
421 LVShared *lvshared);
422 static void do_serial_processing_for_unsafe_indexes(LVRelState *vacrel,
423 LVShared *lvshared);
424 static IndexBulkDeleteResult *parallel_process_one_index(Relation indrel,
425 IndexBulkDeleteResult *istat,
426 LVShared *lvshared,
427 LVSharedIndStats *shared_indstats,
428 LVRelState *vacrel);
429 static void lazy_cleanup_all_indexes(LVRelState *vacrel);
430 static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
431 IndexBulkDeleteResult *istat,
432 double reltuples,
433 LVRelState *vacrel);
434 static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel,
435 IndexBulkDeleteResult *istat,
436 double reltuples,
437 bool estimated_count,
438 LVRelState *vacrel);
439 static bool should_attempt_truncation(LVRelState *vacrel,
440 VacuumParams *params);
441 static void lazy_truncate_heap(LVRelState *vacrel);
442 static BlockNumber count_nondeletable_pages(LVRelState *vacrel);
443 static long compute_max_dead_tuples(BlockNumber relblocks, bool hasindex);
444 static void lazy_space_alloc(LVRelState *vacrel, int nworkers,
445 BlockNumber relblocks);
446 static void lazy_space_free(LVRelState *vacrel);
447 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
448 static int vac_cmp_itemptr(const void *left, const void *right);
449 static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
450 TransactionId *visibility_cutoff_xid, bool *all_frozen);
451 static int compute_parallel_vacuum_workers(LVRelState *vacrel,
452 int nrequested,
453 bool *can_parallel_vacuum);
454 static void update_index_statistics(LVRelState *vacrel);
455 static LVParallelState *begin_parallel_vacuum(LVRelState *vacrel,
456 BlockNumber nblocks,
457 int nrequested);
458 static void end_parallel_vacuum(LVRelState *vacrel);
459 static LVSharedIndStats *parallel_stats_for_idx(LVShared *lvshared, int getidx);
460 static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared);
461 static void vacuum_error_callback(void *arg);
462 static void update_vacuum_error_info(LVRelState *vacrel,
463 LVSavedErrInfo *saved_vacrel,
464 int phase, BlockNumber blkno,
465 OffsetNumber offnum);
466 static void restore_vacuum_error_info(LVRelState *vacrel,
467 const LVSavedErrInfo *saved_vacrel);
471 * heap_vacuum_rel() -- perform VACUUM for one heap relation
473 * This routine vacuums a single heap, cleans out its indexes, and
474 * updates its relpages and reltuples statistics.
476 * At entry, we have already established a transaction and opened
477 * and locked the relation.
479 void
480 heap_vacuum_rel(Relation rel, VacuumParams *params,
481 BufferAccessStrategy bstrategy)
483 LVRelState *vacrel;
484 PGRUsage ru0;
485 TimestampTz starttime = 0;
486 WalUsage walusage_start = pgWalUsage;
487 WalUsage walusage = {0, 0, 0};
488 long secs;
489 int usecs;
490 double read_rate,
491 write_rate;
492 bool aggressive; /* should we scan all unfrozen pages? */
493 bool scanned_all_unfrozen; /* actually scanned all such pages? */
494 char **indnames = NULL;
495 TransactionId xidFullScanLimit;
496 MultiXactId mxactFullScanLimit;
497 BlockNumber new_rel_pages;
498 BlockNumber new_rel_allvisible;
499 double new_live_tuples;
500 TransactionId new_frozen_xid;
501 MultiXactId new_min_multi;
502 ErrorContextCallback errcallback;
503 PgStat_Counter startreadtime = 0;
504 PgStat_Counter startwritetime = 0;
505 TransactionId OldestXmin;
506 TransactionId FreezeLimit;
507 MultiXactId MultiXactCutoff;
509 Assert(params != NULL);
510 Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT);
511 Assert(params->truncate != VACOPT_TERNARY_DEFAULT);
513 /* measure elapsed time iff autovacuum logging requires it */
514 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
516 pg_rusage_init(&ru0);
517 starttime = GetCurrentTimestamp();
518 if (track_io_timing)
520 startreadtime = pgStatBlockReadTime;
521 startwritetime = pgStatBlockWriteTime;
525 if (params->options & VACOPT_VERBOSE)
526 elevel = INFO;
527 else
528 elevel = DEBUG2;
530 pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
531 RelationGetRelid(rel));
533 vacuum_set_xid_limits(rel,
534 params->freeze_min_age,
535 params->freeze_table_age,
536 params->multixact_freeze_min_age,
537 params->multixact_freeze_table_age,
538 &OldestXmin, &FreezeLimit, &xidFullScanLimit,
539 &MultiXactCutoff, &mxactFullScanLimit);
542 * We request an aggressive scan if the table's frozen Xid is now older
543 * than or equal to the requested Xid full-table scan limit; or if the
544 * table's minimum MultiXactId is older than or equal to the requested
545 * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
547 aggressive = TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid,
548 xidFullScanLimit);
549 aggressive |= MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid,
550 mxactFullScanLimit);
551 if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
552 aggressive = true;
554 vacrel = (LVRelState *) palloc0(sizeof(LVRelState));
556 /* Set up high level stuff about rel */
557 vacrel->rel = rel;
558 vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes,
559 &vacrel->indrels);
560 vacrel->do_index_vacuuming = true;
561 vacrel->do_index_cleanup = true;
562 vacrel->do_failsafe = false;
563 if (params->index_cleanup == VACOPT_TERNARY_DISABLED)
565 vacrel->do_index_vacuuming = false;
566 vacrel->do_index_cleanup = false;
568 vacrel->bstrategy = bstrategy;
569 vacrel->old_rel_pages = rel->rd_rel->relpages;
570 vacrel->old_live_tuples = rel->rd_rel->reltuples;
571 vacrel->relfrozenxid = rel->rd_rel->relfrozenxid;
572 vacrel->relminmxid = rel->rd_rel->relminmxid;
574 /* Set cutoffs for entire VACUUM */
575 vacrel->OldestXmin = OldestXmin;
576 vacrel->FreezeLimit = FreezeLimit;
577 vacrel->MultiXactCutoff = MultiXactCutoff;
579 vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel));
580 vacrel->relname = pstrdup(RelationGetRelationName(rel));
581 vacrel->indname = NULL;
582 vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN;
584 /* Save index names iff autovacuum logging requires it */
585 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0 &&
586 vacrel->nindexes > 0)
588 indnames = palloc(sizeof(char *) * vacrel->nindexes);
589 for (int i = 0; i < vacrel->nindexes; i++)
590 indnames[i] =
591 pstrdup(RelationGetRelationName(vacrel->indrels[i]));
595 * Setup error traceback support for ereport(). The idea is to set up an
596 * error context callback to display additional information on any error
597 * during a vacuum. During different phases of vacuum (heap scan, heap
598 * vacuum, index vacuum, index clean up, heap truncate), we update the
599 * error context callback to display appropriate information.
601 * Note that the index vacuum and heap vacuum phases may be called
602 * multiple times in the middle of the heap scan phase. So the old phase
603 * information is restored at the end of those phases.
605 errcallback.callback = vacuum_error_callback;
606 errcallback.arg = vacrel;
607 errcallback.previous = error_context_stack;
608 error_context_stack = &errcallback;
610 /* Do the vacuuming */
611 lazy_scan_heap(vacrel, params, aggressive);
613 /* Done with indexes */
614 vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock);
617 * Compute whether we actually scanned all of the unfrozen pages. If we did,
618 * we can adjust relfrozenxid and relminmxid.
620 * NB: We need to check this before truncating the relation, because that
621 * will change ->rel_pages.
623 if ((vacrel->scanned_pages + vacrel->frozenskipped_pages)
624 < vacrel->rel_pages)
626 Assert(!aggressive);
627 scanned_all_unfrozen = false;
629 else
630 scanned_all_unfrozen = true;
633 * Optionally truncate the relation.
635 if (should_attempt_truncation(vacrel, params))
638 * Update error traceback information. This is the last phase during
639 * which we add context information to errors, so we don't need to
640 * revert to the previous phase.
642 update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE,
643 vacrel->nonempty_pages,
644 InvalidOffsetNumber);
645 lazy_truncate_heap(vacrel);
648 /* Pop the error context stack */
649 error_context_stack = errcallback.previous;
651 /* Report that we are now doing final cleanup */
652 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
653 PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
656 * Update statistics in pg_class.
658 * In principle new_live_tuples could be -1 indicating that we (still)
659 * don't know the tuple count. In practice that probably can't happen,
660 * since we'd surely have scanned some pages if the table is new and
661 * nonempty.
663 * For safety, clamp relallvisible to be not more than what we're setting
664 * relpages to.
666 * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
667 * since then we don't know for certain that all tuples have a newer xmin.
669 new_rel_pages = vacrel->rel_pages;
670 new_live_tuples = vacrel->new_live_tuples;
672 visibilitymap_count(rel, &new_rel_allvisible, NULL);
673 if (new_rel_allvisible > new_rel_pages)
674 new_rel_allvisible = new_rel_pages;
676 new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
677 new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;
679 vac_update_relstats(rel,
680 new_rel_pages,
681 new_live_tuples,
682 new_rel_allvisible,
683 vacrel->nindexes > 0,
684 new_frozen_xid,
685 new_min_multi,
686 false);
689 * Report results to the stats collector, too.
691 * Deliberately avoid telling the stats collector about LP_DEAD items that
692 * remain in the table due to VACUUM bypassing index and heap vacuuming.
693 * ANALYZE will consider the remaining LP_DEAD items to be dead tuples. It
694 * seems like a good idea to err on the side of not vacuuming again too
695 * soon in cases where the failsafe prevented significant amounts of heap
696 * vacuuming.
698 pgstat_report_vacuum(RelationGetRelid(rel),
699 rel->rd_rel->relisshared,
700 Max(new_live_tuples, 0),
701 vacrel->new_dead_tuples);
702 pgstat_progress_end_command();
704 /* and log the action if appropriate */
705 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
707 TimestampTz endtime = GetCurrentTimestamp();
709 if (params->log_min_duration == 0 ||
710 TimestampDifferenceExceeds(starttime, endtime,
711 params->log_min_duration))
713 StringInfoData buf;
714 char *msgfmt;
716 TimestampDifference(starttime, endtime, &secs, &usecs);
718 memset(&walusage, 0, sizeof(WalUsage));
719 WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
721 read_rate = 0;
722 write_rate = 0;
723 if ((secs > 0) || (usecs > 0))
725 read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
726 (secs + usecs / 1000000.0);
727 write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
728 (secs + usecs / 1000000.0);
732 * This is pretty messy, but we split it up so that we can skip
733 * emitting individual parts of the message when not applicable.
735 initStringInfo(&buf);
736 if (params->is_wraparound)
739 * While it's possible for a VACUUM to be both is_wraparound
740 * and !aggressive, that's just a corner-case -- is_wraparound
741 * implies aggressive. Produce distinct output for the corner
742 * case all the same, just in case.
744 if (aggressive)
745 msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
746 else
747 msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
749 else
751 if (aggressive)
752 msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
753 else
754 msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
756 appendStringInfo(&buf, msgfmt,
757 get_database_name(MyDatabaseId),
758 vacrel->relnamespace,
759 vacrel->relname,
760 vacrel->num_index_scans);
761 appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
762 vacrel->pages_removed,
763 vacrel->rel_pages,
764 vacrel->pinskipped_pages,
765 vacrel->frozenskipped_pages);
766 appendStringInfo(&buf,
767 _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"),
768 (long long) vacrel->tuples_deleted,
769 (long long) vacrel->new_rel_tuples,
770 (long long) vacrel->new_dead_tuples,
771 OldestXmin);
772 appendStringInfo(&buf,
773 _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
774 (long long) VacuumPageHit,
775 (long long) VacuumPageMiss,
776 (long long) VacuumPageDirty);
777 if (vacrel->rel_pages > 0)
779 BlockNumber orig_rel_pages;
781 if (vacrel->do_index_vacuuming)
783 msgfmt = _(" %u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n");
785 if (vacrel->nindexes == 0 || vacrel->num_index_scans == 0)
786 appendStringInfo(&buf, _("index scan not needed:"));
787 else
788 appendStringInfo(&buf, _("index scan needed:"));
790 else
792 msgfmt = _(" %u pages from table (%.2f%% of total) have %lld dead item identifiers\n");
794 if (!vacrel->do_failsafe)
795 appendStringInfo(&buf, _("index scan bypassed:"));
796 else
797 appendStringInfo(&buf, _("index scan bypassed by failsafe:"));
799 orig_rel_pages = vacrel->rel_pages + vacrel->pages_removed;
800 appendStringInfo(&buf, msgfmt,
801 vacrel->lpdead_item_pages,
802 100.0 * vacrel->lpdead_item_pages / orig_rel_pages,
803 (long long) vacrel->lpdead_items);
805 for (int i = 0; i < vacrel->nindexes; i++)
807 IndexBulkDeleteResult *istat = vacrel->indstats[i];
809 if (!istat)
810 continue;
812 appendStringInfo(&buf,
813 _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
814 indnames[i],
815 istat->num_pages,
816 istat->pages_newly_deleted,
817 istat->pages_deleted,
818 istat->pages_free);
820 appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
821 read_rate, write_rate);
822 if (track_io_timing)
824 appendStringInfoString(&buf, _("I/O Timings:"));
825 if (pgStatBlockReadTime - startreadtime > 0)
826 appendStringInfo(&buf, _(" read=%.3f"),
827 (double) (pgStatBlockReadTime - startreadtime) / 1000);
828 if (pgStatBlockWriteTime - startwritetime > 0)
829 appendStringInfo(&buf, _(" write=%.3f"),
830 (double) (pgStatBlockWriteTime - startwritetime) / 1000);
831 appendStringInfoChar(&buf, '\n');
833 appendStringInfo(&buf, _("system usage: %s\n"), pg_rusage_show(&ru0));
834 appendStringInfo(&buf,
835 _("WAL usage: %lld records, %lld full page images, %llu bytes"),
836 (long long) walusage.wal_records,
837 (long long) walusage.wal_fpi,
838 (unsigned long long) walusage.wal_bytes);
840 ereport(LOG,
841 (errmsg_internal("%s", buf.data)));
842 pfree(buf.data);
846 /* Cleanup index statistics and index names */
847 for (int i = 0; i < vacrel->nindexes; i++)
849 if (vacrel->indstats[i])
850 pfree(vacrel->indstats[i]);
852 if (indnames && indnames[i])
853 pfree(indnames[i]);
858 * lazy_scan_heap() -- scan an open heap relation
860 * This routine prunes each page in the heap, which will among other
861 * things truncate dead tuples to dead line pointers, defragment the
862 * page, and set commit status bits (see heap_page_prune). It also builds
863 * lists of dead tuples and pages with free space, calculates statistics
864 * on the number of live tuples in the heap, and marks pages as
865 * all-visible if appropriate. When done, or when we run low on space
866 * for dead-tuple TIDs, invoke lazy_vacuum to vacuum indexes and vacuum
867 * heap relation during its own second pass over the heap.
869 * If the table has at least two indexes, we execute both index vacuum
870 * and index cleanup with parallel workers unless parallel vacuum is
871 * disabled. In a parallel vacuum, we enter parallel mode and then
872 * create both the parallel context and the DSM segment before starting
873 * heap scan so that we can record dead tuples to the DSM segment. All
874 * parallel workers are launched at the beginning of index vacuuming and
875 * index cleanup and they exit once done with all indexes. At the end of
876 * this function we exit from parallel mode. Index bulk-deletion results
877 * are stored in the DSM segment and we update index statistics for all
878 * the indexes after exiting from parallel mode since writes are not
879 * allowed during parallel mode.
881 * If there are no indexes then we can reclaim line pointers on the fly;
882 * dead line pointers need only be retained until all index pointers that
883 * reference them have been killed.
885 static void
886 lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
888 LVDeadTuples *dead_tuples;
889 BlockNumber nblocks,
890 blkno,
891 next_unskippable_block,
892 next_failsafe_block,
893 next_fsm_block_to_vacuum;
894 PGRUsage ru0;
895 Buffer vmbuffer = InvalidBuffer;
896 bool skipping_blocks,
897 have_vacuumed_indexes = false;
898 StringInfoData buf;
899 const int initprog_index[] = {
900 PROGRESS_VACUUM_PHASE,
901 PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
902 PROGRESS_VACUUM_MAX_DEAD_TUPLES
904 int64 initprog_val[3];
905 GlobalVisState *vistest;
907 pg_rusage_init(&ru0);
909 if (aggressive)
910 ereport(elevel,
911 (errmsg("aggressively vacuuming \"%s.%s\"",
912 vacrel->relnamespace,
913 vacrel->relname)));
914 else
915 ereport(elevel,
916 (errmsg("vacuuming \"%s.%s\"",
917 vacrel->relnamespace,
918 vacrel->relname)));
920 nblocks = RelationGetNumberOfBlocks(vacrel->rel);
921 next_unskippable_block = 0;
922 next_failsafe_block = 0;
923 next_fsm_block_to_vacuum = 0;
924 vacrel->rel_pages = nblocks;
925 vacrel->scanned_pages = 0;
926 vacrel->pinskipped_pages = 0;
927 vacrel->frozenskipped_pages = 0;
928 vacrel->tupcount_pages = 0;
929 vacrel->pages_removed = 0;
930 vacrel->lpdead_item_pages = 0;
931 vacrel->nonempty_pages = 0;
932 vacrel->lock_waiter_detected = false;
934 /* Initialize instrumentation counters */
935 vacrel->num_index_scans = 0;
936 vacrel->tuples_deleted = 0;
937 vacrel->lpdead_items = 0;
938 vacrel->new_dead_tuples = 0;
939 vacrel->num_tuples = 0;
940 vacrel->live_tuples = 0;
942 vistest = GlobalVisTestFor(vacrel->rel);
944 vacrel->indstats = (IndexBulkDeleteResult **)
945 palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *));
948 * Before beginning the scan, check if it's already necessary to apply
949 * the failsafe
951 lazy_check_wraparound_failsafe(vacrel);
954 * Allocate the space for dead tuples. Note that this handles parallel
955 * VACUUM initialization as part of allocating shared memory space used
956 * for dead_tuples.
958 lazy_space_alloc(vacrel, params->nworkers, nblocks);
959 dead_tuples = vacrel->dead_tuples;
961 /* Report that we're scanning the heap, advertising total # of blocks */
962 initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
963 initprog_val[1] = nblocks;
964 initprog_val[2] = dead_tuples->max_tuples;
965 pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
968 * Except when aggressive is set, we want to skip pages that are
969 * all-visible according to the visibility map, but only when we can skip
970 * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading
971 * sequentially, the OS should be doing readahead for us, so there's no
972 * gain in skipping a page now and then; that's likely to disable
973 * readahead and so be counterproductive. Also, skipping even a single
974 * page means that we can't update relfrozenxid, so we only want to do it
975 * if we can skip a goodly number of pages.
977 * When aggressive is set, we can't skip pages just because they are
978 * all-visible, but we can still skip pages that are all-frozen, since
979 * such pages do not need freezing and do not affect the value that we can
980 * safely set for relfrozenxid or relminmxid.
982 * Before entering the main loop, establish the invariant that
983 * next_unskippable_block is the next block number >= blkno that we can't
984 * skip based on the visibility map, either all-visible for a regular scan
985 * or all-frozen for an aggressive scan. We set it to nblocks if there's
986 * no such block. We also set up the skipping_blocks flag correctly at
987 * this stage.
989 * Note: The value returned by visibilitymap_get_status could be slightly
990 * out-of-date, since we make this test before reading the corresponding
991 * heap page or locking the buffer. This is OK. If we mistakenly think
992 * that the page is all-visible or all-frozen when in fact the flag's just
993 * been cleared, we might fail to vacuum the page. It's easy to see that
994 * skipping a page when aggressive is not set is not a very big deal; we
995 * might leave some dead tuples lying around, but the next vacuum will
996 * find them. But even when aggressive *is* set, it's still OK if we miss
997 * a page whose all-frozen marking has just been cleared. Any new XIDs
998 * just added to that page are necessarily newer than the GlobalXmin we
999 * computed, so they'll have no effect on the value to which we can safely
1000 * set relfrozenxid. A similar argument applies for MXIDs and relminmxid.
1002 * We will scan the table's last page, at least to the extent of
1003 * determining whether it has tuples or not, even if it should be skipped
1004 * according to the above rules; except when we've already determined that
1005 * it's not worth trying to truncate the table. This avoids having
1006 * lazy_truncate_heap() take access-exclusive lock on the table to attempt
1007 * a truncation that just fails immediately because there are tuples in
1008 * the last page. This is worth avoiding mainly because such a lock must
1009 * be replayed on any hot standby, where it can be disruptive.
1011 if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
1013 while (next_unskippable_block < nblocks)
1015 uint8 vmstatus;
1017 vmstatus = visibilitymap_get_status(vacrel->rel,
1018 next_unskippable_block,
1019 &vmbuffer);
1020 if (aggressive)
1022 if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
1023 break;
1025 else
1027 if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
1028 break;
1030 vacuum_delay_point();
1031 next_unskippable_block++;
1035 if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
1036 skipping_blocks = true;
1037 else
1038 skipping_blocks = false;
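/*
 * As an example, if the first 40 heap pages are all-visible in the VM
 * (and this isn't an aggressive vacuum), the loop above stops with
 * next_unskippable_block = 40.  Since 40 >= SKIP_PAGES_THRESHOLD, we set
 * skipping_blocks, and the main loop below can skip blocks 0..39 outright
 * (subject to the forced check of the table's last page).
 */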
1040 for (blkno = 0; blkno < nblocks; blkno++)
1042 Buffer buf;
1043 Page page;
1044 bool all_visible_according_to_vm = false;
1045 LVPagePruneState prunestate;
1048 * Consider need to skip blocks. See note above about forcing
1049 * scanning of last page.
1051 #define FORCE_CHECK_PAGE() \
1052 (blkno == nblocks - 1 && should_attempt_truncation(vacrel, params))
1054 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1056 update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
1057 blkno, InvalidOffsetNumber);
1059 if (blkno == next_unskippable_block)
1061 /* Time to advance next_unskippable_block */
1062 next_unskippable_block++;
1063 if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
1065 while (next_unskippable_block < nblocks)
1067 uint8 vmskipflags;
1069 vmskipflags = visibilitymap_get_status(vacrel->rel,
1070 next_unskippable_block,
1071 &vmbuffer);
1072 if (aggressive)
1074 if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
1075 break;
1077 else
1079 if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
1080 break;
1082 vacuum_delay_point();
1083 next_unskippable_block++;
1088 * We know we can't skip the current block. But set up
1089 * skipping_blocks to do the right thing at the following blocks.
1091 if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
1092 skipping_blocks = true;
1093 else
1094 skipping_blocks = false;
1097 * Normally, the fact that we can't skip this block must mean that
1098 * it's not all-visible. But in an aggressive vacuum we know only
1099 * that it's not all-frozen, so it might still be all-visible.
1101 if (aggressive && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1102 all_visible_according_to_vm = true;
1104 else
1107 * The current block is potentially skippable; if we've seen a
1108 * long enough run of skippable blocks to justify skipping it, and
1109 * we're not forced to check it, then go ahead and skip.
1110 * Otherwise, the page must be at least all-visible if not
1111 * all-frozen, so we can set all_visible_according_to_vm = true.
1113 if (skipping_blocks && !FORCE_CHECK_PAGE())
1116 * Tricky, tricky. If this is in aggressive vacuum, the page
1117 * must have been all-frozen at the time we checked whether it
1118 * was skippable, but it might not be any more. We must be
1119 * careful to count it as a skipped all-frozen page in that
1120 * case, or else we'll think we can't update relfrozenxid and
1121 * relminmxid. If it's not an aggressive vacuum, we don't
1122 * know whether it was all-frozen, so we have to recheck; but
1123 * in this case an approximate answer is OK.
1125 if (aggressive || VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1126 vacrel->frozenskipped_pages++;
1127 continue;
1129 all_visible_according_to_vm = true;
1132 vacuum_delay_point();
1135 * Regularly check if wraparound failsafe should trigger.
1137 * There is a similar check inside lazy_vacuum_all_indexes(), but
1138 * relfrozenxid might start to look dangerously old before we reach
1139 * that point. This check also provides failsafe coverage for the
1140 * one-pass strategy case.
1142 if (blkno - next_failsafe_block >= FAILSAFE_EVERY_PAGES)
1144 lazy_check_wraparound_failsafe(vacrel);
1145 next_failsafe_block = blkno;
1149 * Consider if we definitely have enough space to process TIDs on page
1150 * already. If we are close to overrunning the available space for
1151 * dead-tuple TIDs, pause and do a cycle of vacuuming before we tackle
1152 * this page.
1154 if ((dead_tuples->max_tuples - dead_tuples->num_tuples) < MaxHeapTuplesPerPage &&
1155 dead_tuples->num_tuples > 0)
1158 * Before beginning index vacuuming, we release any pin we may
1159 * hold on the visibility map page. This isn't necessary for
1160 * correctness, but we do it anyway to avoid holding the pin
1161 * across a lengthy, unrelated operation.
1163 if (BufferIsValid(vmbuffer))
1165 ReleaseBuffer(vmbuffer);
1166 vmbuffer = InvalidBuffer;
1169 /* Remove the collected garbage tuples from table and indexes */
1170 lazy_vacuum(vacrel, false);
1171 have_vacuumed_indexes = true;
1174 * Vacuum the Free Space Map to make newly-freed space visible on
1175 * upper-level FSM pages. Note we have not yet processed blkno.
1177 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1178 blkno);
1179 next_fsm_block_to_vacuum = blkno;
1181 /* Report that we are once again scanning the heap */
1182 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1183 PROGRESS_VACUUM_PHASE_SCAN_HEAP);
1187 * Set up visibility map page as needed.
1189 * Pin the visibility map page in case we need to mark the page
1190 * all-visible. In most cases this will be very cheap, because we'll
1191 * already have the correct page pinned anyway. However, it's
1192 * possible that (a) next_unskippable_block is covered by a different
1193 * VM page than the current block or (b) we released our pin and did a
1194 * cycle of index vacuuming.
1196 visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
1198 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno,
1199 RBM_NORMAL, vacrel->bstrategy);
1202 * We need buffer cleanup lock so that we can prune HOT chains and
1203 * defragment the page.
1205 if (!ConditionalLockBufferForCleanup(buf))
1207 bool hastup;
1210 * If we're not performing an aggressive scan to guard against XID
1211 * wraparound, and we don't want to forcibly check the page, then
1212 * it's OK to skip vacuuming pages we get a lock conflict on. They
1213 * will be dealt with in some future vacuum.
1215 if (!aggressive && !FORCE_CHECK_PAGE())
1217 ReleaseBuffer(buf);
1218 vacrel->pinskipped_pages++;
1219 continue;
1223 * Read the page with share lock to see if any xids on it need to
1224 * be frozen. If not we just skip the page, after updating our
1225 * scan statistics. If there are some, we wait for cleanup lock.
1227 * We could defer the lock request further by remembering the page
1228 * and coming back to it later, or we could even register
1229 * ourselves for multiple buffers and then service whichever one
1230 * is received first. For now, this seems good enough.
1232 * If we get here with aggressive false, then we're just forcibly
1233 * checking the page, and so we don't want to insist on getting
1234 * the lock; we only need to know if the page contains tuples, so
1235 * that we can update nonempty_pages correctly. It's convenient
1236 * to use lazy_check_needs_freeze() for both situations, though.
1238 LockBuffer(buf, BUFFER_LOCK_SHARE);
1239 if (!lazy_check_needs_freeze(buf, &hastup, vacrel))
1241 UnlockReleaseBuffer(buf);
1242 vacrel->scanned_pages++;
1243 vacrel->pinskipped_pages++;
1244 if (hastup)
1245 vacrel->nonempty_pages = blkno + 1;
1246 continue;
1248 if (!aggressive)
1251 * Here, we must not advance scanned_pages; that would amount
1252 * to claiming that the page contains no freezable tuples.
1254 UnlockReleaseBuffer(buf);
1255 vacrel->pinskipped_pages++;
1256 if (hastup)
1257 vacrel->nonempty_pages = blkno + 1;
1258 continue;
1260 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1261 LockBufferForCleanup(buf);
1262 /* drop through to normal processing */
1266 * By here we definitely have enough dead_tuples space for whatever
1267 * LP_DEAD tids are on this page, we have the visibility map page set
1268 * up in case we need to set this page's all_visible/all_frozen bit,
1269 * and we have a super-exclusive lock. Any tuples on this page are
1270 * now sure to be "counted" by this VACUUM.
1272 * One last piece of preamble needs to take place before we can prune:
1273 * we need to consider new and empty pages.
1275 vacrel->scanned_pages++;
1276 vacrel->tupcount_pages++;
1278 page = BufferGetPage(buf);
1280 if (PageIsNew(page))
1283 * All-zeroes pages can be left over if either a backend extends
1284 * the relation by a single page, but crashes before the newly
1285 * initialized page has been written out, or when bulk-extending
1286 * the relation (which creates a number of empty pages at the tail
1287 * end of the relation, but enters them into the FSM).
1289 * Note we do not enter the page into the visibilitymap. That has
1290 * the downside that we repeatedly visit this page in subsequent
1291 * vacuums, but otherwise we'll never discover the space on a
1292 * promoted standby. The harm of repeated checking ought to
1293 * normally not be too bad - the space usually should be used at
1294 * some point, otherwise there wouldn't be any regular vacuums.
1296 * Make sure these pages are in the FSM, to ensure they can be
1297 * reused. Do that by testing if there's any space recorded for
1298 * the page.  If not, enter it.  We do so after releasing the lock
1299 * on the heap page; the FSM is approximate, after all.
1301 UnlockReleaseBuffer(buf);
1303 if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0)
1305 Size freespace = BLCKSZ - SizeOfPageHeaderData;
1307 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1309 continue;
1312 if (PageIsEmpty(page))
1314 Size freespace = PageGetHeapFreeSpace(page);
1317 * Empty pages are always all-visible and all-frozen (note that
1318 * the same is currently not true for new pages, see above).
1320 if (!PageIsAllVisible(page))
1322 START_CRIT_SECTION();
1324 /* mark buffer dirty before writing a WAL record */
1325 MarkBufferDirty(buf);
1328 * It's possible that another backend has extended the heap,
1329 * initialized the page, and then failed to WAL-log the page
1330 * due to an ERROR. Since heap extension is not WAL-logged,
1331 * recovery might try to replay our record setting the page
1332 * all-visible and find that the page isn't initialized, which
1333 * will cause a PANIC. To prevent that, check whether the
1334 * page has been previously WAL-logged, and if not, do that
1335 * now.
1337 if (RelationNeedsWAL(vacrel->rel) &&
1338 PageGetLSN(page) == InvalidXLogRecPtr)
1339 log_newpage_buffer(buf, true);
1341 PageSetAllVisible(page);
1342 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1343 vmbuffer, InvalidTransactionId,
1344 VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
1345 END_CRIT_SECTION();
1348 UnlockReleaseBuffer(buf);
1349 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1350 continue;
1354 * Prune and freeze tuples.
1356 * Accumulates details of remaining LP_DEAD line pointers on the page in
1357 * the dead tuple list.  This includes LP_DEAD line pointers that we
1358 * pruned ourselves, as well as existing LP_DEAD line pointers that
1359 * were pruned some time earlier. Also considers freezing XIDs in the
1360 * tuple headers of remaining items with storage.
1362 lazy_scan_prune(vacrel, buf, blkno, page, vistest, &prunestate);
1364 Assert(!prunestate.all_visible || !prunestate.has_lpdead_items);
1366 /* Remember the location of the last page with nonremovable tuples */
1367 if (prunestate.hastup)
1368 vacrel->nonempty_pages = blkno + 1;
1370 if (vacrel->nindexes == 0)
1373 * Consider the need to do page-at-a-time heap vacuuming when
1374 * using the one-pass strategy now.
1376 * The one-pass strategy will never call lazy_vacuum(). The steps
1377 * performed here can be thought of as the one-pass equivalent of
1378 * a call to lazy_vacuum().
1380 if (prunestate.has_lpdead_items)
1382 Size freespace;
1384 lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer);
1386 /* Forget the now-vacuumed tuples */
1387 dead_tuples->num_tuples = 0;
1390 * Periodically perform FSM vacuuming to make newly-freed
1391 * space visible on upper FSM pages. Note we have not yet
1392 * performed FSM processing for blkno.
1394 if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
1396 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1397 blkno);
1398 next_fsm_block_to_vacuum = blkno;
1402 * Now perform FSM processing for blkno, and move on to next
1403 * page.
1405 * Our call to lazy_vacuum_heap_page() will have considered if
1406 * it's possible to set all_visible/all_frozen independently
1407 * of lazy_scan_prune(). Note that prunestate was invalidated
1408 * by lazy_vacuum_heap_page() call.
1410 freespace = PageGetHeapFreeSpace(page);
1412 UnlockReleaseBuffer(buf);
1413 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1414 continue;
1418 * There was no call to lazy_vacuum_heap_page() because pruning
1419 * didn't encounter/create any LP_DEAD items that needed to be
1420 * vacuumed. Prune state has not been invalidated, so proceed
1421 * with prunestate-driven visibility map and FSM steps (just like
1422 * the two-pass strategy).
1424 Assert(dead_tuples->num_tuples == 0);
1428 * Handle setting visibility map bit based on what the VM said about
1429 * the page before pruning started, and using prunestate
1431 if (!all_visible_according_to_vm && prunestate.all_visible)
1433 uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
1435 if (prunestate.all_frozen)
1436 flags |= VISIBILITYMAP_ALL_FROZEN;
1439 * It should never be the case that the visibility map page is set
1440 * while the page-level bit is clear, but the reverse is allowed
1441 * (if checksums are not enabled). Regardless, set both bits so
1442 * that we get back in sync.
1444 * NB: If the heap page is all-visible but the VM bit is not set,
1445 * we don't need to dirty the heap page. However, if checksums
1446 * are enabled, we do need to make sure that the heap page is
1447 * dirtied before passing it to visibilitymap_set(), because it
1448 * may be logged. Given that this situation should only happen in
1449 * rare cases after a crash, it is not worth optimizing.
1451 PageSetAllVisible(page);
1452 MarkBufferDirty(buf);
1453 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1454 vmbuffer, prunestate.visibility_cutoff_xid,
1455 flags);
1459 * As of PostgreSQL 9.2, the visibility map bit should never be set if
1460 * the page-level bit is clear. However, it's possible that the bit
1461 * got cleared after we checked it and before we took the buffer
1462 * content lock, so we must recheck before jumping to the conclusion
1463 * that something bad has happened.
1465 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1466 && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1468 elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1469 vacrel->relname, blkno);
1470 visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1471 VISIBILITYMAP_VALID_BITS);
1475 * It's possible for the value returned by
1476 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
1477 * wrong for us to see tuples that appear to not be visible to
1478 * everyone yet, while PD_ALL_VISIBLE is already set. The real safe
1479 * xmin value never moves backwards, but
1480 * GetOldestNonRemovableTransactionId() is conservative and sometimes
1481 * returns a value that's unnecessarily small, so if we see that
1482 * contradiction it just means that the tuples that we think are not
1483 * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
1484 * is correct.
1486 * There should never be dead tuples on a page with PD_ALL_VISIBLE
1487 * set, however.
1489 else if (prunestate.has_lpdead_items && PageIsAllVisible(page))
1491 elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1492 vacrel->relname, blkno);
1493 PageClearAllVisible(page);
1494 MarkBufferDirty(buf);
1495 visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1496 VISIBILITYMAP_VALID_BITS);
1500 * If the all-visible page is all-frozen but not marked as such yet,
1501 * mark it as all-frozen. Note that all_frozen is only valid if
1502 * all_visible is true, so we must check both.
1504 else if (all_visible_according_to_vm && prunestate.all_visible &&
1505 prunestate.all_frozen &&
1506 !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1509 * We can pass InvalidTransactionId as the cutoff XID here,
1510 * because setting the all-frozen bit doesn't cause recovery
1511 * conflicts.
1513 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1514 vmbuffer, InvalidTransactionId,
1515 VISIBILITYMAP_ALL_FROZEN);
1519 * Final steps for block: drop super-exclusive lock, record free space
1520 * in the FSM
1522 if (prunestate.has_lpdead_items && vacrel->do_index_vacuuming)
1525 * Wait until lazy_vacuum_heap_rel() to save free space. This
1526 * doesn't just save us some cycles; it also allows us to record
1527 * any additional free space that lazy_vacuum_heap_page() will
1528 * make available in cases where it's possible to truncate the
1529 * page's line pointer array.
1531 * Note: It's not in fact 100% certain that we really will call
1532 * lazy_vacuum_heap_rel() -- lazy_vacuum() might yet opt to skip
1533 * index vacuuming (and so must skip heap vacuuming). This is
1534 * deemed okay because it only happens in emergencies, or when
1535 * there is very little free space anyway. (Besides, we start
1536 * recording free space in the FSM once index vacuuming has been
1537 * abandoned.)
1539 * Note: The one-pass (no indexes) case is only supposed to make
1540 * it this far when there were no LP_DEAD items during pruning.
1542 Assert(vacrel->nindexes > 0);
1543 UnlockReleaseBuffer(buf);
1545 else
1547 Size freespace = PageGetHeapFreeSpace(page);
1549 UnlockReleaseBuffer(buf);
1550 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1554 /* report that everything is now scanned */
1555 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1557 /* Clear the block number information */
1558 vacrel->blkno = InvalidBlockNumber;
1560 /* now we can compute the new value for pg_class.reltuples */
1561 vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, nblocks,
1562 vacrel->tupcount_pages,
1563 vacrel->live_tuples);
1566 * Also compute the total number of surviving heap entries. In the
1567 * (unlikely) scenario that new_live_tuples is -1, take it as zero.
1569 vacrel->new_rel_tuples =
1570 Max(vacrel->new_live_tuples, 0) + vacrel->new_dead_tuples;
1573 * Release any remaining pin on visibility map page.
1575 if (BufferIsValid(vmbuffer))
1577 ReleaseBuffer(vmbuffer);
1578 vmbuffer = InvalidBuffer;
1581 /* If any tuples need to be deleted, perform final vacuum cycle */
1582 if (dead_tuples->num_tuples > 0)
1583 lazy_vacuum(vacrel, !have_vacuumed_indexes);
1586 * Vacuum the remainder of the Free Space Map. We must do this whether or
1587 * not there were indexes, and whether or not we bypassed index vacuuming.
1589 if (blkno > next_fsm_block_to_vacuum)
1590 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);
1592 /* report all blocks vacuumed */
1593 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1595 /* Do post-vacuum cleanup */
1596 if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1597 lazy_cleanup_all_indexes(vacrel);
1600 * Free resources managed by lazy_space_alloc(). (We must end parallel
1601 * mode/free shared memory before updating index statistics. We cannot
1602 * write while in parallel mode.)
1604 lazy_space_free(vacrel);
1606 /* Update index statistics */
1607 if (vacrel->nindexes > 0 && vacrel->do_index_cleanup)
1608 update_index_statistics(vacrel);
1611 * If the table has no indexes and at least one heap page was vacuumed,
1612 * make the log report that lazy_vacuum_heap_rel would've made had there
1613 * been indexes (having indexes implies using the two pass strategy).
1615 * We deliberately don't do this in the case where there are indexes but
1616 * index vacuuming was bypassed. We make a similar report at the point
1617 * that index vacuuming is bypassed, but that's actually quite different
1618 * in one important sense: it shows information about work we _haven't_
1619 * done.
1621 * log_autovacuum output does things differently; it consistently presents
1622 * information about LP_DEAD items for the VACUUM as a whole. We always
1623 * report on each round of index and heap vacuuming separately, though.
1625 if (vacrel->nindexes == 0 && vacrel->lpdead_item_pages > 0)
1626 ereport(elevel,
1627 (errmsg("\"%s\": removed %lld dead item identifiers in %u pages",
1628 vacrel->relname, (long long) vacrel->lpdead_items,
1629 vacrel->lpdead_item_pages)));
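/*
 * A hypothetical example of the line this produces under VACUUM VERBOSE
 * for a table with no indexes (relation name and counts invented):
 *
 *   INFO:  "foo": removed 1280 dead item identifiers in 32 pages
 */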
1631 initStringInfo(&buf);
1632 appendStringInfo(&buf,
1633 _("%lld dead row versions cannot be removed yet, oldest xmin: %u\n"),
1634 (long long) vacrel->new_dead_tuples, vacrel->OldestXmin);
1635 appendStringInfo(&buf, ngettext("%u page removed.\n",
1636 "%u pages removed.\n",
1637 vacrel->pages_removed),
1638 vacrel->pages_removed);
1639 appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
1640 "Skipped %u pages due to buffer pins, ",
1641 vacrel->pinskipped_pages),
1642 vacrel->pinskipped_pages);
1643 appendStringInfo(&buf, ngettext("%u frozen page.\n",
1644 "%u frozen pages.\n",
1645 vacrel->frozenskipped_pages),
1646 vacrel->frozenskipped_pages);
1647 appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
1649 ereport(elevel,
1650 (errmsg("\"%s\": found %lld removable, %lld nonremovable row versions in %u out of %u pages",
1651 vacrel->relname,
1652 (long long) vacrel->tuples_deleted,
1653 (long long) vacrel->num_tuples, vacrel->scanned_pages,
1654 nblocks),
1655 errdetail_internal("%s", buf.data)));
1656 pfree(buf.data);
1660 * lazy_scan_prune() -- lazy_scan_heap() pruning and freezing.
1662 * Caller must hold pin and buffer cleanup lock on the buffer.
1664 * Prior to PostgreSQL 14 there were very rare cases where heap_page_prune()
1665 * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about
1666 * whether or not a tuple should be considered DEAD. This happened when an
1667 * inserting transaction concurrently aborted (after our heap_page_prune()
1668 * call, before our HeapTupleSatisfiesVacuum() call). There was rather a lot
1669 * of complexity just so we could deal with tuples that were DEAD to VACUUM,
1670 * but nevertheless were left with storage after pruning.
1672 * The approach we take now is to restart pruning when the race condition is
1673 * detected. This allows heap_page_prune() to prune the tuples inserted by
1674 * the now-aborted transaction. This is a little crude, but it guarantees
1675 * that any items that make it into the dead_tuples array are simple LP_DEAD
1676 * line pointers, and that every remaining item with tuple storage is
1677 * considered as a candidate for freezing.
1679 static void
1680 lazy_scan_prune(LVRelState *vacrel,
1681 Buffer buf,
1682 BlockNumber blkno,
1683 Page page,
1684 GlobalVisState *vistest,
1685 LVPagePruneState *prunestate)
1687 Relation rel = vacrel->rel;
1688 OffsetNumber offnum,
1689 maxoff;
1690 ItemId itemid;
1691 HeapTupleData tuple;
1692 HTSV_Result res;
1693 int tuples_deleted,
1694 lpdead_items,
1695 new_dead_tuples,
1696 num_tuples,
1697 live_tuples;
1698 int nfrozen;
1699 OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
1700 xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage];
1702 maxoff = PageGetMaxOffsetNumber(page);
1704 retry:
1706 /* Initialize (or reset) page-level counters */
1707 tuples_deleted = 0;
1708 lpdead_items = 0;
1709 new_dead_tuples = 0;
1710 num_tuples = 0;
1711 live_tuples = 0;
1714 * Prune all HOT-update chains in this page.
1716 * We count tuples removed by the pruning step as tuples_deleted. Its
1717 * final value can be thought of as the number of tuples that have been
1718 * deleted from the table. It should not be confused with lpdead_items;
1719 * lpdead_items's final value can be thought of as the number of tuples
1720 * that were deleted from indexes.
1722 tuples_deleted = heap_page_prune(rel, buf, vistest,
1723 InvalidTransactionId, 0, false,
1724 &vacrel->offnum);
1727 * Now scan the page to collect LP_DEAD items and check for tuples
1728 * requiring freezing among remaining tuples with storage
1730 prunestate->hastup = false;
1731 prunestate->has_lpdead_items = false;
1732 prunestate->all_visible = true;
1733 prunestate->all_frozen = true;
1734 prunestate->visibility_cutoff_xid = InvalidTransactionId;
1735 nfrozen = 0;
1737 for (offnum = FirstOffsetNumber;
1738 offnum <= maxoff;
1739 offnum = OffsetNumberNext(offnum))
1741 bool tuple_totally_frozen;
1744 * Set the offset number so that we can display it along with any
1745 * error that occurred while processing this tuple.
1747 vacrel->offnum = offnum;
1748 itemid = PageGetItemId(page, offnum);
1750 if (!ItemIdIsUsed(itemid))
1751 continue;
1753 /* Redirect items mustn't be touched */
1754 if (ItemIdIsRedirected(itemid))
1756 prunestate->hastup = true; /* page won't be truncatable */
1757 continue;
1761 * LP_DEAD items are processed outside of the loop.
1763 * Note that we deliberately don't set hastup=true in the case of an
1764 * LP_DEAD item here, which is not how lazy_check_needs_freeze() or
1765 * count_nondeletable_pages() do it -- they only consider pages empty
1766 * when they only have LP_UNUSED items, which is important for
1767 * correctness.
1769 * Our assumption is that any LP_DEAD items we encounter here will
1770 * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually
1771 * call count_nondeletable_pages(). In any case our opinion of
1772 * whether or not a page 'hastup' (which is how our caller sets its
1773 * vacrel->nonempty_pages value) is inherently race-prone. It must be
1774 * treated as advisory/unreliable, so we might as well be slightly
1775 * optimistic.
1777 if (ItemIdIsDead(itemid))
1779 deadoffsets[lpdead_items++] = offnum;
1780 prunestate->all_visible = false;
1781 prunestate->has_lpdead_items = true;
1782 continue;
1785 Assert(ItemIdIsNormal(itemid));
1787 ItemPointerSet(&(tuple.t_self), blkno, offnum);
1788 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1789 tuple.t_len = ItemIdGetLength(itemid);
1790 tuple.t_tableOid = RelationGetRelid(rel);
1793 * DEAD tuples are almost always pruned into LP_DEAD line pointers by
1794 * heap_page_prune(), but it's possible that the tuple state changed
1795 * since heap_page_prune() looked. Handle that here by restarting.
1796 * (See comments at the top of function for a full explanation.)
1798 res = HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf);
1800 if (unlikely(res == HEAPTUPLE_DEAD))
1801 goto retry;
1804 * The criteria for counting a tuple as live in this block need to
1805 * match what analyze.c's acquire_sample_rows() does, otherwise VACUUM
1806 * and ANALYZE may produce wildly different reltuples values, e.g.
1807 * when there are many recently-dead tuples.
1809 * The logic here is a bit simpler than acquire_sample_rows(), as
1810 * VACUUM can't run inside a transaction block, which makes some cases
1811 * impossible (e.g. in-progress insert from the same transaction).
1813 * We treat LP_DEAD items a little differently, too -- we don't count
1814 * them as dead_tuples at all (we only consider new_dead_tuples). The
1815 * outcome is no different because we assume that any LP_DEAD items we
1816 * encounter here will become LP_UNUSED inside lazy_vacuum_heap_page()
1817 * before we report anything to the stats collector. (Cases where we
1818 * bypass index vacuuming will violate our assumption, but the overall
1819 * impact of that should be negligible.)
1821 switch (res)
1823 case HEAPTUPLE_LIVE:
1826 * Count it as live. Not only is this natural, but it's also
1827 * what acquire_sample_rows() does.
1829 live_tuples++;
1832 * Is the tuple definitely visible to all transactions?
1834 * NB: Like with per-tuple hint bits, we can't set the
1835 * PD_ALL_VISIBLE flag if the inserter committed
1836 * asynchronously. See SetHintBits for more info. Check that
1837 * the tuple is hinted xmin-committed because of that.
1839 if (prunestate->all_visible)
1841 TransactionId xmin;
1843 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1845 prunestate->all_visible = false;
1846 break;
1850 * The inserter definitely committed. But is it old enough
1851 * that everyone sees it as committed?
1853 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1854 if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
1856 prunestate->all_visible = false;
1857 break;
1860 /* Track newest xmin on page. */
1861 if (TransactionIdFollows(xmin, prunestate->visibility_cutoff_xid))
1862 prunestate->visibility_cutoff_xid = xmin;
1864 break;
1865 case HEAPTUPLE_RECENTLY_DEAD:
1868 * If tuple is recently deleted then we must not remove it
1869 * from relation. (We only remove items that are LP_DEAD from
1870 * pruning.)
1872 new_dead_tuples++;
1873 prunestate->all_visible = false;
1874 break;
1875 case HEAPTUPLE_INSERT_IN_PROGRESS:
1878 * We do not count these rows as live, because we expect the
1879 * inserting transaction to update the counters at commit, and
1880 * we assume that will happen only after we report our
1881 * results. This assumption is a bit shaky, but it is what
1882 * acquire_sample_rows() does, so be consistent.
1884 prunestate->all_visible = false;
1885 break;
1886 case HEAPTUPLE_DELETE_IN_PROGRESS:
1887 /* This is an expected case during concurrent vacuum */
1888 prunestate->all_visible = false;
1891 * Count such rows as live. As above, we assume the deleting
1892 * transaction will commit and update the counters after we
1893 * report.
1895 live_tuples++;
1896 break;
1897 default:
1898 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1899 break;
1903 * Non-removable tuple (i.e. tuple with storage).
1905 * Check tuple left behind after pruning to see if it needs to be frozen
1906 * now.
1908 num_tuples++;
1909 prunestate->hastup = true;
1910 if (heap_prepare_freeze_tuple(tuple.t_data,
1911 vacrel->relfrozenxid,
1912 vacrel->relminmxid,
1913 vacrel->FreezeLimit,
1914 vacrel->MultiXactCutoff,
1915 &frozen[nfrozen],
1916 &tuple_totally_frozen))
1918 /* Will execute freeze below */
1919 frozen[nfrozen++].offset = offnum;
1923 * If tuple is not frozen (and not about to become frozen) then caller
1924 * had better not go on to set this page's VM bit
1926 if (!tuple_totally_frozen)
1927 prunestate->all_frozen = false;
1931 * We have now divided every item on the page into either an LP_DEAD item
1932 * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple
1933 * that remains and needs to be considered for freezing now (LP_UNUSED and
1934 * LP_REDIRECT items also remain, but are of no further interest to us).
1936 vacrel->offnum = InvalidOffsetNumber;
1939 * Consider the need to freeze any items with tuple storage from the page
1940 * first (arbitrary)
1942 if (nfrozen > 0)
1944 Assert(prunestate->hastup);
1947 * At least one tuple with storage needs to be frozen -- execute that
1948 * now.
1950 * If we need to freeze any tuples we'll mark the buffer dirty, and
1951 * write a WAL record recording the changes. We must log the changes
1952 * to be crash-safe against future truncation of CLOG.
1954 START_CRIT_SECTION();
1956 MarkBufferDirty(buf);
1958 /* execute collected freezes */
1959 for (int i = 0; i < nfrozen; i++)
1961 HeapTupleHeader htup;
1963 itemid = PageGetItemId(page, frozen[i].offset);
1964 htup = (HeapTupleHeader) PageGetItem(page, itemid);
1966 heap_execute_freeze_tuple(htup, &frozen[i]);
1969 /* Now WAL-log freezing if necessary */
1970 if (RelationNeedsWAL(vacrel->rel))
1972 XLogRecPtr recptr;
1974 recptr = log_heap_freeze(vacrel->rel, buf, vacrel->FreezeLimit,
1975 frozen, nfrozen);
1976 PageSetLSN(page, recptr);
1979 END_CRIT_SECTION();
1983 * The second pass over the heap can also set visibility map bits, using
1984 * the same approach. This is important when the table frequently has a
1985 * few old LP_DEAD items on each page by the time we get to it (typically
1986 * because past opportunistic pruning operations freed some non-HOT
1987 * tuples).
1989 * VACUUM will call heap_page_is_all_visible() during the second pass over
1990 * the heap to determine all_visible and all_frozen for the page -- this
1991 * is a specialized version of the logic from this function. Now that
1992 * we've finished pruning and freezing, make sure that we're in total
1993 * agreement with heap_page_is_all_visible() using an assertion.
1995 #ifdef USE_ASSERT_CHECKING
1996 /* Note that all_frozen value does not matter when !all_visible */
1997 if (prunestate->all_visible)
1999 TransactionId cutoff;
2000 bool all_frozen;
2002 if (!heap_page_is_all_visible(vacrel, buf, &cutoff, &all_frozen))
2003 Assert(false);
2005 Assert(lpdead_items == 0);
2006 Assert(prunestate->all_frozen == all_frozen);
2009 * It's possible that we froze tuples and made the page's XID cutoff
2010 * (for recovery conflict purposes) FrozenTransactionId. This is okay
2011 * because visibility_cutoff_xid will be logged by our caller in a
2012 * moment.
2014 Assert(cutoff == FrozenTransactionId ||
2015 cutoff == prunestate->visibility_cutoff_xid);
2017 #endif
2020 * Now save details of the LP_DEAD items from the page in the dead_tuples
2021 * array. Also record that page has dead items in per-page prunestate.
2023 if (lpdead_items > 0)
2025 LVDeadTuples *dead_tuples = vacrel->dead_tuples;
2026 ItemPointerData tmp;
2028 Assert(!prunestate->all_visible);
2029 Assert(prunestate->has_lpdead_items);
2031 vacrel->lpdead_item_pages++;
2033 ItemPointerSetBlockNumber(&tmp, blkno);
2035 for (int i = 0; i < lpdead_items; i++)
2037 ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]);
2038 dead_tuples->itemptrs[dead_tuples->num_tuples++] = tmp;
2041 Assert(dead_tuples->num_tuples <= dead_tuples->max_tuples);
2042 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
2043 dead_tuples->num_tuples);
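/*
 * Because heap blocks are visited in ascending order and offsets are
 * appended in ascending order within each block, dead_tuples->itemptrs
 * stays sorted in TID order -- something the lazy_tid_reaped() callback
 * used during index vacuuming takes advantage of when searching the array.
 */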
2046 /* Finally, add page-local counts to whole-VACUUM counts */
2047 vacrel->tuples_deleted += tuples_deleted;
2048 vacrel->lpdead_items += lpdead_items;
2049 vacrel->new_dead_tuples += new_dead_tuples;
2050 vacrel->num_tuples += num_tuples;
2051 vacrel->live_tuples += live_tuples;
2055 * Remove the collected garbage tuples from the table and its indexes.
2057 * We may choose to bypass index vacuuming at this point, though only when the
2058 * ongoing VACUUM operation will definitely only have one index scan/round of
2059 * index vacuuming. Caller indicates whether or not this is such a VACUUM
2060 * operation using 'onecall' argument.
2062 * In rare emergencies, the ongoing VACUUM operation can be made to skip both
2063 * index vacuuming and index cleanup at the point we're called. This avoids
2064 * having the whole system refuse to allocate further XIDs/MultiXactIds due to
2065 * wraparound.
2067 static void
2068 lazy_vacuum(LVRelState *vacrel, bool onecall)
2070 bool do_bypass_optimization;
2072 /* Should not end up here with no indexes */
2073 Assert(vacrel->nindexes > 0);
2074 Assert(!IsParallelWorker());
2075 Assert(vacrel->lpdead_item_pages > 0);
2077 if (!vacrel->do_index_vacuuming)
2079 Assert(!vacrel->do_index_cleanup);
2080 vacrel->dead_tuples->num_tuples = 0;
2081 return;
2085 * Consider bypassing index vacuuming (and heap vacuuming) entirely.
2087 * We currently only do this in cases where the number of LP_DEAD items
2088 * for the entire VACUUM operation is close to zero. This avoids sharp
2089 * discontinuities in the duration and overhead of successive VACUUM
2090 * operations that run against the same table with a fixed workload.
2091 * Ideally, successive VACUUM operations will behave as if there are
2092 * exactly zero LP_DEAD items in cases where there are close to zero.
2094 * This is likely to be helpful with a table that is continually affected
2095 * by UPDATEs that can mostly apply the HOT optimization, but occasionally
2096 * have small aberrations that lead to just a few heap pages retaining
2097 * only one or two LP_DEAD items. This is pretty common; even when the
2098 * DBA goes out of their way to make UPDATEs use HOT, it is practically
2099 * impossible to predict whether HOT will be applied in 100% of cases.
2100 * It's far easier to ensure that 99%+ of all UPDATEs against a table use
2101 * HOT through careful tuning.
2103 do_bypass_optimization = false;
2104 if (onecall && vacrel->rel_pages > 0)
2106 BlockNumber threshold;
2108 Assert(vacrel->num_index_scans == 0);
2109 Assert(vacrel->lpdead_items == vacrel->dead_tuples->num_tuples);
2110 Assert(vacrel->do_index_vacuuming);
2111 Assert(vacrel->do_index_cleanup);
2114 * This crossover point at which we'll start to do index vacuuming is
2115 * expressed as a percentage of the total number of heap pages in the
2116 * table that are known to have at least one LP_DEAD item. This is
2117 * much more important than the total number of LP_DEAD items, since
2118 * it's a proxy for the number of heap pages whose visibility map bits
2119 * cannot be set on account of bypassing index and heap vacuuming.
2121 * We apply one further precautionary test: the space currently used
2122 * to store the TIDs (TIDs that now all point to LP_DEAD items) must
2123 * not exceed 32MB. This limits the risk that we will bypass index
2124 * vacuuming again and again until eventually there is a VACUUM whose
2125 * dead_tuples space is not CPU cache resident.
2127 * We don't take any special steps to remember the LP_DEAD items (such
2128 * as counting them in new_dead_tuples report to the stats collector)
2129 * when the optimization is applied. Though the accounting used in
2130 * analyze.c's acquire_sample_rows() will recognize the same LP_DEAD
2131 * items as dead rows in its own stats collector report, that's okay.
2132 * The discrepancy should be negligible. If this optimization is ever
2133 * expanded to cover more cases then this may need to be reconsidered.
2135 threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES;
2136 do_bypass_optimization =
2137 (vacrel->lpdead_item_pages < threshold &&
2138 vacrel->lpdead_items < MAXDEADTUPLES(32L * 1024L * 1024L));
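/*
 * Worked example, assuming BYPASS_THRESHOLD_PAGES is 0.02 and 6-byte
 * ItemPointerData entries: for a 100,000 page table the bypass is taken
 * only when fewer than 2,000 pages contain LP_DEAD items and the TID
 * array holds fewer than roughly 5.5 million TIDs (32MB worth).
 */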
2141 if (do_bypass_optimization)
2144 * There are almost zero TIDs. Behave as if there were precisely
2145 * zero: bypass index vacuuming, but do index cleanup.
2147 * We expect that the ongoing VACUUM operation will finish very
2148 * quickly, so there is no point in considering speeding up as a
2149 * failsafe against wraparound failure. (Index cleanup is expected to
2150 * finish very quickly in cases where there were no ambulkdelete()
2151 * calls.)
2153 vacrel->do_index_vacuuming = false;
2154 ereport(elevel,
2155 (errmsg("\"%s\": index scan bypassed: %u pages from table (%.2f%% of total) have %lld dead item identifiers",
2156 vacrel->relname, vacrel->lpdead_item_pages,
2157 100.0 * vacrel->lpdead_item_pages / vacrel->rel_pages,
2158 (long long) vacrel->lpdead_items)));
2160 else if (lazy_vacuum_all_indexes(vacrel))
2163 * We successfully completed a round of index vacuuming. Do related
2164 * heap vacuuming now.
2166 lazy_vacuum_heap_rel(vacrel);
2168 else
2171 * Failsafe case.
2173 * We attempted index vacuuming, but didn't finish a full round/full
2174 * index scan. This happens when relfrozenxid or relminmxid is too
2175 * far in the past.
2177 * From this point on the VACUUM operation will do no further index
2178 * vacuuming or heap vacuuming. This VACUUM operation won't end up
2179 * back here again.
2181 Assert(vacrel->do_failsafe);
2185 * Forget the LP_DEAD items that we just vacuumed (or just decided to not
2186 * vacuum)
2188 vacrel->dead_tuples->num_tuples = 0;
2192 * lazy_vacuum_all_indexes() -- Main entry for index vacuuming
2194 * Returns true in the common case when all indexes were successfully
2195 * vacuumed. Returns false in rare cases where we determined that the ongoing
2196 * VACUUM operation is at risk of taking too long to finish, leading to
2197 * wraparound failure.
2199 static bool
2200 lazy_vacuum_all_indexes(LVRelState *vacrel)
2202 bool allindexes = true;
2204 Assert(!IsParallelWorker());
2205 Assert(vacrel->nindexes > 0);
2206 Assert(vacrel->do_index_vacuuming);
2207 Assert(vacrel->do_index_cleanup);
2208 Assert(TransactionIdIsNormal(vacrel->relfrozenxid));
2209 Assert(MultiXactIdIsValid(vacrel->relminmxid));
2211 /* Precheck for XID wraparound emergencies */
2212 if (lazy_check_wraparound_failsafe(vacrel))
2214 /* Wraparound emergency -- don't even start an index scan */
2215 return false;
2218 /* Report that we are now vacuuming indexes */
2219 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2220 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
2222 if (!ParallelVacuumIsActive(vacrel))
2224 for (int idx = 0; idx < vacrel->nindexes; idx++)
2226 Relation indrel = vacrel->indrels[idx];
2227 IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2229 vacrel->indstats[idx] =
2230 lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples,
2231 vacrel);
2233 if (lazy_check_wraparound_failsafe(vacrel))
2235 /* Wraparound emergency -- end current index scan */
2236 allindexes = false;
2237 break;
2241 else
2243 /* Outsource everything to parallel variant */
2244 do_parallel_lazy_vacuum_all_indexes(vacrel);
2247 * Do a postcheck to consider applying wraparound failsafe now. Note
2248 * that parallel VACUUM only gets the precheck and this postcheck.
2250 if (lazy_check_wraparound_failsafe(vacrel))
2251 allindexes = false;
2255 * We delete all LP_DEAD items from the first heap pass in all indexes on
2256 * each call here (except calls where we choose to do the failsafe). This
2257 * makes the next call to lazy_vacuum_heap_rel() safe (except in the event
2258 * of the failsafe triggering, which prevents the next call from taking
2259 * place).
2261 Assert(vacrel->num_index_scans > 0 ||
2262 vacrel->dead_tuples->num_tuples == vacrel->lpdead_items);
2263 Assert(allindexes || vacrel->do_failsafe);
2266 * Increase and report the number of index scans.
2268 * We deliberately include the case where we started a round of bulk
2269 * deletes that we weren't able to finish due to the failsafe triggering.
2271 vacrel->num_index_scans++;
2272 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS,
2273 vacrel->num_index_scans);
2275 return allindexes;
2279 * lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy
2281 * This routine marks LP_DEAD items in vacrel->dead_tuples array as LP_UNUSED.
2282 * Pages where lazy_scan_prune() never recorded any LP_DEAD items are not
2283 * visited at all.
2285 * We may also be able to truncate the line pointer array of the heap pages we
2286 * visit. If there is a contiguous group of LP_UNUSED items at the end of the
2287 * array, it can be reclaimed as free space. These LP_UNUSED items usually
2288 * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from
2289 * each page to LP_UNUSED, and then consider if it's possible to truncate the
2290 * page's line pointer array).
2292 * Note: the reason for doing this as a second pass is we cannot remove the
2293 * tuples until we've removed their index entries, and we want to process
2294 * index entry removal in batches as large as possible.
2296 static void
2297 lazy_vacuum_heap_rel(LVRelState *vacrel)
2299 int tupindex;
2300 BlockNumber vacuumed_pages;
2301 PGRUsage ru0;
2302 Buffer vmbuffer = InvalidBuffer;
2303 LVSavedErrInfo saved_err_info;
2305 Assert(vacrel->do_index_vacuuming);
2306 Assert(vacrel->do_index_cleanup);
2307 Assert(vacrel->num_index_scans > 0);
2309 /* Report that we are now vacuuming the heap */
2310 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2311 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
2313 /* Update error traceback information */
2314 update_vacuum_error_info(vacrel, &saved_err_info,
2315 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
2316 InvalidBlockNumber, InvalidOffsetNumber);
2318 pg_rusage_init(&ru0);
2319 vacuumed_pages = 0;
2321 tupindex = 0;
2322 while (tupindex < vacrel->dead_tuples->num_tuples)
2324 BlockNumber tblk;
2325 Buffer buf;
2326 Page page;
2327 Size freespace;
2329 vacuum_delay_point();
2331 tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]);
2332 vacrel->blkno = tblk;
2333 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
2334 vacrel->bstrategy);
2335 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2336 tupindex = lazy_vacuum_heap_page(vacrel, tblk, buf, tupindex,
2337 &vmbuffer);
2339 /* Now that we've vacuumed the page, record its available space */
2340 page = BufferGetPage(buf);
2341 freespace = PageGetHeapFreeSpace(page);
2343 UnlockReleaseBuffer(buf);
2344 RecordPageWithFreeSpace(vacrel->rel, tblk, freespace);
2345 vacuumed_pages++;
2348 /* Clear the block number information */
2349 vacrel->blkno = InvalidBlockNumber;
2351 if (BufferIsValid(vmbuffer))
2353 ReleaseBuffer(vmbuffer);
2354 vmbuffer = InvalidBuffer;
2358 * We set all LP_DEAD items from the first heap pass to LP_UNUSED during
2359 * the second heap pass. No more, no less.
2361 Assert(vacrel->num_index_scans > 1 ||
2362 (tupindex == vacrel->lpdead_items &&
2363 vacuumed_pages == vacrel->lpdead_item_pages));
2365 ereport(elevel,
2366 (errmsg("\"%s\": removed %d dead item identifiers in %u pages",
2367 vacrel->relname, tupindex, vacuumed_pages),
2368 errdetail_internal("%s", pg_rusage_show(&ru0))));
2370 /* Revert to the previous phase information for error traceback */
2371 restore_vacuum_error_info(vacrel, &saved_err_info);
2375 * lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the
2376 * vacrel->dead_tuples array.
2378 * Caller must have an exclusive buffer lock on the buffer (though a
2379 * super-exclusive lock is also acceptable).
2381 * tupindex is the index in vacrel->dead_tuples of the first dead tuple for
2382 * this page. We assume the rest follow sequentially. The return value is
2383 * the first tupindex after the tuples of this page.
2385 * Prior to PostgreSQL 14 there were rare cases where this routine had to set
2386 * tuples with storage to unused. These days it is strictly responsible for
2387 * marking LP_DEAD stub line pointers as unused. This only happens for those
2388 * LP_DEAD items on the page that were determined to be LP_DEAD items back
2389 * when the same page was visited by lazy_scan_prune() (i.e. those whose TID
2390 * was recorded in the dead_tuples array).
2392 static int
2393 lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
2394 int tupindex, Buffer *vmbuffer)
2396 LVDeadTuples *dead_tuples = vacrel->dead_tuples;
2397 Page page = BufferGetPage(buffer);
2398 OffsetNumber unused[MaxHeapTuplesPerPage];
2399 int uncnt = 0;
2400 TransactionId visibility_cutoff_xid;
2401 bool all_frozen;
2402 LVSavedErrInfo saved_err_info;
2404 Assert(vacrel->nindexes == 0 || vacrel->do_index_vacuuming);
2406 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
2408 /* Update error traceback information */
2409 update_vacuum_error_info(vacrel, &saved_err_info,
2410 VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno,
2411 InvalidOffsetNumber);
2413 START_CRIT_SECTION();
2415 for (; tupindex < dead_tuples->num_tuples; tupindex++)
2417 BlockNumber tblk;
2418 OffsetNumber toff;
2419 ItemId itemid;
2421 tblk = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]);
2422 if (tblk != blkno)
2423 break; /* past end of tuples for this block */
2424 toff = ItemPointerGetOffsetNumber(&dead_tuples->itemptrs[tupindex]);
2425 itemid = PageGetItemId(page, toff);
2427 Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid));
2428 ItemIdSetUnused(itemid);
2429 unused[uncnt++] = toff;
2432 Assert(uncnt > 0);
2434 /* Attempt to truncate line pointer array now */
2435 PageTruncateLinePointerArray(page);
2438 * Mark buffer dirty before we write WAL.
2440 MarkBufferDirty(buffer);
2442 /* XLOG stuff */
2443 if (RelationNeedsWAL(vacrel->rel))
2445 xl_heap_vacuum xlrec;
2446 XLogRecPtr recptr;
2448 xlrec.nunused = uncnt;
2450 XLogBeginInsert();
2451 XLogRegisterData((char *) &xlrec, SizeOfHeapVacuum);
2453 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2454 XLogRegisterBufData(0, (char *) unused, uncnt * sizeof(OffsetNumber));
2456 recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VACUUM);
2458 PageSetLSN(page, recptr);
2462 * End critical section, so we safely can do visibility tests (which
2463 * possibly need to perform IO and allocate memory!). If we crash now the
2464 * page (including the corresponding vm bit) might not be marked all
2465 * visible, but that's fine. A later vacuum will fix that.
2467 END_CRIT_SECTION();
2470 * Now that we have removed the LP_DEAD items from the page, once again
2471 * check if the page has become all-visible. The page is already marked
2472 * dirty, exclusively locked, and, if needed, a full page image has been
2473 * emitted.
2475 if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
2476 &all_frozen))
2477 PageSetAllVisible(page);
2480 * All the changes to the heap page have been done. If the all-visible
2481 * flag is now set, also set the VM all-visible bit (and, if possible, the
2482 * all-frozen bit) unless this has already been done previously.
2484 if (PageIsAllVisible(page))
2486 uint8 flags = 0;
2487 uint8 vm_status = visibilitymap_get_status(vacrel->rel,
2488 blkno, vmbuffer);
2490 /* Set the VM all-visible and all-frozen bits in flags, as needed */
2491 if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
2492 flags |= VISIBILITYMAP_ALL_VISIBLE;
2493 if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
2494 flags |= VISIBILITYMAP_ALL_FROZEN;
2496 Assert(BufferIsValid(*vmbuffer));
2497 if (flags != 0)
2498 visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
2499 *vmbuffer, visibility_cutoff_xid, flags);
2502 /* Revert to the previous phase information for error traceback */
2503 restore_vacuum_error_info(vacrel, &saved_err_info);
2504 return tupindex;
2508 * lazy_check_needs_freeze() -- scan page to see if any tuples
2509 * need to be cleaned to avoid wraparound
2511 * Returns true if the page needs to be vacuumed using cleanup lock.
2512 * Also returns a flag indicating whether page contains any tuples at all.
2514 static bool
2515 lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel)
2517 Page page = BufferGetPage(buf);
2518 OffsetNumber offnum,
2519 maxoff;
2520 HeapTupleHeader tupleheader;
2522 *hastup = false;
2525 * New and empty pages, obviously, don't contain tuples. We could make
2526 * sure that the page is registered in the FSM, but it doesn't seem worth
2527 * waiting for a cleanup lock just for that, especially because it's
2528 * likely that the pin holder will do so.
2530 if (PageIsNew(page) || PageIsEmpty(page))
2531 return false;
2533 maxoff = PageGetMaxOffsetNumber(page);
2534 for (offnum = FirstOffsetNumber;
2535 offnum <= maxoff;
2536 offnum = OffsetNumberNext(offnum))
2538 ItemId itemid;
2541 * Set the offset number so that we can display it along with any
2542 * error that occurred while processing this tuple.
2544 vacrel->offnum = offnum;
2545 itemid = PageGetItemId(page, offnum);
2547 /* this should match hastup test in count_nondeletable_pages() */
2548 if (ItemIdIsUsed(itemid))
2549 *hastup = true;
2551 /* dead and redirect items never need freezing */
2552 if (!ItemIdIsNormal(itemid))
2553 continue;
2555 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
2557 if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit,
2558 vacrel->MultiXactCutoff, buf))
2559 break;
2560 } /* scan along page */
2562 /* Clear the offset information once we have processed the given page. */
2563 vacrel->offnum = InvalidOffsetNumber;
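/*
 * If the loop above broke out early, offnum never advanced past maxoff,
 * so returning (offnum <= maxoff) reports that a tuple needing freezing
 * was found; falling off the end of the loop leaves offnum > maxoff.
 */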
2565 return (offnum <= maxoff);
2569 * Trigger the failsafe to avoid wraparound failure when vacrel table has a
2570 * relfrozenxid and/or relminmxid that is dangerously far in the past.
2571 * Triggering the failsafe makes the ongoing VACUUM bypass any further index
2572 * vacuuming and heap vacuuming. Truncating the heap is also bypassed.
2574 * Any remaining work (work that VACUUM cannot just bypass) is typically sped
2575 * up when the failsafe triggers. VACUUM stops applying any cost-based delay
2576 * that it started out with.
2578 * Returns true when failsafe has been triggered.
2580 static bool
2581 lazy_check_wraparound_failsafe(LVRelState *vacrel)
2583 /* Don't warn more than once per VACUUM */
2584 if (vacrel->do_failsafe)
2585 return true;
2587 if (unlikely(vacuum_xid_failsafe_check(vacrel->relfrozenxid,
2588 vacrel->relminmxid)))
2590 Assert(vacrel->do_index_vacuuming);
2591 Assert(vacrel->do_index_cleanup);
2593 vacrel->do_index_vacuuming = false;
2594 vacrel->do_index_cleanup = false;
2595 vacrel->do_failsafe = true;
2597 ereport(WARNING,
2598 (errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans",
2599 get_database_name(MyDatabaseId),
2600 vacrel->relnamespace,
2601 vacrel->relname,
2602 vacrel->num_index_scans),
2603 errdetail("table's relfrozenxid or relminmxid is too far in the past"),
2604 errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n"
2605 "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs.")));
2607 /* Stop applying cost limits from this point on */
2608 VacuumCostActive = false;
2609 VacuumCostBalance = 0;
2611 return true;
2614 return false;
2618 * Perform lazy_vacuum_all_indexes() steps in parallel
2620 static void
2621 do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel)
2623 /* Tell parallel workers to do index vacuuming */
2624 vacrel->lps->lvshared->for_cleanup = false;
2625 vacrel->lps->lvshared->first_time = false;
2628 * We can only provide an approximate value of num_heap_tuples in vacuum
2629 * cases.
2631 vacrel->lps->lvshared->reltuples = vacrel->old_live_tuples;
2632 vacrel->lps->lvshared->estimated_count = true;
2634 do_parallel_vacuum_or_cleanup(vacrel,
2635 vacrel->lps->nindexes_parallel_bulkdel);
2639 * Perform lazy_cleanup_all_indexes() steps in parallel
2641 static void
2642 do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel)
2644 int nworkers;
2647 * If parallel vacuum is active we perform index cleanup with parallel
2648 * workers.
2650 * Tell parallel workers to do index cleanup.
2652 vacrel->lps->lvshared->for_cleanup = true;
2653 vacrel->lps->lvshared->first_time = (vacrel->num_index_scans == 0);
2656 * Now we can provide a better estimate of total number of surviving
2657 * tuples (we assume indexes are more interested in that than in the
2658 * number of nominally live tuples).
2660 vacrel->lps->lvshared->reltuples = vacrel->new_rel_tuples;
2661 vacrel->lps->lvshared->estimated_count =
2662 (vacrel->tupcount_pages < vacrel->rel_pages);
2664 /* Determine the number of parallel workers to launch */
2665 if (vacrel->lps->lvshared->first_time)
2666 nworkers = vacrel->lps->nindexes_parallel_cleanup +
2667 vacrel->lps->nindexes_parallel_condcleanup;
2668 else
2669 nworkers = vacrel->lps->nindexes_parallel_cleanup;
2671 do_parallel_vacuum_or_cleanup(vacrel, nworkers);
2675 * Perform index vacuum or index cleanup with parallel workers. This function
2676 * must be used by the parallel vacuum leader process. The caller must set
2677 * lps->lvshared->for_cleanup to indicate whether to perform vacuum or
2678 * cleanup.
2680 static void
2681 do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers)
2683 LVParallelState *lps = vacrel->lps;
2685 Assert(!IsParallelWorker());
2686 Assert(ParallelVacuumIsActive(vacrel));
2687 Assert(vacrel->nindexes > 0);
2689 /* The leader process will participate */
2690 nworkers--;
2693 * It is possible that parallel context is initialized with fewer workers
2694 * than the number of indexes that need a separate worker in the current
2695 * phase, so we need to consider it. See compute_parallel_vacuum_workers.
2697 nworkers = Min(nworkers, lps->pcxt->nworkers);
2699 /* Setup the shared cost-based vacuum delay and launch workers */
2700 if (nworkers > 0)
2702 if (vacrel->num_index_scans > 0)
2704 /* Reset the parallel index processing counter */
2705 pg_atomic_write_u32(&(lps->lvshared->idx), 0);
2707 /* Reinitialize the parallel context to relaunch parallel workers */
2708 ReinitializeParallelDSM(lps->pcxt);
2712 * Set up shared cost balance and the number of active workers for
2713 * vacuum delay. We need to do this before launching workers as
2714 * otherwise, they might not see the updated values for these
2715 * parameters.
2717 pg_atomic_write_u32(&(lps->lvshared->cost_balance), VacuumCostBalance);
2718 pg_atomic_write_u32(&(lps->lvshared->active_nworkers), 0);
2721 * The number of workers can vary between bulkdelete and cleanup
2722 * phase.
2724 ReinitializeParallelWorkers(lps->pcxt, nworkers);
2726 LaunchParallelWorkers(lps->pcxt);
2728 if (lps->pcxt->nworkers_launched > 0)
2731 * Reset the leader backend's local cost values, since the remaining
2732 * balance from the heap scan was just transferred to the shared balance.
2734 VacuumCostBalance = 0;
2735 VacuumCostBalanceLocal = 0;
2737 /* Enable shared cost balance for leader backend */
2738 VacuumSharedCostBalance = &(lps->lvshared->cost_balance);
2739 VacuumActiveNWorkers = &(lps->lvshared->active_nworkers);
2742 if (lps->lvshared->for_cleanup)
2743 ereport(elevel,
2744 (errmsg(ngettext("launched %d parallel vacuum worker for index cleanup (planned: %d)",
2745 "launched %d parallel vacuum workers for index cleanup (planned: %d)",
2746 lps->pcxt->nworkers_launched),
2747 lps->pcxt->nworkers_launched, nworkers)));
2748 else
2749 ereport(elevel,
2750 (errmsg(ngettext("launched %d parallel vacuum worker for index vacuuming (planned: %d)",
2751 "launched %d parallel vacuum workers for index vacuuming (planned: %d)",
2752 lps->pcxt->nworkers_launched),
2753 lps->pcxt->nworkers_launched, nworkers)));
2756 /* Process the indexes that can be processed by only leader process */
2757 do_serial_processing_for_unsafe_indexes(vacrel, lps->lvshared);
2760 * Join as a parallel worker. The leader process alone processes all the
2761 * indexes in the case where no workers are launched.
2763 do_parallel_processing(vacrel, lps->lvshared);
2766 * Next, accumulate buffer and WAL usage. (This must wait for the workers
2767 * to finish, or we might get incomplete data.)
2769 if (nworkers > 0)
2771 /* Wait for all vacuum workers to finish */
2772 WaitForParallelWorkersToFinish(lps->pcxt);
2774 for (int i = 0; i < lps->pcxt->nworkers_launched; i++)
2775 InstrAccumParallelQuery(&lps->buffer_usage[i], &lps->wal_usage[i]);
2779 * Carry the shared balance value to heap scan and disable shared costing
2781 if (VacuumSharedCostBalance)
2783 VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance);
2784 VacuumSharedCostBalance = NULL;
2785 VacuumActiveNWorkers = NULL;
2790 * Index vacuum/cleanup routine used by the leader process and parallel
2791 * vacuum worker processes to process the indexes in parallel.
2793 static void
2794 do_parallel_processing(LVRelState *vacrel, LVShared *lvshared)
2797 * Increment the active worker count if we are able to launch any worker.
2799 if (VacuumActiveNWorkers)
2800 pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2802 /* Loop until all indexes are vacuumed */
2803 for (;;)
2805 int idx;
2806 LVSharedIndStats *shared_istat;
2807 Relation indrel;
2808 IndexBulkDeleteResult *istat;
2810 /* Get an index number to process */
2811 idx = pg_atomic_fetch_add_u32(&(lvshared->idx), 1);
2813 /* Done for all indexes? */
2814 if (idx >= vacrel->nindexes)
2815 break;
2817 /* Get the index statistics of this index from DSM */
2818 shared_istat = parallel_stats_for_idx(lvshared, idx);
2820 /* Skip indexes not participating in parallelism */
2821 if (shared_istat == NULL)
2822 continue;
2824 indrel = vacrel->indrels[idx];
2827 * Skip processing indexes that are unsafe for workers (these are
2828 * processed in do_serial_processing_for_unsafe_indexes() by leader)
2830 if (!parallel_processing_is_safe(indrel, lvshared))
2831 continue;
2833 /* Do vacuum or cleanup of the index */
2834 istat = (vacrel->indstats[idx]);
2835 vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2836 lvshared,
2837 shared_istat,
2838 vacrel);
2842 * We have completed the index vacuum so decrement the active worker
2843 * count.
2845 if (VacuumActiveNWorkers)
2846 pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
2850 * Vacuum or cleanup indexes that can be processed by only the leader process
2851 * because these indexes don't support parallel operation at that phase.
2853 static void
2854 do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared)
2856 Assert(!IsParallelWorker());
2859 * Increment the active worker count if we are able to launch any worker.
2861 if (VacuumActiveNWorkers)
2862 pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2864 for (int idx = 0; idx < vacrel->nindexes; idx++)
2866 LVSharedIndStats *shared_istat;
2867 Relation indrel;
2868 IndexBulkDeleteResult *istat;
2870 shared_istat = parallel_stats_for_idx(lvshared, idx);
2872 /* Skip indexes that have their stats stored in the DSM segment */
2873 if (shared_istat != NULL)
2874 continue;
2876 indrel = vacrel->indrels[idx];
2879 * We're only here for the unsafe indexes
2881 if (parallel_processing_is_safe(indrel, lvshared))
2882 continue;
2884 /* Do vacuum or cleanup of the index */
2885 istat = (vacrel->indstats[idx]);
2886 vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2887 lvshared,
2888 shared_istat,
2889 vacrel);
2893 * We have completed the index vacuum so decrement the active worker
2894 * count.
2896 if (VacuumActiveNWorkers)
2897 pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
2901 * Vacuum or cleanup index either by the leader process or by one of the
2902 * worker processes. After processing the index, this function copies the index
2903 * statistics returned from ambulkdelete and amvacuumcleanup to the DSM
2904 * segment.
2906 static IndexBulkDeleteResult *
2907 parallel_process_one_index(Relation indrel,
2908 IndexBulkDeleteResult *istat,
2909 LVShared *lvshared,
2910 LVSharedIndStats *shared_istat,
2911 LVRelState *vacrel)
2913 IndexBulkDeleteResult *istat_res;
2916 * Update the pointer to the corresponding bulk-deletion result if someone
2917 * has already updated it
2919 if (shared_istat && shared_istat->updated && istat == NULL)
2920 istat = &shared_istat->istat;
2922 /* Do vacuum or cleanup of the index */
2923 if (lvshared->for_cleanup)
2924 istat_res = lazy_cleanup_one_index(indrel, istat, lvshared->reltuples,
2925 lvshared->estimated_count, vacrel);
2926 else
2927 istat_res = lazy_vacuum_one_index(indrel, istat, lvshared->reltuples,
2928 vacrel);
2931 * Copy the index bulk-deletion result returned from ambulkdelete and
2932 * amvacuumcleanup to the DSM segment if it's the first cycle because they
2933 * allocate locally and it's possible that an index will be vacuumed by a
2934 * different vacuum process the next cycle. Copying the result normally
2935 * happens only the first time an index is vacuumed. For any additional
2936 * vacuum pass, we directly point to the result on the DSM segment and
2937 * pass it to vacuum index APIs so that workers can update it directly.
2939 * Since all vacuum workers write the bulk-deletion result at different
2940 * slots we can write them without locking.
2942 if (shared_istat && !shared_istat->updated && istat_res != NULL)
2944 memcpy(&shared_istat->istat, istat_res, sizeof(IndexBulkDeleteResult));
2945 shared_istat->updated = true;
2947 /* Free the locally-allocated bulk-deletion result */
2948 pfree(istat_res);
2950 /* return the pointer to the result from shared memory */
2951 return &shared_istat->istat;
2954 return istat_res;
2958 * lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
2960 static void
2961 lazy_cleanup_all_indexes(LVRelState *vacrel)
2963 Assert(!IsParallelWorker());
2964 Assert(vacrel->nindexes > 0);
2966 /* Report that we are now cleaning up indexes */
2967 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2968 PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
2970 if (!ParallelVacuumIsActive(vacrel))
2972 double reltuples = vacrel->new_rel_tuples;
2973 bool estimated_count =
2974 vacrel->tupcount_pages < vacrel->rel_pages;
2976 for (int idx = 0; idx < vacrel->nindexes; idx++)
2978 Relation indrel = vacrel->indrels[idx];
2979 IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2981 vacrel->indstats[idx] =
2982 lazy_cleanup_one_index(indrel, istat, reltuples,
2983 estimated_count, vacrel);
2986 else
2988 /* Outsource everything to parallel variant */
2989 do_parallel_lazy_cleanup_all_indexes(vacrel);
2994 * lazy_vacuum_one_index() -- vacuum index relation.
2996 * Delete all the index entries pointing to tuples listed in
2997 * dead_tuples, and update running statistics.
2999 * reltuples is the number of heap tuples to be passed to the
3000 * bulkdelete callback. It's always assumed to be estimated.
3002 * Returns bulk delete stats derived from input stats
3004 static IndexBulkDeleteResult *
3005 lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
3006 double reltuples, LVRelState *vacrel)
3008 IndexVacuumInfo ivinfo;
3009 PGRUsage ru0;
3010 LVSavedErrInfo saved_err_info;
3012 pg_rusage_init(&ru0);
3014 ivinfo.index = indrel;
3015 ivinfo.analyze_only = false;
3016 ivinfo.report_progress = false;
3017 ivinfo.estimated_count = true;
3018 ivinfo.message_level = elevel;
3019 ivinfo.num_heap_tuples = reltuples;
3020 ivinfo.strategy = vacrel->bstrategy;
3023 * Update error traceback information.
3025 * The index name is saved during this phase and restored immediately
3026 * after this phase. See vacuum_error_callback.
3028 Assert(vacrel->indname == NULL);
3029 vacrel->indname = pstrdup(RelationGetRelationName(indrel));
3030 update_vacuum_error_info(vacrel, &saved_err_info,
3031 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
3032 InvalidBlockNumber, InvalidOffsetNumber);
3034 /* Do bulk deletion */
3035 istat = index_bulk_delete(&ivinfo, istat, lazy_tid_reaped,
3036 (void *) vacrel->dead_tuples);
3038 ereport(elevel,
3039 (errmsg("scanned index \"%s\" to remove %d row versions",
3040 vacrel->indname, vacrel->dead_tuples->num_tuples),
3041 errdetail_internal("%s", pg_rusage_show(&ru0))));
3043 /* Revert to the previous phase information for error traceback */
3044 restore_vacuum_error_info(vacrel, &saved_err_info);
3045 pfree(vacrel->indname);
3046 vacrel->indname = NULL;
3048 return istat;
3052 * lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
3054 * reltuples is the number of heap tuples and estimated_count is true
3055 * if reltuples is an estimated value.
3057 * Returns bulk delete stats derived from input stats
3059 static IndexBulkDeleteResult *
3060 lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
3061 double reltuples, bool estimated_count,
3062 LVRelState *vacrel)
3064 IndexVacuumInfo ivinfo;
3065 PGRUsage ru0;
3066 LVSavedErrInfo saved_err_info;
3068 pg_rusage_init(&ru0);
3070 ivinfo.index = indrel;
3071 ivinfo.analyze_only = false;
3072 ivinfo.report_progress = false;
3073 ivinfo.estimated_count = estimated_count;
3074 ivinfo.message_level = elevel;
3076 ivinfo.num_heap_tuples = reltuples;
3077 ivinfo.strategy = vacrel->bstrategy;
3080 * Update error traceback information.
3082 * The index name is saved during this phase and restored immediately
3083 * after this phase. See vacuum_error_callback.
3085 Assert(vacrel->indname == NULL);
3086 vacrel->indname = pstrdup(RelationGetRelationName(indrel));
3087 update_vacuum_error_info(vacrel, &saved_err_info,
3088 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
3089 InvalidBlockNumber, InvalidOffsetNumber);
3091 istat = index_vacuum_cleanup(&ivinfo, istat);
3093 if (istat)
3095 ereport(elevel,
3096 (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
3097 RelationGetRelationName(indrel),
3098 (istat)->num_index_tuples,
3099 (istat)->num_pages),
3100 errdetail("%.0f index row versions were removed.\n"
3101 "%u index pages were newly deleted.\n"
3102 "%u index pages are currently deleted, of which %u are currently reusable.\n"
3103 "%s.",
3104 (istat)->tuples_removed,
3105 (istat)->pages_newly_deleted,
3106 (istat)->pages_deleted, (istat)->pages_free,
3107 pg_rusage_show(&ru0))));
3110 /* Revert to the previous phase information for error traceback */
3111 restore_vacuum_error_info(vacrel, &saved_err_info);
3112 pfree(vacrel->indname);
3113 vacrel->indname = NULL;
3115 return istat;
3119 * should_attempt_truncation - should we attempt to truncate the heap?
3121 * Don't even think about it unless we have a shot at releasing a goodly
3122 * number of pages. Otherwise, the time taken isn't worth it.
3124 * Also don't attempt it if wraparound failsafe is in effect. It's hard to
3125 * predict how long lazy_truncate_heap will take. Don't take any chances.
3126 * There is very little chance of truncation working out when the failsafe is
3127 * in effect in any case. lazy_scan_prune makes the optimistic assumption
3128 * that any LP_DEAD items it encounters will always be LP_UNUSED by the time
3129 * we're called.
3131 * Also don't attempt it if we are doing early pruning/vacuuming, because a
3132 * scan which cannot find a truncated heap page cannot determine that the
3133 * snapshot is too old to read that page.
3135 * This is split out so that we can test whether truncation is going to be
3136 * called for before we actually do it. If you change the logic here, be
3137 * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
3139 static bool
3140 should_attempt_truncation(LVRelState *vacrel, VacuumParams *params)
3142 BlockNumber possibly_freeable;
3144 if (params->truncate == VACOPT_TERNARY_DISABLED)
3145 return false;
3147 if (vacrel->do_failsafe)
3148 return false;
3150 possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
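/*
 * Illustrative numbers, assuming REL_TRUNCATE_MINIMUM is 1000 and
 * REL_TRUNCATE_FRACTION is 16: a 1,600 page table qualifies once 100
 * trailing pages (1/16 of the table) look freeable, while any table
 * qualifies once 1,000 trailing pages look freeable (provided
 * old_snapshot_threshold is disabled).
 */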
3151 if (possibly_freeable > 0 &&
3152 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
3153 possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION) &&
3154 old_snapshot_threshold < 0)
3155 return true;
3156 else
3157 return false;
3161 * lazy_truncate_heap - try to truncate off any empty pages at the end
3163 static void
3164 lazy_truncate_heap(LVRelState *vacrel)
3166 BlockNumber old_rel_pages = vacrel->rel_pages;
3167 BlockNumber new_rel_pages;
3168 int lock_retry;
3170 /* Report that we are now truncating */
3171 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
3172 PROGRESS_VACUUM_PHASE_TRUNCATE);
3175 * Loop until no more truncating can be done.
3179 PGRUsage ru0;
3181 pg_rusage_init(&ru0);
3184 * We need full exclusive lock on the relation in order to do
3185 * truncation. If we can't get it, give up rather than waiting --- we
3186 * don't want to block other backends, and we don't want to deadlock
3187 * (which is quite possible considering we already hold a lower-grade
3188 * lock).
3190 vacrel->lock_waiter_detected = false;
3191 lock_retry = 0;
3192 while (true)
3194 if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock))
3195 break;
3198 * Check for interrupts while trying to (re-)acquire the exclusive
3199 * lock.
3201 CHECK_FOR_INTERRUPTS();
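/*
 * Rough timing sketch, assuming the usual 50ms wait interval and a
 * 5000ms VACUUM_TRUNCATE_LOCK_TIMEOUT: we retry roughly 100 times,
 * i.e. give up after about five seconds of waiting for the lock.
 */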
3203 if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
3204 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
3207 * We failed to establish the lock in the specified number of
3208 * retries. This means we give up truncating.
3210 vacrel->lock_waiter_detected = true;
3211 ereport(elevel,
3212 (errmsg("\"%s\": stopping truncate due to conflicting lock request",
3213 vacrel->relname)));
3214 return;
3217 pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
3221 * Now that we have exclusive lock, look to see if the rel has grown
3222 * whilst we were vacuuming with non-exclusive lock. If so, give up;
3223 * the newly added pages presumably contain non-deletable tuples.
3225 new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
3226 if (new_rel_pages != old_rel_pages)
3229 * Note: we intentionally don't update vacrel->rel_pages with the
3230 * new rel size here. If we did, it would amount to assuming that
3231 * the new pages are empty, which is unlikely. Leaving the numbers
3232 * alone amounts to assuming that the new pages have the same
3233 * tuple density as existing ones, which is less unlikely.
3235 UnlockRelation(vacrel->rel, AccessExclusiveLock);
3236 return;
3240 * Scan backwards from the end to verify that the end pages actually
3241 * contain no tuples. This is *necessary*, not optional, because
3242 * other backends could have added tuples to these pages whilst we
3243 * were vacuuming.
3245 new_rel_pages = count_nondeletable_pages(vacrel);
3246 vacrel->blkno = new_rel_pages;
3248 if (new_rel_pages >= old_rel_pages)
3250 /* can't do anything after all */
3251 UnlockRelation(vacrel->rel, AccessExclusiveLock);
3252 return;
3256 * Okay to truncate.
3258 RelationTruncate(vacrel->rel, new_rel_pages);
3261 * We can release the exclusive lock as soon as we have truncated.
3262 * Other backends can't safely access the relation until they have
3263 * processed the smgr invalidation that smgrtruncate sent out ... but
3264 * that should happen as part of standard invalidation processing once
3265 * they acquire lock on the relation.
3267 UnlockRelation(vacrel->rel, AccessExclusiveLock);
3270 * Update statistics. Here, it *is* correct to adjust rel_pages
3271 * without also touching reltuples, since the tuple count wasn't
3272 * changed by the truncation.
3274 vacrel->pages_removed += old_rel_pages - new_rel_pages;
3275 vacrel->rel_pages = new_rel_pages;
3277 ereport(elevel,
3278 (errmsg("\"%s\": truncated %u to %u pages",
3279 vacrel->relname,
3280 old_rel_pages, new_rel_pages),
3281 errdetail_internal("%s",
3282 pg_rusage_show(&ru0))));
3283 old_rel_pages = new_rel_pages;
3284 } while (new_rel_pages > vacrel->nonempty_pages &&
3285 vacrel->lock_waiter_detected);
3289 * Rescan end pages to verify that they are (still) empty of tuples.
3291 * Returns number of nondeletable pages (last nonempty page + 1).
3293 static BlockNumber
3294 count_nondeletable_pages(LVRelState *vacrel)
3296 BlockNumber blkno;
3297 BlockNumber prefetchedUntil;
3298 instr_time starttime;
3300 /* Initialize the starttime used when checking for conflicting lock requests */
3301 INSTR_TIME_SET_CURRENT(starttime);
3304 * Start checking blocks at what we believe relation end to be and move
3305 * backwards. (Strange coding of loop control is needed because blkno is
3306 * unsigned.) To make the scan faster, we prefetch a few blocks at a time
3307 * in forward direction, so that OS-level readahead can kick in.
3309 blkno = vacrel->rel_pages;
3310 StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
3311 "prefetch size must be power of 2");
3312 prefetchedUntil = InvalidBlockNumber;
3313 while (blkno > vacrel->nonempty_pages)
3315 Buffer buf;
3316 Page page;
3317 OffsetNumber offnum,
3318 maxoff;
3319 bool hastup;
3322 * Check if another process requests a lock on our relation. We are
3323 * holding an AccessExclusiveLock here, so they will be waiting. We
3324 * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
3325 * only check if that interval has elapsed once every 32 blocks to
3326 * keep the number of system calls and actual shared lock table
3327 * lookups to a minimum.
3329 if ((blkno % 32) == 0)
3331 instr_time currenttime;
3332 instr_time elapsed;
3334 INSTR_TIME_SET_CURRENT(currenttime);
3335 elapsed = currenttime;
3336 INSTR_TIME_SUBTRACT(elapsed, starttime);
3337 if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
3338 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
3340 if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
3342 ereport(elevel,
3343 (errmsg("\"%s\": suspending truncate due to conflicting lock request",
3344 vacrel->relname)));
3346 vacrel->lock_waiter_detected = true;
3347 return blkno;
3349 starttime = currenttime;
3354 * We don't insert a vacuum delay point here, because we have an
3355 * exclusive lock on the table which we want to hold for as short a
3356 * time as possible. We still need to check for interrupts however.
3358 CHECK_FOR_INTERRUPTS();
3360 blkno--;
3362 /* If we haven't prefetched this lot yet, do so now. */
3363 if (prefetchedUntil > blkno)
3365 BlockNumber prefetchStart;
3366 BlockNumber pblkno;
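/*
 * Masking with ~(PREFETCH_SIZE - 1) rounds blkno down to the previous
 * PREFETCH_SIZE boundary; this relies on PREFETCH_SIZE being a power of
 * two, as the StaticAssertStmt above verifies.
 */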
3368 prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
3369 for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
3371 PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
3372 CHECK_FOR_INTERRUPTS();
3374 prefetchedUntil = prefetchStart;
3377 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
3378 vacrel->bstrategy);
3380 /* In this phase we only need shared access to the buffer */
3381 LockBuffer(buf, BUFFER_LOCK_SHARE);
3383 page = BufferGetPage(buf);
3385 if (PageIsNew(page) || PageIsEmpty(page))
3387 UnlockReleaseBuffer(buf);
3388 continue;
3391 hastup = false;
3392 maxoff = PageGetMaxOffsetNumber(page);
3393 for (offnum = FirstOffsetNumber;
3394 offnum <= maxoff;
3395 offnum = OffsetNumberNext(offnum))
3397 ItemId itemid;
3399 itemid = PageGetItemId(page, offnum);
3402 * Note: any non-unused item should be taken as a reason to keep
3403 * this page. We formerly thought that DEAD tuples could be
3404 * thrown away, but that's not so, because we'd not have cleaned
3405 * out their index entries.
3407 if (ItemIdIsUsed(itemid))
3409 hastup = true;
3410 break; /* can stop scanning */
3412 } /* scan along page */
3414 UnlockReleaseBuffer(buf);
3416 /* Done scanning if we found a tuple here */
3417 if (hastup)
3418 return blkno + 1;
3422 * If we fall out of the loop, all the previously-thought-to-be-empty
3423 * pages still are; we need not bother to look at the last known-nonempty
3424 * page.
3426 return vacrel->nonempty_pages;
3430 * Return the maximum number of dead tuples we can record.
3432 static long
3433 compute_max_dead_tuples(BlockNumber relblocks, bool hasindex)
3435 long maxtuples;
3436 int vac_work_mem = IsAutoVacuumWorkerProcess() &&
3437 autovacuum_work_mem != -1 ?
3438 autovacuum_work_mem : maintenance_work_mem;
3440 if (hasindex)
3442 maxtuples = MAXDEADTUPLES(vac_work_mem * 1024L);
3443 maxtuples = Min(maxtuples, INT_MAX);
3444 maxtuples = Min(maxtuples, MAXDEADTUPLES(MaxAllocSize));
3446 /* curious coding here to ensure the multiplication can't overflow */
3447 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
3448 maxtuples = relblocks * LAZY_ALLOC_TUPLES;
3450 /* stay sane if small maintenance_work_mem */
3451 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
3453 else
3454 maxtuples = MaxHeapTuplesPerPage;
3456 return maxtuples;
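/*
 * Worked example (illustrative; assumes the usual 8kB block size, 6-byte
 * ItemPointerData entries behind MAXDEADTUPLES, and LAZY_ALLOC_TUPLES
 * defined as MaxHeapTuplesPerPage, i.e. 291): with maintenance_work_mem
 * set to 64MB an indexed table can have roughly 11 million dead TIDs
 * tracked per pass, but a 1000-block table is capped at 1000 * 291 =
 * 291000 entries, and a table without indexes needs only 291.
 */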
3460 * lazy_space_alloc - space allocation decisions for lazy vacuum
3462 * See the comments at the head of this file for rationale.
3464 static void
3465 lazy_space_alloc(LVRelState *vacrel, int nworkers, BlockNumber nblocks)
3467 LVDeadTuples *dead_tuples;
3468 long maxtuples;
3471 * Initialize state for a parallel vacuum. As of now, only one worker can
3472 * be used for an index, so we invoke parallelism only if there are at
3473 * least two indexes on a table.
3475 if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming)
3478 * Since parallel workers cannot access data in temporary tables, we
3479 * can't perform parallel vacuum on them.
3481 if (RelationUsesLocalBuffers(vacrel->rel))
3484 * Give a warning only if the user explicitly tried to perform a
3485 * parallel vacuum on a temporary table.
3487 if (nworkers > 0)
3488 ereport(WARNING,
3489 (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
3490 vacrel->relname)));
3492 else
3493 vacrel->lps = begin_parallel_vacuum(vacrel, nblocks, nworkers);
3495 /* If parallel mode started, we're done */
3496 if (ParallelVacuumIsActive(vacrel))
3497 return;
3500 maxtuples = compute_max_dead_tuples(nblocks, vacrel->nindexes > 0);
3502 dead_tuples = (LVDeadTuples *) palloc(SizeOfDeadTuples(maxtuples));
3503 dead_tuples->num_tuples = 0;
3504 dead_tuples->max_tuples = (int) maxtuples;
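/*
 * The cast to int is safe here: compute_max_dead_tuples() clamps its
 * result to INT_MAX and to what MaxAllocSize allows.
 */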
3506 vacrel->dead_tuples = dead_tuples;
3510 * lazy_space_free - free space allocated in lazy_space_alloc
3512 static void
3513 lazy_space_free(LVRelState *vacrel)
3515 if (!ParallelVacuumIsActive(vacrel))
3516 return;
3519 * End parallel mode before updating index statistics, since we cannot
3520 * write during parallel mode.
3522 end_parallel_vacuum(vacrel);
3526 * lazy_tid_reaped() -- is a particular tid deletable?
3528 * This has the right signature to be an IndexBulkDeleteCallback.
3530 * Assumes dead_tuples array is in sorted order.
3532 static bool
3533 lazy_tid_reaped(ItemPointer itemptr, void *state)
3535 LVDeadTuples *dead_tuples = (LVDeadTuples *) state;
3536 int64 litem,
3537 ritem,
3538 item;
3539 ItemPointer res;
3541 litem = itemptr_encode(&dead_tuples->itemptrs[0]);
3542 ritem = itemptr_encode(&dead_tuples->itemptrs[dead_tuples->num_tuples - 1]);
3543 item = itemptr_encode(itemptr);
3546 * A simple bound check before the bsearch() skips the search when the
3547 * TID falls outside the recorded range, which is common when dead
3548 * tuples are concentrated in a certain range of the heap. Since this
3549 * function is called for every index tuple, it pays to be really fast.
3551 if (item < litem || item > ritem)
3552 return false;
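/*
 * For illustration (itemptr_encode() itself is defined outside this file):
 * a TID is packed with the block number in the high bits and the offset in
 * the low 16 bits, so e.g. (block 10, offset 3) encodes to
 * 10 * 65536 + 3 = 655363. Comparing the encoded int64 keys therefore
 * preserves TID order, which is what makes this cheap range check valid.
 */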
3554 res = (ItemPointer) bsearch((void *) itemptr,
3555 (void *) dead_tuples->itemptrs,
3556 dead_tuples->num_tuples,
3557 sizeof(ItemPointerData),
3558 vac_cmp_itemptr);
3560 return (res != NULL);
3564 * Comparator routines for use with qsort() and bsearch().
3566 static int
3567 vac_cmp_itemptr(const void *left, const void *right)
3569 BlockNumber lblk,
3570 rblk;
3571 OffsetNumber loff,
3572 roff;
3574 lblk = ItemPointerGetBlockNumber((ItemPointer) left);
3575 rblk = ItemPointerGetBlockNumber((ItemPointer) right);
3577 if (lblk < rblk)
3578 return -1;
3579 if (lblk > rblk)
3580 return 1;
3582 loff = ItemPointerGetOffsetNumber((ItemPointer) left);
3583 roff = ItemPointerGetOffsetNumber((ItemPointer) right);
3585 if (loff < roff)
3586 return -1;
3587 if (loff > roff)
3588 return 1;
3590 return 0;
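/*
 * Note that the block-then-offset ordering above matches both the order in
 * which dead tuples are collected during the heap scan (which is why
 * lazy_tid_reaped() may assume a sorted array) and the int64 ordering
 * produced by itemptr_encode().
 */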
3594 * Check if every tuple in the given page is visible to all current and future
3595 * transactions. Also return the visibility_cutoff_xid which is the highest
3596 * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
3597 * on this page is frozen.
3599 static bool
3600 heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
3601 TransactionId *visibility_cutoff_xid,
3602 bool *all_frozen)
3604 Page page = BufferGetPage(buf);
3605 BlockNumber blockno = BufferGetBlockNumber(buf);
3606 OffsetNumber offnum,
3607 maxoff;
3608 bool all_visible = true;
3610 *visibility_cutoff_xid = InvalidTransactionId;
3611 *all_frozen = true;
3614 * This is a stripped down version of the line pointer scan in
3615 * lazy_scan_heap(). So if you change anything here, also check that code.
3617 maxoff = PageGetMaxOffsetNumber(page);
3618 for (offnum = FirstOffsetNumber;
3619 offnum <= maxoff && all_visible;
3620 offnum = OffsetNumberNext(offnum))
3622 ItemId itemid;
3623 HeapTupleData tuple;
3626 * Set the offset number so that we can display it along with any
3627 * error that occurred while processing this tuple.
3629 vacrel->offnum = offnum;
3630 itemid = PageGetItemId(page, offnum);
3632 /* Unused or redirect line pointers are of no interest */
3633 if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
3634 continue;
3636 ItemPointerSet(&(tuple.t_self), blockno, offnum);
3639 * Dead line pointers can still have index entries pointing to them,
3640 * so they can't be treated as visible.
3642 if (ItemIdIsDead(itemid))
3644 all_visible = false;
3645 *all_frozen = false;
3646 break;
3649 Assert(ItemIdIsNormal(itemid));
3651 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
3652 tuple.t_len = ItemIdGetLength(itemid);
3653 tuple.t_tableOid = RelationGetRelid(vacrel->rel);
3655 switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
3657 case HEAPTUPLE_LIVE:
3659 TransactionId xmin;
3661 /* Check comments in lazy_scan_heap. */
3662 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
3664 all_visible = false;
3665 *all_frozen = false;
3666 break;
3670 * The inserter definitely committed. But is it old enough
3671 * that everyone sees it as committed?
3673 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
3674 if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
3676 all_visible = false;
3677 *all_frozen = false;
3678 break;
3681 /* Track newest xmin on page. */
3682 if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
3683 *visibility_cutoff_xid = xmin;
3685 /* Check whether this tuple is already frozen or not */
3686 if (all_visible && *all_frozen &&
3687 heap_tuple_needs_eventual_freeze(tuple.t_data))
3688 *all_frozen = false;
3690 break;
3692 case HEAPTUPLE_DEAD:
3693 case HEAPTUPLE_RECENTLY_DEAD:
3694 case HEAPTUPLE_INSERT_IN_PROGRESS:
3695 case HEAPTUPLE_DELETE_IN_PROGRESS:
3697 all_visible = false;
3698 *all_frozen = false;
3699 break;
3701 default:
3702 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
3703 break;
3705 } /* scan along page */
3707 /* Clear the offset information once we have processed the given page. */
3708 vacrel->offnum = InvalidOffsetNumber;
3710 return all_visible;
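/*
 * A true result, together with *visibility_cutoff_xid and *all_frozen,
 * lets the caller set the page's visibility map bits once its LP_DEAD
 * items have been removed.
 */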
3714 * Compute the number of parallel worker processes to request. Both index
3715 * vacuum and index cleanup can be executed with parallel workers. An index
3716 * is eligible for parallel vacuum iff its size is greater than
3717 * min_parallel_index_scan_size, since launching workers for very small
3718 * indexes can hurt performance.
3720 * nrequested is the number of parallel workers that the user requested. If
3721 * nrequested is 0, we compute the parallel degree based on the number of
3722 * indexes that support parallel vacuum. This function also sets
3723 * can_parallel_vacuum to remember which indexes participate in parallel
3724 * vacuum.
3726 static int
3727 compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested,
3728 bool *can_parallel_vacuum)
3730 int nindexes_parallel = 0;
3731 int nindexes_parallel_bulkdel = 0;
3732 int nindexes_parallel_cleanup = 0;
3733 int parallel_workers;
3736 * We don't allow performing parallel operations in a standalone backend
3737 * or when parallelism is disabled.
3739 if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0)
3740 return 0;
3743 * Compute the number of indexes that can participate in parallel vacuum.
3745 for (int idx = 0; idx < vacrel->nindexes; idx++)
3747 Relation indrel = vacrel->indrels[idx];
3748 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3750 if (vacoptions == VACUUM_OPTION_NO_PARALLEL ||
3751 RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size)
3752 continue;
3754 can_parallel_vacuum[idx] = true;
3756 if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3757 nindexes_parallel_bulkdel++;
3758 if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) ||
3759 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
3760 nindexes_parallel_cleanup++;
3763 nindexes_parallel = Max(nindexes_parallel_bulkdel,
3764 nindexes_parallel_cleanup);
3766 /* The leader process takes one index */
3767 nindexes_parallel--;
3769 /* No index supports parallel vacuum */
3770 if (nindexes_parallel <= 0)
3771 return 0;
3773 /* Compute the parallel degree */
3774 parallel_workers = (nrequested > 0) ?
3775 Min(nrequested, nindexes_parallel) : nindexes_parallel;
3777 /* Cap by max_parallel_maintenance_workers */
3778 parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers);
3780 return parallel_workers;
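/*
 * Illustrative example (assuming the default max_parallel_maintenance_workers
 * of 2): a table with three indexes that all exceed
 * min_parallel_index_scan_size and support parallel bulkdelete gives
 * nindexes_parallel = 3; subtracting the index the leader handles itself
 * leaves 2, so with nrequested = 0 we ask for 2 workers.
 */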
3784 * Update index statistics in pg_class if the statistics are accurate.
3786 static void
3787 update_index_statistics(LVRelState *vacrel)
3789 Relation *indrels = vacrel->indrels;
3790 int nindexes = vacrel->nindexes;
3791 IndexBulkDeleteResult **indstats = vacrel->indstats;
3793 Assert(!IsInParallelMode());
3795 for (int idx = 0; idx < nindexes; idx++)
3797 Relation indrel = indrels[idx];
3798 IndexBulkDeleteResult *istat = indstats[idx];
3800 if (istat == NULL || istat->estimated_count)
3801 continue;
3803 /* Update index statistics */
3804 vac_update_relstats(indrel,
3805 istat->num_pages,
3806 istat->num_index_tuples,
3808 false,
3809 InvalidTransactionId,
3810 InvalidMultiXactId,
3811 false);
3816 * Prepare and return parallel vacuum state if we can launch at least one
3817 * worker. This function is responsible for entering parallel mode, creating
3818 * a parallel context, and then initializing the DSM segment.
3820 static LVParallelState *
3821 begin_parallel_vacuum(LVRelState *vacrel, BlockNumber nblocks,
3822 int nrequested)
3824 LVParallelState *lps = NULL;
3825 Relation *indrels = vacrel->indrels;
3826 int nindexes = vacrel->nindexes;
3827 ParallelContext *pcxt;
3828 LVShared *shared;
3829 LVDeadTuples *dead_tuples;
3830 BufferUsage *buffer_usage;
3831 WalUsage *wal_usage;
3832 bool *can_parallel_vacuum;
3833 long maxtuples;
3834 Size est_shared;
3835 Size est_deadtuples;
3836 int nindexes_mwm = 0;
3837 int parallel_workers = 0;
3838 int querylen;
3841 * A parallel vacuum must have been requested, and the relation must have
3842 * indexes.
3844 Assert(nrequested >= 0);
3845 Assert(nindexes > 0);
3848 * Compute the number of parallel vacuum workers to launch
3850 can_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes);
3851 parallel_workers = compute_parallel_vacuum_workers(vacrel,
3852 nrequested,
3853 can_parallel_vacuum);
3855 /* Can't perform vacuum in parallel */
3856 if (parallel_workers <= 0)
3858 pfree(can_parallel_vacuum);
3859 return lps;
3862 lps = (LVParallelState *) palloc0(sizeof(LVParallelState));
3864 EnterParallelMode();
3865 pcxt = CreateParallelContext("postgres", "parallel_vacuum_main",
3866 parallel_workers);
3867 Assert(pcxt->nworkers > 0);
3868 lps->pcxt = pcxt;
3870 /* Estimate size for shared information -- PARALLEL_VACUUM_KEY_SHARED */
3871 est_shared = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3872 for (int idx = 0; idx < nindexes; idx++)
3874 Relation indrel = indrels[idx];
3875 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3878 * The cleanup option should be either disabled, always performed in
3879 * parallel, or conditionally performed in parallel.
3881 Assert(((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) ||
3882 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0));
3883 Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE);
3885 /* Skip indexes that don't participate in parallel vacuum */
3886 if (!can_parallel_vacuum[idx])
3887 continue;
3889 if (indrel->rd_indam->amusemaintenanceworkmem)
3890 nindexes_mwm++;
3892 est_shared = add_size(est_shared, sizeof(LVSharedIndStats));
3895 * Remember the number of indexes that support parallel operation for
3896 * each phase.
3898 if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3899 lps->nindexes_parallel_bulkdel++;
3900 if ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0)
3901 lps->nindexes_parallel_cleanup++;
3902 if ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)
3903 lps->nindexes_parallel_condcleanup++;
3905 shm_toc_estimate_chunk(&pcxt->estimator, est_shared);
3906 shm_toc_estimate_keys(&pcxt->estimator, 1);
3908 /* Estimate size for dead tuples -- PARALLEL_VACUUM_KEY_DEAD_TUPLES */
3909 maxtuples = compute_max_dead_tuples(nblocks, true);
3910 est_deadtuples = MAXALIGN(SizeOfDeadTuples(maxtuples));
3911 shm_toc_estimate_chunk(&pcxt->estimator, est_deadtuples);
3912 shm_toc_estimate_keys(&pcxt->estimator, 1);
3915 * Estimate space for BufferUsage and WalUsage --
3916 * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
3918 * If there are no extensions loaded that care, we could skip this. We
3919 * have no way of knowing whether anyone's looking at pgBufferUsage or
3920 * pgWalUsage, so do it unconditionally.
3922 shm_toc_estimate_chunk(&pcxt->estimator,
3923 mul_size(sizeof(BufferUsage), pcxt->nworkers));
3924 shm_toc_estimate_keys(&pcxt->estimator, 1);
3925 shm_toc_estimate_chunk(&pcxt->estimator,
3926 mul_size(sizeof(WalUsage), pcxt->nworkers));
3927 shm_toc_estimate_keys(&pcxt->estimator, 1);
3929 /* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
3930 if (debug_query_string)
3932 querylen = strlen(debug_query_string);
3933 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
3934 shm_toc_estimate_keys(&pcxt->estimator, 1);
3936 else
3937 querylen = 0; /* keep compiler quiet */
3939 InitializeParallelDSM(pcxt);
3941 /* Prepare shared information */
3942 shared = (LVShared *) shm_toc_allocate(pcxt->toc, est_shared);
3943 MemSet(shared, 0, est_shared);
3944 shared->relid = RelationGetRelid(vacrel->rel);
3945 shared->elevel = elevel;
3946 shared->maintenance_work_mem_worker =
3947 (nindexes_mwm > 0) ?
3948 maintenance_work_mem / Min(parallel_workers, nindexes_mwm) :
3949 maintenance_work_mem;
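/*
 * For example (illustrative only): with maintenance_work_mem set to 256MB,
 * two workers, and three participating indexes whose AMs use
 * maintenance_work_mem (nindexes_mwm = 3), each worker gets
 * 256MB / Min(2, 3) = 128MB; if no such index participates, every worker
 * keeps the full 256MB.
 */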
3951 pg_atomic_init_u32(&(shared->cost_balance), 0);
3952 pg_atomic_init_u32(&(shared->active_nworkers), 0);
3953 pg_atomic_init_u32(&(shared->idx), 0);
3954 shared->offset = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3957 * Initialize variables for shared index statistics, set NULL bitmap and
3958 * the size of stats for each index.
3960 memset(shared->bitmap, 0x00, BITMAPLEN(nindexes));
3961 for (int idx = 0; idx < nindexes; idx++)
3963 if (!can_parallel_vacuum[idx])
3964 continue;
3966 /* Mark the slot as not-NULL, since this index supports parallelism */
3967 shared->bitmap[idx >> 3] |= 1 << (idx & 0x07);
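/*
 * idx >> 3 selects the byte and idx & 0x07 the bit within it; e.g.
 * index number 10 sets bit 2 of bitmap byte 1.
 */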
3970 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
3971 lps->lvshared = shared;
3973 /* Prepare the dead tuple space */
3974 dead_tuples = (LVDeadTuples *) shm_toc_allocate(pcxt->toc, est_deadtuples);
3975 dead_tuples->max_tuples = maxtuples;
3976 dead_tuples->num_tuples = 0;
3977 MemSet(dead_tuples->itemptrs, 0, sizeof(ItemPointerData) * maxtuples);
3978 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_TUPLES, dead_tuples);
3979 vacrel->dead_tuples = dead_tuples;
3982 * Allocate space for each worker's BufferUsage and WalUsage; no need to
3983 * initialize
3985 buffer_usage = shm_toc_allocate(pcxt->toc,
3986 mul_size(sizeof(BufferUsage), pcxt->nworkers));
3987 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
3988 lps->buffer_usage = buffer_usage;
3989 wal_usage = shm_toc_allocate(pcxt->toc,
3990 mul_size(sizeof(WalUsage), pcxt->nworkers));
3991 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
3992 lps->wal_usage = wal_usage;
3994 /* Store query string for workers */
3995 if (debug_query_string)
3997 char *sharedquery;
3999 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
4000 memcpy(sharedquery, debug_query_string, querylen + 1);
4001 sharedquery[querylen] = '\0';
4002 shm_toc_insert(pcxt->toc,
4003 PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery);
4006 pfree(can_parallel_vacuum);
4007 return lps;
4011 * Destroy the parallel context, and end parallel mode.
4013 * Since writes are not allowed during parallel mode, copy the updated
4014 * index statistics from the DSM segment into local memory, and use that
4015 * copy later to update the index statistics. One might think that we could
4016 * exit from parallel mode, update the index statistics, and then destroy
4017 * the parallel context, but that wouldn't be safe (see ExitParallelMode).
4019 static void
4020 end_parallel_vacuum(LVRelState *vacrel)
4022 IndexBulkDeleteResult **indstats = vacrel->indstats;
4023 LVParallelState *lps = vacrel->lps;
4024 int nindexes = vacrel->nindexes;
4026 Assert(!IsParallelWorker());
4028 /* Copy the updated statistics */
4029 for (int idx = 0; idx < nindexes; idx++)
4031 LVSharedIndStats *shared_istat;
4033 shared_istat = parallel_stats_for_idx(lps->lvshared, idx);
4036 * Skip unused slot. The statistics of this index are already stored
4037 * in local memory.
4039 if (shared_istat == NULL)
4040 continue;
4042 if (shared_istat->updated)
4044 indstats[idx] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
4045 memcpy(indstats[idx], &(shared_istat->istat), sizeof(IndexBulkDeleteResult));
4047 else
4048 indstats[idx] = NULL;
4051 DestroyParallelContext(lps->pcxt);
4052 ExitParallelMode();
4054 /* Deactivate parallel vacuum */
4055 pfree(lps);
4056 vacrel->lps = NULL;
4060 * Return shared memory statistics for index at offset 'getidx', if any
4062 static LVSharedIndStats *
4063 parallel_stats_for_idx(LVShared *lvshared, int getidx)
4065 char *p;
4067 if (IndStatsIsNull(lvshared, getidx))
4068 return NULL;
4070 p = (char *) GetSharedIndStats(lvshared);
4071 for (int idx = 0; idx < getidx; idx++)
4073 if (IndStatsIsNull(lvshared, idx))
4074 continue;
4076 p += sizeof(LVSharedIndStats);
4079 return (LVSharedIndStats *) p;
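/*
 * In other words, the shared stats area is a packed array holding an
 * LVSharedIndStats only for indexes whose bitmap bit is set. As an
 * illustration, if indexes 0 and 2 participate but index 1 does not, the
 * stats for index 2 sit exactly one LVSharedIndStats past those of index 0.
 */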
4083 * Returns false if the given index can't participate in parallel index
4084 * vacuum or parallel index cleanup.
4086 static bool
4087 parallel_processing_is_safe(Relation indrel, LVShared *lvshared)
4089 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
4091 /* first_time must be true only if for_cleanup is true */
4092 Assert(lvshared->for_cleanup || !lvshared->first_time);
4094 if (lvshared->for_cleanup)
4096 /* Skip if the index does not support parallel cleanup */
4097 if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) &&
4098 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0))
4099 return false;
4102 * Skip if the index supports parallel cleanup only conditionally and
4103 * we have already processed it (for bulkdelete). See the comments
4104 * for option VACUUM_OPTION_PARALLEL_COND_CLEANUP to know when
4105 * indexes support parallel cleanup conditionally.
4107 if (!lvshared->first_time &&
4108 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
4109 return false;
4111 else if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) == 0)
4113 /* Skip if the index does not support parallel bulk deletion */
4114 return false;
4117 return true;
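/*
 * To summarize the checks above: for bulkdelete the index must advertise
 * VACUUM_OPTION_PARALLEL_BULKDEL; for cleanup it must advertise
 * VACUUM_OPTION_PARALLEL_CLEANUP or VACUUM_OPTION_PARALLEL_COND_CLEANUP,
 * and a conditional-cleanup index is handled in parallel only on the first
 * cleanup pass, i.e. when no bulkdelete has been performed for it.
 */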
4121 * Perform work within a launched parallel process.
4123 * Since parallel vacuum workers perform only index vacuum or index cleanup,
4124 * we don't need to report progress information.
4126 void
4127 parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
4129 Relation rel;
4130 Relation *indrels;
4131 LVShared *lvshared;
4132 LVDeadTuples *dead_tuples;
4133 BufferUsage *buffer_usage;
4134 WalUsage *wal_usage;
4135 int nindexes;
4136 char *sharedquery;
4137 LVRelState vacrel;
4138 ErrorContextCallback errcallback;
4140 lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
4141 false);
4142 elevel = lvshared->elevel;
4144 if (lvshared->for_cleanup)
4145 elog(DEBUG1, "starting parallel vacuum worker for cleanup");
4146 else
4147 elog(DEBUG1, "starting parallel vacuum worker for bulk delete");
4149 /* Set debug_query_string for individual workers */
4150 sharedquery = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_QUERY_TEXT, true);
4151 debug_query_string = sharedquery;
4152 pgstat_report_activity(STATE_RUNNING, debug_query_string);
4155 * Open the table. The lock mode is the same as that of the leader
4156 * process. This is okay because the lock does not conflict among the
4157 * parallel workers.
4159 rel = table_open(lvshared->relid, ShareUpdateExclusiveLock);
4162 * Open all indexes. indrels is sorted by OID, which should match the
4163 * ordering used by the leader process.
4165 vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels);
4166 Assert(nindexes > 0);
4168 /* Set dead tuple space */
4169 dead_tuples = (LVDeadTuples *) shm_toc_lookup(toc,
4170 PARALLEL_VACUUM_KEY_DEAD_TUPLES,
4171 false);
4173 /* Set cost-based vacuum delay */
4174 VacuumCostActive = (VacuumCostDelay > 0);
4175 VacuumCostBalance = 0;
4176 VacuumPageHit = 0;
4177 VacuumPageMiss = 0;
4178 VacuumPageDirty = 0;
4179 VacuumCostBalanceLocal = 0;
4180 VacuumSharedCostBalance = &(lvshared->cost_balance);
4181 VacuumActiveNWorkers = &(lvshared->active_nworkers);
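/*
 * With VacuumSharedCostBalance and VacuumActiveNWorkers set, subsequent
 * vacuum_delay_point() calls in this worker throttle against the shared
 * cost balance, so the cost-based delay budget is coordinated across the
 * leader and all workers.
 */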
4183 vacrel.rel = rel;
4184 vacrel.indrels = indrels;
4185 vacrel.nindexes = nindexes;
4186 /* Each parallel VACUUM worker gets its own access strategy */
4187 vacrel.bstrategy = GetAccessStrategy(BAS_VACUUM);
4188 vacrel.indstats = (IndexBulkDeleteResult **)
4189 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
4191 if (lvshared->maintenance_work_mem_worker > 0)
4192 maintenance_work_mem = lvshared->maintenance_work_mem_worker;
4195 * Initialize vacrel for use as error callback arg by parallel worker.
4197 vacrel.relnamespace = get_namespace_name(RelationGetNamespace(rel));
4198 vacrel.relname = pstrdup(RelationGetRelationName(rel));
4199 vacrel.indname = NULL;
4200 vacrel.phase = VACUUM_ERRCB_PHASE_UNKNOWN; /* Not yet processing */
4201 vacrel.dead_tuples = dead_tuples;
4203 /* Setup error traceback support for ereport() */
4204 errcallback.callback = vacuum_error_callback;
4205 errcallback.arg = &vacrel;
4206 errcallback.previous = error_context_stack;
4207 error_context_stack = &errcallback;
4209 /* Prepare to track buffer usage during parallel execution */
4210 InstrStartParallelQuery();
4212 /* Process indexes to perform vacuum/cleanup */
4213 do_parallel_processing(&vacrel, lvshared);
4215 /* Report buffer/WAL usage during parallel execution */
4216 buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
4217 wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
4218 InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
4219 &wal_usage[ParallelWorkerNumber]);
4221 /* Pop the error context stack */
4222 error_context_stack = errcallback.previous;
4224 vac_close_indexes(nindexes, indrels, RowExclusiveLock);
4225 table_close(rel, ShareUpdateExclusiveLock);
4226 FreeAccessStrategy(vacrel.bstrategy);
4227 pfree(vacrel.indstats);
4231 * Error context callback for errors occurring during vacuum.
4233 static void
4234 vacuum_error_callback(void *arg)
4236 LVRelState *errinfo = arg;
4238 switch (errinfo->phase)
4240 case VACUUM_ERRCB_PHASE_SCAN_HEAP:
4241 if (BlockNumberIsValid(errinfo->blkno))
4243 if (OffsetNumberIsValid(errinfo->offnum))
4244 errcontext("while scanning block %u and offset %u of relation \"%s.%s\"",
4245 errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
4246 else
4247 errcontext("while scanning block %u of relation \"%s.%s\"",
4248 errinfo->blkno, errinfo->relnamespace, errinfo->relname);
4250 else
4251 errcontext("while scanning relation \"%s.%s\"",
4252 errinfo->relnamespace, errinfo->relname);
4253 break;
4255 case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
4256 if (BlockNumberIsValid(errinfo->blkno))
4258 if (OffsetNumberIsValid(errinfo->offnum))
4259 errcontext("while vacuuming block %u and offset %u of relation \"%s.%s\"",
4260 errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
4261 else
4262 errcontext("while vacuuming block %u of relation \"%s.%s\"",
4263 errinfo->blkno, errinfo->relnamespace, errinfo->relname);
4265 else
4266 errcontext("while vacuuming relation \"%s.%s\"",
4267 errinfo->relnamespace, errinfo->relname);
4268 break;
4270 case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
4271 errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
4272 errinfo->indname, errinfo->relnamespace, errinfo->relname);
4273 break;
4275 case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
4276 errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
4277 errinfo->indname, errinfo->relnamespace, errinfo->relname);
4278 break;
4280 case VACUUM_ERRCB_PHASE_TRUNCATE:
4281 if (BlockNumberIsValid(errinfo->blkno))
4282 errcontext("while truncating relation \"%s.%s\" to %u blocks",
4283 errinfo->relnamespace, errinfo->relname, errinfo->blkno);
4284 break;
4286 case VACUUM_ERRCB_PHASE_UNKNOWN:
4287 default:
4288 return; /* do nothing; the errinfo may not be
4289 * initialized */
4294 * Updates the information required for the vacuum error callback. This also
4295 * saves the current information, which can later be restored via restore_vacuum_error_info().
4297 static void
4298 update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel,
4299 int phase, BlockNumber blkno, OffsetNumber offnum)
4301 if (saved_vacrel)
4303 saved_vacrel->offnum = vacrel->offnum;
4304 saved_vacrel->blkno = vacrel->blkno;
4305 saved_vacrel->phase = vacrel->phase;
4308 vacrel->blkno = blkno;
4309 vacrel->offnum = offnum;
4310 vacrel->phase = phase;
4314 * Restores the vacuum information saved via a prior call to update_vacuum_error_info.
4316 static void
4317 restore_vacuum_error_info(LVRelState *vacrel,
4318 const LVSavedErrInfo *saved_vacrel)
4320 vacrel->blkno = saved_vacrel->blkno;
4321 vacrel->offnum = saved_vacrel->offnum;
4322 vacrel->phase = saved_vacrel->phase;
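/*
 * Typical usage elsewhere in this file (saved_err_info being whatever local
 * LVSavedErrInfo the caller declares): save the current state with
 * update_vacuum_error_info(vacrel, &saved_err_info, new_phase, blkno, offnum),
 * perform the work for the new phase, then call
 * restore_vacuum_error_info(vacrel, &saved_err_info) so the error context
 * reverts to what it was before.
 */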