/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead tuple
 * TIDs.  We want to ensure we can vacuum even the very largest relations with
 * finite memory space usage.  To do that, we set upper bounds on the number of
 * tuples we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem (or perhaps
 * autovacuum_work_mem) memory space to keep track of dead tuples.  We
 * initially allocate an array of TIDs of that size, with an upper limit that
 * depends on table size (this limit ensures we don't allocate a huge area
 * uselessly for vacuuming small tables).  If the array threatens to overflow,
 * we suspend the heap scan phase and perform a pass of index cleanup and page
 * compaction, then resume the heap scan with an empty TID array.
 *
 * If we're processing a table with no indexes, we can just vacuum each page
 * as we go; there's no need to save up multiple tuples to minimize the number
 * of index scans performed.  So we don't use maintenance_work_mem memory for
 * the TID array, just enough to hold as many heap tuples as fit on one page.
 *
 * Lazy vacuum supports parallel execution with parallel worker processes.  In
 * a parallel vacuum, we perform both index vacuum and index cleanup with
 * parallel worker processes.  Individual indexes are processed by one vacuum
 * process.  At the beginning of a lazy vacuum (at lazy_scan_heap) we prepare
 * the parallel context and initialize the DSM segment that contains shared
 * information as well as the memory space for storing dead tuples.  When
 * starting either index vacuum or index cleanup, we launch parallel worker
 * processes.  Once all indexes are processed the parallel worker processes
 * exit.  After that, the leader process re-initializes the parallel context
 * so that it can use the same DSM for multiple passes of index vacuum and
 * for performing index cleanup.  For updating the index statistics, we need
 * to update the system table and since updates are not allowed during
 * parallel mode we update the index statistics after exiting from parallel
 * mode.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/vacuumlazy.c
 *
 *-------------------------------------------------------------------------
 */
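/*
 * In outline, the bounded-memory strategy described above amounts to the
 * following loop (a simplified sketch only; the actual control flow lives in
 * lazy_scan_heap() and lazy_vacuum() below):
 *
 *		for each heap page:
 *			prune the page and append its LP_DEAD item TIDs to the dead-tuple
 *			array (whose size is capped by maintenance_work_mem or
 *			autovacuum_work_mem);
 *			if the array is nearly full:
 *				vacuum all indexes, removing those TIDs;
 *				vacuum the already-scanned heap pages, marking the items
 *				LP_UNUSED;
 *				reset the array and resume the heap scan;
 *		run one final index/heap vacuuming pass for any remaining TIDs.
 *
 * A table with no indexes skips the array entirely and reclaims space one
 * page at a time, as described above.
 */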
#include "postgres.h"

#include "access/amapi.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/parallel.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/index.h"
#include "catalog/storage.h"
#include "commands/dbcommands.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "optimizer/paths.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "tcop/tcopprot.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/timestamp.h"
/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16

/*
 * Timing parameters for truncate locking heuristics.
 *
 * These were not exposed as user tunable GUC values because it didn't seem
 * that the potential for improvement was great enough to merit the cost of
 * supporting them.
 */
#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL		20	/* ms */
#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL		50	/* ms */
#define VACUUM_TRUNCATE_LOCK_TIMEOUT			5000	/* ms */

/*
 * Threshold that controls whether we bypass index vacuuming and heap
 * vacuuming as an optimization
 */
#define BYPASS_THRESHOLD_PAGES	0.02	/* i.e. 2% of rel_pages */

/*
 * Perform a failsafe check every 4GB during the heap scan, approximately
 */
#define FAILSAFE_EVERY_PAGES \
	((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ))

/*
 * When a table has no indexes, vacuum the FSM after every 8GB, approximately
 * (it won't be exact because we only vacuum FSM after processing a heap page
 * that has some removable tuples).  When there are indexes, this is ignored,
 * and we vacuum FSM after each index/heap cleaning pass.
 */
#define VACUUM_FSM_EVERY_PAGES \
	((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
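/*
 * Worked figures, assuming the default 8 kB BLCKSZ: the failsafe check runs
 * about every 4GB / 8kB = 524288 heap blocks, and the no-index FSM vacuuming
 * runs about every 8GB / 8kB = 1048576 heap blocks.
 */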
/*
 * Guesstimation of number of dead tuples per page.  This is used to
 * provide an upper limit to memory allocated when vacuuming small
 * tables.
 */
#define LAZY_ALLOC_TUPLES		MaxHeapTuplesPerPage
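/*
 * With the default 8 kB block size, MaxHeapTuplesPerPage works out to 291,
 * so this guess caps the per-page allocation at 291 TID slots (about 1.7 kB
 * of array space per heap page of a small table).
 */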
/*
 * Before we consider skipping a page that's marked as clean in
 * visibility map, we must've seen at least this many clean pages.
 */
#define SKIP_PAGES_THRESHOLD	((BlockNumber) 32)

/*
 * Size of the prefetch window for lazy vacuum backwards truncation scan.
 * Needs to be a power of 2.
 */
#define PREFETCH_SIZE			((BlockNumber) 32)
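/*
 * Because PREFETCH_SIZE is a power of 2, the backwards truncation scan can
 * align its prefetch requests with simple bit masking, along the lines of
 *
 *		prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
 *
 * (illustrative fragment only; see count_nondeletable_pages() for the real
 * prefetching logic).
 */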
/*
 * DSM keys for parallel vacuum.  Unlike other parallel execution code, since
 * we don't need to worry about DSM keys conflicting with plan_node_id we can
 * use small integers.
 */
#define PARALLEL_VACUUM_KEY_SHARED			1
#define PARALLEL_VACUUM_KEY_DEAD_TUPLES		2
#define PARALLEL_VACUUM_KEY_QUERY_TEXT		3
#define PARALLEL_VACUUM_KEY_BUFFER_USAGE	4
#define PARALLEL_VACUUM_KEY_WAL_USAGE		5
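/*
 * Illustrative sketch of how these keys are used: the leader stores each
 * shared area in the parallel DSM segment under its key, and a worker looks
 * it up again with shm_toc_lookup().  The fragment below is only a sketch;
 * the function and variable names are not from this file.
 */
#ifdef NOT_USED
static void
parallel_vacuum_lookup_sketch(shm_toc *toc)
{
	LVShared   *lvshared;
	LVDeadTuples *dead_tuples;

	/* both areas were inserted by the leader under well-known keys */
	lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
										   false);
	dead_tuples = (LVDeadTuples *) shm_toc_lookup(toc,
												  PARALLEL_VACUUM_KEY_DEAD_TUPLES,
												  false);

	(void) lvshared;
	(void) dead_tuples;
}
#endif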
/*
 * Macro to check if we are in a parallel vacuum.  If true, we are in the
 * parallel mode and the DSM segment is initialized.
 */
#define ParallelVacuumIsActive(vacrel) ((vacrel)->lps != NULL)
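/*
 * For example, code that must behave differently under parallel vacuum can
 * simply branch on this macro:
 *
 *		if (ParallelVacuumIsActive(vacrel))
 *			... use vacrel->lps->lvshared, launch workers, etc ...
 *		else
 *			... plain serial path ...
 */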
/* Phases of vacuum during which we report error context. */
typedef enum
{
	VACUUM_ERRCB_PHASE_UNKNOWN,
	VACUUM_ERRCB_PHASE_SCAN_HEAP,
	VACUUM_ERRCB_PHASE_VACUUM_INDEX,
	VACUUM_ERRCB_PHASE_VACUUM_HEAP,
	VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
	VACUUM_ERRCB_PHASE_TRUNCATE
} VacErrPhase;
/*
 * LVDeadTuples stores the dead tuple TIDs collected during the heap scan.
 * This is allocated in the DSM segment in parallel mode and in local memory
 * in non-parallel mode.
 */
typedef struct LVDeadTuples
{
	int			max_tuples;		/* # slots allocated in array */
	int			num_tuples;		/* current # of entries */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	ItemPointerData itemptrs[FLEXIBLE_ARRAY_MEMBER];	/* array of
														 * ItemPointerData */
} LVDeadTuples;

/* The dead tuple space consists of LVDeadTuples and dead tuple TIDs */
#define SizeOfDeadTuples(cnt) \
	add_size(offsetof(LVDeadTuples, itemptrs), \
			 mul_size(sizeof(ItemPointerData), cnt))
#define MAXDEADTUPLES(max_size) \
	(((max_size) - offsetof(LVDeadTuples, itemptrs)) / sizeof(ItemPointerData))
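/*
 * Worked figure: sizeof(ItemPointerData) is 6 bytes, so 1GB of
 * maintenance_work_mem is enough for MAXDEADTUPLES(1GB), roughly 179 million
 * TIDs, before a round of index vacuuming is forced.
 *
 * The fragment below is a simplified sketch of the bounding described in the
 * header comment; parameter names are illustrative, and the real sizing is
 * done by compute_max_dead_tuples() (declared below), which applies further
 * limits such as MaxAllocSize.
 */
#ifdef NOT_USED
static long
max_dead_tuples_sketch(BlockNumber relblocks, bool hasindex,
					   long vac_work_mem_kb)
{
	long		maxtuples;

	if (!hasindex)
		return MaxHeapTuplesPerPage;	/* one-pass: one page at a time */

	/* start from the memory budget... */
	maxtuples = MAXDEADTUPLES(vac_work_mem_kb * 1024L);
	/* ...but never ask for more than the table could possibly need... */
	if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
		maxtuples = (long) relblocks * LAZY_ALLOC_TUPLES;
	/* ...and never less than one page's worth */
	maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);

	return maxtuples;
}
#endif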
/*
 * Shared information among parallel workers.  So this is allocated in the DSM
 * segment.
 */
typedef struct LVShared
{
	/*
	 * Target table relid and log level.  These fields are not modified
	 * during the lazy vacuum.
	 */
	Oid			relid;
	int			elevel;

	/*
	 * An indication for vacuum workers to perform either index vacuum or
	 * index cleanup.  first_time is true only if for_cleanup is true and
	 * bulk-deletion is not performed yet.
	 */
	bool		for_cleanup;
	bool		first_time;

	/*
	 * Fields for both index vacuum and cleanup.
	 *
	 * reltuples is the total number of input heap tuples.  We set either old
	 * live tuples in the index vacuum case or the new live tuples in the
	 * index cleanup case.
	 *
	 * estimated_count is true if reltuples is an estimated value.  (Note that
	 * reltuples could be -1 in this case, indicating we have no idea.)
	 */
	double		reltuples;
	bool		estimated_count;

	/*
	 * In single process lazy vacuum we could consume more memory during index
	 * vacuuming or cleanup apart from the memory for heap scanning.  In
	 * parallel vacuum, since individual vacuum workers can consume memory
	 * equal to maintenance_work_mem, the new maintenance_work_mem for each
	 * worker is set such that the parallel operation doesn't consume more
	 * memory than single process lazy vacuum.
	 */
	int			maintenance_work_mem_worker;

	/*
	 * Shared vacuum cost balance.  During parallel vacuum,
	 * VacuumSharedCostBalance points to this value and it accumulates the
	 * balance of each parallel vacuum worker.
	 */
	pg_atomic_uint32 cost_balance;

	/*
	 * Number of active parallel workers.  This is used for computing the
	 * minimum threshold of the vacuum cost balance before a worker sleeps
	 * for cost-based delay.
	 */
	pg_atomic_uint32 active_nworkers;

	/*
	 * Variables to control parallel vacuum.  We have a bitmap to indicate
	 * which index has stats in shared memory.  The set bit in the map
	 * indicates that the particular index supports a parallel vacuum.
	 */
	pg_atomic_uint32 idx;		/* counter for vacuuming and clean up */
	uint32		offset;			/* sizeof header incl. bitmap */
	bits8		bitmap[FLEXIBLE_ARRAY_MEMBER];	/* bit map of NULLs */

	/* Shared index statistics data follows at end of struct */
} LVShared;

#define SizeOfLVShared (offsetof(LVShared, bitmap) + sizeof(bits8))
#define GetSharedIndStats(s) \
	((LVSharedIndStats *)((char *)(s) + ((LVShared *)(s))->offset))
#define IndStatsIsNull(s, i) \
	(!(((LVShared *)(s))->bitmap[(i) >> 3] & (1 << ((i) & 0x07))))

/*
 * Struct for an index bulk-deletion statistic used for parallel vacuum.  This
 * is allocated in the DSM segment.
 */
typedef struct LVSharedIndStats
{
	bool		updated;		/* are the stats updated? */
	IndexBulkDeleteResult istat;
} LVSharedIndStats;
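/*
 * Illustrative sketch of the shared layout implied by the macros above: the
 * LVShared header is followed by its variable-length bitmap, and the
 * LVSharedIndStats entries start at ((char *) shared) + shared->offset, with
 * an entry present only for indexes whose bitmap bit is set.  Locating the
 * entry for index i therefore means skipping over the entries of
 * lower-numbered indexes that have one, roughly as parallel_stats_for_idx()
 * (declared below) has to do.  The fragment is a sketch only.
 */
#ifdef NOT_USED
static LVSharedIndStats *
stats_for_index_sketch(LVShared *lvshared, int idx)
{
	char	   *p;

	if (IndStatsIsNull(lvshared, idx))
		return NULL;			/* no shared stats slot for this index */

	p = (char *) GetSharedIndStats(lvshared);
	for (int i = 0; i < idx; i++)
	{
		if (!IndStatsIsNull(lvshared, i))
			p += sizeof(LVSharedIndStats);
	}

	return (LVSharedIndStats *) p;
}
#endif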
/* Struct for maintaining a parallel vacuum state. */
typedef struct LVParallelState
{
	ParallelContext *pcxt;

	/* Shared information among parallel vacuum workers */
	LVShared   *lvshared;

	/* Points to buffer usage area in DSM */
	BufferUsage *buffer_usage;

	/* Points to WAL usage area in DSM */
	WalUsage   *wal_usage;

	/*
	 * The number of indexes that support parallel index bulk-deletion and
	 * parallel index cleanup respectively.
	 */
	int			nindexes_parallel_bulkdel;
	int			nindexes_parallel_cleanup;
	int			nindexes_parallel_condcleanup;
} LVParallelState;
typedef struct LVRelState
{
	/* Target heap relation and its indexes */
	Relation	rel;
	Relation   *indrels;
	int			nindexes;

	/* Do index vacuuming/cleanup? */
	bool		do_index_vacuuming;
	bool		do_index_cleanup;
	/* Wraparound failsafe in effect? (implies !do_index_vacuuming) */
	bool		do_failsafe;

	/* Buffer access strategy and parallel state */
	BufferAccessStrategy bstrategy;
	LVParallelState *lps;

	/* Statistics from pg_class when we start out */
	BlockNumber old_rel_pages;	/* previous value of pg_class.relpages */
	double		old_live_tuples;	/* previous value of pg_class.reltuples */
	/* rel's initial relfrozenxid and relminmxid */
	TransactionId relfrozenxid;
	MultiXactId relminmxid;

	/* VACUUM operation's cutoff for pruning */
	TransactionId OldestXmin;
	/* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */
	TransactionId FreezeLimit;
	MultiXactId MultiXactCutoff;

	/* Error reporting state */
	char	   *relnamespace;
	char	   *relname;
	char	   *indname;
	BlockNumber blkno;			/* used only for heap operations */
	OffsetNumber offnum;		/* used only for heap operations */
	VacErrPhase phase;

	/*
	 * State managed by lazy_scan_heap() follows
	 */
	LVDeadTuples *dead_tuples;	/* items to vacuum from indexes */
	BlockNumber rel_pages;		/* total number of pages */
	BlockNumber scanned_pages;	/* number of pages we examined */
	BlockNumber pinskipped_pages;	/* # of pages skipped due to a pin */
	BlockNumber frozenskipped_pages;	/* # of frozen pages we skipped */
	BlockNumber tupcount_pages; /* pages whose tuples we counted */
	BlockNumber pages_removed;	/* pages removed by truncation */
	BlockNumber lpdead_item_pages;	/* # pages with LP_DEAD items */
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
	bool		lock_waiter_detected;

	/* Statistics output by us, for table */
	double		new_rel_tuples; /* new estimated total # of tuples */
	double		new_live_tuples;	/* new estimated total # of live tuples */
	/* Statistics output by index AMs */
	IndexBulkDeleteResult **indstats;

	/* Instrumentation counters */
	int			num_index_scans;
	int64		tuples_deleted; /* # deleted from table */
	int64		lpdead_items;	/* # deleted from indexes */
	int64		new_dead_tuples;	/* new estimated total # of dead items in
									 * table */
	int64		num_tuples;		/* total number of nonremovable tuples */
	int64		live_tuples;	/* live tuples (reltuples estimate) */
} LVRelState;
/*
 * State returned by lazy_scan_prune()
 */
typedef struct LVPagePruneState
{
	bool		hastup;			/* Page is truncatable? */
	bool		has_lpdead_items;	/* includes existing LP_DEAD items */

	/*
	 * State describes the proper VM bit states to set for the page following
	 * pruning and freezing.  all_visible implies !has_lpdead_items, but don't
	 * trust all_frozen result unless all_visible is also set to true.
	 */
	bool		all_visible;	/* Every item visible to all? */
	bool		all_frozen;		/* provided all_visible is also true */
	TransactionId visibility_cutoff_xid;	/* For recovery conflicts */
} LVPagePruneState;

/* Struct for saving and restoring vacuum error information. */
typedef struct LVSavedErrInfo
{
	BlockNumber blkno;
	OffsetNumber offnum;
	VacErrPhase phase;
} LVSavedErrInfo;

/* elevel controls whole VACUUM's verbosity */
static int	elevel = -1;
/* non-export function prototypes */
static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params,
						   bool aggressive);
static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
							BlockNumber blkno, Page page,
							GlobalVisState *vistest,
							LVPagePruneState *prunestate);
static void lazy_vacuum(LVRelState *vacrel, bool onecall);
static bool lazy_vacuum_all_indexes(LVRelState *vacrel);
static void lazy_vacuum_heap_rel(LVRelState *vacrel);
static int	lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
								  Buffer buffer, int tupindex,
								  Buffer *vmbuffer);
static bool lazy_check_needs_freeze(Buffer buf, bool *hastup,
									LVRelState *vacrel);
static bool lazy_check_wraparound_failsafe(LVRelState *vacrel);
static void do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel);
static void do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel);
static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers);
static void do_parallel_processing(LVRelState *vacrel,
								   LVShared *lvshared);
static void do_serial_processing_for_unsafe_indexes(LVRelState *vacrel,
													LVShared *lvshared);
static IndexBulkDeleteResult *parallel_process_one_index(Relation indrel,
														 IndexBulkDeleteResult *istat,
														 LVShared *lvshared,
														 LVSharedIndStats *shared_indstats,
														 LVRelState *vacrel);
static void lazy_cleanup_all_indexes(LVRelState *vacrel);
static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
													IndexBulkDeleteResult *istat,
													double reltuples,
													LVRelState *vacrel);
static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel,
													 IndexBulkDeleteResult *istat,
													 double reltuples,
													 bool estimated_count,
													 LVRelState *vacrel);
static bool should_attempt_truncation(LVRelState *vacrel,
									  VacuumParams *params);
static void lazy_truncate_heap(LVRelState *vacrel);
static BlockNumber count_nondeletable_pages(LVRelState *vacrel);
static long compute_max_dead_tuples(BlockNumber relblocks, bool hasindex);
static void lazy_space_alloc(LVRelState *vacrel, int nworkers,
							 BlockNumber relblocks);
static void lazy_space_free(LVRelState *vacrel);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static int	vac_cmp_itemptr(const void *left, const void *right);
static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
									 TransactionId *visibility_cutoff_xid,
									 bool *all_frozen);
static int	compute_parallel_vacuum_workers(LVRelState *vacrel,
											int nrequested,
											bool *can_parallel_vacuum);
static void update_index_statistics(LVRelState *vacrel);
static LVParallelState *begin_parallel_vacuum(LVRelState *vacrel,
											  BlockNumber nblocks,
											  int nrequested);
static void end_parallel_vacuum(LVRelState *vacrel);
static LVSharedIndStats *parallel_stats_for_idx(LVShared *lvshared, int getidx);
static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared);
static void vacuum_error_callback(void *arg);
static void update_vacuum_error_info(LVRelState *vacrel,
									 LVSavedErrInfo *saved_vacrel,
									 int phase, BlockNumber blkno,
									 OffsetNumber offnum);
static void restore_vacuum_error_info(LVRelState *vacrel,
									  const LVSavedErrInfo *saved_vacrel);
/*
 *	heap_vacuum_rel() -- perform VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
heap_vacuum_rel(Relation rel, VacuumParams *params,
				BufferAccessStrategy bstrategy)
{
	LVRelState *vacrel;
	PGRUsage	ru0;
	TimestampTz starttime = 0;
	WalUsage	walusage_start = pgWalUsage;
	WalUsage	walusage = {0, 0, 0};
	long		secs;
	int			usecs;
	double		read_rate,
				write_rate;
	bool		aggressive;		/* should we scan all unfrozen pages? */
	bool		scanned_all_unfrozen;	/* actually scanned all such pages? */
	char	  **indnames = NULL;
	TransactionId xidFullScanLimit;
	MultiXactId mxactFullScanLimit;
	BlockNumber new_rel_pages;
	BlockNumber new_rel_allvisible;
	double		new_live_tuples;
	TransactionId new_frozen_xid;
	MultiXactId new_min_multi;
	ErrorContextCallback errcallback;
	PgStat_Counter startreadtime = 0;
	PgStat_Counter startwritetime = 0;
	TransactionId OldestXmin;
	TransactionId FreezeLimit;
	MultiXactId MultiXactCutoff;

	Assert(params != NULL);
	Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT);
	Assert(params->truncate != VACOPT_TERNARY_DEFAULT);
	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
	{
		pg_rusage_init(&ru0);
		starttime = GetCurrentTimestamp();
		if (track_io_timing)
		{
			startreadtime = pgStatBlockReadTime;
			startwritetime = pgStatBlockWriteTime;
		}
	}

	if (params->options & VACOPT_VERBOSE)
		elevel = INFO;
	else
		elevel = DEBUG2;

	pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
								  RelationGetRelid(rel));

	vacuum_set_xid_limits(rel,
						  params->freeze_min_age,
						  params->freeze_table_age,
						  params->multixact_freeze_min_age,
						  params->multixact_freeze_table_age,
						  &OldestXmin, &FreezeLimit, &xidFullScanLimit,
						  &MultiXactCutoff, &mxactFullScanLimit);

	/*
	 * We request an aggressive scan if the table's frozen Xid is now older
	 * than or equal to the requested Xid full-table scan limit; or if the
	 * table's minimum MultiXactId is older than or equal to the requested
	 * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
	 */
	aggressive = TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid,
											   xidFullScanLimit);
	aggressive |= MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid,
											  mxactFullScanLimit);
	if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
		aggressive = true;
	vacrel = (LVRelState *) palloc0(sizeof(LVRelState));

	/* Set up high level stuff about rel */
	vacrel->rel = rel;
	vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes,
					 &vacrel->indrels);
	vacrel->do_index_vacuuming = true;
	vacrel->do_index_cleanup = true;
	vacrel->do_failsafe = false;
	if (params->index_cleanup == VACOPT_TERNARY_DISABLED)
	{
		vacrel->do_index_vacuuming = false;
		vacrel->do_index_cleanup = false;
	}
	vacrel->bstrategy = bstrategy;
	vacrel->old_rel_pages = rel->rd_rel->relpages;
	vacrel->old_live_tuples = rel->rd_rel->reltuples;
	vacrel->relfrozenxid = rel->rd_rel->relfrozenxid;
	vacrel->relminmxid = rel->rd_rel->relminmxid;

	/* Set cutoffs for entire VACUUM */
	vacrel->OldestXmin = OldestXmin;
	vacrel->FreezeLimit = FreezeLimit;
	vacrel->MultiXactCutoff = MultiXactCutoff;

	vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel));
	vacrel->relname = pstrdup(RelationGetRelationName(rel));
	vacrel->indname = NULL;
	vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN;

	/* Save index names iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0 &&
		vacrel->nindexes > 0)
	{
		indnames = palloc(sizeof(char *) * vacrel->nindexes);
		for (int i = 0; i < vacrel->nindexes; i++)
			indnames[i] =
				pstrdup(RelationGetRelationName(vacrel->indrels[i]));
	}
	/*
	 * Setup error traceback support for ereport().  The idea is to set up an
	 * error context callback to display additional information on any error
	 * during a vacuum.  During different phases of vacuum (heap scan, heap
	 * vacuum, index vacuum, index clean up, heap truncate), we update the
	 * error context callback to display appropriate information.
	 *
	 * Note that the index vacuum and heap vacuum phases may be called
	 * multiple times in the middle of the heap scan phase.  So the old phase
	 * information is restored at the end of those phases.
	 */
	errcallback.callback = vacuum_error_callback;
	errcallback.arg = vacrel;
	errcallback.previous = error_context_stack;
	error_context_stack = &errcallback;

	/* Do the vacuuming */
	lazy_scan_heap(vacrel, params, aggressive);

	/* Done with indexes */
	vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock);
	/*
	 * Compute whether we actually scanned all the unfrozen pages.  If we
	 * did, we can adjust relfrozenxid and relminmxid.
	 *
	 * NB: We need to check this before truncating the relation, because that
	 * will change ->rel_pages.
	 */
	if ((vacrel->scanned_pages + vacrel->frozenskipped_pages)
		< vacrel->rel_pages)
	{
		Assert(!aggressive);
		scanned_all_unfrozen = false;
	}
	else
		scanned_all_unfrozen = true;

	/*
	 * Optionally truncate the relation.
	 */
	if (should_attempt_truncation(vacrel, params))
	{
		/*
		 * Update error traceback information.  This is the last phase during
		 * which we add context information to errors, so we don't need to
		 * revert to the previous phase.
		 */
		update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE,
								 vacrel->nonempty_pages,
								 InvalidOffsetNumber);
		lazy_truncate_heap(vacrel);
	}

	/* Pop the error context stack */
	error_context_stack = errcallback.previous;

	/* Report that we are now doing final cleanup */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
	/*
	 * Update statistics in pg_class.
	 *
	 * In principle new_live_tuples could be -1 indicating that we (still)
	 * don't know the tuple count.  In practice that probably can't happen,
	 * since we'd surely have scanned some pages if the table is new and
	 * nonempty.
	 *
	 * For safety, clamp relallvisible to be not more than what we're setting
	 * relpages to.
	 *
	 * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
	 * since then we don't know for certain that all tuples have a newer xmin.
	 */
	new_rel_pages = vacrel->rel_pages;
	new_live_tuples = vacrel->new_live_tuples;

	visibilitymap_count(rel, &new_rel_allvisible, NULL);
	if (new_rel_allvisible > new_rel_pages)
		new_rel_allvisible = new_rel_pages;

	new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
	new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;

	vac_update_relstats(rel,
						new_rel_pages,
						new_live_tuples,
						new_rel_allvisible,
						vacrel->nindexes > 0,
						new_frozen_xid,
						new_min_multi,
						false);

	/*
	 * Report results to the stats collector, too.
	 *
	 * Deliberately avoid telling the stats collector about LP_DEAD items that
	 * remain in the table due to VACUUM bypassing index and heap vacuuming.
	 * ANALYZE will consider the remaining LP_DEAD items to be dead tuples. It
	 * seems like a good idea to err on the side of not vacuuming again too
	 * soon in cases where the failsafe prevented significant amounts of heap
	 * vacuuming.
	 */
	pgstat_report_vacuum(RelationGetRelid(rel),
						 rel->rd_rel->relisshared,
						 Max(new_live_tuples, 0),
						 vacrel->new_dead_tuples);
	pgstat_progress_end_command();
704 /* and log the action if appropriate */
705 if (IsAutoVacuumWorkerProcess() && params
->log_min_duration
>= 0)
707 TimestampTz endtime
= GetCurrentTimestamp();
709 if (params
->log_min_duration
== 0 ||
710 TimestampDifferenceExceeds(starttime
, endtime
,
711 params
->log_min_duration
))
716 TimestampDifference(starttime
, endtime
, &secs
, &usecs
);
718 memset(&walusage
, 0, sizeof(WalUsage
));
719 WalUsageAccumDiff(&walusage
, &pgWalUsage
, &walusage_start
);
723 if ((secs
> 0) || (usecs
> 0))
725 read_rate
= (double) BLCKSZ
* VacuumPageMiss
/ (1024 * 1024) /
726 (secs
+ usecs
/ 1000000.0);
727 write_rate
= (double) BLCKSZ
* VacuumPageDirty
/ (1024 * 1024) /
728 (secs
+ usecs
/ 1000000.0);
732 * This is pretty messy, but we split it up so that we can skip
733 * emitting individual parts of the message when not applicable.
735 initStringInfo(&buf
);
736 if (params
->is_wraparound
)
739 * While it's possible for a VACUUM to be both is_wraparound
740 * and !aggressive, that's just a corner-case -- is_wraparound
741 * implies aggressive. Produce distinct output for the corner
742 * case all the same, just in case.
745 msgfmt
= _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
747 msgfmt
= _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
752 msgfmt
= _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
754 msgfmt
= _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
756 appendStringInfo(&buf
, msgfmt
,
757 get_database_name(MyDatabaseId
),
758 vacrel
->relnamespace
,
760 vacrel
->num_index_scans
);
761 appendStringInfo(&buf
, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
762 vacrel
->pages_removed
,
764 vacrel
->pinskipped_pages
,
765 vacrel
->frozenskipped_pages
);
766 appendStringInfo(&buf
,
767 _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"),
768 (long long) vacrel
->tuples_deleted
,
769 (long long) vacrel
->new_rel_tuples
,
770 (long long) vacrel
->new_dead_tuples
,
772 appendStringInfo(&buf
,
773 _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
774 (long long) VacuumPageHit
,
775 (long long) VacuumPageMiss
,
776 (long long) VacuumPageDirty
);
777 if (vacrel
->rel_pages
> 0)
779 BlockNumber orig_rel_pages
;
781 if (vacrel
->do_index_vacuuming
)
783 msgfmt
= _(" %u pages from table (%.2f%% of total) had %lld dead item identifiers removed\n");
785 if (vacrel
->nindexes
== 0 || vacrel
->num_index_scans
== 0)
786 appendStringInfo(&buf
, _("index scan not needed:"));
788 appendStringInfo(&buf
, _("index scan needed:"));
792 msgfmt
= _(" %u pages from table (%.2f%% of total) have %lld dead item identifiers\n");
794 if (!vacrel
->do_failsafe
)
795 appendStringInfo(&buf
, _("index scan bypassed:"));
797 appendStringInfo(&buf
, _("index scan bypassed by failsafe:"));
799 orig_rel_pages
= vacrel
->rel_pages
+ vacrel
->pages_removed
;
800 appendStringInfo(&buf
, msgfmt
,
801 vacrel
->lpdead_item_pages
,
802 100.0 * vacrel
->lpdead_item_pages
/ orig_rel_pages
,
803 (long long) vacrel
->lpdead_items
);
805 for (int i
= 0; i
< vacrel
->nindexes
; i
++)
807 IndexBulkDeleteResult
*istat
= vacrel
->indstats
[i
];
812 appendStringInfo(&buf
,
813 _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
816 istat
->pages_newly_deleted
,
817 istat
->pages_deleted
,
820 appendStringInfo(&buf
, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
821 read_rate
, write_rate
);
824 appendStringInfoString(&buf
, _("I/O Timings:"));
825 if (pgStatBlockReadTime
- startreadtime
> 0)
826 appendStringInfo(&buf
, _(" read=%.3f"),
827 (double) (pgStatBlockReadTime
- startreadtime
) / 1000);
828 if (pgStatBlockWriteTime
- startwritetime
> 0)
829 appendStringInfo(&buf
, _(" write=%.3f"),
830 (double) (pgStatBlockWriteTime
- startwritetime
) / 1000);
831 appendStringInfoChar(&buf
, '\n');
833 appendStringInfo(&buf
, _("system usage: %s\n"), pg_rusage_show(&ru0
));
834 appendStringInfo(&buf
,
835 _("WAL usage: %lld records, %lld full page images, %llu bytes"),
836 (long long) walusage
.wal_records
,
837 (long long) walusage
.wal_fpi
,
838 (unsigned long long) walusage
.wal_bytes
);
841 (errmsg_internal("%s", buf
.data
)));
846 /* Cleanup index statistics and index names */
847 for (int i
= 0; i
< vacrel
->nindexes
; i
++)
849 if (vacrel
->indstats
[i
])
850 pfree(vacrel
->indstats
[i
]);
852 if (indnames
&& indnames
[i
])
/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine prunes each page in the heap, which will among other
 *		things truncate dead tuples to dead line pointers, defragment the
 *		page, and set commit status bits (see heap_page_prune).  It also
 *		builds lists of dead tuples and pages with free space, calculates
 *		statistics on the number of live tuples in the heap, and marks pages
 *		as all-visible if appropriate.  When done, or when we run low on
 *		space for dead-tuple TIDs, invoke lazy_vacuum to vacuum indexes and
 *		then vacuum the heap relation in its own second pass over the heap.
 *
 *		If the table has at least two indexes, we execute both index vacuum
 *		and index cleanup with parallel workers unless parallel vacuum is
 *		disabled.  In a parallel vacuum, we enter parallel mode and then
 *		create both the parallel context and the DSM segment before starting
 *		heap scan so that we can record dead tuples to the DSM segment.  All
 *		parallel workers are launched at beginning of index vacuuming and
 *		index cleanup and they exit once done with all indexes.  At the end
 *		of this function we exit from parallel mode.  Index bulk-deletion
 *		results are stored in the DSM segment and we update index statistics
 *		for all the indexes after exiting from parallel mode since writes are
 *		not allowed during parallel mode.
 *
 *		If there are no indexes then we can reclaim line pointers on the fly;
 *		dead line pointers need only be retained until all index pointers
 *		that reference them have been killed.
 */
static void
lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
{
	LVDeadTuples *dead_tuples;
	BlockNumber nblocks,
				blkno,
				next_unskippable_block,
				next_failsafe_block,
				next_fsm_block_to_vacuum;
	PGRUsage	ru0;
	Buffer		vmbuffer = InvalidBuffer;
	bool		skipping_blocks,
				have_vacuumed_indexes = false;
	StringInfoData buf;
	const int	initprog_index[] = {
		PROGRESS_VACUUM_PHASE,
		PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
		PROGRESS_VACUUM_MAX_DEAD_TUPLES
	};
	int64		initprog_val[3];
	GlobalVisState *vistest;

	pg_rusage_init(&ru0);

	if (aggressive)
		ereport(elevel,
				(errmsg("aggressively vacuuming \"%s.%s\"",
						vacrel->relnamespace,
						vacrel->relname)));
	else
		ereport(elevel,
				(errmsg("vacuuming \"%s.%s\"",
						vacrel->relnamespace,
						vacrel->relname)));
= RelationGetNumberOfBlocks(vacrel
->rel
);
921 next_unskippable_block
= 0;
922 next_failsafe_block
= 0;
923 next_fsm_block_to_vacuum
= 0;
924 vacrel
->rel_pages
= nblocks
;
925 vacrel
->scanned_pages
= 0;
926 vacrel
->pinskipped_pages
= 0;
927 vacrel
->frozenskipped_pages
= 0;
928 vacrel
->tupcount_pages
= 0;
929 vacrel
->pages_removed
= 0;
930 vacrel
->lpdead_item_pages
= 0;
931 vacrel
->nonempty_pages
= 0;
932 vacrel
->lock_waiter_detected
= false;
934 /* Initialize instrumentation counters */
935 vacrel
->num_index_scans
= 0;
936 vacrel
->tuples_deleted
= 0;
937 vacrel
->lpdead_items
= 0;
938 vacrel
->new_dead_tuples
= 0;
939 vacrel
->num_tuples
= 0;
940 vacrel
->live_tuples
= 0;
942 vistest
= GlobalVisTestFor(vacrel
->rel
);
944 vacrel
->indstats
= (IndexBulkDeleteResult
**)
945 palloc0(vacrel
->nindexes
* sizeof(IndexBulkDeleteResult
*));
948 * Before beginning scan, check if it's already necessary to apply
951 lazy_check_wraparound_failsafe(vacrel
);
954 * Allocate the space for dead tuples. Note that this handles parallel
955 * VACUUM initialization as part of allocating shared memory space used
958 lazy_space_alloc(vacrel
, params
->nworkers
, nblocks
);
959 dead_tuples
= vacrel
->dead_tuples
;
961 /* Report that we're scanning the heap, advertising total # of blocks */
962 initprog_val
[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP
;
963 initprog_val
[1] = nblocks
;
964 initprog_val
[2] = dead_tuples
->max_tuples
;
965 pgstat_progress_update_multi_param(3, initprog_index
, initprog_val
);
	/*
	 * Except when aggressive is set, we want to skip pages that are
	 * all-visible according to the visibility map, but only when we can skip
	 * at least SKIP_PAGES_THRESHOLD consecutive pages.  Since we're reading
	 * sequentially, the OS should be doing readahead for us, so there's no
	 * gain in skipping a page now and then; that's likely to disable
	 * readahead and so be counterproductive.  Also, skipping even a single
	 * page means that we can't update relfrozenxid, so we only want to do it
	 * if we can skip a goodly number of pages.
	 *
	 * When aggressive is set, we can't skip pages just because they are
	 * all-visible, but we can still skip pages that are all-frozen, since
	 * such pages do not need freezing and do not affect the value that we
	 * can safely set for relfrozenxid or relminmxid.
	 *
	 * Before entering the main loop, establish the invariant that
	 * next_unskippable_block is the next block number >= blkno that we can't
	 * skip based on the visibility map, either all-visible for a regular
	 * scan or all-frozen for an aggressive scan.  We set it to nblocks if
	 * there's no such block.  We also set up the skipping_blocks flag
	 * correctly at this point.
	 *
	 * Note: The value returned by visibilitymap_get_status could be slightly
	 * out-of-date, since we make this test before reading the corresponding
	 * heap page or locking the buffer.  This is OK.  If we mistakenly think
	 * that the page is all-visible or all-frozen when in fact the flag's
	 * just been cleared, we might fail to vacuum the page.  It's easy to see
	 * that skipping a page when aggressive is not set is not a very big
	 * deal; we might leave some dead tuples lying around, but the next
	 * vacuum will find them.  But even when aggressive *is* set, it's still
	 * OK if we miss a page whose all-frozen marking has just been cleared.
	 * Any new XIDs just added to that page are necessarily newer than the
	 * GlobalXmin we computed, so they'll have no effect on the value to
	 * which we can safely set relfrozenxid.  A similar argument applies for
	 * MXIDs and relminmxid.
	 *
	 * We will scan the table's last page, at least to the extent of
	 * determining whether it has tuples or not, even if it should be skipped
	 * according to the above rules; except when we've already determined
	 * that it's not worth trying to truncate the table.  This avoids having
	 * lazy_truncate_heap() take access-exclusive lock on the table to
	 * attempt a truncation that just fails immediately because there are
	 * tuples in the last page.  This is worth avoiding mainly because such a
	 * lock must be replayed on any hot standby, where it can be disruptive.
	 */
1011 if ((params
->options
& VACOPT_DISABLE_PAGE_SKIPPING
) == 0)
1013 while (next_unskippable_block
< nblocks
)
1017 vmstatus
= visibilitymap_get_status(vacrel
->rel
,
1018 next_unskippable_block
,
1022 if ((vmstatus
& VISIBILITYMAP_ALL_FROZEN
) == 0)
1027 if ((vmstatus
& VISIBILITYMAP_ALL_VISIBLE
) == 0)
1030 vacuum_delay_point();
1031 next_unskippable_block
++;
1035 if (next_unskippable_block
>= SKIP_PAGES_THRESHOLD
)
1036 skipping_blocks
= true;
1038 skipping_blocks
= false;
1040 for (blkno
= 0; blkno
< nblocks
; blkno
++)
1044 bool all_visible_according_to_vm
= false;
1045 LVPagePruneState prunestate
;
1048 * Consider need to skip blocks. See note above about forcing
1049 * scanning of last page.
1051 #define FORCE_CHECK_PAGE() \
1052 (blkno == nblocks - 1 && should_attempt_truncation(vacrel, params))
1054 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED
, blkno
);
1056 update_vacuum_error_info(vacrel
, NULL
, VACUUM_ERRCB_PHASE_SCAN_HEAP
,
1057 blkno
, InvalidOffsetNumber
);
1059 if (blkno
== next_unskippable_block
)
1061 /* Time to advance next_unskippable_block */
1062 next_unskippable_block
++;
1063 if ((params
->options
& VACOPT_DISABLE_PAGE_SKIPPING
) == 0)
1065 while (next_unskippable_block
< nblocks
)
1069 vmskipflags
= visibilitymap_get_status(vacrel
->rel
,
1070 next_unskippable_block
,
1074 if ((vmskipflags
& VISIBILITYMAP_ALL_FROZEN
) == 0)
1079 if ((vmskipflags
& VISIBILITYMAP_ALL_VISIBLE
) == 0)
1082 vacuum_delay_point();
1083 next_unskippable_block
++;
1088 * We know we can't skip the current block. But set up
1089 * skipping_blocks to do the right thing at the following blocks.
1091 if (next_unskippable_block
- blkno
> SKIP_PAGES_THRESHOLD
)
1092 skipping_blocks
= true;
1094 skipping_blocks
= false;
1097 * Normally, the fact that we can't skip this block must mean that
1098 * it's not all-visible. But in an aggressive vacuum we know only
1099 * that it's not all-frozen, so it might still be all-visible.
1101 if (aggressive
&& VM_ALL_VISIBLE(vacrel
->rel
, blkno
, &vmbuffer
))
1102 all_visible_according_to_vm
= true;
1107 * The current block is potentially skippable; if we've seen a
1108 * long enough run of skippable blocks to justify skipping it, and
1109 * we're not forced to check it, then go ahead and skip.
1110 * Otherwise, the page must be at least all-visible if not
1111 * all-frozen, so we can set all_visible_according_to_vm = true.
1113 if (skipping_blocks
&& !FORCE_CHECK_PAGE())
1116 * Tricky, tricky. If this is in aggressive vacuum, the page
1117 * must have been all-frozen at the time we checked whether it
1118 * was skippable, but it might not be any more. We must be
1119 * careful to count it as a skipped all-frozen page in that
1120 * case, or else we'll think we can't update relfrozenxid and
1121 * relminmxid. If it's not an aggressive vacuum, we don't
1122 * know whether it was all-frozen, so we have to recheck; but
1123 * in this case an approximate answer is OK.
1125 if (aggressive
|| VM_ALL_FROZEN(vacrel
->rel
, blkno
, &vmbuffer
))
1126 vacrel
->frozenskipped_pages
++;
1129 all_visible_according_to_vm
= true;
1132 vacuum_delay_point();
1135 * Regularly check if wraparound failsafe should trigger.
1137 * There is a similar check inside lazy_vacuum_all_indexes(), but
1138 * relfrozenxid might start to look dangerously old before we reach
1139 * that point. This check also provides failsafe coverage for the
1140 * one-pass strategy case.
1142 if (blkno
- next_failsafe_block
>= FAILSAFE_EVERY_PAGES
)
1144 lazy_check_wraparound_failsafe(vacrel
);
1145 next_failsafe_block
= blkno
;
1149 * Consider if we definitely have enough space to process TIDs on page
1150 * already. If we are close to overrunning the available space for
1151 * dead-tuple TIDs, pause and do a cycle of vacuuming before we tackle
1154 if ((dead_tuples
->max_tuples
- dead_tuples
->num_tuples
) < MaxHeapTuplesPerPage
&&
1155 dead_tuples
->num_tuples
> 0)
1158 * Before beginning index vacuuming, we release any pin we may
1159 * hold on the visibility map page. This isn't necessary for
1160 * correctness, but we do it anyway to avoid holding the pin
1161 * across a lengthy, unrelated operation.
1163 if (BufferIsValid(vmbuffer
))
1165 ReleaseBuffer(vmbuffer
);
1166 vmbuffer
= InvalidBuffer
;
1169 /* Remove the collected garbage tuples from table and indexes */
1170 lazy_vacuum(vacrel
, false);
1171 have_vacuumed_indexes
= true;
1174 * Vacuum the Free Space Map to make newly-freed space visible on
1175 * upper-level FSM pages. Note we have not yet processed blkno.
1177 FreeSpaceMapVacuumRange(vacrel
->rel
, next_fsm_block_to_vacuum
,
1179 next_fsm_block_to_vacuum
= blkno
;
1181 /* Report that we are once again scanning the heap */
1182 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE
,
1183 PROGRESS_VACUUM_PHASE_SCAN_HEAP
);
1187 * Set up visibility map page as needed.
1189 * Pin the visibility map page in case we need to mark the page
1190 * all-visible. In most cases this will be very cheap, because we'll
1191 * already have the correct page pinned anyway. However, it's
1192 * possible that (a) next_unskippable_block is covered by a different
1193 * VM page than the current block or (b) we released our pin and did a
1194 * cycle of index vacuuming.
1196 visibilitymap_pin(vacrel
->rel
, blkno
, &vmbuffer
);
1198 buf
= ReadBufferExtended(vacrel
->rel
, MAIN_FORKNUM
, blkno
,
1199 RBM_NORMAL
, vacrel
->bstrategy
);
1202 * We need buffer cleanup lock so that we can prune HOT chains and
1203 * defragment the page.
1205 if (!ConditionalLockBufferForCleanup(buf
))
1210 * If we're not performing an aggressive scan to guard against XID
1211 * wraparound, and we don't want to forcibly check the page, then
1212 * it's OK to skip vacuuming pages we get a lock conflict on. They
1213 * will be dealt with in some future vacuum.
1215 if (!aggressive
&& !FORCE_CHECK_PAGE())
1218 vacrel
->pinskipped_pages
++;
1223 * Read the page with share lock to see if any xids on it need to
1224 * be frozen. If not we just skip the page, after updating our
1225 * scan statistics. If there are some, we wait for cleanup lock.
1227 * We could defer the lock request further by remembering the page
1228 * and coming back to it later, or we could even register
1229 * ourselves for multiple buffers and then service whichever one
1230 * is received first. For now, this seems good enough.
1232 * If we get here with aggressive false, then we're just forcibly
1233 * checking the page, and so we don't want to insist on getting
1234 * the lock; we only need to know if the page contains tuples, so
1235 * that we can update nonempty_pages correctly. It's convenient
1236 * to use lazy_check_needs_freeze() for both situations, though.
1238 LockBuffer(buf
, BUFFER_LOCK_SHARE
);
1239 if (!lazy_check_needs_freeze(buf
, &hastup
, vacrel
))
1241 UnlockReleaseBuffer(buf
);
1242 vacrel
->scanned_pages
++;
1243 vacrel
->pinskipped_pages
++;
1245 vacrel
->nonempty_pages
= blkno
+ 1;
1251 * Here, we must not advance scanned_pages; that would amount
1252 * to claiming that the page contains no freezable tuples.
1254 UnlockReleaseBuffer(buf
);
1255 vacrel
->pinskipped_pages
++;
1257 vacrel
->nonempty_pages
= blkno
+ 1;
1260 LockBuffer(buf
, BUFFER_LOCK_UNLOCK
);
1261 LockBufferForCleanup(buf
);
1262 /* drop through to normal processing */
1266 * By here we definitely have enough dead_tuples space for whatever
1267 * LP_DEAD tids are on this page, we have the visibility map page set
1268 * up in case we need to set this page's all_visible/all_frozen bit,
1269 * and we have a super-exclusive lock. Any tuples on this page are
1270 * now sure to be "counted" by this VACUUM.
1272 * One last piece of preamble needs to take place before we can prune:
1273 * we need to consider new and empty pages.
1275 vacrel
->scanned_pages
++;
1276 vacrel
->tupcount_pages
++;
1278 page
= BufferGetPage(buf
);
1280 if (PageIsNew(page
))
1283 * All-zeroes pages can be left over if either a backend extends
1284 * the relation by a single page, but crashes before the newly
1285 * initialized page has been written out, or when bulk-extending
1286 * the relation (which creates a number of empty pages at the tail
1287 * end of the relation, but enters them into the FSM).
1289 * Note we do not enter the page into the visibilitymap. That has
1290 * the downside that we repeatedly visit this page in subsequent
1291 * vacuums, but otherwise we'll never not discover the space on a
1292 * promoted standby. The harm of repeated checking ought to
1293 * normally not be too bad - the space usually should be used at
1294 * some point, otherwise there wouldn't be any regular vacuums.
1296 * Make sure these pages are in the FSM, to ensure they can be
1297 * reused. Do that by testing if there's any space recorded for
1298 * the page. If not, enter it. We do so after releasing the lock
1299 * on the heap page, the FSM is approximate, after all.
1301 UnlockReleaseBuffer(buf
);
1303 if (GetRecordedFreeSpace(vacrel
->rel
, blkno
) == 0)
1305 Size freespace
= BLCKSZ
- SizeOfPageHeaderData
;
1307 RecordPageWithFreeSpace(vacrel
->rel
, blkno
, freespace
);
1312 if (PageIsEmpty(page
))
1314 Size freespace
= PageGetHeapFreeSpace(page
);
1317 * Empty pages are always all-visible and all-frozen (note that
1318 * the same is currently not true for new pages, see above).
1320 if (!PageIsAllVisible(page
))
1322 START_CRIT_SECTION();
1324 /* mark buffer dirty before writing a WAL record */
1325 MarkBufferDirty(buf
);
1328 * It's possible that another backend has extended the heap,
1329 * initialized the page, and then failed to WAL-log the page
1330 * due to an ERROR. Since heap extension is not WAL-logged,
1331 * recovery might try to replay our record setting the page
1332 * all-visible and find that the page isn't initialized, which
1333 * will cause a PANIC. To prevent that, check whether the
1334 * page has been previously WAL-logged, and if not, do that
1337 if (RelationNeedsWAL(vacrel
->rel
) &&
1338 PageGetLSN(page
) == InvalidXLogRecPtr
)
1339 log_newpage_buffer(buf
, true);
1341 PageSetAllVisible(page
);
1342 visibilitymap_set(vacrel
->rel
, blkno
, buf
, InvalidXLogRecPtr
,
1343 vmbuffer
, InvalidTransactionId
,
1344 VISIBILITYMAP_ALL_VISIBLE
| VISIBILITYMAP_ALL_FROZEN
);
1348 UnlockReleaseBuffer(buf
);
1349 RecordPageWithFreeSpace(vacrel
->rel
, blkno
, freespace
);
1354 * Prune and freeze tuples.
1356 * Accumulates details of remaining LP_DEAD line pointers on page in
1357 * dead tuple list. This includes LP_DEAD line pointers that we
1358 * pruned ourselves, as well as existing LP_DEAD line pointers that
1359 * were pruned some time earlier. Also considers freezing XIDs in the
1360 * tuple headers of remaining items with storage.
1362 lazy_scan_prune(vacrel
, buf
, blkno
, page
, vistest
, &prunestate
);
1364 Assert(!prunestate
.all_visible
|| !prunestate
.has_lpdead_items
);
1366 /* Remember the location of the last page with nonremovable tuples */
1367 if (prunestate
.hastup
)
1368 vacrel
->nonempty_pages
= blkno
+ 1;
1370 if (vacrel
->nindexes
== 0)
1373 * Consider the need to do page-at-a-time heap vacuuming when
1374 * using the one-pass strategy now.
1376 * The one-pass strategy will never call lazy_vacuum(). The steps
1377 * performed here can be thought of as the one-pass equivalent of
1378 * a call to lazy_vacuum().
1380 if (prunestate
.has_lpdead_items
)
1384 lazy_vacuum_heap_page(vacrel
, blkno
, buf
, 0, &vmbuffer
);
1386 /* Forget the now-vacuumed tuples */
1387 dead_tuples
->num_tuples
= 0;
1390 * Periodically perform FSM vacuuming to make newly-freed
1391 * space visible on upper FSM pages. Note we have not yet
1392 * performed FSM processing for blkno.
1394 if (blkno
- next_fsm_block_to_vacuum
>= VACUUM_FSM_EVERY_PAGES
)
1396 FreeSpaceMapVacuumRange(vacrel
->rel
, next_fsm_block_to_vacuum
,
1398 next_fsm_block_to_vacuum
= blkno
;
1402 * Now perform FSM processing for blkno, and move on to next
1405 * Our call to lazy_vacuum_heap_page() will have considered if
1406 * it's possible to set all_visible/all_frozen independently
1407 * of lazy_scan_prune(). Note that prunestate was invalidated
1408 * by lazy_vacuum_heap_page() call.
1410 freespace
= PageGetHeapFreeSpace(page
);
1412 UnlockReleaseBuffer(buf
);
1413 RecordPageWithFreeSpace(vacrel
->rel
, blkno
, freespace
);
1418 * There was no call to lazy_vacuum_heap_page() because pruning
1419 * didn't encounter/create any LP_DEAD items that needed to be
1420 * vacuumed. Prune state has not been invalidated, so proceed
1421 * with prunestate-driven visibility map and FSM steps (just like
1422 * the two-pass strategy).
1424 Assert(dead_tuples
->num_tuples
== 0);
1428 * Handle setting visibility map bit based on what the VM said about
1429 * the page before pruning started, and using prunestate
1431 if (!all_visible_according_to_vm
&& prunestate
.all_visible
)
1433 uint8 flags
= VISIBILITYMAP_ALL_VISIBLE
;
1435 if (prunestate
.all_frozen
)
1436 flags
|= VISIBILITYMAP_ALL_FROZEN
;
1439 * It should never be the case that the visibility map page is set
1440 * while the page-level bit is clear, but the reverse is allowed
1441 * (if checksums are not enabled). Regardless, set both bits so
1442 * that we get back in sync.
1444 * NB: If the heap page is all-visible but the VM bit is not set,
1445 * we don't need to dirty the heap page. However, if checksums
1446 * are enabled, we do need to make sure that the heap page is
1447 * dirtied before passing it to visibilitymap_set(), because it
1448 * may be logged. Given that this situation should only happen in
1449 * rare cases after a crash, it is not worth optimizing.
1451 PageSetAllVisible(page
);
1452 MarkBufferDirty(buf
);
1453 visibilitymap_set(vacrel
->rel
, blkno
, buf
, InvalidXLogRecPtr
,
1454 vmbuffer
, prunestate
.visibility_cutoff_xid
,
1459 * As of PostgreSQL 9.2, the visibility map bit should never be set if
1460 * the page-level bit is clear. However, it's possible that the bit
1461 * got cleared after we checked it and before we took the buffer
1462 * content lock, so we must recheck before jumping to the conclusion
1463 * that something bad has happened.
1465 else if (all_visible_according_to_vm
&& !PageIsAllVisible(page
)
1466 && VM_ALL_VISIBLE(vacrel
->rel
, blkno
, &vmbuffer
))
1468 elog(WARNING
, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1469 vacrel
->relname
, blkno
);
1470 visibilitymap_clear(vacrel
->rel
, blkno
, vmbuffer
,
1471 VISIBILITYMAP_VALID_BITS
);
1475 * It's possible for the value returned by
1476 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
1477 * wrong for us to see tuples that appear to not be visible to
1478 * everyone yet, while PD_ALL_VISIBLE is already set. The real safe
1479 * xmin value never moves backwards, but
1480 * GetOldestNonRemovableTransactionId() is conservative and sometimes
1481 * returns a value that's unnecessarily small, so if we see that
1482 * contradiction it just means that the tuples that we think are not
1483 * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
1486 * There should never be dead tuples on a page with PD_ALL_VISIBLE
1489 else if (prunestate
.has_lpdead_items
&& PageIsAllVisible(page
))
1491 elog(WARNING
, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1492 vacrel
->relname
, blkno
);
1493 PageClearAllVisible(page
);
1494 MarkBufferDirty(buf
);
1495 visibilitymap_clear(vacrel
->rel
, blkno
, vmbuffer
,
1496 VISIBILITYMAP_VALID_BITS
);
1500 * If the all-visible page is all-frozen but not marked as such yet,
1501 * mark it as all-frozen. Note that all_frozen is only valid if
1502 * all_visible is true, so we must check both.
1504 else if (all_visible_according_to_vm
&& prunestate
.all_visible
&&
1505 prunestate
.all_frozen
&&
1506 !VM_ALL_FROZEN(vacrel
->rel
, blkno
, &vmbuffer
))
1509 * We can pass InvalidTransactionId as the cutoff XID here,
1510 * because setting the all-frozen bit doesn't cause recovery
1513 visibilitymap_set(vacrel
->rel
, blkno
, buf
, InvalidXLogRecPtr
,
1514 vmbuffer
, InvalidTransactionId
,
1515 VISIBILITYMAP_ALL_FROZEN
);
1519 * Final steps for block: drop super-exclusive lock, record free space
1522 if (prunestate
.has_lpdead_items
&& vacrel
->do_index_vacuuming
)
1525 * Wait until lazy_vacuum_heap_rel() to save free space. This
1526 * doesn't just save us some cycles; it also allows us to record
1527 * any additional free space that lazy_vacuum_heap_page() will
1528 * make available in cases where it's possible to truncate the
1529 * page's line pointer array.
1531 * Note: It's not in fact 100% certain that we really will call
1532 * lazy_vacuum_heap_rel() -- lazy_vacuum() might yet opt to skip
1533 * index vacuuming (and so must skip heap vacuuming). This is
1534 * deemed okay because it only happens in emergencies, or when
1535 * there is very little free space anyway. (Besides, we start
1536 * recording free space in the FSM once index vacuuming has been
1539 * Note: The one-pass (no indexes) case is only supposed to make
1540 * it this far when there were no LP_DEAD items during pruning.
1542 Assert(vacrel
->nindexes
> 0);
1543 UnlockReleaseBuffer(buf
);
1547 Size freespace
= PageGetHeapFreeSpace(page
);
1549 UnlockReleaseBuffer(buf
);
1550 RecordPageWithFreeSpace(vacrel
->rel
, blkno
, freespace
);
1554 /* report that everything is now scanned */
1555 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED
, blkno
);
1557 /* Clear the block number information */
1558 vacrel
->blkno
= InvalidBlockNumber
;
1560 /* now we can compute the new value for pg_class.reltuples */
1561 vacrel
->new_live_tuples
= vac_estimate_reltuples(vacrel
->rel
, nblocks
,
1562 vacrel
->tupcount_pages
,
1563 vacrel
->live_tuples
);
1566 * Also compute the total number of surviving heap entries. In the
1567 * (unlikely) scenario that new_live_tuples is -1, take it as zero.
1569 vacrel
->new_rel_tuples
=
1570 Max(vacrel
->new_live_tuples
, 0) + vacrel
->new_dead_tuples
;
1573 * Release any remaining pin on visibility map page.
1575 if (BufferIsValid(vmbuffer
))
1577 ReleaseBuffer(vmbuffer
);
1578 vmbuffer
= InvalidBuffer
;
1581 /* If any tuples need to be deleted, perform final vacuum cycle */
1582 if (dead_tuples
->num_tuples
> 0)
1583 lazy_vacuum(vacrel
, !have_vacuumed_indexes
);
1586 * Vacuum the remainder of the Free Space Map. We must do this whether or
1587 * not there were indexes, and whether or not we bypassed index vacuuming.
1589 if (blkno
> next_fsm_block_to_vacuum
)
1590 FreeSpaceMapVacuumRange(vacrel
->rel
, next_fsm_block_to_vacuum
, blkno
);
1592 /* report all blocks vacuumed */
1593 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED
, blkno
);
1595 /* Do post-vacuum cleanup */
1596 if (vacrel
->nindexes
> 0 && vacrel
->do_index_cleanup
)
1597 lazy_cleanup_all_indexes(vacrel
);
1600 * Free resources managed by lazy_space_alloc(). (We must end parallel
1601 * mode/free shared memory before updating index statistics. We cannot
1602 * write while in parallel mode.)
1604 lazy_space_free(vacrel
);
1606 /* Update index statistics */
1607 if (vacrel
->nindexes
> 0 && vacrel
->do_index_cleanup
)
1608 update_index_statistics(vacrel
);
1611 * If table has no indexes and at least one heap pages was vacuumed, make
1612 * log report that lazy_vacuum_heap_rel would've made had there been
1613 * indexes (having indexes implies using the two pass strategy).
1615 * We deliberately don't do this in the case where there are indexes but
1616 * index vacuuming was bypassed. We make a similar report at the point
1617 * that index vacuuming is bypassed, but that's actually quite different
1618 * in one important sense: it shows information about work we _haven't_
1621 * log_autovacuum output does things differently; it consistently presents
1622 * information about LP_DEAD items for the VACUUM as a whole. We always
1623 * report on each round of index and heap vacuuming separately, though.
1625 if (vacrel
->nindexes
== 0 && vacrel
->lpdead_item_pages
> 0)
1627 (errmsg("\"%s\": removed %lld dead item identifiers in %u pages",
1628 vacrel
->relname
, (long long) vacrel
->lpdead_items
,
1629 vacrel
->lpdead_item_pages
)));
1631 initStringInfo(&buf
);
1632 appendStringInfo(&buf
,
1633 _("%lld dead row versions cannot be removed yet, oldest xmin: %u\n"),
1634 (long long) vacrel
->new_dead_tuples
, vacrel
->OldestXmin
);
1635 appendStringInfo(&buf
, ngettext("%u page removed.\n",
1636 "%u pages removed.\n",
1637 vacrel
->pages_removed
),
1638 vacrel
->pages_removed
);
1639 appendStringInfo(&buf
, ngettext("Skipped %u page due to buffer pins, ",
1640 "Skipped %u pages due to buffer pins, ",
1641 vacrel
->pinskipped_pages
),
1642 vacrel
->pinskipped_pages
);
1643 appendStringInfo(&buf
, ngettext("%u frozen page.\n",
1644 "%u frozen pages.\n",
1645 vacrel
->frozenskipped_pages
),
1646 vacrel
->frozenskipped_pages
);
1647 appendStringInfo(&buf
, _("%s."), pg_rusage_show(&ru0
));
1650 (errmsg("\"%s\": found %lld removable, %lld nonremovable row versions in %u out of %u pages",
1652 (long long) vacrel
->tuples_deleted
,
1653 (long long) vacrel
->num_tuples
, vacrel
->scanned_pages
,
1655 errdetail_internal("%s", buf
.data
)));
/*
 *	lazy_scan_prune() -- lazy_scan_heap() pruning and freezing.
 *
 * Caller must hold pin and buffer cleanup lock on the buffer.
 *
 * Prior to PostgreSQL 14 there were very rare cases where heap_page_prune()
 * was allowed to disagree with our HeapTupleSatisfiesVacuum() call about
 * whether or not a tuple should be considered DEAD.  This happened when an
 * inserting transaction concurrently aborted (after our heap_page_prune()
 * call, before our HeapTupleSatisfiesVacuum() call).  There was rather a lot
 * of complexity just so we could deal with tuples that were DEAD to VACUUM,
 * but nevertheless were left with storage after pruning.
 *
 * The approach we take now is to restart pruning when the race condition is
 * detected.  This allows heap_page_prune() to prune the tuples inserted by
 * the now-aborted transaction.  This is a little crude, but it guarantees
 * that any items that make it into the dead_tuples array are simple LP_DEAD
 * line pointers, and that every remaining item with tuple storage is
 * considered as a candidate for freezing.
 */
1680 lazy_scan_prune(LVRelState
*vacrel
,
1684 GlobalVisState
*vistest
,
1685 LVPagePruneState
*prunestate
)
1687 Relation rel
= vacrel
->rel
;
1688 OffsetNumber offnum
,
1691 HeapTupleData tuple
;
1699 OffsetNumber deadoffsets
[MaxHeapTuplesPerPage
];
1700 xl_heap_freeze_tuple frozen
[MaxHeapTuplesPerPage
];
1702 maxoff
= PageGetMaxOffsetNumber(page
);
1706 /* Initialize (or reset) page-level counters */
1709 new_dead_tuples
= 0;
1714 * Prune all HOT-update chains in this page.
1716 * We count tuples removed by the pruning step as tuples_deleted. Its
1717 * final value can be thought of as the number of tuples that have been
1718 * deleted from the table. It should not be confused with lpdead_items;
1719 * lpdead_items's final value can be thought of as the number of tuples
1720 * that were deleted from indexes.
1722 tuples_deleted
= heap_page_prune(rel
, buf
, vistest
,
1723 InvalidTransactionId
, 0, false,
1727 * Now scan the page to collect LP_DEAD items and check for tuples
1728 * requiring freezing among remaining tuples with storage
1730 prunestate
->hastup
= false;
1731 prunestate
->has_lpdead_items
= false;
1732 prunestate
->all_visible
= true;
1733 prunestate
->all_frozen
= true;
1734 prunestate
->visibility_cutoff_xid
= InvalidTransactionId
;
1737 for (offnum
= FirstOffsetNumber
;
1739 offnum
= OffsetNumberNext(offnum
))
1741 bool tuple_totally_frozen
;
1744 * Set the offset number so that we can display it along with any
1745 * error that occurred while processing this tuple.
1747 vacrel
->offnum
= offnum
;
1748 itemid
= PageGetItemId(page
, offnum
);
1750 if (!ItemIdIsUsed(itemid
))
1753 /* Redirect items mustn't be touched */
1754 if (ItemIdIsRedirected(itemid
))
1756 prunestate
->hastup
= true; /* page won't be truncatable */
1761 * LP_DEAD items are processed outside of the loop.
1763 * Note that we deliberately don't set hastup=true in the case of an
1764 * LP_DEAD item here, which is not how lazy_check_needs_freeze() or
1765 * count_nondeletable_pages() do it -- they only consider pages empty
1766 * when they only have LP_UNUSED items, which is important for
1769 * Our assumption is that any LP_DEAD items we encounter here will
1770 * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually
1771 * call count_nondeletable_pages(). In any case our opinion of
1772 * whether or not a page 'hastup' (which is how our caller sets its
1773 * vacrel->nonempty_pages value) is inherently race-prone. It must be
1774 * treated as advisory/unreliable, so we might as well be slightly
1777 if (ItemIdIsDead(itemid
))
1779 deadoffsets
[lpdead_items
++] = offnum
;
1780 prunestate
->all_visible
= false;
1781 prunestate
->has_lpdead_items
= true;
1785 Assert(ItemIdIsNormal(itemid
));
1787 ItemPointerSet(&(tuple
.t_self
), blkno
, offnum
);
1788 tuple
.t_data
= (HeapTupleHeader
) PageGetItem(page
, itemid
);
1789 tuple
.t_len
= ItemIdGetLength(itemid
);
1790 tuple
.t_tableOid
= RelationGetRelid(rel
);
1793 * DEAD tuples are almost always pruned into LP_DEAD line pointers by
1794 * heap_page_prune(), but it's possible that the tuple state changed
1795 * since heap_page_prune() looked. Handle that here by restarting.
1796 * (See comments at the top of function for a full explanation.)
1798 res
= HeapTupleSatisfiesVacuum(&tuple
, vacrel
->OldestXmin
, buf
);
1800 if (unlikely(res
== HEAPTUPLE_DEAD
))
1804 * The criteria for counting a tuple as live in this block need to
1805 * match what analyze.c's acquire_sample_rows() does, otherwise VACUUM
1806 * and ANALYZE may produce wildly different reltuples values, e.g.
1807 * when there are many recently-dead tuples.
1809 * The logic here is a bit simpler than acquire_sample_rows(), as
1810 * VACUUM can't run inside a transaction block, which makes some cases
1811 * impossible (e.g. in-progress insert from the same transaction).
1813 * We treat LP_DEAD items a little differently, too -- we don't count
1814 * them as dead_tuples at all (we only consider new_dead_tuples). The
1815 * outcome is no different because we assume that any LP_DEAD items we
1816 * encounter here will become LP_UNUSED inside lazy_vacuum_heap_page()
1817 * before we report anything to the stats collector. (Cases where we
1818 * bypass index vacuuming will violate our assumption, but the overall
1819 * impact of that should be negligible.)
1823 case HEAPTUPLE_LIVE
:
1826 * Count it as live. Not only is this natural, but it's also
1827 * what acquire_sample_rows() does.
1832 * Is the tuple definitely visible to all transactions?
1834 * NB: Like with per-tuple hint bits, we can't set the
1835 * PD_ALL_VISIBLE flag if the inserter committed
1836 * asynchronously. See SetHintBits for more info. Check that
1837 * the tuple is hinted xmin-committed because of that.
1839 if (prunestate
->all_visible
)
1843 if (!HeapTupleHeaderXminCommitted(tuple
.t_data
))
1845 prunestate
->all_visible
= false;
1850 * The inserter definitely committed. But is it old enough
1851 * that everyone sees it as committed?
1853 xmin
= HeapTupleHeaderGetXmin(tuple
.t_data
);
1854 if (!TransactionIdPrecedes(xmin
, vacrel
->OldestXmin
))
1856 prunestate
->all_visible
= false;
1860 /* Track newest xmin on page. */
1861 if (TransactionIdFollows(xmin
, prunestate
->visibility_cutoff_xid
))
1862 prunestate
->visibility_cutoff_xid
= xmin
;
1865 case HEAPTUPLE_RECENTLY_DEAD
:
1868 * If tuple is recently deleted then we must not remove it
1869 * from relation. (We only remove items that are LP_DEAD from
1873 prunestate
->all_visible
= false;
1875 case HEAPTUPLE_INSERT_IN_PROGRESS
:
1878 * We do not count these rows as live, because we expect the
1879 * inserting transaction to update the counters at commit, and
1880 * we assume that will happen only after we report our
1881 * results. This assumption is a bit shaky, but it is what
1882 * acquire_sample_rows() does, so be consistent.
1884 prunestate
->all_visible
= false;
1886 case HEAPTUPLE_DELETE_IN_PROGRESS
:
1887 /* This is an expected case during concurrent vacuum */
1888 prunestate
->all_visible
= false;
1891 * Count such rows as live. As above, we assume the deleting
1892 * transaction will commit and update the counters after we
1898 elog(ERROR
, "unexpected HeapTupleSatisfiesVacuum result");
1903 * Non-removable tuple (i.e. tuple with storage).
1905 * Check tuple left behind after pruning to see if needs to be frozen
1909 prunestate
->hastup
= true;
1910 if (heap_prepare_freeze_tuple(tuple
.t_data
,
1911 vacrel
->relfrozenxid
,
1913 vacrel
->FreezeLimit
,
1914 vacrel
->MultiXactCutoff
,
1916 &tuple_totally_frozen
))
1918 /* Will execute freeze below */
1919 frozen
[nfrozen
++].offset
= offnum
;
1923 * If tuple is not frozen (and not about to become frozen) then caller
1924 * had better not go on to set this page's VM bit
1926 if (!tuple_totally_frozen
)
1927 prunestate
->all_frozen
= false;
1931 * We have now divided every item on the page into either an LP_DEAD item
1932 * that will need to be vacuumed in indexes later, or a LP_NORMAL tuple
1933 * that remains and needs to be considered for freezing now (LP_UNUSED and
1934 * LP_REDIRECT items also remain, but are of no further interest to us).
1936 vacrel
->offnum
= InvalidOffsetNumber
;
1939 * Consider the need to freeze any items with tuple storage from the page
1944 Assert(prunestate
->hastup
);
1947 * At least one tuple with storage needs to be frozen -- execute that
1950 * If we need to freeze any tuples we'll mark the buffer dirty, and
1951 * write a WAL record recording the changes. We must log the changes
1952 * to be crash-safe against future truncation of CLOG.
1954 START_CRIT_SECTION();
1956 MarkBufferDirty(buf
);
1958 /* execute collected freezes */
1959 for (int i
= 0; i
< nfrozen
; i
++)
1961 HeapTupleHeader htup
;
1963 itemid
= PageGetItemId(page
, frozen
[i
].offset
);
1964 htup
= (HeapTupleHeader
) PageGetItem(page
, itemid
);
1966 heap_execute_freeze_tuple(htup
, &frozen
[i
]);
1969 /* Now WAL-log freezing if necessary */
1970 if (RelationNeedsWAL(vacrel
->rel
))
1974 recptr
= log_heap_freeze(vacrel
->rel
, buf
, vacrel
->FreezeLimit
,
1976 PageSetLSN(page
, recptr
);
1983 * The second pass over the heap can also set visibility map bits, using
1984 * the same approach. This is important when the table frequently has a
1985 * few old LP_DEAD items on each page by the time we get to it (typically
1986 * because past opportunistic pruning operations freed some non-HOT
1989 * VACUUM will call heap_page_is_all_visible() during the second pass over
1990 * the heap to determine all_visible and all_frozen for the page -- this
1991 * is a specialized version of the logic from this function. Now that
1992 * we've finished pruning and freezing, make sure that we're in total
1993 * agreement with heap_page_is_all_visible() using an assertion.
1995 #ifdef USE_ASSERT_CHECKING
1996 /* Note that all_frozen value does not matter when !all_visible */
1997 if (prunestate
->all_visible
)
1999 TransactionId cutoff
;
2002 if (!heap_page_is_all_visible(vacrel
, buf
, &cutoff
, &all_frozen
))
2005 Assert(lpdead_items
== 0);
2006 Assert(prunestate
->all_frozen
== all_frozen
);
2009 * It's possible that we froze tuples and made the page's XID cutoff
2010 * (for recovery conflict purposes) FrozenTransactionId. This is okay
2011 * because visibility_cutoff_xid will be logged by our caller in a
2014 Assert(cutoff
== FrozenTransactionId
||
2015 cutoff
== prunestate
->visibility_cutoff_xid
);
2020 * Now save details of the LP_DEAD items from the page in the dead_tuples
2021 * array. Also record that page has dead items in per-page prunestate.
2023 if (lpdead_items
> 0)
2025 LVDeadTuples
*dead_tuples
= vacrel
->dead_tuples
;
2026 ItemPointerData tmp
;
2028 Assert(!prunestate
->all_visible
);
2029 Assert(prunestate
->has_lpdead_items
);
2031 vacrel
->lpdead_item_pages
++;
2033 ItemPointerSetBlockNumber(&tmp
, blkno
);
2035 for (int i
= 0; i
< lpdead_items
; i
++)
2037 ItemPointerSetOffsetNumber(&tmp
, deadoffsets
[i
]);
2038 dead_tuples
->itemptrs
[dead_tuples
->num_tuples
++] = tmp
;
2041 Assert(dead_tuples
->num_tuples
<= dead_tuples
->max_tuples
);
2042 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES
,
2043 dead_tuples
->num_tuples
);
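
	/*
	 * For example, if this call processed block 42 and collected LP_DEAD
	 * items at offsets 3 and 7, the loop above appended the TIDs (42,3) and
	 * (42,7) to vacrel->dead_tuples, in that order.  Because the heap scan
	 * visits blocks in order and offsets ascend within a page, the array
	 * stays sorted, which lazy_vacuum_heap_page() and lazy_tid_reaped()
	 * rely on.  (Illustrative sketch; the numbers are hypothetical.)
	 */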

	/* Finally, add page-local counts to whole-VACUUM counts */
	vacrel->tuples_deleted += tuples_deleted;
	vacrel->lpdead_items += lpdead_items;
	vacrel->new_dead_tuples += new_dead_tuples;
	vacrel->num_tuples += num_tuples;
	vacrel->live_tuples += live_tuples;
}

/*
 * Remove the collected garbage tuples from the table and its indexes.
 *
 * We may choose to bypass index vacuuming at this point, though only when the
 * ongoing VACUUM operation will definitely only have one index scan/round of
 * index vacuuming.  Caller indicates whether or not this is such a VACUUM
 * operation using 'onecall' argument.
 *
 * In rare emergencies, the ongoing VACUUM operation can be made to skip both
 * index vacuuming and index cleanup at the point we're called.  This avoids
 * having the whole system refuse to allocate further XIDs/MultiXactIds due to
 * wraparound.
 */
static void
lazy_vacuum(LVRelState *vacrel, bool onecall)
{
	bool		do_bypass_optimization;

	/* Should not end up here with no indexes */
	Assert(vacrel->nindexes > 0);
	Assert(!IsParallelWorker());
	Assert(vacrel->lpdead_item_pages > 0);

	if (!vacrel->do_index_vacuuming)
	{
		Assert(!vacrel->do_index_cleanup);
		vacrel->dead_tuples->num_tuples = 0;
		return;
	}

	/*
	 * Consider bypassing index vacuuming (and heap vacuuming) entirely.
	 *
	 * We currently only do this in cases where the number of LP_DEAD items
	 * for the entire VACUUM operation is close to zero.  This avoids sharp
	 * discontinuities in the duration and overhead of successive VACUUM
	 * operations that run against the same table with a fixed workload.
	 * Ideally, successive VACUUM operations will behave as if there are
	 * exactly zero LP_DEAD items in cases where there are close to zero.
	 *
	 * This is likely to be helpful with a table that is continually affected
	 * by UPDATEs that can mostly apply the HOT optimization, but occasionally
	 * have small aberrations that lead to just a few heap pages retaining
	 * only one or two LP_DEAD items.  This is pretty common; even when the
	 * DBA goes out of their way to make UPDATEs use HOT, it is practically
	 * impossible to predict whether HOT will be applied in 100% of cases.
	 * It's far easier to ensure that 99%+ of all UPDATEs against a table use
	 * HOT through careful tuning.
	 */
	do_bypass_optimization = false;
	if (onecall && vacrel->rel_pages > 0)
	{
		BlockNumber threshold;

		Assert(vacrel->num_index_scans == 0);
		Assert(vacrel->lpdead_items == vacrel->dead_tuples->num_tuples);
		Assert(vacrel->do_index_vacuuming);
		Assert(vacrel->do_index_cleanup);

		/*
		 * This crossover point at which we'll start to do index vacuuming is
		 * expressed as a percentage of the total number of heap pages in the
		 * table that are known to have at least one LP_DEAD item.  This is
		 * much more important than the total number of LP_DEAD items, since
		 * it's a proxy for the number of heap pages whose visibility map bits
		 * cannot be set on account of bypassing index and heap vacuuming.
		 *
		 * We apply one further precautionary test: the space currently used
		 * to store the TIDs (TIDs that now all point to LP_DEAD items) must
		 * not exceed 32MB.  This limits the risk that we will bypass index
		 * vacuuming again and again until eventually there is a VACUUM whose
		 * dead_tuples space is not CPU cache resident.
		 *
		 * We don't take any special steps to remember the LP_DEAD items (such
		 * as counting them in new_dead_tuples report to the stats collector)
		 * when the optimization is applied.  Though the accounting used in
		 * analyze.c's acquire_sample_rows() will recognize the same LP_DEAD
		 * items as dead rows in its own stats collector report, that's okay.
		 * The discrepancy should be negligible.  If this optimization is ever
		 * expanded to cover more cases then this may need to be reconsidered.
		 */
		threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES;
		do_bypass_optimization =
			(vacrel->lpdead_item_pages < threshold &&
			 vacrel->lpdead_items < MAXDEADTUPLES(32L * 1024L * 1024L));
	}
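
	/*
	 * Illustrative arithmetic (a sketch, assuming the BYPASS_THRESHOLD_PAGES
	 * value of 0.02 and the MAXDEADTUPLES macro defined earlier in this file,
	 * with 6-byte ItemPointerData entries):
	 *
	 *		threshold = 1000000 * 0.02;		-- 20,000 pages with LP_DEAD items
	 *		MAXDEADTUPLES(32L * 1024L * 1024L)	-- roughly 5.5 million TIDs
	 *
	 * So a one-million-page table bypasses index vacuuming only when fewer
	 * than about 2% of its pages have LP_DEAD items and the TID array is
	 * still small enough to stay CPU cache resident.
	 */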

	if (do_bypass_optimization)
	{
		/*
		 * There are almost zero TIDs.  Behave as if there were precisely
		 * zero: bypass index vacuuming, but do index cleanup.
		 *
		 * We expect that the ongoing VACUUM operation will finish very
		 * quickly, so there is no point in considering speeding up as a
		 * failsafe against wraparound failure.  (Index cleanup is expected to
		 * finish very quickly in cases where there were no ambulkdelete()
		 * calls.)
		 */
		vacrel->do_index_vacuuming = false;
		ereport(elevel,
				(errmsg("\"%s\": index scan bypassed: %u pages from table (%.2f%% of total) have %lld dead item identifiers",
						vacrel->relname, vacrel->lpdead_item_pages,
						100.0 * vacrel->lpdead_item_pages / vacrel->rel_pages,
						(long long) vacrel->lpdead_items)));
	}
	else if (lazy_vacuum_all_indexes(vacrel))
	{
		/*
		 * We successfully completed a round of index vacuuming.  Do related
		 * heap vacuuming now.
		 */
		lazy_vacuum_heap_rel(vacrel);
	}
	else
	{
		/*
		 * We attempted index vacuuming, but didn't finish a full round/full
		 * index scan.  This happens when relfrozenxid or relminmxid is too
		 * far in the past.
		 *
		 * From this point on the VACUUM operation will do no further index
		 * vacuuming or heap vacuuming.  This VACUUM operation won't end up
		 * back here again.
		 */
		Assert(vacrel->do_failsafe);
	}

	/*
	 * Forget the LP_DEAD items that we just vacuumed (or just decided to not
	 * vacuum)
	 */
	vacrel->dead_tuples->num_tuples = 0;
}

/*
 *	lazy_vacuum_all_indexes() -- Main entry for index vacuuming
 *
 * Returns true in the common case when all indexes were successfully
 * vacuumed.  Returns false in rare cases where we determined that the ongoing
 * VACUUM operation is at risk of taking too long to finish, leading to
 * wraparound failure.
 */
static bool
lazy_vacuum_all_indexes(LVRelState *vacrel)
{
	bool		allindexes = true;

	Assert(!IsParallelWorker());
	Assert(vacrel->nindexes > 0);
	Assert(vacrel->do_index_vacuuming);
	Assert(vacrel->do_index_cleanup);
	Assert(TransactionIdIsNormal(vacrel->relfrozenxid));
	Assert(MultiXactIdIsValid(vacrel->relminmxid));

	/* Precheck for XID wraparound emergencies */
	if (lazy_check_wraparound_failsafe(vacrel))
	{
		/* Wraparound emergency -- don't even start an index scan */
		return false;
	}

	/* Report that we are now vacuuming indexes */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);

	if (!ParallelVacuumIsActive(vacrel))
	{
		for (int idx = 0; idx < vacrel->nindexes; idx++)
		{
			Relation	indrel = vacrel->indrels[idx];
			IndexBulkDeleteResult *istat = vacrel->indstats[idx];

			vacrel->indstats[idx] =
				lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples,
									  vacrel);

			if (lazy_check_wraparound_failsafe(vacrel))
			{
				/* Wraparound emergency -- end current index scan */
				allindexes = false;
				break;
			}
		}
	}
	else
	{
		/* Outsource everything to parallel variant */
		do_parallel_lazy_vacuum_all_indexes(vacrel);

		/*
		 * Do a postcheck to consider applying wraparound failsafe now.  Note
		 * that parallel VACUUM only gets the precheck and this postcheck.
		 */
		if (lazy_check_wraparound_failsafe(vacrel))
			allindexes = false;
	}

	/*
	 * We delete all LP_DEAD items from the first heap pass in all indexes on
	 * each call here (except calls where we choose to do the failsafe).  This
	 * makes the next call to lazy_vacuum_heap_rel() safe (except in the event
	 * of the failsafe triggering, which prevents the next call from taking
	 * place).
	 */
	Assert(vacrel->num_index_scans > 0 ||
		   vacrel->dead_tuples->num_tuples == vacrel->lpdead_items);
	Assert(allindexes || vacrel->do_failsafe);

	/*
	 * Increase and report the number of index scans.
	 *
	 * We deliberately include the case where we started a round of bulk
	 * deletes that we weren't able to finish due to the failsafe triggering.
	 */
	vacrel->num_index_scans++;
	pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS,
								 vacrel->num_index_scans);

	return allindexes;
}

/*
 *	lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy
 *
 * This routine marks LP_DEAD items in vacrel->dead_tuples array as LP_UNUSED.
 * Pages that never had lazy_scan_prune record LP_DEAD items are not visited
 * at all.
 *
 * We may also be able to truncate the line pointer array of the heap pages we
 * visit.  If there is a contiguous group of LP_UNUSED items at the end of the
 * array, it can be reclaimed as free space.  These LP_UNUSED items usually
 * start out as LP_DEAD items recorded by lazy_scan_prune (we set items from
 * each page to LP_UNUSED, and then consider if it's possible to truncate the
 * page's line pointer array).
 *
 * Note: the reason for doing this as a second pass is we cannot remove the
 * tuples until we've removed their index entries, and we want to process
 * index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap_rel(LVRelState *vacrel)
{
	int			tupindex;
	BlockNumber vacuumed_pages;
	PGRUsage	ru0;
	Buffer		vmbuffer = InvalidBuffer;
	LVSavedErrInfo saved_err_info;

	Assert(vacrel->do_index_vacuuming);
	Assert(vacrel->do_index_cleanup);
	Assert(vacrel->num_index_scans > 0);

	/* Report that we are now vacuuming the heap */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);

	/* Update error traceback information */
	update_vacuum_error_info(vacrel, &saved_err_info,
							 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
							 InvalidBlockNumber, InvalidOffsetNumber);

	pg_rusage_init(&ru0);

	tupindex = 0;
	vacuumed_pages = 0;

	while (tupindex < vacrel->dead_tuples->num_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;
		Size		freespace;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]);
		vacrel->blkno = tblk;
		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
								 vacrel->bstrategy);
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		tupindex = lazy_vacuum_heap_page(vacrel, tblk, buf, tupindex,
										 &vmbuffer);

		/* Now that we've vacuumed the page, record its available space */
		page = BufferGetPage(buf);
		freespace = PageGetHeapFreeSpace(page);

		UnlockReleaseBuffer(buf);
		RecordPageWithFreeSpace(vacrel->rel, tblk, freespace);
		vacuumed_pages++;
	}

	/* Clear the block number information */
	vacrel->blkno = InvalidBlockNumber;

	if (BufferIsValid(vmbuffer))
	{
		ReleaseBuffer(vmbuffer);
		vmbuffer = InvalidBuffer;
	}

	/*
	 * We set all LP_DEAD items from the first heap pass to LP_UNUSED during
	 * the second heap pass.  No more, no less.
	 */
	Assert(vacrel->num_index_scans > 1 ||
		   (tupindex == vacrel->lpdead_items &&
			vacuumed_pages == vacrel->lpdead_item_pages));

	ereport(elevel,
			(errmsg("\"%s\": removed %d dead item identifiers in %u pages",
					vacrel->relname, tupindex, vacuumed_pages),
			 errdetail_internal("%s", pg_rusage_show(&ru0))));

	/* Revert to the previous phase information for error traceback */
	restore_vacuum_error_info(vacrel, &saved_err_info);
}

/*
 *	lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the
 *						  vacrel->dead_tuples array.
 *
 * Caller must have an exclusive buffer lock on the buffer (though a
 * super-exclusive lock is also acceptable).
 *
 * tupindex is the index in vacrel->dead_tuples of the first dead tuple for
 * this page.  We assume the rest follow sequentially.  The return value is
 * the first tupindex after the tuples of this page.
 *
 * Prior to PostgreSQL 14 there were rare cases where this routine had to set
 * tuples with storage to unused.  These days it is strictly responsible for
 * marking LP_DEAD stub line pointers as unused.  This only happens for those
 * LP_DEAD items on the page that were determined to be LP_DEAD items back
 * when the same page was visited by lazy_scan_prune() (i.e. those whose TID
 * was recorded in the dead_tuples array).
 */
static int
lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
					  int tupindex, Buffer *vmbuffer)
{
	LVDeadTuples *dead_tuples = vacrel->dead_tuples;
	Page		page = BufferGetPage(buffer);
	OffsetNumber unused[MaxHeapTuplesPerPage];
	int			uncnt = 0;
	TransactionId visibility_cutoff_xid;
	bool		all_frozen;
	LVSavedErrInfo saved_err_info;

	Assert(vacrel->nindexes == 0 || vacrel->do_index_vacuuming);

	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);

	/* Update error traceback information */
	update_vacuum_error_info(vacrel, &saved_err_info,
							 VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno,
							 InvalidOffsetNumber);

	START_CRIT_SECTION();

	for (; tupindex < dead_tuples->num_tuples; tupindex++)
	{
		BlockNumber tblk;
		OffsetNumber toff;
		ItemId		itemid;

		tblk = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]);
		if (tblk != blkno)
			break;				/* past end of tuples for this block */
		toff = ItemPointerGetOffsetNumber(&dead_tuples->itemptrs[tupindex]);
		itemid = PageGetItemId(page, toff);

		Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid));
		ItemIdSetUnused(itemid);
		unused[uncnt++] = toff;
	}

	/* Attempt to truncate line pointer array now */
	PageTruncateLinePointerArray(page);

	/*
	 * Mark buffer dirty before we write WAL.
	 */
	MarkBufferDirty(buffer);

	/* XLOG stuff */
	if (RelationNeedsWAL(vacrel->rel))
	{
		xl_heap_vacuum xlrec;
		XLogRecPtr	recptr;

		xlrec.nunused = uncnt;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHeapVacuum);

		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
		XLogRegisterBufData(0, (char *) unused, uncnt * sizeof(OffsetNumber));

		recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VACUUM);

		PageSetLSN(page, recptr);
	}

	/*
	 * End critical section, so we safely can do visibility tests (which
	 * possibly need to perform IO and allocate memory!). If we crash now the
	 * page (including the corresponding vm bit) might not be marked all
	 * visible, but that's fine. A later vacuum will fix that.
	 */
	END_CRIT_SECTION();

	/*
	 * Now that we have removed the LP_DEAD items from the page, once again
	 * check if the page has become all-visible.  The page is already marked
	 * dirty, exclusively locked, and, if needed, a full page image has been
	 * emitted.
	 */
	if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
								 &all_frozen))
		PageSetAllVisible(page);

	/*
	 * All the changes to the heap page have been done. If the all-visible
	 * flag is now set, also set the VM all-visible bit (and, if possible, the
	 * all-frozen bit) unless this has already been done previously.
	 */
	if (PageIsAllVisible(page))
	{
		uint8		flags = 0;
		uint8		vm_status = visibilitymap_get_status(vacrel->rel,
														 blkno, vmbuffer);

		/* Set the VM all-frozen bit to flag, if needed */
		if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
			flags |= VISIBILITYMAP_ALL_VISIBLE;
		if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
			flags |= VISIBILITYMAP_ALL_FROZEN;

		Assert(BufferIsValid(*vmbuffer));
		if (flags != 0)
			visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
							  *vmbuffer, visibility_cutoff_xid, flags);
	}

	/* Revert to the previous phase information for error traceback */
	restore_vacuum_error_info(vacrel, &saved_err_info);
	return tupindex;
}

/*
 *	lazy_check_needs_freeze() -- scan page to see if any tuples
 *					 need to be cleaned to avoid wraparound
 *
 * Returns true if the page needs to be vacuumed using cleanup lock.
 * Also returns a flag indicating whether page contains any tuples at all.
 */
static bool
lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel)
{
	Page		page = BufferGetPage(buf);
	OffsetNumber offnum,
				maxoff;
	HeapTupleHeader tupleheader;

	*hastup = false;

	/*
	 * New and empty pages, obviously, don't contain tuples. We could make
	 * sure that the page is registered in the FSM, but it doesn't seem worth
	 * waiting for a cleanup lock just for that, especially because it's
	 * likely that the pin holder will do so.
	 */
	if (PageIsNew(page) || PageIsEmpty(page))
		return false;

	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;

		/*
		 * Set the offset number so that we can display it along with any
		 * error that occurred while processing this tuple.
		 */
		vacrel->offnum = offnum;
		itemid = PageGetItemId(page, offnum);

		/* this should match hastup test in count_nondeletable_pages() */
		if (ItemIdIsUsed(itemid))
			*hastup = true;

		/* dead and redirect items never need freezing */
		if (!ItemIdIsNormal(itemid))
			continue;

		tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);

		if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit,
									vacrel->MultiXactCutoff, buf))
			break;
	}							/* scan along page */

	/* Clear the offset information once we have processed the given page. */
	vacrel->offnum = InvalidOffsetNumber;

	return (offnum <= maxoff);
}

/*
 * Trigger the failsafe to avoid wraparound failure when vacrel table has a
 * relfrozenxid and/or relminmxid that is dangerously far in the past.
 * Triggering the failsafe makes the ongoing VACUUM bypass any further index
 * vacuuming and heap vacuuming.  Truncating the heap is also bypassed.
 *
 * Any remaining work (work that VACUUM cannot just bypass) is typically sped
 * up when the failsafe triggers.  VACUUM stops applying any cost-based delay
 * that it started out with.
 *
 * Returns true when failsafe has been triggered.
 */
static bool
lazy_check_wraparound_failsafe(LVRelState *vacrel)
{
	/* Don't warn more than once per VACUUM */
	if (vacrel->do_failsafe)
		return true;

	if (unlikely(vacuum_xid_failsafe_check(vacrel->relfrozenxid,
										   vacrel->relminmxid)))
	{
		Assert(vacrel->do_index_vacuuming);
		Assert(vacrel->do_index_cleanup);

		vacrel->do_index_vacuuming = false;
		vacrel->do_index_cleanup = false;
		vacrel->do_failsafe = true;

		ereport(WARNING,
				(errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans",
						get_database_name(MyDatabaseId),
						vacrel->relnamespace,
						vacrel->relname,
						vacrel->num_index_scans),
				 errdetail("table's relfrozenxid or relminmxid is too far in the past"),
				 errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n"
						 "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs.")));

		/* Stop applying cost limits from this point on */
		VacuumCostActive = false;
		VacuumCostBalance = 0;

		return true;
	}

	return false;
}

/*
 * Perform lazy_vacuum_all_indexes() steps in parallel
 */
static void
do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel)
{
	/* Tell parallel workers to do index vacuuming */
	vacrel->lps->lvshared->for_cleanup = false;
	vacrel->lps->lvshared->first_time = false;

	/*
	 * We can only provide an approximate value of num_heap_tuples in vacuum
	 * cases.
	 */
	vacrel->lps->lvshared->reltuples = vacrel->old_live_tuples;
	vacrel->lps->lvshared->estimated_count = true;

	do_parallel_vacuum_or_cleanup(vacrel,
								  vacrel->lps->nindexes_parallel_bulkdel);
}

/*
 * Perform lazy_cleanup_all_indexes() steps in parallel
 */
static void
do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel)
{
	int			nworkers;

	/*
	 * If parallel vacuum is active we perform index cleanup with parallel
	 * workers.
	 *
	 * Tell parallel workers to do index cleanup.
	 */
	vacrel->lps->lvshared->for_cleanup = true;
	vacrel->lps->lvshared->first_time = (vacrel->num_index_scans == 0);

	/*
	 * Now we can provide a better estimate of total number of surviving
	 * tuples (we assume indexes are more interested in that than in the
	 * number of nominally live tuples).
	 */
	vacrel->lps->lvshared->reltuples = vacrel->new_rel_tuples;
	vacrel->lps->lvshared->estimated_count =
		(vacrel->tupcount_pages < vacrel->rel_pages);

	/* Determine the number of parallel workers to launch */
	if (vacrel->lps->lvshared->first_time)
		nworkers = vacrel->lps->nindexes_parallel_cleanup +
			vacrel->lps->nindexes_parallel_condcleanup;
	else
		nworkers = vacrel->lps->nindexes_parallel_cleanup;

	do_parallel_vacuum_or_cleanup(vacrel, nworkers);
}

/*
 * Perform index vacuum or index cleanup with parallel workers.  This function
 * must be used by the parallel vacuum leader process.  The caller must set
 * lps->lvshared->for_cleanup to indicate whether to perform vacuum or
 * cleanup.
 */
static void
do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers)
{
	LVParallelState *lps = vacrel->lps;

	Assert(!IsParallelWorker());
	Assert(ParallelVacuumIsActive(vacrel));
	Assert(vacrel->nindexes > 0);

	/* The leader process will participate */
	nworkers--;

	/*
	 * It is possible that parallel context is initialized with fewer workers
	 * than the number of indexes that need a separate worker in the current
	 * phase, so we need to consider it.  See compute_parallel_vacuum_workers.
	 */
	nworkers = Min(nworkers, lps->pcxt->nworkers);
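
	/*
	 * For example (hypothetical numbers): if this phase has three indexes
	 * that each want a separate worker but the parallel context was sized
	 * for only two workers when it was created, we clamp nworkers to two and
	 * the leader picks up the remaining index through do_parallel_processing()
	 * below.
	 */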

	/* Setup the shared cost-based vacuum delay and launch workers */
	if (nworkers > 0)
	{
		if (vacrel->num_index_scans > 0)
		{
			/* Reset the parallel index processing counter */
			pg_atomic_write_u32(&(lps->lvshared->idx), 0);

			/* Reinitialize the parallel context to relaunch parallel workers */
			ReinitializeParallelDSM(lps->pcxt);
		}

		/*
		 * Set up shared cost balance and the number of active workers for
		 * vacuum delay.  We need to do this before launching workers as
		 * otherwise, they might not see the updated values for these
		 * parameters.
		 */
		pg_atomic_write_u32(&(lps->lvshared->cost_balance), VacuumCostBalance);
		pg_atomic_write_u32(&(lps->lvshared->active_nworkers), 0);

		/*
		 * The number of workers can vary between bulkdelete and cleanup
		 * phase.
		 */
		ReinitializeParallelWorkers(lps->pcxt, nworkers);

		LaunchParallelWorkers(lps->pcxt);

		if (lps->pcxt->nworkers_launched > 0)
		{
			/*
			 * Reset the local cost values for leader backend as we have
			 * already accumulated the remaining balance of heap.
			 */
			VacuumCostBalance = 0;
			VacuumCostBalanceLocal = 0;

			/* Enable shared cost balance for leader backend */
			VacuumSharedCostBalance = &(lps->lvshared->cost_balance);
			VacuumActiveNWorkers = &(lps->lvshared->active_nworkers);
		}

		if (lps->lvshared->for_cleanup)
			ereport(elevel,
					(errmsg(ngettext("launched %d parallel vacuum worker for index cleanup (planned: %d)",
									 "launched %d parallel vacuum workers for index cleanup (planned: %d)",
									 lps->pcxt->nworkers_launched),
							lps->pcxt->nworkers_launched, nworkers)));
		else
			ereport(elevel,
					(errmsg(ngettext("launched %d parallel vacuum worker for index vacuuming (planned: %d)",
									 "launched %d parallel vacuum workers for index vacuuming (planned: %d)",
									 lps->pcxt->nworkers_launched),
							lps->pcxt->nworkers_launched, nworkers)));
	}

	/* Process the indexes that can be processed by only leader process */
	do_serial_processing_for_unsafe_indexes(vacrel, lps->lvshared);

	/*
	 * Join as a parallel worker.  The leader process alone processes all the
	 * indexes in the case where no workers are launched.
	 */
	do_parallel_processing(vacrel, lps->lvshared);

	/*
	 * Next, accumulate buffer and WAL usage.  (This must wait for the workers
	 * to finish, or we might get incomplete data.)
	 */
	if (nworkers > 0)
	{
		/* Wait for all vacuum workers to finish */
		WaitForParallelWorkersToFinish(lps->pcxt);

		for (int i = 0; i < lps->pcxt->nworkers_launched; i++)
			InstrAccumParallelQuery(&lps->buffer_usage[i], &lps->wal_usage[i]);
	}

	/*
	 * Carry the shared balance value to heap scan and disable shared costing
	 */
	if (VacuumSharedCostBalance)
	{
		VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance);
		VacuumSharedCostBalance = NULL;
		VacuumActiveNWorkers = NULL;
	}
}

/*
 * Index vacuum/cleanup routine used by the leader process and parallel
 * vacuum worker processes to process the indexes in parallel.
 */
static void
do_parallel_processing(LVRelState *vacrel, LVShared *lvshared)
{
	/*
	 * Increment the active worker count if we are able to launch any worker.
	 */
	if (VacuumActiveNWorkers)
		pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);

	/* Loop until all indexes are vacuumed */
	for (;;)
	{
		int			idx;
		LVSharedIndStats *shared_istat;
		Relation	indrel;
		IndexBulkDeleteResult *istat;

		/* Get an index number to process */
		idx = pg_atomic_fetch_add_u32(&(lvshared->idx), 1);

		/* Done for all indexes? */
		if (idx >= vacrel->nindexes)
			break;

		/* Get the index statistics of this index from DSM */
		shared_istat = parallel_stats_for_idx(lvshared, idx);

		/* Skip indexes not participating in parallelism */
		if (shared_istat == NULL)
			continue;

		indrel = vacrel->indrels[idx];

		/*
		 * Skip processing indexes that are unsafe for workers (these are
		 * processed in do_serial_processing_for_unsafe_indexes() by leader)
		 */
		if (!parallel_processing_is_safe(indrel, lvshared))
			continue;

		/* Do vacuum or cleanup of the index */
		istat = (vacrel->indstats[idx]);
		vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
														   lvshared,
														   shared_istat,
														   vacrel);
	}

	/*
	 * We have completed the index vacuum so decrement the active worker
	 * count.
	 */
	if (VacuumActiveNWorkers)
		pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
}

/*
 * Vacuum or cleanup indexes that can be processed by only the leader process
 * because these indexes don't support parallel operation at that phase.
 */
static void
do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared)
{
	Assert(!IsParallelWorker());

	/*
	 * Increment the active worker count if we are able to launch any worker.
	 */
	if (VacuumActiveNWorkers)
		pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);

	for (int idx = 0; idx < vacrel->nindexes; idx++)
	{
		LVSharedIndStats *shared_istat;
		Relation	indrel;
		IndexBulkDeleteResult *istat;

		shared_istat = parallel_stats_for_idx(lvshared, idx);

		/* Skip already-complete indexes */
		if (shared_istat != NULL)
			continue;

		indrel = vacrel->indrels[idx];

		/*
		 * We're only here for the unsafe indexes
		 */
		if (parallel_processing_is_safe(indrel, lvshared))
			continue;

		/* Do vacuum or cleanup of the index */
		istat = (vacrel->indstats[idx]);
		vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
														   lvshared,
														   shared_istat,
														   vacrel);
	}

	/*
	 * We have completed the index vacuum so decrement the active worker
	 * count.
	 */
	if (VacuumActiveNWorkers)
		pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
}

/*
 * Vacuum or cleanup index either by leader process or by one of the worker
 * process.  After processing the index this function copies the index
 * statistics returned from ambulkdelete and amvacuumcleanup to the DSM
 * segment.
 */
static IndexBulkDeleteResult *
parallel_process_one_index(Relation indrel,
						   IndexBulkDeleteResult *istat,
						   LVShared *lvshared,
						   LVSharedIndStats *shared_istat,
						   LVRelState *vacrel)
{
	IndexBulkDeleteResult *istat_res;

	/*
	 * Update the pointer to the corresponding bulk-deletion result if someone
	 * has already updated it
	 */
	if (shared_istat && shared_istat->updated && istat == NULL)
		istat = &shared_istat->istat;

	/* Do vacuum or cleanup of the index */
	if (lvshared->for_cleanup)
		istat_res = lazy_cleanup_one_index(indrel, istat, lvshared->reltuples,
										   lvshared->estimated_count, vacrel);
	else
		istat_res = lazy_vacuum_one_index(indrel, istat, lvshared->reltuples,
										  vacrel);

	/*
	 * Copy the index bulk-deletion result returned from ambulkdelete and
	 * amvacuumcleanup to the DSM segment if it's the first cycle because they
	 * allocate locally and it's possible that an index will be vacuumed by a
	 * different vacuum process the next cycle.  Copying the result normally
	 * happens only the first time an index is vacuumed.  For any additional
	 * vacuum pass, we directly point to the result on the DSM segment and
	 * pass it to vacuum index APIs so that workers can update it directly.
	 *
	 * Since all vacuum workers write the bulk-deletion result at different
	 * slots we can write them without locking.
	 */
	if (shared_istat && !shared_istat->updated && istat_res != NULL)
	{
		memcpy(&shared_istat->istat, istat_res, sizeof(IndexBulkDeleteResult));
		shared_istat->updated = true;

		/* Free the locally-allocated bulk-deletion result */
		pfree(istat_res);

		/* return the pointer to the result from shared memory */
		return &shared_istat->istat;
	}

	return istat_res;
}

/*
 *	lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
 */
static void
lazy_cleanup_all_indexes(LVRelState *vacrel)
{
	Assert(!IsParallelWorker());
	Assert(vacrel->nindexes > 0);

	/* Report that we are now cleaning up indexes */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);

	if (!ParallelVacuumIsActive(vacrel))
	{
		double		reltuples = vacrel->new_rel_tuples;
		bool		estimated_count =
		vacrel->tupcount_pages < vacrel->rel_pages;

		for (int idx = 0; idx < vacrel->nindexes; idx++)
		{
			Relation	indrel = vacrel->indrels[idx];
			IndexBulkDeleteResult *istat = vacrel->indstats[idx];

			vacrel->indstats[idx] =
				lazy_cleanup_one_index(indrel, istat, reltuples,
									   estimated_count, vacrel);
		}
	}
	else
	{
		/* Outsource everything to parallel variant */
		do_parallel_lazy_cleanup_all_indexes(vacrel);
	}
}

/*
 *	lazy_vacuum_one_index() -- vacuum index relation.
 *
 *		Delete all the index entries pointing to tuples listed in
 *		dead_tuples, and update running statistics.
 *
 *		reltuples is the number of heap tuples to be passed to the
 *		bulkdelete callback.  It's always assumed to be estimated.
 *
 * Returns bulk delete stats derived from input stats
 */
static IndexBulkDeleteResult *
lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
					  double reltuples, LVRelState *vacrel)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;
	LVSavedErrInfo saved_err_info;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.analyze_only = false;
	ivinfo.report_progress = false;
	ivinfo.estimated_count = true;
	ivinfo.message_level = elevel;
	ivinfo.num_heap_tuples = reltuples;
	ivinfo.strategy = vacrel->bstrategy;

	/*
	 * Update error traceback information.
	 *
	 * The index name is saved during this phase and restored immediately
	 * after this phase.  See vacuum_error_callback.
	 */
	Assert(vacrel->indname == NULL);
	vacrel->indname = pstrdup(RelationGetRelationName(indrel));
	update_vacuum_error_info(vacrel, &saved_err_info,
							 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
							 InvalidBlockNumber, InvalidOffsetNumber);

	/* Do bulk deletion */
	istat = index_bulk_delete(&ivinfo, istat, lazy_tid_reaped,
							  (void *) vacrel->dead_tuples);

	ereport(elevel,
			(errmsg("scanned index \"%s\" to remove %d row versions",
					vacrel->indname, vacrel->dead_tuples->num_tuples),
			 errdetail_internal("%s", pg_rusage_show(&ru0))));

	/* Revert to the previous phase information for error traceback */
	restore_vacuum_error_info(vacrel, &saved_err_info);
	pfree(vacrel->indname);
	vacrel->indname = NULL;

	return istat;
}

/*
 *	lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
 *
 *		reltuples is the number of heap tuples and estimated_count is true
 *		if reltuples is an estimated value.
 *
 * Returns bulk delete stats derived from input stats
 */
static IndexBulkDeleteResult *
lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
					   double reltuples, bool estimated_count,
					   LVRelState *vacrel)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;
	LVSavedErrInfo saved_err_info;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.analyze_only = false;
	ivinfo.report_progress = false;
	ivinfo.estimated_count = estimated_count;
	ivinfo.message_level = elevel;

	ivinfo.num_heap_tuples = reltuples;
	ivinfo.strategy = vacrel->bstrategy;

	/*
	 * Update error traceback information.
	 *
	 * The index name is saved during this phase and restored immediately
	 * after this phase.  See vacuum_error_callback.
	 */
	Assert(vacrel->indname == NULL);
	vacrel->indname = pstrdup(RelationGetRelationName(indrel));
	update_vacuum_error_info(vacrel, &saved_err_info,
							 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
							 InvalidBlockNumber, InvalidOffsetNumber);

	istat = index_vacuum_cleanup(&ivinfo, istat);

	if (istat)
	{
		ereport(elevel,
				(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
						RelationGetRelationName(indrel),
						(istat)->num_index_tuples,
						(istat)->num_pages),
				 errdetail("%.0f index row versions were removed.\n"
						   "%u index pages were newly deleted.\n"
						   "%u index pages are currently deleted, of which %u are currently reusable.\n"
						   "%s.",
						   (istat)->tuples_removed,
						   (istat)->pages_newly_deleted,
						   (istat)->pages_deleted, (istat)->pages_free,
						   pg_rusage_show(&ru0))));
	}

	/* Revert to the previous phase information for error traceback */
	restore_vacuum_error_info(vacrel, &saved_err_info);
	pfree(vacrel->indname);
	vacrel->indname = NULL;

	return istat;
}

/*
 * should_attempt_truncation - should we attempt to truncate the heap?
 *
 * Don't even think about it unless we have a shot at releasing a goodly
 * number of pages.  Otherwise, the time taken isn't worth it.
 *
 * Also don't attempt it if wraparound failsafe is in effect.  It's hard to
 * predict how long lazy_truncate_heap will take.  Don't take any chances.
 * There is very little chance of truncation working out when the failsafe is
 * in effect in any case.  lazy_scan_prune makes the optimistic assumption
 * that any LP_DEAD items it encounters will always be LP_UNUSED by the time
 * we're called.
 *
 * Also don't attempt it if we are doing early pruning/vacuuming, because a
 * scan which cannot find a truncated heap page cannot determine that the
 * snapshot is too old to read that page.
 *
 * This is split out so that we can test whether truncation is going to be
 * called for before we actually do it.  If you change the logic here, be
 * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
 */
static bool
should_attempt_truncation(LVRelState *vacrel, VacuumParams *params)
{
	BlockNumber possibly_freeable;

	if (params->truncate == VACOPT_TERNARY_DISABLED)
		return false;

	if (vacrel->do_failsafe)
		return false;

	possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
	if (possibly_freeable > 0 &&
		(possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		 possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION) &&
		old_snapshot_threshold < 0)
		return true;
	else
		return false;
}
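
/*
 * Worked example for should_attempt_truncation(), assuming the
 * REL_TRUNCATE_MINIMUM and REL_TRUNCATE_FRACTION settings defined earlier in
 * this file are 1000 pages and 16 respectively: a 10,000 page table with 700
 * potentially freeable tail pages qualifies (700 >= 10000 / 16), while a
 * 100,000 page table needs at least 1,000 freeable pages, since the OR makes
 * the effective threshold the smaller of the two limits.
 */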

/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(LVRelState *vacrel)
{
	BlockNumber old_rel_pages = vacrel->rel_pages;
	BlockNumber new_rel_pages;
	int			lock_retry;

	/* Report that we are now truncating */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_TRUNCATE);

	/*
	 * Loop until no more truncating can be done.
	 */
	do
	{
		PGRUsage	ru0;

		pg_rusage_init(&ru0);

		/*
		 * We need full exclusive lock on the relation in order to do
		 * truncation. If we can't get it, give up rather than waiting --- we
		 * don't want to block other backends, and we don't want to deadlock
		 * (which is quite possible considering we already hold a lower-grade
		 * lock).
		 */
		vacrel->lock_waiter_detected = false;
		lock_retry = 0;
		while (true)
		{
			if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock))
				break;

			/*
			 * Check for interrupts while trying to (re-)acquire the exclusive
			 * lock.
			 */
			CHECK_FOR_INTERRUPTS();

			if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
								VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
			{
				/*
				 * We failed to establish the lock in the specified number of
				 * retries. This means we give up truncating.
				 */
				vacrel->lock_waiter_detected = true;
				ereport(elevel,
						(errmsg("\"%s\": stopping truncate due to conflicting lock request",
								vacrel->relname)));
				return;
			}

			pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
		}
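
		/*
		 * To make the retry budget above concrete: assuming the usual values
		 * of a 5000 ms VACUUM_TRUNCATE_LOCK_TIMEOUT polled at 50 ms
		 * VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL steps (see the #defines near the
		 * top of this file), the loop gives up after roughly 100 attempts,
		 * i.e. about five seconds of waiting for the AccessExclusiveLock.
		 */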

		/*
		 * Now that we have exclusive lock, look to see if the rel has grown
		 * whilst we were vacuuming with non-exclusive lock.  If so, give up;
		 * the newly added pages presumably contain non-deletable tuples.
		 */
		new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
		if (new_rel_pages != old_rel_pages)
		{
			/*
			 * Note: we intentionally don't update vacrel->rel_pages with the
			 * new rel size here.  If we did, it would amount to assuming that
			 * the new pages are empty, which is unlikely. Leaving the numbers
			 * alone amounts to assuming that the new pages have the same
			 * tuple density as existing ones, which is less unlikely.
			 */
			UnlockRelation(vacrel->rel, AccessExclusiveLock);
			return;
		}

		/*
		 * Scan backwards from the end to verify that the end pages actually
		 * contain no tuples.  This is *necessary*, not optional, because
		 * other backends could have added tuples to these pages whilst we
		 * were vacuuming.
		 */
		new_rel_pages = count_nondeletable_pages(vacrel);
		vacrel->blkno = new_rel_pages;

		if (new_rel_pages >= old_rel_pages)
		{
			/* can't do anything after all */
			UnlockRelation(vacrel->rel, AccessExclusiveLock);
			return;
		}

		/*
		 * Okay to truncate.
		 */
		RelationTruncate(vacrel->rel, new_rel_pages);

		/*
		 * We can release the exclusive lock as soon as we have truncated.
		 * Other backends can't safely access the relation until they have
		 * processed the smgr invalidation that smgrtruncate sent out ... but
		 * that should happen as part of standard invalidation processing once
		 * they acquire lock on the relation.
		 */
		UnlockRelation(vacrel->rel, AccessExclusiveLock);

		/*
		 * Update statistics.  Here, it *is* correct to adjust rel_pages
		 * without also touching reltuples, since the tuple count wasn't
		 * changed by the truncation.
		 */
		vacrel->pages_removed += old_rel_pages - new_rel_pages;
		vacrel->rel_pages = new_rel_pages;

		ereport(elevel,
				(errmsg("\"%s\": truncated %u to %u pages",
						vacrel->relname,
						old_rel_pages, new_rel_pages),
				 errdetail_internal("%s",
									pg_rusage_show(&ru0))));
		old_rel_pages = new_rel_pages;
	} while (new_rel_pages > vacrel->nonempty_pages &&
			 vacrel->lock_waiter_detected);
}

/*
 * Rescan end pages to verify that they are (still) empty of tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(LVRelState *vacrel)
{
	BlockNumber blkno;
	BlockNumber prefetchedUntil;
	instr_time	starttime;

	/* Initialize the starttime if we check for conflicting lock requests */
	INSTR_TIME_SET_CURRENT(starttime);

	/*
	 * Start checking blocks at what we believe relation end to be and move
	 * backwards.  (Strange coding of loop control is needed because blkno is
	 * unsigned.)  To make the scan faster, we prefetch a few blocks at a time
	 * in forward direction, so that OS-level readahead can kick in.
	 */
	blkno = vacrel->rel_pages;
	StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
					 "prefetch size must be power of 2");
	prefetchedUntil = InvalidBlockNumber;
	while (blkno > vacrel->nonempty_pages)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		hastup;

		/*
		 * Check if another process requests a lock on our relation. We are
		 * holding an AccessExclusiveLock here, so they will be waiting. We
		 * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
		 * only check if that interval has elapsed once every 32 blocks to
		 * keep the number of system calls and actual shared lock table
		 * lookups to a minimum.
		 */
		if ((blkno % 32) == 0)
		{
			instr_time	currenttime;
			instr_time	elapsed;

			INSTR_TIME_SET_CURRENT(currenttime);
			elapsed = currenttime;
			INSTR_TIME_SUBTRACT(elapsed, starttime);
			if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
				>= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
			{
				if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
				{
					ereport(elevel,
							(errmsg("\"%s\": suspending truncate due to conflicting lock request",
									vacrel->relname)));

					vacrel->lock_waiter_detected = true;
					return blkno;
				}
				starttime = currenttime;
			}
		}

		/*
		 * We don't insert a vacuum delay point here, because we have an
		 * exclusive lock on the table which we want to hold for as short a
		 * time as possible.  We still need to check for interrupts however.
		 */
		CHECK_FOR_INTERRUPTS();

		blkno--;

		/* If we haven't prefetched this lot yet, do so now. */
		if (prefetchedUntil > blkno)
		{
			BlockNumber prefetchStart;
			BlockNumber pblkno;

			prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
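
			/*
			 * Example of the masking above (assuming PREFETCH_SIZE is 32):
			 * for blkno 1000, prefetchStart becomes 1000 & ~31 = 992, so the
			 * loop below prefetches blocks 992..1000 in forward order, which
			 * is what lets OS-level readahead kick in despite the backward
			 * scan direction.
			 */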
			for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
			{
				PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
				CHECK_FOR_INTERRUPTS();
			}
			prefetchedUntil = prefetchStart;
		}

		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
								 vacrel->bstrategy);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			UnlockReleaseBuffer(buf);
			continue;
		}

		hastup = false;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/*
			 * Note: any non-unused item should be taken as a reason to keep
			 * this page.  We formerly thought that DEAD tuples could be
			 * thrown away, but that's not so, because we'd not have cleaned
			 * out their index entries.
			 */
			if (ItemIdIsUsed(itemid))
			{
				hastup = true;
				break;			/* can stop scanning */
			}
		}						/* scan along page */

		UnlockReleaseBuffer(buf);

		/* Done scanning if we found a tuple here */
		if (hastup)
			return blkno + 1;
	}

	/*
	 * If we fall out of the loop, all the previously-thought-to-be-empty
	 * pages still are; we need not bother to look at the last known-nonempty
	 * page.
	 */
	return vacrel->nonempty_pages;
}

/*
 * Return the maximum number of dead tuples we can record.
 */
static long
compute_max_dead_tuples(BlockNumber relblocks, bool hasindex)
{
	long		maxtuples;
	int			vac_work_mem = IsAutoVacuumWorkerProcess() &&
	autovacuum_work_mem != -1 ?
	autovacuum_work_mem : maintenance_work_mem;

	if (hasindex)
	{
		maxtuples = MAXDEADTUPLES(vac_work_mem * 1024L);
		maxtuples = Min(maxtuples, INT_MAX);
		maxtuples = Min(maxtuples, MAXDEADTUPLES(MaxAllocSize));

		/* curious coding here to ensure the multiplication can't overflow */
		if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
			maxtuples = relblocks * LAZY_ALLOC_TUPLES;

		/* stay sane if small maintenance_work_mem */
		maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
	}
	else
		maxtuples = MaxHeapTuplesPerPage;

	return maxtuples;
}
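
/*
 * Worked example for compute_max_dead_tuples(), assuming 8 kB pages (so
 * MaxHeapTuplesPerPage and LAZY_ALLOC_TUPLES are 291) and a 64 MB
 * maintenance_work_mem: MAXDEADTUPLES(64 MB) allows roughly 11 million
 * 6-byte TIDs, but a 10,000 block table is clamped to about
 * 10,000 * 291 = 2.9 million entries, since it can never produce more dead
 * items than that.  The numbers are illustrative only.
 */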

/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelState *vacrel, int nworkers, BlockNumber nblocks)
{
	LVDeadTuples *dead_tuples;
	long		maxtuples;

	/*
	 * Initialize state for a parallel vacuum.  As of now, only one worker can
	 * be used for an index, so we invoke parallelism only if there are at
	 * least two indexes on a table.
	 */
	if (nworkers >= 0 && vacrel->nindexes > 1 && vacrel->do_index_vacuuming)
	{
		/*
		 * Since parallel workers cannot access data in temporary tables, we
		 * can't perform parallel vacuum on them.
		 */
		if (RelationUsesLocalBuffers(vacrel->rel))
		{
			/*
			 * Give warning only if the user explicitly tries to perform a
			 * parallel vacuum on the temporary table.
			 */
			if (nworkers > 0)
				ereport(WARNING,
						(errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
								vacrel->relname)));
		}
		else
			vacrel->lps = begin_parallel_vacuum(vacrel, nblocks, nworkers);

		/* If parallel mode started, we're done */
		if (ParallelVacuumIsActive(vacrel))
			return;
	}

	maxtuples = compute_max_dead_tuples(nblocks, vacrel->nindexes > 0);

	dead_tuples = (LVDeadTuples *) palloc(SizeOfDeadTuples(maxtuples));
	dead_tuples->num_tuples = 0;
	dead_tuples->max_tuples = (int) maxtuples;

	vacrel->dead_tuples = dead_tuples;
}

/*
 * lazy_space_free - free space allocated in lazy_space_alloc
 */
static void
lazy_space_free(LVRelState *vacrel)
{
	if (!ParallelVacuumIsActive(vacrel))
		return;

	/*
	 * End parallel mode before updating index statistics as we cannot write
	 * during parallel mode.
	 */
	end_parallel_vacuum(vacrel);
}

/*
 *	lazy_tid_reaped() -- is a particular tid deletable?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		Assumes dead_tuples array is in sorted order.
 */
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
	LVDeadTuples *dead_tuples = (LVDeadTuples *) state;
	int64		litem,
				ritem,
				item;
	ItemPointer res;

	litem = itemptr_encode(&dead_tuples->itemptrs[0]);
	ritem = itemptr_encode(&dead_tuples->itemptrs[dead_tuples->num_tuples - 1]);
	item = itemptr_encode(itemptr);

	/*
	 * Doing a simple bound check before bsearch() is useful to avoid the
	 * extra cost of bsearch(), especially if dead tuples on the heap are
	 * concentrated in a certain range.  Since this function is called for
	 * every index tuple, it pays to be really fast.
	 */
	if (item < litem || item > ritem)
		return false;

	res = (ItemPointer) bsearch((void *) itemptr,
								(void *) dead_tuples->itemptrs,
								dead_tuples->num_tuples,
								sizeof(ItemPointerData),
								vac_cmp_itemptr);

	return (res != NULL);
}
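
/*
 * A note on the encoding relied on above: itemptr_encode() is assumed here to
 * pack a TID's block number into the high bits of an int64 and its offset
 * number into the low 16 bits, so plain integer comparison of encoded values
 * orders TIDs exactly like vac_cmp_itemptr() below.  For example, (block 5,
 * offset 1) encodes lower than (block 5, offset 2), which encodes lower than
 * (block 6, offset 1).
 */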

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;
	OffsetNumber loff,
				roff;

	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
	rblk = ItemPointerGetBlockNumber((ItemPointer) right);

	if (lblk < rblk)
		return -1;
	if (lblk > rblk)
		return 1;

	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
	roff = ItemPointerGetOffsetNumber((ItemPointer) right);

	if (loff < roff)
		return -1;
	if (loff > roff)
		return 1;

	return 0;
}

/*
 * Check if every tuple in the given page is visible to all current and future
 * transactions.  Also return the visibility_cutoff_xid which is the highest
 * xmin amongst the visible tuples.  Set *all_frozen to true if every tuple
 * on this page is frozen.
 */
static bool
heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
						 TransactionId *visibility_cutoff_xid,
						 bool *all_frozen)
{
	Page		page = BufferGetPage(buf);
	BlockNumber blockno = BufferGetBlockNumber(buf);
	OffsetNumber offnum,
				maxoff;
	bool		all_visible = true;

	*visibility_cutoff_xid = InvalidTransactionId;
	*all_frozen = true;

	/*
	 * This is a stripped down version of the line pointer scan in
	 * lazy_scan_heap(). So if you change anything here, also check that code.
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff && all_visible;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;
		HeapTupleData tuple;

		/*
		 * Set the offset number so that we can display it along with any
		 * error that occurred while processing this tuple.
		 */
		vacrel->offnum = offnum;
		itemid = PageGetItemId(page, offnum);

		/* Unused or redirect line pointers are of no interest */
		if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
			continue;

		ItemPointerSet(&(tuple.t_self), blockno, offnum);

		/*
		 * Dead line pointers can have index pointers pointing to them. So
		 * they can't be treated as visible
		 */
		if (ItemIdIsDead(itemid))
		{
			all_visible = false;
			*all_frozen = false;
			break;
		}

		Assert(ItemIdIsNormal(itemid));

		tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
		tuple.t_len = ItemIdGetLength(itemid);
		tuple.t_tableOid = RelationGetRelid(vacrel->rel);

		switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
		{
			case HEAPTUPLE_LIVE:
				{
					TransactionId xmin;

					/* Check comments in lazy_scan_heap. */
					if (!HeapTupleHeaderXminCommitted(tuple.t_data))
					{
						all_visible = false;
						*all_frozen = false;
						break;
					}

					/*
					 * The inserter definitely committed. But is it old enough
					 * that everyone sees it as committed?
					 */
					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
					if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
					{
						all_visible = false;
						*all_frozen = false;
						break;
					}

					/* Track newest xmin on page. */
					if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
						*visibility_cutoff_xid = xmin;

					/* Check whether this tuple is already frozen or not */
					if (all_visible && *all_frozen &&
						heap_tuple_needs_eventual_freeze(tuple.t_data))
						*all_frozen = false;
				}
				break;

			case HEAPTUPLE_DEAD:
			case HEAPTUPLE_RECENTLY_DEAD:
			case HEAPTUPLE_INSERT_IN_PROGRESS:
			case HEAPTUPLE_DELETE_IN_PROGRESS:
				{
					all_visible = false;
					*all_frozen = false;
					break;
				}
			default:
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
				break;
		}
	}							/* scan along page */

	/* Clear the offset information once we have processed the given page. */
	vacrel->offnum = InvalidOffsetNumber;

	return all_visible;
}
/*
 * Compute the number of parallel worker processes to request.  Both index
 * vacuum and index cleanup can be executed with parallel workers.  The index
 * is eligible for parallel vacuum iff its size is greater than
 * min_parallel_index_scan_size, as invoking workers for very small indexes
 * can hurt performance.
 *
 * nrequested is the number of parallel workers that the user requested.  If
 * nrequested is 0, we compute the parallel degree based on nindexes, that is,
 * the number of indexes that support parallel vacuum.  This function also
 * sets can_parallel_vacuum to remember indexes that participate in parallel
 * vacuum.
 */
static int
compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested,
								bool *can_parallel_vacuum)
{
	int			nindexes_parallel = 0;
	int			nindexes_parallel_bulkdel = 0;
	int			nindexes_parallel_cleanup = 0;
	int			parallel_workers;

	/*
	 * We don't allow performing parallel operation in standalone backend or
	 * when parallelism is disabled.
	 */
	if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0)
		return 0;

	/*
	 * Compute the number of indexes that can participate in parallel vacuum.
	 */
	for (int idx = 0; idx < vacrel->nindexes; idx++)
	{
		Relation	indrel = vacrel->indrels[idx];
		uint8		vacoptions = indrel->rd_indam->amparallelvacuumoptions;

		if (vacoptions == VACUUM_OPTION_NO_PARALLEL ||
			RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size)
			continue;

		can_parallel_vacuum[idx] = true;

		if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
			nindexes_parallel_bulkdel++;
		if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) ||
			((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
			nindexes_parallel_cleanup++;
	}

	nindexes_parallel = Max(nindexes_parallel_bulkdel,
							nindexes_parallel_cleanup);

	/* The leader process takes one index */
	nindexes_parallel--;

	/* No index supports parallel vacuum */
	if (nindexes_parallel <= 0)
		return 0;

	/* Compute the parallel degree */
	parallel_workers = (nrequested > 0) ?
		Min(nrequested, nindexes_parallel) : nindexes_parallel;

	/* Cap by max_parallel_maintenance_workers */
	parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers);

	return parallel_workers;
}
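
/*
 * Worked example (illustrative only): with three indexes that all support
 * parallel bulk deletion and cleanup, nindexes_parallel is 3; the leader
 * takes one, leaving 2.  With nrequested = 0 the degree is 2, then capped
 * by max_parallel_maintenance_workers (default 2), so 2 workers are
 * requested.  A user-supplied PARALLEL 1 (nrequested = 1) would lower this
 * to 1.
 */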
/*
 * Update index statistics in pg_class if the statistics are accurate.
 */
static void
update_index_statistics(LVRelState *vacrel)
{
	Relation   *indrels = vacrel->indrels;
	int			nindexes = vacrel->nindexes;
	IndexBulkDeleteResult **indstats = vacrel->indstats;

	Assert(!IsInParallelMode());

	for (int idx = 0; idx < nindexes; idx++)
	{
		Relation	indrel = indrels[idx];
		IndexBulkDeleteResult *istat = indstats[idx];

		if (istat == NULL || istat->estimated_count)
			continue;

		/* Update index statistics */
		vac_update_relstats(indrel,
							istat->num_pages,
							istat->num_index_tuples,
							0,
							false,
							InvalidTransactionId,
							InvalidMultiXactId,
							false);
	}
}
/*
 * This function prepares and returns parallel vacuum state if we can launch
 * even one worker.  This function is responsible for entering parallel mode,
 * creating a parallel context, and then initializing the DSM segment.
 */
static LVParallelState *
begin_parallel_vacuum(LVRelState *vacrel, BlockNumber nblocks,
					  int nrequested)
{
	LVParallelState *lps = NULL;
	Relation   *indrels = vacrel->indrels;
	int			nindexes = vacrel->nindexes;
	ParallelContext *pcxt;
	LVShared   *shared;
	LVDeadTuples *dead_tuples;
	BufferUsage *buffer_usage;
	WalUsage   *wal_usage;
	bool	   *can_parallel_vacuum;
	long		maxtuples;
	Size		est_shared;
	Size		est_deadtuples;
	int			nindexes_mwm = 0;
	int			parallel_workers = 0;
	int			querylen;

	/*
	 * A parallel vacuum must be requested and there must be indexes on the
	 * relation.
	 */
	Assert(nrequested >= 0);
	Assert(nindexes > 0);

	/*
	 * Compute the number of parallel vacuum workers to launch.
	 */
	can_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes);
	parallel_workers = compute_parallel_vacuum_workers(vacrel,
													   nrequested,
													   can_parallel_vacuum);

	/* Can't perform vacuum in parallel */
	if (parallel_workers <= 0)
	{
		pfree(can_parallel_vacuum);
		return lps;
	}

	lps = (LVParallelState *) palloc0(sizeof(LVParallelState));

	EnterParallelMode();
	pcxt = CreateParallelContext("postgres", "parallel_vacuum_main",
								 parallel_workers);
	Assert(pcxt->nworkers > 0);
	lps->pcxt = pcxt;

	/* Estimate size for shared information -- PARALLEL_VACUUM_KEY_SHARED */
	est_shared = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
	for (int idx = 0; idx < nindexes; idx++)
	{
		Relation	indrel = indrels[idx];
		uint8		vacoptions = indrel->rd_indam->amparallelvacuumoptions;

		/*
		 * Cleanup option should be either disabled, always performing in
		 * parallel, or conditionally performing in parallel.
		 */
		Assert(((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) ||
			   ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0));
		Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE);

		/* Skip indexes that don't participate in parallel vacuum */
		if (!can_parallel_vacuum[idx])
			continue;

		if (indrel->rd_indam->amusemaintenanceworkmem)
			nindexes_mwm++;

		est_shared = add_size(est_shared, sizeof(LVSharedIndStats));

		/*
		 * Remember the number of indexes that support parallel operation for
		 * each phase.
		 */
		if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
			lps->nindexes_parallel_bulkdel++;
		if ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0)
			lps->nindexes_parallel_cleanup++;
		if ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)
			lps->nindexes_parallel_condcleanup++;
	}
	shm_toc_estimate_chunk(&pcxt->estimator, est_shared);
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/* Estimate size for dead tuples -- PARALLEL_VACUUM_KEY_DEAD_TUPLES */
	maxtuples = compute_max_dead_tuples(nblocks, true);
	est_deadtuples = MAXALIGN(SizeOfDeadTuples(maxtuples));
	shm_toc_estimate_chunk(&pcxt->estimator, est_deadtuples);
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/*
	 * Estimate space for BufferUsage and WalUsage --
	 * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
	 *
	 * If there are no extensions loaded that care, we could skip this.  We
	 * have no way of knowing whether anyone's looking at pgBufferUsage or
	 * pgWalUsage, so do it unconditionally.
	 */
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
	if (debug_query_string)
	{
		querylen = strlen(debug_query_string);
		shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
		shm_toc_estimate_keys(&pcxt->estimator, 1);
	}
	else
		querylen = 0;			/* keep compiler quiet */

	InitializeParallelDSM(pcxt);

	/* Prepare shared information */
	shared = (LVShared *) shm_toc_allocate(pcxt->toc, est_shared);
	MemSet(shared, 0, est_shared);
	shared->relid = RelationGetRelid(vacrel->rel);
	shared->elevel = elevel;
	shared->maintenance_work_mem_worker =
		(nindexes_mwm > 0) ?
		maintenance_work_mem / Min(parallel_workers, nindexes_mwm) :
		maintenance_work_mem;

	pg_atomic_init_u32(&(shared->cost_balance), 0);
	pg_atomic_init_u32(&(shared->active_nworkers), 0);
	pg_atomic_init_u32(&(shared->idx), 0);
	shared->offset = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));

	/*
	 * Initialize variables for shared index statistics, set NULL bitmap and
	 * the size of stats for each index.
	 */
	memset(shared->bitmap, 0x00, BITMAPLEN(nindexes));
	for (int idx = 0; idx < nindexes; idx++)
	{
		if (!can_parallel_vacuum[idx])
			continue;

		/* Set NOT NULL as this index does support parallelism */
		shared->bitmap[idx >> 3] |= 1 << (idx & 0x07);
	}

	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
	lps->lvshared = shared;

	/* Prepare the dead tuple space */
	dead_tuples = (LVDeadTuples *) shm_toc_allocate(pcxt->toc, est_deadtuples);
	dead_tuples->max_tuples = maxtuples;
	dead_tuples->num_tuples = 0;
	MemSet(dead_tuples->itemptrs, 0, sizeof(ItemPointerData) * maxtuples);
	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_TUPLES, dead_tuples);
	vacrel->dead_tuples = dead_tuples;

	/*
	 * Allocate space for each worker's BufferUsage and WalUsage; no need to
	 * initialize them.
	 */
	buffer_usage = shm_toc_allocate(pcxt->toc,
									mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
	lps->buffer_usage = buffer_usage;
	wal_usage = shm_toc_allocate(pcxt->toc,
								 mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
	lps->wal_usage = wal_usage;

	/* Store query string for workers */
	if (debug_query_string)
	{
		char	   *sharedquery;

		sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
		memcpy(sharedquery, debug_query_string, querylen + 1);
		sharedquery[querylen] = '\0';
		shm_toc_insert(pcxt->toc,
					   PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery);
	}

	pfree(can_parallel_vacuum);
	return lps;
}
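
/*
 * Summary sketch (illustrative, not upstream text): after
 * InitializeParallelDSM() the TOC built above carries one entry per key --
 *
 *		PARALLEL_VACUUM_KEY_SHARED        fixed state plus per-index stats slots
 *		PARALLEL_VACUUM_KEY_DEAD_TUPLES   shared LVDeadTuples array
 *		PARALLEL_VACUUM_KEY_BUFFER_USAGE  one BufferUsage per worker
 *		PARALLEL_VACUUM_KEY_WAL_USAGE     one WalUsage per worker
 *		PARALLEL_VACUUM_KEY_QUERY_TEXT    query string (only if available)
 *
 * Workers look these keys up again in parallel_vacuum_main() below.
 */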
/*
 * Destroy the parallel context, and end parallel mode.
 *
 * Since writes are not allowed during parallel mode, we copy the updated
 * index statistics from DSM into local memory and then later use that to
 * update the index statistics.  One might think that we can exit from
 * parallel mode, update the index statistics and then destroy the parallel
 * context, but that won't be safe (see ExitParallelMode).
 */
static void
end_parallel_vacuum(LVRelState *vacrel)
{
	IndexBulkDeleteResult **indstats = vacrel->indstats;
	LVParallelState *lps = vacrel->lps;
	int			nindexes = vacrel->nindexes;

	Assert(!IsParallelWorker());

	/* Copy the updated statistics */
	for (int idx = 0; idx < nindexes; idx++)
	{
		LVSharedIndStats *shared_istat;

		shared_istat = parallel_stats_for_idx(lps->lvshared, idx);

		/*
		 * Skip unused slot.  The statistics of this index are already stored
		 * in local memory.
		 */
		if (shared_istat == NULL)
			continue;

		if (shared_istat->updated)
		{
			indstats[idx] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
			memcpy(indstats[idx], &(shared_istat->istat), sizeof(IndexBulkDeleteResult));
		}
		else
			indstats[idx] = NULL;
	}

	DestroyParallelContext(lps->pcxt);
	ExitParallelMode();

	/* Deactivate parallel vacuum */
	pfree(lps);
	vacrel->lps = NULL;
}
/*
 * Return shared memory statistics for index at offset 'getidx', if any
 */
static LVSharedIndStats *
parallel_stats_for_idx(LVShared *lvshared, int getidx)
{
	char	   *p;

	if (IndStatsIsNull(lvshared, getidx))
		return NULL;

	p = (char *) GetSharedIndStats(lvshared);
	for (int idx = 0; idx < getidx; idx++)
	{
		if (IndStatsIsNull(lvshared, idx))
			continue;

		p += sizeof(LVSharedIndStats);
	}

	return (LVSharedIndStats *) p;
}
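
/*
 * Layout sketch (illustrative): the PARALLEL_VACUUM_KEY_SHARED chunk begins
 * with the fixed LVShared struct and its null bitmap, followed (at
 * lvshared->offset) by one LVSharedIndStats slot for each index whose bitmap
 * bit is set, e.g.
 *
 *		[ LVShared | bitmap ][ stats for idx 0 ][ stats for idx 2 ] ...
 *
 * parallel_stats_for_idx() walks this packed array, skipping indexes whose
 * bit is clear, since non-participating indexes get no slot at all.
 */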
/*
 * Returns false if the given index can't participate in parallel index
 * vacuum or parallel index cleanup.
 */
static bool
parallel_processing_is_safe(Relation indrel, LVShared *lvshared)
{
	uint8		vacoptions = indrel->rd_indam->amparallelvacuumoptions;

	/* first_time must be true only if for_cleanup is true */
	Assert(lvshared->for_cleanup || !lvshared->first_time);

	if (lvshared->for_cleanup)
	{
		/* Skip, if the index does not support parallel cleanup */
		if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) &&
			((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0))
			return false;

		/*
		 * Skip, if the index supports parallel cleanup conditionally, but we
		 * have already processed the index (for bulkdelete).  See the
		 * comments for option VACUUM_OPTION_PARALLEL_COND_CLEANUP to know
		 * when indexes support parallel cleanup conditionally.
		 */
		if (!lvshared->first_time &&
			((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
			return false;
	}
	else if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) == 0)
	{
		/* Skip if the index does not support parallel bulk deletion */
		return false;
	}

	return true;
}
/*
 * Perform work within a launched parallel process.
 *
 * Since parallel vacuum workers perform only index vacuum or index cleanup,
 * we don't need to report progress information.
 */
void
parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
{
	Relation	rel;
	Relation   *indrels;
	LVShared   *lvshared;
	LVDeadTuples *dead_tuples;
	BufferUsage *buffer_usage;
	WalUsage   *wal_usage;
	int			nindexes;
	char	   *sharedquery;
	LVRelState	vacrel;
	ErrorContextCallback errcallback;

	lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
										   false);
	elevel = lvshared->elevel;

	if (lvshared->for_cleanup)
		elog(DEBUG1, "starting parallel vacuum worker for cleanup");
	else
		elog(DEBUG1, "starting parallel vacuum worker for bulk delete");

	/* Set debug_query_string for individual workers */
	sharedquery = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_QUERY_TEXT, true);
	debug_query_string = sharedquery;
	pgstat_report_activity(STATE_RUNNING, debug_query_string);

	/*
	 * Open table.  The lock mode is the same as the leader process.  It's
	 * okay because the lock mode does not conflict among the parallel
	 * workers.
	 */
	rel = table_open(lvshared->relid, ShareUpdateExclusiveLock);

	/*
	 * Open all indexes.  indrels are sorted in order by OID, which should
	 * match the leader's order.
	 */
	vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels);
	Assert(nindexes > 0);

	/* Set dead tuple space */
	dead_tuples = (LVDeadTuples *) shm_toc_lookup(toc,
												  PARALLEL_VACUUM_KEY_DEAD_TUPLES,
												  false);

	/* Set cost-based vacuum delay */
	VacuumCostActive = (VacuumCostDelay > 0);
	VacuumCostBalance = 0;
	VacuumPageHit = 0;
	VacuumPageMiss = 0;
	VacuumPageDirty = 0;
	VacuumCostBalanceLocal = 0;
	VacuumSharedCostBalance = &(lvshared->cost_balance);
	VacuumActiveNWorkers = &(lvshared->active_nworkers);

	vacrel.indrels = indrels;
	vacrel.nindexes = nindexes;
	/* Each parallel VACUUM worker gets its own access strategy */
	vacrel.bstrategy = GetAccessStrategy(BAS_VACUUM);
	vacrel.indstats = (IndexBulkDeleteResult **)
		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

	if (lvshared->maintenance_work_mem_worker > 0)
		maintenance_work_mem = lvshared->maintenance_work_mem_worker;

	/*
	 * Initialize vacrel for use as error callback arg by parallel worker.
	 */
	vacrel.relnamespace = get_namespace_name(RelationGetNamespace(rel));
	vacrel.relname = pstrdup(RelationGetRelationName(rel));
	vacrel.indname = NULL;
	vacrel.phase = VACUUM_ERRCB_PHASE_UNKNOWN;	/* Not yet processing */
	vacrel.dead_tuples = dead_tuples;

	/* Setup error traceback support for ereport() */
	errcallback.callback = vacuum_error_callback;
	errcallback.arg = &vacrel;
	errcallback.previous = error_context_stack;
	error_context_stack = &errcallback;

	/* Prepare to track buffer usage during parallel execution */
	InstrStartParallelQuery();

	/* Process indexes to perform vacuum/cleanup */
	do_parallel_processing(&vacrel, lvshared);

	/* Report buffer/WAL usage during parallel execution */
	buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
	wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
	InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
						  &wal_usage[ParallelWorkerNumber]);

	/* Pop the error context stack */
	error_context_stack = errcallback.previous;

	vac_close_indexes(nindexes, indrels, RowExclusiveLock);
	table_close(rel, ShareUpdateExclusiveLock);
	FreeAccessStrategy(vacrel.bstrategy);
	pfree(vacrel.indstats);
}
/*
 * Error context callback for errors occurring during vacuum.
 */
static void
vacuum_error_callback(void *arg)
{
	LVRelState *errinfo = arg;

	switch (errinfo->phase)
	{
		case VACUUM_ERRCB_PHASE_SCAN_HEAP:
			if (BlockNumberIsValid(errinfo->blkno))
			{
				if (OffsetNumberIsValid(errinfo->offnum))
					errcontext("while scanning block %u and offset %u of relation \"%s.%s\"",
							   errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
				else
					errcontext("while scanning block %u of relation \"%s.%s\"",
							   errinfo->blkno, errinfo->relnamespace, errinfo->relname);
			}
			else
				errcontext("while scanning relation \"%s.%s\"",
						   errinfo->relnamespace, errinfo->relname);
			break;

		case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
			if (BlockNumberIsValid(errinfo->blkno))
			{
				if (OffsetNumberIsValid(errinfo->offnum))
					errcontext("while vacuuming block %u and offset %u of relation \"%s.%s\"",
							   errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
				else
					errcontext("while vacuuming block %u of relation \"%s.%s\"",
							   errinfo->blkno, errinfo->relnamespace, errinfo->relname);
			}
			else
				errcontext("while vacuuming relation \"%s.%s\"",
						   errinfo->relnamespace, errinfo->relname);
			break;

		case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
			errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
					   errinfo->indname, errinfo->relnamespace, errinfo->relname);
			break;

		case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
			errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
					   errinfo->indname, errinfo->relnamespace, errinfo->relname);
			break;

		case VACUUM_ERRCB_PHASE_TRUNCATE:
			if (BlockNumberIsValid(errinfo->blkno))
				errcontext("while truncating relation \"%s.%s\" to %u blocks",
						   errinfo->relnamespace, errinfo->relname, errinfo->blkno);
			break;

		case VACUUM_ERRCB_PHASE_UNKNOWN:
		default:
			return;				/* do nothing; the errinfo may not be
								 * initialized */
	}
}
/*
 * Updates the information required for the vacuum error callback.  This also
 * saves the current information, which can later be restored via
 * restore_vacuum_error_info.
 */
static void
update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel,
						 int phase, BlockNumber blkno, OffsetNumber offnum)
{
	if (saved_vacrel)
	{
		saved_vacrel->offnum = vacrel->offnum;
		saved_vacrel->blkno = vacrel->blkno;
		saved_vacrel->phase = vacrel->phase;
	}

	vacrel->blkno = blkno;
	vacrel->offnum = offnum;
	vacrel->phase = phase;
}
/*
 * Restores the vacuum information saved via a prior call to update_vacuum_error_info.
 */
static void
restore_vacuum_error_info(LVRelState *vacrel,
						  const LVSavedErrInfo *saved_vacrel)
{
	vacrel->blkno = saved_vacrel->blkno;
	vacrel->offnum = saved_vacrel->offnum;
	vacrel->phase = saved_vacrel->phase;
}
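
/*
 * Typical usage sketch (illustrative only): callers bracket a phase with the
 * two helpers above so that errors raised inside report the right context:
 *
 *		LVSavedErrInfo saved_err_info;
 *
 *		update_vacuum_error_info(vacrel, &saved_err_info,
 *								 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
 *								 blkno, InvalidOffsetNumber);
 *		... do per-block work ...
 *		restore_vacuum_error_info(vacrel, &saved_err_info);
 */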