/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead tuple
 * TIDs.  We want to ensure we can vacuum even the very largest relations with
 * finite memory space usage.  To do that, we set upper bounds on the number of
 * tuples we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem (or perhaps
 * autovacuum_work_mem) memory space to keep track of dead tuples.  We
 * initially allocate an array of TIDs of that size, with an upper limit that
 * depends on table size (this limit ensures we don't allocate a huge area
 * uselessly for vacuuming small tables).  If the array threatens to overflow,
 * we suspend the heap scan phase and perform a pass of index cleanup and page
 * compaction, then resume the heap scan with an empty TID array.
 *
 * If we're processing a table with no indexes, we can just vacuum each page
 * as we go; there's no need to save up multiple tuples to minimize the number
 * of index scans performed.  So we don't use maintenance_work_mem memory for
 * the TID array, just enough to hold as many heap tuples as fit on one page.
 *
 * Lazy vacuum supports parallel execution with parallel worker processes.  In
 * a parallel vacuum, we perform both index vacuum and index cleanup with
 * parallel worker processes.  Individual indexes are processed by one vacuum
 * process.  At the beginning of a lazy vacuum (at lazy_scan_heap) we prepare
 * the parallel context and initialize the DSM segment that contains shared
 * information as well as the memory space for storing dead tuples.  When
 * starting either index vacuum or index cleanup, we launch parallel worker
 * processes.  Once all indexes are processed the parallel worker processes
 * exit.  After that, the leader process re-initializes the parallel context
 * so that it can use the same DSM for multiple passes of index vacuum and
 * for performing index cleanup.  For updating the index statistics, we need
 * to update the system table and since updates are not allowed during
 * parallel mode we update the index statistics after exiting from the
 * parallel mode.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/vacuumlazy.c
 *
 *-------------------------------------------------------------------------
 */
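/*
 * Added illustrative note (not part of the original file): heap_vacuum_rel()
 * below is not called directly by the VACUUM command.  It is reached through
 * the table access method interface, roughly as sketched here; the sketch is
 * a simplified assumption about the surrounding call path.
 */
#if 0
	/* in commands/vacuum.c, for each relation being vacuumed: */
	table_relation_vacuum(rel, params, vac_strategy);
	/* ...which, for heap tables, dispatches to heapam_handler.c's */
	/* .relation_vacuum = heap_vacuum_rel callback. */
#endif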
#include "access/amapi.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/parallel.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/index.h"
#include "catalog/storage.h"
#include "commands/dbcommands.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "optimizer/paths.h"
#include "portability/instr_time.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "tcop/tcopprot.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/timestamp.h"
/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16

/*
 * Timing parameters for truncate locking heuristics.
 *
 * These were not exposed as user tunable GUC values because it didn't seem
 * that the potential for improvement was great enough to merit the cost of
 * supporting them.
 */
#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL		20	/* ms */
#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL		50	/* ms */
#define VACUUM_TRUNCATE_LOCK_TIMEOUT			5000	/* ms */

/*
 * When a table has no indexes, vacuum the FSM after every 8GB, approximately
 * (it won't be exact because we only vacuum FSM after processing a heap page
 * that has some removable tuples).  When there are indexes, this is ignored,
 * and we vacuum FSM after each index/heap cleaning pass.
 */
#define VACUUM_FSM_EVERY_PAGES \
	((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
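/*
 * Added illustrative arithmetic (not in the original): with the default
 * BLCKSZ of 8192, VACUUM_FSM_EVERY_PAGES works out to
 * ((uint64) 8 * 1024 * 1024 * 1024) / 8192 = 1048576 heap blocks between
 * incremental FSM vacuums for index-less tables.
 */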
/*
 * Guesstimation of number of dead tuples per page.  This is used to
 * provide an upper limit to memory allocated when vacuuming small
 * tables.
 */
#define LAZY_ALLOC_TUPLES		MaxHeapTuplesPerPage

/*
 * Before we consider skipping a page that's marked as clean in
 * visibility map, we must've seen at least this many clean pages.
 */
#define SKIP_PAGES_THRESHOLD	((BlockNumber) 32)

/*
 * Size of the prefetch window for lazy vacuum backwards truncation scan.
 * Needs to be a power of 2.
 */
#define PREFETCH_SIZE			((BlockNumber) 32)

/*
 * DSM keys for parallel vacuum.  Unlike other parallel execution code, since
 * we don't need to worry about DSM keys conflicting with plan_node_id we can
 * use small integers.
 */
#define PARALLEL_VACUUM_KEY_SHARED			1
#define PARALLEL_VACUUM_KEY_DEAD_TUPLES		2
#define PARALLEL_VACUUM_KEY_QUERY_TEXT		3
#define PARALLEL_VACUUM_KEY_BUFFER_USAGE	4
#define PARALLEL_VACUUM_KEY_WAL_USAGE		5
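/*
 * Added illustrative sketch (not in the original): these keys are used with
 * the shm_toc API when the leader sets up the DSM segment, roughly as below.
 * The variable names are assumptions for illustration; the real code lives
 * in begin_parallel_vacuum() and in the parallel worker entry point.
 */
#if 0
	shm_toc_estimate_chunk(&pcxt->estimator, est_shared);
	shm_toc_estimate_keys(&pcxt->estimator, 1);
	/* ... estimate the other keys similarly ... */
	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
	/* and in a worker: */
	lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED, false);
#endif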
/*
 * Macro to check if we are in a parallel vacuum.  If true, we are in the
 * parallel mode and the DSM segment is initialized.
 */
#define ParallelVacuumIsActive(vacrel) ((vacrel)->lps != NULL)
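/*
 * Added illustrative note (not in the original): callers below use this test
 * to choose between the serial per-index loop and the parallel path; e.g.
 * lazy_vacuum_all_indexes() falls through to
 * do_parallel_lazy_vacuum_all_indexes() when ParallelVacuumIsActive(vacrel)
 * is true.
 */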
/* Phases of vacuum during which we report error context. */
typedef enum
{
	VACUUM_ERRCB_PHASE_UNKNOWN,
	VACUUM_ERRCB_PHASE_SCAN_HEAP,
	VACUUM_ERRCB_PHASE_VACUUM_INDEX,
	VACUUM_ERRCB_PHASE_VACUUM_HEAP,
	VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
	VACUUM_ERRCB_PHASE_TRUNCATE
} VacErrPhase;
/*
 * LVDeadTuples stores the dead tuple TIDs collected during the heap scan.
 * This is allocated in the DSM segment in parallel mode and in local memory
 * in non-parallel mode.
 */
typedef struct LVDeadTuples
{
	int			max_tuples;		/* # slots allocated in array */
	int			num_tuples;		/* current # of entries */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	ItemPointerData itemptrs[FLEXIBLE_ARRAY_MEMBER];	/* array of
														 * ItemPointerData */
} LVDeadTuples;

/* The dead tuple space consists of LVDeadTuples and dead tuple TIDs */
#define SizeOfDeadTuples(cnt) \
	add_size(offsetof(LVDeadTuples, itemptrs), \
			 mul_size(sizeof(ItemPointerData), cnt))
#define MAXDEADTUPLES(max_size) \
		(((max_size) - offsetof(LVDeadTuples, itemptrs)) / sizeof(ItemPointerData))
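/*
 * Added illustrative sizing sketch (an assumption-based example, not part of
 * the original file): with the default maintenance_work_mem of 64MB, the
 * dead tuple array can hold about MAXDEADTUPLES(64 * 1024 * 1024) TIDs.
 * Since sizeof(ItemPointerData) is 6 bytes, that is roughly 11 million
 * entries before the heap scan must pause for a round of index vacuuming.
 */
#if 0
	long		max_dead = MAXDEADTUPLES(64L * 1024L * 1024L);	/* ~11.1 million */
	Size		bytes = SizeOfDeadTuples(max_dead); /* back to ~64MB */
#endif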
/*
 * Shared information among parallel workers.  So this is allocated in the DSM
 * segment.
 */
typedef struct LVShared
{
	/*
	 * Target table relid and log level.  These fields are not modified during
	 * the lazy vacuum.
	 */
	Oid			relid;
	int			elevel;

	/*
	 * An indication for vacuum workers to perform either index vacuum or
	 * index cleanup.  first_time is true only if for_cleanup is true and
	 * bulk-deletion is not performed yet.
	 */
	bool		for_cleanup;
	bool		first_time;

	/*
	 * Fields for both index vacuum and cleanup.
	 *
	 * reltuples is the total number of input heap tuples.  We set either old
	 * live tuples in the index vacuum case or the new live tuples in the
	 * index cleanup case.
	 *
	 * estimated_count is true if reltuples is an estimated value.  (Note that
	 * reltuples could be -1 in this case, indicating we have no idea.)
	 */
	double		reltuples;
	bool		estimated_count;

	/*
	 * In single process lazy vacuum we could consume more memory during index
	 * vacuuming or cleanup apart from the memory for heap scanning.  In
	 * parallel vacuum, since individual vacuum workers can consume memory
	 * equal to maintenance_work_mem, the new maintenance_work_mem for each
	 * worker is set such that the parallel operation doesn't consume more
	 * memory than single process lazy vacuum.
	 */
	int			maintenance_work_mem_worker;

	/*
	 * Shared vacuum cost balance.  During parallel vacuum,
	 * VacuumSharedCostBalance points to this value and it accumulates the
	 * balance of each parallel vacuum worker.
	 */
	pg_atomic_uint32 cost_balance;

	/*
	 * Number of active parallel workers.  This is used for computing the
	 * minimum threshold of the vacuum cost balance before a worker sleeps for
	 * cost-based vacuum delay.
	 */
	pg_atomic_uint32 active_nworkers;

	/*
	 * Variables to control parallel vacuum.  We have a bitmap to indicate
	 * which index has stats in shared memory.  The set bit in the map
	 * indicates that the particular index supports a parallel vacuum.
	 */
	pg_atomic_uint32 idx;		/* counter for vacuuming and clean up */
	uint32		offset;			/* sizeof header incl. bitmap */
	bits8		bitmap[FLEXIBLE_ARRAY_MEMBER];	/* bit map of NULLs */

	/* Shared index statistics data follows at end of struct */
} LVShared;

#define SizeOfLVShared (offsetof(LVShared, bitmap) + sizeof(bits8))
#define GetSharedIndStats(s) \
	((LVSharedIndStats *)((char *)(s) + ((LVShared *)(s))->offset))
#define IndStatsIsNull(s, i) \
	(!(((LVShared *)(s))->bitmap[(i) >> 3] & (1 << ((i) & 0x07))))
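/*
 * Added illustrative note (not in the original): only indexes whose bit is
 * set in the bitmap have an LVSharedIndStats entry in the trailing shared
 * area, so locating index i's entry means skipping over the entries of
 * lower-numbered indexes whose bits are also set; see
 * parallel_stats_for_idx() below for the real implementation.
 */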
/*
 * Struct for an index bulk-deletion statistic used for parallel vacuum.  This
 * is allocated in the DSM segment.
 */
typedef struct LVSharedIndStats
{
	bool		updated;		/* are the stats updated? */
	IndexBulkDeleteResult istat;
} LVSharedIndStats;
/* Struct for maintaining a parallel vacuum state. */
typedef struct LVParallelState
{
	ParallelContext *pcxt;

	/* Shared information among parallel vacuum workers */
	LVShared   *lvshared;

	/* Points to buffer usage area in DSM */
	BufferUsage *buffer_usage;

	/* Points to WAL usage area in DSM */
	WalUsage   *wal_usage;

	/*
	 * The number of indexes that support parallel index bulk-deletion and
	 * parallel index cleanup respectively.
	 */
	int			nindexes_parallel_bulkdel;
	int			nindexes_parallel_cleanup;
	int			nindexes_parallel_condcleanup;
} LVParallelState;
typedef struct LVRelState
{
	/* Target heap relation and its indexes */
	Relation	rel;
	Relation   *indrels;
	int			nindexes;
	/* useindex = true means two-pass strategy; false means one-pass */
	bool		useindex;

	/* Buffer access strategy and parallel state */
	BufferAccessStrategy bstrategy;
	LVParallelState *lps;

	/* Statistics from pg_class when we start out */
	BlockNumber old_rel_pages;	/* previous value of pg_class.relpages */
	double		old_live_tuples;	/* previous value of pg_class.reltuples */
	/* rel's initial relfrozenxid and relminmxid */
	TransactionId relfrozenxid;
	MultiXactId relminmxid;
	TransactionId latestRemovedXid;

	/* VACUUM operation's cutoff for pruning */
	TransactionId OldestXmin;
	/* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */
	TransactionId FreezeLimit;
	MultiXactId MultiXactCutoff;

	/* Error reporting state */
	char	   *relnamespace;
	char	   *relname;
	char	   *indname;
	BlockNumber blkno;			/* used only for heap operations */
	OffsetNumber offnum;		/* used only for heap operations */
	VacErrPhase phase;

	/*
	 * State managed by lazy_scan_heap() follows
	 */
	LVDeadTuples *dead_tuples;	/* items to vacuum from indexes */
	BlockNumber rel_pages;		/* total number of pages */
	BlockNumber scanned_pages;	/* number of pages we examined */
	BlockNumber pinskipped_pages;	/* # of pages skipped due to a pin */
	BlockNumber frozenskipped_pages;	/* # of frozen pages we skipped */
	BlockNumber tupcount_pages; /* pages whose tuples we counted */
	BlockNumber pages_removed;	/* pages removed by truncation */
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
	bool		lock_waiter_detected;

	/* Statistics output by us, for table */
	double		new_rel_tuples; /* new estimated total # of tuples */
	double		new_live_tuples;	/* new estimated total # of live tuples */
	/* Statistics output by index AMs */
	IndexBulkDeleteResult **indstats;

	/* Instrumentation counters */
	int			num_index_scans;
	int64		tuples_deleted; /* # deleted from table */
	int64		new_dead_tuples;	/* new estimated total # of dead items in
									 * table */
	int64		num_tuples;		/* total number of nonremovable tuples */
	int64		live_tuples;	/* live tuples (reltuples estimate) */
} LVRelState;
/* Struct for saving and restoring vacuum error information. */
typedef struct LVSavedErrInfo
{
	BlockNumber blkno;
	OffsetNumber offnum;
	VacErrPhase phase;
} LVSavedErrInfo;

/* elevel controls whole VACUUM's verbosity */
static int	elevel = -1;
/* non-export function prototypes */
static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params,
						   bool aggressive);
static void lazy_vacuum_all_indexes(LVRelState *vacrel);
static void lazy_vacuum_heap_rel(LVRelState *vacrel);
static int	lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
								  Buffer buffer, int tupindex, Buffer *vmbuffer);
static bool lazy_check_needs_freeze(Buffer buf, bool *hastup,
									LVRelState *vacrel);
static void do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel);
static void do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel);
static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers);
static void do_parallel_processing(LVRelState *vacrel,
								   LVShared *lvshared);
static void do_serial_processing_for_unsafe_indexes(LVRelState *vacrel,
													LVShared *lvshared);
static IndexBulkDeleteResult *parallel_process_one_index(Relation indrel,
														 IndexBulkDeleteResult *istat,
														 LVShared *lvshared,
														 LVSharedIndStats *shared_indstats,
														 LVRelState *vacrel);
static void lazy_cleanup_all_indexes(LVRelState *vacrel);
static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
													IndexBulkDeleteResult *istat,
													double reltuples,
													LVRelState *vacrel);
static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel,
													 IndexBulkDeleteResult *istat,
													 double reltuples,
													 bool estimated_count,
													 LVRelState *vacrel);
static bool should_attempt_truncation(LVRelState *vacrel,
									  VacuumParams *params);
static void lazy_truncate_heap(LVRelState *vacrel);
static BlockNumber count_nondeletable_pages(LVRelState *vacrel);
static long compute_max_dead_tuples(BlockNumber relblocks, bool hasindex);
static void lazy_space_alloc(LVRelState *vacrel, int nworkers,
							 BlockNumber relblocks);
static void lazy_space_free(LVRelState *vacrel);
static void lazy_record_dead_tuple(LVDeadTuples *dead_tuples,
								   ItemPointer itemptr);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static int	vac_cmp_itemptr(const void *left, const void *right);
static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
									 TransactionId *visibility_cutoff_xid,
									 bool *all_frozen);
static int	compute_parallel_vacuum_workers(LVRelState *vacrel,
											int nrequested,
											bool *can_parallel_vacuum);
static void update_index_statistics(LVRelState *vacrel);
static LVParallelState *begin_parallel_vacuum(LVRelState *vacrel,
											  BlockNumber nblocks,
											  int nrequested);
static void end_parallel_vacuum(LVRelState *vacrel);
static LVSharedIndStats *parallel_stats_for_idx(LVShared *lvshared, int getidx);
static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared);
static void vacuum_error_callback(void *arg);
static void update_vacuum_error_info(LVRelState *vacrel,
									 LVSavedErrInfo *saved_vacrel,
									 int phase, BlockNumber blkno,
									 OffsetNumber offnum);
static void restore_vacuum_error_info(LVRelState *vacrel,
									  const LVSavedErrInfo *saved_vacrel);
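/*
 * Added illustrative overview (not part of the original file): the main call
 * flow among the functions declared above is, roughly,
 *
 *		heap_vacuum_rel()
 *			-> lazy_scan_heap()
 *				-> lazy_vacuum_all_indexes() + lazy_vacuum_heap_rel()
 *				   (repeated whenever the dead tuple array fills up)
 *				-> lazy_cleanup_all_indexes()
 *			-> lazy_truncate_heap()  (only if should_attempt_truncation())
 *
 * with the do_parallel_* variants substituting for the index passes when a
 * parallel vacuum is active.
 */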
/*
 *	heap_vacuum_rel() -- perform VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
heap_vacuum_rel(Relation rel, VacuumParams *params,
				BufferAccessStrategy bstrategy)
{
	LVRelState *vacrel;
	PGRUsage	ru0;
	TimestampTz starttime = 0;
	WalUsage	walusage_start = pgWalUsage;
	WalUsage	walusage = {0, 0, 0};
	long		secs;
	int			usecs;
	double		read_rate,
				write_rate;
	bool		aggressive;		/* should we scan all unfrozen pages? */
	bool		scanned_all_unfrozen;	/* actually scanned all such pages? */
	char	  **indnames = NULL;
	TransactionId xidFullScanLimit;
	MultiXactId mxactFullScanLimit;
	BlockNumber new_rel_pages;
	BlockNumber new_rel_allvisible;
	double		new_live_tuples;
	TransactionId new_frozen_xid;
	MultiXactId new_min_multi;
	ErrorContextCallback errcallback;
	PgStat_Counter startreadtime = 0;
	PgStat_Counter startwritetime = 0;
	TransactionId OldestXmin;
	TransactionId FreezeLimit;
	MultiXactId MultiXactCutoff;

	Assert(params != NULL);
	Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT);
	Assert(params->truncate != VACOPT_TERNARY_DEFAULT);
475 /* measure elapsed time iff autovacuum logging requires it */
476 if (IsAutoVacuumWorkerProcess() && params
->log_min_duration
>= 0)
478 pg_rusage_init(&ru0
);
479 starttime
= GetCurrentTimestamp();
482 startreadtime
= pgStatBlockReadTime
;
483 startwritetime
= pgStatBlockWriteTime
;
487 if (params
->options
& VACOPT_VERBOSE
)
492 pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM
,
493 RelationGetRelid(rel
));
495 vacuum_set_xid_limits(rel
,
496 params
->freeze_min_age
,
497 params
->freeze_table_age
,
498 params
->multixact_freeze_min_age
,
499 params
->multixact_freeze_table_age
,
500 &OldestXmin
, &FreezeLimit
, &xidFullScanLimit
,
501 &MultiXactCutoff
, &mxactFullScanLimit
);
	/*
	 * We request an aggressive scan if the table's frozen Xid is now older
	 * than or equal to the requested Xid full-table scan limit; or if the
	 * table's minimum MultiXactId is older than or equal to the requested
	 * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
	 */
509 aggressive
= TransactionIdPrecedesOrEquals(rel
->rd_rel
->relfrozenxid
,
511 aggressive
|= MultiXactIdPrecedesOrEquals(rel
->rd_rel
->relminmxid
,
513 if (params
->options
& VACOPT_DISABLE_PAGE_SKIPPING
)
516 vacrel
= (LVRelState
*) palloc0(sizeof(LVRelState
));
518 /* Set up high level stuff about rel */
520 vac_open_indexes(vacrel
->rel
, RowExclusiveLock
, &vacrel
->nindexes
,
522 vacrel
->useindex
= (vacrel
->nindexes
> 0 &&
523 params
->index_cleanup
== VACOPT_TERNARY_ENABLED
);
524 vacrel
->bstrategy
= bstrategy
;
525 vacrel
->old_rel_pages
= rel
->rd_rel
->relpages
;
526 vacrel
->old_live_tuples
= rel
->rd_rel
->reltuples
;
527 vacrel
->relfrozenxid
= rel
->rd_rel
->relfrozenxid
;
528 vacrel
->relminmxid
= rel
->rd_rel
->relminmxid
;
529 vacrel
->latestRemovedXid
= InvalidTransactionId
;
531 /* Set cutoffs for entire VACUUM */
532 vacrel
->OldestXmin
= OldestXmin
;
533 vacrel
->FreezeLimit
= FreezeLimit
;
534 vacrel
->MultiXactCutoff
= MultiXactCutoff
;
536 vacrel
->relnamespace
= get_namespace_name(RelationGetNamespace(rel
));
537 vacrel
->relname
= pstrdup(RelationGetRelationName(rel
));
538 vacrel
->indname
= NULL
;
539 vacrel
->phase
= VACUUM_ERRCB_PHASE_UNKNOWN
;
541 /* Save index names iff autovacuum logging requires it */
542 if (IsAutoVacuumWorkerProcess() && params
->log_min_duration
>= 0 &&
543 vacrel
->nindexes
> 0)
545 indnames
= palloc(sizeof(char *) * vacrel
->nindexes
);
546 for (int i
= 0; i
< vacrel
->nindexes
; i
++)
548 pstrdup(RelationGetRelationName(vacrel
->indrels
[i
]));
	/*
	 * Setup error traceback support for ereport().  The idea is to set up an
	 * error context callback to display additional information on any error
	 * during a vacuum.  During different phases of vacuum (heap scan, heap
	 * vacuum, index vacuum, index clean up, heap truncate), we update the
	 * error context callback to display appropriate information.
	 *
	 * Note that the index vacuum and heap vacuum phases may be called
	 * multiple times in the middle of the heap scan phase.  So the old phase
	 * information is restored at the end of those phases.
	 */
562 errcallback
.callback
= vacuum_error_callback
;
563 errcallback
.arg
= vacrel
;
564 errcallback
.previous
= error_context_stack
;
565 error_context_stack
= &errcallback
;
567 /* Do the vacuuming */
568 lazy_scan_heap(vacrel
, params
, aggressive
);
570 /* Done with indexes */
571 vac_close_indexes(vacrel
->nindexes
, vacrel
->indrels
, NoLock
);
	/*
	 * Compute whether we actually scanned all the unfrozen pages.  If we did,
	 * we can adjust relfrozenxid and relminmxid.
	 *
	 * NB: We need to check this before truncating the relation, because that
	 * will change ->rel_pages.
	 */
580 if ((vacrel
->scanned_pages
+ vacrel
->frozenskipped_pages
)
584 scanned_all_unfrozen
= false;
587 scanned_all_unfrozen
= true;
590 * Optionally truncate the relation.
592 if (should_attempt_truncation(vacrel
, params
))
		/*
		 * Update error traceback information.  This is the last phase during
		 * which we add context information to errors, so we don't need to
		 * revert to the previous phase.
		 */
599 update_vacuum_error_info(vacrel
, NULL
, VACUUM_ERRCB_PHASE_TRUNCATE
,
600 vacrel
->nonempty_pages
,
601 InvalidOffsetNumber
);
602 lazy_truncate_heap(vacrel
);
605 /* Pop the error context stack */
606 error_context_stack
= errcallback
.previous
;
608 /* Report that we are now doing final cleanup */
609 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE
,
610 PROGRESS_VACUUM_PHASE_FINAL_CLEANUP
);
	/*
	 * Update statistics in pg_class.
	 *
	 * In principle new_live_tuples could be -1 indicating that we (still)
	 * don't know the tuple count.  In practice that probably can't happen,
	 * since we'd surely have scanned some pages if the table is new and
	 * nonempty.
	 *
	 * For safety, clamp relallvisible to be not more than what we're setting
	 * relpages to.
	 *
	 * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
	 * since then we don't know for certain that all tuples have a newer xmin.
	 */
626 new_rel_pages
= vacrel
->rel_pages
;
627 new_live_tuples
= vacrel
->new_live_tuples
;
629 visibilitymap_count(rel
, &new_rel_allvisible
, NULL
);
630 if (new_rel_allvisible
> new_rel_pages
)
631 new_rel_allvisible
= new_rel_pages
;
633 new_frozen_xid
= scanned_all_unfrozen
? FreezeLimit
: InvalidTransactionId
;
634 new_min_multi
= scanned_all_unfrozen
? MultiXactCutoff
: InvalidMultiXactId
;
636 vac_update_relstats(rel
,
640 vacrel
->nindexes
> 0,
645 /* report results to the stats collector, too */
646 pgstat_report_vacuum(RelationGetRelid(rel
),
647 rel
->rd_rel
->relisshared
,
648 Max(new_live_tuples
, 0),
649 vacrel
->new_dead_tuples
);
650 pgstat_progress_end_command();
652 /* and log the action if appropriate */
653 if (IsAutoVacuumWorkerProcess() && params
->log_min_duration
>= 0)
655 TimestampTz endtime
= GetCurrentTimestamp();
657 if (params
->log_min_duration
== 0 ||
658 TimestampDifferenceExceeds(starttime
, endtime
,
659 params
->log_min_duration
))
664 TimestampDifference(starttime
, endtime
, &secs
, &usecs
);
666 memset(&walusage
, 0, sizeof(WalUsage
));
667 WalUsageAccumDiff(&walusage
, &pgWalUsage
, &walusage_start
);
671 if ((secs
> 0) || (usecs
> 0))
673 read_rate
= (double) BLCKSZ
* VacuumPageMiss
/ (1024 * 1024) /
674 (secs
+ usecs
/ 1000000.0);
675 write_rate
= (double) BLCKSZ
* VacuumPageDirty
/ (1024 * 1024) /
676 (secs
+ usecs
/ 1000000.0);
680 * This is pretty messy, but we split it up so that we can skip
681 * emitting individual parts of the message when not applicable.
683 initStringInfo(&buf
);
684 if (params
->is_wraparound
)
687 * While it's possible for a VACUUM to be both is_wraparound
688 * and !aggressive, that's just a corner-case -- is_wraparound
689 * implies aggressive. Produce distinct output for the corner
690 * case all the same, just in case.
693 msgfmt
= _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
695 msgfmt
= _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
700 msgfmt
= _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
702 msgfmt
= _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
704 appendStringInfo(&buf
, msgfmt
,
705 get_database_name(MyDatabaseId
),
706 vacrel
->relnamespace
,
708 vacrel
->num_index_scans
);
709 appendStringInfo(&buf
, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
710 vacrel
->pages_removed
,
712 vacrel
->pinskipped_pages
,
713 vacrel
->frozenskipped_pages
);
714 appendStringInfo(&buf
,
715 _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"),
716 (long long) vacrel
->tuples_deleted
,
717 (long long) vacrel
->new_rel_tuples
,
718 (long long) vacrel
->new_dead_tuples
,
720 appendStringInfo(&buf
,
721 _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
722 (long long) VacuumPageHit
,
723 (long long) VacuumPageMiss
,
724 (long long) VacuumPageDirty
);
725 for (int i
= 0; i
< vacrel
->nindexes
; i
++)
727 IndexBulkDeleteResult
*istat
= vacrel
->indstats
[i
];
732 appendStringInfo(&buf
,
733 _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
736 istat
->pages_newly_deleted
,
737 istat
->pages_deleted
,
740 appendStringInfo(&buf
, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
741 read_rate
, write_rate
);
744 appendStringInfoString(&buf
, _("I/O Timings:"));
745 if (pgStatBlockReadTime
- startreadtime
> 0)
746 appendStringInfo(&buf
, _(" read=%.3f"),
747 (double) (pgStatBlockReadTime
- startreadtime
) / 1000);
748 if (pgStatBlockWriteTime
- startwritetime
> 0)
749 appendStringInfo(&buf
, _(" write=%.3f"),
750 (double) (pgStatBlockWriteTime
- startwritetime
) / 1000);
751 appendStringInfoChar(&buf
, '\n');
753 appendStringInfo(&buf
, _("system usage: %s\n"), pg_rusage_show(&ru0
));
754 appendStringInfo(&buf
,
755 _("WAL usage: %ld records, %ld full page images, %llu bytes"),
756 walusage
.wal_records
,
758 (unsigned long long) walusage
.wal_bytes
);
761 (errmsg_internal("%s", buf
.data
)));
766 /* Cleanup index statistics and index names */
767 for (int i
= 0; i
< vacrel
->nindexes
; i
++)
769 if (vacrel
->indstats
[i
])
770 pfree(vacrel
->indstats
[i
]);
772 if (indnames
&& indnames
[i
])
/*
 * For Hot Standby we need to know the highest transaction id that will
 * be removed by any change.  VACUUM proceeds in a number of passes so
 * we need to consider how each pass operates.  The first phase runs
 * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
 * progresses - these will have a latestRemovedXid on each record.
 * In some cases this removes all of the tuples to be removed, though
 * often we have dead tuples with index pointers so we must remember them
 * for removal in phase 3.  Index records for those rows are removed
 * in phase 2 and index blocks do not have MVCC information attached.
 * So before we can allow removal of any index tuples we need to issue
 * a WAL record containing the latestRemovedXid of rows that will be
 * removed in phase three.  This allows recovery queries to block at the
 * correct place, i.e. before phase two, rather than during phase three
 * which would be after the rows have become inaccessible.
 */
static void
vacuum_log_cleanup_info(LVRelState *vacrel)
{
	/*
	 * Skip this for relations for which no WAL is to be written, or if we're
	 * not trying to support archive recovery.
	 */
	if (!RelationNeedsWAL(vacrel->rel) || !XLogIsNeeded())
		return;

	/*
	 * No need to write the record at all unless it contains a valid value
	 */
	if (TransactionIdIsValid(vacrel->latestRemovedXid))
		(void) log_heap_cleanup_info(vacrel->rel->rd_node,
									 vacrel->latestRemovedXid);
}
/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine prunes each page in the heap, which will among other
 *		things truncate dead tuples to dead line pointers, defragment the
 *		page, and set commit status bits (see heap_page_prune).  It also builds
 *		lists of dead tuples and pages with free space, calculates statistics
 *		on the number of live tuples in the heap, and marks pages as
 *		all-visible if appropriate.  When done, or when we run low on space
 *		for dead-tuple TIDs, invoke vacuuming of indexes and reclaim dead line
 *		pointers.
 *
 *		If the table has at least two indexes, we execute both index vacuum
 *		and index cleanup with parallel workers unless parallel vacuum is
 *		disabled.  In a parallel vacuum, we enter parallel mode and then
 *		create both the parallel context and the DSM segment before starting
 *		heap scan so that we can record dead tuples to the DSM segment.  All
 *		parallel workers are launched at beginning of index vacuuming and
 *		index cleanup and they exit once done with all indexes.  At the end of
 *		this function we exit from parallel mode.  Index bulk-deletion results
 *		are stored in the DSM segment and we update index statistics for all
 *		the indexes after exiting from parallel mode since writes are not
 *		allowed during parallel mode.
 *
 *		If there are no indexes then we can reclaim line pointers on the fly;
 *		dead line pointers need only be retained until all index pointers that
 *		reference them have been killed.
 */
static void
lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
{
842 LVDeadTuples
*dead_tuples
;
846 BlockNumber empty_pages
,
848 next_fsm_block_to_vacuum
;
849 double num_tuples
, /* total number of nonremovable tuples */
850 live_tuples
, /* live tuples (reltuples estimate) */
851 tups_vacuumed
, /* tuples cleaned up by current vacuum */
852 nkeep
, /* dead-but-not-removable tuples */
853 nunused
; /* # existing unused line pointers */
856 Buffer vmbuffer
= InvalidBuffer
;
857 BlockNumber next_unskippable_block
;
858 bool skipping_blocks
;
859 xl_heap_freeze_tuple
*frozen
;
861 const int initprog_index
[] = {
862 PROGRESS_VACUUM_PHASE
,
863 PROGRESS_VACUUM_TOTAL_HEAP_BLKS
,
864 PROGRESS_VACUUM_MAX_DEAD_TUPLES
866 int64 initprog_val
[3];
867 GlobalVisState
*vistest
;
869 pg_rusage_init(&ru0
);
873 (errmsg("aggressively vacuuming \"%s.%s\"",
874 vacrel
->relnamespace
,
878 (errmsg("vacuuming \"%s.%s\"",
879 vacrel
->relnamespace
,
882 empty_pages
= vacuumed_pages
= 0;
883 next_fsm_block_to_vacuum
= (BlockNumber
) 0;
884 num_tuples
= live_tuples
= tups_vacuumed
= nkeep
= nunused
= 0;
886 nblocks
= RelationGetNumberOfBlocks(vacrel
->rel
);
887 vacrel
->rel_pages
= nblocks
;
888 vacrel
->scanned_pages
= 0;
889 vacrel
->pinskipped_pages
= 0;
890 vacrel
->frozenskipped_pages
= 0;
891 vacrel
->tupcount_pages
= 0;
892 vacrel
->pages_removed
= 0;
893 vacrel
->nonempty_pages
= 0;
894 vacrel
->lock_waiter_detected
= false;
896 /* Initialize instrumentation counters */
897 vacrel
->num_index_scans
= 0;
898 vacrel
->tuples_deleted
= 0;
899 vacrel
->new_dead_tuples
= 0;
900 vacrel
->num_tuples
= 0;
901 vacrel
->live_tuples
= 0;
903 vistest
= GlobalVisTestFor(vacrel
->rel
);
905 vacrel
->indstats
= (IndexBulkDeleteResult
**)
906 palloc0(vacrel
->nindexes
* sizeof(IndexBulkDeleteResult
*));
909 * Allocate the space for dead tuples. Note that this handles parallel
910 * VACUUM initialization as part of allocating shared memory space used
913 lazy_space_alloc(vacrel
, params
->nworkers
, nblocks
);
914 dead_tuples
= vacrel
->dead_tuples
;
915 frozen
= palloc(sizeof(xl_heap_freeze_tuple
) * MaxHeapTuplesPerPage
);
917 /* Report that we're scanning the heap, advertising total # of blocks */
918 initprog_val
[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP
;
919 initprog_val
[1] = nblocks
;
920 initprog_val
[2] = dead_tuples
->max_tuples
;
921 pgstat_progress_update_multi_param(3, initprog_index
, initprog_val
);
	/*
	 * Except when aggressive is set, we want to skip pages that are
	 * all-visible according to the visibility map, but only when we can skip
	 * at least SKIP_PAGES_THRESHOLD consecutive pages.  Since we're reading
	 * sequentially, the OS should be doing readahead for us, so there's no
	 * gain in skipping a page now and then; that's likely to disable
	 * readahead and so be counterproductive.  Also, skipping even a single
	 * page means that we can't update relfrozenxid, so we only want to do it
	 * if we can skip a goodly number of pages.
	 *
	 * When aggressive is set, we can't skip pages just because they are
	 * all-visible, but we can still skip pages that are all-frozen, since
	 * such pages do not need freezing and do not affect the value that we can
	 * safely set for relfrozenxid or relminmxid.
	 *
	 * Before entering the main loop, establish the invariant that
	 * next_unskippable_block is the next block number >= blkno that we can't
	 * skip based on the visibility map, either all-visible for a regular scan
	 * or all-frozen for an aggressive scan.  We set it to nblocks if there's
	 * no such block.  We also set up the skipping_blocks flag correctly at
	 * this stage.
	 *
	 * Note: The value returned by visibilitymap_get_status could be slightly
	 * out-of-date, since we make this test before reading the corresponding
	 * heap page or locking the buffer.  This is OK.  If we mistakenly think
	 * that the page is all-visible or all-frozen when in fact the flag's just
	 * been cleared, we might fail to vacuum the page.  It's easy to see that
	 * skipping a page when aggressive is not set is not a very big deal; we
	 * might leave some dead tuples lying around, but the next vacuum will
	 * find them.  But even when aggressive *is* set, it's still OK if we miss
	 * a page whose all-frozen marking has just been cleared.  Any new XIDs
	 * just added to that page are necessarily newer than the GlobalXmin we
	 * computed, so they'll have no effect on the value to which we can safely
	 * set relfrozenxid.  A similar argument applies for MXIDs and relminmxid.
	 *
	 * We will scan the table's last page, at least to the extent of
	 * determining whether it has tuples or not, even if it should be skipped
	 * according to the above rules; except when we've already determined that
	 * it's not worth trying to truncate the table.  This avoids having
	 * lazy_truncate_heap() take access-exclusive lock on the table to attempt
	 * a truncation that just fails immediately because there are tuples in
	 * the last page.  This is worth avoiding mainly because such a lock must
	 * be replayed on any hot standby, where it can be disruptive.
	 */
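	/*
	 * Added illustrative example (not in the original): with
	 * SKIP_PAGES_THRESHOLD of 32, a run of, say, 100 consecutive all-visible
	 * blocks gets skipped in a non-aggressive scan, while a short run of 20
	 * such blocks is read anyway, per the readahead and relfrozenxid
	 * considerations described above.
	 */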
967 next_unskippable_block
= 0;
968 if ((params
->options
& VACOPT_DISABLE_PAGE_SKIPPING
) == 0)
970 while (next_unskippable_block
< nblocks
)
974 vmstatus
= visibilitymap_get_status(vacrel
->rel
,
975 next_unskippable_block
,
979 if ((vmstatus
& VISIBILITYMAP_ALL_FROZEN
) == 0)
984 if ((vmstatus
& VISIBILITYMAP_ALL_VISIBLE
) == 0)
987 vacuum_delay_point();
988 next_unskippable_block
++;
992 if (next_unskippable_block
>= SKIP_PAGES_THRESHOLD
)
993 skipping_blocks
= true;
995 skipping_blocks
= false;
997 for (blkno
= 0; blkno
< nblocks
; blkno
++)
1001 OffsetNumber offnum
,
1005 int prev_dead_count
;
1008 bool all_visible_according_to_vm
= false;
1010 bool all_frozen
= true; /* provided all_visible is also true */
1011 bool has_dead_items
; /* includes existing LP_DEAD items */
1012 TransactionId visibility_cutoff_xid
= InvalidTransactionId
;
1014 /* see note above about forcing scanning of last page */
1015 #define FORCE_CHECK_PAGE() \
1016 (blkno == nblocks - 1 && should_attempt_truncation(vacrel, params))
1018 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED
, blkno
);
1020 update_vacuum_error_info(vacrel
, NULL
, VACUUM_ERRCB_PHASE_SCAN_HEAP
,
1021 blkno
, InvalidOffsetNumber
);
1023 if (blkno
== next_unskippable_block
)
1025 /* Time to advance next_unskippable_block */
1026 next_unskippable_block
++;
1027 if ((params
->options
& VACOPT_DISABLE_PAGE_SKIPPING
) == 0)
1029 while (next_unskippable_block
< nblocks
)
1033 vmskipflags
= visibilitymap_get_status(vacrel
->rel
,
1034 next_unskippable_block
,
1038 if ((vmskipflags
& VISIBILITYMAP_ALL_FROZEN
) == 0)
1043 if ((vmskipflags
& VISIBILITYMAP_ALL_VISIBLE
) == 0)
1046 vacuum_delay_point();
1047 next_unskippable_block
++;
1052 * We know we can't skip the current block. But set up
1053 * skipping_blocks to do the right thing at the following blocks.
1055 if (next_unskippable_block
- blkno
> SKIP_PAGES_THRESHOLD
)
1056 skipping_blocks
= true;
1058 skipping_blocks
= false;
1061 * Normally, the fact that we can't skip this block must mean that
1062 * it's not all-visible. But in an aggressive vacuum we know only
1063 * that it's not all-frozen, so it might still be all-visible.
1065 if (aggressive
&& VM_ALL_VISIBLE(vacrel
->rel
, blkno
, &vmbuffer
))
1066 all_visible_according_to_vm
= true;
1071 * The current block is potentially skippable; if we've seen a
1072 * long enough run of skippable blocks to justify skipping it, and
1073 * we're not forced to check it, then go ahead and skip.
1074 * Otherwise, the page must be at least all-visible if not
1075 * all-frozen, so we can set all_visible_according_to_vm = true.
1077 if (skipping_blocks
&& !FORCE_CHECK_PAGE())
1080 * Tricky, tricky. If this is in aggressive vacuum, the page
1081 * must have been all-frozen at the time we checked whether it
1082 * was skippable, but it might not be any more. We must be
1083 * careful to count it as a skipped all-frozen page in that
1084 * case, or else we'll think we can't update relfrozenxid and
1085 * relminmxid. If it's not an aggressive vacuum, we don't
1086 * know whether it was all-frozen, so we have to recheck; but
1087 * in this case an approximate answer is OK.
1089 if (aggressive
|| VM_ALL_FROZEN(vacrel
->rel
, blkno
, &vmbuffer
))
1090 vacrel
->frozenskipped_pages
++;
1093 all_visible_according_to_vm
= true;
1096 vacuum_delay_point();
1099 * If we are close to overrunning the available space for dead-tuple
1100 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
1102 if ((dead_tuples
->max_tuples
- dead_tuples
->num_tuples
) < MaxHeapTuplesPerPage
&&
1103 dead_tuples
->num_tuples
> 0)
1106 * Before beginning index vacuuming, we release any pin we may
1107 * hold on the visibility map page. This isn't necessary for
1108 * correctness, but we do it anyway to avoid holding the pin
1109 * across a lengthy, unrelated operation.
1111 if (BufferIsValid(vmbuffer
))
1113 ReleaseBuffer(vmbuffer
);
1114 vmbuffer
= InvalidBuffer
;
1117 /* Work on all the indexes, then the heap */
1118 lazy_vacuum_all_indexes(vacrel
);
1120 /* Remove tuples from heap */
1121 lazy_vacuum_heap_rel(vacrel
);
1124 * Forget the now-vacuumed tuples, and press on, but be careful
1125 * not to reset latestRemovedXid since we want that value to be
1128 dead_tuples
->num_tuples
= 0;
1131 * Vacuum the Free Space Map to make newly-freed space visible on
1132 * upper-level FSM pages. Note we have not yet processed blkno.
1134 FreeSpaceMapVacuumRange(vacrel
->rel
, next_fsm_block_to_vacuum
,
1136 next_fsm_block_to_vacuum
= blkno
;
1138 /* Report that we are once again scanning the heap */
1139 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE
,
1140 PROGRESS_VACUUM_PHASE_SCAN_HEAP
);
1144 * Pin the visibility map page in case we need to mark the page
1145 * all-visible. In most cases this will be very cheap, because we'll
1146 * already have the correct page pinned anyway. However, it's
1147 * possible that (a) next_unskippable_block is covered by a different
1148 * VM page than the current block or (b) we released our pin and did a
1149 * cycle of index vacuuming.
1151 visibilitymap_pin(vacrel
->rel
, blkno
, &vmbuffer
);
1153 buf
= ReadBufferExtended(vacrel
->rel
, MAIN_FORKNUM
, blkno
,
1154 RBM_NORMAL
, vacrel
->bstrategy
);
1156 /* We need buffer cleanup lock so that we can prune HOT chains. */
1157 if (!ConditionalLockBufferForCleanup(buf
))
1160 * If we're not performing an aggressive scan to guard against XID
1161 * wraparound, and we don't want to forcibly check the page, then
1162 * it's OK to skip vacuuming pages we get a lock conflict on. They
1163 * will be dealt with in some future vacuum.
1165 if (!aggressive
&& !FORCE_CHECK_PAGE())
1168 vacrel
->pinskipped_pages
++;
1173 * Read the page with share lock to see if any xids on it need to
1174 * be frozen. If not we just skip the page, after updating our
1175 * scan statistics. If there are some, we wait for cleanup lock.
1177 * We could defer the lock request further by remembering the page
1178 * and coming back to it later, or we could even register
1179 * ourselves for multiple buffers and then service whichever one
1180 * is received first. For now, this seems good enough.
1182 * If we get here with aggressive false, then we're just forcibly
1183 * checking the page, and so we don't want to insist on getting
1184 * the lock; we only need to know if the page contains tuples, so
1185 * that we can update nonempty_pages correctly. It's convenient
1186 * to use lazy_check_needs_freeze() for both situations, though.
1188 LockBuffer(buf
, BUFFER_LOCK_SHARE
);
1189 if (!lazy_check_needs_freeze(buf
, &hastup
, vacrel
))
1191 UnlockReleaseBuffer(buf
);
1192 vacrel
->scanned_pages
++;
1193 vacrel
->pinskipped_pages
++;
1195 vacrel
->nonempty_pages
= blkno
+ 1;
1201 * Here, we must not advance scanned_pages; that would amount
1202 * to claiming that the page contains no freezable tuples.
1204 UnlockReleaseBuffer(buf
);
1205 vacrel
->pinskipped_pages
++;
1207 vacrel
->nonempty_pages
= blkno
+ 1;
1210 LockBuffer(buf
, BUFFER_LOCK_UNLOCK
);
1211 LockBufferForCleanup(buf
);
1212 /* drop through to normal processing */
1215 vacrel
->scanned_pages
++;
1216 vacrel
->tupcount_pages
++;
1218 page
= BufferGetPage(buf
);
1220 if (PageIsNew(page
))
1223 * All-zeroes pages can be left over if either a backend extends
1224 * the relation by a single page, but crashes before the newly
1225 * initialized page has been written out, or when bulk-extending
1226 * the relation (which creates a number of empty pages at the tail
1227 * end of the relation, but enters them into the FSM).
1229 * Note we do not enter the page into the visibilitymap. That has
1230 * the downside that we repeatedly visit this page in subsequent
1231 * vacuums, but otherwise we'll never not discover the space on a
1232 * promoted standby. The harm of repeated checking ought to
1233 * normally not be too bad - the space usually should be used at
1234 * some point, otherwise there wouldn't be any regular vacuums.
1236 * Make sure these pages are in the FSM, to ensure they can be
1237 * reused. Do that by testing if there's any space recorded for
1238 * the page. If not, enter it. We do so after releasing the lock
1239 * on the heap page, the FSM is approximate, after all.
1241 UnlockReleaseBuffer(buf
);
1245 if (GetRecordedFreeSpace(vacrel
->rel
, blkno
) == 0)
1249 freespace
= BufferGetPageSize(buf
) - SizeOfPageHeaderData
;
1250 RecordPageWithFreeSpace(vacrel
->rel
, blkno
, freespace
);
1255 if (PageIsEmpty(page
))
1258 freespace
= PageGetHeapFreeSpace(page
);
1261 * Empty pages are always all-visible and all-frozen (note that
1262 * the same is currently not true for new pages, see above).
1264 if (!PageIsAllVisible(page
))
1266 START_CRIT_SECTION();
1268 /* mark buffer dirty before writing a WAL record */
1269 MarkBufferDirty(buf
);
1272 * It's possible that another backend has extended the heap,
1273 * initialized the page, and then failed to WAL-log the page
1274 * due to an ERROR. Since heap extension is not WAL-logged,
1275 * recovery might try to replay our record setting the page
1276 * all-visible and find that the page isn't initialized, which
1277 * will cause a PANIC. To prevent that, check whether the
1278 * page has been previously WAL-logged, and if not, do that
1281 if (RelationNeedsWAL(vacrel
->rel
) &&
1282 PageGetLSN(page
) == InvalidXLogRecPtr
)
1283 log_newpage_buffer(buf
, true);
1285 PageSetAllVisible(page
);
1286 visibilitymap_set(vacrel
->rel
, blkno
, buf
, InvalidXLogRecPtr
,
1287 vmbuffer
, InvalidTransactionId
,
1288 VISIBILITYMAP_ALL_VISIBLE
| VISIBILITYMAP_ALL_FROZEN
);
1292 UnlockReleaseBuffer(buf
);
1293 RecordPageWithFreeSpace(vacrel
->rel
, blkno
, freespace
);
1298 * Prune all HOT-update chains in this page.
1300 * We count tuples removed by the pruning step as removed by VACUUM
1301 * (existing LP_DEAD line pointers don't count).
1303 tups_vacuumed
+= heap_page_prune(vacrel
->rel
, buf
, vistest
,
1304 InvalidTransactionId
, 0, false,
1305 &vacrel
->latestRemovedXid
,
1309 * Now scan the page to collect vacuumable items and check for tuples
1310 * requiring freezing.
1313 has_dead_items
= false;
1316 prev_dead_count
= dead_tuples
->num_tuples
;
1317 maxoff
= PageGetMaxOffsetNumber(page
);
1320 * Note: If you change anything in the loop below, also look at
1321 * heap_page_is_all_visible to see if that needs to be changed.
1323 for (offnum
= FirstOffsetNumber
;
1325 offnum
= OffsetNumberNext(offnum
))
1330 * Set the offset number so that we can display it along with any
1331 * error that occurred while processing this tuple.
1333 vacrel
->offnum
= offnum
;
1334 itemid
= PageGetItemId(page
, offnum
);
1336 /* Unused items require no processing, but we count 'em */
1337 if (!ItemIdIsUsed(itemid
))
1343 /* Redirect items mustn't be touched */
1344 if (ItemIdIsRedirected(itemid
))
1346 hastup
= true; /* this page won't be truncatable */
1350 ItemPointerSet(&(tuple
.t_self
), blkno
, offnum
);
1353 * LP_DEAD line pointers are to be vacuumed normally; but we don't
1354 * count them in tups_vacuumed, else we'd be double-counting (at
1355 * least in the common case where heap_page_prune() just freed up
1356 * a non-HOT tuple). Note also that the final tups_vacuumed value
1357 * might be very low for tables where opportunistic page pruning
1358 * happens to occur very frequently (via heap_page_prune_opt()
1359 * calls that free up non-HOT tuples).
1361 if (ItemIdIsDead(itemid
))
1363 lazy_record_dead_tuple(dead_tuples
, &(tuple
.t_self
));
1364 all_visible
= false;
1365 has_dead_items
= true;
1369 Assert(ItemIdIsNormal(itemid
));
1371 tuple
.t_data
= (HeapTupleHeader
) PageGetItem(page
, itemid
);
1372 tuple
.t_len
= ItemIdGetLength(itemid
);
1373 tuple
.t_tableOid
= RelationGetRelid(vacrel
->rel
);
1378 * The criteria for counting a tuple as live in this block need to
1379 * match what analyze.c's acquire_sample_rows() does, otherwise
1380 * VACUUM and ANALYZE may produce wildly different reltuples
1381 * values, e.g. when there are many recently-dead tuples.
1383 * The logic here is a bit simpler than acquire_sample_rows(), as
1384 * VACUUM can't run inside a transaction block, which makes some
1385 * cases impossible (e.g. in-progress insert from the same
1388 switch (HeapTupleSatisfiesVacuum(&tuple
, vacrel
->OldestXmin
, buf
))
1390 case HEAPTUPLE_DEAD
:
1393 * Ordinarily, DEAD tuples would have been removed by
1394 * heap_page_prune(), but it's possible that the tuple
1395 * state changed since heap_page_prune() looked. In
1396 * particular an INSERT_IN_PROGRESS tuple could have
1397 * changed to DEAD if the inserter aborted. So this
1398 * cannot be considered an error condition.
1400 * If the tuple is HOT-updated then it must only be
1401 * removed by a prune operation; so we keep it just as if
1402 * it were RECENTLY_DEAD. Also, if it's a heap-only
1403 * tuple, we choose to keep it, because it'll be a lot
1404 * cheaper to get rid of it in the next pruning pass than
1405 * to treat it like an indexed tuple. Finally, if index
1406 * cleanup is disabled, the second heap pass will not
1407 * execute, and the tuple will not get removed, so we must
1408 * treat it like any other dead tuple that we choose to
1411 * If this were to happen for a tuple that actually needed
1412 * to be deleted, we'd be in trouble, because it'd
1413 * possibly leave a tuple below the relation's xmin
1414 * horizon alive. heap_prepare_freeze_tuple() is prepared
1415 * to detect that case and abort the transaction,
1416 * preventing corruption.
1418 if (HeapTupleIsHotUpdated(&tuple
) ||
1419 HeapTupleIsHeapOnly(&tuple
) ||
1420 params
->index_cleanup
== VACOPT_TERNARY_DISABLED
)
1423 tupgone
= true; /* we can delete the tuple */
1424 all_visible
= false;
1426 case HEAPTUPLE_LIVE
:
1429 * Count it as live. Not only is this natural, but it's
1430 * also what acquire_sample_rows() does.
1435 * Is the tuple definitely visible to all transactions?
1437 * NB: Like with per-tuple hint bits, we can't set the
1438 * PD_ALL_VISIBLE flag if the inserter committed
1439 * asynchronously. See SetHintBits for more info. Check
1440 * that the tuple is hinted xmin-committed because of
1447 if (!HeapTupleHeaderXminCommitted(tuple
.t_data
))
1449 all_visible
= false;
1454 * The inserter definitely committed. But is it old
1455 * enough that everyone sees it as committed?
1457 xmin
= HeapTupleHeaderGetXmin(tuple
.t_data
);
1458 if (!TransactionIdPrecedes(xmin
, vacrel
->OldestXmin
))
1460 all_visible
= false;
1464 /* Track newest xmin on page. */
1465 if (TransactionIdFollows(xmin
, visibility_cutoff_xid
))
1466 visibility_cutoff_xid
= xmin
;
1469 case HEAPTUPLE_RECENTLY_DEAD
:
1472 * If tuple is recently deleted then we must not remove it
1476 all_visible
= false;
1478 case HEAPTUPLE_INSERT_IN_PROGRESS
:
1481 * This is an expected case during concurrent vacuum.
1483 * We do not count these rows as live, because we expect
1484 * the inserting transaction to update the counters at
1485 * commit, and we assume that will happen only after we
1486 * report our results. This assumption is a bit shaky,
1487 * but it is what acquire_sample_rows() does, so be
1490 all_visible
= false;
1492 case HEAPTUPLE_DELETE_IN_PROGRESS
:
1493 /* This is an expected case during concurrent vacuum */
1494 all_visible
= false;
1497 * Count such rows as live. As above, we assume the
1498 * deleting transaction will commit and update the
1499 * counters after we report.
1504 elog(ERROR
, "unexpected HeapTupleSatisfiesVacuum result");
1510 lazy_record_dead_tuple(dead_tuples
, &(tuple
.t_self
));
1511 HeapTupleHeaderAdvanceLatestRemovedXid(tuple
.t_data
,
1512 &vacrel
->latestRemovedXid
);
1514 has_dead_items
= true;
1518 bool tuple_totally_frozen
;
1524 * Each non-removable tuple must be checked to see if it needs
1525 * freezing. Note we already have exclusive buffer lock.
1527 if (heap_prepare_freeze_tuple(tuple
.t_data
,
1528 vacrel
->relfrozenxid
,
1530 vacrel
->FreezeLimit
,
1531 vacrel
->MultiXactCutoff
,
1533 &tuple_totally_frozen
))
1534 frozen
[nfrozen
++].offset
= offnum
;
1536 if (!tuple_totally_frozen
)
1539 } /* scan along page */
1542 * Clear the offset information once we have processed all the tuples
1545 vacrel
->offnum
= InvalidOffsetNumber
;
1548 * If we froze any tuples, mark the buffer dirty, and write a WAL
1549 * record recording the changes. We must log the changes to be
1550 * crash-safe against future truncation of CLOG.
1554 START_CRIT_SECTION();
1556 MarkBufferDirty(buf
);
1558 /* execute collected freezes */
1559 for (i
= 0; i
< nfrozen
; i
++)
1562 HeapTupleHeader htup
;
1564 itemid
= PageGetItemId(page
, frozen
[i
].offset
);
1565 htup
= (HeapTupleHeader
) PageGetItem(page
, itemid
);
1567 heap_execute_freeze_tuple(htup
, &frozen
[i
]);
1570 /* Now WAL-log freezing if necessary */
1571 if (RelationNeedsWAL(vacrel
->rel
))
1575 recptr
= log_heap_freeze(vacrel
->rel
, buf
,
1576 vacrel
->FreezeLimit
, frozen
, nfrozen
);
1577 PageSetLSN(page
, recptr
);
1584 * If there are no indexes we can vacuum the page right now instead of
1585 * doing a second scan. Also we don't do that but forget dead tuples
1586 * when index cleanup is disabled.
1588 if (!vacrel
->useindex
&& dead_tuples
->num_tuples
> 0)
1590 if (vacrel
->nindexes
== 0)
1592 /* Remove tuples from heap if the table has no index */
1593 lazy_vacuum_heap_page(vacrel
, blkno
, buf
, 0, &vmbuffer
);
1595 has_dead_items
= false;
1600 * Here, we have indexes but index cleanup is disabled.
1601 * Instead of vacuuming the dead tuples on the heap, we just
1604 * Note that vacrelstats->dead_tuples could have tuples which
1605 * became dead after HOT-pruning but are not marked dead yet.
1606 * We do not process them because it's a very rare condition,
1607 * and the next vacuum will process them anyway.
1609 Assert(params
->index_cleanup
== VACOPT_TERNARY_DISABLED
);
1613 * Forget the now-vacuumed tuples, and press on, but be careful
1614 * not to reset latestRemovedXid since we want that value to be
1617 dead_tuples
->num_tuples
= 0;
1620 * Periodically do incremental FSM vacuuming to make newly-freed
1621 * space visible on upper FSM pages. Note: although we've cleaned
1622 * the current block, we haven't yet updated its FSM entry (that
1623 * happens further down), so passing end == blkno is correct.
1625 if (blkno
- next_fsm_block_to_vacuum
>= VACUUM_FSM_EVERY_PAGES
)
1627 FreeSpaceMapVacuumRange(vacrel
->rel
, next_fsm_block_to_vacuum
,
1629 next_fsm_block_to_vacuum
= blkno
;
1633 freespace
= PageGetHeapFreeSpace(page
);
1635 /* mark page all-visible, if appropriate */
1636 if (all_visible
&& !all_visible_according_to_vm
)
1638 uint8 flags
= VISIBILITYMAP_ALL_VISIBLE
;
1641 flags
|= VISIBILITYMAP_ALL_FROZEN
;
1644 * It should never be the case that the visibility map page is set
1645 * while the page-level bit is clear, but the reverse is allowed
1646 * (if checksums are not enabled). Regardless, set both bits so
1647 * that we get back in sync.
1649 * NB: If the heap page is all-visible but the VM bit is not set,
1650 * we don't need to dirty the heap page. However, if checksums
1651 * are enabled, we do need to make sure that the heap page is
1652 * dirtied before passing it to visibilitymap_set(), because it
1653 * may be logged. Given that this situation should only happen in
1654 * rare cases after a crash, it is not worth optimizing.
1656 PageSetAllVisible(page
);
1657 MarkBufferDirty(buf
);
1658 visibilitymap_set(vacrel
->rel
, blkno
, buf
, InvalidXLogRecPtr
,
1659 vmbuffer
, visibility_cutoff_xid
, flags
);
1663 * As of PostgreSQL 9.2, the visibility map bit should never be set if
1664 * the page-level bit is clear. However, it's possible that the bit
1665 * got cleared after we checked it and before we took the buffer
1666 * content lock, so we must recheck before jumping to the conclusion
1667 * that something bad has happened.
1669 else if (all_visible_according_to_vm
&& !PageIsAllVisible(page
)
1670 && VM_ALL_VISIBLE(vacrel
->rel
, blkno
, &vmbuffer
))
1672 elog(WARNING
, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1673 vacrel
->relname
, blkno
);
1674 visibilitymap_clear(vacrel
->rel
, blkno
, vmbuffer
,
1675 VISIBILITYMAP_VALID_BITS
);
1679 * It's possible for the value returned by
1680 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
1681 * wrong for us to see tuples that appear to not be visible to
1682 * everyone yet, while PD_ALL_VISIBLE is already set. The real safe
1683 * xmin value never moves backwards, but
1684 * GetOldestNonRemovableTransactionId() is conservative and sometimes
1685 * returns a value that's unnecessarily small, so if we see that
1686 * contradiction it just means that the tuples that we think are not
1687 * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
1690 * There should never be dead tuples on a page with PD_ALL_VISIBLE
1693 else if (PageIsAllVisible(page
) && has_dead_items
)
1695 elog(WARNING
, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1696 vacrel
->relname
, blkno
);
1697 PageClearAllVisible(page
);
1698 MarkBufferDirty(buf
);
1699 visibilitymap_clear(vacrel
->rel
, blkno
, vmbuffer
,
1700 VISIBILITYMAP_VALID_BITS
);
1704 * If the all-visible page is all-frozen but not marked as such yet,
1705 * mark it as all-frozen. Note that all_frozen is only valid if
1706 * all_visible is true, so we must check both.
1708 else if (all_visible_according_to_vm
&& all_visible
&& all_frozen
&&
1709 !VM_ALL_FROZEN(vacrel
->rel
, blkno
, &vmbuffer
))
1712 * We can pass InvalidTransactionId as the cutoff XID here,
1713 * because setting the all-frozen bit doesn't cause recovery
1716 visibilitymap_set(vacrel
->rel
, blkno
, buf
, InvalidXLogRecPtr
,
1717 vmbuffer
, InvalidTransactionId
,
1718 VISIBILITYMAP_ALL_FROZEN
);
1721 UnlockReleaseBuffer(buf
);
1723 /* Remember the location of the last page with nonremovable tuples */
1725 vacrel
->nonempty_pages
= blkno
+ 1;
1728 * If we remembered any tuples for deletion, then the page will be
1729 * visited again by lazy_vacuum_heap_rel, which will compute and record
1730 * its post-compaction free space. If not, then we're done with this
1731 * page, so remember its free space as-is. (This path will always be
1732 * taken if there are no indexes.)
1734 if (dead_tuples
->num_tuples
== prev_dead_count
)
1735 RecordPageWithFreeSpace(vacrel
->rel
, blkno
, freespace
);
1738 /* report that everything is scanned and vacuumed */
1739 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED
, blkno
);
1741 /* Clear the block number information */
1742 vacrel
->blkno
= InvalidBlockNumber
;
	/* save stats for use later */
	vacrel->tuples_deleted = tups_vacuumed;
	vacrel->new_dead_tuples = nkeep;

	/* now we can compute the new value for pg_class.reltuples */
	vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, nblocks,
													 vacrel->tupcount_pages,
													 live_tuples);

	/*
	 * Also compute the total number of surviving heap entries.  In the
	 * (unlikely) scenario that new_live_tuples is -1, take it as zero.
	 */
	vacrel->new_rel_tuples =
		Max(vacrel->new_live_tuples, 0) + vacrel->new_dead_tuples;

	/*
	 * Release any remaining pin on visibility map page.
	 */
	if (BufferIsValid(vmbuffer))
	{
		ReleaseBuffer(vmbuffer);
		vmbuffer = InvalidBuffer;
	}
	/* If any tuples need to be deleted, perform final vacuum cycle */
	/* XXX put a threshold on min number of tuples here? */
	if (dead_tuples->num_tuples > 0)
	{
		/* Work on all the indexes, and then the heap */
		lazy_vacuum_all_indexes(vacrel);

		/* Remove tuples from heap */
		lazy_vacuum_heap_rel(vacrel);
	}

	/*
	 * Vacuum the remainder of the Free Space Map.  We must do this whether or
	 * not there were indexes.
	 */
	if (blkno > next_fsm_block_to_vacuum)
		FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);

	/* report all blocks vacuumed */
	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);

	/* Do post-vacuum cleanup */
	if (vacrel->useindex)
		lazy_cleanup_all_indexes(vacrel);
	/*
	 * Free resources managed by lazy_space_alloc().  (We must end parallel
	 * mode/free shared memory before updating index statistics.  We cannot
	 * write while in parallel mode.)
	 */
	lazy_space_free(vacrel);

	/* Update index statistics */
	if (vacrel->useindex)
		update_index_statistics(vacrel);
	/* If no indexes, make log report that lazy_vacuum_heap_rel would've made */
	if (vacuumed_pages)
		ereport(elevel,
				(errmsg("\"%s\": removed %.0f row versions in %u pages",
						vacrel->relname,
						tups_vacuumed, vacuumed_pages)));

	initStringInfo(&buf);
	appendStringInfo(&buf,
					 _("%.0f dead row versions cannot be removed yet, oldest xmin: %u\n"),
					 nkeep, vacrel->OldestXmin);
	appendStringInfo(&buf, _("There were %.0f unused item identifiers.\n"),
					 nunused);
	appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
									"Skipped %u pages due to buffer pins, ",
									vacrel->pinskipped_pages),
					 vacrel->pinskipped_pages);
	appendStringInfo(&buf, ngettext("%u frozen page.\n",
									"%u frozen pages.\n",
									vacrel->frozenskipped_pages),
					 vacrel->frozenskipped_pages);
	appendStringInfo(&buf, ngettext("%u page is entirely empty.\n",
									"%u pages are entirely empty.\n",
									empty_pages),
					 empty_pages);
	appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
					vacrel->relname,
					tups_vacuumed, num_tuples,
					vacrel->scanned_pages, nblocks),
			 errdetail_internal("%s", buf.data)));
	pfree(buf.data);
}
/*
 *	lazy_vacuum_all_indexes() -- Main entry for index vacuuming
 */
static void
lazy_vacuum_all_indexes(LVRelState *vacrel)
{
	Assert(!IsParallelWorker());
	Assert(vacrel->nindexes > 0);
	Assert(TransactionIdIsNormal(vacrel->relfrozenxid));
	Assert(MultiXactIdIsValid(vacrel->relminmxid));

	/* Log cleanup info before we touch indexes */
	vacuum_log_cleanup_info(vacrel);

	/* Report that we are now vacuuming indexes */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);

	if (!ParallelVacuumIsActive(vacrel))
	{
		for (int idx = 0; idx < vacrel->nindexes; idx++)
		{
			Relation	indrel = vacrel->indrels[idx];
			IndexBulkDeleteResult *istat = vacrel->indstats[idx];

			vacrel->indstats[idx] =
				lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples,
									  vacrel);
		}
	}
	else
	{
		/* Outsource everything to parallel variant */
		do_parallel_lazy_vacuum_all_indexes(vacrel);
	}

	/* Increase and report the number of index scans */
	vacrel->num_index_scans++;
	pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS,
								 vacrel->num_index_scans);
}
/*
 *	lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy
 *
 * This routine marks dead tuples as unused and compacts out free space on
 * their pages.  Pages not having dead tuples recorded from lazy_scan_heap are
 * not visited at all.
 *
 * Note: the reason for doing this as a second pass is we cannot remove the
 * tuples until we've removed their index entries, and we want to process
 * index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap_rel(LVRelState *vacrel)
{
	int			tupindex;
	int			vacuumed_pages;
	PGRUsage	ru0;
	Buffer		vmbuffer = InvalidBuffer;
	LVSavedErrInfo saved_err_info;

	/* Report that we are now vacuuming the heap */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);

	/* Update error traceback information */
	update_vacuum_error_info(vacrel, &saved_err_info,
							 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
							 InvalidBlockNumber, InvalidOffsetNumber);

	pg_rusage_init(&ru0);
	vacuumed_pages = 0;

	tupindex = 0;
	while (tupindex < vacrel->dead_tuples->num_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;
		Size		freespace;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]);
		vacrel->blkno = tblk;
		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
								 vacrel->bstrategy);
		if (!ConditionalLockBufferForCleanup(buf))
		{
			ReleaseBuffer(buf);
			++tupindex;
			continue;
		}
		tupindex = lazy_vacuum_heap_page(vacrel, tblk, buf, tupindex,
										 &vmbuffer);

		/* Now that we've compacted the page, record its available space */
		page = BufferGetPage(buf);
		freespace = PageGetHeapFreeSpace(page);

		UnlockReleaseBuffer(buf);
		RecordPageWithFreeSpace(vacrel->rel, tblk, freespace);
		vacuumed_pages++;
	}

	/* Clear the block number information */
	vacrel->blkno = InvalidBlockNumber;

	if (BufferIsValid(vmbuffer))
	{
		ReleaseBuffer(vmbuffer);
		vmbuffer = InvalidBuffer;
	}

	ereport(elevel,
			(errmsg("\"%s\": removed %d dead item identifiers in %u pages",
					vacrel->relname, tupindex, vacuumed_pages),
			 errdetail_internal("%s", pg_rusage_show(&ru0))));

	/* Revert to the previous phase information for error traceback */
	restore_vacuum_error_info(vacrel, &saved_err_info);
}
/*
 *	lazy_vacuum_heap_page() -- free dead tuples on a page
 *						  and repair its fragmentation.
 *
 * Caller must hold pin and buffer cleanup lock on the buffer.
 *
 * tupindex is the index in vacrel->dead_tuples of the first dead tuple for
 * this page.  We assume the rest follow sequentially.  The return value is
 * the first tupindex after the tuples of this page.
 */
static int
lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
					  int tupindex, Buffer *vmbuffer)
{
	LVDeadTuples *dead_tuples = vacrel->dead_tuples;
	Page		page = BufferGetPage(buffer);
	OffsetNumber unused[MaxHeapTuplesPerPage];
	int			uncnt = 0;
	TransactionId visibility_cutoff_xid;
	bool		all_frozen;
	LVSavedErrInfo saved_err_info;

	pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);

	/* Update error traceback information */
	update_vacuum_error_info(vacrel, &saved_err_info,
							 VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno,
							 InvalidOffsetNumber);

	START_CRIT_SECTION();

	for (; tupindex < dead_tuples->num_tuples; tupindex++)
	{
		BlockNumber tblk;
		OffsetNumber toff;
		ItemId		itemid;

		tblk = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]);
		if (tblk != blkno)
			break;				/* past end of tuples for this block */
		toff = ItemPointerGetOffsetNumber(&dead_tuples->itemptrs[tupindex]);
		itemid = PageGetItemId(page, toff);
		ItemIdSetUnused(itemid);
		unused[uncnt++] = toff;
	}

	PageRepairFragmentation(page);

	/*
	 * Mark buffer dirty before we write WAL.
	 */
	MarkBufferDirty(buffer);

	/* XLOG stuff */
	if (RelationNeedsWAL(vacrel->rel))
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(vacrel->rel, buffer,
								NULL, 0, NULL, 0,
								unused, uncnt,
								vacrel->latestRemovedXid);
		PageSetLSN(page, recptr);
	}

	END_CRIT_SECTION();

	/*
	 * End critical section, so we safely can do visibility tests (which
	 * possibly need to perform IO and allocate memory!).  If we crash now the
	 * page (including the corresponding vm bit) might not be marked all
	 * visible, but that's fine.  A later vacuum will fix that.
	 */

	/*
	 * Now that we have removed the dead tuples from the page, once again
	 * check if the page has become all-visible.  The page is already marked
	 * dirty, exclusively locked, and, if needed, a full page image has been
	 * emitted in the log_heap_clean() above.
	 */
	if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
								 &all_frozen))
		PageSetAllVisible(page);

	/*
	 * All the changes to the heap page have been done.  If the all-visible
	 * flag is now set, also set the VM all-visible bit (and, if possible, the
	 * all-frozen bit) unless this has already been done previously.
	 */
	if (PageIsAllVisible(page))
	{
		uint8		flags = 0;
		uint8		vm_status = visibilitymap_get_status(vacrel->rel,
														 blkno, vmbuffer);

		/* Set the VM all-frozen bit to flag, if needed */
		if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
			flags |= VISIBILITYMAP_ALL_VISIBLE;
		if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
			flags |= VISIBILITYMAP_ALL_FROZEN;

		Assert(BufferIsValid(*vmbuffer));
		if (flags != 0)
			visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
							  *vmbuffer, visibility_cutoff_xid, flags);
	}

	/* Revert to the previous phase information for error traceback */
	restore_vacuum_error_info(vacrel, &saved_err_info);
	return tupindex;
}
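/*
 * Illustrative note (an addition, not original code): because the dead_tuples
 * array is kept in heap order, all TIDs for one block sit next to each other.
 * As a hypothetical example, with itemptrs = [(7,2), (7,5), (9,1)] and
 * tupindex = 0 for blkno 7, the routine above marks offsets 2 and 5 unused on
 * block 7 and returns 2, which the caller then passes back in as the starting
 * tupindex when it moves on to block 9.
 */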
/*
 *	lazy_check_needs_freeze() -- scan page to see if any tuples
 *					 need to be cleaned to avoid wraparound
 *
 * Returns true if the page needs to be vacuumed using cleanup lock.
 * Also returns a flag indicating whether page contains any tuples at all.
 */
static bool
lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel)
{
	Page		page = BufferGetPage(buf);
	OffsetNumber offnum,
				maxoff;
	HeapTupleHeader tupleheader;

	*hastup = false;

	/*
	 * New and empty pages, obviously, don't contain tuples.  We could make
	 * sure that the page is registered in the FSM, but it doesn't seem worth
	 * waiting for a cleanup lock just for that, especially because it's
	 * likely that the pin holder will do so.
	 */
	if (PageIsNew(page) || PageIsEmpty(page))
		return false;

	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;

		/*
		 * Set the offset number so that we can display it along with any
		 * error that occurred while processing this tuple.
		 */
		vacrel->offnum = offnum;
		itemid = PageGetItemId(page, offnum);

		/* this should match hastup test in count_nondeletable_pages() */
		if (ItemIdIsUsed(itemid))
			*hastup = true;

		/* dead and redirect items never need freezing */
		if (!ItemIdIsNormal(itemid))
			continue;

		tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);

		if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit,
									vacrel->MultiXactCutoff, buf))
			break;
	}							/* scan along page */

	/* Clear the offset information once we have processed the given page. */
	vacrel->offnum = InvalidOffsetNumber;

	return (offnum <= maxoff);
}
/*
 * Perform lazy_vacuum_all_indexes() steps in parallel
 */
static void
do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel)
{
	/* Tell parallel workers to do index vacuuming */
	vacrel->lps->lvshared->for_cleanup = false;
	vacrel->lps->lvshared->first_time = false;

	/*
	 * We can only provide an approximate value of num_heap_tuples in vacuum
	 * cases.
	 */
	vacrel->lps->lvshared->reltuples = vacrel->old_live_tuples;
	vacrel->lps->lvshared->estimated_count = true;

	do_parallel_vacuum_or_cleanup(vacrel,
								  vacrel->lps->nindexes_parallel_bulkdel);
}
/*
 * Perform lazy_cleanup_all_indexes() steps in parallel
 */
static void
do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel)
{
	int			nworkers;

	/*
	 * If parallel vacuum is active we perform index cleanup with parallel
	 * workers.
	 *
	 * Tell parallel workers to do index cleanup.
	 */
	vacrel->lps->lvshared->for_cleanup = true;
	vacrel->lps->lvshared->first_time = (vacrel->num_index_scans == 0);

	/*
	 * Now we can provide a better estimate of total number of surviving
	 * tuples (we assume indexes are more interested in that than in the
	 * number of nominally live tuples).
	 */
	vacrel->lps->lvshared->reltuples = vacrel->new_rel_tuples;
	vacrel->lps->lvshared->estimated_count =
		(vacrel->tupcount_pages < vacrel->rel_pages);

	/* Determine the number of parallel workers to launch */
	if (vacrel->lps->lvshared->first_time)
		nworkers = vacrel->lps->nindexes_parallel_cleanup +
			vacrel->lps->nindexes_parallel_condcleanup;
	else
		nworkers = vacrel->lps->nindexes_parallel_cleanup;

	do_parallel_vacuum_or_cleanup(vacrel, nworkers);
}
/*
 * Perform index vacuum or index cleanup with parallel workers.  This function
 * must be used by the parallel vacuum leader process.  The caller must set
 * lps->lvshared->for_cleanup to indicate whether to perform vacuum or
 * cleanup.
 */
static void
do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers)
{
	LVParallelState *lps = vacrel->lps;

	Assert(!IsParallelWorker());
	Assert(ParallelVacuumIsActive(vacrel));
	Assert(vacrel->nindexes > 0);

	/* The leader process will participate */
	nworkers--;

	/*
	 * It is possible that parallel context is initialized with fewer workers
	 * than the number of indexes that need a separate worker in the current
	 * phase, so we need to consider it.  See compute_parallel_vacuum_workers.
	 */
	nworkers = Min(nworkers, lps->pcxt->nworkers);

	/* Setup the shared cost-based vacuum delay and launch workers */
	if (nworkers > 0)
	{
		if (vacrel->num_index_scans > 0)
		{
			/* Reset the parallel index processing counter */
			pg_atomic_write_u32(&(lps->lvshared->idx), 0);

			/* Reinitialize the parallel context to relaunch parallel workers */
			ReinitializeParallelDSM(lps->pcxt);
		}

		/*
		 * Set up shared cost balance and the number of active workers for
		 * vacuum delay.  We need to do this before launching workers as
		 * otherwise, they might not see the updated values for these
		 * parameters.
		 */
		pg_atomic_write_u32(&(lps->lvshared->cost_balance), VacuumCostBalance);
		pg_atomic_write_u32(&(lps->lvshared->active_nworkers), 0);

		/*
		 * The number of workers can vary between bulkdelete and cleanup
		 * phase.
		 */
		ReinitializeParallelWorkers(lps->pcxt, nworkers);

		LaunchParallelWorkers(lps->pcxt);

		if (lps->pcxt->nworkers_launched > 0)
		{
			/*
			 * Reset the local cost values for leader backend as we have
			 * already accumulated the remaining balance of heap.
			 */
			VacuumCostBalance = 0;
			VacuumCostBalanceLocal = 0;

			/* Enable shared cost balance for leader backend */
			VacuumSharedCostBalance = &(lps->lvshared->cost_balance);
			VacuumActiveNWorkers = &(lps->lvshared->active_nworkers);
		}

		if (lps->lvshared->for_cleanup)
			ereport(elevel,
					(errmsg(ngettext("launched %d parallel vacuum worker for index cleanup (planned: %d)",
									 "launched %d parallel vacuum workers for index cleanup (planned: %d)",
									 lps->pcxt->nworkers_launched),
							lps->pcxt->nworkers_launched, nworkers)));
		else
			ereport(elevel,
					(errmsg(ngettext("launched %d parallel vacuum worker for index vacuuming (planned: %d)",
									 "launched %d parallel vacuum workers for index vacuuming (planned: %d)",
									 lps->pcxt->nworkers_launched),
							lps->pcxt->nworkers_launched, nworkers)));
	}

	/* Process the indexes that can be processed by only leader process */
	do_serial_processing_for_unsafe_indexes(vacrel, lps->lvshared);

	/*
	 * Join as a parallel worker.  The leader process alone processes all the
	 * indexes in the case where no workers are launched.
	 */
	do_parallel_processing(vacrel, lps->lvshared);

	/*
	 * Next, accumulate buffer and WAL usage.  (This must wait for the workers
	 * to finish, or we might get incomplete data.)
	 */
	if (nworkers > 0)
	{
		/* Wait for all vacuum workers to finish */
		WaitForParallelWorkersToFinish(lps->pcxt);

		for (int i = 0; i < lps->pcxt->nworkers_launched; i++)
			InstrAccumParallelQuery(&lps->buffer_usage[i], &lps->wal_usage[i]);
	}

	/*
	 * Carry the shared balance value to heap scan and disable shared costing
	 */
	if (VacuumSharedCostBalance)
	{
		VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance);
		VacuumSharedCostBalance = NULL;
		VacuumActiveNWorkers = NULL;
	}
}
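/*
 * Illustrative sketch (an addition, not original code): the cost-based delay
 * bookkeeping above hands the leader's local balance to the workers and takes
 * it back afterwards.  For example, if VacuumCostBalance is 150 when workers
 * are launched, the shared counter starts at 150, the leader and workers then
 * charge their page accesses against that shared counter, and once the
 * workers finish the leader resumes the heap scan with whatever value the
 * shared counter holds at that point.
 */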
/*
 * Index vacuum/cleanup routine used by the leader process and parallel
 * vacuum worker processes to process the indexes in parallel.
 */
static void
do_parallel_processing(LVRelState *vacrel, LVShared *lvshared)
{
	/*
	 * Increment the active worker count if we are able to launch any worker.
	 */
	if (VacuumActiveNWorkers)
		pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);

	/* Loop until all indexes are vacuumed */
	for (;;)
	{
		int			idx;
		LVSharedIndStats *shared_istat;
		Relation	indrel;
		IndexBulkDeleteResult *istat;

		/* Get an index number to process */
		idx = pg_atomic_fetch_add_u32(&(lvshared->idx), 1);

		/* Done for all indexes? */
		if (idx >= vacrel->nindexes)
			break;

		/* Get the index statistics of this index from DSM */
		shared_istat = parallel_stats_for_idx(lvshared, idx);

		/* Skip indexes not participating in parallelism */
		if (shared_istat == NULL)
			continue;

		indrel = vacrel->indrels[idx];

		/*
		 * Skip processing indexes that are unsafe for workers (these are
		 * processed in do_serial_processing_for_unsafe_indexes() by leader)
		 */
		if (!parallel_processing_is_safe(indrel, lvshared))
			continue;

		/* Do vacuum or cleanup of the index */
		istat = (vacrel->indstats[idx]);
		vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
														   lvshared,
														   shared_istat,
														   vacrel);
	}

	/*
	 * We have completed the index vacuum so decrement the active worker
	 * count.
	 */
	if (VacuumActiveNWorkers)
		pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
}
/*
 * Vacuum or cleanup indexes that can be processed by only the leader process
 * because these indexes don't support parallel operation at that phase.
 */
static void
do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared)
{
	Assert(!IsParallelWorker());

	/*
	 * Increment the active worker count if we are able to launch any worker.
	 */
	if (VacuumActiveNWorkers)
		pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);

	for (int idx = 0; idx < vacrel->nindexes; idx++)
	{
		LVSharedIndStats *shared_istat;
		Relation	indrel;
		IndexBulkDeleteResult *istat;

		shared_istat = parallel_stats_for_idx(lvshared, idx);

		/* Skip already-complete indexes */
		if (shared_istat != NULL)
			continue;

		indrel = vacrel->indrels[idx];

		/*
		 * We're only here for the unsafe indexes
		 */
		if (parallel_processing_is_safe(indrel, lvshared))
			continue;

		/* Do vacuum or cleanup of the index */
		istat = (vacrel->indstats[idx]);
		vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
														   lvshared,
														   shared_istat,
														   vacrel);
	}

	/*
	 * We have completed the index vacuum so decrement the active worker
	 * count.
	 */
	if (VacuumActiveNWorkers)
		pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
}
/*
 * Vacuum or cleanup index either by leader process or by one of the worker
 * process.  After processing the index this function copies the index
 * statistics returned from ambulkdelete and amvacuumcleanup to the DSM
 * segment.
 */
static IndexBulkDeleteResult *
parallel_process_one_index(Relation indrel,
						   IndexBulkDeleteResult *istat,
						   LVShared *lvshared,
						   LVSharedIndStats *shared_istat,
						   LVRelState *vacrel)
{
	IndexBulkDeleteResult *istat_res;

	/*
	 * Update the pointer to the corresponding bulk-deletion result if someone
	 * has already updated it
	 */
	if (shared_istat && shared_istat->updated && istat == NULL)
		istat = &shared_istat->istat;

	/* Do vacuum or cleanup of the index */
	if (lvshared->for_cleanup)
		istat_res = lazy_cleanup_one_index(indrel, istat, lvshared->reltuples,
										   lvshared->estimated_count, vacrel);
	else
		istat_res = lazy_vacuum_one_index(indrel, istat, lvshared->reltuples,
										  vacrel);

	/*
	 * Copy the index bulk-deletion result returned from ambulkdelete and
	 * amvacuumcleanup to the DSM segment if it's the first cycle because they
	 * allocate locally and it's possible that an index will be vacuumed by a
	 * different vacuum process the next cycle.  Copying the result normally
	 * happens only the first time an index is vacuumed.  For any additional
	 * vacuum pass, we directly point to the result on the DSM segment and
	 * pass it to vacuum index APIs so that workers can update it directly.
	 *
	 * Since all vacuum workers write the bulk-deletion result at different
	 * slots we can write them without locking.
	 */
	if (shared_istat && !shared_istat->updated && istat_res != NULL)
	{
		memcpy(&shared_istat->istat, istat_res, sizeof(IndexBulkDeleteResult));
		shared_istat->updated = true;

		/* Free the locally-allocated bulk-deletion result */
		pfree(istat_res);

		/* return the pointer to the result from shared memory */
		return &shared_istat->istat;
	}

	return istat_res;
}
/*
 *	lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
 */
static void
lazy_cleanup_all_indexes(LVRelState *vacrel)
{
	Assert(!IsParallelWorker());
	Assert(vacrel->nindexes > 0);

	/* Report that we are now cleaning up indexes */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);

	if (!ParallelVacuumIsActive(vacrel))
	{
		double		reltuples = vacrel->new_rel_tuples;
		bool		estimated_count =
			vacrel->tupcount_pages < vacrel->rel_pages;

		for (int idx = 0; idx < vacrel->nindexes; idx++)
		{
			Relation	indrel = vacrel->indrels[idx];
			IndexBulkDeleteResult *istat = vacrel->indstats[idx];

			vacrel->indstats[idx] =
				lazy_cleanup_one_index(indrel, istat, reltuples,
									   estimated_count, vacrel);
		}
	}
	else
	{
		/* Outsource everything to parallel variant */
		do_parallel_lazy_cleanup_all_indexes(vacrel);
	}
}
/*
 *	lazy_vacuum_one_index() -- vacuum index relation.
 *
 *		Delete all the index entries pointing to tuples listed in
 *		dead_tuples, and update running statistics.
 *
 *		reltuples is the number of heap tuples to be passed to the
 *		bulkdelete callback.  It's always assumed to be estimated.
 *
 * Returns bulk delete stats derived from input stats
 */
static IndexBulkDeleteResult *
lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
					  double reltuples, LVRelState *vacrel)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;
	LVSavedErrInfo saved_err_info;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.analyze_only = false;
	ivinfo.report_progress = false;
	ivinfo.estimated_count = true;
	ivinfo.message_level = elevel;
	ivinfo.num_heap_tuples = reltuples;
	ivinfo.strategy = vacrel->bstrategy;

	/*
	 * Update error traceback information.
	 *
	 * The index name is saved during this phase and restored immediately
	 * after this phase.  See vacuum_error_callback.
	 */
	Assert(vacrel->indname == NULL);
	vacrel->indname = pstrdup(RelationGetRelationName(indrel));
	update_vacuum_error_info(vacrel, &saved_err_info,
							 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
							 InvalidBlockNumber, InvalidOffsetNumber);

	/* Do bulk deletion */
	istat = index_bulk_delete(&ivinfo, istat, lazy_tid_reaped,
							  (void *) vacrel->dead_tuples);

	ereport(elevel,
			(errmsg("scanned index \"%s\" to remove %d row versions",
					vacrel->indname, vacrel->dead_tuples->num_tuples),
			 errdetail_internal("%s", pg_rusage_show(&ru0))));

	/* Revert to the previous phase information for error traceback */
	restore_vacuum_error_info(vacrel, &saved_err_info);
	pfree(vacrel->indname);
	vacrel->indname = NULL;

	return istat;
}
/*
 *	lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
 *
 *		reltuples is the number of heap tuples and estimated_count is true
 *		if reltuples is an estimated value.
 *
 * Returns bulk delete stats derived from input stats
 */
static IndexBulkDeleteResult *
lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
					   double reltuples, bool estimated_count,
					   LVRelState *vacrel)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;
	LVSavedErrInfo saved_err_info;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.analyze_only = false;
	ivinfo.report_progress = false;
	ivinfo.estimated_count = estimated_count;
	ivinfo.message_level = elevel;
	ivinfo.num_heap_tuples = reltuples;
	ivinfo.strategy = vacrel->bstrategy;

	/*
	 * Update error traceback information.
	 *
	 * The index name is saved during this phase and restored immediately
	 * after this phase.  See vacuum_error_callback.
	 */
	Assert(vacrel->indname == NULL);
	vacrel->indname = pstrdup(RelationGetRelationName(indrel));
	update_vacuum_error_info(vacrel, &saved_err_info,
							 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
							 InvalidBlockNumber, InvalidOffsetNumber);

	istat = index_vacuum_cleanup(&ivinfo, istat);

	if (istat)
		ereport(elevel,
				(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
						RelationGetRelationName(indrel),
						(istat)->num_index_tuples,
						(istat)->num_pages),
				 errdetail("%.0f index row versions were removed.\n"
						   "%u index pages were newly deleted.\n"
						   "%u index pages are currently deleted, of which %u are currently reusable.\n"
						   "%s.",
						   (istat)->tuples_removed,
						   (istat)->pages_newly_deleted,
						   (istat)->pages_deleted, (istat)->pages_free,
						   pg_rusage_show(&ru0))));

	/* Revert to the previous phase information for error traceback */
	restore_vacuum_error_info(vacrel, &saved_err_info);
	pfree(vacrel->indname);
	vacrel->indname = NULL;

	return istat;
}
/*
 * should_attempt_truncation - should we attempt to truncate the heap?
 *
 * Don't even think about it unless we have a shot at releasing a goodly
 * number of pages.  Otherwise, the time taken isn't worth it.
 *
 * Also don't attempt it if we are doing early pruning/vacuuming, because a
 * scan which cannot find a truncated heap page cannot determine that the
 * snapshot is too old to read that page.  We might be able to get away with
 * truncating all except one of the pages, setting its LSN to (at least) the
 * maximum of the truncated range if we also treated an index leaf tuple
 * pointing to a missing heap page as something to trigger the "snapshot too
 * old" error, but that seems fragile and seems like it deserves its own patch
 * if we consider it.
 *
 * This is split out so that we can test whether truncation is going to be
 * called for before we actually do it.  If you change the logic here, be
 * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
 */
static bool
should_attempt_truncation(LVRelState *vacrel, VacuumParams *params)
{
	BlockNumber possibly_freeable;

	if (params->truncate == VACOPT_TERNARY_DISABLED)
		return false;

	possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
	if (possibly_freeable > 0 &&
		(possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		 possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION) &&
		old_snapshot_threshold < 0)
		return true;
	else
		return false;
}
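/*
 * Worked example (added for illustration; the thresholds are the constants
 * defined near the top of this file, conventionally REL_TRUNCATE_MINIMUM =
 * 1000 and REL_TRUNCATE_FRACTION = 16): for a 100,000-page table with
 * nonempty_pages = 99,500, possibly_freeable is 500, which is below both
 * 1000 and 100,000 / 16 = 6,250, so truncation is skipped; with
 * nonempty_pages = 90,000, possibly_freeable is 10,000 and truncation is
 * attempted (provided old_snapshot_threshold is disabled).
 */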
/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(LVRelState *vacrel)
{
	BlockNumber old_rel_pages = vacrel->rel_pages;
	BlockNumber new_rel_pages;
	int			lock_retry;

	/* Report that we are now truncating */
	pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
								 PROGRESS_VACUUM_PHASE_TRUNCATE);

	/*
	 * Loop until no more truncating can be done.
	 */
	do
	{
		PGRUsage	ru0;

		pg_rusage_init(&ru0);

		/*
		 * We need full exclusive lock on the relation in order to do
		 * truncation.  If we can't get it, give up rather than waiting --- we
		 * don't want to block other backends, and we don't want to deadlock
		 * (which is quite possible considering we already hold a lower-grade
		 * lock).
		 */
		vacrel->lock_waiter_detected = false;
		lock_retry = 0;
		while (true)
		{
			if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock))
				break;

			/*
			 * Check for interrupts while trying to (re-)acquire the exclusive
			 * lock.
			 */
			CHECK_FOR_INTERRUPTS();

			if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
								VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
			{
				/*
				 * We failed to establish the lock in the specified number of
				 * retries.  This means we give up truncating.
				 */
				vacrel->lock_waiter_detected = true;
				ereport(elevel,
						(errmsg("\"%s\": stopping truncate due to conflicting lock request",
								vacrel->relname)));
				return;
			}

			pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
		}

		/*
		 * Now that we have exclusive lock, look to see if the rel has grown
		 * whilst we were vacuuming with non-exclusive lock.  If so, give up;
		 * the newly added pages presumably contain non-deletable tuples.
		 */
		new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
		if (new_rel_pages != old_rel_pages)
		{
			/*
			 * Note: we intentionally don't update vacrel->rel_pages with the
			 * new rel size here.  If we did, it would amount to assuming that
			 * the new pages are empty, which is unlikely.  Leaving the numbers
			 * alone amounts to assuming that the new pages have the same
			 * tuple density as existing ones, which is less unlikely.
			 */
			UnlockRelation(vacrel->rel, AccessExclusiveLock);
			return;
		}

		/*
		 * Scan backwards from the end to verify that the end pages actually
		 * contain no tuples.  This is *necessary*, not optional, because
		 * other backends could have added tuples to these pages whilst we
		 * were vacuuming.
		 */
		new_rel_pages = count_nondeletable_pages(vacrel);
		vacrel->blkno = new_rel_pages;

		if (new_rel_pages >= old_rel_pages)
		{
			/* can't do anything after all */
			UnlockRelation(vacrel->rel, AccessExclusiveLock);
			return;
		}

		/*
		 * Okay to truncate.
		 */
		RelationTruncate(vacrel->rel, new_rel_pages);

		/*
		 * We can release the exclusive lock as soon as we have truncated.
		 * Other backends can't safely access the relation until they have
		 * processed the smgr invalidation that smgrtruncate sent out ... but
		 * that should happen as part of standard invalidation processing once
		 * they acquire lock on the relation.
		 */
		UnlockRelation(vacrel->rel, AccessExclusiveLock);

		/*
		 * Update statistics.  Here, it *is* correct to adjust rel_pages
		 * without also touching reltuples, since the tuple count wasn't
		 * changed by the truncation.
		 */
		vacrel->pages_removed += old_rel_pages - new_rel_pages;
		vacrel->rel_pages = new_rel_pages;

		ereport(elevel,
				(errmsg("\"%s\": truncated %u to %u pages",
						vacrel->relname,
						old_rel_pages, new_rel_pages),
				 errdetail_internal("%s",
									pg_rusage_show(&ru0))));
		old_rel_pages = new_rel_pages;
	} while (new_rel_pages > vacrel->nonempty_pages &&
			 vacrel->lock_waiter_detected);
}
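/*
 * Worked example (illustrative addition): with the defaults defined earlier
 * in this file (VACUUM_TRUNCATE_LOCK_TIMEOUT = 5000 ms and
 * VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL = 50 ms, assuming the stock values),
 * the retry loop above gives up after 5000 / 50 = 100 failed attempts, i.e.
 * after roughly five seconds of waiting for the AccessExclusiveLock.
 */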
/*
 * Rescan end pages to verify that they are (still) empty of tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(LVRelState *vacrel)
{
	BlockNumber blkno;
	BlockNumber prefetchedUntil;
	instr_time	starttime;

	/* Initialize the starttime if we check for conflicting lock requests */
	INSTR_TIME_SET_CURRENT(starttime);

	/*
	 * Start checking blocks at what we believe relation end to be and move
	 * backwards.  (Strange coding of loop control is needed because blkno is
	 * unsigned.)  To make the scan faster, we prefetch a few blocks at a time
	 * in forward direction, so that OS-level readahead can kick in.
	 */
	blkno = vacrel->rel_pages;
	StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
					 "prefetch size must be power of 2");
	prefetchedUntil = InvalidBlockNumber;
	while (blkno > vacrel->nonempty_pages)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		hastup;

		/*
		 * Check if another process requests a lock on our relation.  We are
		 * holding an AccessExclusiveLock here, so they will be waiting.  We
		 * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
		 * only check if that interval has elapsed once every 32 blocks to
		 * keep the number of system calls and actual shared lock table
		 * lookups to a minimum.
		 */
		if ((blkno % 32) == 0)
		{
			instr_time	currenttime;
			instr_time	elapsed;

			INSTR_TIME_SET_CURRENT(currenttime);
			elapsed = currenttime;
			INSTR_TIME_SUBTRACT(elapsed, starttime);
			if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
				>= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
			{
				if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
				{
					ereport(elevel,
							(errmsg("\"%s\": suspending truncate due to conflicting lock request",
									vacrel->relname)));

					vacrel->lock_waiter_detected = true;
					return blkno;
				}
				starttime = currenttime;
			}
		}

		/*
		 * We don't insert a vacuum delay point here, because we have an
		 * exclusive lock on the table which we want to hold for as short a
		 * time as possible.  We still need to check for interrupts however.
		 */
		CHECK_FOR_INTERRUPTS();

		blkno--;

		/* If we haven't prefetched this lot yet, do so now. */
		if (prefetchedUntil > blkno)
		{
			BlockNumber prefetchStart;
			BlockNumber pblkno;

			prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
			for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
			{
				PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
				CHECK_FOR_INTERRUPTS();
			}
			prefetchedUntil = prefetchStart;
		}

		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
								 vacrel->bstrategy);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			UnlockReleaseBuffer(buf);
			continue;
		}

		hastup = false;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/*
			 * Note: any non-unused item should be taken as a reason to keep
			 * this page.  We formerly thought that DEAD tuples could be
			 * thrown away, but that's not so, because we'd not have cleaned
			 * out their index entries.
			 */
			if (ItemIdIsUsed(itemid))
			{
				hastup = true;
				break;			/* can stop scanning */
			}
		}						/* scan along page */

		UnlockReleaseBuffer(buf);

		/* Done scanning if we found a tuple here */
		if (hastup)
			return blkno + 1;
	}

	/*
	 * If we fall out of the loop, all the previously-thought-to-be-empty
	 * pages still are; we need not bother to look at the last known-nonempty
	 * page.
	 */
	return vacrel->nonempty_pages;
}
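/*
 * Worked example (illustrative addition, assuming PREFETCH_SIZE is the
 * power-of-two constant defined earlier in this file, conventionally 32):
 * when the check above fires with blkno = 1000, prefetchStart = 1000 & ~31 =
 * 992, so blocks 992..1000 are prefetched in forward order before the
 * backward scan reads them, and prefetchedUntil is set to 992 so the
 * iterations for blocks 999 down to 992 skip the prefetch step.
 */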
/*
 * Return the maximum number of dead tuples we can record.
 */
static long
compute_max_dead_tuples(BlockNumber relblocks, bool useindex)
{
	long		maxtuples;
	int			vac_work_mem = IsAutoVacuumWorkerProcess() &&
	autovacuum_work_mem != -1 ?
	autovacuum_work_mem : maintenance_work_mem;

	if (useindex)
	{
		maxtuples = MAXDEADTUPLES(vac_work_mem * 1024L);
		maxtuples = Min(maxtuples, INT_MAX);
		maxtuples = Min(maxtuples, MAXDEADTUPLES(MaxAllocSize));

		/* curious coding here to ensure the multiplication can't overflow */
		if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
			maxtuples = relblocks * LAZY_ALLOC_TUPLES;

		/* stay sane if small maintenance_work_mem */
		maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
	}
	else
		maxtuples = MaxHeapTuplesPerPage;

	return maxtuples;
}
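/*
 * Worked example (illustrative addition): MAXDEADTUPLES() divides the byte
 * budget by sizeof(ItemPointerData), which is 6 bytes.  With a 64MB
 * maintenance_work_mem the array can therefore hold roughly
 * 64 * 1024 * 1024 / 6, i.e. about 11 million TIDs.  For a small table the
 * relblocks * LAZY_ALLOC_TUPLES cap kicks in first: e.g. a 100-block table is
 * limited to 100 * MaxHeapTuplesPerPage entries (assuming LAZY_ALLOC_TUPLES
 * is defined as MaxHeapTuplesPerPage earlier in this file).
 */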
/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelState *vacrel, int nworkers, BlockNumber nblocks)
{
	LVDeadTuples *dead_tuples;
	long		maxtuples;

	/*
	 * Initialize state for a parallel vacuum.  As of now, only one worker can
	 * be used for an index, so we invoke parallelism only if there are at
	 * least two indexes on a table.
	 */
	if (nworkers >= 0 && vacrel->nindexes > 1)
	{
		/*
		 * Since parallel workers cannot access data in temporary tables, we
		 * can't perform parallel vacuum on them.
		 */
		if (RelationUsesLocalBuffers(vacrel->rel))
		{
			/*
			 * Give warning only if the user explicitly tries to perform a
			 * parallel vacuum on the temporary table.
			 */
			if (nworkers > 0)
				ereport(WARNING,
						(errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
								vacrel->relname)));
		}
		else
			vacrel->lps = begin_parallel_vacuum(vacrel, nblocks, nworkers);

		/* If parallel mode started, we're done */
		if (ParallelVacuumIsActive(vacrel))
			return;
	}

	maxtuples = compute_max_dead_tuples(nblocks, vacrel->nindexes > 0);

	dead_tuples = (LVDeadTuples *) palloc(SizeOfDeadTuples(maxtuples));
	dead_tuples->num_tuples = 0;
	dead_tuples->max_tuples = (int) maxtuples;

	vacrel->dead_tuples = dead_tuples;
}
/*
 * lazy_space_free - free space allocated in lazy_space_alloc
 */
static void
lazy_space_free(LVRelState *vacrel)
{
	if (!ParallelVacuumIsActive(vacrel))
		return;

	/*
	 * End parallel mode before updating index statistics as we cannot write
	 * during parallel mode.
	 */
	end_parallel_vacuum(vacrel);
}
/*
 * lazy_record_dead_tuple - remember one deletable tuple
 */
static void
lazy_record_dead_tuple(LVDeadTuples *dead_tuples, ItemPointer itemptr)
{
	/*
	 * The array shouldn't overflow under normal behavior, but perhaps it
	 * could if we are given a really small maintenance_work_mem.  In that
	 * case, just forget the last few tuples (we'll get 'em next time).
	 */
	if (dead_tuples->num_tuples < dead_tuples->max_tuples)
	{
		dead_tuples->itemptrs[dead_tuples->num_tuples] = *itemptr;
		dead_tuples->num_tuples++;
		pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
									 dead_tuples->num_tuples);
	}
}
/*
 *	lazy_tid_reaped() -- is a particular tid deletable?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		Assumes dead_tuples array is in sorted order.
 */
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
	LVDeadTuples *dead_tuples = (LVDeadTuples *) state;
	int64		litem,
				ritem,
				item;
	ItemPointer res;

	litem = itemptr_encode(&dead_tuples->itemptrs[0]);
	ritem = itemptr_encode(&dead_tuples->itemptrs[dead_tuples->num_tuples - 1]);
	item = itemptr_encode(itemptr);

	/*
	 * Doing a simple bound check before bsearch() is useful to avoid the
	 * extra cost of bsearch(), especially if dead tuples on the heap are
	 * concentrated in a certain range.  Since this function is called for
	 * every index tuple, it pays to be really fast.
	 */
	if (item < litem || item > ritem)
		return false;

	res = (ItemPointer) bsearch((void *) itemptr,
								(void *) dead_tuples->itemptrs,
								dead_tuples->num_tuples,
								sizeof(ItemPointerData),
								vac_cmp_itemptr);

	return (res != NULL);
}
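/*
 * Illustrative note (added): itemptr_encode() folds (block, offset) into a
 * single int64 that sorts the same way as vac_cmp_itemptr(), so the cheap
 * range test above is safe.  For example, if the dead TIDs span (10,1) ..
 * (20,7), an index entry pointing at (35,2) encodes to a value above the
 * upper bound and is rejected without calling bsearch() at all.
 */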
/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;
	OffsetNumber loff,
				roff;

	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
	rblk = ItemPointerGetBlockNumber((ItemPointer) right);

	if (lblk < rblk)
		return -1;
	if (lblk > rblk)
		return 1;

	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
	roff = ItemPointerGetOffsetNumber((ItemPointer) right);

	if (loff < roff)
		return -1;
	if (loff > roff)
		return 1;

	return 0;
}
/*
 * Check if every tuple in the given page is visible to all current and future
 * transactions.  Also return the visibility_cutoff_xid which is the highest
 * xmin amongst the visible tuples.  Set *all_frozen to true if every tuple
 * on this page is frozen.
 */
static bool
heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
						 TransactionId *visibility_cutoff_xid,
						 bool *all_frozen)
{
	Page		page = BufferGetPage(buf);
	BlockNumber blockno = BufferGetBlockNumber(buf);
	OffsetNumber offnum,
				maxoff;
	bool		all_visible = true;

	*visibility_cutoff_xid = InvalidTransactionId;
	*all_frozen = true;

	/*
	 * This is a stripped down version of the line pointer scan in
	 * lazy_scan_heap().  So if you change anything here, also check that code.
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff && all_visible;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;
		HeapTupleData tuple;

		/*
		 * Set the offset number so that we can display it along with any
		 * error that occurred while processing this tuple.
		 */
		vacrel->offnum = offnum;
		itemid = PageGetItemId(page, offnum);

		/* Unused or redirect line pointers are of no interest */
		if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
			continue;

		ItemPointerSet(&(tuple.t_self), blockno, offnum);

		/*
		 * Dead line pointers can have index pointers pointing to them.  So
		 * they can't be treated as visible
		 */
		if (ItemIdIsDead(itemid))
		{
			all_visible = false;
			*all_frozen = false;
			break;
		}

		Assert(ItemIdIsNormal(itemid));

		tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
		tuple.t_len = ItemIdGetLength(itemid);
		tuple.t_tableOid = RelationGetRelid(vacrel->rel);

		switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
		{
			case HEAPTUPLE_LIVE:
				{
					TransactionId xmin;

					/* Check comments in lazy_scan_heap. */
					if (!HeapTupleHeaderXminCommitted(tuple.t_data))
					{
						all_visible = false;
						*all_frozen = false;
						break;
					}

					/*
					 * The inserter definitely committed.  But is it old enough
					 * that everyone sees it as committed?
					 */
					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
					if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
					{
						all_visible = false;
						*all_frozen = false;
						break;
					}

					/* Track newest xmin on page. */
					if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
						*visibility_cutoff_xid = xmin;

					/* Check whether this tuple is already frozen or not */
					if (all_visible && *all_frozen &&
						heap_tuple_needs_eventual_freeze(tuple.t_data))
						*all_frozen = false;
				}
				break;

			case HEAPTUPLE_DEAD:
			case HEAPTUPLE_RECENTLY_DEAD:
			case HEAPTUPLE_INSERT_IN_PROGRESS:
			case HEAPTUPLE_DELETE_IN_PROGRESS:
				{
					all_visible = false;
					*all_frozen = false;
					break;
				}
			default:
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
				break;
		}
	}							/* scan along page */

	/* Clear the offset information once we have processed the given page. */
	vacrel->offnum = InvalidOffsetNumber;

	return all_visible;
}
/*
 * Compute the number of parallel worker processes to request.  Both index
 * vacuum and index cleanup can be executed with parallel workers.  The index
 * is eligible for parallel vacuum iff its size is greater than
 * min_parallel_index_scan_size as invoking workers for very small indexes
 * can hurt performance.
 *
 * nrequested is the number of parallel workers that user requested.  If
 * nrequested is 0, we compute the parallel degree based on nindexes, that is
 * the number of indexes that support parallel vacuum.  This function also
 * sets can_parallel_vacuum to remember indexes that participate in parallel
 * vacuum.
 */
static int
compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested,
								bool *can_parallel_vacuum)
{
	int			nindexes_parallel = 0;
	int			nindexes_parallel_bulkdel = 0;
	int			nindexes_parallel_cleanup = 0;
	int			parallel_workers;

	/*
	 * We don't allow performing parallel operation in standalone backend or
	 * when parallelism is disabled.
	 */
	if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0)
		return 0;

	/*
	 * Compute the number of indexes that can participate in parallel vacuum.
	 */
	for (int idx = 0; idx < vacrel->nindexes; idx++)
	{
		Relation	indrel = vacrel->indrels[idx];
		uint8		vacoptions = indrel->rd_indam->amparallelvacuumoptions;

		if (vacoptions == VACUUM_OPTION_NO_PARALLEL ||
			RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size)
			continue;

		can_parallel_vacuum[idx] = true;

		if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
			nindexes_parallel_bulkdel++;
		if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) ||
			((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
			nindexes_parallel_cleanup++;
	}

	nindexes_parallel = Max(nindexes_parallel_bulkdel,
							nindexes_parallel_cleanup);

	/* The leader process takes one index */
	nindexes_parallel--;

	/* No index supports parallel vacuum */
	if (nindexes_parallel <= 0)
		return 0;

	/* Compute the parallel degree */
	parallel_workers = (nrequested > 0) ?
		Min(nrequested, nindexes_parallel) : nindexes_parallel;

	/* Cap by max_parallel_maintenance_workers */
	parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers);

	return parallel_workers;
}
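/*
 * Worked example (illustrative addition): suppose a table has four indexes,
 * three of which support parallel bulkdelete and two of which support some
 * form of parallel cleanup, all larger than min_parallel_index_scan_size.
 * Then nindexes_parallel = Max(3, 2) - 1 = 2 after reserving one index for
 * the leader, so an unqualified PARALLEL request launches two workers,
 * further capped by max_parallel_maintenance_workers.
 */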
/*
 * Update index statistics in pg_class if the statistics are accurate.
 */
static void
update_index_statistics(LVRelState *vacrel)
{
	Relation   *indrels = vacrel->indrels;
	int			nindexes = vacrel->nindexes;
	IndexBulkDeleteResult **indstats = vacrel->indstats;

	Assert(!IsInParallelMode());

	for (int idx = 0; idx < nindexes; idx++)
	{
		Relation	indrel = indrels[idx];
		IndexBulkDeleteResult *istat = indstats[idx];

		if (istat == NULL || istat->estimated_count)
			continue;

		/* Update index statistics */
		vac_update_relstats(indrel,
							istat->num_pages,
							istat->num_index_tuples,
							0,
							false,
							InvalidTransactionId,
							InvalidMultiXactId,
							false);
	}
}
/*
 * This function prepares and returns parallel vacuum state if we can launch
 * even one worker.  This function is responsible for entering parallel mode,
 * create a parallel context, and then initialize the DSM segment.
 */
static LVParallelState *
begin_parallel_vacuum(LVRelState *vacrel, BlockNumber nblocks,
					  int nrequested)
{
	LVParallelState *lps = NULL;
	Relation   *indrels = vacrel->indrels;
	int			nindexes = vacrel->nindexes;
	ParallelContext *pcxt;
	LVShared   *shared;
	LVDeadTuples *dead_tuples;
	BufferUsage *buffer_usage;
	WalUsage   *wal_usage;
	bool	   *can_parallel_vacuum;
	long		maxtuples;
	Size		est_shared;
	Size		est_deadtuples;
	int			nindexes_mwm = 0;
	int			parallel_workers = 0;
	int			querylen;

	/*
	 * A parallel vacuum must be requested and there must be indexes on the
	 * relation
	 */
	Assert(nrequested >= 0);
	Assert(nindexes > 0);

	/*
	 * Compute the number of parallel vacuum workers to launch
	 */
	can_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes);
	parallel_workers = compute_parallel_vacuum_workers(vacrel,
													   nrequested,
													   can_parallel_vacuum);

	/* Can't perform vacuum in parallel */
	if (parallel_workers <= 0)
	{
		pfree(can_parallel_vacuum);
		return lps;
	}

	lps = (LVParallelState *) palloc0(sizeof(LVParallelState));

	EnterParallelMode();
	pcxt = CreateParallelContext("postgres", "parallel_vacuum_main",
								 parallel_workers);
	Assert(pcxt->nworkers > 0);
	lps->pcxt = pcxt;
	/* Estimate size for shared information -- PARALLEL_VACUUM_KEY_SHARED */
	est_shared = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
	for (int idx = 0; idx < nindexes; idx++)
	{
		Relation	indrel = indrels[idx];
		uint8		vacoptions = indrel->rd_indam->amparallelvacuumoptions;

		/*
		 * Cleanup option should be either disabled, always performing in
		 * parallel or conditionally performing in parallel.
		 */
		Assert(((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) ||
			   ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0));
		Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE);

		/* Skip indexes that don't participate in parallel vacuum */
		if (!can_parallel_vacuum[idx])
			continue;

		if (indrel->rd_indam->amusemaintenanceworkmem)
			nindexes_mwm++;

		est_shared = add_size(est_shared, sizeof(LVSharedIndStats));

		/*
		 * Remember the number of indexes that support parallel operation for
		 * each phase.
		 */
		if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
			lps->nindexes_parallel_bulkdel++;
		if ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0)
			lps->nindexes_parallel_cleanup++;
		if ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)
			lps->nindexes_parallel_condcleanup++;
	}
	shm_toc_estimate_chunk(&pcxt->estimator, est_shared);
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/* Estimate size for dead tuples -- PARALLEL_VACUUM_KEY_DEAD_TUPLES */
	maxtuples = compute_max_dead_tuples(nblocks, true);
	est_deadtuples = MAXALIGN(SizeOfDeadTuples(maxtuples));
	shm_toc_estimate_chunk(&pcxt->estimator, est_deadtuples);
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/*
	 * Estimate space for BufferUsage and WalUsage --
	 * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
	 *
	 * If there are no extensions loaded that care, we could skip this.  We
	 * have no way of knowing whether anyone's looking at pgBufferUsage or
	 * pgWalUsage, so do it unconditionally.
	 */
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
	if (debug_query_string)
	{
		querylen = strlen(debug_query_string);
		shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
		shm_toc_estimate_keys(&pcxt->estimator, 1);
	}
	else
		querylen = 0;			/* keep compiler quiet */
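	/*
	 * Illustrative note (added): the PARALLEL_VACUUM_KEY_SHARED chunk
	 * estimated above is laid out as the fixed LVShared header, followed by
	 * a null-bitmap with one bit per index, followed by one LVSharedIndStats
	 * slot for each index whose bit is set.  For a table with three indexes
	 * of which two participate, that is the MAXALIGN'ed SizeOfLVShared +
	 * BITMAPLEN(3) header plus two LVSharedIndStats entries.
	 */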
	InitializeParallelDSM(pcxt);

	/* Prepare shared information */
	shared = (LVShared *) shm_toc_allocate(pcxt->toc, est_shared);
	MemSet(shared, 0, est_shared);
	shared->relid = RelationGetRelid(vacrel->rel);
	shared->elevel = elevel;
	shared->maintenance_work_mem_worker =
		(nindexes_mwm > 0) ?
		maintenance_work_mem / Min(parallel_workers, nindexes_mwm) :
		maintenance_work_mem;

	pg_atomic_init_u32(&(shared->cost_balance), 0);
	pg_atomic_init_u32(&(shared->active_nworkers), 0);
	pg_atomic_init_u32(&(shared->idx), 0);
	shared->offset = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));

	/*
	 * Initialize variables for shared index statistics, set NULL bitmap and
	 * the size of stats for each index.
	 */
	memset(shared->bitmap, 0x00, BITMAPLEN(nindexes));
	for (int idx = 0; idx < nindexes; idx++)
	{
		if (!can_parallel_vacuum[idx])
			continue;

		/* Set NOT NULL as this index does support parallelism */
		shared->bitmap[idx >> 3] |= 1 << (idx & 0x07);
	}

	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
	lps->lvshared = shared;

	/* Prepare the dead tuple space */
	dead_tuples = (LVDeadTuples *) shm_toc_allocate(pcxt->toc, est_deadtuples);
	dead_tuples->max_tuples = maxtuples;
	dead_tuples->num_tuples = 0;
	MemSet(dead_tuples->itemptrs, 0, sizeof(ItemPointerData) * maxtuples);
	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_TUPLES, dead_tuples);
	vacrel->dead_tuples = dead_tuples;

	/*
	 * Allocate space for each worker's BufferUsage and WalUsage; no need to
	 * initialize
	 */
	buffer_usage = shm_toc_allocate(pcxt->toc,
									mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
	lps->buffer_usage = buffer_usage;
	wal_usage = shm_toc_allocate(pcxt->toc,
								 mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
	lps->wal_usage = wal_usage;

	/* Store query string for workers */
	if (debug_query_string)
	{
		char	   *sharedquery;

		sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
		memcpy(sharedquery, debug_query_string, querylen + 1);
		sharedquery[querylen] = '\0';
		shm_toc_insert(pcxt->toc,
					   PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery);
	}

	pfree(can_parallel_vacuum);
	return lps;
}
/*
 * Destroy the parallel context, and end parallel mode.
 *
 * Since writes are not allowed during parallel mode, copy the
 * updated index statistics from DSM into local memory and then later use that
 * to update the index statistics.  One might think that we can exit from
 * parallel mode, update the index statistics and then destroy parallel
 * context, but that won't be safe (see ExitParallelMode).
 */
static void
end_parallel_vacuum(LVRelState *vacrel)
{
	IndexBulkDeleteResult **indstats = vacrel->indstats;
	LVParallelState *lps = vacrel->lps;
	int			nindexes = vacrel->nindexes;

	Assert(!IsParallelWorker());

	/* Copy the updated statistics */
	for (int idx = 0; idx < nindexes; idx++)
	{
		LVSharedIndStats *shared_istat;

		shared_istat = parallel_stats_for_idx(lps->lvshared, idx);

		/*
		 * Skip unused slot.  The statistics of this index are already stored
		 * in local memory.
		 */
		if (shared_istat == NULL)
			continue;

		if (shared_istat->updated)
		{
			indstats[idx] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
			memcpy(indstats[idx], &(shared_istat->istat), sizeof(IndexBulkDeleteResult));
		}
		else
			indstats[idx] = NULL;
	}

	DestroyParallelContext(lps->pcxt);
	ExitParallelMode();

	/* Deactivate parallel vacuum */
	pfree(lps);
	vacrel->lps = NULL;
}
/*
 * Return shared memory statistics for index at offset 'getidx', if any
 */
static LVSharedIndStats *
parallel_stats_for_idx(LVShared *lvshared, int getidx)
{
	char	   *p;

	if (IndStatsIsNull(lvshared, getidx))
		return NULL;

	p = (char *) GetSharedIndStats(lvshared);
	for (int idx = 0; idx < getidx; idx++)
	{
		if (IndStatsIsNull(lvshared, idx))
			continue;

		p += sizeof(LVSharedIndStats);
	}

	return (LVSharedIndStats *) p;
}
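/*
 * Illustrative note (added): the walk above only advances past slots that
 * actually exist.  For example, with four indexes where indexes 0 and 2 were
 * skipped for parallelism (their bitmap bits are clear), the stats for index
 * 3 live at GetSharedIndStats(lvshared) + 1 * sizeof(LVSharedIndStats),
 * because only index 1 occupies a slot before it.
 */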
/*
 * Returns false, if the given index can't participate in parallel index
 * vacuum or parallel index cleanup
 */
static bool
parallel_processing_is_safe(Relation indrel, LVShared *lvshared)
{
	uint8		vacoptions = indrel->rd_indam->amparallelvacuumoptions;

	/* first_time must be true only if for_cleanup is true */
	Assert(lvshared->for_cleanup || !lvshared->first_time);

	if (lvshared->for_cleanup)
	{
		/* Skip, if the index does not support parallel cleanup */
		if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) &&
			((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0))
			return false;

		/*
		 * Skip, if the index supports parallel cleanup conditionally, but we
		 * have already processed the index (for bulkdelete).  See the
		 * comments for option VACUUM_OPTION_PARALLEL_COND_CLEANUP to know
		 * when indexes support parallel cleanup conditionally.
		 */
		if (!lvshared->first_time &&
			((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
			return false;
	}
	else if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) == 0)
	{
		/* Skip if the index does not support parallel bulk deletion */
		return false;
	}

	return true;
}
/*
 * Perform work within a launched parallel process.
 *
 * Since parallel vacuum workers perform only index vacuum or index cleanup,
 * we don't need to report progress information.
 */
void
parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
{
	Relation	rel;
	Relation   *indrels;
	LVShared   *lvshared;
	LVDeadTuples *dead_tuples;
	BufferUsage *buffer_usage;
	WalUsage   *wal_usage;
	int			nindexes;
	char	   *sharedquery;
	LVRelState	vacrel;
	ErrorContextCallback errcallback;

	lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
										   false);
	elevel = lvshared->elevel;

	if (lvshared->for_cleanup)
		elog(DEBUG1, "starting parallel vacuum worker for cleanup");
	else
		elog(DEBUG1, "starting parallel vacuum worker for bulk delete");

	/* Set debug_query_string for individual workers */
	sharedquery = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_QUERY_TEXT, true);
	debug_query_string = sharedquery;
	pgstat_report_activity(STATE_RUNNING, debug_query_string);

	/*
	 * Open table.  The lock mode is the same as the leader process.  It's
	 * okay because the lock mode does not conflict among the parallel
	 * workers.
	 */
	rel = table_open(lvshared->relid, ShareUpdateExclusiveLock);

	/*
	 * Open all indexes.  indrels are sorted in order by OID, which should be
	 * matched to the leader's one.
	 */
	vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels);
	Assert(nindexes > 0);

	/* Set dead tuple space */
	dead_tuples = (LVDeadTuples *) shm_toc_lookup(toc,
												  PARALLEL_VACUUM_KEY_DEAD_TUPLES,
												  false);

	/* Set cost-based vacuum delay */
	VacuumCostActive = (VacuumCostDelay > 0);
	VacuumCostBalance = 0;
	VacuumPageHit = 0;
	VacuumPageMiss = 0;
	VacuumPageDirty = 0;
	VacuumCostBalanceLocal = 0;
	VacuumSharedCostBalance = &(lvshared->cost_balance);
	VacuumActiveNWorkers = &(lvshared->active_nworkers);

	vacrel.indrels = indrels;
	vacrel.nindexes = nindexes;
	vacrel.indstats = (IndexBulkDeleteResult **)
		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

	if (lvshared->maintenance_work_mem_worker > 0)
		maintenance_work_mem = lvshared->maintenance_work_mem_worker;

	/*
	 * Initialize vacrel for use as error callback arg by parallel worker.
	 */
	vacrel.relnamespace = get_namespace_name(RelationGetNamespace(rel));
	vacrel.relname = pstrdup(RelationGetRelationName(rel));
	vacrel.indname = NULL;
	vacrel.phase = VACUUM_ERRCB_PHASE_UNKNOWN;	/* Not yet processing */
	vacrel.dead_tuples = dead_tuples;

	/* Setup error traceback support for ereport() */
	errcallback.callback = vacuum_error_callback;
	errcallback.arg = &vacrel;
	errcallback.previous = error_context_stack;
	error_context_stack = &errcallback;

	/* Prepare to track buffer usage during parallel execution */
	InstrStartParallelQuery();

	/* Process indexes to perform vacuum/cleanup */
	do_parallel_processing(&vacrel, lvshared);

	/* Report buffer/WAL usage during parallel execution */
	buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
	wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
	InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
						  &wal_usage[ParallelWorkerNumber]);

	/* Pop the error context stack */
	error_context_stack = errcallback.previous;

	vac_close_indexes(nindexes, indrels, RowExclusiveLock);
	table_close(rel, ShareUpdateExclusiveLock);
	pfree(vacrel.indstats);
}
/*
 * Error context callback for errors occurring during vacuum.
 */
static void
vacuum_error_callback(void *arg)
{
	LVRelState *errinfo = arg;

	switch (errinfo->phase)
	{
		case VACUUM_ERRCB_PHASE_SCAN_HEAP:
			if (BlockNumberIsValid(errinfo->blkno))
			{
				if (OffsetNumberIsValid(errinfo->offnum))
					errcontext("while scanning block %u and offset %u of relation \"%s.%s\"",
							   errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
				else
					errcontext("while scanning block %u of relation \"%s.%s\"",
							   errinfo->blkno, errinfo->relnamespace, errinfo->relname);
			}
			else
				errcontext("while scanning relation \"%s.%s\"",
						   errinfo->relnamespace, errinfo->relname);
			break;

		case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
			if (BlockNumberIsValid(errinfo->blkno))
			{
				if (OffsetNumberIsValid(errinfo->offnum))
					errcontext("while vacuuming block %u and offset %u of relation \"%s.%s\"",
							   errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
				else
					errcontext("while vacuuming block %u of relation \"%s.%s\"",
							   errinfo->blkno, errinfo->relnamespace, errinfo->relname);
			}
			else
				errcontext("while vacuuming relation \"%s.%s\"",
						   errinfo->relnamespace, errinfo->relname);
			break;

		case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
			errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
					   errinfo->indname, errinfo->relnamespace, errinfo->relname);
			break;

		case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
			errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
					   errinfo->indname, errinfo->relnamespace, errinfo->relname);
			break;

		case VACUUM_ERRCB_PHASE_TRUNCATE:
			if (BlockNumberIsValid(errinfo->blkno))
				errcontext("while truncating relation \"%s.%s\" to %u blocks",
						   errinfo->relnamespace, errinfo->relname, errinfo->blkno);
			break;

		case VACUUM_ERRCB_PHASE_UNKNOWN:
		default:
			return;				/* do nothing; the errinfo may not be
								 * initialized */
	}
}
3829 * the current information which can be later restored via restore_vacuum_error_info.
3832 update_vacuum_error_info(LVRelState
*vacrel
, LVSavedErrInfo
*saved_vacrel
,
3833 int phase
, BlockNumber blkno
, OffsetNumber offnum
)
3837 saved_vacrel
->offnum
= vacrel
->offnum
;
3838 saved_vacrel
->blkno
= vacrel
->blkno
;
3839 saved_vacrel
->phase
= vacrel
->phase
;
3842 vacrel
->blkno
= blkno
;
3843 vacrel
->offnum
= offnum
;
3844 vacrel
->phase
= phase
;
/*
 * Restores the vacuum information saved via a prior call to update_vacuum_error_info.
 */
static void
restore_vacuum_error_info(LVRelState *vacrel,
						  const LVSavedErrInfo *saved_vacrel)
{
	vacrel->blkno = saved_vacrel->blkno;
	vacrel->offnum = saved_vacrel->offnum;
	vacrel->phase = saved_vacrel->phase;
}