Simplify state managed by VACUUM.
1 /*-------------------------------------------------------------------------
3 * vacuumlazy.c
4 * Concurrent ("lazy") vacuuming.
7 * The major space usage for LAZY VACUUM is storage for the array of dead tuple
8 * TIDs. We want to ensure we can vacuum even the very largest relations with
9 * finite memory space usage. To do that, we set upper bounds on the number of
10 * tuples we will keep track of at once.
12 * We are willing to use at most maintenance_work_mem (or perhaps
13 * autovacuum_work_mem) memory space to keep track of dead tuples. We
14 * initially allocate an array of TIDs of that size, with an upper limit that
15 * depends on table size (this limit ensures we don't allocate a huge area
16 * uselessly for vacuuming small tables). If the array threatens to overflow,
17 * we suspend the heap scan phase and perform a pass of index cleanup and page
18 * compaction, then resume the heap scan with an empty TID array.
20 * If we're processing a table with no indexes, we can just vacuum each page
21 * as we go; there's no need to save up multiple tuples to minimize the number
22 * of index scans performed. So we don't use maintenance_work_mem memory for
23 * the TID array, just enough to hold as many heap tuples as fit on one page.
25 * Lazy vacuum supports parallel execution with parallel worker processes. In
26 * a parallel vacuum, we perform both index vacuum and index cleanup with
27 * parallel worker processes. Individual indexes are processed by one vacuum
28 * process. At the beginning of a lazy vacuum (at lazy_scan_heap) we prepare
29 * the parallel context and initialize the DSM segment that contains shared
30 * information as well as the memory space for storing dead tuples. When
31 * starting either index vacuum or index cleanup, we launch parallel worker
32 * processes. Once all indexes are processed the parallel worker processes
33 * exit. After that, the leader process re-initializes the parallel context
34 * so that it can use the same DSM for multiple passes of index vacuum and
35 * for performing index cleanup. Updating the index statistics requires
36 * updating the system catalog, and since such updates are not allowed
37 * during parallel mode, we update the index statistics only after
38 * exiting parallel mode.
40 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
41 * Portions Copyright (c) 1994, Regents of the University of California
44 * IDENTIFICATION
45 * src/backend/access/heap/vacuumlazy.c
47 *-------------------------------------------------------------------------
49 #include "postgres.h"
51 #include <math.h>
53 #include "access/amapi.h"
54 #include "access/genam.h"
55 #include "access/heapam.h"
56 #include "access/heapam_xlog.h"
57 #include "access/htup_details.h"
58 #include "access/multixact.h"
59 #include "access/parallel.h"
60 #include "access/transam.h"
61 #include "access/visibilitymap.h"
62 #include "access/xact.h"
63 #include "access/xlog.h"
64 #include "catalog/index.h"
65 #include "catalog/storage.h"
66 #include "commands/dbcommands.h"
67 #include "commands/progress.h"
68 #include "commands/vacuum.h"
69 #include "executor/instrument.h"
70 #include "miscadmin.h"
71 #include "optimizer/paths.h"
72 #include "pgstat.h"
73 #include "portability/instr_time.h"
74 #include "postmaster/autovacuum.h"
75 #include "storage/bufmgr.h"
76 #include "storage/freespace.h"
77 #include "storage/lmgr.h"
78 #include "tcop/tcopprot.h"
79 #include "utils/lsyscache.h"
80 #include "utils/memutils.h"
81 #include "utils/pg_rusage.h"
82 #include "utils/timestamp.h"
86 * Space/time tradeoff parameters: do these need to be user-tunable?
88 * To consider truncating the relation, we want there to be at least
89 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
90 * is less) potentially-freeable pages.
92 #define REL_TRUNCATE_MINIMUM 1000
93 #define REL_TRUNCATE_FRACTION 16
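/*
 * Illustrative sketch, not part of the original file: the threshold described
 * above amounts to the following test, where possibly_freeable is the number
 * of pages past the last page known to contain tuples.  The real decision is
 * made in should_attempt_truncation(), which also considers the VACUUM
 * command's TRUNCATE option.
 */
static inline bool
truncate_threshold_met_sketch(BlockNumber rel_pages, BlockNumber nonempty_pages)
{
	BlockNumber possibly_freeable = rel_pages - nonempty_pages;

	return (possibly_freeable > 0 &&
			(possibly_freeable >= REL_TRUNCATE_MINIMUM ||
			 possibly_freeable >= rel_pages / REL_TRUNCATE_FRACTION));
}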
96 * Timing parameters for truncate locking heuristics.
98 * These were not exposed as user tunable GUC values because it didn't seem
99 * that the potential for improvement was great enough to merit the cost of
100 * supporting them.
102 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
103 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
104 #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
107 * When a table has no indexes, vacuum the FSM after every 8GB, approximately
108 * (it won't be exact because we only vacuum FSM after processing a heap page
109 * that has some removable tuples). When there are indexes, this is ignored,
110 * and we vacuum FSM after each index/heap cleaning pass.
112 #define VACUUM_FSM_EVERY_PAGES \
113 ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ))
116 * Guesstimation of number of dead tuples per page. This is used to
117 * provide an upper limit to memory allocated when vacuuming small
118 * tables.
120 #define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage
123 * Before we consider skipping a page that's marked as clean in
124 * visibility map, we must've seen at least this many clean pages.
126 #define SKIP_PAGES_THRESHOLD ((BlockNumber) 32)
129 * Size of the prefetch window for lazy vacuum backwards truncation scan.
130 * Needs to be a power of 2.
132 #define PREFETCH_SIZE ((BlockNumber) 32)
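/*
 * Illustrative sketch, not part of the original file: PREFETCH_SIZE must be a
 * power of 2 so that the start of a prefetch window can be found by masking
 * off the low-order bits of a block number, with no division needed:
 */
static inline BlockNumber
prefetch_window_start_sketch(BlockNumber blkno)
{
	return blkno & ~(PREFETCH_SIZE - 1);
}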
135 * DSM keys for parallel vacuum. Unlike other parallel execution code, since
136 * we don't need to worry about DSM keys conflicting with plan_node_id we can
137 * use small integers.
139 #define PARALLEL_VACUUM_KEY_SHARED 1
140 #define PARALLEL_VACUUM_KEY_DEAD_TUPLES 2
141 #define PARALLEL_VACUUM_KEY_QUERY_TEXT 3
142 #define PARALLEL_VACUUM_KEY_BUFFER_USAGE 4
143 #define PARALLEL_VACUUM_KEY_WAL_USAGE 5
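/*
 * Illustrative sketch, not part of the original file: the keys above are used
 * with the parallel context's table of contents in the usual shm_toc style.
 * The real sizing and setup happens in begin_parallel_vacuum(); "est_shared"
 * here is just a placeholder for the estimated size.
 */
static inline void *
parallel_vacuum_toc_insert_sketch(ParallelContext *pcxt, Size est_shared)
{
	void	   *shared = shm_toc_allocate(pcxt->toc, est_shared);

	shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
	return shared;				/* workers later shm_toc_lookup() this key */
}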
146 * Macro to check if we are in a parallel vacuum. If true, we are in the
147 * parallel mode and the DSM segment is initialized.
149 #define ParallelVacuumIsActive(vacrel) ((vacrel)->lps != NULL)
151 /* Phases of vacuum during which we report error context. */
152 typedef enum
154 VACUUM_ERRCB_PHASE_UNKNOWN,
155 VACUUM_ERRCB_PHASE_SCAN_HEAP,
156 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
157 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
158 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
159 VACUUM_ERRCB_PHASE_TRUNCATE
160 } VacErrPhase;
163 * LVDeadTuples stores the dead tuple TIDs collected during the heap scan.
164 * This is allocated in the DSM segment in parallel mode and in local memory
165 * in non-parallel mode.
167 typedef struct LVDeadTuples
169 int max_tuples; /* # slots allocated in array */
170 int num_tuples; /* current # of entries */
171 /* List of TIDs of tuples we intend to delete */
172 /* NB: this list is ordered by TID address */
173 ItemPointerData itemptrs[FLEXIBLE_ARRAY_MEMBER]; /* array of
174 * ItemPointerData */
175 } LVDeadTuples;
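/*
 * Illustrative sketch, not part of the original file: keeping itemptrs in TID
 * order lets the index bulk-delete callback test membership with a binary
 * search.  The real code is lazy_tid_reaped()/vac_cmp_itemptr(), declared
 * further down; this is just the underlying idea.
 */
static inline bool
dead_tuples_contain_sketch(LVDeadTuples *dead_tuples, ItemPointer itemptr)
{
	int			lo = 0;
	int			hi = dead_tuples->num_tuples - 1;

	while (lo <= hi)
	{
		int			mid = lo + (hi - lo) / 2;
		int32		cmp = ItemPointerCompare(&dead_tuples->itemptrs[mid],
											 itemptr);

		if (cmp == 0)
			return true;
		if (cmp < 0)
			lo = mid + 1;
		else
			hi = mid - 1;
	}

	return false;
}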
177 /* The dead tuple space consists of LVDeadTuples and dead tuple TIDs */
178 #define SizeOfDeadTuples(cnt) \
179 add_size(offsetof(LVDeadTuples, itemptrs), \
180 mul_size(sizeof(ItemPointerData), cnt))
181 #define MAXDEADTUPLES(max_size) \
182 (((max_size) - offsetof(LVDeadTuples, itemptrs)) / sizeof(ItemPointerData))
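/*
 * Illustrative sketch, not part of the original file: roughly how the macros
 * above bound the size of the dead-tuple array, following the memory-usage
 * rules laid out in the header comment.  The real logic is
 * compute_max_dead_tuples(), declared below; this simplified version ignores
 * autovacuum_work_mem and the overflow/clamping details.
 */
static inline long
max_dead_tuples_sketch(BlockNumber relblocks, bool hasindex)
{
	long		maxtuples;

	if (!hasindex)
		return MaxHeapTuplesPerPage;	/* one heap page's worth suffices */

	/* as many TIDs as fit in maintenance_work_mem ... */
	maxtuples = MAXDEADTUPLES(maintenance_work_mem * 1024L);
	/* ... but no more than the table could plausibly need ... */
	maxtuples = Min(maxtuples, (long) relblocks * LAZY_ALLOC_TUPLES);
	/* ... and never less than one heap page's worth */
	return Max(maxtuples, MaxHeapTuplesPerPage);
}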
185 * Shared information among parallel workers, so this is allocated in the DSM
186 * segment.
188 typedef struct LVShared
191 * Target table relid and log level. These fields are not modified during
192 * the lazy vacuum.
194 Oid relid;
195 int elevel;
198 * An indication for vacuum workers to perform either index vacuum or
199 * index cleanup. first_time is true only if for_cleanup is true and
200 * bulk-deletion has not been performed yet.
202 bool for_cleanup;
203 bool first_time;
206 * Fields for both index vacuum and cleanup.
208 * reltuples is the total number of input heap tuples. We set it to the old
209 * live tuples in the index vacuum case, or to the new live tuples in the
210 * index cleanup case.
212 * estimated_count is true if reltuples is an estimated value. (Note that
213 * reltuples could be -1 in this case, indicating we have no idea.)
215 double reltuples;
216 bool estimated_count;
219 * In single process lazy vacuum we could consume more memory during index
220 * vacuuming or cleanup apart from the memory for heap scanning. In
221 * parallel vacuum, since individual vacuum workers can consume memory
222 * equal to maintenance_work_mem, the new maintenance_work_mem for each
223 * worker is set such that the parallel operation doesn't consume more
224 * memory than single process lazy vacuum.
226 int maintenance_work_mem_worker;
229 * Shared vacuum cost balance. During parallel vacuum,
230 * VacuumSharedCostBalance points to this value and it accumulates the
231 * balance of each parallel vacuum worker.
233 pg_atomic_uint32 cost_balance;
236 * Number of active parallel workers. This is used for computing the
237 * minimum threshold of the vacuum cost balance before a worker sleeps for
238 * cost-based delay.
240 pg_atomic_uint32 active_nworkers;
243 * Variables to control parallel vacuum. We have a bitmap to indicate
244 * which index has stats in shared memory. The set bit in the map
245 * indicates that the particular index supports a parallel vacuum.
247 pg_atomic_uint32 idx; /* counter for vacuuming and clean up */
248 uint32 offset; /* sizeof header incl. bitmap */
249 bits8 bitmap[FLEXIBLE_ARRAY_MEMBER]; /* bit map of indexes with stats */
251 /* Shared index statistics data follows at end of struct */
252 } LVShared;
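/*
 * Illustrative sketch, not part of the original file: the per-worker budget
 * described above is essentially maintenance_work_mem split among the workers
 * that will actually consume it.  The real computation is done in
 * begin_parallel_vacuum() and only counts indexes whose AM uses
 * maintenance_work_mem.
 */
static inline int
maintenance_work_mem_per_worker_sketch(int nworkers, int nindexes_using_mwm)
{
	if (nindexes_using_mwm == 0)
		return maintenance_work_mem;	/* nothing to divide up */

	return maintenance_work_mem / Min(nworkers, nindexes_using_mwm);
}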
254 #define SizeOfLVShared (offsetof(LVShared, bitmap) + sizeof(bits8))
255 #define GetSharedIndStats(s) \
256 ((LVSharedIndStats *)((char *)(s) + ((LVShared *)(s))->offset))
257 #define IndStatsIsNull(s, i) \
258 (!(((LVShared *)(s))->bitmap[(i) >> 3] & (1 << ((i) & 0x07))))
261 * Struct for an index bulk-deletion statistic used for parallel vacuum. This
262 * is allocated in the DSM segment.
264 typedef struct LVSharedIndStats
266 bool updated; /* are the stats updated? */
267 IndexBulkDeleteResult istat;
268 } LVSharedIndStats;
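/*
 * Illustrative sketch, not part of the original file: how the offset and
 * bitmap in LVShared locate the stats slot for one index.  Indexes whose bit
 * is clear have no slot at all, so they are skipped while walking the array.
 * The real lookup is parallel_stats_for_idx(), declared below.
 */
static inline LVSharedIndStats *
shared_ind_stats_sketch(LVShared *lvshared, int getidx)
{
	char	   *p;

	if (IndStatsIsNull(lvshared, getidx))
		return NULL;			/* this index has no shared stats slot */

	p = (char *) GetSharedIndStats(lvshared);
	for (int i = 0; i < getidx; i++)
	{
		if (!IndStatsIsNull(lvshared, i))
			p += sizeof(LVSharedIndStats);
	}

	return (LVSharedIndStats *) p;
}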
270 /* Struct for maintaining a parallel vacuum state. */
271 typedef struct LVParallelState
273 ParallelContext *pcxt;
275 /* Shared information among parallel vacuum workers */
276 LVShared *lvshared;
278 /* Points to buffer usage area in DSM */
279 BufferUsage *buffer_usage;
281 /* Points to WAL usage area in DSM */
282 WalUsage *wal_usage;
285 * The number of indexes that support parallel index bulk-deletion, parallel
286 * index cleanup, and parallel index conditional cleanup, respectively.
288 int nindexes_parallel_bulkdel;
289 int nindexes_parallel_cleanup;
290 int nindexes_parallel_condcleanup;
291 } LVParallelState;
293 typedef struct LVRelState
295 /* Target heap relation and its indexes */
296 Relation rel;
297 Relation *indrels;
298 int nindexes;
299 /* useindex = true means two-pass strategy; false means one-pass */
300 bool useindex;
302 /* Buffer access strategy and parallel state */
303 BufferAccessStrategy bstrategy;
304 LVParallelState *lps;
306 /* Statistics from pg_class when we start out */
307 BlockNumber old_rel_pages; /* previous value of pg_class.relpages */
308 double old_live_tuples; /* previous value of pg_class.reltuples */
309 /* rel's initial relfrozenxid and relminmxid */
310 TransactionId relfrozenxid;
311 MultiXactId relminmxid;
312 TransactionId latestRemovedXid;
314 /* VACUUM operation's cutoff for pruning */
315 TransactionId OldestXmin;
316 /* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */
317 TransactionId FreezeLimit;
318 MultiXactId MultiXactCutoff;
320 /* Error reporting state */
321 char *relnamespace;
322 char *relname;
323 char *indname;
324 BlockNumber blkno; /* used only for heap operations */
325 OffsetNumber offnum; /* used only for heap operations */
326 VacErrPhase phase;
329 * State managed by lazy_scan_heap() follows
331 LVDeadTuples *dead_tuples; /* items to vacuum from indexes */
332 BlockNumber rel_pages; /* total number of pages */
333 BlockNumber scanned_pages; /* number of pages we examined */
334 BlockNumber pinskipped_pages; /* # of pages skipped due to a pin */
335 BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */
336 BlockNumber tupcount_pages; /* pages whose tuples we counted */
337 BlockNumber pages_removed; /* pages removed by truncation */
338 BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
339 bool lock_waiter_detected;
341 /* Statistics output by us, for table */
342 double new_rel_tuples; /* new estimated total # of tuples */
343 double new_live_tuples; /* new estimated total # of live tuples */
344 /* Statistics output by index AMs */
345 IndexBulkDeleteResult **indstats;
347 /* Instrumentation counters */
348 int num_index_scans;
349 int64 tuples_deleted; /* # deleted from table */
350 int64 new_dead_tuples; /* new estimated total # of dead items in
351 * table */
352 int64 num_tuples; /* total number of nonremovable tuples */
353 int64 live_tuples; /* live tuples (reltuples estimate) */
354 } LVRelState;
356 /* Struct for saving and restoring vacuum error information. */
357 typedef struct LVSavedErrInfo
359 BlockNumber blkno;
360 OffsetNumber offnum;
361 VacErrPhase phase;
362 } LVSavedErrInfo;
364 /* elevel controls whole VACUUM's verbosity */
365 static int elevel = -1;
368 /* non-export function prototypes */
369 static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params,
370 bool aggressive);
371 static void lazy_vacuum_all_indexes(LVRelState *vacrel);
372 static void lazy_vacuum_heap_rel(LVRelState *vacrel);
373 static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno,
374 Buffer buffer, int tupindex, Buffer *vmbuffer);
375 static bool lazy_check_needs_freeze(Buffer buf, bool *hastup,
376 LVRelState *vacrel);
377 static void do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel);
378 static void do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel);
379 static void do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers);
380 static void do_parallel_processing(LVRelState *vacrel,
381 LVShared *lvshared);
382 static void do_serial_processing_for_unsafe_indexes(LVRelState *vacrel,
383 LVShared *lvshared);
384 static IndexBulkDeleteResult *parallel_process_one_index(Relation indrel,
385 IndexBulkDeleteResult *istat,
386 LVShared *lvshared,
387 LVSharedIndStats *shared_indstats,
388 LVRelState *vacrel);
389 static void lazy_cleanup_all_indexes(LVRelState *vacrel);
390 static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel,
391 IndexBulkDeleteResult *istat,
392 double reltuples,
393 LVRelState *vacrel);
394 static IndexBulkDeleteResult *lazy_cleanup_one_index(Relation indrel,
395 IndexBulkDeleteResult *istat,
396 double reltuples,
397 bool estimated_count,
398 LVRelState *vacrel);
399 static bool should_attempt_truncation(LVRelState *vacrel,
400 VacuumParams *params);
401 static void lazy_truncate_heap(LVRelState *vacrel);
402 static BlockNumber count_nondeletable_pages(LVRelState *vacrel);
403 static long compute_max_dead_tuples(BlockNumber relblocks, bool hasindex);
404 static void lazy_space_alloc(LVRelState *vacrel, int nworkers,
405 BlockNumber relblocks);
406 static void lazy_space_free(LVRelState *vacrel);
407 static void lazy_record_dead_tuple(LVDeadTuples *dead_tuples,
408 ItemPointer itemptr);
409 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
410 static int vac_cmp_itemptr(const void *left, const void *right);
411 static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
412 TransactionId *visibility_cutoff_xid, bool *all_frozen);
413 static int compute_parallel_vacuum_workers(LVRelState *vacrel,
414 int nrequested,
415 bool *can_parallel_vacuum);
416 static void update_index_statistics(LVRelState *vacrel);
417 static LVParallelState *begin_parallel_vacuum(LVRelState *vacrel,
418 BlockNumber nblocks,
419 int nrequested);
420 static void end_parallel_vacuum(LVRelState *vacrel);
421 static LVSharedIndStats *parallel_stats_for_idx(LVShared *lvshared, int getidx);
422 static bool parallel_processing_is_safe(Relation indrel, LVShared *lvshared);
423 static void vacuum_error_callback(void *arg);
424 static void update_vacuum_error_info(LVRelState *vacrel,
425 LVSavedErrInfo *saved_vacrel,
426 int phase, BlockNumber blkno,
427 OffsetNumber offnum);
428 static void restore_vacuum_error_info(LVRelState *vacrel,
429 const LVSavedErrInfo *saved_vacrel);
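/*
 * Illustrative sketch, not part of the original file: the typical way
 * LVSavedErrInfo is paired with update_vacuum_error_info() and
 * restore_vacuum_error_info() around a nested phase, as the index vacuum and
 * cleanup routines below do.  "work" stands in for the nested phase's actual
 * processing.
 */
static inline void
run_phase_with_errcontext_sketch(LVRelState *vacrel, VacErrPhase phase,
								 void (*work) (LVRelState *vacrel))
{
	LVSavedErrInfo saved_err_info;

	update_vacuum_error_info(vacrel, &saved_err_info, phase,
							 InvalidBlockNumber, InvalidOffsetNumber);
	work(vacrel);
	restore_vacuum_error_info(vacrel, &saved_err_info);
}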
433 * heap_vacuum_rel() -- perform VACUUM for one heap relation
435 * This routine vacuums a single heap, cleans out its indexes, and
436 * updates its relpages and reltuples statistics.
438 * At entry, we have already established a transaction and opened
439 * and locked the relation.
441 void
442 heap_vacuum_rel(Relation rel, VacuumParams *params,
443 BufferAccessStrategy bstrategy)
445 LVRelState *vacrel;
446 PGRUsage ru0;
447 TimestampTz starttime = 0;
448 WalUsage walusage_start = pgWalUsage;
449 WalUsage walusage = {0, 0, 0};
450 long secs;
451 int usecs;
452 double read_rate,
453 write_rate;
454 bool aggressive; /* should we scan all unfrozen pages? */
455 bool scanned_all_unfrozen; /* actually scanned all such pages? */
456 char **indnames = NULL;
457 TransactionId xidFullScanLimit;
458 MultiXactId mxactFullScanLimit;
459 BlockNumber new_rel_pages;
460 BlockNumber new_rel_allvisible;
461 double new_live_tuples;
462 TransactionId new_frozen_xid;
463 MultiXactId new_min_multi;
464 ErrorContextCallback errcallback;
465 PgStat_Counter startreadtime = 0;
466 PgStat_Counter startwritetime = 0;
467 TransactionId OldestXmin;
468 TransactionId FreezeLimit;
469 MultiXactId MultiXactCutoff;
471 Assert(params != NULL);
472 Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT);
473 Assert(params->truncate != VACOPT_TERNARY_DEFAULT);
475 /* measure elapsed time iff autovacuum logging requires it */
476 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
478 pg_rusage_init(&ru0);
479 starttime = GetCurrentTimestamp();
480 if (track_io_timing)
482 startreadtime = pgStatBlockReadTime;
483 startwritetime = pgStatBlockWriteTime;
487 if (params->options & VACOPT_VERBOSE)
488 elevel = INFO;
489 else
490 elevel = DEBUG2;
492 pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
493 RelationGetRelid(rel));
495 vacuum_set_xid_limits(rel,
496 params->freeze_min_age,
497 params->freeze_table_age,
498 params->multixact_freeze_min_age,
499 params->multixact_freeze_table_age,
500 &OldestXmin, &FreezeLimit, &xidFullScanLimit,
501 &MultiXactCutoff, &mxactFullScanLimit);
504 * We request an aggressive scan if the table's frozen Xid is now older
505 * than or equal to the requested Xid full-table scan limit; or if the
506 * table's minimum MultiXactId is older than or equal to the requested
507 * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified.
509 aggressive = TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid,
510 xidFullScanLimit);
511 aggressive |= MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid,
512 mxactFullScanLimit);
513 if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
514 aggressive = true;
516 vacrel = (LVRelState *) palloc0(sizeof(LVRelState));
518 /* Set up high level stuff about rel */
519 vacrel->rel = rel;
520 vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes,
521 &vacrel->indrels);
522 vacrel->useindex = (vacrel->nindexes > 0 &&
523 params->index_cleanup == VACOPT_TERNARY_ENABLED);
524 vacrel->bstrategy = bstrategy;
525 vacrel->old_rel_pages = rel->rd_rel->relpages;
526 vacrel->old_live_tuples = rel->rd_rel->reltuples;
527 vacrel->relfrozenxid = rel->rd_rel->relfrozenxid;
528 vacrel->relminmxid = rel->rd_rel->relminmxid;
529 vacrel->latestRemovedXid = InvalidTransactionId;
531 /* Set cutoffs for entire VACUUM */
532 vacrel->OldestXmin = OldestXmin;
533 vacrel->FreezeLimit = FreezeLimit;
534 vacrel->MultiXactCutoff = MultiXactCutoff;
536 vacrel->relnamespace = get_namespace_name(RelationGetNamespace(rel));
537 vacrel->relname = pstrdup(RelationGetRelationName(rel));
538 vacrel->indname = NULL;
539 vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN;
541 /* Save index names iff autovacuum logging requires it */
542 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0 &&
543 vacrel->nindexes > 0)
545 indnames = palloc(sizeof(char *) * vacrel->nindexes);
546 for (int i = 0; i < vacrel->nindexes; i++)
547 indnames[i] =
548 pstrdup(RelationGetRelationName(vacrel->indrels[i]));
552 * Setup error traceback support for ereport(). The idea is to set up an
553 * error context callback to display additional information on any error
554 * during a vacuum. During different phases of vacuum (heap scan, heap
555 * vacuum, index vacuum, index clean up, heap truncate), we update the
556 * error context callback to display appropriate information.
558 * Note that the index vacuum and heap vacuum phases may be called
559 * multiple times in the middle of the heap scan phase. So the old phase
560 * information is restored at the end of those phases.
562 errcallback.callback = vacuum_error_callback;
563 errcallback.arg = vacrel;
564 errcallback.previous = error_context_stack;
565 error_context_stack = &errcallback;
567 /* Do the vacuuming */
568 lazy_scan_heap(vacrel, params, aggressive);
570 /* Done with indexes */
571 vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock);
574 * Compute whether we actually scanned all of the unfrozen pages. If we did,
575 * we can adjust relfrozenxid and relminmxid.
577 * NB: We need to check this before truncating the relation, because that
578 * will change ->rel_pages.
580 if ((vacrel->scanned_pages + vacrel->frozenskipped_pages)
581 < vacrel->rel_pages)
583 Assert(!aggressive);
584 scanned_all_unfrozen = false;
586 else
587 scanned_all_unfrozen = true;
590 * Optionally truncate the relation.
592 if (should_attempt_truncation(vacrel, params))
595 * Update error traceback information. This is the last phase during
596 * which we add context information to errors, so we don't need to
597 * revert to the previous phase.
599 update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE,
600 vacrel->nonempty_pages,
601 InvalidOffsetNumber);
602 lazy_truncate_heap(vacrel);
605 /* Pop the error context stack */
606 error_context_stack = errcallback.previous;
608 /* Report that we are now doing final cleanup */
609 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
610 PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
613 * Update statistics in pg_class.
615 * In principle new_live_tuples could be -1 indicating that we (still)
616 * don't know the tuple count. In practice that probably can't happen,
617 * since we'd surely have scanned some pages if the table is new and
618 * nonempty.
620 * For safety, clamp relallvisible to be not more than what we're setting
621 * relpages to.
623 * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
624 * since then we don't know for certain that all tuples have a newer xmin.
626 new_rel_pages = vacrel->rel_pages;
627 new_live_tuples = vacrel->new_live_tuples;
629 visibilitymap_count(rel, &new_rel_allvisible, NULL);
630 if (new_rel_allvisible > new_rel_pages)
631 new_rel_allvisible = new_rel_pages;
633 new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
634 new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;
636 vac_update_relstats(rel,
637 new_rel_pages,
638 new_live_tuples,
639 new_rel_allvisible,
640 vacrel->nindexes > 0,
641 new_frozen_xid,
642 new_min_multi,
643 false);
645 /* report results to the stats collector, too */
646 pgstat_report_vacuum(RelationGetRelid(rel),
647 rel->rd_rel->relisshared,
648 Max(new_live_tuples, 0),
649 vacrel->new_dead_tuples);
650 pgstat_progress_end_command();
652 /* and log the action if appropriate */
653 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
655 TimestampTz endtime = GetCurrentTimestamp();
657 if (params->log_min_duration == 0 ||
658 TimestampDifferenceExceeds(starttime, endtime,
659 params->log_min_duration))
661 StringInfoData buf;
662 char *msgfmt;
664 TimestampDifference(starttime, endtime, &secs, &usecs);
666 memset(&walusage, 0, sizeof(WalUsage));
667 WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
669 read_rate = 0;
670 write_rate = 0;
671 if ((secs > 0) || (usecs > 0))
673 read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
674 (secs + usecs / 1000000.0);
675 write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
676 (secs + usecs / 1000000.0);
680 * This is pretty messy, but we split it up so that we can skip
681 * emitting individual parts of the message when not applicable.
683 initStringInfo(&buf);
684 if (params->is_wraparound)
687 * While it's possible for a VACUUM to be both is_wraparound
688 * and !aggressive, that's just a corner-case -- is_wraparound
689 * implies aggressive. Produce distinct output for the corner
690 * case all the same, just in case.
692 if (aggressive)
693 msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
694 else
695 msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n");
697 else
699 if (aggressive)
700 msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n");
701 else
702 msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n");
704 appendStringInfo(&buf, msgfmt,
705 get_database_name(MyDatabaseId),
706 vacrel->relnamespace,
707 vacrel->relname,
708 vacrel->num_index_scans);
709 appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
710 vacrel->pages_removed,
711 vacrel->rel_pages,
712 vacrel->pinskipped_pages,
713 vacrel->frozenskipped_pages);
714 appendStringInfo(&buf,
715 _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"),
716 (long long) vacrel->tuples_deleted,
717 (long long) vacrel->new_rel_tuples,
718 (long long) vacrel->new_dead_tuples,
719 OldestXmin);
720 appendStringInfo(&buf,
721 _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
722 (long long) VacuumPageHit,
723 (long long) VacuumPageMiss,
724 (long long) VacuumPageDirty);
725 for (int i = 0; i < vacrel->nindexes; i++)
727 IndexBulkDeleteResult *istat = vacrel->indstats[i];
729 if (!istat)
730 continue;
732 appendStringInfo(&buf,
733 _("index \"%s\": pages: %u in total, %u newly deleted, %u currently deleted, %u reusable\n"),
734 indnames[i],
735 istat->num_pages,
736 istat->pages_newly_deleted,
737 istat->pages_deleted,
738 istat->pages_free);
740 appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
741 read_rate, write_rate);
742 if (track_io_timing)
744 appendStringInfoString(&buf, _("I/O Timings:"));
745 if (pgStatBlockReadTime - startreadtime > 0)
746 appendStringInfo(&buf, _(" read=%.3f"),
747 (double) (pgStatBlockReadTime - startreadtime) / 1000);
748 if (pgStatBlockWriteTime - startwritetime > 0)
749 appendStringInfo(&buf, _(" write=%.3f"),
750 (double) (pgStatBlockWriteTime - startwritetime) / 1000);
751 appendStringInfoChar(&buf, '\n');
753 appendStringInfo(&buf, _("system usage: %s\n"), pg_rusage_show(&ru0));
754 appendStringInfo(&buf,
755 _("WAL usage: %ld records, %ld full page images, %llu bytes"),
756 walusage.wal_records,
757 walusage.wal_fpi,
758 (unsigned long long) walusage.wal_bytes);
760 ereport(LOG,
761 (errmsg_internal("%s", buf.data)));
762 pfree(buf.data);
766 /* Cleanup index statistics and index names */
767 for (int i = 0; i < vacrel->nindexes; i++)
769 if (vacrel->indstats[i])
770 pfree(vacrel->indstats[i]);
772 if (indnames && indnames[i])
773 pfree(indnames[i]);
778 * For Hot Standby we need to know the highest transaction id that will
779 * be removed by any change. VACUUM proceeds in a number of passes so
780 * we need to consider how each pass operates. The first phase runs
781 * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
782 * progresses - these will have a latestRemovedXid on each record.
783 * In some cases this removes all of the tuples to be removed, though
784 * often we have dead tuples with index pointers so we must remember them
785 * for removal in phase 3. Index records for those rows are removed
786 * in phase 2 and index blocks do not have MVCC information attached.
787 * So before we can allow removal of any index tuples we need to issue
788 * a WAL record containing the latestRemovedXid of rows that will be
789 * removed in phase three. This allows recovery queries to block at the
790 * correct place, i.e. before phase two, rather than during phase three
791 * which would be after the rows have become inaccessible.
793 static void
794 vacuum_log_cleanup_info(LVRelState *vacrel)
797 * Skip this for relations for which no WAL is to be written, or if we're
798 * not trying to support archive recovery.
800 if (!RelationNeedsWAL(vacrel->rel) || !XLogIsNeeded())
801 return;
804 * No need to write the record at all unless it contains a valid value
806 if (TransactionIdIsValid(vacrel->latestRemovedXid))
807 (void) log_heap_cleanup_info(vacrel->rel->rd_node,
808 vacrel->latestRemovedXid);
812 * lazy_scan_heap() -- scan an open heap relation
814 * This routine prunes each page in the heap, which will among other
815 * things truncate dead tuples to dead line pointers, defragment the
816 * page, and set commit status bits (see heap_page_prune). It also builds
817 * lists of dead tuples and pages with free space, calculates statistics
818 * on the number of live tuples in the heap, and marks pages as
819 * all-visible if appropriate. When done, or when we run low on space
820 * for dead-tuple TIDs, invoke vacuuming of indexes and reclaim dead line
821 * pointers.
823 * If the table has at least two indexes, we execute both index vacuum
824 * and index cleanup with parallel workers unless parallel vacuum is
825 * disabled. In a parallel vacuum, we enter parallel mode and then
826 * create both the parallel context and the DSM segment before starting
827 * heap scan so that we can record dead tuples to the DSM segment. All
828 * parallel workers are launched at beginning of index vacuuming and
829 * index cleanup and they exit once done with all indexes. At the end of
830 * this function we exit from parallel mode. Index bulk-deletion results
831 * are stored in the DSM segment and we update index statistics for all
832 * the indexes after exiting from parallel mode since writes are not
833 * allowed during parallel mode.
835 * If there are no indexes then we can reclaim line pointers on the fly;
836 * dead line pointers need only be retained until all index pointers that
837 * reference them have been killed.
839 static void
840 lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
842 LVDeadTuples *dead_tuples;
843 BlockNumber nblocks,
844 blkno;
845 HeapTupleData tuple;
846 BlockNumber empty_pages,
847 vacuumed_pages,
848 next_fsm_block_to_vacuum;
849 double num_tuples, /* total number of nonremovable tuples */
850 live_tuples, /* live tuples (reltuples estimate) */
851 tups_vacuumed, /* tuples cleaned up by current vacuum */
852 nkeep, /* dead-but-not-removable tuples */
853 nunused; /* # existing unused line pointers */
854 int i;
855 PGRUsage ru0;
856 Buffer vmbuffer = InvalidBuffer;
857 BlockNumber next_unskippable_block;
858 bool skipping_blocks;
859 xl_heap_freeze_tuple *frozen;
860 StringInfoData buf;
861 const int initprog_index[] = {
862 PROGRESS_VACUUM_PHASE,
863 PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
864 PROGRESS_VACUUM_MAX_DEAD_TUPLES
866 int64 initprog_val[3];
867 GlobalVisState *vistest;
869 pg_rusage_init(&ru0);
871 if (aggressive)
872 ereport(elevel,
873 (errmsg("aggressively vacuuming \"%s.%s\"",
874 vacrel->relnamespace,
875 vacrel->relname)));
876 else
877 ereport(elevel,
878 (errmsg("vacuuming \"%s.%s\"",
879 vacrel->relnamespace,
880 vacrel->relname)));
882 empty_pages = vacuumed_pages = 0;
883 next_fsm_block_to_vacuum = (BlockNumber) 0;
884 num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0;
886 nblocks = RelationGetNumberOfBlocks(vacrel->rel);
887 vacrel->rel_pages = nblocks;
888 vacrel->scanned_pages = 0;
889 vacrel->pinskipped_pages = 0;
890 vacrel->frozenskipped_pages = 0;
891 vacrel->tupcount_pages = 0;
892 vacrel->pages_removed = 0;
893 vacrel->nonempty_pages = 0;
894 vacrel->lock_waiter_detected = false;
896 /* Initialize instrumentation counters */
897 vacrel->num_index_scans = 0;
898 vacrel->tuples_deleted = 0;
899 vacrel->new_dead_tuples = 0;
900 vacrel->num_tuples = 0;
901 vacrel->live_tuples = 0;
903 vistest = GlobalVisTestFor(vacrel->rel);
905 vacrel->indstats = (IndexBulkDeleteResult **)
906 palloc0(vacrel->nindexes * sizeof(IndexBulkDeleteResult *));
909 * Allocate the space for dead tuples. Note that this handles parallel
910 * VACUUM initialization as part of allocating shared memory space used
911 * for dead_tuples.
913 lazy_space_alloc(vacrel, params->nworkers, nblocks);
914 dead_tuples = vacrel->dead_tuples;
915 frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
917 /* Report that we're scanning the heap, advertising total # of blocks */
918 initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
919 initprog_val[1] = nblocks;
920 initprog_val[2] = dead_tuples->max_tuples;
921 pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
924 * Except when aggressive is set, we want to skip pages that are
925 * all-visible according to the visibility map, but only when we can skip
926 * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading
927 * sequentially, the OS should be doing readahead for us, so there's no
928 * gain in skipping a page now and then; that's likely to disable
929 * readahead and so be counterproductive. Also, skipping even a single
930 * page means that we can't update relfrozenxid, so we only want to do it
931 * if we can skip a goodly number of pages.
933 * When aggressive is set, we can't skip pages just because they are
934 * all-visible, but we can still skip pages that are all-frozen, since
935 * such pages do not need freezing and do not affect the value that we can
936 * safely set for relfrozenxid or relminmxid.
938 * Before entering the main loop, establish the invariant that
939 * next_unskippable_block is the next block number >= blkno that we can't
940 * skip based on the visibility map, either all-visible for a regular scan
941 * or all-frozen for an aggressive scan. We set it to nblocks if there's
942 * no such block. We also set up the skipping_blocks flag correctly at
943 * this stage.
945 * Note: The value returned by visibilitymap_get_status could be slightly
946 * out-of-date, since we make this test before reading the corresponding
947 * heap page or locking the buffer. This is OK. If we mistakenly think
948 * that the page is all-visible or all-frozen when in fact the flag's just
949 * been cleared, we might fail to vacuum the page. It's easy to see that
950 * skipping a page when aggressive is not set is not a very big deal; we
951 * might leave some dead tuples lying around, but the next vacuum will
952 * find them. But even when aggressive *is* set, it's still OK if we miss
953 * a page whose all-frozen marking has just been cleared. Any new XIDs
954 * just added to that page are necessarily newer than the GlobalXmin we
955 * computed, so they'll have no effect on the value to which we can safely
956 * set relfrozenxid. A similar argument applies for MXIDs and relminmxid.
958 * We will scan the table's last page, at least to the extent of
959 * determining whether it has tuples or not, even if it should be skipped
960 * according to the above rules; except when we've already determined that
961 * it's not worth trying to truncate the table. This avoids having
962 * lazy_truncate_heap() take access-exclusive lock on the table to attempt
963 * a truncation that just fails immediately because there are tuples in
964 * the last page. This is worth avoiding mainly because such a lock must
965 * be replayed on any hot standby, where it can be disruptive.
967 next_unskippable_block = 0;
968 if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
970 while (next_unskippable_block < nblocks)
972 uint8 vmstatus;
974 vmstatus = visibilitymap_get_status(vacrel->rel,
975 next_unskippable_block,
976 &vmbuffer);
977 if (aggressive)
979 if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
980 break;
982 else
984 if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
985 break;
987 vacuum_delay_point();
988 next_unskippable_block++;
992 if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
993 skipping_blocks = true;
994 else
995 skipping_blocks = false;
997 for (blkno = 0; blkno < nblocks; blkno++)
999 Buffer buf;
1000 Page page;
1001 OffsetNumber offnum,
1002 maxoff;
1003 bool tupgone,
1004 hastup;
1005 int prev_dead_count;
1006 int nfrozen;
1007 Size freespace;
1008 bool all_visible_according_to_vm = false;
1009 bool all_visible;
1010 bool all_frozen = true; /* provided all_visible is also true */
1011 bool has_dead_items; /* includes existing LP_DEAD items */
1012 TransactionId visibility_cutoff_xid = InvalidTransactionId;
1014 /* see note above about forcing scanning of last page */
1015 #define FORCE_CHECK_PAGE() \
1016 (blkno == nblocks - 1 && should_attempt_truncation(vacrel, params))
1018 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1020 update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP,
1021 blkno, InvalidOffsetNumber);
1023 if (blkno == next_unskippable_block)
1025 /* Time to advance next_unskippable_block */
1026 next_unskippable_block++;
1027 if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0)
1029 while (next_unskippable_block < nblocks)
1031 uint8 vmskipflags;
1033 vmskipflags = visibilitymap_get_status(vacrel->rel,
1034 next_unskippable_block,
1035 &vmbuffer);
1036 if (aggressive)
1038 if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
1039 break;
1041 else
1043 if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
1044 break;
1046 vacuum_delay_point();
1047 next_unskippable_block++;
1052 * We know we can't skip the current block. But set up
1053 * skipping_blocks to do the right thing at the following blocks.
1055 if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
1056 skipping_blocks = true;
1057 else
1058 skipping_blocks = false;
1061 * Normally, the fact that we can't skip this block must mean that
1062 * it's not all-visible. But in an aggressive vacuum we know only
1063 * that it's not all-frozen, so it might still be all-visible.
1065 if (aggressive && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1066 all_visible_according_to_vm = true;
1068 else
1071 * The current block is potentially skippable; if we've seen a
1072 * long enough run of skippable blocks to justify skipping it, and
1073 * we're not forced to check it, then go ahead and skip.
1074 * Otherwise, the page must be at least all-visible if not
1075 * all-frozen, so we can set all_visible_according_to_vm = true.
1077 if (skipping_blocks && !FORCE_CHECK_PAGE())
1080 * Tricky, tricky. If this is in aggressive vacuum, the page
1081 * must have been all-frozen at the time we checked whether it
1082 * was skippable, but it might not be any more. We must be
1083 * careful to count it as a skipped all-frozen page in that
1084 * case, or else we'll think we can't update relfrozenxid and
1085 * relminmxid. If it's not an aggressive vacuum, we don't
1086 * know whether it was all-frozen, so we have to recheck; but
1087 * in this case an approximate answer is OK.
1089 if (aggressive || VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1090 vacrel->frozenskipped_pages++;
1091 continue;
1093 all_visible_according_to_vm = true;
1096 vacuum_delay_point();
1099 * If we are close to overrunning the available space for dead-tuple
1100 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
1102 if ((dead_tuples->max_tuples - dead_tuples->num_tuples) < MaxHeapTuplesPerPage &&
1103 dead_tuples->num_tuples > 0)
1106 * Before beginning index vacuuming, we release any pin we may
1107 * hold on the visibility map page. This isn't necessary for
1108 * correctness, but we do it anyway to avoid holding the pin
1109 * across a lengthy, unrelated operation.
1111 if (BufferIsValid(vmbuffer))
1113 ReleaseBuffer(vmbuffer);
1114 vmbuffer = InvalidBuffer;
1117 /* Work on all the indexes, then the heap */
1118 lazy_vacuum_all_indexes(vacrel);
1120 /* Remove tuples from heap */
1121 lazy_vacuum_heap_rel(vacrel);
1124 * Forget the now-vacuumed tuples, and press on, but be careful
1125 * not to reset latestRemovedXid since we want that value to be
1126 * valid.
1128 dead_tuples->num_tuples = 0;
1131 * Vacuum the Free Space Map to make newly-freed space visible on
1132 * upper-level FSM pages. Note we have not yet processed blkno.
1134 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1135 blkno);
1136 next_fsm_block_to_vacuum = blkno;
1138 /* Report that we are once again scanning the heap */
1139 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1140 PROGRESS_VACUUM_PHASE_SCAN_HEAP);
1144 * Pin the visibility map page in case we need to mark the page
1145 * all-visible. In most cases this will be very cheap, because we'll
1146 * already have the correct page pinned anyway. However, it's
1147 * possible that (a) next_unskippable_block is covered by a different
1148 * VM page than the current block or (b) we released our pin and did a
1149 * cycle of index vacuuming.
1151 visibilitymap_pin(vacrel->rel, blkno, &vmbuffer);
1153 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno,
1154 RBM_NORMAL, vacrel->bstrategy);
1156 /* We need buffer cleanup lock so that we can prune HOT chains. */
1157 if (!ConditionalLockBufferForCleanup(buf))
1160 * If we're not performing an aggressive scan to guard against XID
1161 * wraparound, and we don't want to forcibly check the page, then
1162 * it's OK to skip vacuuming pages we get a lock conflict on. They
1163 * will be dealt with in some future vacuum.
1165 if (!aggressive && !FORCE_CHECK_PAGE())
1167 ReleaseBuffer(buf);
1168 vacrel->pinskipped_pages++;
1169 continue;
1173 * Read the page with share lock to see if any xids on it need to
1174 * be frozen. If not we just skip the page, after updating our
1175 * scan statistics. If there are some, we wait for cleanup lock.
1177 * We could defer the lock request further by remembering the page
1178 * and coming back to it later, or we could even register
1179 * ourselves for multiple buffers and then service whichever one
1180 * is received first. For now, this seems good enough.
1182 * If we get here with aggressive false, then we're just forcibly
1183 * checking the page, and so we don't want to insist on getting
1184 * the lock; we only need to know if the page contains tuples, so
1185 * that we can update nonempty_pages correctly. It's convenient
1186 * to use lazy_check_needs_freeze() for both situations, though.
1188 LockBuffer(buf, BUFFER_LOCK_SHARE);
1189 if (!lazy_check_needs_freeze(buf, &hastup, vacrel))
1191 UnlockReleaseBuffer(buf);
1192 vacrel->scanned_pages++;
1193 vacrel->pinskipped_pages++;
1194 if (hastup)
1195 vacrel->nonempty_pages = blkno + 1;
1196 continue;
1198 if (!aggressive)
1201 * Here, we must not advance scanned_pages; that would amount
1202 * to claiming that the page contains no freezable tuples.
1204 UnlockReleaseBuffer(buf);
1205 vacrel->pinskipped_pages++;
1206 if (hastup)
1207 vacrel->nonempty_pages = blkno + 1;
1208 continue;
1210 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1211 LockBufferForCleanup(buf);
1212 /* drop through to normal processing */
1215 vacrel->scanned_pages++;
1216 vacrel->tupcount_pages++;
1218 page = BufferGetPage(buf);
1220 if (PageIsNew(page))
1223 * All-zeroes pages can be left over if either a backend extends
1224 * the relation by a single page, but crashes before the newly
1225 * initialized page has been written out, or when bulk-extending
1226 * the relation (which creates a number of empty pages at the tail
1227 * end of the relation, but enters them into the FSM).
1229 * Note we do not enter the page into the visibilitymap. That has
1230 * the downside that we repeatedly visit this page in subsequent
1231 * vacuums, but otherwise we'll never discover the space on a
1232 * promoted standby. The harm of repeated checking ought to
1233 * normally not be too bad - the space usually should be used at
1234 * some point, otherwise there wouldn't be any regular vacuums.
1236 * Make sure these pages are in the FSM, to ensure they can be
1237 * reused. Do that by testing if there's any space recorded for
1238 * the page. If not, enter it. We do so after releasing the lock
1239 * on the heap page; the FSM is approximate, after all.
1241 UnlockReleaseBuffer(buf);
1243 empty_pages++;
1245 if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0)
1247 Size freespace;
1249 freespace = BufferGetPageSize(buf) - SizeOfPageHeaderData;
1250 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1252 continue;
1255 if (PageIsEmpty(page))
1257 empty_pages++;
1258 freespace = PageGetHeapFreeSpace(page);
1261 * Empty pages are always all-visible and all-frozen (note that
1262 * the same is currently not true for new pages, see above).
1264 if (!PageIsAllVisible(page))
1266 START_CRIT_SECTION();
1268 /* mark buffer dirty before writing a WAL record */
1269 MarkBufferDirty(buf);
1272 * It's possible that another backend has extended the heap,
1273 * initialized the page, and then failed to WAL-log the page
1274 * due to an ERROR. Since heap extension is not WAL-logged,
1275 * recovery might try to replay our record setting the page
1276 * all-visible and find that the page isn't initialized, which
1277 * will cause a PANIC. To prevent that, check whether the
1278 * page has been previously WAL-logged, and if not, do that
1279 * now.
1281 if (RelationNeedsWAL(vacrel->rel) &&
1282 PageGetLSN(page) == InvalidXLogRecPtr)
1283 log_newpage_buffer(buf, true);
1285 PageSetAllVisible(page);
1286 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1287 vmbuffer, InvalidTransactionId,
1288 VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
1289 END_CRIT_SECTION();
1292 UnlockReleaseBuffer(buf);
1293 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1294 continue;
1298 * Prune all HOT-update chains in this page.
1300 * We count tuples removed by the pruning step as removed by VACUUM
1301 * (existing LP_DEAD line pointers don't count).
1303 tups_vacuumed += heap_page_prune(vacrel->rel, buf, vistest,
1304 InvalidTransactionId, 0, false,
1305 &vacrel->latestRemovedXid,
1306 &vacrel->offnum);
1309 * Now scan the page to collect vacuumable items and check for tuples
1310 * requiring freezing.
1312 all_visible = true;
1313 has_dead_items = false;
1314 nfrozen = 0;
1315 hastup = false;
1316 prev_dead_count = dead_tuples->num_tuples;
1317 maxoff = PageGetMaxOffsetNumber(page);
1320 * Note: If you change anything in the loop below, also look at
1321 * heap_page_is_all_visible to see if that needs to be changed.
1323 for (offnum = FirstOffsetNumber;
1324 offnum <= maxoff;
1325 offnum = OffsetNumberNext(offnum))
1327 ItemId itemid;
1330 * Set the offset number so that we can display it along with any
1331 * error that occurred while processing this tuple.
1333 vacrel->offnum = offnum;
1334 itemid = PageGetItemId(page, offnum);
1336 /* Unused items require no processing, but we count 'em */
1337 if (!ItemIdIsUsed(itemid))
1339 nunused += 1;
1340 continue;
1343 /* Redirect items mustn't be touched */
1344 if (ItemIdIsRedirected(itemid))
1346 hastup = true; /* this page won't be truncatable */
1347 continue;
1350 ItemPointerSet(&(tuple.t_self), blkno, offnum);
1353 * LP_DEAD line pointers are to be vacuumed normally; but we don't
1354 * count them in tups_vacuumed, else we'd be double-counting (at
1355 * least in the common case where heap_page_prune() just freed up
1356 * a non-HOT tuple). Note also that the final tups_vacuumed value
1357 * might be very low for tables where opportunistic page pruning
1358 * happens to occur very frequently (via heap_page_prune_opt()
1359 * calls that free up non-HOT tuples).
1361 if (ItemIdIsDead(itemid))
1363 lazy_record_dead_tuple(dead_tuples, &(tuple.t_self));
1364 all_visible = false;
1365 has_dead_items = true;
1366 continue;
1369 Assert(ItemIdIsNormal(itemid));
1371 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1372 tuple.t_len = ItemIdGetLength(itemid);
1373 tuple.t_tableOid = RelationGetRelid(vacrel->rel);
1375 tupgone = false;
1378 * The criteria for counting a tuple as live in this block need to
1379 * match what analyze.c's acquire_sample_rows() does, otherwise
1380 * VACUUM and ANALYZE may produce wildly different reltuples
1381 * values, e.g. when there are many recently-dead tuples.
1383 * The logic here is a bit simpler than acquire_sample_rows(), as
1384 * VACUUM can't run inside a transaction block, which makes some
1385 * cases impossible (e.g. in-progress insert from the same
1386 * transaction).
1388 switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
1390 case HEAPTUPLE_DEAD:
1393 * Ordinarily, DEAD tuples would have been removed by
1394 * heap_page_prune(), but it's possible that the tuple
1395 * state changed since heap_page_prune() looked. In
1396 * particular an INSERT_IN_PROGRESS tuple could have
1397 * changed to DEAD if the inserter aborted. So this
1398 * cannot be considered an error condition.
1400 * If the tuple is HOT-updated then it must only be
1401 * removed by a prune operation; so we keep it just as if
1402 * it were RECENTLY_DEAD. Also, if it's a heap-only
1403 * tuple, we choose to keep it, because it'll be a lot
1404 * cheaper to get rid of it in the next pruning pass than
1405 * to treat it like an indexed tuple. Finally, if index
1406 * cleanup is disabled, the second heap pass will not
1407 * execute, and the tuple will not get removed, so we must
1408 * treat it like any other dead tuple that we choose to
1409 * keep.
1411 * If this were to happen for a tuple that actually needed
1412 * to be deleted, we'd be in trouble, because it'd
1413 * possibly leave a tuple below the relation's xmin
1414 * horizon alive. heap_prepare_freeze_tuple() is prepared
1415 * to detect that case and abort the transaction,
1416 * preventing corruption.
1418 if (HeapTupleIsHotUpdated(&tuple) ||
1419 HeapTupleIsHeapOnly(&tuple) ||
1420 params->index_cleanup == VACOPT_TERNARY_DISABLED)
1421 nkeep += 1;
1422 else
1423 tupgone = true; /* we can delete the tuple */
1424 all_visible = false;
1425 break;
1426 case HEAPTUPLE_LIVE:
1429 * Count it as live. Not only is this natural, but it's
1430 * also what acquire_sample_rows() does.
1432 live_tuples += 1;
1435 * Is the tuple definitely visible to all transactions?
1437 * NB: Like with per-tuple hint bits, we can't set the
1438 * PD_ALL_VISIBLE flag if the inserter committed
1439 * asynchronously. See SetHintBits for more info. Check
1440 * that the tuple is hinted xmin-committed because of
1441 * that.
1443 if (all_visible)
1445 TransactionId xmin;
1447 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1449 all_visible = false;
1450 break;
1454 * The inserter definitely committed. But is it old
1455 * enough that everyone sees it as committed?
1457 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1458 if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
1460 all_visible = false;
1461 break;
1464 /* Track newest xmin on page. */
1465 if (TransactionIdFollows(xmin, visibility_cutoff_xid))
1466 visibility_cutoff_xid = xmin;
1468 break;
1469 case HEAPTUPLE_RECENTLY_DEAD:
1472 * If tuple is recently deleted then we must not remove it
1473 * from relation.
1475 nkeep += 1;
1476 all_visible = false;
1477 break;
1478 case HEAPTUPLE_INSERT_IN_PROGRESS:
1481 * This is an expected case during concurrent vacuum.
1483 * We do not count these rows as live, because we expect
1484 * the inserting transaction to update the counters at
1485 * commit, and we assume that will happen only after we
1486 * report our results. This assumption is a bit shaky,
1487 * but it is what acquire_sample_rows() does, so be
1488 * consistent.
1490 all_visible = false;
1491 break;
1492 case HEAPTUPLE_DELETE_IN_PROGRESS:
1493 /* This is an expected case during concurrent vacuum */
1494 all_visible = false;
1497 * Count such rows as live. As above, we assume the
1498 * deleting transaction will commit and update the
1499 * counters after we report.
1501 live_tuples += 1;
1502 break;
1503 default:
1504 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1505 break;
1508 if (tupgone)
1510 lazy_record_dead_tuple(dead_tuples, &(tuple.t_self));
1511 HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
1512 &vacrel->latestRemovedXid);
1513 tups_vacuumed += 1;
1514 has_dead_items = true;
1516 else
1518 bool tuple_totally_frozen;
1520 num_tuples += 1;
1521 hastup = true;
1524 * Each non-removable tuple must be checked to see if it needs
1525 * freezing. Note we already have exclusive buffer lock.
1527 if (heap_prepare_freeze_tuple(tuple.t_data,
1528 vacrel->relfrozenxid,
1529 vacrel->relminmxid,
1530 vacrel->FreezeLimit,
1531 vacrel->MultiXactCutoff,
1532 &frozen[nfrozen],
1533 &tuple_totally_frozen))
1534 frozen[nfrozen++].offset = offnum;
1536 if (!tuple_totally_frozen)
1537 all_frozen = false;
1539 } /* scan along page */
1542 * Clear the offset information once we have processed all the tuples
1543 * on the page.
1545 vacrel->offnum = InvalidOffsetNumber;
1548 * If we froze any tuples, mark the buffer dirty, and write a WAL
1549 * record recording the changes. We must log the changes to be
1550 * crash-safe against future truncation of CLOG.
1552 if (nfrozen > 0)
1554 START_CRIT_SECTION();
1556 MarkBufferDirty(buf);
1558 /* execute collected freezes */
1559 for (i = 0; i < nfrozen; i++)
1561 ItemId itemid;
1562 HeapTupleHeader htup;
1564 itemid = PageGetItemId(page, frozen[i].offset);
1565 htup = (HeapTupleHeader) PageGetItem(page, itemid);
1567 heap_execute_freeze_tuple(htup, &frozen[i]);
1570 /* Now WAL-log freezing if necessary */
1571 if (RelationNeedsWAL(vacrel->rel))
1573 XLogRecPtr recptr;
1575 recptr = log_heap_freeze(vacrel->rel, buf,
1576 vacrel->FreezeLimit, frozen, nfrozen);
1577 PageSetLSN(page, recptr);
1580 END_CRIT_SECTION();
1584 * If there are no indexes we can vacuum the page right now instead of
1585 * doing a second scan. When index cleanup is disabled, we likewise don't
1586 * vacuum the page here; we just forget the dead tuples.
1588 if (!vacrel->useindex && dead_tuples->num_tuples > 0)
1590 if (vacrel->nindexes == 0)
1592 /* Remove tuples from heap if the table has no index */
1593 lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer);
1594 vacuumed_pages++;
1595 has_dead_items = false;
1597 else
1600 * Here, we have indexes but index cleanup is disabled.
1601 * Instead of vacuuming the dead tuples on the heap, we just
1602 * forget them.
1604 * Note that vacrel->dead_tuples could have tuples which
1605 * became dead after HOT-pruning but are not marked dead yet.
1606 * We do not process them because it's a very rare condition,
1607 * and the next vacuum will process them anyway.
1609 Assert(params->index_cleanup == VACOPT_TERNARY_DISABLED);
1613 * Forget the now-vacuumed tuples, and press on, but be careful
1614 * not to reset latestRemovedXid since we want that value to be
1615 * valid.
1617 dead_tuples->num_tuples = 0;
1620 * Periodically do incremental FSM vacuuming to make newly-freed
1621 * space visible on upper FSM pages. Note: although we've cleaned
1622 * the current block, we haven't yet updated its FSM entry (that
1623 * happens further down), so passing end == blkno is correct.
1625 if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
1627 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
1628 blkno);
1629 next_fsm_block_to_vacuum = blkno;
1633 freespace = PageGetHeapFreeSpace(page);
1635 /* mark page all-visible, if appropriate */
1636 if (all_visible && !all_visible_according_to_vm)
1638 uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
1640 if (all_frozen)
1641 flags |= VISIBILITYMAP_ALL_FROZEN;
1644 * It should never be the case that the visibility map page is set
1645 * while the page-level bit is clear, but the reverse is allowed
1646 * (if checksums are not enabled). Regardless, set both bits so
1647 * that we get back in sync.
1649 * NB: If the heap page is all-visible but the VM bit is not set,
1650 * we don't need to dirty the heap page. However, if checksums
1651 * are enabled, we do need to make sure that the heap page is
1652 * dirtied before passing it to visibilitymap_set(), because it
1653 * may be logged. Given that this situation should only happen in
1654 * rare cases after a crash, it is not worth optimizing.
1656 PageSetAllVisible(page);
1657 MarkBufferDirty(buf);
1658 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1659 vmbuffer, visibility_cutoff_xid, flags);
1663 * As of PostgreSQL 9.2, the visibility map bit should never be set if
1664 * the page-level bit is clear. However, it's possible that the bit
1665 * got cleared after we checked it and before we took the buffer
1666 * content lock, so we must recheck before jumping to the conclusion
1667 * that something bad has happened.
1669 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1670 && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
1672 elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1673 vacrel->relname, blkno);
1674 visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1675 VISIBILITYMAP_VALID_BITS);
1679 * It's possible for the value returned by
1680 * GetOldestNonRemovableTransactionId() to move backwards, so it's not
1681 * wrong for us to see tuples that appear to not be visible to
1682 * everyone yet, while PD_ALL_VISIBLE is already set. The real safe
1683 * xmin value never moves backwards, but
1684 * GetOldestNonRemovableTransactionId() is conservative and sometimes
1685 * returns a value that's unnecessarily small, so if we see that
1686 * contradiction it just means that the tuples that we think are not
1687 * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag
1688 * is correct.
1690 * There should never be dead tuples on a page with PD_ALL_VISIBLE
1691 * set, however.
1693 else if (PageIsAllVisible(page) && has_dead_items)
1695 elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1696 vacrel->relname, blkno);
1697 PageClearAllVisible(page);
1698 MarkBufferDirty(buf);
1699 visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
1700 VISIBILITYMAP_VALID_BITS);
1704 * If the all-visible page is all-frozen but not marked as such yet,
1705 * mark it as all-frozen. Note that all_frozen is only valid if
1706 * all_visible is true, so we must check both.
1708 else if (all_visible_according_to_vm && all_visible && all_frozen &&
1709 !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer))
1712 * We can pass InvalidTransactionId as the cutoff XID here,
1713 * because setting the all-frozen bit doesn't cause recovery
1714 * conflicts.
1716 visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr,
1717 vmbuffer, InvalidTransactionId,
1718 VISIBILITYMAP_ALL_FROZEN);
1721 UnlockReleaseBuffer(buf);
1723 /* Remember the location of the last page with nonremovable tuples */
1724 if (hastup)
1725 vacrel->nonempty_pages = blkno + 1;
1728 * If we remembered any tuples for deletion, then the page will be
1729 * visited again by lazy_vacuum_heap_rel, which will compute and record
1730 * its post-compaction free space. If not, then we're done with this
1731 * page, so remember its free space as-is. (This path will always be
1732 * taken if there are no indexes.)
1734 if (dead_tuples->num_tuples == prev_dead_count)
1735 RecordPageWithFreeSpace(vacrel->rel, blkno, freespace);
1738 /* report that everything is scanned and vacuumed */
1739 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
1741 /* Clear the block number information */
1742 vacrel->blkno = InvalidBlockNumber;
1744 pfree(frozen);
1746 /* save stats for use later */
1747 vacrel->tuples_deleted = tups_vacuumed;
1748 vacrel->new_dead_tuples = nkeep;
1750 /* now we can compute the new value for pg_class.reltuples */
1751 vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, nblocks,
1752 vacrel->tupcount_pages,
1753 live_tuples);
1756 * Also compute the total number of surviving heap entries. In the
1757 * (unlikely) scenario that new_live_tuples is -1, take it as zero.
1759 vacrel->new_rel_tuples =
1760 Max(vacrel->new_live_tuples, 0) + vacrel->new_dead_tuples;
1763 * Release any remaining pin on visibility map page.
1765 if (BufferIsValid(vmbuffer))
1767 ReleaseBuffer(vmbuffer);
1768 vmbuffer = InvalidBuffer;
1771 /* If any tuples need to be deleted, perform final vacuum cycle */
1772 /* XXX put a threshold on min number of tuples here? */
1773 if (dead_tuples->num_tuples > 0)
1775 /* Work on all the indexes, and then the heap */
1776 lazy_vacuum_all_indexes(vacrel);
1778 /* Remove tuples from heap */
1779 lazy_vacuum_heap_rel(vacrel);
1783 * Vacuum the remainder of the Free Space Map. We must do this whether or
1784 * not there were indexes.
1786 if (blkno > next_fsm_block_to_vacuum)
1787 FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno);
1789 /* report all blocks vacuumed */
1790 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1792 /* Do post-vacuum cleanup */
1793 if (vacrel->useindex)
1794 lazy_cleanup_all_indexes(vacrel);
1797 * Free resources managed by lazy_space_alloc(). (We must end parallel
1798 * mode/free shared memory before updating index statistics. We cannot
1799 * write while in parallel mode.)
1801 lazy_space_free(vacrel);
1803 /* Update index statistics */
1804 if (vacrel->useindex)
1805 update_index_statistics(vacrel);
1807 /* If no indexes, make log report that lazy_vacuum_heap_rel would've made */
1808 if (vacuumed_pages)
1809 ereport(elevel,
1810 (errmsg("\"%s\": removed %.0f row versions in %u pages",
1811 vacrel->relname,
1812 tups_vacuumed, vacuumed_pages)));
1814 initStringInfo(&buf);
1815 appendStringInfo(&buf,
1816 _("%.0f dead row versions cannot be removed yet, oldest xmin: %u\n"),
1817 nkeep, vacrel->OldestXmin);
1818 appendStringInfo(&buf, _("There were %.0f unused item identifiers.\n"),
1819 nunused);
1820 appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
1821 "Skipped %u pages due to buffer pins, ",
1822 vacrel->pinskipped_pages),
1823 vacrel->pinskipped_pages);
1824 appendStringInfo(&buf, ngettext("%u frozen page.\n",
1825 "%u frozen pages.\n",
1826 vacrel->frozenskipped_pages),
1827 vacrel->frozenskipped_pages);
1828 appendStringInfo(&buf, ngettext("%u page is entirely empty.\n",
1829 "%u pages are entirely empty.\n",
1830 empty_pages),
1831 empty_pages);
1832 appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));
1834 ereport(elevel,
1835 (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
1836 vacrel->relname,
1837 tups_vacuumed, num_tuples,
1838 vacrel->scanned_pages, nblocks),
1839 errdetail_internal("%s", buf.data)));
1840 pfree(buf.data);
1844 * lazy_vacuum_all_indexes() -- Main entry for index vacuuming
1846 static void
1847 lazy_vacuum_all_indexes(LVRelState *vacrel)
1849 Assert(!IsParallelWorker());
1850 Assert(vacrel->nindexes > 0);
1851 Assert(TransactionIdIsNormal(vacrel->relfrozenxid));
1852 Assert(MultiXactIdIsValid(vacrel->relminmxid));
1854 /* Log cleanup info before we touch indexes */
1855 vacuum_log_cleanup_info(vacrel);
1857 /* Report that we are now vacuuming indexes */
1858 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1859 PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
1861 if (!ParallelVacuumIsActive(vacrel))
1863 for (int idx = 0; idx < vacrel->nindexes; idx++)
1865 Relation indrel = vacrel->indrels[idx];
1866 IndexBulkDeleteResult *istat = vacrel->indstats[idx];
1868 vacrel->indstats[idx] =
1869 lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples,
1870 vacrel);
1873 else
1875 /* Outsource everything to parallel variant */
1876 do_parallel_lazy_vacuum_all_indexes(vacrel);
1879 /* Increase and report the number of index scans */
1880 vacrel->num_index_scans++;
1881 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_INDEX_VACUUMS,
1882 vacrel->num_index_scans);
1886 * lazy_vacuum_heap_rel() -- second pass over the heap for two-pass strategy
1888 * This routine marks dead tuples as unused and compacts out free space on
1889 * their pages. Pages not having dead tuples recorded from lazy_scan_heap are
1890 * not visited at all.
1892 * Note: the reason for doing this as a second pass is we cannot remove the
1893 * tuples until we've removed their index entries, and we want to process
1894 * index entry removal in batches as large as possible.
1896 static void
1897 lazy_vacuum_heap_rel(LVRelState *vacrel)
1899 int tupindex;
1900 int vacuumed_pages;
1901 PGRUsage ru0;
1902 Buffer vmbuffer = InvalidBuffer;
1903 LVSavedErrInfo saved_err_info;
1905 /* Report that we are now vacuuming the heap */
1906 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
1907 PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
1909 /* Update error traceback information */
1910 update_vacuum_error_info(vacrel, &saved_err_info,
1911 VACUUM_ERRCB_PHASE_VACUUM_HEAP,
1912 InvalidBlockNumber, InvalidOffsetNumber);
1914 pg_rusage_init(&ru0);
1915 vacuumed_pages = 0;
1917 tupindex = 0;
1918 while (tupindex < vacrel->dead_tuples->num_tuples)
1920 BlockNumber tblk;
1921 Buffer buf;
1922 Page page;
1923 Size freespace;
1925 vacuum_delay_point();
1927 tblk = ItemPointerGetBlockNumber(&vacrel->dead_tuples->itemptrs[tupindex]);
1928 vacrel->blkno = tblk;
1929 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
1930 vacrel->bstrategy);
1931 if (!ConditionalLockBufferForCleanup(buf))
1933 ReleaseBuffer(buf);
1934 ++tupindex;
1935 continue;
1937 tupindex = lazy_vacuum_heap_page(vacrel, tblk, buf, tupindex,
1938 &vmbuffer);
1940 /* Now that we've compacted the page, record its available space */
1941 page = BufferGetPage(buf);
1942 freespace = PageGetHeapFreeSpace(page);
1944 UnlockReleaseBuffer(buf);
1945 RecordPageWithFreeSpace(vacrel->rel, tblk, freespace);
1946 vacuumed_pages++;
1949 /* Clear the block number information */
1950 vacrel->blkno = InvalidBlockNumber;
1952 if (BufferIsValid(vmbuffer))
1954 ReleaseBuffer(vmbuffer);
1955 vmbuffer = InvalidBuffer;
1958 ereport(elevel,
1959 (errmsg("\"%s\": removed %d dead item identifiers in %u pages",
1960 vacrel->relname, tupindex, vacuumed_pages),
1961 errdetail_internal("%s", pg_rusage_show(&ru0))));
1963 /* Revert to the previous phase information for error traceback */
1964 restore_vacuum_error_info(vacrel, &saved_err_info);
1968 * lazy_vacuum_heap_page() -- free dead tuples on a page
1969 * and repair its fragmentation.
1971 * Caller must hold pin and buffer cleanup lock on the buffer.
1973 * tupindex is the index in vacrel->dead_tuples of the first dead tuple for
1974 * this page. We assume the rest follow sequentially. The return value is
1975 * the first tupindex after the tuples of this page.
1977 static int
1978 lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
1979 int tupindex, Buffer *vmbuffer)
1981 LVDeadTuples *dead_tuples = vacrel->dead_tuples;
1982 Page page = BufferGetPage(buffer);
1983 OffsetNumber unused[MaxHeapTuplesPerPage];
1984 int uncnt = 0;
1985 TransactionId visibility_cutoff_xid;
1986 bool all_frozen;
1987 LVSavedErrInfo saved_err_info;
1989 pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
1991 /* Update error traceback information */
1992 update_vacuum_error_info(vacrel, &saved_err_info,
1993 VACUUM_ERRCB_PHASE_VACUUM_HEAP, blkno,
1994 InvalidOffsetNumber);
1996 START_CRIT_SECTION();
1998 for (; tupindex < dead_tuples->num_tuples; tupindex++)
2000 BlockNumber tblk;
2001 OffsetNumber toff;
2002 ItemId itemid;
2004 tblk = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]);
2005 if (tblk != blkno)
2006 break; /* past end of tuples for this block */
2007 toff = ItemPointerGetOffsetNumber(&dead_tuples->itemptrs[tupindex]);
2008 itemid = PageGetItemId(page, toff);
2009 ItemIdSetUnused(itemid);
2010 unused[uncnt++] = toff;
2013 PageRepairFragmentation(page);
2016 * Mark buffer dirty before we write WAL.
2018 MarkBufferDirty(buffer);
2020 /* XLOG stuff */
2021 if (RelationNeedsWAL(vacrel->rel))
2023 XLogRecPtr recptr;
2025 recptr = log_heap_clean(vacrel->rel, buffer,
2026 NULL, 0, NULL, 0,
2027 unused, uncnt,
2028 vacrel->latestRemovedXid);
2029 PageSetLSN(page, recptr);
2033 * End critical section, so we safely can do visibility tests (which
2034 * possibly need to perform IO and allocate memory!). If we crash now the
2035 * page (including the corresponding vm bit) might not be marked all
2036 * visible, but that's fine. A later vacuum will fix that.
2038 END_CRIT_SECTION();
2041 * Now that we have removed the dead tuples from the page, once again
2042 * check if the page has become all-visible. The page is already marked
2043 * dirty, exclusively locked, and, if needed, a full page image has been
2044 * emitted in the log_heap_clean() above.
2046 if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
2047 &all_frozen))
2048 PageSetAllVisible(page);
2051 * All the changes to the heap page have been done. If the all-visible
2052 * flag is now set, also set the VM all-visible bit (and, if possible, the
2053 * all-frozen bit) unless this has already been done previously.
2055 if (PageIsAllVisible(page))
2057 uint8 flags = 0;
2058 uint8 vm_status = visibilitymap_get_status(vacrel->rel,
2059 blkno, vmbuffer);
2061 /* Work out which VM bits (all-visible, all-frozen) still need to be set */
2062 if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
2063 flags |= VISIBILITYMAP_ALL_VISIBLE;
2064 if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
2065 flags |= VISIBILITYMAP_ALL_FROZEN;
2067 Assert(BufferIsValid(*vmbuffer));
2068 if (flags != 0)
2069 visibilitymap_set(vacrel->rel, blkno, buffer, InvalidXLogRecPtr,
2070 *vmbuffer, visibility_cutoff_xid, flags);
2073 /* Revert to the previous phase information for error traceback */
2074 restore_vacuum_error_info(vacrel, &saved_err_info);
2075 return tupindex;
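/*
 * Illustrative sketch (not part of the build): the return-value contract of
 * lazy_vacuum_heap_page(), expressed as a standalone helper.  Because the
 * dead-TID array is kept in (block, offset) order, "the tuples of this page"
 * are simply the consecutive entries whose block number matches, and the
 * first index past them is what gets handed back to lazy_vacuum_heap_rel().
 * The helper name is hypothetical.
 */
static int
example_skip_past_block(LVDeadTuples *dead_tuples, int tupindex)
{
	BlockNumber blkno;

	Assert(tupindex < dead_tuples->num_tuples);
	blkno = ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]);

	while (tupindex < dead_tuples->num_tuples &&
		   ItemPointerGetBlockNumber(&dead_tuples->itemptrs[tupindex]) == blkno)
		tupindex++;

	return tupindex;
}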
2079 * lazy_check_needs_freeze() -- scan page to see if any tuples
2080 * need to be cleaned to avoid wraparound
2082 * Returns true if the page needs to be vacuumed using cleanup lock.
2083 * Also returns a flag indicating whether the page contains any tuples at all.
2085 static bool
2086 lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel)
2088 Page page = BufferGetPage(buf);
2089 OffsetNumber offnum,
2090 maxoff;
2091 HeapTupleHeader tupleheader;
2093 *hastup = false;
2096 * New and empty pages, obviously, don't contain tuples. We could make
2097 * sure that the page is registered in the FSM, but it doesn't seem worth
2098 * waiting for a cleanup lock just for that, especially because it's
2099 * likely that the pin holder will do so.
2101 if (PageIsNew(page) || PageIsEmpty(page))
2102 return false;
2104 maxoff = PageGetMaxOffsetNumber(page);
2105 for (offnum = FirstOffsetNumber;
2106 offnum <= maxoff;
2107 offnum = OffsetNumberNext(offnum))
2109 ItemId itemid;
2112 * Set the offset number so that we can display it along with any
2113 * error that occurred while processing this tuple.
2115 vacrel->offnum = offnum;
2116 itemid = PageGetItemId(page, offnum);
2118 /* this should match hastup test in count_nondeletable_pages() */
2119 if (ItemIdIsUsed(itemid))
2120 *hastup = true;
2122 /* dead and redirect items never need freezing */
2123 if (!ItemIdIsNormal(itemid))
2124 continue;
2126 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
2128 if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit,
2129 vacrel->MultiXactCutoff, buf))
2130 break;
2131 } /* scan along page */
2133 /* Clear the offset information once we have processed the given page. */
2134 vacrel->offnum = InvalidOffsetNumber;
2136 return (offnum <= maxoff);
2140 * Perform lazy_vacuum_all_indexes() steps in parallel
2142 static void
2143 do_parallel_lazy_vacuum_all_indexes(LVRelState *vacrel)
2145 /* Tell parallel workers to do index vacuuming */
2146 vacrel->lps->lvshared->for_cleanup = false;
2147 vacrel->lps->lvshared->first_time = false;
2150 * We can only provide an approximate value of num_heap_tuples in vacuum
2151 * cases.
2153 vacrel->lps->lvshared->reltuples = vacrel->old_live_tuples;
2154 vacrel->lps->lvshared->estimated_count = true;
2156 do_parallel_vacuum_or_cleanup(vacrel,
2157 vacrel->lps->nindexes_parallel_bulkdel);
2161 * Perform lazy_cleanup_all_indexes() steps in parallel
2163 static void
2164 do_parallel_lazy_cleanup_all_indexes(LVRelState *vacrel)
2166 int nworkers;
2169 * If parallel vacuum is active we perform index cleanup with parallel
2170 * workers.
2172 * Tell parallel workers to do index cleanup.
2174 vacrel->lps->lvshared->for_cleanup = true;
2175 vacrel->lps->lvshared->first_time = (vacrel->num_index_scans == 0);
2178 * Now we can provide a better estimate of total number of surviving
2179 * tuples (we assume indexes are more interested in that than in the
2180 * number of nominally live tuples).
2182 vacrel->lps->lvshared->reltuples = vacrel->new_rel_tuples;
2183 vacrel->lps->lvshared->estimated_count =
2184 (vacrel->tupcount_pages < vacrel->rel_pages);
2186 /* Determine the number of parallel workers to launch */
2187 if (vacrel->lps->lvshared->first_time)
2188 nworkers = vacrel->lps->nindexes_parallel_cleanup +
2189 vacrel->lps->nindexes_parallel_condcleanup;
2190 else
2191 nworkers = vacrel->lps->nindexes_parallel_cleanup;
2193 do_parallel_vacuum_or_cleanup(vacrel, nworkers);
2197 * Perform index vacuum or index cleanup with parallel workers. This function
2198 * must be used by the parallel vacuum leader process. The caller must set
2199 * lps->lvshared->for_cleanup to indicate whether to perform vacuum or
2200 * cleanup.
2202 static void
2203 do_parallel_vacuum_or_cleanup(LVRelState *vacrel, int nworkers)
2205 LVParallelState *lps = vacrel->lps;
2207 Assert(!IsParallelWorker());
2208 Assert(ParallelVacuumIsActive(vacrel));
2209 Assert(vacrel->nindexes > 0);
2211 /* The leader process will participate */
2212 nworkers--;
2215 * It is possible that parallel context is initialized with fewer workers
2216 * than the number of indexes that need a separate worker in the current
2217 * phase, so we need to consider it. See compute_parallel_vacuum_workers.
2219 nworkers = Min(nworkers, lps->pcxt->nworkers);
2221 /* Setup the shared cost-based vacuum delay and launch workers */
2222 if (nworkers > 0)
2224 if (vacrel->num_index_scans > 0)
2226 /* Reset the parallel index processing counter */
2227 pg_atomic_write_u32(&(lps->lvshared->idx), 0);
2229 /* Reinitialize the parallel context to relaunch parallel workers */
2230 ReinitializeParallelDSM(lps->pcxt);
2234 * Set up shared cost balance and the number of active workers for
2235 * vacuum delay. We need to do this before launching workers as
2236 * otherwise, they might not see the updated values for these
2237 * parameters.
2239 pg_atomic_write_u32(&(lps->lvshared->cost_balance), VacuumCostBalance);
2240 pg_atomic_write_u32(&(lps->lvshared->active_nworkers), 0);
2243 * The number of workers can vary between bulkdelete and cleanup
2244 * phase.
2246 ReinitializeParallelWorkers(lps->pcxt, nworkers);
2248 LaunchParallelWorkers(lps->pcxt);
2250 if (lps->pcxt->nworkers_launched > 0)
2253 * Reset the local cost values for leader backend as we have
2254 * already accumulated the remaining balance of heap.
2256 VacuumCostBalance = 0;
2257 VacuumCostBalanceLocal = 0;
2259 /* Enable shared cost balance for leader backend */
2260 VacuumSharedCostBalance = &(lps->lvshared->cost_balance);
2261 VacuumActiveNWorkers = &(lps->lvshared->active_nworkers);
2264 if (lps->lvshared->for_cleanup)
2265 ereport(elevel,
2266 (errmsg(ngettext("launched %d parallel vacuum worker for index cleanup (planned: %d)",
2267 "launched %d parallel vacuum workers for index cleanup (planned: %d)",
2268 lps->pcxt->nworkers_launched),
2269 lps->pcxt->nworkers_launched, nworkers)));
2270 else
2271 ereport(elevel,
2272 (errmsg(ngettext("launched %d parallel vacuum worker for index vacuuming (planned: %d)",
2273 "launched %d parallel vacuum workers for index vacuuming (planned: %d)",
2274 lps->pcxt->nworkers_launched),
2275 lps->pcxt->nworkers_launched, nworkers)));
2278 /* Process the indexes that can be processed by only leader process */
2279 do_serial_processing_for_unsafe_indexes(vacrel, lps->lvshared);
2282 * Join as a parallel worker. The leader process alone processes all the
2283 * indexes in the case where no workers are launched.
2285 do_parallel_processing(vacrel, lps->lvshared);
2288 * Next, accumulate buffer and WAL usage. (This must wait for the workers
2289 * to finish, or we might get incomplete data.)
2291 if (nworkers > 0)
2293 /* Wait for all vacuum workers to finish */
2294 WaitForParallelWorkersToFinish(lps->pcxt);
2296 for (int i = 0; i < lps->pcxt->nworkers_launched; i++)
2297 InstrAccumParallelQuery(&lps->buffer_usage[i], &lps->wal_usage[i]);
2301 * Carry the shared balance value to heap scan and disable shared costing
2303 if (VacuumSharedCostBalance)
2305 VacuumCostBalance = pg_atomic_read_u32(VacuumSharedCostBalance);
2306 VacuumSharedCostBalance = NULL;
2307 VacuumActiveNWorkers = NULL;
2312 * Index vacuum/cleanup routine used by the leader process and parallel
2313 * vacuum worker processes to process the indexes in parallel.
2315 static void
2316 do_parallel_processing(LVRelState *vacrel, LVShared *lvshared)
2319 * Increment the active worker count if we are able to launch any worker.
2321 if (VacuumActiveNWorkers)
2322 pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2324 /* Loop until all indexes are vacuumed */
2325 for (;;)
2327 int idx;
2328 LVSharedIndStats *shared_istat;
2329 Relation indrel;
2330 IndexBulkDeleteResult *istat;
2332 /* Get an index number to process */
2333 idx = pg_atomic_fetch_add_u32(&(lvshared->idx), 1);
2335 /* Done for all indexes? */
2336 if (idx >= vacrel->nindexes)
2337 break;
2339 /* Get the index statistics of this index from DSM */
2340 shared_istat = parallel_stats_for_idx(lvshared, idx);
2342 /* Skip indexes not participating in parallelism */
2343 if (shared_istat == NULL)
2344 continue;
2346 indrel = vacrel->indrels[idx];
2349 * Skip processing indexes that are unsafe for workers (these are
2350 * processed in do_serial_processing_for_unsafe_indexes() by leader)
2352 if (!parallel_processing_is_safe(indrel, lvshared))
2353 continue;
2355 /* Do vacuum or cleanup of the index */
2356 istat = (vacrel->indstats[idx]);
2357 vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2358 lvshared,
2359 shared_istat,
2360 vacrel);
2364 * We have completed the index vacuum so decrement the active worker
2365 * count.
2367 if (VacuumActiveNWorkers)
2368 pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
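/*
 * Minimal sketch (not part of the build) of the work-claiming idiom used in
 * do_parallel_processing() above: each participant atomically increments a
 * shared counter to claim the next index, so no index is ever processed
 * twice.  The counter lives in shared memory; the helper and parameter names
 * here are hypothetical.
 */
static int
example_claim_next_slot(pg_atomic_uint32 *counter, int nslots)
{
	uint32		next = pg_atomic_fetch_add_u32(counter, 1);

	/* returns -1 once every slot has been handed out */
	return (next < (uint32) nslots) ? (int) next : -1;
}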
2372 * Vacuum or cleanup indexes that can be processed by only the leader process
2373 * because these indexes don't support parallel operation at that phase.
2375 static void
2376 do_serial_processing_for_unsafe_indexes(LVRelState *vacrel, LVShared *lvshared)
2378 Assert(!IsParallelWorker());
2381 * Increment the active worker count if we are able to launch any worker.
2383 if (VacuumActiveNWorkers)
2384 pg_atomic_add_fetch_u32(VacuumActiveNWorkers, 1);
2386 for (int idx = 0; idx < vacrel->nindexes; idx++)
2388 LVSharedIndStats *shared_istat;
2389 Relation indrel;
2390 IndexBulkDeleteResult *istat;
2392 shared_istat = parallel_stats_for_idx(lvshared, idx);
2394 /* Skip already-complete indexes */
2395 if (shared_istat != NULL)
2396 continue;
2398 indrel = vacrel->indrels[idx];
2401 * We're only here for the unsafe indexes
2403 if (parallel_processing_is_safe(indrel, lvshared))
2404 continue;
2406 /* Do vacuum or cleanup of the index */
2407 istat = (vacrel->indstats[idx]);
2408 vacrel->indstats[idx] = parallel_process_one_index(indrel, istat,
2409 lvshared,
2410 shared_istat,
2411 vacrel);
2415 * We have completed the index vacuum so decrement the active worker
2416 * count.
2418 if (VacuumActiveNWorkers)
2419 pg_atomic_sub_fetch_u32(VacuumActiveNWorkers, 1);
2423 * Vacuum or cleanup one index, either in the leader process or in one of the
2424 * worker processes.  After processing the index this function copies the index
2425 * statistics returned from ambulkdelete and amvacuumcleanup to the DSM
2426 * segment.
2428 static IndexBulkDeleteResult *
2429 parallel_process_one_index(Relation indrel,
2430 IndexBulkDeleteResult *istat,
2431 LVShared *lvshared,
2432 LVSharedIndStats *shared_istat,
2433 LVRelState *vacrel)
2435 IndexBulkDeleteResult *istat_res;
2438 * Update the pointer to the corresponding bulk-deletion result if someone
2439 * has already updated it
2441 if (shared_istat && shared_istat->updated && istat == NULL)
2442 istat = &shared_istat->istat;
2444 /* Do vacuum or cleanup of the index */
2445 if (lvshared->for_cleanup)
2446 istat_res = lazy_cleanup_one_index(indrel, istat, lvshared->reltuples,
2447 lvshared->estimated_count, vacrel);
2448 else
2449 istat_res = lazy_vacuum_one_index(indrel, istat, lvshared->reltuples,
2450 vacrel);
2453 * Copy the index bulk-deletion result returned from ambulkdelete and
2454 * amvacuumcleanup to the DSM segment if it's the first cycle because they
2455 * allocate locally and it's possible that an index will be vacuumed by a
2456 * different vacuum process the next cycle. Copying the result normally
2457 * happens only the first time an index is vacuumed. For any additional
2458 * vacuum pass, we directly point to the result on the DSM segment and
2459 * pass it to vacuum index APIs so that workers can update it directly.
2461 * Since all vacuum workers write the bulk-deletion result at different
2462 * slots we can write them without locking.
2464 if (shared_istat && !shared_istat->updated && istat_res != NULL)
2466 memcpy(&shared_istat->istat, istat_res, sizeof(IndexBulkDeleteResult));
2467 shared_istat->updated = true;
2469 /* Free the locally-allocated bulk-deletion result */
2470 pfree(istat_res);
2472 /* return the pointer to the result from shared memory */
2473 return &shared_istat->istat;
2476 return istat_res;
2480 * lazy_cleanup_all_indexes() -- cleanup all indexes of relation.
2482 static void
2483 lazy_cleanup_all_indexes(LVRelState *vacrel)
2485 Assert(!IsParallelWorker());
2486 Assert(vacrel->nindexes > 0);
2488 /* Report that we are now cleaning up indexes */
2489 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2490 PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
2492 if (!ParallelVacuumIsActive(vacrel))
2494 double reltuples = vacrel->new_rel_tuples;
2495 bool estimated_count =
2496 vacrel->tupcount_pages < vacrel->rel_pages;
2498 for (int idx = 0; idx < vacrel->nindexes; idx++)
2500 Relation indrel = vacrel->indrels[idx];
2501 IndexBulkDeleteResult *istat = vacrel->indstats[idx];
2503 vacrel->indstats[idx] =
2504 lazy_cleanup_one_index(indrel, istat, reltuples,
2505 estimated_count, vacrel);
2508 else
2510 /* Outsource everything to parallel variant */
2511 do_parallel_lazy_cleanup_all_indexes(vacrel);
2516 * lazy_vacuum_one_index() -- vacuum index relation.
2518 * Delete all the index entries pointing to tuples listed in
2519 * dead_tuples, and update running statistics.
2521 * reltuples is the number of heap tuples to be passed to the
2522 * bulkdelete callback. It's always assumed to be estimated.
2524 * Returns bulk delete stats derived from input stats
2526 static IndexBulkDeleteResult *
2527 lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat,
2528 double reltuples, LVRelState *vacrel)
2530 IndexVacuumInfo ivinfo;
2531 PGRUsage ru0;
2532 LVSavedErrInfo saved_err_info;
2534 pg_rusage_init(&ru0);
2536 ivinfo.index = indrel;
2537 ivinfo.analyze_only = false;
2538 ivinfo.report_progress = false;
2539 ivinfo.estimated_count = true;
2540 ivinfo.message_level = elevel;
2541 ivinfo.num_heap_tuples = reltuples;
2542 ivinfo.strategy = vacrel->bstrategy;
2545 * Update error traceback information.
2547 * The index name is saved during this phase and restored immediately
2548 * after this phase. See vacuum_error_callback.
2550 Assert(vacrel->indname == NULL);
2551 vacrel->indname = pstrdup(RelationGetRelationName(indrel));
2552 update_vacuum_error_info(vacrel, &saved_err_info,
2553 VACUUM_ERRCB_PHASE_VACUUM_INDEX,
2554 InvalidBlockNumber, InvalidOffsetNumber);
2556 /* Do bulk deletion */
2557 istat = index_bulk_delete(&ivinfo, istat, lazy_tid_reaped,
2558 (void *) vacrel->dead_tuples);
2560 ereport(elevel,
2561 (errmsg("scanned index \"%s\" to remove %d row versions",
2562 vacrel->indname, vacrel->dead_tuples->num_tuples),
2563 errdetail_internal("%s", pg_rusage_show(&ru0))));
2565 /* Revert to the previous phase information for error traceback */
2566 restore_vacuum_error_info(vacrel, &saved_err_info);
2567 pfree(vacrel->indname);
2568 vacrel->indname = NULL;
2570 return istat;
2574 * lazy_cleanup_one_index() -- do post-vacuum cleanup for index relation.
2576 * reltuples is the number of heap tuples and estimated_count is true
2577 * if reltuples is an estimated value.
2579 * Returns bulk delete stats derived from input stats
2581 static IndexBulkDeleteResult *
2582 lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
2583 double reltuples, bool estimated_count,
2584 LVRelState *vacrel)
2586 IndexVacuumInfo ivinfo;
2587 PGRUsage ru0;
2588 LVSavedErrInfo saved_err_info;
2590 pg_rusage_init(&ru0);
2592 ivinfo.index = indrel;
2593 ivinfo.analyze_only = false;
2594 ivinfo.report_progress = false;
2595 ivinfo.estimated_count = estimated_count;
2596 ivinfo.message_level = elevel;
2598 ivinfo.num_heap_tuples = reltuples;
2599 ivinfo.strategy = vacrel->bstrategy;
2602 * Update error traceback information.
2604 * The index name is saved during this phase and restored immediately
2605 * after this phase. See vacuum_error_callback.
2607 Assert(vacrel->indname == NULL);
2608 vacrel->indname = pstrdup(RelationGetRelationName(indrel));
2609 update_vacuum_error_info(vacrel, &saved_err_info,
2610 VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
2611 InvalidBlockNumber, InvalidOffsetNumber);
2613 istat = index_vacuum_cleanup(&ivinfo, istat);
2615 if (istat)
2617 ereport(elevel,
2618 (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
2619 RelationGetRelationName(indrel),
2620 (istat)->num_index_tuples,
2621 (istat)->num_pages),
2622 errdetail("%.0f index row versions were removed.\n"
2623 "%u index pages were newly deleted.\n"
2624 "%u index pages are currently deleted, of which %u are currently reusable.\n"
2625 "%s.",
2626 (istat)->tuples_removed,
2627 (istat)->pages_newly_deleted,
2628 (istat)->pages_deleted, (istat)->pages_free,
2629 pg_rusage_show(&ru0))));
2632 /* Revert to the previous phase information for error traceback */
2633 restore_vacuum_error_info(vacrel, &saved_err_info);
2634 pfree(vacrel->indname);
2635 vacrel->indname = NULL;
2637 return istat;
2641 * should_attempt_truncation - should we attempt to truncate the heap?
2643 * Don't even think about it unless we have a shot at releasing a goodly
2644 * number of pages. Otherwise, the time taken isn't worth it.
2646 * Also don't attempt it if we are doing early pruning/vacuuming, because a
2647 * scan which cannot find a truncated heap page cannot determine that the
2648 * snapshot is too old to read that page. We might be able to get away with
2649 * truncating all except one of the pages, setting its LSN to (at least) the
2650 * maximum of the truncated range if we also treated an index leaf tuple
2651 * pointing to a missing heap page as something to trigger the "snapshot too
2652 * old" error, but that seems fragile and seems like it deserves its own patch
2653 * if we consider it.
2655 * This is split out so that we can test whether truncation is going to be
2656 * called for before we actually do it. If you change the logic here, be
2657 * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
2659 static bool
2660 should_attempt_truncation(LVRelState *vacrel, VacuumParams *params)
2662 BlockNumber possibly_freeable;
2664 if (params->truncate == VACOPT_TERNARY_DISABLED)
2665 return false;
2667 possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages;
2668 if (possibly_freeable > 0 &&
2669 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
2670 possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION) &&
2671 old_snapshot_threshold < 0)
2672 return true;
2673 else
2674 return false;
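/*
 * Worked example (illustrative only): assuming REL_TRUNCATE_MINIMUM is 1000
 * pages and REL_TRUNCATE_FRACTION is 16, as defined earlier in this file, an
 * 8,000-page table becomes a truncation candidate once at least 500 trailing
 * pages (8000 / 16) are empty, whereas a 64,000-page table qualifies once
 * 1,000 trailing pages are empty, since the absolute minimum is reached
 * first.  In both cases old_snapshot_threshold must be disabled (< 0).
 */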
2678 * lazy_truncate_heap - try to truncate off any empty pages at the end
2680 static void
2681 lazy_truncate_heap(LVRelState *vacrel)
2683 BlockNumber old_rel_pages = vacrel->rel_pages;
2684 BlockNumber new_rel_pages;
2685 int lock_retry;
2687 /* Report that we are now truncating */
2688 pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
2689 PROGRESS_VACUUM_PHASE_TRUNCATE);
2692 * Loop until no more truncating can be done.
2696 PGRUsage ru0;
2698 pg_rusage_init(&ru0);
2701 * We need full exclusive lock on the relation in order to do
2702 * truncation. If we can't get it, give up rather than waiting --- we
2703 * don't want to block other backends, and we don't want to deadlock
2704 * (which is quite possible considering we already hold a lower-grade
2705 * lock).
2707 vacrel->lock_waiter_detected = false;
2708 lock_retry = 0;
2709 while (true)
2711 if (ConditionalLockRelation(vacrel->rel, AccessExclusiveLock))
2712 break;
2715 * Check for interrupts while trying to (re-)acquire the exclusive
2716 * lock.
2718 CHECK_FOR_INTERRUPTS();
2720 if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
2721 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
2724 * We failed to establish the lock in the specified number of
2725 * retries. This means we give up truncating.
2727 vacrel->lock_waiter_detected = true;
2728 ereport(elevel,
2729 (errmsg("\"%s\": stopping truncate due to conflicting lock request",
2730 vacrel->relname)));
2731 return;
2734 pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
2738 * Now that we have exclusive lock, look to see if the rel has grown
2739 * whilst we were vacuuming with non-exclusive lock. If so, give up;
2740 * the newly added pages presumably contain non-deletable tuples.
2742 new_rel_pages = RelationGetNumberOfBlocks(vacrel->rel);
2743 if (new_rel_pages != old_rel_pages)
2746 * Note: we intentionally don't update vacrel->rel_pages with the
2747 * new rel size here. If we did, it would amount to assuming that
2748 * the new pages are empty, which is unlikely. Leaving the numbers
2749 * alone amounts to assuming that the new pages have the same
2750 * tuple density as existing ones, which is less unlikely.
2752 UnlockRelation(vacrel->rel, AccessExclusiveLock);
2753 return;
2757 * Scan backwards from the end to verify that the end pages actually
2758 * contain no tuples. This is *necessary*, not optional, because
2759 * other backends could have added tuples to these pages whilst we
2760 * were vacuuming.
2762 new_rel_pages = count_nondeletable_pages(vacrel);
2763 vacrel->blkno = new_rel_pages;
2765 if (new_rel_pages >= old_rel_pages)
2767 /* can't do anything after all */
2768 UnlockRelation(vacrel->rel, AccessExclusiveLock);
2769 return;
2773 * Okay to truncate.
2775 RelationTruncate(vacrel->rel, new_rel_pages);
2778 * We can release the exclusive lock as soon as we have truncated.
2779 * Other backends can't safely access the relation until they have
2780 * processed the smgr invalidation that smgrtruncate sent out ... but
2781 * that should happen as part of standard invalidation processing once
2782 * they acquire lock on the relation.
2784 UnlockRelation(vacrel->rel, AccessExclusiveLock);
2787 * Update statistics. Here, it *is* correct to adjust rel_pages
2788 * without also touching reltuples, since the tuple count wasn't
2789 * changed by the truncation.
2791 vacrel->pages_removed += old_rel_pages - new_rel_pages;
2792 vacrel->rel_pages = new_rel_pages;
2794 ereport(elevel,
2795 (errmsg("\"%s\": truncated %u to %u pages",
2796 vacrel->relname,
2797 old_rel_pages, new_rel_pages),
2798 errdetail_internal("%s",
2799 pg_rusage_show(&ru0))));
2800 old_rel_pages = new_rel_pages;
2801 } while (new_rel_pages > vacrel->nonempty_pages &&
2802 vacrel->lock_waiter_detected);
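/*
 * Worked example (illustrative only): assuming VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL
 * is 50ms and VACUUM_TRUNCATE_LOCK_TIMEOUT is 5000ms, as defined earlier in
 * this file, the retry loop above attempts ConditionalLockRelation() up to
 * 5000 / 50 = 100 times, i.e. it gives up on truncation after roughly five
 * seconds of waiting for the AccessExclusiveLock.
 */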
2806 * Rescan end pages to verify that they are (still) empty of tuples.
2808 * Returns number of nondeletable pages (last nonempty page + 1).
2810 static BlockNumber
2811 count_nondeletable_pages(LVRelState *vacrel)
2813 BlockNumber blkno;
2814 BlockNumber prefetchedUntil;
2815 instr_time starttime;
2817 /* Initialize the starttime if we check for conflicting lock requests */
2818 INSTR_TIME_SET_CURRENT(starttime);
2821 * Start checking blocks at what we believe relation end to be and move
2822 * backwards. (Strange coding of loop control is needed because blkno is
2823 * unsigned.) To make the scan faster, we prefetch a few blocks at a time
2824 * in forward direction, so that OS-level readahead can kick in.
2826 blkno = vacrel->rel_pages;
2827 StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
2828 "prefetch size must be power of 2");
2829 prefetchedUntil = InvalidBlockNumber;
2830 while (blkno > vacrel->nonempty_pages)
2832 Buffer buf;
2833 Page page;
2834 OffsetNumber offnum,
2835 maxoff;
2836 bool hastup;
2839 * Check if another process requests a lock on our relation. We are
2840 * holding an AccessExclusiveLock here, so they will be waiting. We
2841 * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
2842 * only check if that interval has elapsed once every 32 blocks to
2843 * keep the number of system calls and actual shared lock table
2844 * lookups to a minimum.
2846 if ((blkno % 32) == 0)
2848 instr_time currenttime;
2849 instr_time elapsed;
2851 INSTR_TIME_SET_CURRENT(currenttime);
2852 elapsed = currenttime;
2853 INSTR_TIME_SUBTRACT(elapsed, starttime);
2854 if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
2855 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
2857 if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
2859 ereport(elevel,
2860 (errmsg("\"%s\": suspending truncate due to conflicting lock request",
2861 vacrel->relname)));
2863 vacrel->lock_waiter_detected = true;
2864 return blkno;
2866 starttime = currenttime;
2871 * We don't insert a vacuum delay point here, because we have an
2872 * exclusive lock on the table which we want to hold for as short a
2873 * time as possible. We still need to check for interrupts however.
2875 CHECK_FOR_INTERRUPTS();
2877 blkno--;
2879 /* If we haven't prefetched this lot yet, do so now. */
2880 if (prefetchedUntil > blkno)
2882 BlockNumber prefetchStart;
2883 BlockNumber pblkno;
2885 prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
2886 for (pblkno = prefetchStart; pblkno <= blkno; pblkno++)
2888 PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, pblkno);
2889 CHECK_FOR_INTERRUPTS();
2891 prefetchedUntil = prefetchStart;
2894 buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
2895 vacrel->bstrategy);
2897 /* In this phase we only need shared access to the buffer */
2898 LockBuffer(buf, BUFFER_LOCK_SHARE);
2900 page = BufferGetPage(buf);
2902 if (PageIsNew(page) || PageIsEmpty(page))
2904 UnlockReleaseBuffer(buf);
2905 continue;
2908 hastup = false;
2909 maxoff = PageGetMaxOffsetNumber(page);
2910 for (offnum = FirstOffsetNumber;
2911 offnum <= maxoff;
2912 offnum = OffsetNumberNext(offnum))
2914 ItemId itemid;
2916 itemid = PageGetItemId(page, offnum);
2919 * Note: any non-unused item should be taken as a reason to keep
2920 * this page. We formerly thought that DEAD tuples could be
2921 * thrown away, but that's not so, because we'd not have cleaned
2922 * out their index entries.
2924 if (ItemIdIsUsed(itemid))
2926 hastup = true;
2927 break; /* can stop scanning */
2929 } /* scan along page */
2931 UnlockReleaseBuffer(buf);
2933 /* Done scanning if we found a tuple here */
2934 if (hastup)
2935 return blkno + 1;
2939 * If we fall out of the loop, all the previously-thought-to-be-empty
2940 * pages still are; we need not bother to look at the last known-nonempty
2941 * page.
2943 return vacrel->nonempty_pages;
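/*
 * Worked example (illustrative only): assuming PREFETCH_SIZE is 32, the mask
 * trick above rounds the current block down to the nearest multiple of 32,
 * e.g. blkno = 1000 gives prefetchStart = 1000 & ~31 = 992, so blocks
 * 992..1000 are prefetched in forward order even though the scan itself walks
 * backwards; no further prefetch is issued until blkno drops below 992.
 */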
2947 * Return the maximum number of dead tuples we can record.
2949 static long
2950 compute_max_dead_tuples(BlockNumber relblocks, bool useindex)
2952 long maxtuples;
2953 int vac_work_mem = IsAutoVacuumWorkerProcess() &&
2954 autovacuum_work_mem != -1 ?
2955 autovacuum_work_mem : maintenance_work_mem;
2957 if (useindex)
2959 maxtuples = MAXDEADTUPLES(vac_work_mem * 1024L);
2960 maxtuples = Min(maxtuples, INT_MAX);
2961 maxtuples = Min(maxtuples, MAXDEADTUPLES(MaxAllocSize));
2963 /* curious coding here to ensure the multiplication can't overflow */
2964 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
2965 maxtuples = relblocks * LAZY_ALLOC_TUPLES;
2967 /* stay sane if small maintenance_work_mem */
2968 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
2970 else
2971 maxtuples = MaxHeapTuplesPerPage;
2973 return maxtuples;
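/*
 * Worked example (illustrative only), assuming 6-byte ItemPointerData entries
 * and 8kB blocks (MaxHeapTuplesPerPage = 291): with maintenance_work_mem set
 * to 64MB the TID array could hold roughly 64 * 1024 * 1024 / 6 ~= 11.1
 * million dead tuples, but a 1,000-block table clamps that to about
 * 1000 * LAZY_ALLOC_TUPLES = 291,000 entries, while a tiny memory setting is
 * still floored at MaxHeapTuplesPerPage so one page's worth always fits.
 */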
2977 * lazy_space_alloc - space allocation decisions for lazy vacuum
2979 * See the comments at the head of this file for rationale.
2981 static void
2982 lazy_space_alloc(LVRelState *vacrel, int nworkers, BlockNumber nblocks)
2984 LVDeadTuples *dead_tuples;
2985 long maxtuples;
2988 * Initialize state for a parallel vacuum. As of now, only one worker can
2989 * be used for an index, so we invoke parallelism only if there are at
2990 * least two indexes on a table.
2992 if (nworkers >= 0 && vacrel->nindexes > 1)
2995 * Since parallel workers cannot access data in temporary tables, we
2996 * can't perform parallel vacuum on them.
2998 if (RelationUsesLocalBuffers(vacrel->rel))
3001 * Give warning only if the user explicitly tries to perform a
3002 * parallel vacuum on the temporary table.
3004 if (nworkers > 0)
3005 ereport(WARNING,
3006 (errmsg("disabling parallel option of vacuum on \"%s\" --- cannot vacuum temporary tables in parallel",
3007 vacrel->relname)));
3009 else
3010 vacrel->lps = begin_parallel_vacuum(vacrel, nblocks, nworkers);
3012 /* If parallel mode started, we're done */
3013 if (ParallelVacuumIsActive(vacrel))
3014 return;
3017 maxtuples = compute_max_dead_tuples(nblocks, vacrel->nindexes > 0);
3019 dead_tuples = (LVDeadTuples *) palloc(SizeOfDeadTuples(maxtuples));
3020 dead_tuples->num_tuples = 0;
3021 dead_tuples->max_tuples = (int) maxtuples;
3023 vacrel->dead_tuples = dead_tuples;
3027 * lazy_space_free - free space allocated in lazy_space_alloc
3029 static void
3030 lazy_space_free(LVRelState *vacrel)
3032 if (!ParallelVacuumIsActive(vacrel))
3033 return;
3036 * End parallel mode before updating index statistics as we cannot write
3037 * during parallel mode.
3039 end_parallel_vacuum(vacrel);
3043 * lazy_record_dead_tuple - remember one deletable tuple
3045 static void
3046 lazy_record_dead_tuple(LVDeadTuples *dead_tuples, ItemPointer itemptr)
3049 * The array shouldn't overflow under normal behavior, but perhaps it
3050 * could if we are given a really small maintenance_work_mem. In that
3051 * case, just forget the last few tuples (we'll get 'em next time).
3053 if (dead_tuples->num_tuples < dead_tuples->max_tuples)
3055 dead_tuples->itemptrs[dead_tuples->num_tuples] = *itemptr;
3056 dead_tuples->num_tuples++;
3057 pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
3058 dead_tuples->num_tuples);
3063 * lazy_tid_reaped() -- is a particular tid deletable?
3065 * This has the right signature to be an IndexBulkDeleteCallback.
3067 * Assumes dead_tuples array is in sorted order.
3069 static bool
3070 lazy_tid_reaped(ItemPointer itemptr, void *state)
3072 LVDeadTuples *dead_tuples = (LVDeadTuples *) state;
3073 int64 litem,
3074 ritem,
3075 item;
3076 ItemPointer res;
3078 litem = itemptr_encode(&dead_tuples->itemptrs[0]);
3079 ritem = itemptr_encode(&dead_tuples->itemptrs[dead_tuples->num_tuples - 1]);
3080 item = itemptr_encode(itemptr);
3083 * Doing a simple bound check before bsearch() is useful to avoid the
3084 * extra cost of bsearch(), especially if dead tuples on the heap are
3085 * concentrated in a certain range. Since this function is called for
3086 * every index tuple, it pays to be really fast.
3088 if (item < litem || item > ritem)
3089 return false;
3091 res = (ItemPointer) bsearch((void *) itemptr,
3092 (void *) dead_tuples->itemptrs,
3093 dead_tuples->num_tuples,
3094 sizeof(ItemPointerData),
3095 vac_cmp_itemptr);
3097 return (res != NULL);
3101 * Comparator routines for use with qsort() and bsearch().
3103 static int
3104 vac_cmp_itemptr(const void *left, const void *right)
3106 BlockNumber lblk,
3107 rblk;
3108 OffsetNumber loff,
3109 roff;
3111 lblk = ItemPointerGetBlockNumber((ItemPointer) left);
3112 rblk = ItemPointerGetBlockNumber((ItemPointer) right);
3114 if (lblk < rblk)
3115 return -1;
3116 if (lblk > rblk)
3117 return 1;
3119 loff = ItemPointerGetOffsetNumber((ItemPointer) left);
3120 roff = ItemPointerGetOffsetNumber((ItemPointer) right);
3122 if (loff < roff)
3123 return -1;
3124 if (loff > roff)
3125 return 1;
3127 return 0;
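/*
 * Illustrative sketch (not part of the build): how the comparator above pairs
 * with qsort()/bsearch().  Lazy vacuum never needs the qsort() step because
 * TIDs are collected in heap order during the forward scan, which is already
 * (block, offset) order; it only performs the bsearch() probe, as in
 * lazy_tid_reaped().  The helper and variable names are hypothetical.
 */
static bool
example_tid_lookup(ItemPointerData *tids, int ntids, ItemPointer probe)
{
	qsort(tids, ntids, sizeof(ItemPointerData), vac_cmp_itemptr);

	return bsearch(probe, tids, ntids, sizeof(ItemPointerData),
				   vac_cmp_itemptr) != NULL;
}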
3131 * Check if every tuple in the given page is visible to all current and future
3132 * transactions. Also return the visibility_cutoff_xid which is the highest
3133 * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
3134 * on this page is frozen.
3136 static bool
3137 heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
3138 TransactionId *visibility_cutoff_xid,
3139 bool *all_frozen)
3141 Page page = BufferGetPage(buf);
3142 BlockNumber blockno = BufferGetBlockNumber(buf);
3143 OffsetNumber offnum,
3144 maxoff;
3145 bool all_visible = true;
3147 *visibility_cutoff_xid = InvalidTransactionId;
3148 *all_frozen = true;
3151 * This is a stripped down version of the line pointer scan in
3152 * lazy_scan_heap(). So if you change anything here, also check that code.
3154 maxoff = PageGetMaxOffsetNumber(page);
3155 for (offnum = FirstOffsetNumber;
3156 offnum <= maxoff && all_visible;
3157 offnum = OffsetNumberNext(offnum))
3159 ItemId itemid;
3160 HeapTupleData tuple;
3163 * Set the offset number so that we can display it along with any
3164 * error that occurred while processing this tuple.
3166 vacrel->offnum = offnum;
3167 itemid = PageGetItemId(page, offnum);
3169 /* Unused or redirect line pointers are of no interest */
3170 if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
3171 continue;
3173 ItemPointerSet(&(tuple.t_self), blockno, offnum);
3176 * Dead line pointers can have index pointers pointing to them. So
3177 * they can't be treated as visible
3179 if (ItemIdIsDead(itemid))
3181 all_visible = false;
3182 *all_frozen = false;
3183 break;
3186 Assert(ItemIdIsNormal(itemid));
3188 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
3189 tuple.t_len = ItemIdGetLength(itemid);
3190 tuple.t_tableOid = RelationGetRelid(vacrel->rel);
3192 switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf))
3194 case HEAPTUPLE_LIVE:
3196 TransactionId xmin;
3198 /* Check comments in lazy_scan_heap. */
3199 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
3201 all_visible = false;
3202 *all_frozen = false;
3203 break;
3207 * The inserter definitely committed. But is it old enough
3208 * that everyone sees it as committed?
3210 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
3211 if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
3213 all_visible = false;
3214 *all_frozen = false;
3215 break;
3218 /* Track newest xmin on page. */
3219 if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
3220 *visibility_cutoff_xid = xmin;
3222 /* Check whether this tuple is already frozen or not */
3223 if (all_visible && *all_frozen &&
3224 heap_tuple_needs_eventual_freeze(tuple.t_data))
3225 *all_frozen = false;
3227 break;
3229 case HEAPTUPLE_DEAD:
3230 case HEAPTUPLE_RECENTLY_DEAD:
3231 case HEAPTUPLE_INSERT_IN_PROGRESS:
3232 case HEAPTUPLE_DELETE_IN_PROGRESS:
3234 all_visible = false;
3235 *all_frozen = false;
3236 break;
3238 default:
3239 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
3240 break;
3242 } /* scan along page */
3244 /* Clear the offset information once we have processed the given page. */
3245 vacrel->offnum = InvalidOffsetNumber;
3247 return all_visible;
3251 * Compute the number of parallel worker processes to request. Both index
3252 * vacuum and index cleanup can be executed with parallel workers. The index
3253 * is eligible for parallel vacuum iff its size is greater than
3254 * min_parallel_index_scan_size as invoking workers for very small indexes
3255 * can hurt performance.
3257 * nrequested is the number of parallel workers that the user requested. If
3258 * nrequested is 0, we compute the parallel degree based on nindexes, that is
3259 * the number of indexes that support parallel vacuum. This function also
3260 * sets can_parallel_vacuum to remember indexes that participate in parallel
3261 * vacuum.
3263 static int
3264 compute_parallel_vacuum_workers(LVRelState *vacrel, int nrequested,
3265 bool *can_parallel_vacuum)
3267 int nindexes_parallel = 0;
3268 int nindexes_parallel_bulkdel = 0;
3269 int nindexes_parallel_cleanup = 0;
3270 int parallel_workers;
3273 * We don't allow performing a parallel operation in a standalone backend or
3274 * when parallelism is disabled.
3276 if (!IsUnderPostmaster || max_parallel_maintenance_workers == 0)
3277 return 0;
3280 * Compute the number of indexes that can participate in parallel vacuum.
3282 for (int idx = 0; idx < vacrel->nindexes; idx++)
3284 Relation indrel = vacrel->indrels[idx];
3285 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3287 if (vacoptions == VACUUM_OPTION_NO_PARALLEL ||
3288 RelationGetNumberOfBlocks(indrel) < min_parallel_index_scan_size)
3289 continue;
3291 can_parallel_vacuum[idx] = true;
3293 if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3294 nindexes_parallel_bulkdel++;
3295 if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0) ||
3296 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
3297 nindexes_parallel_cleanup++;
3300 nindexes_parallel = Max(nindexes_parallel_bulkdel,
3301 nindexes_parallel_cleanup);
3303 /* The leader process takes one index */
3304 nindexes_parallel--;
3306 /* No index supports parallel vacuum */
3307 if (nindexes_parallel <= 0)
3308 return 0;
3310 /* Compute the parallel degree */
3311 parallel_workers = (nrequested > 0) ?
3312 Min(nrequested, nindexes_parallel) : nindexes_parallel;
3314 /* Cap by max_parallel_maintenance_workers */
3315 parallel_workers = Min(parallel_workers, max_parallel_maintenance_workers);
3317 return parallel_workers;
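/*
 * Worked example (illustrative only): a table with three indexes that all
 * exceed min_parallel_index_scan_size, where three support parallel
 * bulkdelete and two support parallel cleanup, yields
 * nindexes_parallel = Max(3, 2) - 1 = 2 after the leader takes one index.
 * With nrequested = 0 and the default max_parallel_maintenance_workers of 2,
 * two workers are requested; a PARALLEL 1 option would cap that at one.
 */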
3321 * Update index statistics in pg_class if the statistics are accurate.
3323 static void
3324 update_index_statistics(LVRelState *vacrel)
3326 Relation *indrels = vacrel->indrels;
3327 int nindexes = vacrel->nindexes;
3328 IndexBulkDeleteResult **indstats = vacrel->indstats;
3330 Assert(!IsInParallelMode());
3332 for (int idx = 0; idx < nindexes; idx++)
3334 Relation indrel = indrels[idx];
3335 IndexBulkDeleteResult *istat = indstats[idx];
3337 if (istat == NULL || istat->estimated_count)
3338 continue;
3340 /* Update index statistics */
3341 vac_update_relstats(indrel,
3342 istat->num_pages,
3343 istat->num_index_tuples,
3344 0,
3345 false,
3346 InvalidTransactionId,
3347 InvalidMultiXactId,
3348 false);
3353 * This function prepares and returns parallel vacuum state if we can launch
3354 * even one worker. This function is responsible for entering parallel mode,
3355 * creating a parallel context, and then initializing the DSM segment.
3357 static LVParallelState *
3358 begin_parallel_vacuum(LVRelState *vacrel, BlockNumber nblocks,
3359 int nrequested)
3361 LVParallelState *lps = NULL;
3362 Relation *indrels = vacrel->indrels;
3363 int nindexes = vacrel->nindexes;
3364 ParallelContext *pcxt;
3365 LVShared *shared;
3366 LVDeadTuples *dead_tuples;
3367 BufferUsage *buffer_usage;
3368 WalUsage *wal_usage;
3369 bool *can_parallel_vacuum;
3370 long maxtuples;
3371 Size est_shared;
3372 Size est_deadtuples;
3373 int nindexes_mwm = 0;
3374 int parallel_workers = 0;
3375 int querylen;
3378 * A parallel vacuum must be requested and there must be indexes on the
3379 * relation
3381 Assert(nrequested >= 0);
3382 Assert(nindexes > 0);
3385 * Compute the number of parallel vacuum workers to launch
3387 can_parallel_vacuum = (bool *) palloc0(sizeof(bool) * nindexes);
3388 parallel_workers = compute_parallel_vacuum_workers(vacrel,
3389 nrequested,
3390 can_parallel_vacuum);
3392 /* Can't perform vacuum in parallel */
3393 if (parallel_workers <= 0)
3395 pfree(can_parallel_vacuum);
3396 return lps;
3399 lps = (LVParallelState *) palloc0(sizeof(LVParallelState));
3401 EnterParallelMode();
3402 pcxt = CreateParallelContext("postgres", "parallel_vacuum_main",
3403 parallel_workers);
3404 Assert(pcxt->nworkers > 0);
3405 lps->pcxt = pcxt;
3407 /* Estimate size for shared information -- PARALLEL_VACUUM_KEY_SHARED */
3408 est_shared = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3409 for (int idx = 0; idx < nindexes; idx++)
3411 Relation indrel = indrels[idx];
3412 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3415 * The cleanup option should be either disabled, always performed in
3416 * parallel, or conditionally performed in parallel.
3418 Assert(((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) ||
3419 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0));
3420 Assert(vacoptions <= VACUUM_OPTION_MAX_VALID_VALUE);
3422 /* Skip indexes that don't participate in parallel vacuum */
3423 if (!can_parallel_vacuum[idx])
3424 continue;
3426 if (indrel->rd_indam->amusemaintenanceworkmem)
3427 nindexes_mwm++;
3429 est_shared = add_size(est_shared, sizeof(LVSharedIndStats));
3432 * Remember the number of indexes that support parallel operation for
3433 * each phase.
3435 if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) != 0)
3436 lps->nindexes_parallel_bulkdel++;
3437 if ((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) != 0)
3438 lps->nindexes_parallel_cleanup++;
3439 if ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0)
3440 lps->nindexes_parallel_condcleanup++;
3442 shm_toc_estimate_chunk(&pcxt->estimator, est_shared);
3443 shm_toc_estimate_keys(&pcxt->estimator, 1);
3445 /* Estimate size for dead tuples -- PARALLEL_VACUUM_KEY_DEAD_TUPLES */
3446 maxtuples = compute_max_dead_tuples(nblocks, true);
3447 est_deadtuples = MAXALIGN(SizeOfDeadTuples(maxtuples));
3448 shm_toc_estimate_chunk(&pcxt->estimator, est_deadtuples);
3449 shm_toc_estimate_keys(&pcxt->estimator, 1);
3452 * Estimate space for BufferUsage and WalUsage --
3453 * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE.
3455 * If there are no extensions loaded that care, we could skip this. We
3456 * have no way of knowing whether anyone's looking at pgBufferUsage or
3457 * pgWalUsage, so do it unconditionally.
3459 shm_toc_estimate_chunk(&pcxt->estimator,
3460 mul_size(sizeof(BufferUsage), pcxt->nworkers));
3461 shm_toc_estimate_keys(&pcxt->estimator, 1);
3462 shm_toc_estimate_chunk(&pcxt->estimator,
3463 mul_size(sizeof(WalUsage), pcxt->nworkers));
3464 shm_toc_estimate_keys(&pcxt->estimator, 1);
3466 /* Finally, estimate PARALLEL_VACUUM_KEY_QUERY_TEXT space */
3467 if (debug_query_string)
3469 querylen = strlen(debug_query_string);
3470 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
3471 shm_toc_estimate_keys(&pcxt->estimator, 1);
3473 else
3474 querylen = 0; /* keep compiler quiet */
3476 InitializeParallelDSM(pcxt);
3478 /* Prepare shared information */
3479 shared = (LVShared *) shm_toc_allocate(pcxt->toc, est_shared);
3480 MemSet(shared, 0, est_shared);
3481 shared->relid = RelationGetRelid(vacrel->rel);
3482 shared->elevel = elevel;
3483 shared->maintenance_work_mem_worker =
3484 (nindexes_mwm > 0) ?
3485 maintenance_work_mem / Min(parallel_workers, nindexes_mwm) :
3486 maintenance_work_mem;
3488 pg_atomic_init_u32(&(shared->cost_balance), 0);
3489 pg_atomic_init_u32(&(shared->active_nworkers), 0);
3490 pg_atomic_init_u32(&(shared->idx), 0);
3491 shared->offset = MAXALIGN(add_size(SizeOfLVShared, BITMAPLEN(nindexes)));
3494 * Initialize variables for shared index statistics, set NULL bitmap and
3495 * the size of stats for each index.
3497 memset(shared->bitmap, 0x00, BITMAPLEN(nindexes));
3498 for (int idx = 0; idx < nindexes; idx++)
3500 if (!can_parallel_vacuum[idx])
3501 continue;
3503 /* Set the bit to mark the slot as not NULL, since this index supports parallelism */
3504 shared->bitmap[idx >> 3] |= 1 << (idx & 0x07);
3507 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared);
3508 lps->lvshared = shared;
3510 /* Prepare the dead tuple space */
3511 dead_tuples = (LVDeadTuples *) shm_toc_allocate(pcxt->toc, est_deadtuples);
3512 dead_tuples->max_tuples = maxtuples;
3513 dead_tuples->num_tuples = 0;
3514 MemSet(dead_tuples->itemptrs, 0, sizeof(ItemPointerData) * maxtuples);
3515 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_TUPLES, dead_tuples);
3516 vacrel->dead_tuples = dead_tuples;
3519 * Allocate space for each worker's BufferUsage and WalUsage; no need to
3520 * initialize
3522 buffer_usage = shm_toc_allocate(pcxt->toc,
3523 mul_size(sizeof(BufferUsage), pcxt->nworkers));
3524 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, buffer_usage);
3525 lps->buffer_usage = buffer_usage;
3526 wal_usage = shm_toc_allocate(pcxt->toc,
3527 mul_size(sizeof(WalUsage), pcxt->nworkers));
3528 shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_WAL_USAGE, wal_usage);
3529 lps->wal_usage = wal_usage;
3531 /* Store query string for workers */
3532 if (debug_query_string)
3533 {
3534 char *sharedquery;
3536 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
3537 memcpy(sharedquery, debug_query_string, querylen + 1);
3538 sharedquery[querylen] = '\0';
3539 shm_toc_insert(pcxt->toc,
3540 PARALLEL_VACUUM_KEY_QUERY_TEXT, sharedquery);
3541 }
3543 pfree(can_parallel_vacuum);
3544 return lps;
3545 }
3547 /*
3548 * Destroy the parallel context, and end parallel mode.
3549 *
3550 * Since writes are not allowed during parallel mode, copy the
3551 * updated index statistics from DSM into local memory and then later use that
3552 * to update the index statistics. One might think that we can exit from
3553 * parallel mode, update the index statistics and then destroy parallel
3554 * context, but that won't be safe (see ExitParallelMode).
3555 */
3556 static void
3557 end_parallel_vacuum(LVRelState *vacrel)
3558 {
3559 IndexBulkDeleteResult **indstats = vacrel->indstats;
3560 LVParallelState *lps = vacrel->lps;
3561 int nindexes = vacrel->nindexes;
3563 Assert(!IsParallelWorker());
3565 /* Copy the updated statistics */
3566 for (int idx = 0; idx < nindexes; idx++)
3567 {
3568 LVSharedIndStats *shared_istat;
3570 shared_istat = parallel_stats_for_idx(lps->lvshared, idx);
3572 /*
3573 * Skip unused slot. The statistics of this index are already stored
3574 * in local memory.
3575 */
3576 if (shared_istat == NULL)
3577 continue;
3579 if (shared_istat->updated)
3580 {
3581 indstats[idx] = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
3582 memcpy(indstats[idx], &(shared_istat->istat), sizeof(IndexBulkDeleteResult));
3583 }
3584 else
3585 indstats[idx] = NULL;
3586 }
3588 DestroyParallelContext(lps->pcxt);
3589 ExitParallelMode();
3591 /* Deactivate parallel vacuum */
3592 pfree(lps);
3593 vacrel->lps = NULL;
3594 }
3596 /*
3597 * Return shared memory statistics for index at offset 'getidx', if any
3598 */
3599 static LVSharedIndStats *
3600 parallel_stats_for_idx(LVShared *lvshared, int getidx)
3601 {
3602 char *p;
3604 if (IndStatsIsNull(lvshared, getidx))
3605 return NULL;
3607 p = (char *) GetSharedIndStats(lvshared);
3608 for (int idx = 0; idx < getidx; idx++)
3609 {
3610 if (IndStatsIsNull(lvshared, idx))
3611 continue;
3613 p += sizeof(LVSharedIndStats);
3614 }
3616 return (LVSharedIndStats *) p;
3617 }
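/*
 * Stats slots are laid out back to back after the bitmap, and only for
 * indexes whose bit is set, so the slot for 'getidx' is located by counting
 * the non-NULL slots that precede it.  Illustrative example: with four
 * indexes and bits set for indexes 0, 2 and 3, the slot for index 3 is at
 * GetSharedIndStats(lvshared) + 2 * sizeof(LVSharedIndStats).
 */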
3619 /*
3620 * Returns false if the given index can't participate in parallel index
3621 * vacuum or parallel index cleanup.
3622 */
3623 static bool
3624 parallel_processing_is_safe(Relation indrel, LVShared *lvshared)
3625 {
3626 uint8 vacoptions = indrel->rd_indam->amparallelvacuumoptions;
3628 /* first_time must be true only if for_cleanup is true */
3629 Assert(lvshared->for_cleanup || !lvshared->first_time);
3631 if (lvshared->for_cleanup)
3632 {
3633 /* Skip, if the index does not support parallel cleanup */
3634 if (((vacoptions & VACUUM_OPTION_PARALLEL_CLEANUP) == 0) &&
3635 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) == 0))
3636 return false;
3638 /*
3639 * Skip, if the index supports parallel cleanup conditionally, but we
3640 * have already processed the index (for bulkdelete). See the
3641 * comments for option VACUUM_OPTION_PARALLEL_COND_CLEANUP to know
3642 * when indexes support parallel cleanup conditionally.
3643 */
3644 if (!lvshared->first_time &&
3645 ((vacoptions & VACUUM_OPTION_PARALLEL_COND_CLEANUP) != 0))
3646 return false;
3647 }
3648 else if ((vacoptions & VACUUM_OPTION_PARALLEL_BULKDEL) == 0)
3649 {
3650 /* Skip if the index does not support parallel bulk deletion */
3651 return false;
3652 }
3654 return true;
3655 }
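/*
 * Summary of the checks above: a worker may bulk-delete an index only if its
 * AM advertises VACUUM_OPTION_PARALLEL_BULKDEL; it may clean up an index if
 * the AM advertises VACUUM_OPTION_PARALLEL_CLEANUP, or
 * VACUUM_OPTION_PARALLEL_COND_CLEANUP when no bulk-deletion pass has been
 * performed yet (lvshared->first_time).
 */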
3657 /*
3658 * Perform work within a launched parallel process.
3659 *
3660 * Since parallel vacuum workers perform only index vacuum or index cleanup,
3661 * we don't need to report progress information.
3662 */
3663 void
3664 parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
3665 {
3666 Relation rel;
3667 Relation *indrels;
3668 LVShared *lvshared;
3669 LVDeadTuples *dead_tuples;
3670 BufferUsage *buffer_usage;
3671 WalUsage *wal_usage;
3672 int nindexes;
3673 char *sharedquery;
3674 LVRelState vacrel;
3675 ErrorContextCallback errcallback;
3677 lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
3678 false);
3679 elevel = lvshared->elevel;
3681 if (lvshared->for_cleanup)
3682 elog(DEBUG1, "starting parallel vacuum worker for cleanup");
3683 else
3684 elog(DEBUG1, "starting parallel vacuum worker for bulk delete");
3686 /* Set debug_query_string for individual workers */
3687 sharedquery = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_QUERY_TEXT, true);
3688 debug_query_string = sharedquery;
3689 pgstat_report_activity(STATE_RUNNING, debug_query_string);
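/*
 * Reporting the leader's query string makes this worker's pg_stat_activity
 * entry show the same query as the leader backend.
 */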
3691 /*
3692 * Open the table. We use the same lock mode as the leader process;
3693 * that's okay because this lock mode does not conflict among the
3694 * parallel workers.
3695 */
3696 rel = table_open(lvshared->relid, ShareUpdateExclusiveLock);
3698 /*
3699 * Open all indexes. indrels are sorted in order by OID, which should
3700 * match the leader's ordering.
3701 */
3702 vac_open_indexes(rel, RowExclusiveLock, &nindexes, &indrels);
3703 Assert(nindexes > 0);
3705 /* Set dead tuple space */
3706 dead_tuples = (LVDeadTuples *) shm_toc_lookup(toc,
3707 PARALLEL_VACUUM_KEY_DEAD_TUPLES,
3708 false);
3710 /* Set cost-based vacuum delay */
3711 VacuumCostActive = (VacuumCostDelay > 0);
3712 VacuumCostBalance = 0;
3713 VacuumPageHit = 0;
3714 VacuumPageMiss = 0;
3715 VacuumPageDirty = 0;
3716 VacuumCostBalanceLocal = 0;
3717 VacuumSharedCostBalance = &(lvshared->cost_balance);
3718 VacuumActiveNWorkers = &(lvshared->active_nworkers);
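/*
 * Pointing VacuumSharedCostBalance and VacuumActiveNWorkers at the atomics
 * in shared memory makes the cost-based delay accounting collective: the
 * leader and all workers draw on one balance, so the cost limit throttles
 * the vacuum as a whole rather than each process independently.
 */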
3720 vacrel.rel = rel;
3721 vacrel.indrels = indrels;
3722 vacrel.nindexes = nindexes;
3723 vacrel.indstats = (IndexBulkDeleteResult **)
3724 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
3726 if (lvshared->maintenance_work_mem_worker > 0)
3727 maintenance_work_mem = lvshared->maintenance_work_mem_worker;
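/*
 * Override the worker's maintenance_work_mem with the per-worker share the
 * leader computed, so index AMs running in this worker see the reduced
 * budget.
 */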
3729 /*
3730 * Initialize vacrel for use as error callback arg by parallel worker.
3731 */
3732 vacrel.relnamespace = get_namespace_name(RelationGetNamespace(rel));
3733 vacrel.relname = pstrdup(RelationGetRelationName(rel));
3734 vacrel.indname = NULL;
3735 vacrel.phase = VACUUM_ERRCB_PHASE_UNKNOWN; /* Not yet processing */
3736 vacrel.dead_tuples = dead_tuples;
3738 /* Setup error traceback support for ereport() */
3739 errcallback.callback = vacuum_error_callback;
3740 errcallback.arg = &vacrel;
3741 errcallback.previous = error_context_stack;
3742 error_context_stack = &errcallback;
3744 /* Prepare to track buffer usage during parallel execution */
3745 InstrStartParallelQuery();
3747 /* Process indexes to perform vacuum/cleanup */
3748 do_parallel_processing(&vacrel, lvshared);
3750 /* Report buffer/WAL usage during parallel execution */
3751 buffer_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_BUFFER_USAGE, false);
3752 wal_usage = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_WAL_USAGE, false);
3753 InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber],
3754 &wal_usage[ParallelWorkerNumber]);
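/*
 * The worker's buffer and WAL counters are written to its
 * ParallelWorkerNumber slot so that the leader can add them to its own
 * totals once the workers have exited.
 */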
3756 /* Pop the error context stack */
3757 error_context_stack = errcallback.previous;
3759 vac_close_indexes(nindexes, indrels, RowExclusiveLock);
3760 table_close(rel, ShareUpdateExclusiveLock);
3761 pfree(vacrel.indstats);
3762 }
3764 /*
3765 * Error context callback for errors occurring during vacuum.
3766 */
3767 static void
3768 vacuum_error_callback(void *arg)
3769 {
3770 LVRelState *errinfo = arg;
3772 switch (errinfo->phase)
3773 {
3774 case VACUUM_ERRCB_PHASE_SCAN_HEAP:
3775 if (BlockNumberIsValid(errinfo->blkno))
3776 {
3777 if (OffsetNumberIsValid(errinfo->offnum))
3778 errcontext("while scanning block %u and offset %u of relation \"%s.%s\"",
3779 errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
3780 else
3781 errcontext("while scanning block %u of relation \"%s.%s\"",
3782 errinfo->blkno, errinfo->relnamespace, errinfo->relname);
3783 }
3784 else
3785 errcontext("while scanning relation \"%s.%s\"",
3786 errinfo->relnamespace, errinfo->relname);
3787 break;
3789 case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
3790 if (BlockNumberIsValid(errinfo->blkno))
3791 {
3792 if (OffsetNumberIsValid(errinfo->offnum))
3793 errcontext("while vacuuming block %u and offset %u of relation \"%s.%s\"",
3794 errinfo->blkno, errinfo->offnum, errinfo->relnamespace, errinfo->relname);
3795 else
3796 errcontext("while vacuuming block %u of relation \"%s.%s\"",
3797 errinfo->blkno, errinfo->relnamespace, errinfo->relname);
3798 }
3799 else
3800 errcontext("while vacuuming relation \"%s.%s\"",
3801 errinfo->relnamespace, errinfo->relname);
3802 break;
3804 case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
3805 errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
3806 errinfo->indname, errinfo->relnamespace, errinfo->relname);
3807 break;
3809 case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
3810 errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
3811 errinfo->indname, errinfo->relnamespace, errinfo->relname);
3812 break;
3814 case VACUUM_ERRCB_PHASE_TRUNCATE:
3815 if (BlockNumberIsValid(errinfo->blkno))
3816 errcontext("while truncating relation \"%s.%s\" to %u blocks",
3817 errinfo->relnamespace, errinfo->relname, errinfo->blkno);
3818 break;
3820 case VACUUM_ERRCB_PHASE_UNKNOWN:
3821 default:
3822 return; /* do nothing; the errinfo may not be
3823 * initialized */
3824 }
3825 }
3827 /*
3828 * Updates the information required for the vacuum error callback. This also
3829 * saves the current information, which can later be restored via restore_vacuum_error_info.
3830 */
3831 static void
3832 update_vacuum_error_info(LVRelState *vacrel, LVSavedErrInfo *saved_vacrel,
3833 int phase, BlockNumber blkno, OffsetNumber offnum)
3834 {
3835 if (saved_vacrel)
3836 {
3837 saved_vacrel->offnum = vacrel->offnum;
3838 saved_vacrel->blkno = vacrel->blkno;
3839 saved_vacrel->phase = vacrel->phase;
3840 }
3842 vacrel->blkno = blkno;
3843 vacrel->offnum = offnum;
3844 vacrel->phase = phase;
3845 }
3847 /*
3848 * Restores the vacuum information saved via a prior call to update_vacuum_error_info.
3849 */
3850 static void
3851 restore_vacuum_error_info(LVRelState *vacrel,
3852 const LVSavedErrInfo *saved_vacrel)
3853 {
3854 vacrel->blkno = saved_vacrel->blkno;
3855 vacrel->offnum = saved_vacrel->offnum;
3856 vacrel->phase = saved_vacrel->phase;
3857 }
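/*
 * Callers of the two helpers above typically bracket a phase change like
 * this (illustrative sketch, not a verbatim call site from this file):
 *
 *     LVSavedErrInfo saved_err_info;
 *
 *     update_vacuum_error_info(vacrel, &saved_err_info,
 *                              VACUUM_ERRCB_PHASE_VACUUM_INDEX,
 *                              InvalidBlockNumber, InvalidOffsetNumber);
 *     ... index vacuuming work ...
 *     restore_vacuum_error_info(vacrel, &saved_err_info);
 */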