/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *    Concurrent ("lazy") vacuuming.
 *
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead
 * tuple TIDs, with the next biggest need being storage for per-disk-page
 * free space info.  We want to ensure we can vacuum even the very largest
 * relations with finite memory space usage.  To do that, we set upper bounds
 * on the number of tuples and pages we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem memory space to keep
 * track of dead tuples.  We initially allocate an array of TIDs of that size,
 * with an upper limit that depends on table size (this limit ensures we don't
 * allocate a huge area uselessly for vacuuming small tables).  If the array
 * threatens to overflow, we suspend the heap scan phase and perform a pass of
 * index cleanup and page compaction, then resume the heap scan with an empty
 * TID array.
 *
 * We can limit the storage for page free space to MaxFSMPages entries,
 * since that's the most the free space map will be willing to remember
 * anyway.  If the relation has fewer than that many pages with free space,
 * life is easy: just build an array of per-page info.  If it has more,
 * we store the free space info as a heap ordered by amount of free space,
 * so that we can discard the pages with least free space to ensure we never
 * have more than MaxFSMPages entries in all.  The surviving page entries
 * are passed to the free space map at conclusion of the scan.
 *
 * If we're processing a table with no indexes, we can just vacuum each page
 * as we go; there's no need to save up multiple tuples to minimize the number
 * of index scans performed.  So we don't use maintenance_work_mem memory for
 * the TID array, just enough to hold as many heap tuples as fit on one page.
 *
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    $PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/tqual.h"

/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
 */
#define REL_TRUNCATE_MINIMUM    1000
#define REL_TRUNCATE_FRACTION   16

/*
 * Guesstimation of number of dead tuples per page.  This is used to
 * provide an upper limit to memory allocated when vacuuming small
 * tables.
 */
#define LAZY_ALLOC_TUPLES       MaxHeapTuplesPerPage

typedef struct LVRelStats
{
    /* hasindex = true means two-pass strategy; false means one-pass */
    bool        hasindex;
    /* Overall statistics about rel */
    BlockNumber rel_pages;
    double      rel_tuples;
    BlockNumber pages_removed;
    double      tuples_deleted;
    BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
    Size        threshold;      /* minimum interesting free space */
    /* List of TIDs of tuples we intend to delete */
    /* NB: this list is ordered by TID address */
    int         num_dead_tuples;    /* current # of entries */
    int         max_dead_tuples;    /* # slots allocated in array */
    ItemPointer dead_tuples;    /* array of ItemPointerData */
    /* Array or heap of per-page info about free space */
    /* We use a simple array until it fills up, then convert to heap */
    bool        fs_is_heap;     /* are we using heap organization? */
    int         num_free_pages; /* current # of entries */
    int         max_free_pages; /* # slots allocated in array */
    FSMPageData *free_pages;    /* array or heap of blkno/avail */
    BlockNumber tot_free_pages; /* total pages with >= threshold space */
    int         num_index_scans;
} LVRelStats;

/* A few variables that don't seem worth passing around as parameters */
static int  elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;
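
/* Buffer access strategy supplied by the caller; used for every heap and index read below */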
static BufferAccessStrategy vac_strategy;

/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
               Relation *Irel, int nindexes);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
static void lazy_vacuum_index(Relation indrel,
               IndexBulkDeleteResult **stats,
               LVRelStats *vacrelstats);
static void lazy_cleanup_index(Relation indrel,
               IndexBulkDeleteResult *stats,
               LVRelStats *vacrelstats);
static int  lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
               int tupindex, LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
               LVRelStats *vacrelstats);
static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
               ItemPointer itemptr);
static void lazy_record_free_space(LVRelStats *vacrelstats,
               BlockNumber page, Size avail);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static void lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats);
static int  vac_cmp_itemptr(const void *left, const void *right);
static int  vac_cmp_page_spaces(const void *left, const void *right);

/*
 *  lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *      This routine vacuums a single heap, cleans out its indexes, and
 *      updates its relpages and reltuples statistics.
 *
 *      At entry, we have already established a transaction and opened
 *      and locked the relation.
 */
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
                BufferAccessStrategy bstrategy)
{
    LVRelStats *vacrelstats;
    Relation   *Irel;
    int         nindexes;
    BlockNumber possibly_freeable;
    PGRUsage    ru0;
    TimestampTz starttime = 0;

    pg_rusage_init(&ru0);

    /* measure elapsed time iff autovacuum logging requires it */
    if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0)
        starttime = GetCurrentTimestamp();

    if (vacstmt->verbose)
        elevel = INFO;
    else
        elevel = DEBUG2;

    vac_strategy = bstrategy;

    vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
                          &OldestXmin, &FreezeLimit);

    vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

    /* Set threshold for interesting free space = average request size */
    /* XXX should we scale it up or down?  Adjust vacuum.c too, if so */
    vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node);

    vacrelstats->num_index_scans = 0;

    /* Open all indexes of the relation */
    vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
    vacrelstats->hasindex = (nindexes > 0);

    /* Do the vacuuming */
    lazy_scan_heap(onerel, vacrelstats, Irel, nindexes);

    /* Done with indexes */
    vac_close_indexes(nindexes, Irel, NoLock);

    /*
     * Optionally truncate the relation.
     *
     * Don't even think about it unless we have a shot at releasing a goodly
     * number of pages.  Otherwise, the time taken isn't worth it.
     */
    possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
    if (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
        possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
        lazy_truncate_heap(onerel, vacrelstats);

    /* Update shared free space map with final free space info */
    lazy_update_fsm(onerel, vacrelstats);

    if (vacrelstats->tot_free_pages > MaxFSMPages)
        ereport(WARNING,
                (errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space",
                        get_namespace_name(RelationGetNamespace(onerel)),
                        RelationGetRelationName(onerel)),
                 /* Only suggest VACUUM FULL if > 20% free */
                 (vacrelstats->tot_free_pages > vacrelstats->rel_pages * 0.20) ?
                 errhint("Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\".") :
                 errhint("Consider increasing the configuration parameter \"max_fsm_pages\".")));

    /* Update statistics in pg_class */
    vac_update_relstats(RelationGetRelid(onerel),
                        vacrelstats->rel_pages,
                        vacrelstats->rel_tuples,
                        vacrelstats->hasindex,
                        FreezeLimit);

    /* report results to the stats collector, too */
    pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
                         vacstmt->analyze, vacrelstats->rel_tuples);

    /* and log the action if appropriate */
    if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
    {
        if (Log_autovacuum_min_duration == 0 ||
            TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
                                       Log_autovacuum_min_duration))
            ereport(LOG,
                    (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
                            "pages: %d removed, %d remain\n"
                            "tuples: %.0f removed, %.0f remain\n"
                            "system usage: %s",
                            get_database_name(MyDatabaseId),
                            get_namespace_name(RelationGetNamespace(onerel)),
                            RelationGetRelationName(onerel),
                            vacrelstats->num_index_scans,
                            vacrelstats->pages_removed, vacrelstats->rel_pages,
                            vacrelstats->tuples_deleted, vacrelstats->rel_tuples,
                            pg_rusage_show(&ru0))));
    }
}

/*
 *  lazy_scan_heap() -- scan an open heap relation
 *
 *      This routine sets commit status bits, builds lists of dead tuples
 *      and pages with free space, and calculates statistics on the number
 *      of live tuples in the heap.  When done, or when we run low on space
 *      for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 *
 *      If there are no indexes then we just vacuum each dirty page as we
 *      process it, since there's no point in gathering many tuples.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
               Relation *Irel, int nindexes)
{
    BlockNumber nblocks,
                blkno;
    HeapTupleData tuple;
    char       *relname;
    BlockNumber empty_pages,
                vacuumed_pages;
    double      num_tuples,
                tups_vacuumed,
                nkeep,
                nunused;
    IndexBulkDeleteResult **indstats;
    int         i;
    PGRUsage    ru0;

    pg_rusage_init(&ru0);

    relname = RelationGetRelationName(onerel);
    ereport(elevel,
            (errmsg("vacuuming \"%s.%s\"",
                    get_namespace_name(RelationGetNamespace(onerel)),
                    relname)));

    empty_pages = vacuumed_pages = 0;
    num_tuples = tups_vacuumed = nkeep = nunused = 0;

    indstats = (IndexBulkDeleteResult **)
        palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

    nblocks = RelationGetNumberOfBlocks(onerel);
    vacrelstats->rel_pages = nblocks;
    vacrelstats->nonempty_pages = 0;

    lazy_space_alloc(vacrelstats, nblocks);

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf;
        Page        page;
        OffsetNumber offnum,
                    maxoff;
        bool        tupgone,
                    hastup;
        int         prev_dead_count;
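        /* frozen[] collects the offsets of tuples frozen on this page, so they can be WAL-logged in one record */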
        OffsetNumber frozen[MaxOffsetNumber];
        int         nfrozen;

        vacuum_delay_point();

        /*
         * If we are close to overrunning the available space for dead-tuple
         * TIDs, pause and do a cycle of vacuuming before we tackle this page.
         */
        if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
            vacrelstats->num_dead_tuples > 0)
        {
            /* Remove index entries */
            for (i = 0; i < nindexes; i++)
                lazy_vacuum_index(Irel[i],
                                  &indstats[i],
                                  vacrelstats);
            /* Remove tuples from heap */
            lazy_vacuum_heap(onerel, vacrelstats);
            /* Forget the now-vacuumed tuples, and press on */
            vacrelstats->num_dead_tuples = 0;
            vacrelstats->num_index_scans++;
        }

        buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);

        /* We need buffer cleanup lock so that we can prune HOT chains. */
        LockBufferForCleanup(buf);

        page = BufferGetPage(buf);

        if (PageIsNew(page))
        {
            /*
             * An all-zeroes page could be left over if a backend extends the
             * relation but crashes before initializing the page. Reclaim such
             * pages for use.
             *
             * We have to be careful here because we could be looking at a
             * page that someone has just added to the relation and not yet
             * been able to initialize (see RelationGetBufferForTuple). To
             * protect against that, release the buffer lock, grab the
             * relation extension lock momentarily, and re-lock the buffer. If
             * the page is still uninitialized by then, it must be left over
             * from a crashed backend, and we can initialize it.
             *
             * We don't really need the relation lock when this is a new or
             * temp relation, but it's probably not worth the code space to
             * check that, since this surely isn't a critical path.
             *
             * Note: the comparable code in vacuum.c need not worry because
             * it's got exclusive lock on the whole relation.
             */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            LockRelationForExtension(onerel, ExclusiveLock);
            UnlockRelationForExtension(onerel, ExclusiveLock);
            LockBufferForCleanup(buf);
            if (PageIsNew(page))
            {
                ereport(WARNING,
                        (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
                                relname, blkno)));
                PageInit(page, BufferGetPageSize(buf), 0);
                empty_pages++;
                lazy_record_free_space(vacrelstats, blkno,
                                       PageGetHeapFreeSpace(page));
            }
            MarkBufferDirty(buf);
            UnlockReleaseBuffer(buf);
            continue;
        }

        if (PageIsEmpty(page))
        {
            empty_pages++;
            lazy_record_free_space(vacrelstats, blkno,
                                   PageGetHeapFreeSpace(page));
            UnlockReleaseBuffer(buf);
            continue;
        }

        /*
         * Prune all HOT-update chains in this page.
         *
         * We count tuples removed by the pruning step as removed by VACUUM.
         */
        tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
                                         false, false);

        /*
         * Now scan the page to collect vacuumable items and check for tuples
         * requiring freezing.
         */
        nfrozen = 0;
        hastup = false;
        prev_dead_count = vacrelstats->num_dead_tuples;
        maxoff = PageGetMaxOffsetNumber(page);
        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid;

            itemid = PageGetItemId(page, offnum);

            /* Unused items require no processing, but we count 'em */
            if (!ItemIdIsUsed(itemid))
            {
                nunused += 1;
                continue;
            }

            /* Redirect items mustn't be touched */
            if (ItemIdIsRedirected(itemid))
            {
                hastup = true;  /* this page won't be truncatable */
                continue;
            }

            ItemPointerSet(&(tuple.t_self), blkno, offnum);

            /*
             * DEAD item pointers are to be vacuumed normally; but we don't
             * count them in tups_vacuumed, else we'd be double-counting (at
             * least in the common case where heap_page_prune() just freed up
             * a non-HOT tuple).
             */
            if (ItemIdIsDead(itemid))
            {
                lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
                continue;
            }

            Assert(ItemIdIsNormal(itemid));

            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple.t_len = ItemIdGetLength(itemid);

            tupgone = false;

            switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
            {
                case HEAPTUPLE_DEAD:

                    /*
                     * Ordinarily, DEAD tuples would have been removed by
                     * heap_page_prune(), but it's possible that the tuple
                     * state changed since heap_page_prune() looked.  In
                     * particular an INSERT_IN_PROGRESS tuple could have
                     * changed to DEAD if the inserter aborted.  So this
                     * cannot be considered an error condition.
                     *
                     * If the tuple is HOT-updated then it must only be
                     * removed by a prune operation; so we keep it just as if
                     * it were RECENTLY_DEAD.  Also, if it's a heap-only
                     * tuple, we choose to keep it, because it'll be a lot
                     * cheaper to get rid of it in the next pruning pass than
                     * to treat it like an indexed tuple.
                     */
                    if (HeapTupleIsHotUpdated(&tuple) ||
                        HeapTupleIsHeapOnly(&tuple))
                        nkeep += 1;
                    else
                        tupgone = true; /* we can delete the tuple */
                    break;
                case HEAPTUPLE_LIVE:
                    /* Tuple is good --- but let's do some validity checks */
                    if (onerel->rd_rel->relhasoids &&
                        !OidIsValid(HeapTupleGetOid(&tuple)))
                        elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
                             relname, blkno, offnum);
                    break;
                case HEAPTUPLE_RECENTLY_DEAD:

                    /*
                     * If tuple is recently deleted then we must not remove it
                     * from relation.
                     */
                    nkeep += 1;
                    break;
                case HEAPTUPLE_INSERT_IN_PROGRESS:
                    /* This is an expected case during concurrent vacuum */
                    break;
                case HEAPTUPLE_DELETE_IN_PROGRESS:
                    /* This is an expected case during concurrent vacuum */
                    break;
                default:
                    elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                    break;
            }

            if (tupgone)
            {
                lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
                tups_vacuumed += 1;
            }
            else
            {
                num_tuples += 1;
                hastup = true;

                /*
                 * Each non-removable tuple must be checked to see if it needs
                 * freezing.  Note we already have exclusive buffer lock.
                 */
                if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
                                      InvalidBuffer))
                    frozen[nfrozen++] = offnum;
            }
        }                       /* scan along page */

        /*
         * If we froze any tuples, mark the buffer dirty, and write a WAL
         * record recording the changes.  We must log the changes to be
         * crash-safe against future truncation of CLOG.
         */
        if (nfrozen > 0)
        {
            MarkBufferDirty(buf);
            /* no XLOG for temp tables, though */
            if (!onerel->rd_istemp)
            {
                XLogRecPtr  recptr;

                recptr = log_heap_freeze(onerel, buf, FreezeLimit,
                                         frozen, nfrozen);
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);
            }
        }

        /*
         * If there are no indexes then we can vacuum the page right now
         * instead of doing a second scan.
         */
        if (nindexes == 0 &&
            vacrelstats->num_dead_tuples > 0)
        {
            /* Remove tuples from heap */
            lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
            /* Forget the now-vacuumed tuples, and press on */
            vacrelstats->num_dead_tuples = 0;
            vacuumed_pages++;
        }

        /*
         * If we remembered any tuples for deletion, then the page will be
         * visited again by lazy_vacuum_heap, which will compute and record
         * its post-compaction free space.  If not, then we're done with this
         * page, so remember its free space as-is.  (This path will always be
         * taken if there are no indexes.)
         */
        if (vacrelstats->num_dead_tuples == prev_dead_count)
        {
            lazy_record_free_space(vacrelstats, blkno,
                                   PageGetHeapFreeSpace(page));
        }

        /* Remember the location of the last page with nonremovable tuples */
        if (hastup)
            vacrelstats->nonempty_pages = blkno + 1;

        UnlockReleaseBuffer(buf);
    }

    /* save stats for use later */
    vacrelstats->rel_tuples = num_tuples;
    vacrelstats->tuples_deleted = tups_vacuumed;

    /* If any tuples need to be deleted, perform final vacuum cycle */
    /* XXX put a threshold on min number of tuples here? */
    if (vacrelstats->num_dead_tuples > 0)
    {
        /* Remove index entries */
        for (i = 0; i < nindexes; i++)
            lazy_vacuum_index(Irel[i],
                              &indstats[i],
                              vacrelstats);
        /* Remove tuples from heap */
        lazy_vacuum_heap(onerel, vacrelstats);
        vacrelstats->num_index_scans++;
    }

    /* Do post-vacuum cleanup and statistics update for each index */
    for (i = 0; i < nindexes; i++)
        lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);

    /* If no indexes, make log report that lazy_vacuum_heap would've made */
    if (vacuumed_pages)
        ereport(elevel,
                (errmsg("\"%s\": removed %.0f row versions in %u pages",
                        RelationGetRelationName(onerel),
                        tups_vacuumed, vacuumed_pages)));

    ereport(elevel,
            (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
                    RelationGetRelationName(onerel),
                    tups_vacuumed, num_tuples, nblocks),
             errdetail("%.0f dead row versions cannot be removed yet.\n"
                       "There were %.0f unused item pointers.\n"
                       "%u pages contain useful free space.\n"
                       "%u pages are entirely empty.\n"
                       "%s.",
                       nkeep,
                       nunused,
                       vacrelstats->tot_free_pages,
                       empty_pages,
                       pg_rusage_show(&ru0))));
}

/*
 *  lazy_vacuum_heap() -- second pass over the heap
 *
 *      This routine marks dead tuples as unused and compacts out free
 *      space on their pages.  Pages not having dead tuples recorded from
 *      lazy_scan_heap are not visited at all.
 *
 * Note: the reason for doing this as a second pass is we cannot remove
 * the tuples until we've removed their index entries, and we want to
 * process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
    int         tupindex;
    int         npages;
    PGRUsage    ru0;

    pg_rusage_init(&ru0);
    npages = 0;

    tupindex = 0;
    while (tupindex < vacrelstats->num_dead_tuples)
    {
        BlockNumber tblk;
        Buffer      buf;
        Page        page;

        vacuum_delay_point();

        tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
        buf = ReadBufferWithStrategy(onerel, tblk, vac_strategy);
        LockBufferForCleanup(buf);
        tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
        /* Now that we've compacted the page, record its available space */
        page = BufferGetPage(buf);
        lazy_record_free_space(vacrelstats, tblk,
                               PageGetHeapFreeSpace(page));
        UnlockReleaseBuffer(buf);
        npages++;
    }

    ereport(elevel,
            (errmsg("\"%s\": removed %d row versions in %d pages",
                    RelationGetRelationName(onerel),
                    tupindex, npages),
             errdetail("%s.",
                       pg_rusage_show(&ru0))));
}

/*
 *  lazy_vacuum_page() -- free dead tuples on a page
 *                        and repair its fragmentation.
 *
 * Caller must hold pin and buffer cleanup lock on the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
 */
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
                 int tupindex, LVRelStats *vacrelstats)
{
    Page        page = BufferGetPage(buffer);
    OffsetNumber unused[MaxOffsetNumber];
    int         uncnt = 0;
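
    /*
     * The page changes below and the WAL record describing them must appear
     * atomic to crash recovery, so wrap them in a critical section (an error
     * partway through would be promoted to PANIC rather than leave the page
     * inconsistent with the log).
     */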
    START_CRIT_SECTION();

    for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
    {
        BlockNumber tblk;
        OffsetNumber toff;
        ItemId      itemid;

        tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
        if (tblk != blkno)
            break;              /* past end of tuples for this block */
        toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
        itemid = PageGetItemId(page, toff);
        ItemIdSetUnused(itemid);
        unused[uncnt++] = toff;
    }

    PageRepairFragmentation(page);

    MarkBufferDirty(buffer);

    /* XLOG stuff */
    if (!onerel->rd_istemp)
    {
        XLogRecPtr  recptr;

        recptr = log_heap_clean(onerel, buffer,
                                NULL, 0, NULL, 0,
                                unused, uncnt,
                                false);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
    }

    END_CRIT_SECTION();

    return tupindex;
}

/*
 *  lazy_vacuum_index() -- vacuum one index relation.
 *
 *      Delete all the index entries pointing to tuples listed in
 *      vacrelstats->dead_tuples, and update running statistics.
 */
static void
lazy_vacuum_index(Relation indrel,
                  IndexBulkDeleteResult **stats,
                  LVRelStats *vacrelstats)
{
    IndexVacuumInfo ivinfo;
    PGRUsage    ru0;

    pg_rusage_init(&ru0);

    ivinfo.index = indrel;
    ivinfo.vacuum_full = false;
    ivinfo.message_level = elevel;
    /* We don't yet know rel_tuples, so pass -1 */
    ivinfo.num_heap_tuples = -1;
    ivinfo.strategy = vac_strategy;

    /* Do bulk deletion */
    *stats = index_bulk_delete(&ivinfo, *stats,
                               lazy_tid_reaped, (void *) vacrelstats);

    ereport(elevel,
            (errmsg("scanned index \"%s\" to remove %d row versions",
                    RelationGetRelationName(indrel),
                    vacrelstats->num_dead_tuples),
             errdetail("%s.", pg_rusage_show(&ru0))));
}

/*
 *  lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
 */
static void
lazy_cleanup_index(Relation indrel,
                   IndexBulkDeleteResult *stats,
                   LVRelStats *vacrelstats)
{
    IndexVacuumInfo ivinfo;
    PGRUsage    ru0;

    pg_rusage_init(&ru0);

    ivinfo.index = indrel;
    ivinfo.vacuum_full = false;
    ivinfo.message_level = elevel;
    ivinfo.num_heap_tuples = vacrelstats->rel_tuples;
    ivinfo.strategy = vac_strategy;

    stats = index_vacuum_cleanup(&ivinfo, stats);
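
    /* The index AM may have nothing to report, in which case it returns NULL */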
    if (!stats)
        return;

    /* now update statistics in pg_class */
    vac_update_relstats(RelationGetRelid(indrel),
                        stats->num_pages,
                        stats->num_index_tuples,
                        false, InvalidTransactionId);

    ereport(elevel,
            (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
                    RelationGetRelationName(indrel),
                    stats->num_index_tuples,
                    stats->num_pages),
             errdetail("%.0f index row versions were removed.\n"
                       "%u index pages have been deleted, %u are currently reusable.\n"
                       "%s.",
                       stats->tuples_removed,
                       stats->pages_deleted, stats->pages_free,
                       pg_rusage_show(&ru0))));

    pfree(stats);
}

/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
{
    BlockNumber old_rel_pages = vacrelstats->rel_pages;
    BlockNumber new_rel_pages;
    FSMPageData *pageSpaces;
    int         n;
    int         i,
                j;
    PGRUsage    ru0;

    pg_rusage_init(&ru0);

    /*
     * We need full exclusive lock on the relation in order to do truncation.
     * If we can't get it, give up rather than waiting --- we don't want to
     * block other backends, and we don't want to deadlock (which is quite
     * possible considering we already hold a lower-grade lock).
     */
    if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
        return;

    /*
     * Now that we have exclusive lock, look to see if the rel has grown
     * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
     * newly added pages presumably contain non-deletable tuples.
     */
    new_rel_pages = RelationGetNumberOfBlocks(onerel);
    if (new_rel_pages != old_rel_pages)
    {
        /* might as well use the latest news when we update pg_class stats */
        vacrelstats->rel_pages = new_rel_pages;
        UnlockRelation(onerel, AccessExclusiveLock);
        return;
    }

    /*
     * Scan backwards from the end to verify that the end pages actually
     * contain no tuples.  This is *necessary*, not optional, because other
     * backends could have added tuples to these pages whilst we were
     * vacuuming.
     */
    new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);

    if (new_rel_pages >= old_rel_pages)
    {
        /* can't do anything after all */
        UnlockRelation(onerel, AccessExclusiveLock);
        return;
    }

    /*
     * Okay to truncate.
     */
    RelationTruncate(onerel, new_rel_pages);

    /*
     * Note: once we have truncated, we *must* keep the exclusive lock until
     * commit.  The sinval message that will be sent at commit (as a result of
     * vac_update_relstats()) must be received by other backends, to cause
     * them to reset their rd_targblock values, before they can safely access
     * the table again.
     */

    /*
     * Drop free-space info for removed blocks; these must not get entered
     * into the FSM!
     */
    pageSpaces = vacrelstats->free_pages;
    n = vacrelstats->num_free_pages;
    j = 0;
    for (i = 0; i < n; i++)
    {
        if (FSMPageGetPageNum(&pageSpaces[i]) < new_rel_pages)
        {
            pageSpaces[j] = pageSpaces[i];
            j++;
        }
    }
    vacrelstats->num_free_pages = j;

    /*
     * If tot_free_pages was more than num_free_pages, we can't tell for sure
     * what its correct value is now, because we don't know which of the
     * forgotten pages are getting truncated.  Conservatively set it equal to
     * num_free_pages.
     */
    vacrelstats->tot_free_pages = j;

    /* We destroyed the heap ordering, so mark array unordered */
    vacrelstats->fs_is_heap = false;

    /* update statistics */
    vacrelstats->rel_pages = new_rel_pages;
    vacrelstats->pages_removed = old_rel_pages - new_rel_pages;

    ereport(elevel,
            (errmsg("\"%s\": truncated %u to %u pages",
                    RelationGetRelationName(onerel),
                    old_rel_pages, new_rel_pages),
             errdetail("%s.",
                       pg_rusage_show(&ru0))));
}

/*
 * Rescan end pages to verify that they are (still) empty of tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
    BlockNumber blkno;

    /* Strange coding of loop control is needed because blkno is unsigned */
    blkno = vacrelstats->rel_pages;
    while (blkno > vacrelstats->nonempty_pages)
    {
        Buffer      buf;
        Page        page;
        OffsetNumber offnum,
                    maxoff;
        bool        hastup;

        /*
         * We don't insert a vacuum delay point here, because we have an
         * exclusive lock on the table which we want to hold for as short a
         * time as possible.  We still need to check for interrupts however.
         */
        CHECK_FOR_INTERRUPTS();

        blkno--;

        buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);

        /* In this phase we only need shared access to the buffer */
        LockBuffer(buf, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buf);

        if (PageIsNew(page) || PageIsEmpty(page))
        {
            /* PageIsNew probably shouldn't happen... */
            UnlockReleaseBuffer(buf);
            continue;
        }

        hastup = false;
        maxoff = PageGetMaxOffsetNumber(page);
        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid;

            itemid = PageGetItemId(page, offnum);

            /*
             * Note: any non-unused item should be taken as a reason to keep
             * this page.  We formerly thought that DEAD tuples could be
             * thrown away, but that's not so, because we'd not have cleaned
             * out their index entries.
             */
            if (ItemIdIsUsed(itemid))
            {
                hastup = true;
                break;          /* can stop scanning */
            }
        }                       /* scan along page */

        UnlockReleaseBuffer(buf);

        /* Done scanning if we found a tuple here */
        if (hastup)
            return blkno + 1;
    }

    /*
     * If we fall out of the loop, all the previously-thought-to-be-empty
     * pages still are; we need not bother to look at the last known-nonempty
     * page.
     */
    return vacrelstats->nonempty_pages;
}

/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
{
    long        maxtuples;
    int         maxpages;

    if (vacrelstats->hasindex)
    {
        maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
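        /* clamp: max_dead_tuples is an int, and palloc() cannot allocate more than MaxAllocSize bytes */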
        maxtuples = Min(maxtuples, INT_MAX);
        maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));

        /* curious coding here to ensure the multiplication can't overflow */
        if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
            maxtuples = relblocks * LAZY_ALLOC_TUPLES;

        /* stay sane if small maintenance_work_mem */
        maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
    }
    else
    {
        maxtuples = MaxHeapTuplesPerPage;
    }

    vacrelstats->num_dead_tuples = 0;
    vacrelstats->max_dead_tuples = (int) maxtuples;
    vacrelstats->dead_tuples = (ItemPointer)
        palloc(maxtuples * sizeof(ItemPointerData));

    maxpages = MaxFSMPages;
    maxpages = Min(maxpages, MaxAllocSize / sizeof(FSMPageData));
    /* No need to allocate more pages than the relation has blocks */
    if (relblocks < (BlockNumber) maxpages)
        maxpages = (int) relblocks;

    vacrelstats->fs_is_heap = false;
    vacrelstats->num_free_pages = 0;
    vacrelstats->max_free_pages = maxpages;
    vacrelstats->free_pages = (FSMPageData *)
        palloc(maxpages * sizeof(FSMPageData));
    vacrelstats->tot_free_pages = 0;
}

/*
 * lazy_record_dead_tuple - remember one deletable tuple
 */
static void
lazy_record_dead_tuple(LVRelStats *vacrelstats,
                       ItemPointer itemptr)
{
    /*
     * The array shouldn't overflow under normal behavior, but perhaps it
     * could if we are given a really small maintenance_work_mem. In that
     * case, just forget the last few tuples (we'll get 'em next time).
     */
    if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
    {
        vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
        vacrelstats->num_dead_tuples++;
    }
}

/*
 * lazy_record_free_space - remember free space on one page
 */
static void
lazy_record_free_space(LVRelStats *vacrelstats,
                       BlockNumber page,
                       Size avail)
{
    FSMPageData *pageSpaces;
    int         n;

    /*
     * A page with less than stats->threshold free space will be forgotten
     * immediately, and never passed to the free space map.  Removing the
     * uselessly small entries early saves cycles, and in particular reduces
     * the amount of time we spend holding the FSM lock when we finally call
     * RecordRelationFreeSpace.  Since the FSM will probably drop pages with
     * little free space anyway, there's no point in making this really small.
     *
     * XXX Is it worth trying to measure average tuple size, and using that to
     * adjust the threshold?  Would be worthwhile if FSM has no stats yet for
     * this relation.  But changing the threshold as we scan the rel might
     * lead to bizarre behavior, too.  Also, it's probably better if vacuum.c
     * has the same thresholding behavior as we do here.
     */
    if (avail < vacrelstats->threshold)
        return;

    /* Count all pages over threshold, even if not enough space in array */
    vacrelstats->tot_free_pages++;

    /* Copy pointers to local variables for notational simplicity */
    pageSpaces = vacrelstats->free_pages;
    n = vacrelstats->max_free_pages;

    /* If we haven't filled the array yet, just keep adding entries */
    if (vacrelstats->num_free_pages < n)
    {
        FSMPageSetPageNum(&pageSpaces[vacrelstats->num_free_pages], page);
        FSMPageSetSpace(&pageSpaces[vacrelstats->num_free_pages], avail);
        vacrelstats->num_free_pages++;
        return;
    }

    /*----------
     * The rest of this routine works with "heap" organization of the
     * free space arrays, wherein we maintain the heap property
     *          avail[(j-1) div 2] <= avail[j]  for 0 < j < n.
     * In particular, the zero'th element always has the smallest available
     * space and can be discarded to make room for a new page with more space.
     * See Knuth's discussion of heap-based priority queues, sec 5.2.3;
     * but note he uses 1-origin array subscripts, not 0-origin.
     *----------
     */

    /* If we haven't yet converted the array to heap organization, do it */
    if (!vacrelstats->fs_is_heap)
    {
        /*
         * Scan backwards through the array, "sift-up" each value into its
         * correct position.  We can start the scan at n/2-1 since each entry
         * above that position has no children to worry about.
         */
        int         l = n / 2;

        while (--l >= 0)
        {
            BlockNumber R = FSMPageGetPageNum(&pageSpaces[l]);
            Size        K = FSMPageGetSpace(&pageSpaces[l]);
            int         i;      /* i is where the "hole" is */

            i = l;
            for (;;)
            {
                int         j = 2 * i + 1;

                if (j >= n)
                    break;
                if (j + 1 < n && FSMPageGetSpace(&pageSpaces[j]) > FSMPageGetSpace(&pageSpaces[j + 1]))
                    j++;
                if (K <= FSMPageGetSpace(&pageSpaces[j]))
                    break;
                pageSpaces[i] = pageSpaces[j];
                i = j;
            }
            FSMPageSetPageNum(&pageSpaces[i], R);
            FSMPageSetSpace(&pageSpaces[i], K);
        }

        vacrelstats->fs_is_heap = true;
    }

    /* If new page has more than zero'th entry, insert it into heap */
    if (avail > FSMPageGetSpace(&pageSpaces[0]))
    {
        /*
         * Notionally, we replace the zero'th entry with the new data, and
         * then sift-up to maintain the heap property.  Physically, the new
         * data doesn't get stored into the arrays until we find the right
         * location for it.
         */
        int         i = 0;      /* i is where the "hole" is */

        for (;;)
        {
            int         j = 2 * i + 1;

            if (j >= n)
                break;
            if (j + 1 < n && FSMPageGetSpace(&pageSpaces[j]) > FSMPageGetSpace(&pageSpaces[j + 1]))
                j++;
            if (avail <= FSMPageGetSpace(&pageSpaces[j]))
                break;
            pageSpaces[i] = pageSpaces[j];
            i = j;
        }
        FSMPageSetPageNum(&pageSpaces[i], page);
        FSMPageSetSpace(&pageSpaces[i], avail);
    }
}

/*
 *  lazy_tid_reaped() -- is a particular tid deletable?
 *
 *      This has the right signature to be an IndexBulkDeleteCallback.
 *
 *      Assumes dead_tuples array is in sorted order.
 */
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
    LVRelStats *vacrelstats = (LVRelStats *) state;
    ItemPointer res;

    res = (ItemPointer) bsearch((void *) itemptr,
                                (void *) vacrelstats->dead_tuples,
                                vacrelstats->num_dead_tuples,
                                sizeof(ItemPointerData),
                                vac_cmp_itemptr);

    return (res != NULL);
}

/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats)
{
    FSMPageData *pageSpaces = vacrelstats->free_pages;
    int         nPages = vacrelstats->num_free_pages;

    /*
     * Sort data into order, as required by RecordRelationFreeSpace.
     */
    if (nPages > 1)
        qsort(pageSpaces, nPages, sizeof(FSMPageData),
              vac_cmp_page_spaces);

    RecordRelationFreeSpace(&onerel->rd_node, vacrelstats->tot_free_pages,
                            nPages, pageSpaces);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
    BlockNumber lblk,
                rblk;
    OffsetNumber loff,
                roff;

    lblk = ItemPointerGetBlockNumber((ItemPointer) left);
    rblk = ItemPointerGetBlockNumber((ItemPointer) right);

    if (lblk < rblk)
        return -1;
    if (lblk > rblk)
        return 1;

    loff = ItemPointerGetOffsetNumber((ItemPointer) left);
    roff = ItemPointerGetOffsetNumber((ItemPointer) right);

    if (loff < roff)
        return -1;
    if (loff > roff)
        return 1;

    return 0;
}

static int
vac_cmp_page_spaces(const void *left, const void *right)
{
    FSMPageData *linfo = (FSMPageData *) left;
    FSMPageData *rinfo = (FSMPageData *) right;
    BlockNumber lblkno = FSMPageGetPageNum(linfo);
    BlockNumber rblkno = FSMPageGetPageNum(rinfo);

    if (lblkno < rblkno)
        return -1;
    else if (lblkno > rblkno)
        return 1;
    return 0;
}