/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *    Implementation of Lehman and Yao's btree management algorithm for
 *    Postgres.
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "catalog/storage.h"
#include "commands/vacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "utils/memutils.h"
/* Working state for btbuild and its callback */
typedef struct
{
    bool        isUnique;
    bool        haveDead;
    Relation    heapRel;
    BTSpool    *spool;

    /*
     * spool2 is needed only when the index is an unique index. Dead tuples
     * are put into spool2 instead of spool in order to avoid uniqueness
     * check.
     */
    BTSpool    *spool2;
    double      indtuples;
} BTBuildState;

/* Working state needed by btvacuumpage */
typedef struct
{
    IndexVacuumInfo *info;
    IndexBulkDeleteResult *stats;
    IndexBulkDeleteCallback callback;
    void       *callback_state;
    BTCycleId   cycleid;
    BlockNumber lastUsedPage;
    BlockNumber totFreePages;   /* true total # of free pages */
    MemoryContext pagedelcontext;
} BTVacState;
static void btbuildCallback(Relation index,
                HeapTuple htup,
                Datum *values,
                bool *isnull,
                bool tupleIsAlive,
                void *state);
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state,
             BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
             BlockNumber orig_blkno);
/*
 *  btbuild() -- build a new btree index.
 */
Datum
btbuild(PG_FUNCTION_ARGS)
{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    double      reltuples;
    BTBuildState buildstate;

    buildstate.isUnique = indexInfo->ii_Unique;
    buildstate.haveDead = false;
    buildstate.heapRel = heap;
    buildstate.spool = NULL;
    buildstate.spool2 = NULL;
    buildstate.indtuples = 0;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
        ResetUsage();
#endif   /* BTREE_BUILD_STATS */

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);

    /*
     * If building a unique index, put dead tuples in a second spool to keep
     * them out of the uniqueness check.
     */
    if (indexInfo->ii_Unique)
        buildstate.spool2 = _bt_spoolinit(index, false, true);

    /* do the heap scan */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   btbuildCallback, (void *) &buildstate);

    /* okay, all heap tuples are indexed */
    if (buildstate.spool2 && !buildstate.haveDead)
    {
        /* spool2 turns out to be unnecessary */
        _bt_spooldestroy(buildstate.spool2);
        buildstate.spool2 = NULL;
    }

    /*
     * Finish the build by (1) completing the sort of the spool file, (2)
     * inserting the sorted tuples into btree pages and (3) building the upper
     * levels.
     */
    _bt_leafbuild(buildstate.spool, buildstate.spool2);
    _bt_spooldestroy(buildstate.spool);
    if (buildstate.spool2)
        _bt_spooldestroy(buildstate.spool2);

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD STATS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    /*
     * If we are reindexing a pre-existing index, it is critical to send out a
     * relcache invalidation SI message to ensure all backends re-read the
     * index metapage.  We expect that the caller will ensure that happens
     * (typically as a side effect of updating index stats, but it must happen
     * even if the stats don't change!)
     */

    /* return statistics */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    PG_RETURN_POINTER(result);
}
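/*
 * Note: BTREE_BUILD_STATS is a compile-time option; when the server is built
 * with it and the developer setting log_btree_build_stats is enabled, the
 * ResetUsage()/ShowUsage() calls above bracket the build and log its resource
 * usage.
 */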
/*
 * Per-tuple callback from IndexBuildHeapScan
 */
static void
btbuildCallback(Relation index,
                HeapTuple htup,
                Datum *values,
                bool *isnull,
                bool tupleIsAlive,
                void *state)
{
    BTBuildState *buildstate = (BTBuildState *) state;
    IndexTuple  itup;

    /* form an index tuple and point it at the heap tuple */
    itup = index_form_tuple(RelationGetDescr(index), values, isnull);
    itup->t_tid = htup->t_self;

    /*
     * insert the index tuple into the appropriate spool file for subsequent
     * processing
     */
    if (tupleIsAlive || buildstate->spool2 == NULL)
        _bt_spool(itup, buildstate->spool);
    else
    {
        /* dead tuples are put into spool2 */
        buildstate->haveDead = true;
        _bt_spool(itup, buildstate->spool2);
    }

    buildstate->indtuples += 1;

    pfree(itup);
}
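/*
 * Illustration of the spool2 case: when building a unique index over a table
 * in which a row was recently updated, the heap still contains the dead
 * pre-update version carrying the same key as the live version.  Routing the
 * dead version into spool2 keeps _bt_leafbuild from raising a spurious
 * uniqueness violation, while live versions go through spool and the normal
 * uniqueness check.
 */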
/*
 *  btinsert() -- insert an index tuple into a btree.
 *
 *      Descend the tree recursively, find the appropriate location for our
 *      new tuple, and put it there.
 */
Datum
btinsert(PG_FUNCTION_ARGS)
{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    Datum      *values = (Datum *) PG_GETARG_POINTER(1);
    bool       *isnull = (bool *) PG_GETARG_POINTER(2);
    ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
    Relation    heapRel = (Relation) PG_GETARG_POINTER(4);
    bool        checkUnique = PG_GETARG_BOOL(5);
    IndexTuple  itup;

    /* generate an index tuple */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    _bt_doinsert(rel, itup, checkUnique, heapRel);

    pfree(itup);

    PG_RETURN_BOOL(true);
}
/*
 *  btgettuple() -- Get the next tuple in the scan.
 */
Datum
btgettuple(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        res;

    /* btree indexes are never lossy */
    scan->xs_recheck = false;

    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction. If we haven't done so yet, we call a routine to
     * get the first item in the scan.
     */
    if (BTScanPosIsValid(so->currPos))
    {
        /*
         * Check to see if we should kill the previously-fetched tuple.
         */
        if (scan->kill_prior_tuple)
        {
            /*
             * Yes, remember it for later. (We'll deal with all such tuples
             * at once right before leaving the index page.)  The test for
             * numKilled overrun is not just paranoia: if the caller reverses
             * direction in the indexscan then the same item might get entered
             * multiple times. It's not worth trying to optimize that, so we
             * don't detect it, but instead just forget any excess entries.
             */
            if (so->killedItems == NULL)
                so->killedItems = (int *)
                    palloc(MaxIndexTuplesPerPage * sizeof(int));
            if (so->numKilled < MaxIndexTuplesPerPage)
                so->killedItems[so->numKilled++] = so->currPos.itemIndex;
        }

        /*
         * Now continue the scan.
         */
        res = _bt_next(scan, dir);
    }
    else
        res = _bt_first(scan, dir);

    PG_RETURN_BOOL(res);
}
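/*
 * Note on the killedItems mechanism: only the itemIndex of each known-dead
 * tuple is recorded here; the index page itself is not touched yet.  The
 * accumulated entries are applied in a single _bt_killitems() pass when the
 * scan is about to release its pin on the current page (see btrescan,
 * btendscan and btrestrpos below).
 */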
/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
Datum
btgetbitmap(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int64       ntids = 0;
    ItemPointer heapTid;

    /* Fetch the first page & tuple. */
    if (!_bt_first(scan, ForwardScanDirection))
    {
        /* empty scan */
        PG_RETURN_INT64(ntids);
    }
    /* Save tuple ID, and continue scanning */
    heapTid = &scan->xs_ctup.t_self;
    tbm_add_tuples(tbm, heapTid, 1, false);
    ntids++;

    for (;;)
    {
        /*
         * Advance to next tuple within page.  This is the same as the easy
         * case in _bt_next().
         */
        if (++so->currPos.itemIndex > so->currPos.lastItem)
        {
            /* let _bt_next do the heavy lifting */
            if (!_bt_next(scan, ForwardScanDirection))
                break;
        }

        /* Save tuple ID, and continue scanning */
        heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
        tbm_add_tuples(tbm, heapTid, 1, false);
        ntids++;
    }

    PG_RETURN_INT64(ntids);
}
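/*
 * As in btgettuple, heap TIDs are added with recheck = false, since a btree
 * scan never requires the heap tuple to be rechecked against the scan keys;
 * ntids simply counts the TIDs handed to the bitmap.
 */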
/*
 *  btbeginscan() -- start a scan on a btree index
 */
Datum
btbeginscan(PG_FUNCTION_ARGS)
{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    int         keysz = PG_GETARG_INT32(1);
    ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(2);
    IndexScanDesc scan;

    /* get the scan */
    scan = RelationGetIndexScan(rel, keysz, scankey);

    PG_RETURN_POINTER(scan);
}
/*
 *  btrescan() -- rescan an index relation
 */
Datum
btrescan(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(1);
    BTScanOpaque so;

    so = (BTScanOpaque) scan->opaque;

    if (so == NULL)             /* if called from btbeginscan */
    {
        so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
        so->currPos.buf = so->markPos.buf = InvalidBuffer;
        if (scan->numberOfKeys > 0)
            so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
        else
            so->keyData = NULL;
        so->killedItems = NULL; /* until needed */
        so->numKilled = 0;
        scan->opaque = so;
    }

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan, false);
        ReleaseBuffer(so->currPos.buf);
        so->currPos.buf = InvalidBuffer;
    }

    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }
    so->markItemIndex = -1;

    /*
     * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
     */
    if (scankey && scan->numberOfKeys > 0)
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
    so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */

    PG_RETURN_VOID();
}
/*
 *  btendscan() -- close down a scan
 */
Datum
btendscan(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan, false);
        ReleaseBuffer(so->currPos.buf);
        so->currPos.buf = InvalidBuffer;
    }

    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }
    so->markItemIndex = -1;

    if (so->killedItems != NULL)
        pfree(so->killedItems);
    if (so->keyData != NULL)
        pfree(so->keyData);
    pfree(so);

    PG_RETURN_VOID();
}
/*
 *  btmarkpos() -- save current scan position
 */
Datum
btmarkpos(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pin */
    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }

    /*
     * Just record the current itemIndex.  If we later step to next page
     * before releasing the marked position, _bt_steppage makes a full copy of
     * the currPos struct in markPos.  If (as often happens) the mark is moved
     * before we leave the page, we don't have to do that work.
     */
    if (BTScanPosIsValid(so->currPos))
        so->markItemIndex = so->currPos.itemIndex;
    else
        so->markItemIndex = -1;

    PG_RETURN_VOID();
}
/*
 *  btrestrpos() -- restore scan to last saved position
 */
Datum
btrestrpos(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    if (so->markItemIndex >= 0)
    {
        /*
         * The mark position is on the same page we are currently on. Just
         * restore the itemIndex.
         */
        so->currPos.itemIndex = so->markItemIndex;
    }
    else
    {
        /* we aren't holding any read locks, but gotta drop the pin */
        if (BTScanPosIsValid(so->currPos))
        {
            /* Before leaving current page, deal with any killed items */
            if (so->numKilled > 0 &&
                so->currPos.buf != so->markPos.buf)
                _bt_killitems(scan, false);
            ReleaseBuffer(so->currPos.buf);
            so->currPos.buf = InvalidBuffer;
        }

        if (BTScanPosIsValid(so->markPos))
        {
            /* bump pin on mark buffer for assignment to current buffer */
            IncrBufferRefCount(so->markPos.buf);
            memcpy(&so->currPos, &so->markPos,
                   offsetof(BTScanPosData, items[1]) +
                   so->markPos.lastItem * sizeof(BTScanPosItem));
        }
    }

    PG_RETURN_VOID();
}
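/*
 * Note on the memcpy length: offsetof(BTScanPosData, items[1]) covers the
 * fixed-size header of the struct plus items[0], and adding
 * markPos.lastItem * sizeof(BTScanPosItem) extends the copy through
 * items[lastItem].  Hence exactly lastItem + 1 valid entries are copied,
 * rather than the full MaxIndexTuplesPerPage-sized items array.
 */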
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
btbulkdelete(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
    void       *callback_state = (void *) PG_GETARG_POINTER(3);
    Relation    rel = info->index;
    BTCycleId   cycleid;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

    /* Establish the vacuum cycle ID to use for this scan */
    /* The ENSURE stuff ensures we clean up shared memory on failure */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);

    PG_RETURN_POINTER(stats);
}
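/*
 * Note: the PG_ENSURE_ERROR_CLEANUP block guarantees that
 * _bt_end_vacuum_callback runs if an error escapes btvacuumscan, so the
 * vacuum cycle ID registered in shared memory by _bt_start_vacuum is
 * released on the error path as well as the normal one.
 */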
/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
btvacuumcleanup(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        PG_RETURN_POINTER(stats);

    /*
     * If btbulkdelete was called, we need not do anything, just return the
     * stats from the latest btbulkdelete call.  If it wasn't called, we must
     * still do a pass over the index, to recycle any newly-recyclable pages
     * and to obtain index statistics.
     *
     * Since we aren't going to actually delete any leaf items, there's no
     * need to go through all the vacuum-cycle-ID pushups.
     */
    if (stats == NULL)
    {
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        btvacuumscan(info, stats, NULL, NULL, 0);
    }

    /* Finally, vacuum the FSM */
    IndexFreeSpaceMapVacuum(info->index);

    /*
     * During a non-FULL vacuum it's quite possible for us to be fooled by
     * concurrent page splits into double-counting some index tuples, so
     * disbelieve any total that exceeds the underlying heap's count. (We
     * can't check this during btbulkdelete.)
     */
    if (!info->vacuum_full)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    PG_RETURN_POINTER(stats);
}
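/*
 * Example of the clamp above: if concurrent page splits cause the scan to
 * count 10050 index tuples while info->num_heap_tuples reports 10000, the
 * reported num_index_tuples is limited to 10000.
 */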
/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state,
             BTCycleId cycleid)
{
    Relation    rel = info->index;
    BTVacState  vstate;
    BlockNumber num_pages;
    BlockNumber blkno;
    bool        needLock;

    /*
     * Reset counts that will be incremented during the scan; needed in case
     * of multiple scans during a single VACUUM command
     */
    stats->num_index_tuples = 0;
    stats->pages_deleted = 0;

    /* Set up info to pass down to btvacuumpage */
    vstate.info = info;
    vstate.stats = stats;
    vstate.callback = callback;
    vstate.callback_state = callback_state;
    vstate.cycleid = cycleid;
    vstate.lastUsedPage = BTREE_METAPAGE;
    vstate.totFreePages = 0;

    /* Create a temporary memory context to run _bt_pagedel in */
    vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
                                                  "_bt_pagedel",
                                                  ALLOCSET_DEFAULT_MINSIZE,
                                                  ALLOCSET_DEFAULT_INITSIZE,
                                                  ALLOCSET_DEFAULT_MAXSIZE);

    /*
     * The outer loop iterates over all index pages except the metapage, in
     * physical order (we hope the kernel will cooperate in providing
     * read-ahead for speed).  It is critical that we visit all leaf pages,
     * including ones added after we start the scan, else we might fail to
     * delete some deletable tuples.  Hence, we must repeatedly check the
     * relation length.  We must acquire the relation-extension lock while
     * doing so to avoid a race condition: if someone else is extending the
     * relation, there is a window where bufmgr/smgr have created a new
     * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
     * we manage to scan such a page here, we'll improperly assume it can be
     * recycled.  Taking the lock synchronizes things enough to prevent a
     * problem: either num_pages won't include the new page, or _bt_getbuf
     * already has write lock on the buffer and it will be fully initialized
     * before we can examine it.  (See also vacuumlazy.c, which has the same
     * issue.)  Also, we need not worry if a page is added immediately after
     * we look; the page splitting code already has write-lock on the left
     * page before it adds a right page, so we must already have processed any
     * tuples due to be moved into such a page.
     *
     * We can skip locking for new or temp relations, however, since no one
     * else could be accessing them.
     */
    needLock = !RELATION_IS_LOCAL(rel);

    blkno = BTREE_METAPAGE + 1;
    for (;;)
    {
        /* Get the current relation length */
        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);
        num_pages = RelationGetNumberOfBlocks(rel);
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Quit if we've scanned the whole relation */
        if (blkno >= num_pages)
            break;
        /* Iterate over pages, then loop back to recheck length */
        for (; blkno < num_pages; blkno++)
        {
            btvacuumpage(&vstate, blkno, blkno);
        }
    }

    /*
     * During VACUUM FULL, we truncate off any recyclable pages at the end of
     * the index.  In a normal vacuum it'd be unsafe to do this except by
     * acquiring exclusive lock on the index and then rechecking all the
     * pages; doesn't seem worth it.
     */
    if (info->vacuum_full && vstate.lastUsedPage < num_pages - 1)
    {
        BlockNumber new_pages = vstate.lastUsedPage + 1;

        /*
         * Okay to truncate.
         */
        RelationTruncate(rel, new_pages);

        /* update statistics */
        stats->pages_removed += num_pages - new_pages;
        vstate.totFreePages -= (num_pages - new_pages);
        num_pages = new_pages;
    }

    MemoryContextDelete(vstate.pagedelcontext);

    /* update statistics */
    stats->num_pages = num_pages;
    stats->pages_free = vstate.totFreePages;
}
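/*
 * Example of the length recheck above: if the index had 100 pages when the
 * scan began and a concurrent split extends it to 105 pages while blocks
 * 1-99 are being processed, the next trip through the outer loop picks up
 * blocks 100-104 before the scan finishes.
 */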
/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
    IndexVacuumInfo *info = vstate->info;
    IndexBulkDeleteResult *stats = vstate->stats;
    IndexBulkDeleteCallback callback = vstate->callback;
    void       *callback_state = vstate->callback_state;
    Relation    rel = info->index;
    bool        delete_now;
    BlockNumber recurse_to;
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;

restart:
    delete_now = false;
    recurse_to = P_NONE;

    /* call vacuum_delay_point while not holding any buffer lock */
    vacuum_delay_point();

    /*
     * We can't use _bt_getbuf() here because it always applies
     * _bt_checkpage(), which will barf on an all-zero page. We want to
     * recycle all-zero pages, not fail.  Also, we want to use a nondefault
     * buffer access strategy.
     */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                             info->strategy);
    LockBuffer(buf, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (!PageIsNew(page))
        _bt_checkpage(rel, buf);

    /*
     * If we are recursing, the only case we want to do anything with is a
     * live leaf page having the current vacuum cycle ID.  Any other state
     * implies we already saw the page (eg, deleted it as being empty).
     */
    if (blkno != orig_blkno)
    {
        if (_bt_page_recyclable(page) ||
            P_IGNORE(opaque) ||
            !P_ISLEAF(opaque) ||
            opaque->btpo_cycleid != vstate->cycleid)
        {
            _bt_relbuf(rel, buf);
            return;
        }
    }

    /* If the page is in use, update lastUsedPage */
    if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
        vstate->lastUsedPage = blkno;

    /* Page is valid, see what to do with it */
    if (_bt_page_recyclable(page))
    {
        /* Okay to recycle this page */
        RecordFreeIndexPage(rel, blkno);
        vstate->totFreePages++;
        stats->pages_deleted++;
    }
    else if (P_ISDELETED(opaque))
    {
        /* Already deleted, but can't recycle yet */
        stats->pages_deleted++;
    }
    else if (P_ISHALFDEAD(opaque))
    {
        /* Half-dead, try to delete */
        delete_now = true;
    }
    else if (P_ISLEAF(opaque))
    {
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable;
        OffsetNumber offnum,
                    minoff,
                    maxoff;

        /*
         * Trade in the initial read lock for a super-exclusive write lock on
         * this page.  We must get such a lock on every leaf page over the
         * course of the vacuum scan, whether or not it actually contains any
         * deletable tuples --- see nbtree/README.
         */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        LockBufferForCleanup(buf);

        /*
         * Check whether we need to recurse back to earlier pages.  What we
         * are concerned about is a page split that happened since we started
         * the vacuum scan.  If the split moved some tuples to a lower page
         * then we might have missed 'em.  If so, set up for tail recursion.
         * (Must do this before possibly clearing btpo_cycleid below!)
         */
        if (vstate->cycleid != 0 &&
            opaque->btpo_cycleid == vstate->cycleid &&
            !(opaque->btpo_flags & BTP_SPLIT_END) &&
            !P_RIGHTMOST(opaque) &&
            opaque->btpo_next < orig_blkno)
            recurse_to = opaque->btpo_next;

        /*
         * Scan over all items to see which ones need deleted according to the
         * callback function.
         */
        ndeletable = 0;
        minoff = P_FIRSTDATAKEY(opaque);
        maxoff = PageGetMaxOffsetNumber(page);
        if (callback)
        {
            for (offnum = minoff;
                 offnum <= maxoff;
                 offnum = OffsetNumberNext(offnum))
            {
                IndexTuple  itup;
                ItemPointer htup;

                itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offnum));
                htup = &(itup->t_tid);
                if (callback(htup, callback_state))
                    deletable[ndeletable++] = offnum;
            }
        }

        /*
         * Apply any needed deletes.  We issue just one _bt_delitems() call
         * per page, so as to minimize WAL traffic.
         */
        if (ndeletable > 0)
        {
            _bt_delitems(rel, buf, deletable, ndeletable);
            stats->tuples_removed += ndeletable;
            /* must recompute maxoff */
            maxoff = PageGetMaxOffsetNumber(page);
        }
        else
        {
            /*
             * If the page has been split during this vacuum cycle, it seems
             * worth expending a write to clear btpo_cycleid even if we don't
             * have any deletions to do.  (If we do, _bt_delitems takes care
             * of this.)  This ensures we won't process the page again.
             *
             * We treat this like a hint-bit update because there's no need to
             * WAL-log it.
             */
            if (vstate->cycleid != 0 &&
                opaque->btpo_cycleid == vstate->cycleid)
            {
                opaque->btpo_cycleid = 0;
                SetBufferCommitInfoNeedsSave(buf);
            }
        }

        /*
         * If it's now empty, try to delete; else count the live tuples. We
         * don't delete when recursing, though, to avoid putting entries into
         * freePages out-of-order (doesn't seem worth any extra code to handle
         * the case).
         */
        if (minoff > maxoff)
            delete_now = (blkno == orig_blkno);
        else
            stats->num_index_tuples += maxoff - minoff + 1;
    }

    if (delete_now)
    {
        MemoryContext oldcontext;
        int         ndel;

        /* Run pagedel in a temp context to avoid memory leakage */
        MemoryContextReset(vstate->pagedelcontext);
        oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

        ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full);

        /* count only this page, else may double-count parent */
        if (ndel)
            stats->pages_deleted++;

        /*
         * During VACUUM FULL it's okay to recycle deleted pages immediately,
         * since there can be no other transactions scanning the index.  Note
         * that we will only recycle the current page and not any parent pages
         * that _bt_pagedel might have recursed to; this seems reasonable in
         * the name of simplicity.  (Trying to do otherwise would mean we'd
         * have to sort the list of recyclable pages we're building.)
         */
        if (ndel && info->vacuum_full)
        {
            RecordFreeIndexPage(rel, blkno);
            vstate->totFreePages++;
        }

        MemoryContextSwitchTo(oldcontext);
        /* pagedel released buffer, so we shouldn't */
    }
    else
        _bt_relbuf(rel, buf);

    /*
     * This is really tail recursion, but if the compiler is too stupid to
     * optimize it as such, we'd eat an uncomfortably large amount of stack
     * space per recursion level (due to the deletable[] array). A failure is
     * improbable since the number of levels isn't likely to be large ... but
     * just in case, let's hand-optimize into a loop.
     */
    if (recurse_to != P_NONE)
    {
        blkno = recurse_to;
        goto restart;
    }
}