/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *	  Implementation of Lehman and Yao's btree management algorithm for
 *	  Postgres.
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  $PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "catalog/storage.h"
#include "commands/vacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "utils/memutils.h"

/* Working state for btbuild and its callback */
typedef struct
{
	bool		isUnique;
	bool		haveDead;
	Relation	heapRel;
	BTSpool    *spool;

	/*
	 * spool2 is needed only when the index is a unique index. Dead tuples
	 * are put into spool2 instead of spool in order to avoid the uniqueness
	 * check.
	 */
	BTSpool    *spool2;
	double		indtuples;
} BTBuildState;

/* Working state needed by btvacuumpage */
typedef struct
{
	IndexVacuumInfo *info;
	IndexBulkDeleteResult *stats;
	IndexBulkDeleteCallback callback;
	void	   *callback_state;
	BTCycleId	cycleid;
	BlockNumber lastUsedPage;
	BlockNumber totFreePages;	/* true total # of free pages */
	MemoryContext pagedelcontext;
} BTVacState;

static void btbuildCallback(Relation index,
				HeapTuple htup,
				Datum *values,
				bool *isnull,
				bool tupleIsAlive,
				void *state);
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
			 BlockNumber orig_blkno);

/*
 *	btbuild() -- build a new btree index.
 */
Datum
btbuild(PG_FUNCTION_ARGS)
{
	Relation	heap = (Relation) PG_GETARG_POINTER(0);
	Relation	index = (Relation) PG_GETARG_POINTER(1);
	IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
	IndexBuildResult *result;
	double		reltuples;
	BTBuildState buildstate;

	buildstate.isUnique = indexInfo->ii_Unique;
	buildstate.haveDead = false;
	buildstate.heapRel = heap;
	buildstate.spool = NULL;
	buildstate.spool2 = NULL;
	buildstate.indtuples = 0;

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
		ResetUsage();
#endif   /* BTREE_BUILD_STATS */

	/*
	 * We expect to be called exactly once for any index relation. If that's
	 * not the case, big trouble's what we have.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);

	/*
	 * If building a unique index, put dead tuples in a second spool to keep
	 * them out of the uniqueness check.
	 */
	if (indexInfo->ii_Unique)
		buildstate.spool2 = _bt_spoolinit(index, false, true);

	/* do the heap scan */
	reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
								   btbuildCallback, (void *) &buildstate);

	/* okay, all heap tuples are indexed */
	if (buildstate.spool2 && !buildstate.haveDead)
	{
		/* spool2 turns out to be unnecessary */
		_bt_spooldestroy(buildstate.spool2);
		buildstate.spool2 = NULL;
	}

	/*
	 * Finish the build by (1) completing the sort of the spool file, (2)
	 * inserting the sorted tuples into btree pages and (3) building the upper
	 * levels.
	 */
	_bt_leafbuild(buildstate.spool, buildstate.spool2);
	_bt_spooldestroy(buildstate.spool);
	if (buildstate.spool2)
		_bt_spooldestroy(buildstate.spool2);

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
	{
		ShowUsage("BTREE BUILD STATS");
		ResetUsage();
	}
#endif   /* BTREE_BUILD_STATS */

	/*
	 * If we are reindexing a pre-existing index, it is critical to send out a
	 * relcache invalidation SI message to ensure all backends re-read the
	 * index metapage.  We expect that the caller will ensure that happens
	 * (typically as a side effect of updating index stats, but it must happen
	 * even if the stats don't change!)
	 */

	/*
	 * Return statistics
	 */
	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

	result->heap_tuples = reltuples;
	result->index_tuples = buildstate.indtuples;

	PG_RETURN_POINTER(result);
}

/*
 * Per-tuple callback from IndexBuildHeapScan
 */
static void
btbuildCallback(Relation index,
				HeapTuple htup,
				Datum *values,
				bool *isnull,
				bool tupleIsAlive,
				void *state)
{
	BTBuildState *buildstate = (BTBuildState *) state;
	IndexTuple	itup;

	/* form an index tuple and point it at the heap tuple */
	itup = index_form_tuple(RelationGetDescr(index), values, isnull);
	itup->t_tid = htup->t_self;

	/*
	 * insert the index tuple into the appropriate spool file for subsequent
	 * processing
	 */
	if (tupleIsAlive || buildstate->spool2 == NULL)
		_bt_spool(itup, buildstate->spool);
	else
	{
		/* dead tuples are put into spool2 */
		buildstate->haveDead = true;
		_bt_spool(itup, buildstate->spool2);
	}

	buildstate->indtuples += 1;

	pfree(itup);
}

/*
 *	btinsert() -- insert an index tuple into a btree.
 *
 *		Descend the tree recursively, find the appropriate location for our
 *		new tuple, and put it there.
 */
Datum
btinsert(PG_FUNCTION_ARGS)
{
	Relation	rel = (Relation) PG_GETARG_POINTER(0);
	Datum	   *values = (Datum *) PG_GETARG_POINTER(1);
	bool	   *isnull = (bool *) PG_GETARG_POINTER(2);
	ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
	Relation	heapRel = (Relation) PG_GETARG_POINTER(4);
	bool		checkUnique = PG_GETARG_BOOL(5);
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	_bt_doinsert(rel, itup, checkUnique, heapRel);

	pfree(itup);

	PG_RETURN_BOOL(true);
}

/*
 *	btgettuple() -- Get the next tuple in the scan.
 */
Datum
btgettuple(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		res;

	/* btree indexes are never lossy */
	scan->xs_recheck = false;

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction. If we haven't done so yet, we call a routine to
	 * get the first item in the scan.
	 */
	if (BTScanPosIsValid(so->currPos))
	{
		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, remember it for later. (We'll deal with all such tuples
			 * at once right before leaving the index page.)  The test for
			 * numKilled overrun is not just paranoia: if the caller reverses
			 * direction in the indexscan then the same item might get entered
			 * multiple times. It's not worth trying to optimize that, so we
			 * don't detect it, but instead just forget any excess entries.
			 */
			if (so->killedItems == NULL)
				so->killedItems = (int *)
					palloc(MaxIndexTuplesPerPage * sizeof(int));
			if (so->numKilled < MaxIndexTuplesPerPage)
				so->killedItems[so->numKilled++] = so->currPos.itemIndex;
		}

		/*
		 * Now continue the scan.
		 */
		res = _bt_next(scan, dir);
	}
	else
		res = _bt_first(scan, dir);

	PG_RETURN_BOOL(res);
}

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
Datum
btgetbitmap(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int64		ntids = 0;
	ItemPointer heapTid;

	/* Fetch the first page & tuple. */
	if (!_bt_first(scan, ForwardScanDirection))
	{
		/* empty scan */
		PG_RETURN_INT64(0);
	}
	/* Save tuple ID, and continue scanning */
	heapTid = &scan->xs_ctup.t_self;
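	/*
	 * As in btgettuple, btree scans are never lossy, so we pass
	 * recheck = false to tbm_add_tuples here and in the loop below.
	 */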
	tbm_add_tuples(tbm, heapTid, 1, false);
	ntids++;

	for (;;)
	{
		/*
		 * Advance to next tuple within page.  This is the same as the easy
		 * case in _bt_next().
		 */
		if (++so->currPos.itemIndex > so->currPos.lastItem)
		{
			/* let _bt_next do the heavy lifting */
			if (!_bt_next(scan, ForwardScanDirection))
				break;
		}

		/* Save tuple ID, and continue scanning */
		heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
		tbm_add_tuples(tbm, heapTid, 1, false);
		ntids++;
	}

	PG_RETURN_INT64(ntids);
}

/*
 *	btbeginscan() -- start a scan on a btree index
 */
Datum
btbeginscan(PG_FUNCTION_ARGS)
{
	Relation	rel = (Relation) PG_GETARG_POINTER(0);
	int			keysz = PG_GETARG_INT32(1);
	ScanKey		scankey = (ScanKey) PG_GETARG_POINTER(2);
	IndexScanDesc scan;
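
	/*
	 * Note that the per-scan BTScanOpaque workspace is not created here;
	 * scan->opaque is left NULL, and the first btrescan call (below) will
	 * allocate and initialize it.
	 */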

	/* get the scan */
	scan = RelationGetIndexScan(rel, keysz, scankey);

	PG_RETURN_POINTER(scan);
}

/*
 *	btrescan() -- rescan an index relation
 */
Datum
btrescan(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanKey		scankey = (ScanKey) PG_GETARG_POINTER(1);
	BTScanOpaque so;

	so = (BTScanOpaque) scan->opaque;

	if (so == NULL)				/* if called from btbeginscan */
	{
		so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
		so->currPos.buf = so->markPos.buf = InvalidBuffer;
		if (scan->numberOfKeys > 0)
			so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
		else
			so->keyData = NULL;
		so->killedItems = NULL; /* until needed */
		so->numKilled = 0;
		scan->opaque = so;
	}

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan, false);
		ReleaseBuffer(so->currPos.buf);
		so->currPos.buf = InvalidBuffer;
	}

	if (BTScanPosIsValid(so->markPos))
	{
		ReleaseBuffer(so->markPos.buf);
		so->markPos.buf = InvalidBuffer;
	}
	so->markItemIndex = -1;

	/*
	 * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
	 * - vadim 05/05/97
	 */
	if (scankey && scan->numberOfKeys > 0)
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */

	PG_RETURN_VOID();
}

/*
 *	btendscan() -- close down a scan
 */
Datum
btendscan(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan, false);
		ReleaseBuffer(so->currPos.buf);
		so->currPos.buf = InvalidBuffer;
	}

	if (BTScanPosIsValid(so->markPos))
	{
		ReleaseBuffer(so->markPos.buf);
		so->markPos.buf = InvalidBuffer;
	}
	so->markItemIndex = -1;

	if (so->killedItems != NULL)
		pfree(so->killedItems);
	if (so->keyData != NULL)
		pfree(so->keyData);
	pfree(so);

	PG_RETURN_VOID();
}

/*
 *	btmarkpos() -- save current scan position
 */
Datum
btmarkpos(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pin */
	if (BTScanPosIsValid(so->markPos))
	{
		ReleaseBuffer(so->markPos.buf);
		so->markPos.buf = InvalidBuffer;
	}

	/*
	 * Just record the current itemIndex.  If we later step to next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
	 */
	if (BTScanPosIsValid(so->currPos))
		so->markItemIndex = so->currPos.itemIndex;
	else
		so->markItemIndex = -1;

	PG_RETURN_VOID();
}

/*
 *	btrestrpos() -- restore scan to last saved position
 */
Datum
btrestrpos(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	if (so->markItemIndex >= 0)
	{
		/*
		 * The mark position is on the same page we are currently on. Just
		 * restore the itemIndex.
		 */
		so->currPos.itemIndex = so->markItemIndex;
	}
	else
	{
		/* we aren't holding any read locks, but gotta drop the pin */
		if (BTScanPosIsValid(so->currPos))
		{
			/* Before leaving current page, deal with any killed items */
			if (so->numKilled > 0 &&
				so->currPos.buf != so->markPos.buf)
				_bt_killitems(scan, false);
			ReleaseBuffer(so->currPos.buf);
			so->currPos.buf = InvalidBuffer;
		}

		if (BTScanPosIsValid(so->markPos))
		{
			/* bump pin on mark buffer for assignment to current buffer */
			IncrBufferRefCount(so->markPos.buf);
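			/*
			 * Copy the fixed fields of markPos plus its items[] entries up
			 * through lastItem; slots past lastItem are never in use, so
			 * they need not be copied.
			 */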
			memcpy(&so->currPos, &so->markPos,
				   offsetof(BTScanPosData, items[1]) +
				   so->markPos.lastItem * sizeof(BTScanPosItem));
		}
	}

	PG_RETURN_VOID();
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
btbulkdelete(PG_FUNCTION_ARGS)
{
	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
	IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
	void	   *callback_state = (void *) PG_GETARG_POINTER(3);
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/* Establish the vacuum cycle ID to use for this scan */
	/* The ENSURE stuff ensures we clean up shared memory on failure */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		cycleid = _bt_start_vacuum(rel);

		btvacuumscan(info, stats, callback, callback_state, cycleid);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);

	PG_RETURN_POINTER(stats);
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
btvacuumcleanup(PG_FUNCTION_ARGS)
{
	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		PG_RETURN_POINTER(stats);

	/*
	 * If btbulkdelete was called, we need not do anything, just return the
	 * stats from the latest btbulkdelete call.  If it wasn't called, we must
	 * still do a pass over the index, to recycle any newly-recyclable pages
	 * and to obtain index statistics.
	 *
	 * Since we aren't going to actually delete any leaf items, there's no
	 * need to go through all the vacuum-cycle-ID pushups.
	 */
	if (stats == NULL)
	{
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
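		/* no deletion pass is in progress, so passing cycle ID zero is okay */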
		btvacuumscan(info, stats, NULL, NULL, 0);
	}

	/* Finally, vacuum the FSM */
	IndexFreeSpaceMapVacuum(info->index);

	/*
	 * During a non-FULL vacuum it's quite possible for us to be fooled by
	 * concurrent page splits into double-counting some index tuples, so
	 * disbelieve any total that exceeds the underlying heap's count. (We
	 * can't check this during btbulkdelete.)
	 */
	if (!info->vacuum_full)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	PG_RETURN_POINTER(stats);
}

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	BlockNumber blkno;
	bool		needLock;

	/*
	 * Reset counts that will be incremented during the scan; needed in case
	 * of multiple scans during a single VACUUM command
	 */
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;
	vstate.lastUsedPage = BTREE_METAPAGE;
	vstate.totFreePages = 0;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "_bt_pagedel",
												  ALLOCSET_DEFAULT_MINSIZE,
												  ALLOCSET_DEFAULT_INITSIZE,
												  ALLOCSET_DEFAULT_MAXSIZE);

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  (See also vacuumlazy.c, which has the same
	 * issue.)  Also, we need not worry if a page is added immediately after
	 * we look; the page splitting code already has write-lock on the left
	 * page before it adds a right page, so we must already have processed any
	 * tuples due to be moved into such a page.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);
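
	/* Start right after the metapage (BTREE_METAPAGE is block zero) */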
	blkno = BTREE_METAPAGE + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		/* Quit if we've scanned the whole relation */
		if (blkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; blkno < num_pages; blkno++)
		{
			btvacuumpage(&vstate, blkno, blkno);
		}
	}

	/*
	 * During VACUUM FULL, we truncate off any recyclable pages at the end of
	 * the index.  In a normal vacuum it'd be unsafe to do this except by
	 * acquiring exclusive lock on the index and then rechecking all the
	 * pages; doesn't seem worth it.
	 */
	if (info->vacuum_full && vstate.lastUsedPage < num_pages - 1)
	{
		BlockNumber new_pages = vstate.lastUsedPage + 1;

		/*
		 * Okay to truncate.
		 */
		RelationTruncate(rel, new_pages);

		/* update statistics */
		stats->pages_removed += num_pages - new_pages;
		vstate.totFreePages -= (num_pages - new_pages);
		num_pages = new_pages;
	}

	MemoryContextDelete(vstate.pagedelcontext);

	/* update statistics */
	stats->num_pages = num_pages;
	stats->pages_free = vstate.totFreePages;
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;

restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page. We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (!PageIsNew(page))
		_bt_checkpage(rel, buf);

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty).
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);
			return;
		}
	}

	/* If the page is in use, update lastUsedPage */
	if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
		vstate->lastUsedPage = blkno;

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page */
		RecordFreeIndexPage(rel, blkno);
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/* Already deleted, but can't recycle yet */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead, try to delete */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems() call
		 * per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			_bt_delitems(rel, buf, deletable, ndeletable);
			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the page has been split during this vacuum cycle, it seems
			 * worth expending a write to clear btpo_cycleid even if we don't
			 * have any deletions to do.  (If we do, _bt_delitems takes care
			 * of this.)  This ensures we won't process the page again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				SetBufferCommitInfoNeedsSave(buf);
			}
		}

		/*
		 * If it's now empty, try to delete; else count the live tuples. We
		 * don't delete when recursing, though, to avoid putting entries into
		 * freePages out-of-order (doesn't seem worth any extra code to handle
		 * the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;
		int			ndel;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full);

		/* count only this page, else may double-count parent */
		if (ndel)
			stats->pages_deleted++;

		/*
		 * During VACUUM FULL it's okay to recycle deleted pages immediately,
		 * since there can be no other transactions scanning the index. Note
		 * that we will only recycle the current page and not any parent pages
		 * that _bt_pagedel might have recursed to; this seems reasonable in
		 * the name of simplicity.  (Trying to do otherwise would mean we'd
		 * have to sort the list of recyclable pages we're building.)
		 */
		if (ndel && info->vacuum_full)
		{
			RecordFreeIndexPage(rel, blkno);
			vstate->totFreePages++;
		}

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array).  A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}