/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *    Implementation of Lehman and Yao's btree management algorithm for
 *    Postgres.
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"

/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
 * scan to advance it via another call to _bt_first.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 */
typedef enum
{
    BTPARALLEL_NOT_INITIALIZED,
    BTPARALLEL_NEED_PRIMSCAN,
    BTPARALLEL_ADVANCING,
    BTPARALLEL_IDLE,
    BTPARALLEL_DONE,
} BTPS_State;
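
/*
 * Informal sketch of the btps_pageStatus lifecycle, as driven by the
 * functions further down in this file: a scan starts out
 * BTPARALLEL_NOT_INITIALIZED; the first backend to call _bt_parallel_seize()
 * moves it to BTPARALLEL_ADVANCING and reads a page; _bt_parallel_release()
 * then moves it to BTPARALLEL_IDLE so another backend can advance the scan.
 * Scans with array keys may pass through BTPARALLEL_NEED_PRIMSCAN (set by
 * _bt_parallel_primscan_schedule) before a backend calling from _bt_first
 * starts the next primitive index scan.  _bt_parallel_done() moves the scan
 * to BTPARALLEL_DONE once no pages remain.
 */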

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
    BlockNumber btps_nextScanPage;  /* next page to be scanned */
    BlockNumber btps_lastCurrPage;  /* page whose sibling link was copied into
                                     * btps_nextScanPage */
    BTPS_State  btps_pageStatus;    /* indicates whether next page is
                                     * available for scan. see above for
                                     * possible states of parallel scan. */
    slock_t     btps_mutex;         /* protects above variables, btps_arrElems */
    ConditionVariable btps_cv;      /* used to synchronize parallel scan */

    /*
     * btps_arrElems is used when scans need to schedule another primitive
     * index scan.  Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
     */
    int         btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;
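
/*
 * Note: a BTParallelScanDescData lives in the shared ParallelIndexScanDesc
 * area; backends locate it with OffsetToPointer(parallel_scan,
 * parallel_scan->ps_offset), as the parallel-scan routines below do.  Its
 * trailing btps_arrElems[] array is sized by btestimateparallelscan().
 */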

static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
                         IndexBulkDeleteCallback callback, void *callback_state,
                         BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
                                          IndexTuple posting,
                                          OffsetNumber updatedoffset,
                                          int *nremaining);

/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = BTMaxStrategyNumber;
    amroutine->amsupport = BTNProcs;
    amroutine->amoptsprocnum = BTOPTIONS_PROC;
    amroutine->amcanorder = true;
    amroutine->amcanorderbyop = false;
    amroutine->amcanbackward = true;
    amroutine->amcanunique = true;
    amroutine->amcanmulticol = true;
    amroutine->amoptionalkey = true;
    amroutine->amsearcharray = true;
    amroutine->amsearchnulls = true;
    amroutine->amstorage = false;
    amroutine->amclusterable = true;
    amroutine->ampredlocks = true;
    amroutine->amcanparallel = true;
    amroutine->amcanbuildparallel = true;
    amroutine->amcaninclude = true;
    amroutine->amusemaintenanceworkmem = false;
    amroutine->amsummarizing = false;
    amroutine->amparallelvacuumoptions =
        VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
    amroutine->amkeytype = InvalidOid;

    amroutine->ambuild = btbuild;
    amroutine->ambuildempty = btbuildempty;
    amroutine->aminsert = btinsert;
    amroutine->aminsertcleanup = NULL;
    amroutine->ambulkdelete = btbulkdelete;
    amroutine->amvacuumcleanup = btvacuumcleanup;
    amroutine->amcanreturn = btcanreturn;
    amroutine->amcostestimate = btcostestimate;
    amroutine->amgettreeheight = btgettreeheight;
    amroutine->amoptions = btoptions;
    amroutine->amproperty = btproperty;
    amroutine->ambuildphasename = btbuildphasename;
    amroutine->amvalidate = btvalidate;
    amroutine->amadjustmembers = btadjustmembers;
    amroutine->ambeginscan = btbeginscan;
    amroutine->amrescan = btrescan;
    amroutine->amgettuple = btgettuple;
    amroutine->amgetbitmap = btgetbitmap;
    amroutine->amendscan = btendscan;
    amroutine->ammarkpos = btmarkpos;
    amroutine->amrestrpos = btrestrpos;
    amroutine->amestimateparallelscan = btestimateparallelscan;
    amroutine->aminitparallelscan = btinitparallelscan;
    amroutine->amparallelrescan = btparallelrescan;

    PG_RETURN_POINTER(amroutine);
}

/*
 *  btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
    bool        allequalimage = _bt_allequalimage(index, false);
    BulkWriteState *bulkstate;
    BulkWriteBuffer metabuf;

    bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);

    /* Construct metapage. */
    metabuf = smgr_bulk_get_buf(bulkstate);
    _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
    smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);

    smgr_bulk_finish(bulkstate);
}

/*
 *  btinsert() -- insert an index tuple into a btree.
 *
 *      Descend the tree recursively, find the appropriate location for our
 *      new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
         ItemPointer ht_ctid, Relation heapRel,
         IndexUniqueCheck checkUnique,
         bool indexUnchanged,
         IndexInfo *indexInfo)
{
    bool        result;
    IndexTuple  itup;

    /* generate an index tuple */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

    pfree(itup);

    return result;
}

/*
 *  btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        res;

    /* btree indexes are never lossy */
    scan->xs_recheck = false;

    /* Each loop iteration performs another primitive index scan */
    do
    {
        /*
         * If we've already initialized this scan, we can just advance it in
         * the appropriate direction.  If we haven't done so yet, we call
         * _bt_first() to get the first item in the scan.
         */
        if (!BTScanPosIsValid(so->currPos))
            res = _bt_first(scan, dir);
        else
        {
            /*
             * Check to see if we should kill the previously-fetched tuple.
             */
            if (scan->kill_prior_tuple)
            {
                /*
                 * Yes, remember it for later.  (We'll deal with all such
                 * tuples at once right before leaving the index page.)  The
                 * test for numKilled overrun is not just paranoia: if the
                 * caller reverses direction in the indexscan then the same
                 * item might get entered multiple times.  It's not worth
                 * trying to optimize that, so we don't detect it, but instead
                 * just forget any excess entries.
                 */
                if (so->killedItems == NULL)
                    so->killedItems = (int *)
                        palloc(MaxTIDsPerBTreePage * sizeof(int));
                if (so->numKilled < MaxTIDsPerBTreePage)
                    so->killedItems[so->numKilled++] = so->currPos.itemIndex;
            }

            /*
             * Now continue the scan.
             */
            res = _bt_next(scan, dir);
        }

        /* If we have a tuple, return it ... */
        if (res)
            break;
        /* ... otherwise see if we need another primitive index scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));

    return res;
}

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int64       ntids = 0;
    ItemPointer heapTid;

    /* Each loop iteration performs another primitive index scan */
    do
    {
        /* Fetch the first page & tuple */
        if (_bt_first(scan, ForwardScanDirection))
        {
            /* Save tuple ID, and continue scanning */
            heapTid = &scan->xs_heaptid;
            tbm_add_tuples(tbm, heapTid, 1, false);
            ntids++;

            for (;;)
            {
                /*
                 * Advance to next tuple within page.  This is the same as the
                 * easy case in _bt_next().
                 */
                if (++so->currPos.itemIndex > so->currPos.lastItem)
                {
                    /* let _bt_next do the heavy lifting */
                    if (!_bt_next(scan, ForwardScanDirection))
                        break;
                }

                /* Save tuple ID, and continue scanning */
                heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
                tbm_add_tuples(tbm, heapTid, 1, false);
                ntids++;
            }
        }
        /* Now see if we need another primitive index scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));

    return ntids;
}

/*
 *  btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
    IndexScanDesc scan;
    BTScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    /* get the scan */
    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    /* allocate private workspace */
    so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
    BTScanPosInvalidate(so->currPos);
    BTScanPosInvalidate(so->markPos);
    if (scan->numberOfKeys > 0)
        so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
    else
        so->keyData = NULL;

    so->needPrimScan = false;
    so->scanBehind = false;
    so->oppositeDirCheck = false;
    so->arrayKeys = NULL;
    so->orderProcs = NULL;
    so->arrayContext = NULL;

    so->killedItems = NULL;     /* until needed */
    so->numKilled = 0;

    /*
     * We don't know yet whether the scan will be index-only, so we do not
     * allocate the tuple workspace arrays until btrescan.  However, we set up
     * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     */
    so->currTuples = so->markTuples = NULL;

    scan->xs_itupdesc = RelationGetDescr(rel);

    scan->opaque = so;

    return scan;
}

/*
 *  btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
         ScanKey orderbys, int norderbys)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan);
        BTScanPosUnpinIfPinned(so->currPos);
        BTScanPosInvalidate(so->currPos);
    }

    so->markItemIndex = -1;
    so->needPrimScan = false;
    so->scanBehind = false;
    so->oppositeDirCheck = false;
    BTScanPosUnpinIfPinned(so->markPos);
    BTScanPosInvalidate(so->markPos);

    /*
     * Allocate tuple workspace arrays, if needed for an index-only scan and
     * not already done in a previous rescan call.  To save on palloc
     * overhead, both workspaces are allocated as one palloc block; only this
     * function and btendscan know that.
     *
     * NOTE: this data structure also makes it safe to return data from a
     * "name" column, even though btree name_ops uses an underlying storage
     * datatype of cstring.  The risk there is that "name" is supposed to be
     * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
     * However, since we only return data out of tuples sitting in the
     * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
     * data out of the markTuples array --- running off the end of memory for
     * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
     * adding special-case treatment for name_ops elsewhere.
     */
    if (scan->xs_want_itup && so->currTuples == NULL)
    {
        so->currTuples = (char *) palloc(BLCKSZ * 2);
        so->markTuples = so->currTuples + BLCKSZ;
    }

    /*
     * Reset the scan keys
     */
    if (scankey && scan->numberOfKeys > 0)
        memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
    so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */
    so->numArrayKeys = 0;       /* ditto */
}

/*
 *  btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan);
        BTScanPosUnpinIfPinned(so->currPos);
    }

    so->markItemIndex = -1;
    BTScanPosUnpinIfPinned(so->markPos);

    /* No need to invalidate positions, the RAM is about to be freed. */

    /* Release storage */
    if (so->keyData != NULL)
        pfree(so->keyData);
    /* so->arrayKeys and so->orderProcs are in arrayContext */
    if (so->arrayContext != NULL)
        MemoryContextDelete(so->arrayContext);
    if (so->killedItems != NULL)
        pfree(so->killedItems);
    if (so->currTuples != NULL)
        pfree(so->currTuples);
    /* so->markTuples should not be pfree'd, see btrescan */
    pfree(so);
}

/*
 *  btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* There may be an old mark with a pin (but no lock). */
    BTScanPosUnpinIfPinned(so->markPos);

    /*
     * Just record the current itemIndex.  If we later step to next page
     * before releasing the marked position, _bt_steppage makes a full copy of
     * the currPos struct in markPos.  If (as often happens) the mark is moved
     * before we leave the page, we don't have to do that work.
     */
    if (BTScanPosIsValid(so->currPos))
        so->markItemIndex = so->currPos.itemIndex;
    else
    {
        BTScanPosInvalidate(so->markPos);
        so->markItemIndex = -1;
    }
}

/*
 *  btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    if (so->markItemIndex >= 0)
    {
        /*
         * The scan has never moved to a new page since the last mark.  Just
         * restore the itemIndex.
         *
         * NB: In this case we can't count on anything in so->markPos to be
         * accurate.
         */
        so->currPos.itemIndex = so->markItemIndex;
    }
    else
    {
        /*
         * The scan moved to a new page after last mark or restore, and we are
         * now restoring to the marked page.  We aren't holding any read
         * locks, but if we're still holding the pin for the current position,
         * we must drop it.
         */
        if (BTScanPosIsValid(so->currPos))
        {
            /* Before leaving current page, deal with any killed items */
            if (so->numKilled > 0)
                _bt_killitems(scan);
            BTScanPosUnpinIfPinned(so->currPos);
        }

        if (BTScanPosIsValid(so->markPos))
        {
            /* bump pin on mark buffer for assignment to current buffer */
            if (BTScanPosIsPinned(so->markPos))
                IncrBufferRefCount(so->markPos.buf);
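            /*
             * Restore the marked position into currPos.  The memcpy below
             * copies only the fixed-size part of BTScanPosData plus the
             * items[] entries actually in use: lastItem is a zero-based
             * index, which is why the offsetof() starts at items[1].
             */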
            memcpy(&so->currPos, &so->markPos,
                   offsetof(BTScanPosData, items[1]) +
                   so->markPos.lastItem * sizeof(BTScanPosItem));
            if (so->currTuples)
                memcpy(so->currTuples, so->markTuples,
                       so->markPos.nextTupleOffset);
            /* Reset the scan's array keys (see _bt_steppage for why) */
            if (so->numArrayKeys)
            {
                _bt_start_array_keys(scan, so->currPos.dir);
                so->needPrimScan = false;
            }
        }
        else
            BTScanPosInvalidate(so->currPos);
    }
}

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(int nkeys, int norderbys)
{
    /* Pessimistically assume all input scankeys will be output with arrays */
    return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
}
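
/*
 * The estimate above reserves one btps_arrElems[] slot per input scan key,
 * which is enough for _bt_parallel_primscan_schedule() to serialize a
 * cur_elem value for every array scan key the scan could end up with (there
 * can be at most one array per scan key).
 */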

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
    BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

    SpinLockInit(&bt_target->btps_mutex);
    bt_target->btps_nextScanPage = InvalidBlockNumber;
    bt_target->btps_lastCurrPage = InvalidBlockNumber;
    bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
    ConditionVariableInit(&bt_target->btps_cv);
}

/*
 *  btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
    BTParallelScanDesc btscan;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

    Assert(parallel_scan);

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    /*
     * In theory, we don't need to acquire the spinlock here, because there
     * shouldn't be any other workers running at this point, but we do so for
     * consistency.
     */
    SpinLockAcquire(&btscan->btps_mutex);
    btscan->btps_nextScanPage = InvalidBlockNumber;
    btscan->btps_lastCurrPage = InvalidBlockNumber;
    btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
    SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *      page.  Other scans must wait until we call _bt_parallel_release()
 *      or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs when no pages remain, or when
 * another primitive index scan is scheduled that caller's backend cannot
 * start just yet (only backends that call from _bt_first are capable of
 * starting primitive index scans, which they indicate by passing first=true).
 *
 * If the return value is true, *next_scan_page returns the next page of the
 * scan, and *last_curr_page returns the page that *next_scan_page came from.
 * An invalid *next_scan_page means the scan hasn't yet started, or that
 * caller needs to start the next primitive index scan (if it's the latter
 * case we'll set so.needPrimScan).
 *
 * Callers should ignore the value of *next_scan_page and *last_curr_page if
 * the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
                   BlockNumber *last_curr_page, bool first)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        exit_loop = false,
                status = true,
                endscan = false;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    *next_scan_page = InvalidBlockNumber;
    *last_curr_page = InvalidBlockNumber;

    /*
     * Reset so->currPos, and initialize moreLeft/moreRight such that the next
     * call to _bt_readnextpage treats this backend similarly to a serial
     * backend that steps from *last_curr_page to *next_scan_page (unless this
     * backend's so->currPos is initialized by _bt_readfirstpage before then).
     */
    BTScanPosInvalidate(so->currPos);
    so->currPos.moreLeft = so->currPos.moreRight = true;

    if (first)
    {
        /*
         * Initialize array related state when called from _bt_first, assuming
         * that this will be the first primitive index scan for the scan
         */
        so->needPrimScan = false;
        so->scanBehind = false;
        so->oppositeDirCheck = false;
    }
    else
    {
        /*
         * Don't attempt to seize the scan when it requires another primitive
         * index scan, since caller's backend cannot start it right now
         */
        if (so->needPrimScan)
            return false;
    }

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    while (1)
    {
        SpinLockAcquire(&btscan->btps_mutex);

        if (btscan->btps_pageStatus == BTPARALLEL_DONE)
        {
            /* We're done with this parallel index scan */
            status = false;
        }
        else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
                 btscan->btps_nextScanPage == P_NONE)
        {
            /* End this parallel index scan */
            status = false;
            endscan = true;
        }
        else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
        {
            Assert(so->numArrayKeys);

            if (first)
            {
                /* Can start scheduled primitive scan right away, so do so */
                btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
                for (int i = 0; i < so->numArrayKeys; i++)
                {
                    BTArrayKeyInfo *array = &so->arrayKeys[i];
                    ScanKey     skey = &so->keyData[array->scan_key];

                    array->cur_elem = btscan->btps_arrElems[i];
                    skey->sk_argument = array->elem_values[array->cur_elem];
                }
                exit_loop = true;
            }
            else
            {
                /*
                 * Don't attempt to seize the scan when it requires another
                 * primitive index scan, since caller's backend cannot start
                 * it right now
                 */
                status = false;
            }

            /*
             * Either way, update backend local state to indicate that a
             * pending primitive scan is required
             */
            so->needPrimScan = true;
            so->scanBehind = false;
            so->oppositeDirCheck = false;
        }
        else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
        {
            /*
             * We have successfully seized control of the scan for the purpose
             * of advancing it to a new page!
             */
            btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
            Assert(btscan->btps_nextScanPage != P_NONE);
            *next_scan_page = btscan->btps_nextScanPage;
            *last_curr_page = btscan->btps_lastCurrPage;
            exit_loop = true;
        }
        SpinLockRelease(&btscan->btps_mutex);
        if (exit_loop || !status)
            break;
        ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
    }
    ConditionVariableCancelSleep();

    /* When the scan has reached the rightmost (or leftmost) page, end it */
    if (endscan)
        _bt_parallel_done(scan);

    return status;
}
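
/*
 * Rough sketch of the protocol expected of callers (currently _bt_first and
 * _bt_readnextpage in nbtsearch.c): seize the scan with _bt_parallel_seize(),
 * read the page returned in *next_scan_page, then promptly hand the scan off
 * again with _bt_parallel_release() (passing that page's sibling link), or
 * end it with _bt_parallel_done() if there is nothing left to read.
 */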

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *      new page.  We now have the new value btps_nextScanPage; another backend
 *      can now begin advancing the scan.
 *
 * Callers whose scan uses array keys must save their curr_page argument so
 * that it can be passed to _bt_parallel_primscan_schedule, should caller
 * determine that another primitive index scan is required.
 *
 * If caller's next_scan_page is P_NONE, the scan has reached the index's
 * rightmost/leftmost page.  This is treated as reaching the end of the scan
 * within _bt_parallel_seize.
 *
 * Note: unlike the serial case, parallel scans don't need to remember both
 * sibling links.  next_scan_page is whichever link is next given the scan's
 * direction.  That's all we'll ever need, since the direction of a parallel
 * scan can never change.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
                     BlockNumber curr_page)
{
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    Assert(BlockNumberIsValid(next_scan_page));

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    SpinLockAcquire(&btscan->btps_mutex);
    btscan->btps_nextScanPage = next_scan_page;
    btscan->btps_lastCurrPage = curr_page;
    btscan->btps_pageStatus = BTPARALLEL_IDLE;
    SpinLockRelease(&btscan->btps_mutex);
    ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;
    bool        status_changed = false;

    Assert(!BTScanPosIsValid(so->currPos));

    /* Do nothing, for non-parallel scans */
    if (parallel_scan == NULL)
        return;

    /*
     * Should not mark parallel scan done when there's still a pending
     * primitive index scan
     */
    if (so->needPrimScan)
        return;

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    /*
     * Mark the parallel scan as done, unless some other process did so
     * already
     */
    SpinLockAcquire(&btscan->btps_mutex);
    Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
    if (btscan->btps_pageStatus != BTPARALLEL_DONE)
    {
        btscan->btps_pageStatus = BTPARALLEL_DONE;
        status_changed = true;
    }
    SpinLockRelease(&btscan->btps_mutex);

    /* wake up all the workers associated with this parallel scan */
    if (status_changed)
        ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
 *
 * Caller passes the curr_page most recently passed to _bt_parallel_release
 * by its backend.  Caller successfully schedules the next primitive index scan
 * if the shared parallel state hasn't been seized since caller's backend last
 * advanced the scan.
 */
void
_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    Assert(so->numArrayKeys);

    btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                  parallel_scan->ps_offset);

    SpinLockAcquire(&btscan->btps_mutex);
    if (btscan->btps_lastCurrPage == curr_page &&
        btscan->btps_pageStatus == BTPARALLEL_IDLE)
    {
        btscan->btps_nextScanPage = InvalidBlockNumber;
        btscan->btps_lastCurrPage = InvalidBlockNumber;
        btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;

        /* Serialize scan's current array keys */
        for (int i = 0; i < so->numArrayKeys; i++)
        {
            BTArrayKeyInfo *array = &so->arrayKeys[i];

            btscan->btps_arrElems[i] = array->cur_elem;
        }
    }
    SpinLockRelease(&btscan->btps_mutex);
}
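
/*
 * Note that the scheduling attempt above is deliberately a no-op when
 * btps_lastCurrPage no longer matches caller's curr_page (or the scan isn't
 * idle): some other backend has already advanced the scan past the point
 * where caller noticed the need for another primitive scan, so caller's
 * request is stale and is simply dropped.
 */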

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation    rel = info->index;
    BTCycleId   cycleid;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

    /* Establish the vacuum cycle ID to use for this scan */
    /* The ENSURE stuff ensures we clean up shared memory on failure */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);

    return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    BlockNumber num_delpages;

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        return stats;

    /*
     * If btbulkdelete was called, we need not do anything (we just maintain
     * the information used within _bt_vacuum_needs_cleanup() by calling
     * _bt_set_cleanup_info() below).
     *
     * If btbulkdelete was _not_ called, then we have a choice to make: we
     * must decide whether or not a btvacuumscan() call is needed now (i.e.
     * whether the ongoing VACUUM operation can entirely avoid a physical scan
     * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
     * now.
     */
    if (stats == NULL)
    {
        /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
        if (!_bt_vacuum_needs_cleanup(info->index))
            return NULL;

        /*
         * Since we aren't going to actually delete any leaf items, there's no
         * need to go through all the vacuum-cycle-ID pushups here.
         *
         * Posting list tuples are a source of inaccuracy for cleanup-only
         * scans.  btvacuumscan() will assume that the number of index tuples
         * from each page can be used as num_index_tuples, even though
         * num_index_tuples is supposed to represent the number of TIDs in the
         * index.  This naive approach can underestimate the number of tuples
         * in the index significantly.
         *
         * We handle the problem by making num_index_tuples an estimate in
         * cleanup-only case.
         */
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        btvacuumscan(info, stats, NULL, NULL, 0);
        stats->estimated_count = true;
    }

    /*
     * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
     *
     * num_delpages is the number of deleted pages now in the index that were
     * not safe to place in the FSM to be recycled just yet.  num_delpages is
     * greater than 0 only when _bt_pagedel() actually deleted pages during
     * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
     * have failed to place any newly deleted pages in the FSM just moments
     * ago.  (Actually, there are edge cases where recycling of the current
     * VACUUM's newly deleted pages does not even become safe by the time the
     * next VACUUM comes around.  See nbtree/README.)
     */
    Assert(stats->pages_deleted >= stats->pages_free);
    num_delpages = stats->pages_deleted - stats->pages_free;
    _bt_set_cleanup_info(info->index, num_delpages);

    /*
     * It's quite possible for us to be fooled by concurrent page splits into
     * double-counting some index tuples, so disbelieve any total that exceeds
     * the underlying heap's count ... if we know that accurately.  Otherwise
     * this might just make matters worse.
     */
    if (!info->estimated_count)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    return stats;
}

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state,
             BTCycleId cycleid)
{
    Relation    rel = info->index;
    BTVacState  vstate;
    BlockNumber num_pages;
    BlockNumber scanblkno;
    bool        needLock;

    /*
     * Reset fields that track information about the entire index now.  This
     * avoids double-counting in the case where a single VACUUM command
     * requires multiple scans of the index.
     *
     * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
     * since they track information about the VACUUM command, and so must last
     * across each call to btvacuumscan().
     *
     * (Note that pages_free is treated as state about the whole index, not
     * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
     * calls are idempotent, and get repeated for the same deleted pages in
     * some scenarios.  The point for us is to track the number of recyclable
     * pages in the index at the end of the VACUUM command.)
     */
    stats->num_pages = 0;
    stats->num_index_tuples = 0;
    stats->pages_deleted = 0;
    stats->pages_free = 0;

    /* Set up info to pass down to btvacuumpage */
    vstate.info = info;
    vstate.stats = stats;
    vstate.callback = callback;
    vstate.callback_state = callback_state;
    vstate.cycleid = cycleid;

    /* Create a temporary memory context to run _bt_pagedel in */
    vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
                                                  "_bt_pagedel",
                                                  ALLOCSET_DEFAULT_SIZES);

    /* Initialize vstate fields used by _bt_pendingfsm_finalize */
    vstate.bufsize = 0;
    vstate.maxbufsize = 0;
    vstate.pendingpages = NULL;
    vstate.npendingpages = 0;
    /* Consider applying _bt_pendingfsm_finalize optimization */
    _bt_pendingfsm_init(rel, &vstate, (callback == NULL));

    /*
     * The outer loop iterates over all index pages except the metapage, in
     * physical order (we hope the kernel will cooperate in providing
     * read-ahead for speed).  It is critical that we visit all leaf pages,
     * including ones added after we start the scan, else we might fail to
     * delete some deletable tuples.  Hence, we must repeatedly check the
     * relation length.  We must acquire the relation-extension lock while
     * doing so to avoid a race condition: if someone else is extending the
     * relation, there is a window where bufmgr/smgr have created a new
     * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
     * we manage to scan such a page here, we'll improperly assume it can be
     * recycled.  Taking the lock synchronizes things enough to prevent a
     * problem: either num_pages won't include the new page, or _bt_getbuf
     * already has write lock on the buffer and it will be fully initialized
     * before we can examine it.  Also, we need not worry if a page is added
     * immediately after we look; the page splitting code already has
     * write-lock on the left page before it adds a right page, so we must
     * already have processed any tuples due to be moved into such a page.
     *
     * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
     * think the use of the extension lock is still required.
     *
     * We can skip locking for new or temp relations, however, since no one
     * else could be accessing them.
     */
    needLock = !RELATION_IS_LOCAL(rel);

    scanblkno = BTREE_METAPAGE + 1;
    for (;;)
    {
        /* Get the current relation length */
        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);
        num_pages = RelationGetNumberOfBlocks(rel);
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        if (info->report_progress)
            pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
                                         num_pages);

        /* Quit if we've scanned the whole relation */
        if (scanblkno >= num_pages)
            break;
        /* Iterate over pages, then loop back to recheck length */
        for (; scanblkno < num_pages; scanblkno++)
        {
            btvacuumpage(&vstate, scanblkno);
            if (info->report_progress)
                pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
                                             scanblkno);
        }
    }

    /* Set statistics num_pages field to final size of index */
    stats->num_pages = num_pages;

    MemoryContextDelete(vstate.pagedelcontext);

    /*
     * If there were any calls to _bt_pagedel() during scan of the index then
     * see if any of the resulting pages can be placed in the FSM now.  When
     * it's not safe we'll have to leave it up to a future VACUUM operation.
     *
     * Finally, if we placed any pages in the FSM (either just now or during
     * the scan), forcibly update the upper-level FSM pages to ensure that
     * searchers can find them.
     */
    _bt_pendingfsm_finalize(rel, &vstate);
    if (stats->pages_free > 0)
        IndexFreeSpaceMapVacuum(rel);
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we must
 * backtrack to re-examine and VACUUM pages that were the scanblkno during
 * a previous call here.  This is how we handle page splits (that happened
 * after our cycleid was acquired) whose right half page happened to reuse
 * a block that we might have processed at some point before it was
 * recycled (i.e. before the page split).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
{
    IndexVacuumInfo *info = vstate->info;
    IndexBulkDeleteResult *stats = vstate->stats;
    IndexBulkDeleteCallback callback = vstate->callback;
    void       *callback_state = vstate->callback_state;
    Relation    rel = info->index;
    Relation    heaprel = info->heaprel;
    bool        attempt_pagedel;
    BlockNumber blkno,
                backtrack_to;
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;

    blkno = scanblkno;

backtrack:

    attempt_pagedel = false;
    backtrack_to = P_NONE;

    /* call vacuum_delay_point while not holding any buffer lock */
    vacuum_delay_point();

    /*
     * We can't use _bt_getbuf() here because it always applies
     * _bt_checkpage(), which will barf on an all-zero page.  We want to
     * recycle all-zero pages, not fail.  Also, we want to use a nondefault
     * buffer access strategy.
     */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                             info->strategy);
    _bt_lockbuf(rel, buf, BT_READ);
    page = BufferGetPage(buf);
    opaque = NULL;
    if (!PageIsNew(page))
    {
        _bt_checkpage(rel, buf);
        opaque = BTPageGetOpaque(page);
    }

    Assert(blkno <= scanblkno);
    if (blkno != scanblkno)
    {
        /*
         * We're backtracking.
         *
         * We followed a right link to a sibling leaf page (a page that
         * happens to be from a block located before scanblkno).  The only
         * case we want to do anything with is a live leaf page having the
         * current vacuum cycle ID.
         *
         * The page had better be in a state that's consistent with what we
         * expect.  Check for conditions that imply corruption in passing.  It
         * can't be half-dead because only an interrupted VACUUM process can
         * leave pages in that state, so we'd definitely have dealt with it
         * back when the page was the scanblkno page (half-dead pages are
         * always marked fully deleted by _bt_pagedel(), barring corruption).
         */
        if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
        {
            Assert(false);
            ereport(LOG,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
                                     blkno, scanblkno, RelationGetRelationName(rel))));
            _bt_relbuf(rel, buf);
            return;
        }

        /*
         * We may have already processed the page in an earlier call, when the
         * page was scanblkno.  This happens when the leaf page split occurred
         * after the scan began, but before the right sibling page became the
         * scanblkno.
         *
         * Page may also have been deleted by current btvacuumpage() call,
         * since _bt_pagedel() sometimes deletes the right sibling page of
         * scanblkno in passing (it does so after we decided where to
         * backtrack to).  We don't need to process this page as a deleted
         * page a second time now (in fact, it would be wrong to count it as a
         * deleted page in the bulk delete statistics a second time).
         */
        if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
        {
            /* Done with current scanblkno (and all lower split pages) */
            _bt_relbuf(rel, buf);
            return;
        }
    }

    if (!opaque || BTPageIsRecyclable(page, heaprel))
    {
        /* Okay to recycle this page (which could be leaf or internal) */
        RecordFreeIndexPage(rel, blkno);
        stats->pages_deleted++;
        stats->pages_free++;
    }
    else if (P_ISDELETED(opaque))
    {
        /*
         * Already deleted page (which could be leaf or internal).  Can't
         * recycle yet.
         */
        stats->pages_deleted++;
    }
    else if (P_ISHALFDEAD(opaque))
    {
        /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
        attempt_pagedel = true;

        /*
         * _bt_pagedel() will increment both pages_newly_deleted and
         * pages_deleted stats in all cases (barring corruption)
         */
    }
    else if (P_ISLEAF(opaque))
    {
        OffsetNumber deletable[MaxIndexTuplesPerPage];
        int         ndeletable;
        BTVacuumPosting updatable[MaxIndexTuplesPerPage];
        int         nupdatable;
        OffsetNumber offnum,
                    minoff,
                    maxoff;
        int         nhtidsdead,
                    nhtidslive;

        /*
         * Trade in the initial read lock for a full cleanup lock on this
         * page.  We must get such a lock on every leaf page over the course
         * of the vacuum scan, whether or not it actually contains any
         * deletable tuples --- see nbtree/README.
         */
        _bt_upgradelockbufcleanup(rel, buf);

        /*
         * Check whether we need to backtrack to earlier pages.  What we are
         * concerned about is a page split that happened since we started the
         * vacuum scan.  If the split moved tuples on the right half of the
         * split (i.e. the tuples that sort high) to a block that we already
         * passed over, then we might have missed the tuples.  We need to
         * backtrack now.  (Must do this before possibly clearing btpo_cycleid
         * or deleting scanblkno page below!)
         */
        if (vstate->cycleid != 0 &&
            opaque->btpo_cycleid == vstate->cycleid &&
            !(opaque->btpo_flags & BTP_SPLIT_END) &&
            !P_RIGHTMOST(opaque) &&
            opaque->btpo_next < scanblkno)
            backtrack_to = opaque->btpo_next;

        ndeletable = 0;
        nupdatable = 0;
        minoff = P_FIRSTDATAKEY(opaque);
        maxoff = PageGetMaxOffsetNumber(page);
        nhtidsdead = 0;
        nhtidslive = 0;
        if (callback)
        {
            /* btbulkdelete callback tells us what to delete (or update) */
            for (offnum = minoff;
                 offnum <= maxoff;
                 offnum = OffsetNumberNext(offnum))
            {
                IndexTuple  itup;

                itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offnum));

                Assert(!BTreeTupleIsPivot(itup));
                if (!BTreeTupleIsPosting(itup))
                {
                    /* Regular tuple, standard table TID representation */
                    if (callback(&itup->t_tid, callback_state))
                    {
                        deletable[ndeletable++] = offnum;
                        nhtidsdead++;
                    }
                    else
                        nhtidslive++;
                }
                else
                {
                    BTVacuumPosting vacposting;
                    int         nremaining;

                    /* Posting list tuple */
                    vacposting = btreevacuumposting(vstate, itup, offnum,
                                                    &nremaining);
                    if (vacposting == NULL)
                    {
                        /*
                         * All table TIDs from the posting tuple remain, so no
                         * delete or update required
                         */
                        Assert(nremaining == BTreeTupleGetNPosting(itup));
                    }
                    else if (nremaining > 0)
                    {
                        /*
                         * Store metadata about posting list tuple in
                         * updatable array for entire page.  Existing tuple
                         * will be updated during the later call to
                         * _bt_delitems_vacuum().
                         */
                        Assert(nremaining < BTreeTupleGetNPosting(itup));
                        updatable[nupdatable++] = vacposting;
                        nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
                    }
                    else
                    {
                        /*
                         * All table TIDs from the posting list must be
                         * deleted.  We'll delete the index tuple completely
                         * (no update required).
                         */
                        Assert(nremaining == 0);
                        deletable[ndeletable++] = offnum;
                        nhtidsdead += BTreeTupleGetNPosting(itup);
                        pfree(vacposting);
                    }

                    nhtidslive += nremaining;
                }
            }
        }

        /*
         * Apply any needed deletes or updates.  We issue just one
         * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
         */
        if (ndeletable > 0 || nupdatable > 0)
        {
            Assert(nhtidsdead >= ndeletable + nupdatable);
            _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
                                nupdatable);

            stats->tuples_removed += nhtidsdead;
            /* must recompute maxoff */
            maxoff = PageGetMaxOffsetNumber(page);

            /* can't leak memory here */
            for (int i = 0; i < nupdatable; i++)
                pfree(updatable[i]);
        }
        else
        {
            /*
             * If the leaf page has been split during this vacuum cycle, it
             * seems worth expending a write to clear btpo_cycleid even if we
             * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
             * takes care of this.)  This ensures we won't process the page
             * again.
             *
             * We treat this like a hint-bit update because there's no need to
             * WAL-log it.
             */
            Assert(nhtidsdead == 0);
            if (vstate->cycleid != 0 &&
                opaque->btpo_cycleid == vstate->cycleid)
            {
                opaque->btpo_cycleid = 0;
                MarkBufferDirtyHint(buf, true);
            }
        }

        /*
         * If the leaf page is now empty, try to delete it; else count the
         * live tuples (live table TIDs in posting lists are counted as
         * separate live tuples).  We don't delete when backtracking, though,
         * since that would require teaching _bt_pagedel() about backtracking
         * (doesn't seem worth adding more complexity to deal with that).
         *
         * We don't count the number of live TIDs during cleanup-only calls to
         * btvacuumscan (i.e. when callback is not set).  We count the number
         * of index tuples directly instead.  This avoids the expense of
         * directly examining all of the tuples on each page.  VACUUM will
         * treat num_index_tuples as an estimate in cleanup-only case, so it
         * doesn't matter that this underestimates num_index_tuples
         * significantly in some cases.
         */
        if (minoff > maxoff)
            attempt_pagedel = (blkno == scanblkno);
        else if (callback)
            stats->num_index_tuples += nhtidslive;
        else
            stats->num_index_tuples += maxoff - minoff + 1;

        Assert(!attempt_pagedel || nhtidslive == 0);
    }

    if (attempt_pagedel)
    {
        MemoryContext oldcontext;

        /* Run pagedel in a temp context to avoid memory leakage */
        MemoryContextReset(vstate->pagedelcontext);
        oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

        /*
         * _bt_pagedel maintains the bulk delete stats on our behalf;
         * pages_newly_deleted and pages_deleted are likely to be incremented
         * during call
         */
        Assert(blkno == scanblkno);
        _bt_pagedel(rel, buf, vstate);

        MemoryContextSwitchTo(oldcontext);
        /* pagedel released buffer, so we shouldn't */
    }
    else
        _bt_relbuf(rel, buf);

    if (backtrack_to != P_NONE)
    {
        blkno = backtrack_to;
        goto backtrack;
    }
}

/*
 * btreevacuumposting --- determine TIDs still needed in posting list
 *
 * Returns metadata describing how to build replacement tuple without the TIDs
 * that VACUUM needs to delete.  Returned value is NULL in the common case
 * where no changes are needed to caller's posting list tuple (we avoid
 * allocating memory here as an optimization).
 *
 * The number of TIDs that should remain in the posting list tuple is set for
 * caller in *nremaining.
 */
static BTVacuumPosting
btreevacuumposting(BTVacState *vstate, IndexTuple posting,
                   OffsetNumber updatedoffset, int *nremaining)
{
    int         live = 0;
    int         nitem = BTreeTupleGetNPosting(posting);
    ItemPointer items = BTreeTupleGetPosting(posting);
    BTVacuumPosting vacposting = NULL;

    for (int i = 0; i < nitem; i++)
    {
        if (!vstate->callback(items + i, vstate->callback_state))
        {
            /* Live table TID */
            live++;
        }
        else if (vacposting == NULL)
        {
            /*
             * First dead table TID encountered.
             *
             * It's now clear that we need to delete one or more dead table
             * TIDs, so start maintaining metadata describing how to update
             * existing posting list tuple.
             */
            vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                                nitem * sizeof(uint16));

            vacposting->itup = posting;
            vacposting->updatedoffset = updatedoffset;
            vacposting->ndeletedtids = 0;
            vacposting->deletetids[vacposting->ndeletedtids++] = i;
        }
        else
        {
            /* Second or subsequent dead table TID */
            vacposting->deletetids[vacposting->ndeletedtids++] = i;
        }
    }

    *nremaining = live;
    return vacposting;
}
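
/*
 * How the result is consumed: btvacuumpage() collects any non-NULL return
 * values in its updatable[] array and hands them to _bt_delitems_vacuum(),
 * which updates each such posting list tuple in place so that the TIDs whose
 * indexes are listed in vacposting->deletetids[] are removed.  The
 * deletetids[] array is pessimistically sized for all nitem TIDs being dead.
 */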

/*
 *  btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
    return true;
}

/*
 *  btgettreeheight() -- Compute tree height for use by btcostestimate().
 */
int
btgettreeheight(Relation rel)
{
    return _bt_getrootheight(rel);
}