/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *	  Implementation of Lehman and Yao's btree management algorithm for
 *	  Postgres.
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"

/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
 * scan to advance it via another call to _bt_first.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 */
typedef enum
{
	BTPARALLEL_NOT_INITIALIZED,
	BTPARALLEL_NEED_PRIMSCAN,
	BTPARALLEL_ADVANCING,
	BTPARALLEL_IDLE,
	BTPARALLEL_DONE,
} BTPS_State;

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
	BlockNumber btps_nextScanPage;	/* next page to be scanned */
	BlockNumber btps_lastCurrPage;	/* page whose sibling link was copied into
									 * btps_nextScanPage */
	BTPS_State	btps_pageStatus;	/* indicates whether next page is
									 * available for scan. see above for
									 * possible states of parallel scan. */
	slock_t		btps_mutex;		/* protects above variables, btps_arrElems */
	ConditionVariable btps_cv;	/* used to synchronize parallel scan */

	/*
	 * btps_arrElems is used when scans need to schedule another primitive
	 * index scan.  Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
	 */
	int			btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;
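
/*
 * Taken together, the states above describe how btps_pageStatus moves:
 * NOT_INITIALIZED until the first backend calls _bt_first, ADVANCING while
 * one backend copies the next sibling link, IDLE once that link has been
 * published by _bt_parallel_release, NEED_PRIMSCAN when a backend calling
 * _bt_first must start another primitive index scan, and DONE at the end
 * (or on error exit).  All transitions happen under btps_mutex, and waiting
 * backends sleep on btps_cv.
 */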

static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
						 IndexBulkDeleteCallback callback, void *callback_state,
						 BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
										  IndexTuple posting,
										  OffsetNumber updatedoffset,
										  int *nremaining);

/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = BTMaxStrategyNumber;
	amroutine->amsupport = BTNProcs;
	amroutine->amoptsprocnum = BTOPTIONS_PROC;
	amroutine->amcanorder = true;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = true;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = true;
	amroutine->amsearchnulls = true;
	amroutine->amstorage = false;
	amroutine->amclusterable = true;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = true;
	amroutine->amcanbuildparallel = true;
	amroutine->amcaninclude = true;
	amroutine->amusemaintenanceworkmem = false;
	amroutine->amsummarizing = false;
	amroutine->amparallelvacuumoptions =
		VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
	amroutine->amkeytype = InvalidOid;

	amroutine->ambuild = btbuild;
	amroutine->ambuildempty = btbuildempty;
	amroutine->aminsert = btinsert;
	amroutine->aminsertcleanup = NULL;
	amroutine->ambulkdelete = btbulkdelete;
	amroutine->amvacuumcleanup = btvacuumcleanup;
	amroutine->amcanreturn = btcanreturn;
	amroutine->amcostestimate = btcostestimate;
	amroutine->amgettreeheight = btgettreeheight;
	amroutine->amoptions = btoptions;
	amroutine->amproperty = btproperty;
	amroutine->ambuildphasename = btbuildphasename;
	amroutine->amvalidate = btvalidate;
	amroutine->amadjustmembers = btadjustmembers;
	amroutine->ambeginscan = btbeginscan;
	amroutine->amrescan = btrescan;
	amroutine->amgettuple = btgettuple;
	amroutine->amgetbitmap = btgetbitmap;
	amroutine->amendscan = btendscan;
	amroutine->ammarkpos = btmarkpos;
	amroutine->amrestrpos = btrestrpos;
	amroutine->amestimateparallelscan = btestimateparallelscan;
	amroutine->aminitparallelscan = btinitparallelscan;
	amroutine->amparallelrescan = btparallelrescan;

	PG_RETURN_POINTER(amroutine);
}
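
/*
 * The "btree" entry in pg_am points at bthandler via its amhandler column;
 * the server calls it to obtain the IndexAmRoutine filled in above before it
 * invokes any of the callbacks listed there.
 */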

/*
 *	btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
	bool		allequalimage = _bt_allequalimage(index, false);
	BulkWriteState *bulkstate;
	BulkWriteBuffer metabuf;

	bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);

	/* Construct metapage. */
	metabuf = smgr_bulk_get_buf(bulkstate);
	_bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
	smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);

	smgr_bulk_finish(bulkstate);
}

/*
 *	btinsert() -- insert an index tuple into a btree.
 *
 *		Descend the tree recursively, find the appropriate location for our
 *		new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique,
		 bool indexUnchanged,
		 IndexInfo *indexInfo)
{
	bool		result;
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

	pfree(itup);

	return result;
}

/*
 *	btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		res;

	/* btree indexes are never lossy */
	scan->xs_recheck = false;

	/* Each loop iteration performs another primitive index scan */
	do
	{
		/*
		 * If we've already initialized this scan, we can just advance it in
		 * the appropriate direction.  If we haven't done so yet, we call
		 * _bt_first() to get the first item in the scan.
		 */
		if (!BTScanPosIsValid(so->currPos))
			res = _bt_first(scan, dir);
		else
		{
			/*
			 * Check to see if we should kill the previously-fetched tuple.
			 */
			if (scan->kill_prior_tuple)
			{
				/*
				 * Yes, remember it for later. (We'll deal with all such
				 * tuples at once right before leaving the index page.)  The
				 * test for numKilled overrun is not just paranoia: if the
				 * caller reverses direction in the indexscan then the same
				 * item might get entered multiple times. It's not worth
				 * trying to optimize that, so we don't detect it, but instead
				 * just forget any excess entries.
				 */
				if (so->killedItems == NULL)
					so->killedItems = (int *)
						palloc(MaxTIDsPerBTreePage * sizeof(int));
				if (so->numKilled < MaxTIDsPerBTreePage)
					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
			}

			/*
			 * Now continue the scan.
			 */
			res = _bt_next(scan, dir);
		}

		/* If we have a tuple, return it ... */
		if (res)
			break;
		/* ... otherwise see if we need another primitive index scan */
	} while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));

	return res;
}
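
/*
 * Note that the loop above returns false only once _bt_first/_bt_next have
 * nothing more to return and the array-keys machinery schedules no further
 * primitive index scan, i.e. the scan is exhausted in the given direction.
 */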

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int64		ntids = 0;
	ItemPointer heapTid;

	/* Each loop iteration performs another primitive index scan */
	do
	{
		/* Fetch the first page & tuple */
		if (_bt_first(scan, ForwardScanDirection))
		{
			/* Save tuple ID, and continue scanning */
			heapTid = &scan->xs_heaptid;
			tbm_add_tuples(tbm, heapTid, 1, false);
			ntids++;

			for (;;)
			{
				/*
				 * Advance to next tuple within page.  This is the same as the
				 * easy case in _bt_next().
				 */
				if (++so->currPos.itemIndex > so->currPos.lastItem)
				{
					/* let _bt_next do the heavy lifting */
					if (!_bt_next(scan, ForwardScanDirection))
						break;
				}

				/* Save tuple ID, and continue scanning */
				heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
				tbm_add_tuples(tbm, heapTid, 1, false);
				ntids++;
			}
		}
		/* Now see if we need another primitive index scan */
	} while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));

	return ntids;
}

/*
 *	btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BTScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	/* get the scan */
	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	/* allocate private workspace */
	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
	BTScanPosInvalidate(so->currPos);
	BTScanPosInvalidate(so->markPos);
	if (scan->numberOfKeys > 0)
		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
	else
		so->keyData = NULL;

	so->needPrimScan = false;
	so->scanBehind = false;
	so->oppositeDirCheck = false;
	so->arrayKeys = NULL;
	so->orderProcs = NULL;
	so->arrayContext = NULL;

	so->killedItems = NULL;		/* until needed */
	so->numKilled = 0;

	/*
	 * We don't know yet whether the scan will be index-only, so we do not
	 * allocate the tuple workspace arrays until btrescan.  However, we set up
	 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
	 */
	so->currTuples = so->markTuples = NULL;

	scan->xs_itupdesc = RelationGetDescr(rel);

	scan->opaque = so;

	return scan;
}

/*
 *	btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
		BTScanPosInvalidate(so->currPos);
	}

	so->markItemIndex = -1;
	so->needPrimScan = false;
	so->scanBehind = false;
	so->oppositeDirCheck = false;
	BTScanPosUnpinIfPinned(so->markPos);
	BTScanPosInvalidate(so->markPos);

	/*
	 * Allocate tuple workspace arrays, if needed for an index-only scan and
	 * not already done in a previous rescan call.  To save on palloc
	 * overhead, both workspaces are allocated as one palloc block; only this
	 * function and btendscan know that.
	 *
	 * NOTE: this data structure also makes it safe to return data from a
	 * "name" column, even though btree name_ops uses an underlying storage
	 * datatype of cstring.  The risk there is that "name" is supposed to be
	 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
	 * However, since we only return data out of tuples sitting in the
	 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
	 * data out of the markTuples array --- running off the end of memory for
	 * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
	 * adding special-case treatment for name_ops elsewhere.
	 */
	if (scan->xs_want_itup && so->currTuples == NULL)
	{
		so->currTuples = (char *) palloc(BLCKSZ * 2);
		so->markTuples = so->currTuples + BLCKSZ;
	}

	/*
	 * Reset the scan keys
	 */
	if (scankey && scan->numberOfKeys > 0)
		memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */
	so->numArrayKeys = 0;		/* ditto */
}

/*
 *	btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
	}

	so->markItemIndex = -1;
	BTScanPosUnpinIfPinned(so->markPos);

	/* No need to invalidate positions, the RAM is about to be freed. */

	/* Release storage */
	if (so->keyData != NULL)
		pfree(so->keyData);
	/* so->arrayKeys and so->orderProcs are in arrayContext */
	if (so->arrayContext != NULL)
		MemoryContextDelete(so->arrayContext);
	if (so->killedItems != NULL)
		pfree(so->killedItems);
	if (so->currTuples != NULL)
		pfree(so->currTuples);
	/* so->markTuples should not be pfree'd, see btrescan */
	pfree(so);
}

/*
 *	btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* There may be an old mark with a pin (but no lock). */
	BTScanPosUnpinIfPinned(so->markPos);

	/*
	 * Just record the current itemIndex.  If we later step to next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
	 */
	if (BTScanPosIsValid(so->currPos))
		so->markItemIndex = so->currPos.itemIndex;
	else
	{
		BTScanPosInvalidate(so->markPos);
		so->markItemIndex = -1;
	}
}

/*
 *	btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	if (so->markItemIndex >= 0)
	{
		/*
		 * The scan has never moved to a new page since the last mark.  Just
		 * restore the itemIndex.
		 *
		 * NB: In this case we can't count on anything in so->markPos to be
		 * accurate.
		 */
		so->currPos.itemIndex = so->markItemIndex;
	}
	else
	{
		/*
		 * The scan moved to a new page after last mark or restore, and we are
		 * now restoring to the marked page.  We aren't holding any read
		 * locks, but if we're still holding the pin for the current position,
		 * we must drop it.
		 */
		if (BTScanPosIsValid(so->currPos))
		{
			/* Before leaving current page, deal with any killed items */
			if (so->numKilled > 0)
				_bt_killitems(scan);
			BTScanPosUnpinIfPinned(so->currPos);
		}

		if (BTScanPosIsValid(so->markPos))
		{
			/* bump pin on mark buffer for assignment to current buffer */
			if (BTScanPosIsPinned(so->markPos))
				IncrBufferRefCount(so->markPos.buf);
			memcpy(&so->currPos, &so->markPos,
				   offsetof(BTScanPosData, items[1]) +
				   so->markPos.lastItem * sizeof(BTScanPosItem));
			if (so->currTuples)
				memcpy(so->currTuples, so->markTuples,
					   so->markPos.nextTupleOffset);
			/* Reset the scan's array keys (see _bt_steppage for why) */
			if (so->numArrayKeys)
			{
				_bt_start_array_keys(scan, so->currPos.dir);
				so->needPrimScan = false;
			}
		}
		else
			BTScanPosInvalidate(so->currPos);
	}
}

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(int nkeys, int norderbys)
{
	/* Pessimistically assume all input scankeys will be output with arrays */
	return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
}
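
/*
 * For example, a scan with nkeys = 2 reserves
 * offsetof(BTParallelScanDescData, btps_arrElems) + 2 * sizeof(int) bytes:
 * the fixed header plus one btps_arrElems slot per scan key.
 */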

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

	SpinLockInit(&bt_target->btps_mutex);
	bt_target->btps_nextScanPage = InvalidBlockNumber;
	bt_target->btps_lastCurrPage = InvalidBlockNumber;
	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	ConditionVariableInit(&bt_target->btps_cv);
}

/*
 *	btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
	BTParallelScanDesc btscan;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

	Assert(parallel_scan);

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * In theory, we don't need to acquire the spinlock here, because there
	 * shouldn't be any other workers running at this point, but we do so for
	 * consistency.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_nextScanPage = InvalidBlockNumber;
	btscan->btps_lastCurrPage = InvalidBlockNumber;
	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *		page.  Other scans must wait until we call _bt_parallel_release()
 *		or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs when no pages remain, or when
 * another primitive index scan is scheduled that caller's backend cannot
 * start just yet (only backends that call from _bt_first are capable of
 * starting primitive index scans, which they indicate by passing first=true).
 *
 * If the return value is true, *next_scan_page returns the next page of the
 * scan, and *last_curr_page returns the page that *next_scan_page came from.
 * An invalid *next_scan_page means the scan hasn't yet started, or that
 * caller needs to start the next primitive index scan (if it's the latter
 * case we'll set so.needPrimScan).
 *
 * Callers should ignore the value of *next_scan_page and *last_curr_page if
 * the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
				   BlockNumber *last_curr_page, bool first)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		exit_loop = false,
				status = true,
				endscan = false;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	*next_scan_page = InvalidBlockNumber;
	*last_curr_page = InvalidBlockNumber;

	/*
	 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
	 * call to _bt_readnextpage treats this backend similarly to a serial
	 * backend that steps from *last_curr_page to *next_scan_page (unless this
	 * backend's so->currPos is initialized by _bt_readfirstpage before then).
	 */
	BTScanPosInvalidate(so->currPos);
	so->currPos.moreLeft = so->currPos.moreRight = true;

	if (first)
	{
		/*
		 * Initialize array related state when called from _bt_first, assuming
		 * that this will be the first primitive index scan for the scan
		 */
		so->needPrimScan = false;
		so->scanBehind = false;
		so->oppositeDirCheck = false;
	}
	else
	{
		/*
		 * Don't attempt to seize the scan when it requires another primitive
		 * index scan, since caller's backend cannot start it right now
		 */
		if (so->needPrimScan)
			return false;
	}

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	while (1)
	{
		SpinLockAcquire(&btscan->btps_mutex);

		if (btscan->btps_pageStatus == BTPARALLEL_DONE)
		{
			/* We're done with this parallel index scan */
			status = false;
		}
		else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
				 btscan->btps_nextScanPage == P_NONE)
		{
			/* End this parallel index scan */
			status = false;
			endscan = true;
		}
		else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
		{
			Assert(so->numArrayKeys);

			if (first)
			{
				/* Can start scheduled primitive scan right away, so do so */
				btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
				for (int i = 0; i < so->numArrayKeys; i++)
				{
					BTArrayKeyInfo *array = &so->arrayKeys[i];
					ScanKey		skey = &so->keyData[array->scan_key];

					array->cur_elem = btscan->btps_arrElems[i];
					skey->sk_argument = array->elem_values[array->cur_elem];
				}
				exit_loop = true;
			}
			else
			{
				/*
				 * Don't attempt to seize the scan when it requires another
				 * primitive index scan, since caller's backend cannot start
				 * it right now
				 */
				status = false;
			}

			/*
			 * Either way, update backend local state to indicate that a
			 * pending primitive scan is required
			 */
			so->needPrimScan = true;
			so->scanBehind = false;
			so->oppositeDirCheck = false;
		}
		else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
		{
			/*
			 * We have successfully seized control of the scan for the purpose
			 * of advancing it to a new page!
			 */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			Assert(btscan->btps_nextScanPage != P_NONE);
			*next_scan_page = btscan->btps_nextScanPage;
			*last_curr_page = btscan->btps_lastCurrPage;
			exit_loop = true;
		}
		SpinLockRelease(&btscan->btps_mutex);
		if (exit_loop || !status)
			break;
		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
	}
	ConditionVariableCancelSleep();

	/* When the scan has reached the rightmost (or leftmost) page, end it */
	if (endscan)
		_bt_parallel_done(scan);

	return status;
}
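
/*
 * A backend that successfully seizes the scan is expected to follow up with
 * _bt_parallel_release() once it has the next sibling link, or with
 * _bt_parallel_done() when it finds that the scan is finished; otherwise
 * other participants would sleep on btps_cv indefinitely.
 */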

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *		new page.  We now have the new value btps_nextScanPage; another backend
 *		can now begin advancing the scan.
 *
 * Callers whose scan uses array keys must save their curr_page argument so
 * that it can be passed to _bt_parallel_primscan_schedule, should caller
 * determine that another primitive index scan is required.
 *
 * If caller's next_scan_page is P_NONE, the scan has reached the index's
 * rightmost/leftmost page.  This is treated as reaching the end of the scan
 * within _bt_parallel_seize.
 *
 * Note: unlike the serial case, parallel scans don't need to remember both
 * sibling links.  next_scan_page is whichever link is next given the scan's
 * direction.  That's all we'll ever need, since the direction of a parallel
 * scan can never change.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
					 BlockNumber curr_page)
{
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	Assert(BlockNumberIsValid(next_scan_page));

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_nextScanPage = next_scan_page;
	btscan->btps_lastCurrPage = curr_page;
	btscan->btps_pageStatus = BTPARALLEL_IDLE;
	SpinLockRelease(&btscan->btps_mutex);
	ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;
	bool		status_changed = false;

	Assert(!BTScanPosIsValid(so->currPos));

	/* Do nothing, for non-parallel scans */
	if (parallel_scan == NULL)
		return;

	/*
	 * Should not mark parallel scan done when there's still a pending
	 * primitive index scan
	 */
	if (so->needPrimScan)
		return;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * Mark the parallel scan as done, unless some other process did so
	 * already
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
	if (btscan->btps_pageStatus != BTPARALLEL_DONE)
	{
		btscan->btps_pageStatus = BTPARALLEL_DONE;
		status_changed = true;
	}
	SpinLockRelease(&btscan->btps_mutex);

	/* wake up all the workers associated with this parallel scan */
	if (status_changed)
		ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
 *
 * Caller passes the curr_page most recently passed to _bt_parallel_release
 * by its backend.  Caller successfully schedules the next primitive index scan
 * if the shared parallel state hasn't been seized since caller's backend last
 * advanced the scan.
 */
void
_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	Assert(so->numArrayKeys);

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	SpinLockAcquire(&btscan->btps_mutex);
	if (btscan->btps_lastCurrPage == curr_page &&
		btscan->btps_pageStatus == BTPARALLEL_IDLE)
	{
		btscan->btps_nextScanPage = InvalidBlockNumber;
		btscan->btps_lastCurrPage = InvalidBlockNumber;
		btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;

		/* Serialize scan's current array keys */
		for (int i = 0; i < so->numArrayKeys; i++)
		{
			BTArrayKeyInfo *array = &so->arrayKeys[i];

			btscan->btps_arrElems[i] = array->cur_elem;
		}
	}
	SpinLockRelease(&btscan->btps_mutex);
}
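
/*
 * If the test above fails, another backend has seized (and advanced) the
 * scan since curr_page was released, so the request is dropped and the
 * shared state is left untouched.
 */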

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/* Establish the vacuum cycle ID to use for this scan */
	/* The ENSURE stuff ensures we clean up shared memory on failure */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		cycleid = _bt_start_vacuum(rel);

		btvacuumscan(info, stats, callback, callback_state, cycleid);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);

	return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	BlockNumber num_delpages;

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If btbulkdelete was called, we need not do anything (we just maintain
	 * the information used within _bt_vacuum_needs_cleanup() by calling
	 * _bt_set_cleanup_info() below).
	 *
	 * If btbulkdelete was _not_ called, then we have a choice to make: we
	 * must decide whether or not a btvacuumscan() call is needed now (i.e.
	 * whether the ongoing VACUUM operation can entirely avoid a physical scan
	 * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
	 * now.
	 */
	if (stats == NULL)
	{
		/* Check if VACUUM operation can entirely avoid btvacuumscan() call */
		if (!_bt_vacuum_needs_cleanup(info->index))
			return NULL;

		/*
		 * Since we aren't going to actually delete any leaf items, there's no
		 * need to go through all the vacuum-cycle-ID pushups here.
		 *
		 * Posting list tuples are a source of inaccuracy for cleanup-only
		 * scans.  btvacuumscan() will assume that the number of index tuples
		 * from each page can be used as num_index_tuples, even though
		 * num_index_tuples is supposed to represent the number of TIDs in the
		 * index.  This naive approach can underestimate the number of tuples
		 * in the index significantly.
		 *
		 * We handle the problem by making num_index_tuples an estimate in
		 * cleanup-only case.
		 */
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		btvacuumscan(info, stats, NULL, NULL, 0);
		stats->estimated_count = true;
	}

	/*
	 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
	 *
	 * num_delpages is the number of deleted pages now in the index that were
	 * not safe to place in the FSM to be recycled just yet.  num_delpages is
	 * greater than 0 only when _bt_pagedel() actually deleted pages during
	 * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
	 * have failed to place any newly deleted pages in the FSM just moments
	 * ago.  (Actually, there are edge cases where recycling of the current
	 * VACUUM's newly deleted pages does not even become safe by the time the
	 * next VACUUM comes around.  See nbtree/README.)
	 */
	Assert(stats->pages_deleted >= stats->pages_free);
	num_delpages = stats->pages_deleted - stats->pages_free;
	_bt_set_cleanup_info(info->index, num_delpages);

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	BlockNumber scanblkno;
	bool		needLock;

	/*
	 * Reset fields that track information about the entire index now.  This
	 * avoids double-counting in the case where a single VACUUM command
	 * requires multiple scans of the index.
	 *
	 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
	 * since they track information about the VACUUM command, and so must last
	 * across each call to btvacuumscan().
	 *
	 * (Note that pages_free is treated as state about the whole index, not
	 * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
	 * calls are idempotent, and get repeated for the same deleted pages in
	 * some scenarios.  The point for us is to track the number of recyclable
	 * pages in the index at the end of the VACUUM command.)
	 */
	stats->num_pages = 0;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;
	stats->pages_free = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "_bt_pagedel",
												  ALLOCSET_DEFAULT_SIZES);

	/* Initialize vstate fields used by _bt_pendingfsm_finalize */
	vstate.bufsize = 0;
	vstate.maxbufsize = 0;
	vstate.pendingpages = NULL;
	vstate.npendingpages = 0;
	/* Consider applying _bt_pendingfsm_finalize optimization */
	_bt_pendingfsm_init(rel, &vstate, (callback == NULL));

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  Also, we need not worry if a page is added
	 * immediately after we look; the page splitting code already has
	 * write-lock on the left page before it adds a right page, so we must
	 * already have processed any tuples due to be moved into such a page.
	 *
	 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
	 * think the use of the extension lock is still required.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	scanblkno = BTREE_METAPAGE + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		if (info->report_progress)
			pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
										 num_pages);

		/* Quit if we've scanned the whole relation */
		if (scanblkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; scanblkno < num_pages; scanblkno++)
		{
			btvacuumpage(&vstate, scanblkno);
			if (info->report_progress)
				pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
											 scanblkno);
		}
	}

	/* Set statistics num_pages field to final size of index */
	stats->num_pages = num_pages;

	MemoryContextDelete(vstate.pagedelcontext);

	/*
	 * If there were any calls to _bt_pagedel() during scan of the index then
	 * see if any of the resulting pages can be placed in the FSM now.  When
	 * it's not safe we'll have to leave it up to a future VACUUM operation.
	 *
	 * Finally, if we placed any pages in the FSM (either just now or during
	 * the scan), forcibly update the upper-level FSM pages to ensure that
	 * searchers can find them.
	 */
	_bt_pendingfsm_finalize(rel, &vstate);
	if (stats->pages_free > 0)
		IndexFreeSpaceMapVacuum(rel);
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we must
 * backtrack to re-examine and VACUUM pages that were the scanblkno during
 * a previous call here.  This is how we handle page splits (that happened
 * after our cycleid was acquired) whose right half page happened to reuse
 * a block that we might have processed at some point before it was
 * recycled (i.e. before the page split).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	Relation	heaprel = info->heaprel;
	bool		attempt_pagedel;
	BlockNumber blkno,
				backtrack_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;

	blkno = scanblkno;

backtrack:

	attempt_pagedel = false;
	backtrack_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page.  We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	_bt_lockbuf(rel, buf, BT_READ);
	page = BufferGetPage(buf);
	opaque = NULL;
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = BTPageGetOpaque(page);
	}

	Assert(blkno <= scanblkno);
	if (blkno != scanblkno)
	{
		/*
		 * We're backtracking.
		 *
		 * We followed a right link to a sibling leaf page (a page that
		 * happens to be from a block located before scanblkno).  The only
		 * case we want to do anything with is a live leaf page having the
		 * current vacuum cycle ID.
		 *
		 * The page had better be in a state that's consistent with what we
		 * expect.  Check for conditions that imply corruption in passing.  It
		 * can't be half-dead because only an interrupted VACUUM process can
		 * leave pages in that state, so we'd definitely have dealt with it
		 * back when the page was the scanblkno page (half-dead pages are
		 * always marked fully deleted by _bt_pagedel(), barring corruption).
		 */
		if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
		{
			Assert(false);
			ereport(LOG,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
									 blkno, scanblkno, RelationGetRelationName(rel))));
			_bt_relbuf(rel, buf);
			return;
		}

		/*
		 * We may have already processed the page in an earlier call, when the
		 * page was scanblkno.  This happens when the leaf page split occurred
		 * after the scan began, but before the right sibling page became the
		 * scanblkno.
		 *
		 * Page may also have been deleted by current btvacuumpage() call,
		 * since _bt_pagedel() sometimes deletes the right sibling page of
		 * scanblkno in passing (it does so after we decided where to
		 * backtrack to).  We don't need to process this page as a deleted
		 * page a second time now (in fact, it would be wrong to count it as a
		 * deleted page in the bulk delete statistics a second time).
		 */
		if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
		{
			/* Done with current scanblkno (and all lower split pages) */
			_bt_relbuf(rel, buf);
			return;
		}
	}

	if (!opaque || BTPageIsRecyclable(page, heaprel))
	{
		/* Okay to recycle this page (which could be leaf or internal) */
		RecordFreeIndexPage(rel, blkno);
		stats->pages_deleted++;
		stats->pages_free++;
	}
	else if (P_ISDELETED(opaque))
	{
		/*
		 * Already deleted page (which could be leaf or internal).  Can't
		 * recycle yet.
		 */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
		attempt_pagedel = true;

		/*
		 * _bt_pagedel() will increment both pages_newly_deleted and
		 * pages_deleted stats in all cases (barring corruption)
		 */
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxIndexTuplesPerPage];
		int			ndeletable;
		BTVacuumPosting updatable[MaxIndexTuplesPerPage];
		int			nupdatable;
		OffsetNumber offnum,
					minoff,
					maxoff;
		int			nhtidsdead,
					nhtidslive;

		/*
		 * Trade in the initial read lock for a full cleanup lock on this
		 * page.  We must get such a lock on every leaf page over the course
		 * of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		_bt_upgradelockbufcleanup(rel, buf);

		/*
		 * Check whether we need to backtrack to earlier pages.  What we are
		 * concerned about is a page split that happened since we started the
		 * vacuum scan.  If the split moved tuples on the right half of the
		 * split (i.e. the tuples that sort high) to a block that we already
		 * passed over, then we might have missed the tuples.  We need to
		 * backtrack now.  (Must do this before possibly clearing btpo_cycleid
		 * or deleting scanblkno page below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < scanblkno)
			backtrack_to = opaque->btpo_next;

		ndeletable = 0;
		nupdatable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		nhtidsdead = 0;
		nhtidslive = 0;
		if (callback)
		{
			/* btbulkdelete callback tells us what to delete (or update) */
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));

				Assert(!BTreeTupleIsPivot(itup));
				if (!BTreeTupleIsPosting(itup))
				{
					/* Regular tuple, standard table TID representation */
					if (callback(&itup->t_tid, callback_state))
					{
						deletable[ndeletable++] = offnum;
						nhtidsdead++;
					}
					else
						nhtidslive++;
				}
				else
				{
					BTVacuumPosting vacposting;
					int			nremaining;

					/* Posting list tuple */
					vacposting = btreevacuumposting(vstate, itup, offnum,
													&nremaining);
					if (vacposting == NULL)
					{
						/*
						 * All table TIDs from the posting tuple remain, so no
						 * delete or update required
						 */
						Assert(nremaining == BTreeTupleGetNPosting(itup));
					}
					else if (nremaining > 0)
					{
						/*
						 * Store metadata about posting list tuple in
						 * updatable array for entire page.  Existing tuple
						 * will be updated during the later call to
						 * _bt_delitems_vacuum().
						 */
						Assert(nremaining < BTreeTupleGetNPosting(itup));
						updatable[nupdatable++] = vacposting;
						nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
					}
					else
					{
						/*
						 * All table TIDs from the posting list must be
						 * deleted.  We'll delete the index tuple completely
						 * (no update required).
						 */
						Assert(nremaining == 0);
						deletable[ndeletable++] = offnum;
						nhtidsdead += BTreeTupleGetNPosting(itup);
						pfree(vacposting);
					}

					nhtidslive += nremaining;
				}
			}
		}

		/*
		 * Apply any needed deletes or updates.  We issue just one
		 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0 || nupdatable > 0)
		{
			Assert(nhtidsdead >= ndeletable + nupdatable);
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
								nupdatable);

			stats->tuples_removed += nhtidsdead;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);

			/* can't leak memory here */
			for (int i = 0; i < nupdatable; i++)
				pfree(updatable[i]);
		}
		else
		{
			/*
			 * If the leaf page has been split during this vacuum cycle, it
			 * seems worth expending a write to clear btpo_cycleid even if we
			 * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
			 * takes care of this.)  This ensures we won't process the page
			 * again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			Assert(nhtidsdead == 0);
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If the leaf page is now empty, try to delete it; else count the
		 * live tuples (live table TIDs in posting lists are counted as
		 * separate live tuples).  We don't delete when backtracking, though,
		 * since that would require teaching _bt_pagedel() about backtracking
		 * (doesn't seem worth adding more complexity to deal with that).
		 *
		 * We don't count the number of live TIDs during cleanup-only calls to
		 * btvacuumscan (i.e. when callback is not set).  We count the number
		 * of index tuples directly instead.  This avoids the expense of
		 * directly examining all of the tuples on each page.  VACUUM will
		 * treat num_index_tuples as an estimate in cleanup-only case, so it
		 * doesn't matter that this underestimates num_index_tuples
		 * significantly in some cases.
		 */
		if (minoff > maxoff)
			attempt_pagedel = (blkno == scanblkno);
		else if (callback)
			stats->num_index_tuples += nhtidslive;
		else
			stats->num_index_tuples += maxoff - minoff + 1;

		Assert(!attempt_pagedel || nhtidslive == 0);
	}

	if (attempt_pagedel)
	{
		MemoryContext oldcontext;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		/*
		 * _bt_pagedel maintains the bulk delete stats on our behalf;
		 * pages_newly_deleted and pages_deleted are likely to be incremented
		 * during call
		 */
		Assert(blkno == scanblkno);
		_bt_pagedel(rel, buf, vstate);

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	if (backtrack_to != P_NONE)
	{
		blkno = backtrack_to;
		goto backtrack;
	}
}

/*
 * btreevacuumposting --- determine TIDs still needed in posting list
 *
 * Returns metadata describing how to build replacement tuple without the TIDs
 * that VACUUM needs to delete.  Returned value is NULL in the common case
 * where no changes are needed to caller's posting list tuple (we avoid
 * allocating memory here as an optimization).
 *
 * The number of TIDs that should remain in the posting list tuple is set for
 * caller in *nremaining.
 */
static BTVacuumPosting
btreevacuumposting(BTVacState *vstate, IndexTuple posting,
				   OffsetNumber updatedoffset, int *nremaining)
{
	int			live = 0;
	int			nitem = BTreeTupleGetNPosting(posting);
	ItemPointer items = BTreeTupleGetPosting(posting);
	BTVacuumPosting vacposting = NULL;

	for (int i = 0; i < nitem; i++)
	{
		if (!vstate->callback(items + i, vstate->callback_state))
		{
			/* Live table TID */
			live++;
		}
		else if (vacposting == NULL)
		{
			/*
			 * First dead table TID encountered.
			 *
			 * It's now clear that we need to delete one or more dead table
			 * TIDs, so start maintaining metadata describing how to update
			 * existing posting list tuple.
			 */
			vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
								nitem * sizeof(uint16));

			vacposting->itup = posting;
			vacposting->updatedoffset = updatedoffset;
			vacposting->ndeletedtids = 0;
			vacposting->deletetids[vacposting->ndeletedtids++] = i;
		}
		else
		{
			/* Second or subsequent dead table TID */
			vacposting->deletetids[vacposting->ndeletedtids++] = i;
		}
	}

	*nremaining = live;
	return vacposting;
}
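
/*
 * For example, given a posting list tuple holding 3 TIDs where the callback
 * reports only the second TID as dead, the result has ndeletedtids = 1 and
 * deletetids[0] = 1, with *nremaining set to 2.  When no TID is dead, NULL
 * is returned and *nremaining is set to 3.
 */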

/*
 *	btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
	return true;
}

/*
 *	btgettreeheight() -- Compute tree height for use by btcostestimate().
 */
int
btgettreeheight(Relation rel)
{
	return _bt_getrootheight(rel);
}