/*
 * brin_revmap.c
 *		Range map for BRIN indexes
 *
 * The range map (revmap) is a translation structure for BRIN indexes: for each
 * page range there is one summary tuple, and its location is tracked by the
 * revmap.  Whenever a new tuple is inserted into a table that violates the
 * previously recorded summary values, a new tuple is inserted into the index
 * and the revmap is updated to point to it.
 *
 * The revmap is stored in the first pages of the index, immediately following
 * the metapage.  When the revmap needs to be expanded, all tuples on the
 * regular BRIN page at that block (if any) are moved out of the way.
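 *
 * Illustrative physical layout (a sketch, not normative):
 *
 *		block 0                     metapage (BRIN_METAPAGE_BLKNO)
 *		blocks 1..lastRevmapPage    revmap pages (arrays of TIDs)
 *		following blocks            regular pages holding BRIN summary tuples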
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/brin/brin_revmap.c
 */
#include "postgres.h"

#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_revmap.h"
#include "access/brin_tuple.h"
#include "access/brin_xlog.h"
#include "access/rmgr.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"


/*
 * In revmap pages, each item stores an ItemPointerData.  These defines let one
 * find the logical revmap page number and index number of the revmap item for
 * the given heap block number.
 */
#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \
	((heapBlk / pagesPerRange) / REVMAP_PAGE_MAXITEMS)
#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \
	((heapBlk / pagesPerRange) % REVMAP_PAGE_MAXITEMS)
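
/*
 * Worked example (numbers are illustrative, not from this file): with
 * pagesPerRange = 128, heap block 300000 belongs to range 300000 / 128 =
 * 2343.  If REVMAP_PAGE_MAXITEMS were 1360 (its value depends on BLCKSZ),
 * that range's TID would sit at index 2343 % 1360 = 983 of logical revmap
 * page 2343 / 1360 = 1, i.e. physical block 2 once the metapage is skipped.
 */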

struct BrinRevmap
{
	Relation	rm_irel;
	BlockNumber rm_pagesPerRange;
	BlockNumber rm_lastRevmapPage;	/* cached from the metapage */
	Buffer		rm_metaBuf;
	Buffer		rm_currBuf;
};

/* typedef appears in brin_revmap.h */

static BlockNumber revmap_get_blkno(BrinRevmap *revmap,
									BlockNumber heapBlk);
static Buffer revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk);
static BlockNumber revmap_extend_and_get_blkno(BrinRevmap *revmap,
											   BlockNumber heapBlk);
static void revmap_physical_extend(BrinRevmap *revmap);

/*
 * Initialize an access object for a range map.  This must be freed by
 * brinRevmapTerminate when caller is done with it.
 */
BrinRevmap *
brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
{
	BrinRevmap *revmap;
	Buffer		meta;
	BrinMetaPageData *metadata;
	Page		page;

	meta = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO);
	LockBuffer(meta, BUFFER_LOCK_SHARE);
	page = BufferGetPage(meta);
	metadata = (BrinMetaPageData *) PageGetContents(page);

	revmap = palloc(sizeof(BrinRevmap));
	revmap->rm_irel = idxrel;
	revmap->rm_pagesPerRange = metadata->pagesPerRange;
	revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
	revmap->rm_metaBuf = meta;
	revmap->rm_currBuf = InvalidBuffer;

	*pagesPerRange = metadata->pagesPerRange;

	LockBuffer(meta, BUFFER_LOCK_UNLOCK);

	return revmap;
}
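
/*
 * Typical lifecycle of a revmap access object (illustrative sketch only;
 * the real callers live elsewhere in the BRIN code, e.g. brin.c):
 *
 *		BlockNumber pagesPerRange;
 *		BrinRevmap *revmap = brinRevmapInitialize(idxrel, &pagesPerRange);
 *
 *		... use brinGetTupleForHeapBlock() etc. ...
 *
 *		brinRevmapTerminate(revmap);
 */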

/*
 * Release resources associated with a revmap access object.
 */
void
brinRevmapTerminate(BrinRevmap *revmap)
{
	ReleaseBuffer(revmap->rm_metaBuf);
	if (revmap->rm_currBuf != InvalidBuffer)
		ReleaseBuffer(revmap->rm_currBuf);
	pfree(revmap);
}

/*
 * Extend the revmap to cover the given heap block number.
 */
void
brinRevmapExtend(BrinRevmap *revmap, BlockNumber heapBlk)
{
	BlockNumber mapBlk PG_USED_FOR_ASSERTS_ONLY;

	mapBlk = revmap_extend_and_get_blkno(revmap, heapBlk);

	/* Ensure the buffer we got is in the expected range */
	Assert(mapBlk != InvalidBlockNumber &&
		   mapBlk != BRIN_METAPAGE_BLKNO &&
		   mapBlk <= revmap->rm_lastRevmapPage);
}

/*
 * Prepare to insert an entry into the revmap; the revmap buffer in which the
 * entry is to reside is locked and returned.  Most callers should call
 * brinRevmapExtend beforehand, as this routine does not extend the revmap if
 * it's not long enough.
 *
 * The returned buffer is also recorded in the revmap struct; finishing that
 * releases the buffer, therefore the caller needn't do it explicitly.
 */
Buffer
brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk)
{
	Buffer		rmBuf;

	rmBuf = revmap_get_buffer(revmap, heapBlk);
	LockBuffer(rmBuf, BUFFER_LOCK_EXCLUSIVE);

	return rmBuf;
}

/*
 * In the given revmap buffer (locked appropriately by caller), which is used
 * in a BRIN index of pagesPerRange pages per range, set the element
 * corresponding to heap block number heapBlk to the given TID.
 *
 * Once the operation is complete, the caller must update the LSN on the
 * returned buffer.
 *
 * This is used both in regular operation and during WAL replay.
 */
void
brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
						BlockNumber heapBlk, ItemPointerData tid)
{
	RevmapContents *contents;
	ItemPointerData *iptr;
	Page		page;

	/* The correct page should already be pinned and locked */
	page = BufferGetPage(buf);
	contents = (RevmapContents *) PageGetContents(page);
	iptr = (ItemPointerData *) contents->rm_tids;
	iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);

	if (ItemPointerIsValid(&tid))
		ItemPointerSet(iptr,
					   ItemPointerGetBlockNumber(&tid),
					   ItemPointerGetOffsetNumber(&tid));
	else
		ItemPointerSetInvalid(iptr);
}

/*
 * Fetch the BrinTuple for a given heap block.
 *
 * The buffer containing the tuple is locked, and returned in *buf.  The
 * returned tuple points to the shared buffer and must not be freed; if caller
 * wants to use it after releasing the buffer lock, it must create its own
 * palloc'ed copy.  As an optimization, the caller can pass a pinned buffer
 * *buf on entry, which will avoid a pin-unpin cycle when the next tuple is on
 * the same page as a previous one.
 *
 * If no tuple is found for the given heap range, returns NULL.  In that case,
 * *buf might still be updated (and pin must be released by caller), but it's
 * not locked.
 *
 * The output tuple offset within the buffer is returned in *off, and its size
 * is returned in *size.
 */
BrinTuple *
brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
						 Buffer *buf, OffsetNumber *off, Size *size, int mode)
{
	Relation	idxRel = revmap->rm_irel;
	BlockNumber mapBlk;
	RevmapContents *contents;
	ItemPointerData *iptr;
	BlockNumber blk;
	Page		page;
	ItemId		lp;
	BrinTuple  *tup;
	ItemPointerData previptr;

	/* normalize the heap block number to be the first page in the range */
	heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;

	/*
	 * Compute the revmap page number we need.  If Invalid is returned (i.e.,
	 * the revmap page hasn't been created yet), the requested page range is
	 * not summarized.
	 */
	mapBlk = revmap_get_blkno(revmap, heapBlk);
	if (mapBlk == InvalidBlockNumber)
	{
		*off = InvalidOffsetNumber;
		return NULL;
	}

	ItemPointerSetInvalid(&previptr);
	for (;;)
	{
		CHECK_FOR_INTERRUPTS();

		if (revmap->rm_currBuf == InvalidBuffer ||
			BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk)
		{
			if (revmap->rm_currBuf != InvalidBuffer)
				ReleaseBuffer(revmap->rm_currBuf);

			Assert(mapBlk != InvalidBlockNumber);
			revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
		}

		LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE);

		contents = (RevmapContents *)
			PageGetContents(BufferGetPage(revmap->rm_currBuf));
		iptr = contents->rm_tids;
		iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);

		if (!ItemPointerIsValid(iptr))
		{
			LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
			return NULL;
		}

		/*
		 * Check the TID we got in a previous iteration, if any, and save the
		 * current TID we got from the revmap; if we loop, we can sanity-check
		 * that the next one we get is different.  Otherwise we might be stuck
		 * looping forever if the revmap is somehow badly broken.
		 */
		if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("corrupted BRIN index: inconsistent range map")));
		previptr = *iptr;

		blk = ItemPointerGetBlockNumber(iptr);
		*off = ItemPointerGetOffsetNumber(iptr);

		LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);

		/* Ok, got a pointer to where the BrinTuple should be. Fetch it. */
		if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk)
		{
			if (BufferIsValid(*buf))
				ReleaseBuffer(*buf);
			*buf = ReadBuffer(idxRel, blk);
		}
		LockBuffer(*buf, mode);
		page = BufferGetPage(*buf);

		/* If we land on a revmap page, start over */
		if (BRIN_IS_REGULAR_PAGE(page))
		{
			/*
			 * If the offset number is greater than what's in the page, it's
			 * possible that the range was desummarized concurrently.  Just
			 * return NULL to handle that case.
			 */
			if (*off > PageGetMaxOffsetNumber(page))
			{
				LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
				return NULL;
			}

			lp = PageGetItemId(page, *off);
			if (ItemIdIsUsed(lp))
			{
				tup = (BrinTuple *) PageGetItem(page, lp);

				if (tup->bt_blkno == heapBlk)
				{
					*size = ItemIdGetLength(lp);
					/* found it! */
					return tup;
				}
			}
		}

		/*
		 * No luck.  Assume that the revmap was updated concurrently.
		 */
		LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
	}
	/* not reached, but keep compiler quiet */
	return NULL;
}
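
/*
 * Caller-side sketch of the pin-reuse pattern described above (illustrative
 * only; the real scan code lives in brin.c):
 *
 *		Buffer		buf = InvalidBuffer;
 *		OffsetNumber off;
 *		Size		size;
 *		BrinTuple  *tup;
 *
 *		tup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, &size,
 *									   BUFFER_LOCK_SHARE);
 *		if (tup)
 *		{
 *			BrinTuple  *copy = brin_copy_tuple(tup, size, NULL, NULL);
 *
 *			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *			... use copy after the lock is released ...
 *		}
 *		if (BufferIsValid(buf))
 *			ReleaseBuffer(buf);
 */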

/*
 * Delete an index tuple, marking a page range as unsummarized.
 *
 * Index must be locked in ShareUpdateExclusiveLock mode.
 *
 * Return false if caller should retry.
 */
bool
brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
{
	BrinRevmap *revmap;
	BlockNumber pagesPerRange;
	RevmapContents *contents;
	ItemPointerData *iptr;
	ItemPointerData invalidIptr;
	BlockNumber revmapBlk;
	Buffer		revmapBuf;
	Buffer		regBuf;
	Page		revmapPg;
	Page		regPg;
	OffsetNumber revmapOffset;
	OffsetNumber regOffset;
	ItemId		lp;

	revmap = brinRevmapInitialize(idxrel, &pagesPerRange);

	revmapBlk = revmap_get_blkno(revmap, heapBlk);
	if (!BlockNumberIsValid(revmapBlk))
	{
		/* revmap page doesn't exist: range not summarized, we're done */
		brinRevmapTerminate(revmap);
		return true;
	}

	/* Lock the revmap page, obtain the index tuple pointer from it */
	revmapBuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
	revmapPg = BufferGetPage(revmapBuf);
	revmapOffset = HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);

	contents = (RevmapContents *) PageGetContents(revmapPg);
	iptr = contents->rm_tids;
	iptr += revmapOffset;

	if (!ItemPointerIsValid(iptr))
	{
		/* no index tuple: range not summarized, we're done */
		LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
		brinRevmapTerminate(revmap);
		return true;
	}

	regBuf = ReadBuffer(idxrel, ItemPointerGetBlockNumber(iptr));
	LockBuffer(regBuf, BUFFER_LOCK_EXCLUSIVE);
	regPg = BufferGetPage(regBuf);

	/* if this is no longer a regular page, tell caller to start over */
	if (!BRIN_IS_REGULAR_PAGE(regPg))
	{
		LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(regBuf, BUFFER_LOCK_UNLOCK);
		brinRevmapTerminate(revmap);
		return false;
	}

	regOffset = ItemPointerGetOffsetNumber(iptr);
	if (regOffset > PageGetMaxOffsetNumber(regPg))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("corrupted BRIN index: inconsistent range map")));

	lp = PageGetItemId(regPg, regOffset);
	if (!ItemIdIsUsed(lp))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("corrupted BRIN index: inconsistent range map")));

	/*
	 * Placeholder tuples only appear during unfinished summarization, and we
	 * hold ShareUpdateExclusiveLock, so this function cannot run concurrently
	 * with that.  So any placeholder tuples that exist are leftovers from a
	 * crashed or aborted summarization; remove them silently.
	 */

	START_CRIT_SECTION();

	ItemPointerSetInvalid(&invalidIptr);
	brinSetHeapBlockItemptr(revmapBuf, revmap->rm_pagesPerRange, heapBlk,
							invalidIptr);
	PageIndexTupleDeleteNoCompact(regPg, regOffset);
	/* XXX record free space in FSM? */

	MarkBufferDirty(regBuf);
	MarkBufferDirty(revmapBuf);

	if (RelationNeedsWAL(idxrel))
	{
		xl_brin_desummarize xlrec;
		XLogRecPtr	recptr;

		xlrec.pagesPerRange = revmap->rm_pagesPerRange;
		xlrec.heapBlk = heapBlk;
		xlrec.regOffset = regOffset;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfBrinDesummarize);
		XLogRegisterBuffer(0, revmapBuf, 0);
		XLogRegisterBuffer(1, regBuf, REGBUF_STANDARD);
		recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_DESUMMARIZE);
		PageSetLSN(revmapPg, recptr);
		PageSetLSN(regPg, recptr);
	}

	END_CRIT_SECTION();

	UnlockReleaseBuffer(regBuf);
	LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
	brinRevmapTerminate(revmap);

	return true;
}
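
/*
 * A false return means the caller should retry; sketch of the expected
 * calling pattern (the real caller is brin_desummarize_range() in brin.c):
 *
 *		while (!brinRevmapDesummarizeRange(idxrel, heapBlk))
 *			CHECK_FOR_INTERRUPTS();
 */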

/*
 * Given a heap block number, find the corresponding physical revmap block
 * number and return it.  If the revmap page hasn't been allocated yet, return
 * InvalidBlockNumber.
 */
static BlockNumber
revmap_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
{
	BlockNumber targetblk;

	/* obtain revmap block number, skip 1 for metapage block */
	targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;

	/* Normal case: the revmap page is already allocated */
	if (targetblk <= revmap->rm_lastRevmapPage)
		return targetblk;

	return InvalidBlockNumber;
}

/*
 * Obtain and return a buffer containing the revmap page for the given heap
 * page.  The revmap must have been previously extended to cover that page.
 * The returned buffer is also recorded in the revmap struct; finishing that
 * releases the buffer, therefore the caller needn't do it explicitly.
 */
static Buffer
revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk)
{
	BlockNumber mapBlk;

	/* Translate the heap block number to physical index location. */
	mapBlk = revmap_get_blkno(revmap, heapBlk);

	if (mapBlk == InvalidBlockNumber)
		elog(ERROR, "revmap does not cover heap block %u", heapBlk);

	/* Ensure the buffer we got is in the expected range */
	Assert(mapBlk != BRIN_METAPAGE_BLKNO &&
		   mapBlk <= revmap->rm_lastRevmapPage);

	/*
	 * Obtain the buffer from which we need to read.  If we already have the
	 * correct buffer in our access struct, use that; otherwise, release that
	 * (if valid) and read the one we need.
	 */
	if (revmap->rm_currBuf == InvalidBuffer ||
		mapBlk != BufferGetBlockNumber(revmap->rm_currBuf))
	{
		if (revmap->rm_currBuf != InvalidBuffer)
			ReleaseBuffer(revmap->rm_currBuf);

		revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
	}

	return revmap->rm_currBuf;
}

/*
 * Given a heap block number, find the corresponding physical revmap block
 * number and return it.  If the revmap page hasn't been allocated yet, extend
 * the revmap until it is.
 */
static BlockNumber
revmap_extend_and_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
{
	BlockNumber targetblk;

	/* obtain revmap block number, skip 1 for metapage block */
	targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;

	/* Extend the revmap, if necessary */
	while (targetblk > revmap->rm_lastRevmapPage)
	{
		CHECK_FOR_INTERRUPTS();
		revmap_physical_extend(revmap);
	}

	return targetblk;
}

/*
 * Try to extend the revmap by one page.  This might not happen for a number of
 * reasons; caller is expected to retry until the expected outcome is obtained.
 */
static void
revmap_physical_extend(BrinRevmap *revmap)
{
	Buffer		buf;
	Page		page;
	Page		metapage;
	BrinMetaPageData *metadata;
	BlockNumber mapBlk;
	BlockNumber nblocks;
	Relation	irel = revmap->rm_irel;

	/*
	 * Lock the metapage.  This locks out concurrent extensions of the revmap,
	 * but note that we still need to grab the relation extension lock because
	 * another backend can extend the index with regular BRIN pages.
	 */
	LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE);
	metapage = BufferGetPage(revmap->rm_metaBuf);
	metadata = (BrinMetaPageData *) PageGetContents(metapage);

	/*
	 * Check that our cached lastRevmapPage value was up-to-date; if it
	 * wasn't, update the cached copy and have caller start over.
	 */
	if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage)
	{
		revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
		LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
		return;
	}
	mapBlk = metadata->lastRevmapPage + 1;

	nblocks = RelationGetNumberOfBlocks(irel);
	if (mapBlk < nblocks)
	{
		buf = ReadBuffer(irel, mapBlk);
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);
	}
	else
	{
		buf = ExtendBufferedRel(BMR_REL(irel), MAIN_FORKNUM, NULL,
								EB_LOCK_FIRST);
		if (BufferGetBlockNumber(buf) != mapBlk)
		{
			/*
			 * Very rare corner case: somebody extended the relation
			 * concurrently after we read its length.  If this happens, give
			 * up and have caller start over.  We will have to evacuate that
			 * page from under whoever is using it.
			 */
			LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
			UnlockReleaseBuffer(buf);
			return;
		}
		page = BufferGetPage(buf);
	}

	/* Check that it's a regular block (or an empty page) */
	if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u",
						BrinPageType(page),
						RelationGetRelationName(irel),
						BufferGetBlockNumber(buf))));

	/* If the page is in use, evacuate it and restart */
	if (brin_start_evacuating_page(irel, buf))
	{
		LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
		brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf);

		/* have caller start over */
		return;
	}

	/*
	 * Ok, we have now locked the metapage and the target block.  Re-initialize
	 * the target block as a revmap page, and update the metapage.
	 */
	START_CRIT_SECTION();

	/* the rm_tids array is initialized to all invalid by PageInit */
	brin_page_init(page, BRIN_PAGETYPE_REVMAP);
	MarkBufferDirty(buf);

	metadata->lastRevmapPage = mapBlk;

	/*
	 * Set pd_lower just past the end of the metadata.  This is essential,
	 * because without doing so, metadata will be lost if xlog.c compresses
	 * the page.  (We must do this here because pre-v11 versions of PG did not
	 * set the metapage's pd_lower correctly, so a pg_upgraded index might
	 * contain the wrong value.)
	 */
	((PageHeader) metapage)->pd_lower =
		((char *) metadata + sizeof(BrinMetaPageData)) - (char *) metapage;

	MarkBufferDirty(revmap->rm_metaBuf);

	if (RelationNeedsWAL(revmap->rm_irel))
	{
		xl_brin_revmap_extend xlrec;
		XLogRecPtr	recptr;

		xlrec.targetBlk = mapBlk;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfBrinRevmapExtend);
		XLogRegisterBuffer(0, revmap->rm_metaBuf, REGBUF_STANDARD);

		XLogRegisterBuffer(1, buf, REGBUF_WILL_INIT);

		recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND);
		PageSetLSN(metapage, recptr);
		PageSetLSN(page, recptr);
	}

	END_CRIT_SECTION();

	LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);

	UnlockReleaseBuffer(buf);
}