src/backend/access/heap/visibilitymap.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * visibilitymap.c
   4  *        bitmap for tracking visibility of heap tuples
   5  *
   6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/access/heap/visibilitymap.c
  12  *
  13  * INTERFACE ROUTINES
  14  *              visibilitymap_clear  - clear bits for one page in the visibility map
  15  *              visibilitymap_pin        - pin a map page for setting a bit
  16  *              visibilitymap_pin_ok - check whether correct map page is already pinned
  17  *              visibilitymap_set        - set a bit in a previously pinned page
  18  *              visibilitymap_get_status - get status of bits
  19  *              visibilitymap_count  - count number of bits set in visibility map
  20  *              visibilitymap_prepare_truncate -
  21  *                      prepare for truncation of the visibility map
  22  *
  23  * NOTES
  24  *
  25  * The visibility map is a bitmap with two bits (all-visible and all-frozen)
  26  * per heap page. A set all-visible bit means that all tuples on the page are
  27  * known visible to all transactions, and therefore the page doesn't need to
  28  * be vacuumed. A set all-frozen bit means that all tuples on the page are
  29  * completely frozen, and therefore the page doesn't need to be vacuumed even
  30  * if whole table scanning vacuum is required (e.g. anti-wraparound vacuum).
  31  * The all-frozen bit must be set only when the page is already all-visible.
  32  *
  33  * The map is conservative in the sense that we make sure that whenever a bit
  34  * is set, we know the condition is true, but if a bit is not set, it might or
  35  * might not be true.
  36  *
  37  * Clearing visibility map bits is not separately WAL-logged.  The callers
  38  * must make sure that whenever a bit is cleared, the bit is cleared on WAL
  39  * replay of the updating operation as well.
  40  *
  41  * When we *set* a visibility map during VACUUM, we must write WAL.  This may
  42  * seem counterintuitive, since the bit is basically a hint: if it is clear,
  43  * it may still be the case that every tuple on the page is visible to all
  44  * transactions; we just don't know that for certain.  The difficulty is that
  45  * there are two bits which are typically set together: the PD_ALL_VISIBLE bit
  46  * on the page itself, and the visibility map bit.  If a crash occurs after the
  47  * visibility map page makes it to disk and before the updated heap page makes
  48  * it to disk, redo must set the bit on the heap page.  Otherwise, the next
  49  * insert, update, or delete on the heap page will fail to realize that the
  50  * visibility map bit must be cleared, possibly causing index-only scans to
  51  * return wrong answers.
  52  *
  53  * VACUUM will normally skip pages for which the visibility map bit is set;
  54  * such pages can't contain any dead tuples and therefore don't need vacuuming.
  55  *
  56  * LOCKING
  57  *
  58  * In heapam.c, whenever a page is modified so that not all tuples on the
  59  * page are visible to everyone anymore, the corresponding bit in the
  60  * visibility map is cleared. In order to be crash-safe, we need to do this
  61  * while still holding a lock on the heap page and in the same critical
  62  * section that logs the page modification. However, we don't want to hold
  63  * the buffer lock over any I/O that may be required to read in the visibility
  64  * map page.  To avoid this, we examine the heap page before locking it;
  65  * if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map
  66  * bit.  Then, we lock the buffer.  But this creates a race condition: there
  67  * is a possibility that in the time it takes to lock the buffer, the
  68  * PD_ALL_VISIBLE bit gets set.  If that happens, we have to unlock the
  69  * buffer, pin the visibility map page, and relock the buffer.  This shouldn't
  70  * happen often, because only VACUUM currently sets visibility map bits,
  71  * and the race will only occur if VACUUM processes a given page at almost
  72  * exactly the same time that someone tries to further modify it.
  73  *
  74  * To set a bit, you need to hold a lock on the heap page. That prevents
  75  * the race condition where VACUUM sees that all tuples on the page are
  76  * visible to everyone, but another backend modifies the page before VACUUM
  77  * sets the bit in the visibility map.
  78  *
  79  * When a bit is set, the LSN of the visibility map page is updated to make
  80  * sure that the visibility map update doesn't get written to disk before the
  81  * WAL record of the changes that made it possible to set the bit is flushed.
  82  * But when a bit is cleared, we don't have to do that because it's always
  83  * safe to clear a bit in the map from correctness point of view.
  84  *
  85  *-------------------------------------------------------------------------
  86  */
  87 #include "postgres.h"
  88
  89 #include "access/heapam_xlog.h"
  90 #include "access/visibilitymap.h"
  91 #include "access/xloginsert.h"
  92 #include "access/xlogutils.h"
  93 #include "miscadmin.h"
  94 #include "port/pg_bitutils.h"
  95 #include "storage/bufmgr.h"
  96 #include "storage/smgr.h"
  97 #include "utils/inval.h"
  98 #include "utils/rel.h"
  99
 100
 101 /*#define TRACE_VISIBILITYMAP */
 102
 103 /*
 104  * Size of the bitmap on each visibility map page, in bytes. There's no
 105  * extra headers, so the whole page minus the standard page header is
 106  * used for the bitmap.
 107  */
 108 #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
 109
 110 /* Number of heap blocks we can represent in one byte */
 111 #define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
 112
 113 /* Number of heap blocks we can represent in one visibility map page. */
 114 #define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
 115
 116 /* Mapping from heap block number to the right bit in the visibility map */
 117 #define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
 118 #define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
 119 #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
 120
 121 /* Masks for counting subsets of bits in the visibility map. */
 122 #define VISIBLE_MASK8   (0x55)  /* The lower bit of each bit pair */
 123 #define FROZEN_MASK8    (0xaa)  /* The upper bit of each bit pair */
 124
 125 /* prototypes for internal routines */
 126 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
 127 static Buffer vm_extend(Relation rel, BlockNumber vm_nblocks);
 128
 129
 130 /*
 131  *      visibilitymap_clear - clear specified bits for one page in visibility map
 132  *
 133  * You must pass a buffer containing the correct map page to this function.
 134  * Call visibilitymap_pin first to pin the right one. This function doesn't do
 135  * any I/O.  Returns true if any bits have been cleared and false otherwise.
 136  */
 137 bool
 138 visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags)
 139 {
 140         BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 141         int                     mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
 142         int                     mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
 143         uint8           mask = flags << mapOffset;
 144         char       *map;
 145         bool            cleared = false;
 146
 147         /* Must never clear all_visible bit while leaving all_frozen bit set */
 148         Assert(flags & VISIBILITYMAP_VALID_BITS);
 149         Assert(flags != VISIBILITYMAP_ALL_VISIBLE);
 150
 151 #ifdef TRACE_VISIBILITYMAP
 152         elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
 153 #endif
 154
 155         if (!BufferIsValid(vmbuf) || BufferGetBlockNumber(vmbuf) != mapBlock)
 156                 elog(ERROR, "wrong buffer passed to visibilitymap_clear");
 157
 158         LockBuffer(vmbuf, BUFFER_LOCK_EXCLUSIVE);
 159         map = PageGetContents(BufferGetPage(vmbuf));
 160
 161         if (map[mapByte] & mask)
 162         {
 163                 map[mapByte] &= ~mask;
 164
 165                 MarkBufferDirty(vmbuf);
 166                 cleared = true;
 167         }
 168
 169         LockBuffer(vmbuf, BUFFER_LOCK_UNLOCK);
 170
 171         return cleared;
 172 }
 173
 174 /*
 175  *      visibilitymap_pin - pin a map page for setting a bit
 176  *
 177  * Setting a bit in the visibility map is a two-phase operation. First, call
 178  * visibilitymap_pin, to pin the visibility map page containing the bit for
 179  * the heap page. Because that can require I/O to read the map page, you
 180  * shouldn't hold a lock on the heap page while doing that. Then, call
 181  * visibilitymap_set to actually set the bit.
 182  *
 183  * On entry, *vmbuf should be InvalidBuffer or a valid buffer returned by
 184  * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 185  * relation. On return, *vmbuf is a valid buffer with the map page containing
 186  * the bit for heapBlk.
 187  *
 188  * If the page doesn't exist in the map file yet, it is extended.
 189  */
 190 void
 191 visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 192 {
 193         BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 194
 195         /* Reuse the old pinned buffer if possible */
 196         if (BufferIsValid(*vmbuf))
 197         {
 198                 if (BufferGetBlockNumber(*vmbuf) == mapBlock)
 199                         return;
 200
 201                 ReleaseBuffer(*vmbuf);
 202         }
 203         *vmbuf = vm_readbuf(rel, mapBlock, true);
 204 }
 205
 206 /*
 207  *      visibilitymap_pin_ok - do we already have the correct page pinned?
 208  *
 209  * On entry, vmbuf should be InvalidBuffer or a valid buffer returned by
 210  * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 211  * relation.  The return value indicates whether the buffer covers the
 212  * given heapBlk.
 213  */
 214 bool
 215 visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf)
 216 {
 217         BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 218
 219         return BufferIsValid(vmbuf) && BufferGetBlockNumber(vmbuf) == mapBlock;
 220 }
 221
 222 /*
 223  *      visibilitymap_set - set bit(s) on a previously pinned page
 224  *
 225  * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
 226  * or InvalidXLogRecPtr in normal running.  The VM page LSN is advanced to the
 227  * one provided; in normal running, we generate a new XLOG record and set the
 228  * page LSN to that value (though the heap page's LSN may *not* be updated;
 229  * see below).  cutoff_xid is the largest xmin on the page being marked
 230  * all-visible; it is needed for Hot Standby, and can be InvalidTransactionId
 231  * if the page contains no tuples.  It can also be set to InvalidTransactionId
 232  * when a page that is already all-visible is being marked all-frozen.
 233  *
 234  * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
 235  * this function. Except in recovery, caller should also pass the heap
 236  * buffer. When checksums are enabled and we're not in recovery, we must add
 237  * the heap buffer to the WAL chain to protect it from being torn.
 238  *
 239  * You must pass a buffer containing the correct map page to this function.
 240  * Call visibilitymap_pin first to pin the right one. This function doesn't do
 241  * any I/O.
 242  */
 243 void
 244 visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 245                                   XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
 246                                   uint8 flags)
 247 {
 248         BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 249         uint32          mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
 250         uint8           mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
 251         Page            page;
 252         uint8      *map;
 253
 254 #ifdef TRACE_VISIBILITYMAP
 255         elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
 256 #endif
 257
 258         Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
 259         Assert(InRecovery || PageIsAllVisible((Page) BufferGetPage(heapBuf)));
 260         Assert((flags & VISIBILITYMAP_VALID_BITS) == flags);
 261
 262         /* Must never set all_frozen bit without also setting all_visible bit */
 263         Assert(flags != VISIBILITYMAP_ALL_FROZEN);
 264
 265         /* Check that we have the right heap page pinned, if present */
 266         if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
 267                 elog(ERROR, "wrong heap buffer passed to visibilitymap_set");
 268
 269         /* Check that we have the right VM page pinned */
 270         if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
 271                 elog(ERROR, "wrong VM buffer passed to visibilitymap_set");
 272
 273         page = BufferGetPage(vmBuf);
 274         map = (uint8 *) PageGetContents(page);
 275         LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);
 276
 277         if (flags != (map[mapByte] >> mapOffset & VISIBILITYMAP_VALID_BITS))
 278         {
 279                 START_CRIT_SECTION();
 280
 281                 map[mapByte] |= (flags << mapOffset);
 282                 MarkBufferDirty(vmBuf);
 283
 284                 if (RelationNeedsWAL(rel))
 285                 {
 286                         if (XLogRecPtrIsInvalid(recptr))
 287                         {
 288                                 Assert(!InRecovery);
 289                                 recptr = log_heap_visible(rel, heapBuf, vmBuf, cutoff_xid, flags);
 290
 291                                 /*
 292                                  * If data checksums are enabled (or wal_log_hints=on), we
 293                                  * need to protect the heap page from being torn.
 294                                  *
 295                                  * If not, then we must *not* update the heap page's LSN. In
 296                                  * this case, the FPI for the heap page was omitted from the
 297                                  * WAL record inserted above, so it would be incorrect to
 298                                  * update the heap page's LSN.
 299                                  */
 300                                 if (XLogHintBitIsNeeded())
 301                                 {
 302                                         Page            heapPage = BufferGetPage(heapBuf);
 303
 304                                         PageSetLSN(heapPage, recptr);
 305                                 }
 306                         }
 307                         PageSetLSN(page, recptr);
 308                 }
 309
 310                 END_CRIT_SECTION();
 311         }
 312
 313         LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK);
 314 }
 315
 316 /*
 317  *      visibilitymap_get_status - get status of bits
 318  *
 319  * Are all tuples on heapBlk visible to all or are marked frozen, according
 320  * to the visibility map?
 321  *
 322  * On entry, *vmbuf should be InvalidBuffer or a valid buffer returned by an
 323  * earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 324  * relation. On return, *vmbuf is a valid buffer with the map page containing
 325  * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
 326  * releasing *vmbuf after it's done testing and setting bits.
 327  *
 328  * NOTE: This function is typically called without a lock on the heap page,
 329  * so somebody else could change the bit just after we look at it.  In fact,
 330  * since we don't lock the visibility map page either, it's even possible that
 331  * someone else could have changed the bit just before we look at it, but yet
 332  * we might see the old value.  It is the caller's responsibility to deal with
 333  * all concurrency issues!
 334  */
 335 uint8
 336 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
 337 {
 338         BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 339         uint32          mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
 340         uint8           mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
 341         char       *map;
 342         uint8           result;
 343
 344 #ifdef TRACE_VISIBILITYMAP
 345         elog(DEBUG1, "vm_get_status %s %d", RelationGetRelationName(rel), heapBlk);
 346 #endif
 347
 348         /* Reuse the old pinned buffer if possible */
 349         if (BufferIsValid(*vmbuf))
 350         {
 351                 if (BufferGetBlockNumber(*vmbuf) != mapBlock)
 352                 {
 353                         ReleaseBuffer(*vmbuf);
 354                         *vmbuf = InvalidBuffer;
 355                 }
 356         }
 357
 358         if (!BufferIsValid(*vmbuf))
 359         {
 360                 *vmbuf = vm_readbuf(rel, mapBlock, false);
 361                 if (!BufferIsValid(*vmbuf))
 362                         return false;
 363         }
 364
 365         map = PageGetContents(BufferGetPage(*vmbuf));
 366
 367         /*
 368          * A single byte read is atomic.  There could be memory-ordering effects
 369          * here, but for performance reasons we make it the caller's job to worry
 370          * about that.
 371          */
 372         result = ((map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS);
 373         return result;
 374 }
 375
 376 /*
 377  *      visibilitymap_count  - count number of bits set in visibility map
 378  *
 379  * Note: we ignore the possibility of race conditions when the table is being
 380  * extended concurrently with the call.  New pages added to the table aren't
 381  * going to be marked all-visible or all-frozen, so they won't affect the result.
 382  */
 383 void
 384 visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen)
 385 {
 386         BlockNumber mapBlock;
 387         BlockNumber nvisible = 0;
 388         BlockNumber nfrozen = 0;
 389
 390         /* all_visible must be specified */
 391         Assert(all_visible);
 392
 393         for (mapBlock = 0;; mapBlock++)
 394         {
 395                 Buffer          mapBuffer;
 396                 uint64     *map;
 397
 398                 /*
 399                  * Read till we fall off the end of the map.  We assume that any extra
 400                  * bytes in the last page are zeroed, so we don't bother excluding
 401                  * them from the count.
 402                  */
 403                 mapBuffer = vm_readbuf(rel, mapBlock, false);
 404                 if (!BufferIsValid(mapBuffer))
 405                         break;
 406
 407                 /*
 408                  * We choose not to lock the page, since the result is going to be
 409                  * immediately stale anyway if anyone is concurrently setting or
 410                  * clearing bits, and we only really need an approximate value.
 411                  */
 412                 map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
 413
 414                 nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8);
 415                 if (all_frozen)
 416                         nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8);
 417
 418                 ReleaseBuffer(mapBuffer);
 419         }
 420
 421         *all_visible = nvisible;
 422         if (all_frozen)
 423                 *all_frozen = nfrozen;
 424 }
 425
 426 /*
 427  *      visibilitymap_prepare_truncate -
 428  *                      prepare for truncation of the visibility map
 429  *
 430  * nheapblocks is the new size of the heap.
 431  *
 432  * Return the number of blocks of new visibility map.
 433  * If it's InvalidBlockNumber, there is nothing to truncate;
 434  * otherwise the caller is responsible for calling smgrtruncate()
 435  * to truncate the visibility map pages.
 436  */
 437 BlockNumber
 438 visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks)
 439 {
 440         BlockNumber newnblocks;
 441
 442         /* last remaining block, byte, and bit */
 443         BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
 444         uint32          truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
 445         uint8           truncOffset = HEAPBLK_TO_OFFSET(nheapblocks);
 446
 447 #ifdef TRACE_VISIBILITYMAP
 448         elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
 449 #endif
 450
 451         /*
 452          * If no visibility map has been created yet for this relation, there's
 453          * nothing to truncate.
 454          */
 455         if (!smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM))
 456                 return InvalidBlockNumber;
 457
 458         /*
 459          * Unless the new size is exactly at a visibility map page boundary, the
 460          * tail bits in the last remaining map page, representing truncated heap
 461          * blocks, need to be cleared. This is not only tidy, but also necessary
 462          * because we don't get a chance to clear the bits if the heap is extended
 463          * again.
 464          */
 465         if (truncByte != 0 || truncOffset != 0)
 466         {
 467                 Buffer          mapBuffer;
 468                 Page            page;
 469                 char       *map;
 470
 471                 newnblocks = truncBlock + 1;
 472
 473                 mapBuffer = vm_readbuf(rel, truncBlock, false);
 474                 if (!BufferIsValid(mapBuffer))
 475                 {
 476                         /* nothing to do, the file was already smaller */
 477                         return InvalidBlockNumber;
 478                 }
 479
 480                 page = BufferGetPage(mapBuffer);
 481                 map = PageGetContents(page);
 482
 483                 LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
 484
 485                 /* NO EREPORT(ERROR) from here till changes are logged */
 486                 START_CRIT_SECTION();
 487
 488                 /* Clear out the unwanted bytes. */
 489                 MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));
 490
 491                 /*----
 492                  * Mask out the unwanted bits of the last remaining byte.
 493                  *
 494                  * ((1 << 0) - 1) = 00000000
 495                  * ((1 << 1) - 1) = 00000001
 496                  * ...
 497                  * ((1 << 6) - 1) = 00111111
 498                  * ((1 << 7) - 1) = 01111111
 499                  *----
 500                  */
 501                 map[truncByte] &= (1 << truncOffset) - 1;
 502
 503                 /*
 504                  * Truncation of a relation is WAL-logged at a higher-level, and we
 505                  * will be called at WAL replay. But if checksums are enabled, we need
 506                  * to still write a WAL record to protect against a torn page, if the
 507                  * page is flushed to disk before the truncation WAL record. We cannot
 508                  * use MarkBufferDirtyHint here, because that will not dirty the page
 509                  * during recovery.
 510                  */
 511                 MarkBufferDirty(mapBuffer);
 512                 if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
 513                         log_newpage_buffer(mapBuffer, false);
 514
 515                 END_CRIT_SECTION();
 516
 517                 UnlockReleaseBuffer(mapBuffer);
 518         }
 519         else
 520                 newnblocks = truncBlock;
 521
 522         if (smgrnblocks(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM) <= newnblocks)
 523         {
 524                 /* nothing to do, the file was already smaller than requested size */
 525                 return InvalidBlockNumber;
 526         }
 527
 528         return newnblocks;
 529 }
 530
 531 /*
 532  * Read a visibility map page.
 533  *
 534  * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 535  * true, the visibility map file is extended.
 536  */
 537 static Buffer
 538 vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
 539 {
 540         Buffer          buf;
 541         SMgrRelation reln;
 542
 543         /*
 544          * Caution: re-using this smgr pointer could fail if the relcache entry
 545          * gets closed.  It's safe as long as we only do smgr-level operations
 546          * between here and the last use of the pointer.
 547          */
 548         reln = RelationGetSmgr(rel);
 549
 550         /*
 551          * If we haven't cached the size of the visibility map fork yet, check it
 552          * first.
 553          */
 554         if (reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber)
 555         {
 556                 if (smgrexists(reln, VISIBILITYMAP_FORKNUM))
 557                         smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
 558                 else
 559                         reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = 0;
 560         }
 561
 562         /*
 563          * For reading we use ZERO_ON_ERROR mode, and initialize the page if
 564          * necessary. It's always safe to clear bits, so it's better to clear
 565          * corrupt pages than error out.
 566          *
 567          * We use the same path below to initialize pages when extending the
 568          * relation, as a concurrent extension can end up with vm_extend()
 569          * returning an already-initialized page.
 570          */
 571         if (blkno >= reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM])
 572         {
 573                 if (extend)
 574                         buf = vm_extend(rel, blkno + 1);
 575                 else
 576                         return InvalidBuffer;
 577         }
 578         else
 579                 buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
 580                                                                  RBM_ZERO_ON_ERROR, NULL);
 581
 582         /*
 583          * Initializing the page when needed is trickier than it looks, because of
 584          * the possibility of multiple backends doing this concurrently, and our
 585          * desire to not uselessly take the buffer lock in the normal path where
 586          * the page is OK.  We must take the lock to initialize the page, so
 587          * recheck page newness after we have the lock, in case someone else
 588          * already did it.  Also, because we initially check PageIsNew with no
 589          * lock, it's possible to fall through and return the buffer while someone
 590          * else is still initializing the page (i.e., we might see pd_upper as set
 591          * but other page header fields are still zeroes).  This is harmless for
 592          * callers that will take a buffer lock themselves, but some callers
 593          * inspect the page without any lock at all.  The latter is OK only so
 594          * long as it doesn't depend on the page header having correct contents.
 595          * Current usage is safe because PageGetContents() does not require that.
 596          */
 597         if (PageIsNew(BufferGetPage(buf)))
 598         {
 599                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 600                 if (PageIsNew(BufferGetPage(buf)))
 601                         PageInit(BufferGetPage(buf), BLCKSZ, 0);
 602                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 603         }
 604         return buf;
 605 }
 606
 607 /*
 608  * Ensure that the visibility map fork is at least vm_nblocks long, extending
 609  * it if necessary with zeroed pages.
 610  */
 611 static Buffer
 612 vm_extend(Relation rel, BlockNumber vm_nblocks)
 613 {
 614         Buffer          buf;
 615
 616         buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
 617                                                           EB_CREATE_FORK_IF_NEEDED |
 618                                                           EB_CLEAR_SIZE_CACHE,
 619                                                           vm_nblocks,
 620                                                           RBM_ZERO_ON_ERROR);
 621
 622         /*
 623          * Send a shared-inval message to force other backends to close any smgr
 624          * references they may have for this rel, which we are about to change.
 625          * This is a useful optimization because it means that backends don't have
 626          * to keep checking for creation or extension of the file, which happens
 627          * infrequently.
 628          */
 629         CacheInvalidateSmgr(RelationGetSmgr(rel)->smgr_rlocator);
 630
 631         return buf;
 632 }