/*-------------------------------------------------------------------------
 *
 * Block reference tables.
 *
 * A block reference table is used to keep track of which blocks have
 * been modified by WAL records within a certain LSN range.
 *
 * For each relation fork, we keep track of all blocks that have appeared
 * in block references in the WAL. We also keep track of the "limit block",
 * which is the smallest relation length in blocks known to have occurred
 * during that range of WAL records. This should be set to 0 if the relation
 * fork is created or destroyed, and to the post-truncation length if it is
 * truncated.
 *
 * Whenever we set the limit block, we also forget about any modified blocks
 * beyond that point. Those blocks don't exist any more. Such blocks can
 * later be marked as modified again; if that happens, it means the relation
 * was re-extended or recreated past the old limit.
 *
 * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
 *
 * src/common/blkreftable.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres_fe.h"

#include "common/logging.h"

#include "common/blkreftable.h"
#include "common/hashfn.h"
#include "port/pg_crc32c.h"
/*
 * A block reference table keeps track of the status of each relation fork.
 */
typedef struct BlockRefTableKey
{
    RelFileLocator rlocator;
    ForkNumber  forknum;
} BlockRefTableKey;
/*
 * We could need to store data either for a relation in which only a
 * tiny fraction of the blocks have been modified or for a relation in
 * which nearly every block has been modified, and we want a
 * space-efficient representation in both cases. To accomplish this,
 * we divide the relation into chunks of 2^16 blocks and choose between
 * an array representation and a bitmap representation for each chunk.
 *
 * When the number of modified blocks in a given chunk is small, we
 * essentially store an array of block numbers, but we need not store the
 * entire block number: instead, we store each block number as a 2-byte
 * offset from the start of the chunk.
 *
 * When the number of modified blocks in a given chunk is large, we switch
 * to a bitmap representation.
 *
 * These same basic representational choices are used both when a block
 * reference table is stored in memory and when it is serialized to disk.
 *
 * In the in-memory representation, we initially allocate each chunk with
 * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and
 * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK.
 * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted
 * to a bitmap, and thus never needs to grow further.
 */
#define BLOCKS_PER_CHUNK        (1 << 16)
#define BLOCKS_PER_ENTRY        (BITS_PER_BYTE * sizeof(uint16))
#define MAX_ENTRIES_PER_CHUNK   (BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY)
#define INITIAL_ENTRIES_PER_CHUNK   16

typedef uint16 *BlockRefTableChunk;
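
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * shows how a block number decomposes into a chunk number, an offset within
 * the chunk, and, for a chunk in bitmap form, the uint16 word and bit that
 * represent it. With BLOCKS_PER_CHUNK = 65536 and BLOCKS_PER_ENTRY = 16, a
 * full bitmap chunk needs MAX_ENTRIES_PER_CHUNK = 4096 uint16 words, i.e.
 * 8kB -- the same space as 4096 two-byte offsets, which is why that count is
 * the crossover point between the two representations. The helper name below
 * is hypothetical.
 */
static inline void
blkreftable_example_decompose(BlockNumber blknum,
                              uint32 *chunkno, unsigned *chunkoffset,
                              unsigned *wordno, uint16 *bitmask)
{
    *chunkno = blknum / BLOCKS_PER_CHUNK;   /* which chunk */
    *chunkoffset = blknum % BLOCKS_PER_CHUNK;   /* offset within the chunk */
    *wordno = *chunkoffset / BLOCKS_PER_ENTRY;  /* uint16 word in the bitmap */
    *bitmask = (uint16) (1 << (*chunkoffset % BLOCKS_PER_ENTRY));   /* bit within that word */
}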
/*
 * State for one relation fork.
 *
 * 'rlocator' and 'forknum' identify the relation fork to which this entry
 * pertains.
 *
 * 'limit_block' is the shortest known length of the relation in blocks
 * within the LSN range covered by a particular block reference table.
 * It should be set to 0 if the relation fork is created or dropped. If the
 * relation fork is truncated, it should be set to the number of blocks that
 * remain after truncation.
 *
 * 'nchunks' is the allocated length of each of the three arrays that follow.
 * We can only represent the status of block numbers less than nchunks *
 * BLOCKS_PER_CHUNK.
 *
 * 'chunk_size' is an array storing the allocated size of each chunk.
 *
 * 'chunk_usage' is an array storing the number of elements used in each
 * chunk. If that value is less than MAX_ENTRIES_PER_CHUNK, the corresponding
 * chunk is used as an array; else the corresponding chunk is used as a bitmap.
 * When used as a bitmap, the least significant bit of the first array element
 * is the status of the lowest-numbered block covered by this chunk.
 *
 * 'chunk_data' is the array of chunks.
 */
struct BlockRefTableEntry
{
    BlockRefTableKey key;
    BlockNumber limit_block;
    uint32      nchunks;
    uint16     *chunk_size;
    uint16     *chunk_usage;
    BlockRefTableChunk *chunk_data;
};
/* Declare and define a hash table over type BlockRefTableEntry. */
#define SH_PREFIX blockreftable
#define SH_ELEMENT_TYPE BlockRefTableEntry
#define SH_KEY_TYPE BlockRefTableKey
#define SH_HASH_KEY(tb, key) \
    hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey))
#define SH_EQUAL(tb, a, b) (memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0)
#define SH_SCOPE static inline
#define SH_RAW_ALLOCATOR pg_malloc0
#include "lib/simplehash.h"
/*
 * A block reference table is basically just the hash table, but we don't
 * want to expose that to outside callers.
 *
 * We keep track of the memory context in use explicitly too, so that it's
 * easy to place all of our allocations in the same context.
 */
struct BlockRefTable
{
    blockreftable_hash *hash;
    MemoryContext mcxt;
};
/*
 * On-disk serialization format for block reference table entries.
 */
typedef struct BlockRefTableSerializedEntry
{
    RelFileLocator rlocator;
    ForkNumber  forknum;
    BlockNumber limit_block;
    uint32      nchunks;
} BlockRefTableSerializedEntry;
/*
 * Buffer size, so that we avoid doing many small I/Os.
 */
#define BUFSIZE         65536
/*
 * Ad-hoc buffer for file I/O.
 */
typedef struct BlockRefTableBuffer
{
    io_callback_fn io_callback;
    void       *io_callback_arg;
    char        data[BUFSIZE];
    int         used;
    int         cursor;
    pg_crc32c   crc;
} BlockRefTableBuffer;
/*
 * State for keeping track of progress while incrementally reading a block
 * reference table file from disk.
 *
 * total_chunks means the number of chunks for the RelFileLocator/ForkNumber
 * combination that is currently being read, and consumed_chunks is the number
 * of those that have been read. (We always read all the information for
 * a single chunk at one time, so we don't need to be able to represent the
 * state where a chunk has been partially read.)
 *
 * chunk_size is the array of chunk sizes. The length is given by total_chunks.
 *
 * chunk_data holds the current chunk.
 *
 * chunk_position helps us figure out how much progress we've made in returning
 * the block numbers for the current chunk to the caller. If the chunk is a
 * bitmap, it's the number of bits we've scanned; otherwise, it's the number
 * of chunk entries we've scanned.
 */
struct BlockRefTableReader
{
    BlockRefTableBuffer buffer;
    char       *error_filename;
    report_error_fn error_callback;
    void       *error_callback_arg;
    uint32      total_chunks;
    uint32      consumed_chunks;
    uint16     *chunk_size;
    uint16      chunk_data[MAX_ENTRIES_PER_CHUNK];
    uint32      chunk_position;
};
/*
 * State for keeping track of progress while incrementally writing a block
 * reference table file to disk.
 */
struct BlockRefTableWriter
{
    BlockRefTableBuffer buffer;
};
/* Function prototypes. */
static int  BlockRefTableComparator(const void *a, const void *b);
static void BlockRefTableFlush(BlockRefTableBuffer *buffer);
static void BlockRefTableRead(BlockRefTableReader *reader, void *data,
                              int length);
static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data,
                               int length);
static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer);
/*
 * Create an empty block reference table.
 */
BlockRefTable *
CreateEmptyBlockRefTable(void)
{
    BlockRefTable *brtab = palloc(sizeof(BlockRefTable));

    /*
     * Even a completely empty database has a few hundred relation forks, so
     * it seems best to size the hash on the assumption that we're going to
     * have at least a few thousand entries.
     */
#ifdef FRONTEND
    brtab->hash = blockreftable_create(4096, NULL);
#else
    brtab->mcxt = CurrentMemoryContext;
    brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL);
#endif

    return brtab;
}
/*
 * Set the "limit block" for a relation fork and forget any modified blocks
 * with equal or higher block numbers.
 *
 * The "limit block" is the shortest known length of the relation within the
 * range of WAL records covered by this block reference table.
 */
void
BlockRefTableSetLimitBlock(BlockRefTable *brtab,
                           const RelFileLocator *rlocator,
                           ForkNumber forknum,
                           BlockNumber limit_block)
{
    BlockRefTableEntry *brtentry;
    BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
    bool        found;

    memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
    key.forknum = forknum;
    brtentry = blockreftable_insert(brtab->hash, key, &found);

    if (!found)
    {
        /*
         * We have no existing data about this relation fork, so just record
         * the limit_block value supplied by the caller, and make sure other
         * parts of the entry are properly initialized.
         */
        brtentry->limit_block = limit_block;
        brtentry->nchunks = 0;
        brtentry->chunk_size = NULL;
        brtentry->chunk_usage = NULL;
        brtentry->chunk_data = NULL;
        return;
    }

    BlockRefTableEntrySetLimitBlock(brtentry, limit_block);
}
/*
 * Mark a block in a given relation fork as known to have been modified.
 */
void
BlockRefTableMarkBlockModified(BlockRefTable *brtab,
                               const RelFileLocator *rlocator,
                               ForkNumber forknum,
                               BlockNumber blknum)
{
    BlockRefTableEntry *brtentry;
    BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
    bool        found;
    MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt);

    memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
    key.forknum = forknum;
    brtentry = blockreftable_insert(brtab->hash, key, &found);

    if (!found)
    {
        /*
         * We want to set the initial limit block value to something higher
         * than any legal block number. InvalidBlockNumber fits the bill.
         */
        brtentry->limit_block = InvalidBlockNumber;
        brtentry->nchunks = 0;
        brtentry->chunk_size = NULL;
        brtentry->chunk_usage = NULL;
        brtentry->chunk_data = NULL;
    }

    BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum);

    MemoryContextSwitchTo(oldcontext);
}
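
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * a typical in-memory workflow builds a table, marks the blocks touched by
 * WAL records, and records truncations via the limit block. The caller name
 * and the concrete block numbers here are hypothetical.
 */
static void
blkreftable_example_build(const RelFileLocator *rlocator)
{
    BlockRefTable *brtab = CreateEmptyBlockRefTable();

    /* Blocks 0 and 17 of the main fork were modified. */
    BlockRefTableMarkBlockModified(brtab, rlocator, MAIN_FORKNUM, 0);
    BlockRefTableMarkBlockModified(brtab, rlocator, MAIN_FORKNUM, 17);

    /* The relation was later truncated to 10 blocks; block 17 is forgotten. */
    BlockRefTableSetLimitBlock(brtab, rlocator, MAIN_FORKNUM, 10);
}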
/*
 * Get an entry from a block reference table.
 *
 * If the entry does not exist, this function returns NULL. Otherwise, it
 * returns the entry and sets *limit_block to the value from the entry.
 */
BlockRefTableEntry *
BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator,
                      ForkNumber forknum, BlockNumber *limit_block)
{
    BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
    BlockRefTableEntry *entry;

    Assert(limit_block != NULL);

    memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
    key.forknum = forknum;
    entry = blockreftable_lookup(brtab->hash, key);

    if (entry != NULL)
        *limit_block = entry->limit_block;

    return entry;
}
/*
 * Get block numbers from a table entry.
 *
 * 'blocks' must point to enough space to hold at least 'nblocks' block
 * numbers, and any block numbers we manage to get will be written there.
 * The return value is the number of block numbers actually written.
 *
 * We do not return block numbers unless they are greater than or equal to
 * start_blkno and strictly less than stop_blkno.
 */
int
BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry,
                            BlockNumber start_blkno,
                            BlockNumber stop_blkno,
                            BlockNumber *blocks,
                            int nblocks)
{
    uint32      start_chunkno;
    uint32      stop_chunkno;
    uint32      chunkno;
    int         nresults = 0;

    Assert(entry != NULL);

    /*
     * Figure out which chunks could potentially contain blocks of interest.
     *
     * We need to be careful about overflow here, because stop_blkno could be
     * InvalidBlockNumber or something very close to it.
     */
    start_chunkno = start_blkno / BLOCKS_PER_CHUNK;
    stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK;
    if ((stop_blkno % BLOCKS_PER_CHUNK) != 0)
        ++stop_chunkno;
    if (stop_chunkno > entry->nchunks)
        stop_chunkno = entry->nchunks;

    /* Loop over the chunks of interest. */
    for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno)
    {
        uint16      chunk_usage = entry->chunk_usage[chunkno];
        BlockRefTableChunk chunk_data = entry->chunk_data[chunkno];
        unsigned    start_offset = 0;
        unsigned    stop_offset = BLOCKS_PER_CHUNK;
        unsigned    i;

        /*
         * If the start and/or stop block number falls within this chunk, the
         * whole chunk may not be of interest. Figure out which portion we
         * care about, if it's not the whole thing.
         */
        if (chunkno == start_chunkno)
            start_offset = start_blkno % BLOCKS_PER_CHUNK;
        if (chunkno == stop_chunkno - 1)
        {
            Assert(stop_blkno > chunkno * BLOCKS_PER_CHUNK);
            stop_offset = stop_blkno - (chunkno * BLOCKS_PER_CHUNK);
            Assert(stop_offset <= BLOCKS_PER_CHUNK);
        }

        /*
         * Handling differs depending on whether this is an array of offsets
         * or a bitmap.
         */
        if (chunk_usage == MAX_ENTRIES_PER_CHUNK)
        {
            /* It's a bitmap, so test every relevant bit. */
            for (i = start_offset; i < stop_offset; ++i)
            {
                uint16      w = chunk_data[i / BLOCKS_PER_ENTRY];

                if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0)
                {
                    BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i;

                    blocks[nresults++] = blkno;

                    /* Early exit if we run out of output space. */
                    if (nresults == nblocks)
                        return nresults;
                }
            }
        }
        else
        {
            /* It's an array of offsets, so check each one. */
            for (i = 0; i < chunk_usage; ++i)
            {
                uint16      offset = chunk_data[i];

                if (offset >= start_offset && offset < stop_offset)
                {
                    BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset;

                    blocks[nresults++] = blkno;

                    /* Early exit if we run out of output space. */
                    if (nresults == nblocks)
                        return nresults;
                }
            }
        }
    }

    return nresults;
}
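
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * drains all modified blocks for one relation fork out of an in-memory
 * table in fixed-size batches. The caller name and buffer size are
 * hypothetical.
 */
static void
blkreftable_example_scan(BlockRefTable *brtab, const RelFileLocator *rlocator)
{
    BlockNumber limit_block;
    BlockRefTableEntry *entry;
    BlockNumber blocks[16];
    BlockNumber start = 0;
    int         nresults;

    entry = BlockRefTableGetEntry(brtab, rlocator, MAIN_FORKNUM, &limit_block);
    if (entry == NULL)
        return;                 /* no information about this fork */

    do
    {
        nresults = BlockRefTableEntryGetBlocks(entry, start, InvalidBlockNumber,
                                               blocks, lengthof(blocks));
        if (nresults > 0)
            start = blocks[nresults - 1] + 1;   /* resume after the last hit */
    } while (nresults == lengthof(blocks));
}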
/*
 * Serialize a block reference table to a file.
 */
void
WriteBlockRefTable(BlockRefTable *brtab,
                   io_callback_fn write_callback,
                   void *write_callback_arg)
{
    BlockRefTableSerializedEntry *sdata = NULL;
    BlockRefTableBuffer buffer;
    uint32      magic = BLOCKREFTABLE_MAGIC;

    /* Prepare buffer. */
    memset(&buffer, 0, sizeof(BlockRefTableBuffer));
    buffer.io_callback = write_callback;
    buffer.io_callback_arg = write_callback_arg;
    INIT_CRC32C(buffer.crc);

    /* Write magic number. */
    BlockRefTableWrite(&buffer, &magic, sizeof(uint32));

    /* Write the entries, assuming there are some. */
    if (brtab->hash->members > 0)
    {
        unsigned    i = 0;
        blockreftable_iterator it;
        BlockRefTableEntry *brtentry;

        /* Extract entries into serializable format and sort them. */
        sdata =
            palloc(brtab->hash->members * sizeof(BlockRefTableSerializedEntry));
        blockreftable_start_iterate(brtab->hash, &it);
        while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL)
        {
            BlockRefTableSerializedEntry *sentry = &sdata[i++];

            sentry->rlocator = brtentry->key.rlocator;
            sentry->forknum = brtentry->key.forknum;
            sentry->limit_block = brtentry->limit_block;
            sentry->nchunks = brtentry->nchunks;

            /* Trim trailing zero entries. */
            while (sentry->nchunks > 0 &&
                   brtentry->chunk_usage[sentry->nchunks - 1] == 0)
                sentry->nchunks--;
        }
        Assert(i == brtab->hash->members);
        qsort(sdata, i, sizeof(BlockRefTableSerializedEntry),
              BlockRefTableComparator);

        /* Loop over entries in sorted order and serialize each one. */
        for (i = 0; i < brtab->hash->members; ++i)
        {
            BlockRefTableSerializedEntry *sentry = &sdata[i];
            BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
            unsigned    j;

            /* Write the serialized entry itself. */
            BlockRefTableWrite(&buffer, sentry,
                               sizeof(BlockRefTableSerializedEntry));

            /* Look up the original entry so we can access the chunks. */
            memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator));
            key.forknum = sentry->forknum;
            brtentry = blockreftable_lookup(brtab->hash, key);
            Assert(brtentry != NULL);

            /* Write the untruncated portion of the chunk length array. */
            if (sentry->nchunks != 0)
                BlockRefTableWrite(&buffer, brtentry->chunk_usage,
                                   sentry->nchunks * sizeof(uint16));

            /* Write the contents of each chunk. */
            for (j = 0; j < brtentry->nchunks; ++j)
            {
                if (brtentry->chunk_usage[j] == 0)
                    continue;
                BlockRefTableWrite(&buffer, brtentry->chunk_data[j],
                                   brtentry->chunk_usage[j] * sizeof(uint16));
            }
        }
    }

    /* Write out appropriate terminator and CRC and flush buffer. */
    BlockRefTableFileTerminate(&buffer);
}
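
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * WriteBlockRefTable() pushes bytes through an io_callback_fn, so a caller
 * that wants the table in an ordinary stdio file can supply a callback like
 * the one below, assuming the (arg, data, length => byte count) shape used
 * by the calls in this file. The function names are hypothetical and the
 * error handling is reduced to a bare exit.
 */
static int
blkreftable_example_write_cb(void *callback_arg, void *data, int length)
{
    FILE       *fp = (FILE *) callback_arg;

    if (fwrite(data, 1, length, fp) != (size_t) length)
        pg_fatal("could not write block reference table file: %m");
    return length;
}

static void
blkreftable_example_write_file(BlockRefTable *brtab, const char *path)
{
    FILE       *fp = fopen(path, "wb");

    if (fp == NULL)
        pg_fatal("could not open file \"%s\": %m", path);
    WriteBlockRefTable(brtab, blkreftable_example_write_cb, fp);
    fclose(fp);
}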
/*
 * Prepare to incrementally read a block reference table file.
 *
 * 'read_callback' is a function that can be called to read data from the
 * underlying file (or other data source) into our internal buffer.
 *
 * 'read_callback_arg' is an opaque argument to be passed to read_callback.
 *
 * 'error_filename' is the filename that should be included in error messages
 * if the file is found to be malformed. The value is not copied, so the
 * caller should ensure that it remains valid until done with this
 * BlockRefTableReader.
 *
 * 'error_callback' is a function to be called if the file is found to be
 * malformed. This is not used for I/O errors, which must be handled
 * internally by read_callback.
 *
 * 'error_callback_arg' is an opaque argument to be passed to error_callback.
 */
BlockRefTableReader *
CreateBlockRefTableReader(io_callback_fn read_callback,
                          void *read_callback_arg,
                          char *error_filename,
                          report_error_fn error_callback,
                          void *error_callback_arg)
{
    BlockRefTableReader *reader;
    uint32      magic;

    /* Initialize data structure. */
    reader = palloc0(sizeof(BlockRefTableReader));
    reader->buffer.io_callback = read_callback;
    reader->buffer.io_callback_arg = read_callback_arg;
    reader->error_filename = error_filename;
    reader->error_callback = error_callback;
    reader->error_callback_arg = error_callback_arg;
    INIT_CRC32C(reader->buffer.crc);

    /* Verify magic number. */
    BlockRefTableRead(reader, &magic, sizeof(uint32));
    if (magic != BLOCKREFTABLE_MAGIC)
        error_callback(error_callback_arg,
                       "file \"%s\" has wrong magic number: expected %u, found %u",
                       error_filename, BLOCKREFTABLE_MAGIC, magic);

    return reader;
}
/*
 * Read next relation fork covered by this block reference table file.
 *
 * After calling this function, you must call BlockRefTableReaderGetBlocks
 * until it returns 0 before calling it again.
 */
bool
BlockRefTableReaderNextRelation(BlockRefTableReader *reader,
                                RelFileLocator *rlocator,
                                ForkNumber *forknum,
                                BlockNumber *limit_block)
{
    BlockRefTableSerializedEntry sentry;
    BlockRefTableSerializedEntry zentry = {{0}};

    /*
     * Sanity check: caller must read all blocks from all chunks before moving
     * on to the next relation.
     */
    Assert(reader->total_chunks == reader->consumed_chunks);

    /* Read serialized entry. */
    BlockRefTableRead(reader, &sentry,
                      sizeof(BlockRefTableSerializedEntry));

    /*
     * If we just read the sentinel entry indicating that we've reached the
     * end, read and check the CRC.
     */
    if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0)
    {
        pg_crc32c   expected_crc;
        pg_crc32c   actual_crc;

        /*
         * We want to know the CRC of the file excluding the 4-byte CRC
         * itself, so copy the current value of the CRC accumulator before
         * reading those bytes, and use the copy to finalize the calculation.
         */
        expected_crc = reader->buffer.crc;
        FIN_CRC32C(expected_crc);

        /* Now we can read the actual value. */
        BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c));

        /* Throw an error if there is a mismatch. */
        if (!EQ_CRC32C(expected_crc, actual_crc))
            reader->error_callback(reader->error_callback_arg,
                                   "file \"%s\" has wrong checksum: expected %08X, found %08X",
                                   reader->error_filename, expected_crc, actual_crc);

        return false;
    }

    /* Read chunk size array. */
    if (reader->chunk_size != NULL)
        pfree(reader->chunk_size);
    reader->chunk_size = palloc(sentry.nchunks * sizeof(uint16));
    BlockRefTableRead(reader, reader->chunk_size,
                      sentry.nchunks * sizeof(uint16));

    /* Set up for chunk scan. */
    reader->total_chunks = sentry.nchunks;
    reader->consumed_chunks = 0;

    /* Return data to caller. */
    memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator));
    *forknum = sentry.forknum;
    *limit_block = sentry.limit_block;

    return true;
}
/*
 * Get modified blocks associated with the relation fork returned by
 * the most recent call to BlockRefTableReaderNextRelation.
 *
 * On return, block numbers will be written into the 'blocks' array, whose
 * length should be passed via 'nblocks'. The return value is the number of
 * entries actually written into the 'blocks' array, which may be less than
 * 'nblocks' if we run out of modified blocks in the relation fork before
 * we run out of room in the array.
 */
unsigned
BlockRefTableReaderGetBlocks(BlockRefTableReader *reader,
                             BlockNumber *blocks,
                             int nblocks)
{
    unsigned    blocks_found = 0;

    /* Must provide space for at least one block number to be returned. */
    Assert(nblocks > 0);

    /* Loop collecting blocks to return to caller. */
    for (;;)
    {
        uint16      next_chunk_size;

        /*
         * If we've read at least one chunk, maybe it contains some block
         * numbers that could satisfy caller's request.
         */
        if (reader->consumed_chunks > 0)
        {
            uint32      chunkno = reader->consumed_chunks - 1;
            uint16      chunk_size = reader->chunk_size[chunkno];

            if (chunk_size == MAX_ENTRIES_PER_CHUNK)
            {
                /* Bitmap format, so search for bits that are set. */
                while (reader->chunk_position < BLOCKS_PER_CHUNK &&
                       blocks_found < nblocks)
                {
                    uint16      chunkoffset = reader->chunk_position;
                    uint16      w;

                    w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY];
                    if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0)
                        blocks[blocks_found++] =
                            chunkno * BLOCKS_PER_CHUNK + chunkoffset;
                    ++reader->chunk_position;
                }
            }
            else
            {
                /* Not in bitmap format, so each entry is a 2-byte offset. */
                while (reader->chunk_position < chunk_size &&
                       blocks_found < nblocks)
                {
                    blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK
                        + reader->chunk_data[reader->chunk_position];
                    ++reader->chunk_position;
                }
            }
        }

        /* We found enough blocks, so we're done. */
        if (blocks_found >= nblocks)
            break;

        /*
         * We didn't find enough blocks, so we must need the next chunk. If
         * there are none left, though, then we're done anyway.
         */
        if (reader->consumed_chunks == reader->total_chunks)
            break;

        /*
         * Read data for next chunk and reset scan position to beginning of
         * chunk. Note that the next chunk might be empty, in which case we
         * consume the chunk without actually consuming any bytes from the
         * underlying file.
         */
        next_chunk_size = reader->chunk_size[reader->consumed_chunks];
        if (next_chunk_size > 0)
            BlockRefTableRead(reader, reader->chunk_data,
                              next_chunk_size * sizeof(uint16));
        ++reader->consumed_chunks;
        reader->chunk_position = 0;
    }

    return blocks_found;
}
/*
 * Release memory used while reading a block reference table from a file.
 */
void
DestroyBlockRefTableReader(BlockRefTableReader *reader)
{
    if (reader->chunk_size != NULL)
    {
        pfree(reader->chunk_size);
        reader->chunk_size = NULL;
    }
    pfree(reader);
}
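
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * a typical reading loop. The read callback and error callback are assumed
 * to have the io_callback_fn and printf-style report_error_fn shapes used
 * elsewhere in this file; all names below are hypothetical.
 */
static void
blkreftable_example_read_file(io_callback_fn read_cb, void *read_cb_arg,
                              char *filename, report_error_fn report_error)
{
    BlockRefTableReader *reader;
    RelFileLocator rlocator;
    ForkNumber  forknum;
    BlockNumber limit_block;
    BlockNumber blocks[256];

    reader = CreateBlockRefTableReader(read_cb, read_cb_arg, filename,
                                       report_error, NULL);
    while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
                                           &limit_block))
    {
        unsigned    nblocks;

        /* Drain every modified block for this relation fork. */
        while ((nblocks = BlockRefTableReaderGetBlocks(reader, blocks,
                                                       lengthof(blocks))) > 0)
        {
            /* ... process blocks[0 .. nblocks - 1] here ... */
        }
    }
    DestroyBlockRefTableReader(reader);
}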
/*
 * Prepare to write a block reference table file incrementally.
 *
 * Caller must be able to supply BlockRefTableEntry objects sorted in the
 * appropriate order.
 */
BlockRefTableWriter *
CreateBlockRefTableWriter(io_callback_fn write_callback,
                          void *write_callback_arg)
{
    BlockRefTableWriter *writer;
    uint32      magic = BLOCKREFTABLE_MAGIC;

    /* Prepare buffer and CRC check and save callbacks. */
    writer = palloc0(sizeof(BlockRefTableWriter));
    writer->buffer.io_callback = write_callback;
    writer->buffer.io_callback_arg = write_callback_arg;
    INIT_CRC32C(writer->buffer.crc);

    /* Write magic number. */
    BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32));

    return writer;
}
/*
 * Append one entry to a block reference table file.
 *
 * Note that entries must be written in the proper order, that is, sorted by
 * tablespace, then database, then relfilenumber, then fork number. Caller
 * is responsible for supplying data in the correct order. If that seems hard,
 * use an in-memory BlockRefTable instead.
 */
void
BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry)
{
    BlockRefTableSerializedEntry sentry;
    unsigned    j;

    /* Convert to serialized entry format. */
    sentry.rlocator = entry->key.rlocator;
    sentry.forknum = entry->key.forknum;
    sentry.limit_block = entry->limit_block;
    sentry.nchunks = entry->nchunks;

    /* Trim trailing zero entries. */
    while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0)
        sentry.nchunks--;

    /* Write the serialized entry itself. */
    BlockRefTableWrite(&writer->buffer, &sentry,
                       sizeof(BlockRefTableSerializedEntry));

    /* Write the untruncated portion of the chunk length array. */
    if (sentry.nchunks != 0)
        BlockRefTableWrite(&writer->buffer, entry->chunk_usage,
                           sentry.nchunks * sizeof(uint16));

    /* Write the contents of each chunk. */
    for (j = 0; j < entry->nchunks; ++j)
    {
        if (entry->chunk_usage[j] == 0)
            continue;
        BlockRefTableWrite(&writer->buffer, entry->chunk_data[j],
                           entry->chunk_usage[j] * sizeof(uint16));
    }
}
/*
 * Finalize an incremental write of a block reference table file.
 */
void
DestroyBlockRefTableWriter(BlockRefTableWriter *writer)
{
    BlockRefTableFileTerminate(&writer->buffer);
    pfree(writer);
}
/*
 * Allocate a standalone BlockRefTableEntry.
 *
 * When we're manipulating a full in-memory BlockRefTable, the entries are
 * part of the hash table and are allocated by simplehash. This routine is
 * used by callers that want to write out a BlockRefTable to a file without
 * needing to store the whole thing in memory at once.
 *
 * Entries allocated by this function can be manipulated using the functions
 * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified
 * and then written using BlockRefTableWriteEntry and freed using
 * BlockRefTableFreeEntry.
 */
BlockRefTableEntry *
CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum)
{
    BlockRefTableEntry *entry = palloc0(sizeof(BlockRefTableEntry));

    memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator));
    entry->key.forknum = forknum;
    entry->limit_block = InvalidBlockNumber;

    return entry;
}
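
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * the incremental write path builds one standalone entry at a time, in the
 * required sort order, and streams it out without holding the whole table
 * in memory. The caller name, the callback, and the concrete block numbers
 * are hypothetical.
 */
static void
blkreftable_example_incremental_write(io_callback_fn write_cb,
                                      void *write_cb_arg,
                                      RelFileLocator rlocator)
{
    BlockRefTableWriter *writer;
    BlockRefTableEntry *entry;

    writer = CreateBlockRefTableWriter(write_cb, write_cb_arg);

    /* One entry per relation fork, supplied in sorted order. */
    entry = CreateBlockRefTableEntry(rlocator, MAIN_FORKNUM);
    BlockRefTableEntryMarkBlockModified(entry, MAIN_FORKNUM, 3);
    BlockRefTableEntryMarkBlockModified(entry, MAIN_FORKNUM, 42);
    BlockRefTableEntrySetLimitBlock(entry, 100);
    BlockRefTableWriteEntry(writer, entry);
    BlockRefTableFreeEntry(entry);

    DestroyBlockRefTableWriter(writer);
}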
/*
 * Update a BlockRefTableEntry with a new value for the "limit block" and
 * forget any equal-or-higher-numbered modified blocks.
 *
 * The "limit block" is the shortest known length of the relation within the
 * range of WAL records covered by this block reference table.
 */
void
BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry,
                                BlockNumber limit_block)
{
    unsigned    chunkno;
    unsigned    limit_chunkno;
    unsigned    limit_chunkoffset;
    BlockRefTableChunk limit_chunk;

    /* If we already have an equal or lower limit block, do nothing. */
    if (limit_block >= entry->limit_block)
        return;

    /* Record the new limit block value. */
    entry->limit_block = limit_block;

    /*
     * Figure out which chunk would store the state of the new limit block,
     * and which offset within that chunk.
     */
    limit_chunkno = limit_block / BLOCKS_PER_CHUNK;
    limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK;

    /*
     * If the number of chunks is not large enough for any blocks with equal
     * or higher block numbers to exist, then there is nothing further to do.
     */
    if (limit_chunkno >= entry->nchunks)
        return;

    /* Discard entire contents of any higher-numbered chunks. */
    for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
        entry->chunk_usage[chunkno] = 0;

    /*
     * Next, we need to discard any offsets within the chunk that would
     * contain the limit_block. We must handle this differently depending on
     * whether the chunk that would contain limit_block is a bitmap or an
     * array of offsets.
     */
    limit_chunk = entry->chunk_data[limit_chunkno];
    if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
    {
        unsigned    chunkoffset;

        /* It's a bitmap. Unset bits. */
        for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
             ++chunkoffset)
            limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
                ~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
    }
    else
    {
        unsigned    i,
                    j = 0;

        /* It's an offset array. Filter out large offsets. */
        for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
        {
            if (limit_chunk[i] < limit_chunkoffset)
                limit_chunk[j++] = limit_chunk[i];
        }
        Assert(j <= entry->chunk_usage[limit_chunkno]);
        entry->chunk_usage[limit_chunkno] = j;
    }
}
/*
 * Mark a block in a given BlockRefTableEntry as known to have been modified.
 */
void
BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry,
                                    ForkNumber forknum,
                                    BlockNumber blknum)
{
    unsigned    chunkno;
    unsigned    chunkoffset;
    unsigned    i;

    /*
     * Which chunk should store the state of this block? And what is the
     * offset of this block relative to the start of that chunk?
     */
    chunkno = blknum / BLOCKS_PER_CHUNK;
    chunkoffset = blknum % BLOCKS_PER_CHUNK;

    /*
     * If 'nchunks' isn't big enough for us to be able to represent the state
     * of this block, we need to enlarge our arrays.
     */
    if (chunkno >= entry->nchunks)
    {
        unsigned    max_chunks;
        unsigned    extra_chunks;

        /*
         * New array size is a power of 2, at least 16, big enough so that
         * chunkno will be a valid array index.
         */
        max_chunks = Max(16, entry->nchunks);
        while (max_chunks < chunkno + 1)
            max_chunks *= 2;
        extra_chunks = max_chunks - entry->nchunks;

        if (entry->nchunks == 0)
        {
            entry->chunk_size = palloc0(sizeof(uint16) * max_chunks);
            entry->chunk_usage = palloc0(sizeof(uint16) * max_chunks);
            entry->chunk_data =
                palloc0(sizeof(BlockRefTableChunk) * max_chunks);
        }
        else
        {
            entry->chunk_size = repalloc(entry->chunk_size,
                                         sizeof(uint16) * max_chunks);
            memset(&entry->chunk_size[entry->nchunks], 0,
                   extra_chunks * sizeof(uint16));
            entry->chunk_usage = repalloc(entry->chunk_usage,
                                          sizeof(uint16) * max_chunks);
            memset(&entry->chunk_usage[entry->nchunks], 0,
                   extra_chunks * sizeof(uint16));
            entry->chunk_data = repalloc(entry->chunk_data,
                                         sizeof(BlockRefTableChunk) * max_chunks);
            memset(&entry->chunk_data[entry->nchunks], 0,
                   extra_chunks * sizeof(BlockRefTableChunk));
        }
        entry->nchunks = max_chunks;
    }

    /*
     * If the chunk that covers this block number doesn't exist yet, create it
     * as an array and add the appropriate offset to it. We make it pretty
     * small initially, because there might only be 1 or a few block
     * references in this chunk and we don't want to use up too much memory.
     */
    if (entry->chunk_size[chunkno] == 0)
    {
        entry->chunk_data[chunkno] =
            palloc(sizeof(uint16) * INITIAL_ENTRIES_PER_CHUNK);
        entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK;
        entry->chunk_data[chunkno][0] = chunkoffset;
        entry->chunk_usage[chunkno] = 1;
        return;
    }

    /*
     * If the number of entries in this chunk is already maximum, it must be a
     * bitmap. Just set the appropriate bit.
     */
    if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK)
    {
        BlockRefTableChunk chunk = entry->chunk_data[chunkno];

        chunk[chunkoffset / BLOCKS_PER_ENTRY] |=
            1 << (chunkoffset % BLOCKS_PER_ENTRY);
        return;
    }

    /*
     * There is an existing chunk and it's in array format. Let's find out
     * whether it already has an entry for this block. If so, we do not need
     * to do anything further.
     */
    for (i = 0; i < entry->chunk_usage[chunkno]; ++i)
    {
        if (entry->chunk_data[chunkno][i] == chunkoffset)
            return;
    }

    /*
     * If the number of entries currently used is one less than the maximum,
     * it's time to convert to bitmap format.
     */
    if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1)
    {
        BlockRefTableChunk newchunk;
        unsigned    j;

        /* Allocate a new chunk. */
        newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16));

        /* Set the bit for each existing entry. */
        for (j = 0; j < entry->chunk_usage[chunkno]; ++j)
        {
            unsigned    coff = entry->chunk_data[chunkno][j];

            newchunk[coff / BLOCKS_PER_ENTRY] |=
                1 << (coff % BLOCKS_PER_ENTRY);
        }

        /* Set the bit for the new entry. */
        newchunk[chunkoffset / BLOCKS_PER_ENTRY] |=
            1 << (chunkoffset % BLOCKS_PER_ENTRY);

        /* Swap the new chunk into place and update metadata. */
        pfree(entry->chunk_data[chunkno]);
        entry->chunk_data[chunkno] = newchunk;
        entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK;
        entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK;
        return;
    }

    /*
     * OK, we currently have an array, and we don't need to convert to a
     * bitmap, but we do need to add a new element. If there's not enough
     * room, we'll have to expand the array.
     */
    if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno])
    {
        unsigned    newsize = entry->chunk_size[chunkno] * 2;

        Assert(newsize <= MAX_ENTRIES_PER_CHUNK);
        entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno],
                                              newsize * sizeof(uint16));
        entry->chunk_size[chunkno] = newsize;
    }

    /* Now we can add the new entry. */
    entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] =
        chunkoffset;
    entry->chunk_usage[chunkno]++;
}
/*
 * Release memory for a BlockRefTableEntry that was created by
 * CreateBlockRefTableEntry.
 */
void
BlockRefTableFreeEntry(BlockRefTableEntry *entry)
{
    if (entry->chunk_size != NULL)
    {
        pfree(entry->chunk_size);
        entry->chunk_size = NULL;
    }

    if (entry->chunk_usage != NULL)
    {
        pfree(entry->chunk_usage);
        entry->chunk_usage = NULL;
    }

    if (entry->chunk_data != NULL)
    {
        pfree(entry->chunk_data);
        entry->chunk_data = NULL;
    }

    pfree(entry);
}
/*
 * Comparator for BlockRefTableSerializedEntry objects.
 *
 * We make the tablespace OID the first column of the sort key to match
 * the on-disk tree structure.
 */
static int
BlockRefTableComparator(const void *a, const void *b)
{
    const BlockRefTableSerializedEntry *sa = a;
    const BlockRefTableSerializedEntry *sb = b;

    if (sa->rlocator.spcOid > sb->rlocator.spcOid)
        return 1;
    if (sa->rlocator.spcOid < sb->rlocator.spcOid)
        return -1;

    if (sa->rlocator.dbOid > sb->rlocator.dbOid)
        return 1;
    if (sa->rlocator.dbOid < sb->rlocator.dbOid)
        return -1;

    if (sa->rlocator.relNumber > sb->rlocator.relNumber)
        return 1;
    if (sa->rlocator.relNumber < sb->rlocator.relNumber)
        return -1;

    if (sa->forknum > sb->forknum)
        return 1;
    if (sa->forknum < sb->forknum)
        return -1;

    return 0;
}
/*
 * Flush any buffered data out of a BlockRefTableBuffer.
 */
static void
BlockRefTableFlush(BlockRefTableBuffer *buffer)
{
    buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used);
    buffer->used = 0;
}
/*
 * Read data from a BlockRefTableBuffer, and update the running CRC
 * calculation for the returned data (but not any data that we may have
 * buffered but not yet actually returned).
 */
static void
BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
{
    BlockRefTableBuffer *buffer = &reader->buffer;

    /* Loop until read is fully satisfied. */
    while (length > 0)
    {
        if (buffer->cursor < buffer->used)
        {
            /*
             * If any buffered data is available, use that to satisfy as much
             * of the request as possible.
             */
            int         bytes_to_copy = Min(length, buffer->used - buffer->cursor);

            memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy);
            COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor],
                        bytes_to_copy);
            buffer->cursor += bytes_to_copy;
            data = ((char *) data) + bytes_to_copy;
            length -= bytes_to_copy;
        }
        else if (length >= BUFSIZE)
        {
            int         bytes_read;

            /*
             * If the request length is long, read directly into caller's
             * buffer.
             */
            bytes_read = buffer->io_callback(buffer->io_callback_arg,
                                             data, length);
            COMP_CRC32C(buffer->crc, data, bytes_read);
            data = ((char *) data) + bytes_read;
            length -= bytes_read;

            /* If we didn't get anything, that's bad. */
            if (bytes_read == 0)
                reader->error_callback(reader->error_callback_arg,
                                       "file \"%s\" ends unexpectedly",
                                       reader->error_filename);
        }
        else
        {
            /*
             * Refill our buffer.
             */
            buffer->used = buffer->io_callback(buffer->io_callback_arg,
                                               buffer->data, BUFSIZE);
            buffer->cursor = 0;

            /* If we didn't get anything, that's bad. */
            if (buffer->used == 0)
                reader->error_callback(reader->error_callback_arg,
                                       "file \"%s\" ends unexpectedly",
                                       reader->error_filename);
        }
    }
}
/*
 * Supply data to a BlockRefTableBuffer for writing to the underlying file,
 * and update the running CRC calculation for that data.
 */
static void
BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length)
{
    /* Update running CRC calculation. */
    COMP_CRC32C(buffer->crc, data, length);

    /* If the new data can't fit into the buffer, flush the buffer. */
    if (buffer->used + length > BUFSIZE)
    {
        buffer->io_callback(buffer->io_callback_arg, buffer->data,
                            buffer->used);
        buffer->used = 0;
    }

    /* If the new data would fill the buffer, or more, write it directly. */
    if (length >= BUFSIZE)
    {
        buffer->io_callback(buffer->io_callback_arg, data, length);
        return;
    }

    /* Otherwise, copy the new data into the buffer. */
    memcpy(&buffer->data[buffer->used], data, length);
    buffer->used += length;
    Assert(buffer->used <= BUFSIZE);
}
/*
 * Generate the sentinel and CRC required at the end of a block reference
 * table file and flush them out of our internal buffer.
 */
static void
BlockRefTableFileTerminate(BlockRefTableBuffer *buffer)
{
    BlockRefTableSerializedEntry zentry = {{0}};
    pg_crc32c   crc;

    /* Write a sentinel indicating that there are no more entries. */
    BlockRefTableWrite(buffer, &zentry,
                       sizeof(BlockRefTableSerializedEntry));

    /*
     * Writing the checksum will perturb the ongoing checksum calculation, so
     * copy the state first and finalize the computation using the copy.
     */
    crc = buffer->crc;
    FIN_CRC32C(crc);
    BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c));

    /* Flush any leftover data out of our buffer. */
    BlockRefTableFlush(buffer);
}