/*-------------------------------------------------------------------------
 *
 * Block reference tables.
 *
 * A block reference table is used to keep track of which blocks have
 * been modified by WAL records within a certain LSN range.
 *
 * For each relation fork, we keep track of all blocks that have appeared
 * in block references in the WAL. We also keep track of the "limit block",
 * which is the smallest relation length in blocks known to have occurred
 * during that range of WAL records. This should be set to 0 if the relation
 * fork is created or destroyed, and to the post-truncation length if it is
 * truncated.
 *
 * Whenever we set the limit block, we also forget about any modified blocks
 * beyond that point. Those blocks don't exist any more. Such blocks can
 * later be marked as modified again; if that happens, it means the relation
 * was re-extended or recreated past the old limit.
 *
 * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
 *
 * src/common/blkreftable.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres_fe.h"

#include "common/logging.h"

#include "common/blkreftable.h"
#include "common/hashfn.h"
#include "port/pg_crc32c.h"
/*
 * A block reference table keeps track of the status of each relation fork.
 */
typedef struct BlockRefTableKey
{
    RelFileLocator rlocator;
    ForkNumber  forknum;
} BlockRefTableKey;
/*
 * We could need to store data either for a relation in which only a
 * tiny fraction of the blocks have been modified or for a relation in
 * which nearly every block has been modified, and we want a
 * space-efficient representation in both cases. To accomplish this,
 * we divide the relation into chunks of 2^16 blocks and choose between
 * an array representation and a bitmap representation for each chunk.
 *
 * When the number of modified blocks in a given chunk is small, we
 * essentially store an array of block numbers, but we need not store the
 * entire block number: instead, we store each block number as a 2-byte
 * offset from the start of the chunk.
 *
 * When the number of modified blocks in a given chunk is large, we switch
 * to a bitmap representation.
 *
 * These same basic representational choices are used both when a block
 * reference table is stored in memory and when it is serialized to disk.
 *
 * In the in-memory representation, we initially allocate each chunk with
 * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and
 * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK.
 * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted
 * to a bitmap, and thus never needs to grow further.
 */
#define BLOCKS_PER_CHUNK        (1 << 16)
#define BLOCKS_PER_ENTRY        (BITS_PER_BYTE * sizeof(uint16))
#define MAX_ENTRIES_PER_CHUNK   (BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY)
#define INITIAL_ENTRIES_PER_CHUNK   16

typedef uint16 *BlockRefTableChunk;
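
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * shows how a block number decomposes into a chunk number, an offset within
 * the chunk, and, for a chunk in bitmap form, the uint16 word and bit that
 * represent it. With BLOCKS_PER_CHUNK = 65536 and BLOCKS_PER_ENTRY = 16, a
 * full bitmap chunk needs MAX_ENTRIES_PER_CHUNK = 4096 uint16 words, i.e.
 * 8kB -- the same space as 4096 two-byte offsets, which is why that count is
 * the crossover point between the two representations. The helper name below
 * is hypothetical.
 */
static inline void
blkreftable_example_decompose(BlockNumber blknum,
                              uint32 *chunkno, unsigned *chunkoffset,
                              unsigned *wordno, uint16 *bitmask)
{
    *chunkno = blknum / BLOCKS_PER_CHUNK;   /* which chunk */
    *chunkoffset = blknum % BLOCKS_PER_CHUNK;   /* offset within the chunk */
    *wordno = *chunkoffset / BLOCKS_PER_ENTRY;  /* uint16 word in the bitmap */
    *bitmask = (uint16) (1 << (*chunkoffset % BLOCKS_PER_ENTRY));   /* bit within that word */
}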
/*
 * State for one relation fork.
 *
 * 'rlocator' and 'forknum' identify the relation fork to which this entry
 * pertains.
 *
 * 'limit_block' is the shortest known length of the relation in blocks
 * within the LSN range covered by a particular block reference table.
 * It should be set to 0 if the relation fork is created or dropped. If the
 * relation fork is truncated, it should be set to the number of blocks that
 * remain after truncation.
 *
 * 'nchunks' is the allocated length of each of the three arrays that follow.
 * We can only represent the status of block numbers less than nchunks *
 * BLOCKS_PER_CHUNK.
 *
 * 'chunk_size' is an array storing the allocated size of each chunk.
 *
 * 'chunk_usage' is an array storing the number of elements used in each
 * chunk. If that value is less than MAX_ENTRIES_PER_CHUNK, the corresponding
 * chunk is used as an array; else the corresponding chunk is used as a bitmap.
 * When used as a bitmap, the least significant bit of the first array element
 * is the status of the lowest-numbered block covered by this chunk.
 *
 * 'chunk_data' is the array of chunks.
 */
struct BlockRefTableEntry
{
    BlockRefTableKey key;
    BlockNumber limit_block;
    uint32      nchunks;
    uint16     *chunk_size;
    uint16     *chunk_usage;
    BlockRefTableChunk *chunk_data;
};
/* Declare and define a hash table over type BlockRefTableEntry. */
#define SH_PREFIX blockreftable
#define SH_ELEMENT_TYPE BlockRefTableEntry
#define SH_KEY_TYPE BlockRefTableKey
#define SH_HASH_KEY(tb, key) \
    hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey))
#define SH_EQUAL(tb, a, b) (memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0)
#define SH_SCOPE static inline
#define SH_RAW_ALLOCATOR pg_malloc0
#include "lib/simplehash.h"
/*
 * A block reference table is basically just the hash table, but we don't
 * want to expose that to outside callers.
 *
 * We keep track of the memory context in use explicitly too, so that it's
 * easy to place all of our allocations in the same context.
 */
struct BlockRefTable
{
    blockreftable_hash *hash;
    MemoryContext mcxt;
};
/*
 * On-disk serialization format for block reference table entries.
 */
typedef struct BlockRefTableSerializedEntry
{
    RelFileLocator rlocator;
    ForkNumber  forknum;
    BlockNumber limit_block;
    uint32      nchunks;
} BlockRefTableSerializedEntry;
/*
 * Buffer size, so that we avoid doing many small I/Os.
 */
#define BUFSIZE         65536
/*
 * Ad-hoc buffer for file I/O.
 */
typedef struct BlockRefTableBuffer
{
    io_callback_fn io_callback;
    void       *io_callback_arg;
    char        data[BUFSIZE];
    int         used;
    int         cursor;
    pg_crc32c   crc;
} BlockRefTableBuffer;
/*
 * State for keeping track of progress while incrementally reading a block
 * reference table file from disk.
 *
 * total_chunks means the number of chunks for the RelFileLocator/ForkNumber
 * combination that is currently being read, and consumed_chunks is the number
 * of those that have been read. (We always read all the information for
 * a single chunk at one time, so we don't need to be able to represent the
 * state where a chunk has been partially read.)
 *
 * chunk_size is the array of chunk sizes. The length is given by total_chunks.
 *
 * chunk_data holds the current chunk.
 *
 * chunk_position helps us figure out how much progress we've made in returning
 * the block numbers for the current chunk to the caller. If the chunk is a
 * bitmap, it's the number of bits we've scanned; otherwise, it's the number
 * of chunk entries we've scanned.
 */
struct BlockRefTableReader
{
    BlockRefTableBuffer buffer;
    char       *error_filename;
    report_error_fn error_callback;
    void       *error_callback_arg;
    uint32      total_chunks;
    uint32      consumed_chunks;
    uint16     *chunk_size;
    uint16      chunk_data[MAX_ENTRIES_PER_CHUNK];
    uint32      chunk_position;
};
/*
 * State for keeping track of progress while incrementally writing a block
 * reference table file to disk.
 */
struct BlockRefTableWriter
{
    BlockRefTableBuffer buffer;
};
/* Function prototypes. */
static int  BlockRefTableComparator(const void *a, const void *b);
static void BlockRefTableFlush(BlockRefTableBuffer *buffer);
static void BlockRefTableRead(BlockRefTableReader *reader, void *data,
                              int length);
static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data,
                               int length);
static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer);
/*
 * Create an empty block reference table.
 */
BlockRefTable *
CreateEmptyBlockRefTable(void)
{
    BlockRefTable *brtab = palloc(sizeof(BlockRefTable));

    /*
     * Even a completely empty database has a few hundred relation forks, so
     * it seems best to size the hash on the assumption that we're going to
     * have at least a few thousand entries.
     */
#ifdef FRONTEND
    brtab->hash = blockreftable_create(4096, NULL);
#else
    brtab->mcxt = CurrentMemoryContext;
    brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL);
#endif

    return brtab;
}
/*
 * Set the "limit block" for a relation fork and forget any modified blocks
 * with equal or higher block numbers.
 *
 * The "limit block" is the shortest known length of the relation within the
 * range of WAL records covered by this block reference table.
 */
void
BlockRefTableSetLimitBlock(BlockRefTable *brtab,
                           const RelFileLocator *rlocator,
                           ForkNumber forknum,
                           BlockNumber limit_block)
{
    BlockRefTableEntry *brtentry;
    BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
    bool        found;

    memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
    key.forknum = forknum;
    brtentry = blockreftable_insert(brtab->hash, key, &found);

    if (!found)
    {
        /*
         * We have no existing data about this relation fork, so just record
         * the limit_block value supplied by the caller, and make sure other
         * parts of the entry are properly initialized.
         */
        brtentry->limit_block = limit_block;
        brtentry->nchunks = 0;
        brtentry->chunk_size = NULL;
        brtentry->chunk_usage = NULL;
        brtentry->chunk_data = NULL;
        return;
    }

    BlockRefTableEntrySetLimitBlock(brtentry, limit_block);
}
/*
 * Mark a block in a given relation fork as known to have been modified.
 */
void
BlockRefTableMarkBlockModified(BlockRefTable *brtab,
                               const RelFileLocator *rlocator,
                               ForkNumber forknum,
                               BlockNumber blknum)
{
    BlockRefTableEntry *brtentry;
    BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
    bool        found;
    MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt);

    memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
    key.forknum = forknum;
    brtentry = blockreftable_insert(brtab->hash, key, &found);

    if (!found)
    {
        /*
         * We want to set the initial limit block value to something higher
         * than any legal block number. InvalidBlockNumber fits the bill.
         */
        brtentry->limit_block = InvalidBlockNumber;
        brtentry->nchunks = 0;
        brtentry->chunk_size = NULL;
        brtentry->chunk_usage = NULL;
        brtentry->chunk_data = NULL;
    }

    BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum);

    MemoryContextSwitchTo(oldcontext);
}
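
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * a typical in-memory workflow builds a table, marks the blocks touched by
 * WAL records, and records truncations via the limit block. The caller name
 * and the concrete block numbers here are hypothetical.
 */
static void
blkreftable_example_build(const RelFileLocator *rlocator)
{
    BlockRefTable *brtab = CreateEmptyBlockRefTable();

    /* Blocks 0 and 17 of the main fork were modified. */
    BlockRefTableMarkBlockModified(brtab, rlocator, MAIN_FORKNUM, 0);
    BlockRefTableMarkBlockModified(brtab, rlocator, MAIN_FORKNUM, 17);

    /* The relation was later truncated to 10 blocks; block 17 is forgotten. */
    BlockRefTableSetLimitBlock(brtab, rlocator, MAIN_FORKNUM, 10);
}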
/*
 * Get an entry from a block reference table.
 *
 * If the entry does not exist, this function returns NULL. Otherwise, it
 * returns the entry and sets *limit_block to the value from the entry.
 */
BlockRefTableEntry *
BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator,
                      ForkNumber forknum, BlockNumber *limit_block)
{
    BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
    BlockRefTableEntry *entry;

    Assert(limit_block != NULL);

    memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
    key.forknum = forknum;
    entry = blockreftable_lookup(brtab->hash, key);

    if (entry != NULL)
        *limit_block = entry->limit_block;

    return entry;
}
/*
 * Get block numbers from a table entry.
 *
 * 'blocks' must point to enough space to hold at least 'nblocks' block
 * numbers, and any block numbers we manage to get will be written there.
 * The return value is the number of block numbers actually written.
 *
 * We do not return block numbers unless they are greater than or equal to
 * start_blkno and strictly less than stop_blkno.
 */
int
BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry,
                            BlockNumber start_blkno,
                            BlockNumber stop_blkno,
                            BlockNumber *blocks,
                            int nblocks)
{
    uint32      start_chunkno;
    uint32      stop_chunkno;
    uint32      chunkno;
    int         nresults = 0;

    Assert(entry != NULL);

    /*
     * Figure out which chunks could potentially contain blocks of interest.
     *
     * We need to be careful about overflow here, because stop_blkno could be
     * InvalidBlockNumber or something very close to it.
     */
    start_chunkno = start_blkno / BLOCKS_PER_CHUNK;
    stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK;
    if ((stop_blkno % BLOCKS_PER_CHUNK) != 0)
        ++stop_chunkno;
    if (stop_chunkno > entry->nchunks)
        stop_chunkno = entry->nchunks;

    /* Loop over the chunks of interest. */
    for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno)
    {
        uint16      chunk_usage = entry->chunk_usage[chunkno];
        BlockRefTableChunk chunk_data = entry->chunk_data[chunkno];
        unsigned    start_offset = 0;
        unsigned    stop_offset = BLOCKS_PER_CHUNK;
        unsigned    i;

        /*
         * If the start and/or stop block number falls within this chunk, the
         * whole chunk may not be of interest. Figure out which portion we
         * care about, if it's not the whole thing.
         */
        if (chunkno == start_chunkno)
            start_offset = start_blkno % BLOCKS_PER_CHUNK;
        if (chunkno == stop_chunkno - 1)
        {
            Assert(stop_blkno > chunkno * BLOCKS_PER_CHUNK);
            stop_offset = stop_blkno - (chunkno * BLOCKS_PER_CHUNK);
            Assert(stop_offset <= BLOCKS_PER_CHUNK);
        }

        /*
         * Handling differs depending on whether this is an array of offsets
         * or a bitmap.
         */
        if (chunk_usage == MAX_ENTRIES_PER_CHUNK)
        {
            /* It's a bitmap, so test every relevant bit. */
            for (i = start_offset; i < stop_offset; ++i)
            {
                uint16      w = chunk_data[i / BLOCKS_PER_ENTRY];

                if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0)
                {
                    BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i;

                    blocks[nresults++] = blkno;

                    /* Early exit if we run out of output space. */
                    if (nresults == nblocks)
                        return nresults;
                }
            }
        }
        else
        {
            /* It's an array of offsets, so check each one. */
            for (i = 0; i < chunk_usage; ++i)
            {
                uint16      offset = chunk_data[i];

                if (offset >= start_offset && offset < stop_offset)
                {
                    BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset;

                    blocks[nresults++] = blkno;

                    /* Early exit if we run out of output space. */
                    if (nresults == nblocks)
                        return nresults;
                }
            }
        }
    }

    return nresults;
}
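
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * drains all modified blocks for one relation fork out of an in-memory
 * table in fixed-size batches. The caller name and buffer size are
 * hypothetical.
 */
static void
blkreftable_example_scan(BlockRefTable *brtab, const RelFileLocator *rlocator)
{
    BlockNumber limit_block;
    BlockRefTableEntry *entry;
    BlockNumber blocks[16];
    BlockNumber start = 0;
    int         nresults;

    entry = BlockRefTableGetEntry(brtab, rlocator, MAIN_FORKNUM, &limit_block);
    if (entry == NULL)
        return;                 /* no information about this fork */

    do
    {
        nresults = BlockRefTableEntryGetBlocks(entry, start, InvalidBlockNumber,
                                               blocks, lengthof(blocks));
        if (nresults > 0)
            start = blocks[nresults - 1] + 1;   /* resume after the last hit */
    } while (nresults == lengthof(blocks));
}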
/*
 * Serialize a block reference table to a file.
 */
void
WriteBlockRefTable(BlockRefTable *brtab,
                   io_callback_fn write_callback,
                   void *write_callback_arg)
{
    BlockRefTableSerializedEntry *sdata = NULL;
    BlockRefTableBuffer buffer;
    uint32      magic = BLOCKREFTABLE_MAGIC;

    /* Prepare buffer. */
    memset(&buffer, 0, sizeof(BlockRefTableBuffer));
    buffer.io_callback = write_callback;
    buffer.io_callback_arg = write_callback_arg;
    INIT_CRC32C(buffer.crc);

    /* Write magic number. */
    BlockRefTableWrite(&buffer, &magic, sizeof(uint32));

    /* Write the entries, assuming there are some. */
    if (brtab->hash->members > 0)
    {
        unsigned    i = 0;
        blockreftable_iterator it;
        BlockRefTableEntry *brtentry;

        /* Extract entries into serializable format and sort them. */
        sdata =
            palloc(brtab->hash->members * sizeof(BlockRefTableSerializedEntry));
        blockreftable_start_iterate(brtab->hash, &it);
        while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL)
        {
            BlockRefTableSerializedEntry *sentry = &sdata[i++];

            sentry->rlocator = brtentry->key.rlocator;
            sentry->forknum = brtentry->key.forknum;
            sentry->limit_block = brtentry->limit_block;
            sentry->nchunks = brtentry->nchunks;

            /* Trim trailing zero entries. */
            while (sentry->nchunks > 0 &&
                   brtentry->chunk_usage[sentry->nchunks - 1] == 0)
                sentry->nchunks--;
        }
        Assert(i == brtab->hash->members);
        qsort(sdata, i, sizeof(BlockRefTableSerializedEntry),
              BlockRefTableComparator);

        /* Loop over entries in sorted order and serialize each one. */
        for (i = 0; i < brtab->hash->members; ++i)
        {
            BlockRefTableSerializedEntry *sentry = &sdata[i];
            BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
            unsigned    j;

            /* Write the serialized entry itself. */
            BlockRefTableWrite(&buffer, sentry,
                               sizeof(BlockRefTableSerializedEntry));

            /* Look up the original entry so we can access the chunks. */
            memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator));
            key.forknum = sentry->forknum;
            brtentry = blockreftable_lookup(brtab->hash, key);
            Assert(brtentry != NULL);

            /* Write the untruncated portion of the chunk length array. */
            if (sentry->nchunks != 0)
                BlockRefTableWrite(&buffer, brtentry->chunk_usage,
                                   sentry->nchunks * sizeof(uint16));

            /* Write the contents of each chunk. */
            for (j = 0; j < brtentry->nchunks; ++j)
            {
                if (brtentry->chunk_usage[j] == 0)
                    continue;
                BlockRefTableWrite(&buffer, brtentry->chunk_data[j],
                                   brtentry->chunk_usage[j] * sizeof(uint16));
            }
        }
    }

    /* Write out appropriate terminator and CRC and flush buffer. */
    BlockRefTableFileTerminate(&buffer);
}
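
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * WriteBlockRefTable() pushes bytes through an io_callback_fn, so a caller
 * that wants the table in an ordinary stdio file can supply a callback like
 * the one below, assuming the (arg, data, length => byte count) shape used
 * by the calls in this file. The function names are hypothetical and the
 * error handling is reduced to a bare exit.
 */
static int
blkreftable_example_write_cb(void *callback_arg, void *data, int length)
{
    FILE       *fp = (FILE *) callback_arg;

    if (fwrite(data, 1, length, fp) != (size_t) length)
        pg_fatal("could not write block reference table file: %m");
    return length;
}

static void
blkreftable_example_write_file(BlockRefTable *brtab, const char *path)
{
    FILE       *fp = fopen(path, "wb");

    if (fp == NULL)
        pg_fatal("could not open file \"%s\": %m", path);
    WriteBlockRefTable(brtab, blkreftable_example_write_cb, fp);
    fclose(fp);
}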
/*
 * Prepare to incrementally read a block reference table file.
 *
 * 'read_callback' is a function that can be called to read data from the
 * underlying file (or other data source) into our internal buffer.
 *
 * 'read_callback_arg' is an opaque argument to be passed to read_callback.
 *
 * 'error_filename' is the filename that should be included in error messages
 * if the file is found to be malformed. The value is not copied, so the
 * caller should ensure that it remains valid until done with this
 * BlockRefTableReader.
 *
 * 'error_callback' is a function to be called if the file is found to be
 * malformed. This is not used for I/O errors, which must be handled
 * internally by read_callback.
 *
 * 'error_callback_arg' is an opaque argument to be passed to error_callback.
 */
BlockRefTableReader *
CreateBlockRefTableReader(io_callback_fn read_callback,
                          void *read_callback_arg,
                          char *error_filename,
                          report_error_fn error_callback,
                          void *error_callback_arg)
{
    BlockRefTableReader *reader;
    uint32      magic;

    /* Initialize data structure. */
    reader = palloc0(sizeof(BlockRefTableReader));
    reader->buffer.io_callback = read_callback;
    reader->buffer.io_callback_arg = read_callback_arg;
    reader->error_filename = error_filename;
    reader->error_callback = error_callback;
    reader->error_callback_arg = error_callback_arg;
    INIT_CRC32C(reader->buffer.crc);

    /* Verify magic number. */
    BlockRefTableRead(reader, &magic, sizeof(uint32));
    if (magic != BLOCKREFTABLE_MAGIC)
        error_callback(error_callback_arg,
                       "file \"%s\" has wrong magic number: expected %u, found %u",
                       error_filename, BLOCKREFTABLE_MAGIC, magic);

    return reader;
}
/*
 * Read next relation fork covered by this block reference table file.
 *
 * After calling this function, you must call BlockRefTableReaderGetBlocks
 * until it returns 0 before calling it again.
 */
bool
BlockRefTableReaderNextRelation(BlockRefTableReader *reader,
                                RelFileLocator *rlocator,
                                ForkNumber *forknum,
                                BlockNumber *limit_block)
{
    BlockRefTableSerializedEntry sentry;
    BlockRefTableSerializedEntry zentry = {{0}};

    /*
     * Sanity check: caller must read all blocks from all chunks before moving
     * on to the next relation.
     */
    Assert(reader->total_chunks == reader->consumed_chunks);

    /* Read serialized entry. */
    BlockRefTableRead(reader, &sentry,
                      sizeof(BlockRefTableSerializedEntry));

    /*
     * If we just read the sentinel entry indicating that we've reached the
     * end, read and check the CRC.
     */
    if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0)
    {
        pg_crc32c   expected_crc;
        pg_crc32c   actual_crc;

        /*
         * We want to know the CRC of the file excluding the 4-byte CRC
         * itself, so copy the current value of the CRC accumulator before
         * reading those bytes, and use the copy to finalize the calculation.
         */
        expected_crc = reader->buffer.crc;
        FIN_CRC32C(expected_crc);

        /* Now we can read the actual value. */
        BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c));

        /* Throw an error if there is a mismatch. */
        if (!EQ_CRC32C(expected_crc, actual_crc))
            reader->error_callback(reader->error_callback_arg,
                                   "file \"%s\" has wrong checksum: expected %08X, found %08X",
                                   reader->error_filename, expected_crc, actual_crc);

        return false;
    }

    /* Read chunk size array. */
    if (reader->chunk_size != NULL)
        pfree(reader->chunk_size);
    reader->chunk_size = palloc(sentry.nchunks * sizeof(uint16));
    BlockRefTableRead(reader, reader->chunk_size,
                      sentry.nchunks * sizeof(uint16));

    /* Set up for chunk scan. */
    reader->total_chunks = sentry.nchunks;
    reader->consumed_chunks = 0;

    /* Return data to caller. */
    memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator));
    *forknum = sentry.forknum;
    *limit_block = sentry.limit_block;

    return true;
}
/*
 * Get modified blocks associated with the relation fork returned by
 * the most recent call to BlockRefTableReaderNextRelation.
 *
 * On return, block numbers will be written into the 'blocks' array, whose
 * length should be passed via 'nblocks'. The return value is the number of
 * entries actually written into the 'blocks' array, which may be less than
 * 'nblocks' if we run out of modified blocks in the relation fork before
 * we run out of room in the array.
 */
unsigned
BlockRefTableReaderGetBlocks(BlockRefTableReader *reader,
                             BlockNumber *blocks,
                             int nblocks)
{
    unsigned    blocks_found = 0;

    /* Must provide space for at least one block number to be returned. */
    Assert(nblocks > 0);

    /* Loop collecting blocks to return to caller. */
    for (;;)
    {
        uint16      next_chunk_size;

        /*
         * If we've read at least one chunk, maybe it contains some block
         * numbers that could satisfy caller's request.
         */
        if (reader->consumed_chunks > 0)
        {
            uint32      chunkno = reader->consumed_chunks - 1;
            uint16      chunk_size = reader->chunk_size[chunkno];

            if (chunk_size == MAX_ENTRIES_PER_CHUNK)
            {
                /* Bitmap format, so search for bits that are set. */
                while (reader->chunk_position < BLOCKS_PER_CHUNK &&
                       blocks_found < nblocks)
                {
                    uint16      chunkoffset = reader->chunk_position;
                    uint16      w;

                    w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY];
                    if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0)
                        blocks[blocks_found++] =
                            chunkno * BLOCKS_PER_CHUNK + chunkoffset;
                    ++reader->chunk_position;
                }
            }
            else
            {
                /* Not in bitmap format, so each entry is a 2-byte offset. */
                while (reader->chunk_position < chunk_size &&
                       blocks_found < nblocks)
                {
                    blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK
                        + reader->chunk_data[reader->chunk_position];
                    ++reader->chunk_position;
                }
            }
        }

        /* We found enough blocks, so we're done. */
        if (blocks_found >= nblocks)
            break;

        /*
         * We didn't find enough blocks, so we must need the next chunk. If
         * there are none left, though, then we're done anyway.
         */
        if (reader->consumed_chunks == reader->total_chunks)
            break;

        /*
         * Read data for next chunk and reset scan position to beginning of
         * chunk. Note that the next chunk might be empty, in which case we
         * consume the chunk without actually consuming any bytes from the
         * underlying file.
         */
        next_chunk_size = reader->chunk_size[reader->consumed_chunks];
        if (next_chunk_size > 0)
            BlockRefTableRead(reader, reader->chunk_data,
                              next_chunk_size * sizeof(uint16));
        ++reader->consumed_chunks;
        reader->chunk_position = 0;
    }

    return blocks_found;
}
/*
 * Release memory used while reading a block reference table from a file.
 */
void
DestroyBlockRefTableReader(BlockRefTableReader *reader)
{
    if (reader->chunk_size != NULL)
    {
        pfree(reader->chunk_size);
        reader->chunk_size = NULL;
    }
    pfree(reader);
}
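
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * a typical reading loop. The read callback and error callback are assumed
 * to have the io_callback_fn and printf-style report_error_fn shapes used
 * elsewhere in this file; all names below are hypothetical.
 */
static void
blkreftable_example_read_file(io_callback_fn read_cb, void *read_cb_arg,
                              char *filename, report_error_fn report_error)
{
    BlockRefTableReader *reader;
    RelFileLocator rlocator;
    ForkNumber  forknum;
    BlockNumber limit_block;
    BlockNumber blocks[256];

    reader = CreateBlockRefTableReader(read_cb, read_cb_arg, filename,
                                       report_error, NULL);
    while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
                                           &limit_block))
    {
        unsigned    nblocks;

        /* Drain every modified block for this relation fork. */
        while ((nblocks = BlockRefTableReaderGetBlocks(reader, blocks,
                                                       lengthof(blocks))) > 0)
        {
            /* ... process blocks[0 .. nblocks - 1] here ... */
        }
    }
    DestroyBlockRefTableReader(reader);
}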
/*
 * Prepare to write a block reference table file incrementally.
 *
 * Caller must be able to supply BlockRefTableEntry objects sorted in the
 * appropriate order.
 */
BlockRefTableWriter *
CreateBlockRefTableWriter(io_callback_fn write_callback,
                          void *write_callback_arg)
{
    BlockRefTableWriter *writer;
    uint32      magic = BLOCKREFTABLE_MAGIC;

    /* Prepare buffer and CRC check and save callbacks. */
    writer = palloc0(sizeof(BlockRefTableWriter));
    writer->buffer.io_callback = write_callback;
    writer->buffer.io_callback_arg = write_callback_arg;
    INIT_CRC32C(writer->buffer.crc);

    /* Write magic number. */
    BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32));

    return writer;
}
/*
 * Append one entry to a block reference table file.
 *
 * Note that entries must be written in the proper order, that is, sorted by
 * tablespace, then database, then relfilenumber, then fork number. Caller
 * is responsible for supplying data in the correct order. If that seems hard,
 * use an in-memory BlockRefTable instead.
 */
void
BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry)
{
    BlockRefTableSerializedEntry sentry;
    unsigned    j;

    /* Convert to serialized entry format. */
    sentry.rlocator = entry->key.rlocator;
    sentry.forknum = entry->key.forknum;
    sentry.limit_block = entry->limit_block;
    sentry.nchunks = entry->nchunks;

    /* Trim trailing zero entries. */
    while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0)
        sentry.nchunks--;

    /* Write the serialized entry itself. */
    BlockRefTableWrite(&writer->buffer, &sentry,
                       sizeof(BlockRefTableSerializedEntry));

    /* Write the untruncated portion of the chunk length array. */
    if (sentry.nchunks != 0)
        BlockRefTableWrite(&writer->buffer, entry->chunk_usage,
                           sentry.nchunks * sizeof(uint16));

    /* Write the contents of each chunk. */
    for (j = 0; j < entry->nchunks; ++j)
    {
        if (entry->chunk_usage[j] == 0)
            continue;
        BlockRefTableWrite(&writer->buffer, entry->chunk_data[j],
                           entry->chunk_usage[j] * sizeof(uint16));
    }
}
/*
 * Finalize an incremental write of a block reference table file.
 */
void
DestroyBlockRefTableWriter(BlockRefTableWriter *writer)
{
    BlockRefTableFileTerminate(&writer->buffer);
    pfree(writer);
}
/*
 * Allocate a standalone BlockRefTableEntry.
 *
 * When we're manipulating a full in-memory BlockRefTable, the entries are
 * part of the hash table and are allocated by simplehash. This routine is
 * used by callers that want to write out a BlockRefTable to a file without
 * needing to store the whole thing in memory at once.
 *
 * Entries allocated by this function can be manipulated using the functions
 * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified
 * and then written using BlockRefTableWriteEntry and freed using
 * BlockRefTableFreeEntry.
 */
BlockRefTableEntry *
CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum)
{
    BlockRefTableEntry *entry = palloc0(sizeof(BlockRefTableEntry));

    memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator));
    entry->key.forknum = forknum;
    entry->limit_block = InvalidBlockNumber;

    return entry;
}
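
/*
 * Illustrative sketch (editorial example, not part of the upstream API):
 * the incremental write path builds one standalone entry at a time, in the
 * required sort order, and streams it out without holding the whole table
 * in memory. The caller name, the callback, and the concrete block numbers
 * are hypothetical.
 */
static void
blkreftable_example_incremental_write(io_callback_fn write_cb,
                                      void *write_cb_arg,
                                      RelFileLocator rlocator)
{
    BlockRefTableWriter *writer;
    BlockRefTableEntry *entry;

    writer = CreateBlockRefTableWriter(write_cb, write_cb_arg);

    /* One entry per relation fork, supplied in sorted order. */
    entry = CreateBlockRefTableEntry(rlocator, MAIN_FORKNUM);
    BlockRefTableEntryMarkBlockModified(entry, MAIN_FORKNUM, 3);
    BlockRefTableEntryMarkBlockModified(entry, MAIN_FORKNUM, 42);
    BlockRefTableEntrySetLimitBlock(entry, 100);
    BlockRefTableWriteEntry(writer, entry);
    BlockRefTableFreeEntry(entry);

    DestroyBlockRefTableWriter(writer);
}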
/*
 * Update a BlockRefTableEntry with a new value for the "limit block" and
 * forget any equal-or-higher-numbered modified blocks.
 *
 * The "limit block" is the shortest known length of the relation within the
 * range of WAL records covered by this block reference table.
 */
void
BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry,
                                BlockNumber limit_block)
{
    unsigned    chunkno;
    unsigned    limit_chunkno;
    unsigned    limit_chunkoffset;
    BlockRefTableChunk limit_chunk;

    /* If we already have an equal or lower limit block, do nothing. */
    if (limit_block >= entry->limit_block)
        return;

    /* Record the new limit block value. */
    entry->limit_block = limit_block;

    /*
     * Figure out which chunk would store the state of the new limit block,
     * and which offset within that chunk.
     */
    limit_chunkno = limit_block / BLOCKS_PER_CHUNK;
    limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK;

    /*
     * If the number of chunks is not large enough for any blocks with equal
     * or higher block numbers to exist, then there is nothing further to do.
     */
    if (limit_chunkno >= entry->nchunks)
        return;

    /* Discard entire contents of any higher-numbered chunks. */
    for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
        entry->chunk_usage[chunkno] = 0;

    /*
     * Next, we need to discard any offsets within the chunk that would
     * contain the limit_block. We must handle this differently depending on
     * whether the chunk that would contain limit_block is a bitmap or an
     * array of offsets.
     */
    limit_chunk = entry->chunk_data[limit_chunkno];
    if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
    {
        unsigned    chunkoffset;

        /* It's a bitmap. Unset bits. */
        for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
             ++chunkoffset)
            limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
                ~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
    }
    else
    {
        unsigned    i,
                    j = 0;

        /* It's an offset array. Filter out large offsets. */
        for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
        {
            if (limit_chunk[i] < limit_chunkoffset)
                limit_chunk[j++] = limit_chunk[i];
        }
        Assert(j <= entry->chunk_usage[limit_chunkno]);
        entry->chunk_usage[limit_chunkno] = j;
    }
}
/*
 * Mark a block in a given BlockRefTableEntry as known to have been modified.
 */
void
BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry,
                                    ForkNumber forknum,
                                    BlockNumber blknum)
{
    unsigned    chunkno;
    unsigned    chunkoffset;
    unsigned    i;

    /*
     * Which chunk should store the state of this block? And what is the
     * offset of this block relative to the start of that chunk?
     */
    chunkno = blknum / BLOCKS_PER_CHUNK;
    chunkoffset = blknum % BLOCKS_PER_CHUNK;

    /*
     * If 'nchunks' isn't big enough for us to be able to represent the state
     * of this block, we need to enlarge our arrays.
     */
    if (chunkno >= entry->nchunks)
    {
        unsigned    max_chunks;
        unsigned    extra_chunks;

        /*
         * New array size is a power of 2, at least 16, big enough so that
         * chunkno will be a valid array index.
         */
        max_chunks = Max(16, entry->nchunks);
        while (max_chunks < chunkno + 1)
            max_chunks *= 2;
        extra_chunks = max_chunks - entry->nchunks;

        if (entry->nchunks == 0)
        {
            entry->chunk_size = palloc0(sizeof(uint16) * max_chunks);
            entry->chunk_usage = palloc0(sizeof(uint16) * max_chunks);
            entry->chunk_data =
                palloc0(sizeof(BlockRefTableChunk) * max_chunks);
        }
        else
        {
            entry->chunk_size = repalloc(entry->chunk_size,
                                         sizeof(uint16) * max_chunks);
            memset(&entry->chunk_size[entry->nchunks], 0,
                   extra_chunks * sizeof(uint16));
            entry->chunk_usage = repalloc(entry->chunk_usage,
                                          sizeof(uint16) * max_chunks);
            memset(&entry->chunk_usage[entry->nchunks], 0,
                   extra_chunks * sizeof(uint16));
            entry->chunk_data = repalloc(entry->chunk_data,
                                         sizeof(BlockRefTableChunk) * max_chunks);
            memset(&entry->chunk_data[entry->nchunks], 0,
                   extra_chunks * sizeof(BlockRefTableChunk));
        }
        entry->nchunks = max_chunks;
    }

    /*
     * If the chunk that covers this block number doesn't exist yet, create it
     * as an array and add the appropriate offset to it. We make it pretty
     * small initially, because there might only be 1 or a few block
     * references in this chunk and we don't want to use up too much memory.
     */
    if (entry->chunk_size[chunkno] == 0)
    {
        entry->chunk_data[chunkno] =
            palloc(sizeof(uint16) * INITIAL_ENTRIES_PER_CHUNK);
        entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK;
        entry->chunk_data[chunkno][0] = chunkoffset;
        entry->chunk_usage[chunkno] = 1;
        return;
    }

    /*
     * If the number of entries in this chunk is already maximum, it must be a
     * bitmap. Just set the appropriate bit.
     */
    if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK)
    {
        BlockRefTableChunk chunk = entry->chunk_data[chunkno];

        chunk[chunkoffset / BLOCKS_PER_ENTRY] |=
            1 << (chunkoffset % BLOCKS_PER_ENTRY);
        return;
    }

    /*
     * There is an existing chunk and it's in array format. Let's find out
     * whether it already has an entry for this block. If so, we do not need
     * to do anything further.
     */
    for (i = 0; i < entry->chunk_usage[chunkno]; ++i)
    {
        if (entry->chunk_data[chunkno][i] == chunkoffset)
            return;
    }

    /*
     * If the number of entries currently used is one less than the maximum,
     * it's time to convert to bitmap format.
     */
    if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1)
    {
        BlockRefTableChunk newchunk;
        unsigned    j;

        /* Allocate a new chunk. */
        newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16));

        /* Set the bit for each existing entry. */
        for (j = 0; j < entry->chunk_usage[chunkno]; ++j)
        {
            unsigned    coff = entry->chunk_data[chunkno][j];

            newchunk[coff / BLOCKS_PER_ENTRY] |=
                1 << (coff % BLOCKS_PER_ENTRY);
        }

        /* Set the bit for the new entry. */
        newchunk[chunkoffset / BLOCKS_PER_ENTRY] |=
            1 << (chunkoffset % BLOCKS_PER_ENTRY);

        /* Swap the new chunk into place and update metadata. */
        pfree(entry->chunk_data[chunkno]);
        entry->chunk_data[chunkno] = newchunk;
        entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK;
        entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK;
        return;
    }

    /*
     * OK, we currently have an array, and we don't need to convert to a
     * bitmap, but we do need to add a new element. If there's not enough
     * room, we'll have to expand the array.
     */
    if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno])
    {
        unsigned    newsize = entry->chunk_size[chunkno] * 2;

        Assert(newsize <= MAX_ENTRIES_PER_CHUNK);
        entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno],
                                              newsize * sizeof(uint16));
        entry->chunk_size[chunkno] = newsize;
    }

    /* Now we can add the new entry. */
    entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] =
        chunkoffset;
    entry->chunk_usage[chunkno]++;
}
/*
 * Release memory for a BlockRefTableEntry that was created by
 * CreateBlockRefTableEntry.
 */
void
BlockRefTableFreeEntry(BlockRefTableEntry *entry)
{
    if (entry->chunk_size != NULL)
    {
        pfree(entry->chunk_size);
        entry->chunk_size = NULL;
    }

    if (entry->chunk_usage != NULL)
    {
        pfree(entry->chunk_usage);
        entry->chunk_usage = NULL;
    }

    if (entry->chunk_data != NULL)
    {
        pfree(entry->chunk_data);
        entry->chunk_data = NULL;
    }

    pfree(entry);
}
/*
 * Comparator for BlockRefTableSerializedEntry objects.
 *
 * We make the tablespace OID the first column of the sort key to match
 * the on-disk tree structure.
 */
static int
BlockRefTableComparator(const void *a, const void *b)
{
    const BlockRefTableSerializedEntry *sa = a;
    const BlockRefTableSerializedEntry *sb = b;

    if (sa->rlocator.spcOid > sb->rlocator.spcOid)
        return 1;
    if (sa->rlocator.spcOid < sb->rlocator.spcOid)
        return -1;

    if (sa->rlocator.dbOid > sb->rlocator.dbOid)
        return 1;
    if (sa->rlocator.dbOid < sb->rlocator.dbOid)
        return -1;

    if (sa->rlocator.relNumber > sb->rlocator.relNumber)
        return 1;
    if (sa->rlocator.relNumber < sb->rlocator.relNumber)
        return -1;

    if (sa->forknum > sb->forknum)
        return 1;
    if (sa->forknum < sb->forknum)
        return -1;

    return 0;
}
/*
 * Flush any buffered data out of a BlockRefTableBuffer.
 */
static void
BlockRefTableFlush(BlockRefTableBuffer *buffer)
{
    buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used);
    buffer->used = 0;
}
/*
 * Read data from a BlockRefTableBuffer, and update the running CRC
 * calculation for the returned data (but not any data that we may have
 * buffered but not yet actually returned).
 */
static void
BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
{
    BlockRefTableBuffer *buffer = &reader->buffer;

    /* Loop until read is fully satisfied. */
    while (length > 0)
    {
        if (buffer->cursor < buffer->used)
        {
            /*
             * If any buffered data is available, use that to satisfy as much
             * of the request as possible.
             */
            int         bytes_to_copy = Min(length, buffer->used - buffer->cursor);

            memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy);
            COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor],
                        bytes_to_copy);
            buffer->cursor += bytes_to_copy;
            data = ((char *) data) + bytes_to_copy;
            length -= bytes_to_copy;
        }
        else if (length >= BUFSIZE)
        {
            int         bytes_read;

            /*
             * If the request length is long, read directly into caller's
             * buffer.
             */
            bytes_read = buffer->io_callback(buffer->io_callback_arg,
                                             data, length);
            COMP_CRC32C(buffer->crc, data, bytes_read);
            data = ((char *) data) + bytes_read;
            length -= bytes_read;

            /* If we didn't get anything, that's bad. */
            if (bytes_read == 0)
                reader->error_callback(reader->error_callback_arg,
                                       "file \"%s\" ends unexpectedly",
                                       reader->error_filename);
        }
        else
        {
            /*
             * Refill our buffer.
             */
            buffer->used = buffer->io_callback(buffer->io_callback_arg,
                                               buffer->data, BUFSIZE);
            buffer->cursor = 0;

            /* If we didn't get anything, that's bad. */
            if (buffer->used == 0)
                reader->error_callback(reader->error_callback_arg,
                                       "file \"%s\" ends unexpectedly",
                                       reader->error_filename);
        }
    }
}
/*
 * Supply data to a BlockRefTableBuffer for writing to the underlying file,
 * and update the running CRC calculation for that data.
 */
static void
BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length)
{
    /* Update running CRC calculation. */
    COMP_CRC32C(buffer->crc, data, length);

    /* If the new data can't fit into the buffer, flush the buffer. */
    if (buffer->used + length > BUFSIZE)
    {
        buffer->io_callback(buffer->io_callback_arg, buffer->data,
                            buffer->used);
        buffer->used = 0;
    }

    /* If the new data would fill the buffer, or more, write it directly. */
    if (length >= BUFSIZE)
    {
        buffer->io_callback(buffer->io_callback_arg, data, length);
        return;
    }

    /* Otherwise, copy the new data into the buffer. */
    memcpy(&buffer->data[buffer->used], data, length);
    buffer->used += length;
    Assert(buffer->used <= BUFSIZE);
}
/*
 * Generate the sentinel and CRC required at the end of a block reference
 * table file and flush them out of our internal buffer.
 */
static void
BlockRefTableFileTerminate(BlockRefTableBuffer *buffer)
{
    BlockRefTableSerializedEntry zentry = {{0}};
    pg_crc32c   crc;

    /* Write a sentinel indicating that there are no more entries. */
    BlockRefTableWrite(buffer, &zentry,
                       sizeof(BlockRefTableSerializedEntry));

    /*
     * Writing the checksum will perturb the ongoing checksum calculation, so
     * copy the state first and finalize the computation using the copy.
     */
    crc = buffer->crc;
    FIN_CRC32C(crc);
    BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c));

    /* Flush any leftover data out of our buffer. */
    BlockRefTableFlush(buffer);
}