/*-------------------------------------------------------------------------
 *
 * basebackup_incremental.c
 *	  code for incremental backup support
 *
 * This code isn't actually in charge of taking an incremental backup;
 * the actual construction of the incremental backup happens in
 * basebackup.c. Here, we're concerned with providing the necessary
 * supports for that operation. In particular, we need to parse the
 * backup manifest supplied by the user taking the incremental backup
 * and extract the required information from it.
 *
 * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/backup/basebackup_incremental.c
 *
 *-------------------------------------------------------------------------
 */
20 #include "postgres.h"
22 #include "access/timeline.h"
23 #include "access/xlog.h"
24 #include "backup/basebackup_incremental.h"
25 #include "backup/walsummary.h"
26 #include "common/blkreftable.h"
27 #include "common/hashfn.h"
28 #include "common/int.h"
29 #include "common/parse_manifest.h"
30 #include "datatype/timestamp.h"
31 #include "postmaster/walsummarizer.h"
32 #include "utils/timestamp.h"
/* How many block numbers to read from a WAL summary at a time. */
#define BLOCKS_PER_READ			512

/*
 * We expect to find the last lines of the manifest, including the checksum,
 * in the last MIN_CHUNK bytes of the manifest. We trigger an incremental
 * parse step if we are about to overflow MAX_CHUNK bytes.
 */
#define MIN_CHUNK  1024
#define MAX_CHUNK (128 * 1024)
45 * Details extracted from the WAL ranges present in the supplied backup manifest.
47 typedef struct
49 TimeLineID tli;
50 XLogRecPtr start_lsn;
51 XLogRecPtr end_lsn;
52 } backup_wal_range;
55 * Details extracted from the file list present in the supplied backup manifest.
57 typedef struct
59 uint32 status;
60 const char *path;
61 uint64 size;
62 } backup_file_entry;
static uint32 hash_string_pointer(const char *s);

/* Instantiate a simplehash keyed by file pathname (see lib/simplehash.h). */
#define SH_PREFIX		backup_file
#define SH_ELEMENT_TYPE	backup_file_entry
#define SH_KEY_TYPE		const char *
#define SH_KEY			path
#define SH_HASH_KEY(tb, key)	hash_string_pointer(key)
#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
#define SH_SCOPE		static inline
#define SH_DECLARE
#define SH_DEFINE
#include "lib/simplehash.h"
76 struct IncrementalBackupInfo
78 /* Memory context for this object and its subsidiary objects. */
79 MemoryContext mcxt;
81 /* Temporary buffer for storing the manifest while parsing it. */
82 StringInfoData buf;
84 /* WAL ranges extracted from the backup manifest. */
85 List *manifest_wal_ranges;
88 * Files extracted from the backup manifest.
90 * We don't really need this information, because we use WAL summaries to
91 * figure out what's changed. It would be unsafe to just rely on the list
92 * of files that existed before, because it's possible for a file to be
93 * removed and a new one created with the same name and different
94 * contents. In such cases, the whole file must still be sent. We can tell
95 * from the WAL summaries whether that happened, but not from the file
96 * list.
98 * Nonetheless, this data is useful for sanity checking. If a file that we
99 * think we shouldn't need to send is not present in the manifest for the
100 * prior backup, something has gone terribly wrong. We retain the file
101 * names and sizes, but not the checksums or last modified times, for
102 * which we have no use.
104 * One significant downside of storing this data is that it consumes
105 * memory. If that turns out to be a problem, we might have to decide not
106 * to retain this information, or to make it optional.
108 backup_file_hash *manifest_files;
111 * Block-reference table for the incremental backup.
113 * It's possible that storing the entire block-reference table in memory
114 * will be a problem for some users. The in-memory format that we're using
115 * here is pretty efficient, converging to little more than 1 bit per
116 * block for relation forks with large numbers of modified blocks. It's
117 * possible, however, that if you try to perform an incremental backup of
118 * a database with a sufficiently large number of relations on a
119 * sufficiently small machine, you could run out of memory here. If that
120 * turns out to be a problem in practice, we'll need to be more clever.
122 BlockRefTable *brtab;
125 * State object for incremental JSON parsing
127 JsonManifestParseIncrementalState *inc_state;
/* Forward declarations for manifest-parsing callbacks and local helpers. */
static void manifest_process_version(JsonManifestParseContext *context,
									 int manifest_version);
static void manifest_process_system_identifier(JsonManifestParseContext *context,
											   uint64 manifest_system_identifier);
static void manifest_process_file(JsonManifestParseContext *context,
								  const char *pathname,
								  uint64 size,
								  pg_checksum_type checksum_type,
								  int checksum_length,
								  uint8 *checksum_payload);
static void manifest_process_wal_range(JsonManifestParseContext *context,
									   TimeLineID tli,
									   XLogRecPtr start_lsn,
									   XLogRecPtr end_lsn);
static void manifest_report_error(JsonManifestParseContext *context,
								  const char *fmt,...)
			pg_attribute_printf(2, 3) pg_attribute_noreturn();
static int	compare_block_numbers(const void *a, const void *b);
150 * Create a new object for storing information extracted from the manifest
151 * supplied when creating an incremental backup.
153 IncrementalBackupInfo *
154 CreateIncrementalBackupInfo(MemoryContext mcxt)
156 IncrementalBackupInfo *ib;
157 MemoryContext oldcontext;
158 JsonManifestParseContext *context;
160 oldcontext = MemoryContextSwitchTo(mcxt);
162 ib = palloc0(sizeof(IncrementalBackupInfo));
163 ib->mcxt = mcxt;
164 initStringInfo(&ib->buf);
167 * It's hard to guess how many files a "typical" installation will have in
168 * the data directory, but a fresh initdb creates almost 1000 files as of
169 * this writing, so it seems to make sense for our estimate to
170 * substantially higher.
172 ib->manifest_files = backup_file_create(mcxt, 10000, NULL);
174 context = palloc0(sizeof(JsonManifestParseContext));
175 /* Parse the manifest. */
176 context->private_data = ib;
177 context->version_cb = manifest_process_version;
178 context->system_identifier_cb = manifest_process_system_identifier;
179 context->per_file_cb = manifest_process_file;
180 context->per_wal_range_cb = manifest_process_wal_range;
181 context->error_cb = manifest_report_error;
183 ib->inc_state = json_parse_manifest_incremental_init(context);
185 MemoryContextSwitchTo(oldcontext);
187 return ib;
191 * Before taking an incremental backup, the caller must supply the backup
192 * manifest from a prior backup. Each chunk of manifest data received
193 * from the client should be passed to this function.
195 void
196 AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data,
197 int len)
199 MemoryContext oldcontext;
201 /* Switch to our memory context. */
202 oldcontext = MemoryContextSwitchTo(ib->mcxt);
204 if (ib->buf.len > MIN_CHUNK && ib->buf.len + len > MAX_CHUNK)
207 * time for an incremental parse. We'll do all but the last MIN_CHUNK
208 * so that we have enough left for the final piece.
210 json_parse_manifest_incremental_chunk(
211 ib->inc_state, ib->buf.data, ib->buf.len - MIN_CHUNK, false);
212 /* now remove what we just parsed */
213 memmove(ib->buf.data, ib->buf.data + (ib->buf.len - MIN_CHUNK),
214 MIN_CHUNK + 1);
215 ib->buf.len = MIN_CHUNK;
218 appendBinaryStringInfo(&ib->buf, data, len);
220 /* Switch back to previous memory context. */
221 MemoryContextSwitchTo(oldcontext);
225 * Finalize an IncrementalBackupInfo object after all manifest data has
226 * been supplied via calls to AppendIncrementalManifestData.
228 void
229 FinalizeIncrementalManifest(IncrementalBackupInfo *ib)
231 MemoryContext oldcontext;
233 /* Switch to our memory context. */
234 oldcontext = MemoryContextSwitchTo(ib->mcxt);
236 /* Parse the last chunk of the manifest */
237 json_parse_manifest_incremental_chunk(
238 ib->inc_state, ib->buf.data, ib->buf.len, true);
240 /* Done with the buffer, so release memory. */
241 pfree(ib->buf.data);
242 ib->buf.data = NULL;
244 /* Done with inc_state, so release that memory too */
245 json_parse_manifest_incremental_shutdown(ib->inc_state);
247 /* Switch back to previous memory context. */
248 MemoryContextSwitchTo(oldcontext);
252 * Prepare to take an incremental backup.
254 * Before this function is called, AppendIncrementalManifestData and
255 * FinalizeIncrementalManifest should have already been called to pass all
256 * the manifest data to this object.
258 * This function performs sanity checks on the data extracted from the
259 * manifest and figures out for which WAL ranges we need summaries, and
260 * whether those summaries are available. Then, it reads and combines the
261 * data from those summary files. It also updates the backup_state with the
262 * reference TLI and LSN for the prior backup.
264 void
265 PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
266 BackupState *backup_state)
268 MemoryContext oldcontext;
269 List *expectedTLEs;
270 List *all_wslist,
271 *required_wslist = NIL;
272 ListCell *lc;
273 TimeLineHistoryEntry **tlep;
274 int num_wal_ranges;
275 int i;
276 bool found_backup_start_tli = false;
277 TimeLineID earliest_wal_range_tli = 0;
278 XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr;
279 TimeLineID latest_wal_range_tli = 0;
281 Assert(ib->buf.data == NULL);
283 /* Switch to our memory context. */
284 oldcontext = MemoryContextSwitchTo(ib->mcxt);
287 * A valid backup manifest must always contain at least one WAL range
288 * (usually exactly one, unless the backup spanned a timeline switch).
290 num_wal_ranges = list_length(ib->manifest_wal_ranges);
291 if (num_wal_ranges == 0)
292 ereport(ERROR,
293 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
294 errmsg("manifest contains no required WAL ranges")));
297 * Match up the TLIs that appear in the WAL ranges of the backup manifest
298 * with those that appear in this server's timeline history. We expect
299 * every backup_wal_range to match to a TimeLineHistoryEntry; if it does
300 * not, that's an error.
302 * This loop also decides which of the WAL ranges is the manifest is most
303 * ancient and which one is the newest, according to the timeline history
304 * of this server, and stores TLIs of those WAL ranges into
305 * earliest_wal_range_tli and latest_wal_range_tli. It also updates
306 * earliest_wal_range_start_lsn to the start LSN of the WAL range for
307 * earliest_wal_range_tli.
309 * Note that the return value of readTimeLineHistory puts the latest
310 * timeline at the beginning of the list, not the end. Hence, the earliest
311 * TLI is the one that occurs nearest the end of the list returned by
312 * readTimeLineHistory, and the latest TLI is the one that occurs closest
313 * to the beginning.
315 expectedTLEs = readTimeLineHistory(backup_state->starttli);
316 tlep = palloc0(num_wal_ranges * sizeof(TimeLineHistoryEntry *));
317 for (i = 0; i < num_wal_ranges; ++i)
319 backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i);
320 bool saw_earliest_wal_range_tli = false;
321 bool saw_latest_wal_range_tli = false;
323 /* Search this server's history for this WAL range's TLI. */
324 foreach(lc, expectedTLEs)
326 TimeLineHistoryEntry *tle = lfirst(lc);
328 if (tle->tli == range->tli)
330 tlep[i] = tle;
331 break;
334 if (tle->tli == earliest_wal_range_tli)
335 saw_earliest_wal_range_tli = true;
336 if (tle->tli == latest_wal_range_tli)
337 saw_latest_wal_range_tli = true;
341 * An incremental backup can only be taken relative to a backup that
342 * represents a previous state of this server. If the backup requires
343 * WAL from a timeline that's not in our history, that definitely
344 * isn't the case.
346 if (tlep[i] == NULL)
347 ereport(ERROR,
348 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
349 errmsg("timeline %u found in manifest, but not in this server's history",
350 range->tli)));
353 * If we found this TLI in the server's history before encountering
354 * the latest TLI seen so far in the server's history, then this TLI
355 * is the latest one seen so far.
357 * If on the other hand we saw the earliest TLI seen so far before
358 * finding this TLI, this TLI is earlier than the earliest one seen so
359 * far. And if this is the first TLI for which we've searched, it's
360 * also the earliest one seen so far.
362 * On the first loop iteration, both things should necessarily be
363 * true.
365 if (!saw_latest_wal_range_tli)
366 latest_wal_range_tli = range->tli;
367 if (earliest_wal_range_tli == 0 || saw_earliest_wal_range_tli)
369 earliest_wal_range_tli = range->tli;
370 earliest_wal_range_start_lsn = range->start_lsn;
375 * Propagate information about the prior backup into the backup_label that
376 * will be generated for this backup.
378 backup_state->istartpoint = earliest_wal_range_start_lsn;
379 backup_state->istarttli = earliest_wal_range_tli;
382 * Sanity check start and end LSNs for the WAL ranges in the manifest.
384 * Commonly, there won't be any timeline switches during the prior backup
385 * at all, but if there are, they should happen at the same LSNs that this
386 * server switched timelines.
388 * Whether there are any timeline switches during the prior backup or not,
389 * the prior backup shouldn't require any WAL from a timeline prior to the
390 * start of that timeline. It also shouldn't require any WAL from later
391 * than the start of this backup.
393 * If any of these sanity checks fail, one possible explanation is that
394 * the user has generated WAL on the same timeline with the same LSNs more
395 * than once. For instance, if two standbys running on timeline 1 were
396 * both promoted and (due to a broken archiving setup) both selected new
397 * timeline ID 2, then it's possible that one of these checks might trip.
399 * Note that there are lots of ways for the user to do something very bad
400 * without tripping any of these checks, and they are not intended to be
401 * comprehensive. It's pretty hard to see how we could be certain of
402 * anything here. However, if there's a problem staring us right in the
403 * face, it's best to report it, so we do.
405 for (i = 0; i < num_wal_ranges; ++i)
407 backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i);
409 if (range->tli == earliest_wal_range_tli)
411 if (range->start_lsn < tlep[i]->begin)
412 ereport(ERROR,
413 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
414 errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X",
415 range->tli,
416 LSN_FORMAT_ARGS(range->start_lsn),
417 LSN_FORMAT_ARGS(tlep[i]->begin))));
419 else
421 if (range->start_lsn != tlep[i]->begin)
422 ereport(ERROR,
423 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
424 errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X",
425 range->tli,
426 LSN_FORMAT_ARGS(range->start_lsn),
427 LSN_FORMAT_ARGS(tlep[i]->begin))));
430 if (range->tli == latest_wal_range_tli)
432 if (range->end_lsn > backup_state->startpoint)
433 ereport(ERROR,
434 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
435 errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X",
436 range->tli,
437 LSN_FORMAT_ARGS(range->end_lsn),
438 LSN_FORMAT_ARGS(backup_state->startpoint)),
439 errhint("This can happen for incremental backups on a standby if there was little activity since the previous backup.")));
441 else
443 if (range->end_lsn != tlep[i]->end)
444 ereport(ERROR,
445 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
446 errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X",
447 range->tli,
448 LSN_FORMAT_ARGS(range->end_lsn),
449 LSN_FORMAT_ARGS(tlep[i]->end))));
455 * Wait for WAL summarization to catch up to the backup start LSN. This
456 * will throw an error if the WAL summarizer appears to be stuck. If WAL
457 * summarization gets disabled while we're waiting, this will return
458 * immediately, and we'll error out further down if the WAL summaries are
459 * incomplete.
461 WaitForWalSummarization(backup_state->startpoint);
464 * Retrieve a list of all WAL summaries on any timeline that overlap with
465 * the LSN range of interest. We could instead call GetWalSummaries() once
466 * per timeline in the loop that follows, but that would involve reading
467 * the directory multiple times. It should be mildly faster - and perhaps
468 * a bit safer - to do it just once.
470 all_wslist = GetWalSummaries(0, earliest_wal_range_start_lsn,
471 backup_state->startpoint);
474 * We need WAL summaries for everything that happened during the prior
475 * backup and everything that happened afterward up until the point where
476 * the current backup started.
478 foreach(lc, expectedTLEs)
480 TimeLineHistoryEntry *tle = lfirst(lc);
481 XLogRecPtr tli_start_lsn = tle->begin;
482 XLogRecPtr tli_end_lsn = tle->end;
483 XLogRecPtr tli_missing_lsn = InvalidXLogRecPtr;
484 List *tli_wslist;
487 * Working through the history of this server from the current
488 * timeline backwards, we skip everything until we find the timeline
489 * where this backup started. Most of the time, this means we won't
490 * skip anything at all, as it's unlikely that the timeline has
491 * changed since the beginning of the backup moments ago.
493 if (tle->tli == backup_state->starttli)
495 found_backup_start_tli = true;
496 tli_end_lsn = backup_state->startpoint;
498 else if (!found_backup_start_tli)
499 continue;
502 * Find the summaries that overlap the LSN range of interest for this
503 * timeline. If this is the earliest timeline involved, the range of
504 * interest begins with the start LSN of the prior backup; otherwise,
505 * it begins at the LSN at which this timeline came into existence. If
506 * this is the latest TLI involved, the range of interest ends at the
507 * start LSN of the current backup; otherwise, it ends at the point
508 * where we switched from this timeline to the next one.
510 if (tle->tli == earliest_wal_range_tli)
511 tli_start_lsn = earliest_wal_range_start_lsn;
512 tli_wslist = FilterWalSummaries(all_wslist, tle->tli,
513 tli_start_lsn, tli_end_lsn);
516 * There is no guarantee that the WAL summaries we found cover the
517 * entire range of LSNs for which summaries are required, or indeed
518 * that we found any WAL summaries at all. Check whether we have a
519 * problem of that sort.
521 if (!WalSummariesAreComplete(tli_wslist, tli_start_lsn, tli_end_lsn,
522 &tli_missing_lsn))
524 if (XLogRecPtrIsInvalid(tli_missing_lsn))
525 ereport(ERROR,
526 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
527 errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist",
528 tle->tli,
529 LSN_FORMAT_ARGS(tli_start_lsn),
530 LSN_FORMAT_ARGS(tli_end_lsn))));
531 else
532 ereport(ERROR,
533 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
534 errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete",
535 tle->tli,
536 LSN_FORMAT_ARGS(tli_start_lsn),
537 LSN_FORMAT_ARGS(tli_end_lsn)),
538 errdetail("The first unsummarized LSN in this range is %X/%X.",
539 LSN_FORMAT_ARGS(tli_missing_lsn))));
543 * Remember that we need to read these summaries.
545 * Technically, it's possible that this could read more files than
546 * required, since tli_wslist in theory could contain redundant
547 * summaries. For instance, if we have a summary from 0/10000000 to
548 * 0/20000000 and also one from 0/00000000 to 0/30000000, then the
549 * latter subsumes the former and the former could be ignored.
551 * We ignore this possibility because the WAL summarizer only tries to
552 * generate summaries that do not overlap. If somehow they exist,
553 * we'll do a bit of extra work but the results should still be
554 * correct.
556 required_wslist = list_concat(required_wslist, tli_wslist);
559 * Timelines earlier than the one in which the prior backup began are
560 * not relevant.
562 if (tle->tli == earliest_wal_range_tli)
563 break;
567 * Read all of the required block reference table files and merge all of
568 * the data into a single in-memory block reference table.
570 * See the comments for struct IncrementalBackupInfo for some thoughts on
571 * memory usage.
573 ib->brtab = CreateEmptyBlockRefTable();
574 foreach(lc, required_wslist)
576 WalSummaryFile *ws = lfirst(lc);
577 WalSummaryIO wsio;
578 BlockRefTableReader *reader;
579 RelFileLocator rlocator;
580 ForkNumber forknum;
581 BlockNumber limit_block;
582 BlockNumber blocks[BLOCKS_PER_READ];
584 wsio.file = OpenWalSummaryFile(ws, false);
585 wsio.filepos = 0;
586 ereport(DEBUG1,
587 (errmsg_internal("reading WAL summary file \"%s\"",
588 FilePathName(wsio.file))));
589 reader = CreateBlockRefTableReader(ReadWalSummary, &wsio,
590 FilePathName(wsio.file),
591 ReportWalSummaryError, NULL);
592 while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
593 &limit_block))
595 BlockRefTableSetLimitBlock(ib->brtab, &rlocator,
596 forknum, limit_block);
598 while (1)
600 unsigned nblocks;
601 unsigned i;
603 nblocks = BlockRefTableReaderGetBlocks(reader, blocks,
604 BLOCKS_PER_READ);
605 if (nblocks == 0)
606 break;
608 for (i = 0; i < nblocks; ++i)
609 BlockRefTableMarkBlockModified(ib->brtab, &rlocator,
610 forknum, blocks[i]);
613 DestroyBlockRefTableReader(reader);
614 FileClose(wsio.file);
617 /* Switch back to previous memory context. */
618 MemoryContextSwitchTo(oldcontext);
622 * Get the pathname that should be used when a file is sent incrementally.
624 * The result is a palloc'd string.
626 char *
627 GetIncrementalFilePath(Oid dboid, Oid spcoid, RelFileNumber relfilenumber,
628 ForkNumber forknum, unsigned segno)
630 char *path;
631 char *lastslash;
632 char *ipath;
634 path = GetRelationPath(dboid, spcoid, relfilenumber, INVALID_PROC_NUMBER,
635 forknum);
637 lastslash = strrchr(path, '/');
638 Assert(lastslash != NULL);
639 *lastslash = '\0';
641 if (segno > 0)
642 ipath = psprintf("%s/INCREMENTAL.%s.%u", path, lastslash + 1, segno);
643 else
644 ipath = psprintf("%s/INCREMENTAL.%s", path, lastslash + 1);
646 pfree(path);
648 return ipath;
652 * How should we back up a particular file as part of an incremental backup?
654 * If the return value is BACK_UP_FILE_FULLY, caller should back up the whole
655 * file just as if this were not an incremental backup. The contents of the
656 * relative_block_numbers array are unspecified in this case.
658 * If the return value is BACK_UP_FILE_INCREMENTALLY, caller should include
659 * an incremental file in the backup instead of the entire file. On return,
660 * *num_blocks_required will be set to the number of blocks that need to be
661 * sent, and the actual block numbers will have been stored in
662 * relative_block_numbers, which should be an array of at least RELSEG_SIZE.
663 * In addition, *truncation_block_length will be set to the value that should
664 * be included in the incremental file.
666 FileBackupMethod
667 GetFileBackupMethod(IncrementalBackupInfo *ib, const char *path,
668 Oid dboid, Oid spcoid,
669 RelFileNumber relfilenumber, ForkNumber forknum,
670 unsigned segno, size_t size,
671 unsigned *num_blocks_required,
672 BlockNumber *relative_block_numbers,
673 unsigned *truncation_block_length)
675 BlockNumber limit_block;
676 BlockNumber start_blkno;
677 BlockNumber stop_blkno;
678 RelFileLocator rlocator;
679 BlockRefTableEntry *brtentry;
680 unsigned i;
681 unsigned nblocks;
683 /* Should only be called after PrepareForIncrementalBackup. */
684 Assert(ib->buf.data == NULL);
687 * dboid could be InvalidOid if shared rel, but spcoid and relfilenumber
688 * should have legal values.
690 Assert(OidIsValid(spcoid));
691 Assert(RelFileNumberIsValid(relfilenumber));
694 * If the file size is too large or not a multiple of BLCKSZ, then
695 * something weird is happening, so give up and send the whole file.
697 if ((size % BLCKSZ) != 0 || size / BLCKSZ > RELSEG_SIZE)
698 return BACK_UP_FILE_FULLY;
701 * The free-space map fork is not properly WAL-logged, so we need to
702 * backup the entire file every time.
704 if (forknum == FSM_FORKNUM)
705 return BACK_UP_FILE_FULLY;
708 * If this file was not part of the prior backup, back it up fully.
710 * If this file was created after the prior backup and before the start of
711 * the current backup, then the WAL summary information will tell us to
712 * back up the whole file. However, if this file was created after the
713 * start of the current backup, then the WAL summary won't know anything
714 * about it. Without this logic, we would erroneously conclude that it was
715 * OK to send it incrementally.
717 * Note that the file could have existed at the time of the prior backup,
718 * gotten deleted, and then a new file with the same name could have been
719 * created. In that case, this logic won't prevent the file from being
720 * backed up incrementally. But, if the deletion happened before the start
721 * of the current backup, the limit block will be 0, inducing a full
722 * backup. If the deletion happened after the start of the current backup,
723 * reconstruction will erroneously combine blocks from the current
724 * lifespan of the file with blocks from the previous lifespan -- but in
725 * this type of case, WAL replay to reach backup consistency should remove
726 * and recreate the file anyway, so the initial bogus contents should not
727 * matter.
729 if (backup_file_lookup(ib->manifest_files, path) == NULL)
731 char *ipath;
733 ipath = GetIncrementalFilePath(dboid, spcoid, relfilenumber,
734 forknum, segno);
735 if (backup_file_lookup(ib->manifest_files, ipath) == NULL)
736 return BACK_UP_FILE_FULLY;
740 * Look up the special block reference table entry for the database as a
741 * whole.
743 rlocator.spcOid = spcoid;
744 rlocator.dbOid = dboid;
745 rlocator.relNumber = 0;
746 if (BlockRefTableGetEntry(ib->brtab, &rlocator, MAIN_FORKNUM,
747 &limit_block) != NULL)
750 * According to the WAL summary, this database OID/tablespace OID
751 * pairing has been created since the previous backup. So, everything
752 * in it must be backed up fully.
754 return BACK_UP_FILE_FULLY;
757 /* Look up the block reference table entry for this relfilenode. */
758 rlocator.relNumber = relfilenumber;
759 brtentry = BlockRefTableGetEntry(ib->brtab, &rlocator, forknum,
760 &limit_block);
763 * If there is no entry, then there have been no WAL-logged changes to the
764 * relation since the predecessor backup was taken, so we can back it up
765 * incrementally and need not include any modified blocks.
767 * However, if the file is zero-length, we should do a full backup,
768 * because an incremental file is always more than zero length, and it's
769 * silly to take an incremental backup when a full backup would be
770 * smaller.
772 if (brtentry == NULL)
774 if (size == 0)
775 return BACK_UP_FILE_FULLY;
776 *num_blocks_required = 0;
777 *truncation_block_length = size / BLCKSZ;
778 return BACK_UP_FILE_INCREMENTALLY;
782 * If the limit_block is less than or equal to the point where this
783 * segment starts, send the whole file.
785 if (limit_block <= segno * RELSEG_SIZE)
786 return BACK_UP_FILE_FULLY;
789 * Get relevant entries from the block reference table entry.
791 * We shouldn't overflow computing the start or stop block numbers, but if
792 * it manages to happen somehow, detect it and throw an error.
794 start_blkno = segno * RELSEG_SIZE;
795 stop_blkno = start_blkno + (size / BLCKSZ);
796 if (start_blkno / RELSEG_SIZE != segno || stop_blkno < start_blkno)
797 ereport(ERROR,
798 errcode(ERRCODE_INTERNAL_ERROR),
799 errmsg_internal("overflow computing block number bounds for segment %u with size %zu",
800 segno, size));
803 * This will write *absolute* block numbers into the output array, but
804 * we'll transpose them below.
806 nblocks = BlockRefTableEntryGetBlocks(brtentry, start_blkno, stop_blkno,
807 relative_block_numbers, RELSEG_SIZE);
808 Assert(nblocks <= RELSEG_SIZE);
811 * If we're going to have to send nearly all of the blocks, then just send
812 * the whole file, because that won't require much extra storage or
813 * transfer and will speed up and simplify backup restoration. It's not
814 * clear what threshold is most appropriate here and perhaps it ought to
815 * be configurable, but for now we're just going to say that if we'd need
816 * to send 90% of the blocks anyway, give up and send the whole file.
818 * NB: If you change the threshold here, at least make sure to back up the
819 * file fully when every single block must be sent, because there's
820 * nothing good about sending an incremental file in that case.
822 if (nblocks * BLCKSZ > size * 0.9)
823 return BACK_UP_FILE_FULLY;
826 * Looks like we can send an incremental file, so sort the block numbers
827 * and then transpose them from absolute block numbers to relative block
828 * numbers if necessary.
830 * NB: If the block reference table was using the bitmap representation
831 * for a given chunk, the block numbers in that chunk will already be
832 * sorted, but when the array-of-offsets representation is used, we can
833 * receive block numbers here out of order.
835 qsort(relative_block_numbers, nblocks, sizeof(BlockNumber),
836 compare_block_numbers);
837 if (start_blkno != 0)
839 for (i = 0; i < nblocks; ++i)
840 relative_block_numbers[i] -= start_blkno;
842 *num_blocks_required = nblocks;
845 * The truncation block length is the minimum length of the reconstructed
846 * file. Any block numbers below this threshold that are not present in
847 * the backup need to be fetched from the prior backup. At or above this
848 * threshold, blocks should only be included in the result if they are
849 * present in the backup. (This may require inserting zero blocks if the
850 * blocks included in the backup are non-consecutive.)
852 *truncation_block_length = size / BLCKSZ;
853 if (BlockNumberIsValid(limit_block))
855 unsigned relative_limit = limit_block - segno * RELSEG_SIZE;
857 if (*truncation_block_length < relative_limit)
858 *truncation_block_length = relative_limit;
861 /* Send it incrementally. */
862 return BACK_UP_FILE_INCREMENTALLY;
866 * Compute the size for a header of an incremental file containing a given
867 * number of blocks. The header is rounded to a multiple of BLCKSZ, but
868 * only if the file will store some block data.
870 size_t
871 GetIncrementalHeaderSize(unsigned num_blocks_required)
873 size_t result;
875 /* Make sure we're not going to overflow. */
876 Assert(num_blocks_required <= RELSEG_SIZE);
879 * Three four byte quantities (magic number, truncation block length,
880 * block count) followed by block numbers.
882 result = 3 * sizeof(uint32) + (sizeof(BlockNumber) * num_blocks_required);
885 * Round the header size to a multiple of BLCKSZ - when not a multiple of
886 * BLCKSZ, add the missing fraction of a block. But do this only if the
887 * file will store data for some blocks, otherwise keep it small.
889 if ((num_blocks_required > 0) && (result % BLCKSZ != 0))
890 result += BLCKSZ - (result % BLCKSZ);
892 return result;
896 * Compute the size for an incremental file containing a given number of blocks.
898 size_t
899 GetIncrementalFileSize(unsigned num_blocks_required)
901 size_t result;
903 /* Make sure we're not going to overflow. */
904 Assert(num_blocks_required <= RELSEG_SIZE);
907 * Header with three four byte quantities (magic number, truncation block
908 * length, block count) followed by block numbers, rounded to a multiple
909 * of BLCKSZ (for files with block data), followed by block contents.
911 result = GetIncrementalHeaderSize(num_blocks_required);
912 result += BLCKSZ * num_blocks_required;
914 return result;
918 * Helper function for filemap hash table.
920 static uint32
921 hash_string_pointer(const char *s)
923 unsigned char *ss = (unsigned char *) s;
925 return hash_bytes(ss, strlen(s));
929 * This callback to validate the manifest version for incremental backup.
931 static void
932 manifest_process_version(JsonManifestParseContext *context,
933 int manifest_version)
935 /* Incremental backups don't work with manifest version 1 */
936 if (manifest_version == 1)
937 context->error_cb(context,
938 "backup manifest version 1 does not support incremental backup");
942 * This callback to validate the manifest system identifier against the current
943 * database server.
945 static void
946 manifest_process_system_identifier(JsonManifestParseContext *context,
947 uint64 manifest_system_identifier)
949 uint64 system_identifier;
951 /* Get system identifier of current system */
952 system_identifier = GetSystemIdentifier();
954 if (manifest_system_identifier != system_identifier)
955 context->error_cb(context,
956 "system identifier in backup manifest is %llu, but database system identifier is %llu",
957 (unsigned long long) manifest_system_identifier,
958 (unsigned long long) system_identifier);
962 * This callback is invoked for each file mentioned in the backup manifest.
964 * We store the path to each file and the size of each file for sanity-checking
965 * purposes. For further details, see comments for IncrementalBackupInfo.
967 static void
968 manifest_process_file(JsonManifestParseContext *context,
969 const char *pathname, uint64 size,
970 pg_checksum_type checksum_type,
971 int checksum_length,
972 uint8 *checksum_payload)
974 IncrementalBackupInfo *ib = context->private_data;
975 backup_file_entry *entry;
976 bool found;
978 entry = backup_file_insert(ib->manifest_files, pathname, &found);
979 if (!found)
981 entry->path = MemoryContextStrdup(ib->manifest_files->ctx,
982 pathname);
983 entry->size = size;
988 * This callback is invoked for each WAL range mentioned in the backup
989 * manifest.
991 * We're just interested in learning the oldest LSN and the corresponding TLI
992 * that appear in any WAL range.
994 static void
995 manifest_process_wal_range(JsonManifestParseContext *context,
996 TimeLineID tli, XLogRecPtr start_lsn,
997 XLogRecPtr end_lsn)
999 IncrementalBackupInfo *ib = context->private_data;
1000 backup_wal_range *range = palloc(sizeof(backup_wal_range));
1002 range->tli = tli;
1003 range->start_lsn = start_lsn;
1004 range->end_lsn = end_lsn;
1005 ib->manifest_wal_ranges = lappend(ib->manifest_wal_ranges, range);
1009 * This callback is invoked if an error occurs while parsing the backup
1010 * manifest.
1012 static void
1013 manifest_report_error(JsonManifestParseContext *context, const char *fmt,...)
1015 StringInfoData errbuf;
1017 initStringInfo(&errbuf);
1019 for (;;)
1021 va_list ap;
1022 int needed;
1024 va_start(ap, fmt);
1025 needed = appendStringInfoVA(&errbuf, fmt, ap);
1026 va_end(ap);
1027 if (needed == 0)
1028 break;
1029 enlargeStringInfo(&errbuf, needed);
1032 ereport(ERROR,
1033 errmsg_internal("%s", errbuf.data));
1037 * Quicksort comparator for block numbers.
1039 static int
1040 compare_block_numbers(const void *a, const void *b)
1042 BlockNumber aa = *(BlockNumber *) a;
1043 BlockNumber bb = *(BlockNumber *) b;
1045 return pg_cmp_u32(aa, bb);