1 /*-------------------------------------------------------------------------
4 * code for taking a base backup and streaming it to a standby
6 * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
9 * src/backend/backup/basebackup.c
11 *-------------------------------------------------------------------------
19 #include "access/xlog_internal.h"
20 #include "access/xlogbackup.h"
21 #include "backup/backup_manifest.h"
22 #include "backup/basebackup.h"
23 #include "backup/basebackup_incremental.h"
24 #include "backup/basebackup_sink.h"
25 #include "backup/basebackup_target.h"
26 #include "catalog/pg_tablespace_d.h"
27 #include "commands/defrem.h"
28 #include "common/compression.h"
29 #include "common/file_perm.h"
30 #include "common/file_utils.h"
31 #include "lib/stringinfo.h"
32 #include "miscadmin.h"
33 #include "nodes/pg_list.h"
37 #include "postmaster/syslogger.h"
38 #include "postmaster/walsummarizer.h"
39 #include "replication/slot.h"
40 #include "replication/walsender.h"
41 #include "replication/walsender_private.h"
42 #include "storage/bufpage.h"
43 #include "storage/checksum.h"
44 #include "storage/dsm_impl.h"
45 #include "storage/ipc.h"
46 #include "storage/reinit.h"
47 #include "utils/builtins.h"
48 #include "utils/guc.h"
49 #include "utils/ps_status.h"
50 #include "utils/relcache.h"
51 #include "utils/resowner.h"
54 * How much data do we want to send in one CopyData message? Note that
55 * this may also result in reading the underlying files in chunks of this
58 * NB: The buffer size is required to be a multiple of the system block
59 * size, so use that value instead if it's bigger than our preference.
61 #define SINK_BUFFER_LENGTH Max(32768, BLCKSZ)
72 bool sendtblspcmapfile
;
75 BaseBackupTargetHandle
*target_handle
;
76 backup_manifest_option manifest
;
77 pg_compress_algorithm compression
;
78 pg_compress_specification compression_specification
;
79 pg_checksum_type manifest_checksum_type
;
82 static int64
sendTablespace(bbsink
*sink
, char *path
, Oid spcoid
, bool sizeonly
,
83 struct backup_manifest_info
*manifest
,
84 IncrementalBackupInfo
*ib
);
85 static int64
sendDir(bbsink
*sink
, const char *path
, int basepathlen
, bool sizeonly
,
86 List
*tablespaces
, bool sendtblspclinks
,
87 backup_manifest_info
*manifest
, Oid spcoid
,
88 IncrementalBackupInfo
*ib
);
89 static bool sendFile(bbsink
*sink
, const char *readfilename
, const char *tarfilename
,
90 struct stat
*statbuf
, bool missing_ok
,
91 Oid dboid
, Oid spcoid
, RelFileNumber relfilenumber
,
93 backup_manifest_info
*manifest
,
94 unsigned num_incremental_blocks
,
95 BlockNumber
*incremental_blocks
,
96 unsigned truncation_block_length
);
97 static off_t
read_file_data_into_buffer(bbsink
*sink
,
98 const char *readfilename
, int fd
,
99 off_t offset
, size_t length
,
101 bool verify_checksum
,
102 int *checksum_failures
);
103 static void push_to_sink(bbsink
*sink
, pg_checksum_context
*checksum_ctx
,
104 size_t *bytes_done
, void *data
, size_t length
);
105 static bool verify_page_checksum(Page page
, XLogRecPtr start_lsn
,
107 uint16
*expected_checksum
);
108 static void sendFileWithContent(bbsink
*sink
, const char *filename
,
109 const char *content
, int len
,
110 backup_manifest_info
*manifest
);
111 static int64
_tarWriteHeader(bbsink
*sink
, const char *filename
,
112 const char *linktarget
, struct stat
*statbuf
,
114 static void _tarWritePadding(bbsink
*sink
, int len
);
115 static void convert_link_to_directory(const char *pathbuf
, struct stat
*statbuf
);
116 static void perform_base_backup(basebackup_options
*opt
, bbsink
*sink
,
117 IncrementalBackupInfo
*ib
);
118 static void parse_basebackup_options(List
*options
, basebackup_options
*opt
);
119 static int compareWalFileNames(const ListCell
*a
, const ListCell
*b
);
120 static ssize_t
basebackup_read_file(int fd
, char *buf
, size_t nbytes
, off_t offset
,
121 const char *filename
, bool partial_read_ok
);
123 /* Was the backup currently in-progress initiated in recovery mode? */
124 static bool backup_started_in_recovery
= false;
126 /* Total number of checksum failures during base backup. */
127 static long long int total_checksum_failures
;
129 /* Do not verify checksums. */
130 static bool noverify_checksums
= false;
133 * Definition of one element part of an exclusion list, used for paths part
134 * of checksum validation or base backups. "name" is the name of the file
135 * or path to check for exclusion. If "match_prefix" is true, any items
136 * matching the name as prefix are excluded.
138 struct exclude_list_item
145 * The contents of these directories are removed or recreated during server
146 * start so they are not included in backups. The directories themselves are
147 * kept and included as empty to preserve access permissions.
149 * Note: this list should be kept in sync with the filter lists in pg_rewind's
152 static const char *const excludeDirContents
[] =
155 * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped
156 * because extensions like pg_stat_statements store data there.
161 * It is generally not useful to backup the contents of this directory
162 * even if the intention is to restore to another primary. See backup.sgml
163 * for a more detailed description.
167 /* Contents removed on startup, see dsm_cleanup_for_mmap(). */
170 /* Contents removed on startup, see AsyncShmemInit(). */
174 * Old contents are loaded for possible debugging but are not required for
175 * normal operation, see SerialInit().
179 /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
182 /* Contents zeroed on startup, see StartupSUBTRANS(). */
190 * List of files excluded from backups.
192 static const struct exclude_list_item excludeFiles
[] =
194 /* Skip auto conf temporary file. */
195 {PG_AUTOCONF_FILENAME
".tmp", false},
197 /* Skip current log file temporary file */
198 {LOG_METAINFO_DATAFILE_TMP
, false},
201 * Skip relation cache because it is rebuilt on startup. This includes
204 {RELCACHE_INIT_FILENAME
, true},
207 * backup_label and tablespace_map should not exist in a running cluster
208 * capable of doing an online backup, but exclude them just in case.
210 {BACKUP_LABEL_FILE
, false},
211 {TABLESPACE_MAP
, false},
214 * If there's a backup_manifest, it belongs to a backup that was used to
215 * start this server. It is *not* correct for this backup. Our
216 * backup_manifest is injected into the backup separately if users want
219 {"backup_manifest", false},
221 {"postmaster.pid", false},
222 {"postmaster.opts", false},
229 * Actually do a base backup for the specified tablespaces.
231 * This is split out mainly to avoid complaints about "variable might be
232 * clobbered by longjmp" from stupider versions of gcc.
235 perform_base_backup(basebackup_options
*opt
, bbsink
*sink
,
236 IncrementalBackupInfo
*ib
)
241 backup_manifest_info manifest
;
242 BackupState
*backup_state
;
243 StringInfo tablespace_map
;
245 /* Initial backup state, insofar as we know it now. */
246 state
.tablespaces
= NIL
;
247 state
.tablespace_num
= 0;
248 state
.bytes_done
= 0;
249 state
.bytes_total
= 0;
250 state
.bytes_total_is_valid
= false;
252 /* we're going to use a BufFile, so we need a ResourceOwner */
253 Assert(AuxProcessResourceOwner
!= NULL
);
254 Assert(CurrentResourceOwner
== AuxProcessResourceOwner
||
255 CurrentResourceOwner
== NULL
);
256 CurrentResourceOwner
= AuxProcessResourceOwner
;
258 backup_started_in_recovery
= RecoveryInProgress();
260 InitializeBackupManifest(&manifest
, opt
->manifest
,
261 opt
->manifest_checksum_type
);
263 total_checksum_failures
= 0;
265 /* Allocate backup related variables. */
266 backup_state
= (BackupState
*) palloc0(sizeof(BackupState
));
267 tablespace_map
= makeStringInfo();
269 basebackup_progress_wait_checkpoint();
270 do_pg_backup_start(opt
->label
, opt
->fastcheckpoint
, &state
.tablespaces
,
271 backup_state
, tablespace_map
);
273 state
.startptr
= backup_state
->startpoint
;
274 state
.starttli
= backup_state
->starttli
;
277 * Once do_pg_backup_start has been called, ensure that any failure causes
278 * us to abort the backup so we don't "leak" a backup counter. For this
279 * reason, *all* functionality between do_pg_backup_start() and the end of
280 * do_pg_backup_stop() should be inside the error cleanup block!
283 PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup
, BoolGetDatum(false));
286 tablespaceinfo
*newti
;
288 /* If this is an incremental backup, execute preparatory steps. */
290 PrepareForIncrementalBackup(ib
, backup_state
);
292 /* Add a node for the base directory at the end */
293 newti
= palloc0(sizeof(tablespaceinfo
));
295 state
.tablespaces
= lappend(state
.tablespaces
, newti
);
298 * Calculate the total backup size by summing up the size of each
303 basebackup_progress_estimate_backup_size();
305 foreach(lc
, state
.tablespaces
)
307 tablespaceinfo
*tmp
= (tablespaceinfo
*) lfirst(lc
);
309 if (tmp
->path
== NULL
)
310 tmp
->size
= sendDir(sink
, ".", 1, true, state
.tablespaces
,
311 true, NULL
, InvalidOid
, NULL
);
313 tmp
->size
= sendTablespace(sink
, tmp
->path
, tmp
->oid
, true,
315 state
.bytes_total
+= tmp
->size
;
317 state
.bytes_total_is_valid
= true;
320 /* notify basebackup sink about start of backup */
321 bbsink_begin_backup(sink
, &state
, SINK_BUFFER_LENGTH
);
323 /* Send off our tablespaces one by one */
324 foreach(lc
, state
.tablespaces
)
326 tablespaceinfo
*ti
= (tablespaceinfo
*) lfirst(lc
);
328 if (ti
->path
== NULL
)
331 bool sendtblspclinks
= true;
334 bbsink_begin_archive(sink
, "base.tar");
336 /* In the main tar, include the backup_label first... */
337 backup_label
= build_backup_content(backup_state
, false);
338 sendFileWithContent(sink
, BACKUP_LABEL_FILE
,
339 backup_label
, -1, &manifest
);
342 /* Then the tablespace_map file, if required... */
343 if (opt
->sendtblspcmapfile
)
345 sendFileWithContent(sink
, TABLESPACE_MAP
,
346 tablespace_map
->data
, -1, &manifest
);
347 sendtblspclinks
= false;
350 /* Then the bulk of the files... */
351 sendDir(sink
, ".", 1, false, state
.tablespaces
,
352 sendtblspclinks
, &manifest
, InvalidOid
, ib
);
354 /* ... and pg_control after everything else. */
355 if (lstat(XLOG_CONTROL_FILE
, &statbuf
) != 0)
357 (errcode_for_file_access(),
358 errmsg("could not stat file \"%s\": %m",
359 XLOG_CONTROL_FILE
)));
360 sendFile(sink
, XLOG_CONTROL_FILE
, XLOG_CONTROL_FILE
, &statbuf
,
361 false, InvalidOid
, InvalidOid
,
362 InvalidRelFileNumber
, 0, &manifest
, 0, NULL
, 0);
366 char *archive_name
= psprintf("%u.tar", ti
->oid
);
368 bbsink_begin_archive(sink
, archive_name
);
370 sendTablespace(sink
, ti
->path
, ti
->oid
, false, &manifest
, ib
);
374 * If we're including WAL, and this is the main data directory we
375 * don't treat this as the end of the tablespace. Instead, we will
376 * include the xlog files below and stop afterwards. This is safe
377 * since the main data directory is always sent *last*.
379 if (opt
->includewal
&& ti
->path
== NULL
)
381 Assert(lnext(state
.tablespaces
, lc
) == NULL
);
385 /* Properly terminate the tarfile. */
386 StaticAssertDecl(2 * TAR_BLOCK_SIZE
<= BLCKSZ
,
387 "BLCKSZ too small for 2 tar blocks");
388 memset(sink
->bbs_buffer
, 0, 2 * TAR_BLOCK_SIZE
);
389 bbsink_archive_contents(sink
, 2 * TAR_BLOCK_SIZE
);
391 /* OK, that's the end of the archive. */
392 bbsink_end_archive(sink
);
396 basebackup_progress_wait_wal_archive(&state
);
397 do_pg_backup_stop(backup_state
, !opt
->nowait
);
399 endptr
= backup_state
->stoppoint
;
400 endtli
= backup_state
->stoptli
;
402 /* Deallocate backup-related variables. */
403 destroyStringInfo(tablespace_map
);
406 PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup
, BoolGetDatum(false));
412 * We've left the last tar file "open", so we can now append the
413 * required WAL files to it.
415 char pathbuf
[MAXPGPATH
];
417 XLogSegNo startsegno
;
420 List
*historyFileList
= NIL
;
421 List
*walFileList
= NIL
;
422 char firstoff
[MAXFNAMELEN
];
423 char lastoff
[MAXFNAMELEN
];
429 basebackup_progress_transfer_wal();
432 * I'd rather not worry about timelines here, so scan pg_wal and
433 * include all WAL files in the range between 'startptr' and 'endptr',
434 * regardless of the timeline the file is stamped with. If there are
435 * some spurious WAL files belonging to timelines that don't belong in
436 * this server's history, they will be included too. Normally there
437 * shouldn't be such files, but if there are, there's little harm in
440 XLByteToSeg(state
.startptr
, startsegno
, wal_segment_size
);
441 XLogFileName(firstoff
, state
.starttli
, startsegno
, wal_segment_size
);
442 XLByteToPrevSeg(endptr
, endsegno
, wal_segment_size
);
443 XLogFileName(lastoff
, endtli
, endsegno
, wal_segment_size
);
445 dir
= AllocateDir("pg_wal");
446 while ((de
= ReadDir(dir
, "pg_wal")) != NULL
)
448 /* Does it look like a WAL segment, and is it in the range? */
449 if (IsXLogFileName(de
->d_name
) &&
450 strcmp(de
->d_name
+ 8, firstoff
+ 8) >= 0 &&
451 strcmp(de
->d_name
+ 8, lastoff
+ 8) <= 0)
453 walFileList
= lappend(walFileList
, pstrdup(de
->d_name
));
455 /* Does it look like a timeline history file? */
456 else if (IsTLHistoryFileName(de
->d_name
))
458 historyFileList
= lappend(historyFileList
, pstrdup(de
->d_name
));
464 * Before we go any further, check that none of the WAL segments we
467 CheckXLogRemoved(startsegno
, state
.starttli
);
470 * Sort the WAL filenames. We want to send the files in order from
471 * oldest to newest, to reduce the chance that a file is recycled
472 * before we get a chance to send it over.
474 list_sort(walFileList
, compareWalFileNames
);
477 * There must be at least one xlog file in the pg_wal directory, since
478 * we are doing backup-including-xlog.
480 if (walFileList
== NIL
)
482 (errmsg("could not find any WAL files")));
485 * Sanity check: the first and last segment should cover startptr and
486 * endptr, with no gaps in between.
488 XLogFromFileName((char *) linitial(walFileList
),
489 &tli
, &segno
, wal_segment_size
);
490 if (segno
!= startsegno
)
492 char startfname
[MAXFNAMELEN
];
494 XLogFileName(startfname
, state
.starttli
, startsegno
,
497 (errmsg("could not find WAL file \"%s\"", startfname
)));
499 foreach(lc
, walFileList
)
501 char *walFileName
= (char *) lfirst(lc
);
502 XLogSegNo currsegno
= segno
;
503 XLogSegNo nextsegno
= segno
+ 1;
505 XLogFromFileName(walFileName
, &tli
, &segno
, wal_segment_size
);
506 if (!(nextsegno
== segno
|| currsegno
== segno
))
508 char nextfname
[MAXFNAMELEN
];
510 XLogFileName(nextfname
, tli
, nextsegno
, wal_segment_size
);
512 (errmsg("could not find WAL file \"%s\"", nextfname
)));
515 if (segno
!= endsegno
)
517 char endfname
[MAXFNAMELEN
];
519 XLogFileName(endfname
, endtli
, endsegno
, wal_segment_size
);
521 (errmsg("could not find WAL file \"%s\"", endfname
)));
524 /* Ok, we have everything we need. Send the WAL files. */
525 foreach(lc
, walFileList
)
527 char *walFileName
= (char *) lfirst(lc
);
532 snprintf(pathbuf
, MAXPGPATH
, XLOGDIR
"/%s", walFileName
);
533 XLogFromFileName(walFileName
, &tli
, &segno
, wal_segment_size
);
535 fd
= OpenTransientFile(pathbuf
, O_RDONLY
| PG_BINARY
);
538 int save_errno
= errno
;
541 * Most likely reason for this is that the file was already
542 * removed by a checkpoint, so check for that to get a better
545 CheckXLogRemoved(segno
, tli
);
549 (errcode_for_file_access(),
550 errmsg("could not open file \"%s\": %m", pathbuf
)));
553 if (fstat(fd
, &statbuf
) != 0)
555 (errcode_for_file_access(),
556 errmsg("could not stat file \"%s\": %m",
558 if (statbuf
.st_size
!= wal_segment_size
)
560 CheckXLogRemoved(segno
, tli
);
562 (errcode_for_file_access(),
563 errmsg("unexpected WAL file size \"%s\"", walFileName
)));
566 /* send the WAL file itself */
567 _tarWriteHeader(sink
, pathbuf
, NULL
, &statbuf
, false);
569 while ((cnt
= basebackup_read_file(fd
, sink
->bbs_buffer
,
570 Min(sink
->bbs_buffer_length
,
571 wal_segment_size
- len
),
572 len
, pathbuf
, true)) > 0)
574 CheckXLogRemoved(segno
, tli
);
575 bbsink_archive_contents(sink
, cnt
);
579 if (len
== wal_segment_size
)
583 if (len
!= wal_segment_size
)
585 CheckXLogRemoved(segno
, tli
);
587 (errcode_for_file_access(),
588 errmsg("unexpected WAL file size \"%s\"", walFileName
)));
592 * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need
595 Assert(wal_segment_size
% TAR_BLOCK_SIZE
== 0);
597 CloseTransientFile(fd
);
600 * Mark file as archived, otherwise files can get archived again
601 * after promotion of a new node. This is in line with
602 * walreceiver.c always doing an XLogArchiveForceDone() after a
605 StatusFilePath(pathbuf
, walFileName
, ".done");
606 sendFileWithContent(sink
, pathbuf
, "", -1, &manifest
);
610 * Send timeline history files too. Only the latest timeline history
611 * file is required for recovery, and even that only if there happens
612 * to be a timeline switch in the first WAL segment that contains the
613 * checkpoint record, or if we're taking a base backup from a standby
614 * server and the target timeline changes while the backup is taken.
615 * But they are small and highly useful for debugging purposes, so
616 * better include them all, always.
618 foreach(lc
, historyFileList
)
620 char *fname
= lfirst(lc
);
622 snprintf(pathbuf
, MAXPGPATH
, XLOGDIR
"/%s", fname
);
624 if (lstat(pathbuf
, &statbuf
) != 0)
626 (errcode_for_file_access(),
627 errmsg("could not stat file \"%s\": %m", pathbuf
)));
629 sendFile(sink
, pathbuf
, pathbuf
, &statbuf
, false,
630 InvalidOid
, InvalidOid
, InvalidRelFileNumber
, 0,
631 &manifest
, 0, NULL
, 0);
633 /* unconditionally mark file as archived */
634 StatusFilePath(pathbuf
, fname
, ".done");
635 sendFileWithContent(sink
, pathbuf
, "", -1, &manifest
);
638 /* Properly terminate the tar file. */
639 StaticAssertStmt(2 * TAR_BLOCK_SIZE
<= BLCKSZ
,
640 "BLCKSZ too small for 2 tar blocks");
641 memset(sink
->bbs_buffer
, 0, 2 * TAR_BLOCK_SIZE
);
642 bbsink_archive_contents(sink
, 2 * TAR_BLOCK_SIZE
);
644 /* OK, that's the end of the archive. */
645 bbsink_end_archive(sink
);
648 AddWALInfoToBackupManifest(&manifest
, state
.startptr
, state
.starttli
,
651 SendBackupManifest(&manifest
, sink
);
653 bbsink_end_backup(sink
, endptr
, endtli
);
655 if (total_checksum_failures
)
657 if (total_checksum_failures
> 1)
659 (errmsg_plural("%lld total checksum verification failure",
660 "%lld total checksum verification failures",
661 total_checksum_failures
,
662 total_checksum_failures
)));
665 (errcode(ERRCODE_DATA_CORRUPTED
),
666 errmsg("checksum verification failure during base backup")));
670 * Make sure to free the manifest before the resource owners as manifests
671 * use cryptohash contexts that may depend on resource owners (like
674 FreeBackupManifest(&manifest
);
676 /* clean up the resource owner we created */
677 ReleaseAuxProcessResources(true);
679 basebackup_progress_done();
683 * list_sort comparison function, to compare log/seg portion of WAL segment
684 * filenames, ignoring the timeline portion.
687 compareWalFileNames(const ListCell
*a
, const ListCell
*b
)
689 char *fna
= (char *) lfirst(a
);
690 char *fnb
= (char *) lfirst(b
);
692 return strcmp(fna
+ 8, fnb
+ 8);
696 * Parse the base backup options passed down by the parser
699 parse_basebackup_options(List
*options
, basebackup_options
*opt
)
702 bool o_label
= false;
703 bool o_progress
= false;
704 bool o_checkpoint
= false;
705 bool o_nowait
= false;
707 bool o_incremental
= false;
708 bool o_maxrate
= false;
709 bool o_tablespace_map
= false;
710 bool o_noverify_checksums
= false;
711 bool o_manifest
= false;
712 bool o_manifest_checksums
= false;
713 bool o_target
= false;
714 bool o_target_detail
= false;
715 char *target_str
= NULL
;
716 char *target_detail_str
= NULL
;
717 bool o_compression
= false;
718 bool o_compression_detail
= false;
719 char *compression_detail_str
= NULL
;
721 MemSet(opt
, 0, sizeof(*opt
));
722 opt
->manifest
= MANIFEST_OPTION_NO
;
723 opt
->manifest_checksum_type
= CHECKSUM_TYPE_CRC32C
;
724 opt
->compression
= PG_COMPRESSION_NONE
;
725 opt
->compression_specification
.algorithm
= PG_COMPRESSION_NONE
;
727 foreach(lopt
, options
)
729 DefElem
*defel
= (DefElem
*) lfirst(lopt
);
731 if (strcmp(defel
->defname
, "label") == 0)
735 (errcode(ERRCODE_SYNTAX_ERROR
),
736 errmsg("duplicate option \"%s\"", defel
->defname
)));
737 opt
->label
= defGetString(defel
);
740 else if (strcmp(defel
->defname
, "progress") == 0)
744 (errcode(ERRCODE_SYNTAX_ERROR
),
745 errmsg("duplicate option \"%s\"", defel
->defname
)));
746 opt
->progress
= defGetBoolean(defel
);
749 else if (strcmp(defel
->defname
, "checkpoint") == 0)
751 char *optval
= defGetString(defel
);
755 (errcode(ERRCODE_SYNTAX_ERROR
),
756 errmsg("duplicate option \"%s\"", defel
->defname
)));
757 if (pg_strcasecmp(optval
, "fast") == 0)
758 opt
->fastcheckpoint
= true;
759 else if (pg_strcasecmp(optval
, "spread") == 0)
760 opt
->fastcheckpoint
= false;
763 (errcode(ERRCODE_SYNTAX_ERROR
),
764 errmsg("unrecognized checkpoint type: \"%s\"",
768 else if (strcmp(defel
->defname
, "wait") == 0)
772 (errcode(ERRCODE_SYNTAX_ERROR
),
773 errmsg("duplicate option \"%s\"", defel
->defname
)));
774 opt
->nowait
= !defGetBoolean(defel
);
777 else if (strcmp(defel
->defname
, "wal") == 0)
781 (errcode(ERRCODE_SYNTAX_ERROR
),
782 errmsg("duplicate option \"%s\"", defel
->defname
)));
783 opt
->includewal
= defGetBoolean(defel
);
786 else if (strcmp(defel
->defname
, "incremental") == 0)
790 (errcode(ERRCODE_SYNTAX_ERROR
),
791 errmsg("duplicate option \"%s\"", defel
->defname
)));
792 opt
->incremental
= defGetBoolean(defel
);
793 if (opt
->incremental
&& !summarize_wal
)
795 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
796 errmsg("incremental backups cannot be taken unless WAL summarization is enabled")));
797 o_incremental
= true;
799 else if (strcmp(defel
->defname
, "max_rate") == 0)
805 (errcode(ERRCODE_SYNTAX_ERROR
),
806 errmsg("duplicate option \"%s\"", defel
->defname
)));
808 maxrate
= defGetInt64(defel
);
809 if (maxrate
< MAX_RATE_LOWER
|| maxrate
> MAX_RATE_UPPER
)
811 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
812 errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
813 (int) maxrate
, "MAX_RATE", MAX_RATE_LOWER
, MAX_RATE_UPPER
)));
815 opt
->maxrate
= (uint32
) maxrate
;
818 else if (strcmp(defel
->defname
, "tablespace_map") == 0)
820 if (o_tablespace_map
)
822 (errcode(ERRCODE_SYNTAX_ERROR
),
823 errmsg("duplicate option \"%s\"", defel
->defname
)));
824 opt
->sendtblspcmapfile
= defGetBoolean(defel
);
825 o_tablespace_map
= true;
827 else if (strcmp(defel
->defname
, "verify_checksums") == 0)
829 if (o_noverify_checksums
)
831 (errcode(ERRCODE_SYNTAX_ERROR
),
832 errmsg("duplicate option \"%s\"", defel
->defname
)));
833 noverify_checksums
= !defGetBoolean(defel
);
834 o_noverify_checksums
= true;
836 else if (strcmp(defel
->defname
, "manifest") == 0)
838 char *optval
= defGetString(defel
);
843 (errcode(ERRCODE_SYNTAX_ERROR
),
844 errmsg("duplicate option \"%s\"", defel
->defname
)));
845 if (parse_bool(optval
, &manifest_bool
))
848 opt
->manifest
= MANIFEST_OPTION_YES
;
850 opt
->manifest
= MANIFEST_OPTION_NO
;
852 else if (pg_strcasecmp(optval
, "force-encode") == 0)
853 opt
->manifest
= MANIFEST_OPTION_FORCE_ENCODE
;
856 (errcode(ERRCODE_SYNTAX_ERROR
),
857 errmsg("unrecognized manifest option: \"%s\"",
861 else if (strcmp(defel
->defname
, "manifest_checksums") == 0)
863 char *optval
= defGetString(defel
);
865 if (o_manifest_checksums
)
867 (errcode(ERRCODE_SYNTAX_ERROR
),
868 errmsg("duplicate option \"%s\"", defel
->defname
)));
869 if (!pg_checksum_parse_type(optval
,
870 &opt
->manifest_checksum_type
))
872 (errcode(ERRCODE_SYNTAX_ERROR
),
873 errmsg("unrecognized checksum algorithm: \"%s\"",
875 o_manifest_checksums
= true;
877 else if (strcmp(defel
->defname
, "target") == 0)
881 (errcode(ERRCODE_SYNTAX_ERROR
),
882 errmsg("duplicate option \"%s\"", defel
->defname
)));
883 target_str
= defGetString(defel
);
886 else if (strcmp(defel
->defname
, "target_detail") == 0)
888 char *optval
= defGetString(defel
);
892 (errcode(ERRCODE_SYNTAX_ERROR
),
893 errmsg("duplicate option \"%s\"", defel
->defname
)));
894 target_detail_str
= optval
;
895 o_target_detail
= true;
897 else if (strcmp(defel
->defname
, "compression") == 0)
899 char *optval
= defGetString(defel
);
903 (errcode(ERRCODE_SYNTAX_ERROR
),
904 errmsg("duplicate option \"%s\"", defel
->defname
)));
905 if (!parse_compress_algorithm(optval
, &opt
->compression
))
907 (errcode(ERRCODE_SYNTAX_ERROR
),
908 errmsg("unrecognized compression algorithm: \"%s\"",
910 o_compression
= true;
912 else if (strcmp(defel
->defname
, "compression_detail") == 0)
914 if (o_compression_detail
)
916 (errcode(ERRCODE_SYNTAX_ERROR
),
917 errmsg("duplicate option \"%s\"", defel
->defname
)));
918 compression_detail_str
= defGetString(defel
);
919 o_compression_detail
= true;
923 (errcode(ERRCODE_SYNTAX_ERROR
),
924 errmsg("unrecognized base backup option: \"%s\"",
928 if (opt
->label
== NULL
)
929 opt
->label
= "base backup";
930 if (opt
->manifest
== MANIFEST_OPTION_NO
)
932 if (o_manifest_checksums
)
934 (errcode(ERRCODE_SYNTAX_ERROR
),
935 errmsg("manifest checksums require a backup manifest")));
936 opt
->manifest_checksum_type
= CHECKSUM_TYPE_NONE
;
939 if (target_str
== NULL
)
941 if (target_detail_str
!= NULL
)
943 (errcode(ERRCODE_SYNTAX_ERROR
),
944 errmsg("target detail cannot be used without target")));
945 opt
->use_copytblspc
= true;
946 opt
->send_to_client
= true;
948 else if (strcmp(target_str
, "client") == 0)
950 if (target_detail_str
!= NULL
)
952 (errcode(ERRCODE_SYNTAX_ERROR
),
953 errmsg("target \"%s\" does not accept a target detail",
955 opt
->send_to_client
= true;
959 BaseBackupGetTargetHandle(target_str
, target_detail_str
);
961 if (o_compression_detail
&& !o_compression
)
963 (errcode(ERRCODE_SYNTAX_ERROR
),
964 errmsg("compression detail cannot be specified unless compression is enabled")));
970 parse_compress_specification(opt
->compression
, compression_detail_str
,
971 &opt
->compression_specification
);
973 validate_compress_specification(&opt
->compression_specification
);
974 if (error_detail
!= NULL
)
976 errcode(ERRCODE_SYNTAX_ERROR
),
977 errmsg("invalid compression specification: %s",
984 * SendBaseBackup() - send a complete base backup.
986 * The function will put the system into backup mode like pg_backup_start()
987 * does, so that the backup is consistent even though we read directly from
988 * the filesystem, bypassing the buffer cache.
991 SendBaseBackup(BaseBackupCmd
*cmd
, IncrementalBackupInfo
*ib
)
993 basebackup_options opt
;
995 SessionBackupState status
= get_backup_status();
997 if (status
== SESSION_BACKUP_RUNNING
)
999 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1000 errmsg("a backup is already in progress in this session")));
1002 parse_basebackup_options(cmd
->options
, &opt
);
1004 WalSndSetState(WALSNDSTATE_BACKUP
);
1006 if (update_process_title
)
1008 char activitymsg
[50];
1010 snprintf(activitymsg
, sizeof(activitymsg
), "sending backup \"%s\"",
1012 set_ps_display(activitymsg
);
1016 * If we're asked to perform an incremental backup and the user has not
1017 * supplied a manifest, that's an ERROR.
1019 * If we're asked to perform a full backup and the user did supply a
1020 * manifest, just ignore it.
1022 if (!opt
.incremental
)
1024 else if (ib
== NULL
)
1026 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1027 errmsg("must UPLOAD_MANIFEST before performing an incremental BASE_BACKUP")));
1030 * If the target is specifically 'client' then set up to stream the backup
1031 * to the client; otherwise, it's being sent someplace else and should not
1032 * be sent to the client. BaseBackupGetSink has the job of setting up a
1033 * sink to send the backup data wherever it needs to go.
1035 sink
= bbsink_copystream_new(opt
.send_to_client
);
1036 if (opt
.target_handle
!= NULL
)
1037 sink
= BaseBackupGetSink(opt
.target_handle
, sink
);
1039 /* Set up network throttling, if client requested it */
1040 if (opt
.maxrate
> 0)
1041 sink
= bbsink_throttle_new(sink
, opt
.maxrate
);
1043 /* Set up server-side compression, if client requested it */
1044 if (opt
.compression
== PG_COMPRESSION_GZIP
)
1045 sink
= bbsink_gzip_new(sink
, &opt
.compression_specification
);
1046 else if (opt
.compression
== PG_COMPRESSION_LZ4
)
1047 sink
= bbsink_lz4_new(sink
, &opt
.compression_specification
);
1048 else if (opt
.compression
== PG_COMPRESSION_ZSTD
)
1049 sink
= bbsink_zstd_new(sink
, &opt
.compression_specification
);
1051 /* Set up progress reporting. */
1052 sink
= bbsink_progress_new(sink
, opt
.progress
);
1055 * Perform the base backup, but make sure we clean up the bbsink even if
1060 perform_base_backup(&opt
, sink
, ib
);
1064 bbsink_cleanup(sink
);
1070 * Inject a file with given name and content in the output tar stream.
1072 * "len" can optionally be set to an arbitrary length of data sent. If set
1073 * to -1, the content sent is treated as a string with strlen() as length.
1076 sendFileWithContent(bbsink
*sink
, const char *filename
, const char *content
,
1077 int len
, backup_manifest_info
*manifest
)
1079 struct stat statbuf
;
1081 pg_checksum_context checksum_ctx
;
1083 if (pg_checksum_init(&checksum_ctx
, manifest
->checksum_type
) < 0)
1084 elog(ERROR
, "could not initialize checksum of file \"%s\"",
1088 len
= strlen(content
);
1091 * Construct a stat struct for the file we're injecting in the tar.
1094 /* Windows doesn't have the concept of uid and gid */
1099 statbuf
.st_uid
= geteuid();
1100 statbuf
.st_gid
= getegid();
1102 statbuf
.st_mtime
= time(NULL
);
1103 statbuf
.st_mode
= pg_file_create_mode
;
1104 statbuf
.st_size
= len
;
1106 _tarWriteHeader(sink
, filename
, NULL
, &statbuf
, false);
1108 if (pg_checksum_update(&checksum_ctx
, (uint8
*) content
, len
) < 0)
1109 elog(ERROR
, "could not update checksum of file \"%s\"",
1112 while (bytes_done
< len
)
1114 size_t remaining
= len
- bytes_done
;
1115 size_t nbytes
= Min(sink
->bbs_buffer_length
, remaining
);
1117 memcpy(sink
->bbs_buffer
, content
, nbytes
);
1118 bbsink_archive_contents(sink
, nbytes
);
1119 bytes_done
+= nbytes
;
1123 _tarWritePadding(sink
, len
);
1125 AddFileToBackupManifest(manifest
, InvalidOid
, filename
, len
,
1126 (pg_time_t
) statbuf
.st_mtime
, &checksum_ctx
);
1130 * Include the tablespace directory pointed to by 'path' in the output tar
1131 * stream. If 'sizeonly' is true, we just calculate a total length and return
1132 * it, without actually sending anything.
1134 * Only used to send auxiliary tablespaces, not PGDATA.
1137 sendTablespace(bbsink
*sink
, char *path
, Oid spcoid
, bool sizeonly
,
1138 backup_manifest_info
*manifest
, IncrementalBackupInfo
*ib
)
1141 char pathbuf
[MAXPGPATH
];
1142 struct stat statbuf
;
1145 * 'path' points to the tablespace location, but we only want to include
1146 * the version directory in it that belongs to us.
1148 snprintf(pathbuf
, sizeof(pathbuf
), "%s/%s", path
,
1149 TABLESPACE_VERSION_DIRECTORY
);
1152 * Store a directory entry in the tar file so we get the permissions
1155 if (lstat(pathbuf
, &statbuf
) != 0)
1157 if (errno
!= ENOENT
)
1159 (errcode_for_file_access(),
1160 errmsg("could not stat file or directory \"%s\": %m",
1163 /* If the tablespace went away while scanning, it's no error. */
1167 size
= _tarWriteHeader(sink
, TABLESPACE_VERSION_DIRECTORY
, NULL
, &statbuf
,
1170 /* Send all the files in the tablespace version directory */
1171 size
+= sendDir(sink
, pathbuf
, strlen(path
), sizeonly
, NIL
, true, manifest
,
1178 * Include all files from the given directory in the output tar stream. If
1179 * 'sizeonly' is true, we just calculate a total length and return it, without
1180 * actually sending anything.
1182 * Omit any directory in the tablespaces list, to avoid backing up
1183 * tablespaces twice when they were created inside PGDATA.
1185 * If sendtblspclinks is true, we need to include symlink
1186 * information in the tar file. If not, we can skip that
1187 * as it will be sent separately in the tablespace_map file.
1190 sendDir(bbsink
*sink
, const char *path
, int basepathlen
, bool sizeonly
,
1191 List
*tablespaces
, bool sendtblspclinks
, backup_manifest_info
*manifest
,
1192 Oid spcoid
, IncrementalBackupInfo
*ib
)
1196 char pathbuf
[MAXPGPATH
* 2];
1197 struct stat statbuf
;
1199 const char *lastDir
; /* Split last dir from parent path. */
1200 bool isRelationDir
= false; /* Does directory contain relations? */
1201 bool isGlobalDir
= false;
1202 Oid dboid
= InvalidOid
;
1203 BlockNumber
*relative_block_numbers
= NULL
;
1206 * Since this array is relatively large, avoid putting it on the stack.
1207 * But we don't need it at all if this is not an incremental backup.
1210 relative_block_numbers
= palloc(sizeof(BlockNumber
) * RELSEG_SIZE
);
1213 * Determine if the current path is a database directory that can contain
1216 * Start by finding the location of the delimiter between the parent path
1217 * and the current path.
1219 lastDir
= last_dir_separator(path
);
1221 /* Does this path look like a database path (i.e. all digits)? */
1222 if (lastDir
!= NULL
&&
1223 strspn(lastDir
+ 1, "0123456789") == strlen(lastDir
+ 1))
1225 /* Part of path that contains the parent directory. */
1226 int parentPathLen
= lastDir
- path
;
1229 * Mark path as a database directory if the parent path is either
1230 * $PGDATA/base or a tablespace version path.
1232 if (strncmp(path
, "./base", parentPathLen
) == 0 ||
1233 (parentPathLen
>= (sizeof(TABLESPACE_VERSION_DIRECTORY
) - 1) &&
1234 strncmp(lastDir
- (sizeof(TABLESPACE_VERSION_DIRECTORY
) - 1),
1235 TABLESPACE_VERSION_DIRECTORY
,
1236 sizeof(TABLESPACE_VERSION_DIRECTORY
) - 1) == 0))
1238 isRelationDir
= true;
1239 dboid
= atooid(lastDir
+ 1);
1242 else if (strcmp(path
, "./global") == 0)
1244 isRelationDir
= true;
1248 dir
= AllocateDir(path
);
1249 while ((de
= ReadDir(dir
, path
)) != NULL
)
1253 RelFileNumber relfilenumber
= InvalidRelFileNumber
;
1254 ForkNumber relForkNum
= InvalidForkNumber
;
1256 bool isRelationFile
= false;
1258 /* Skip special stuff */
1259 if (strcmp(de
->d_name
, ".") == 0 || strcmp(de
->d_name
, "..") == 0)
1262 /* Skip temporary files */
1263 if (strncmp(de
->d_name
,
1264 PG_TEMP_FILE_PREFIX
,
1265 strlen(PG_TEMP_FILE_PREFIX
)) == 0)
1268 /* Skip macOS system files */
1269 if (strcmp(de
->d_name
, ".DS_Store") == 0)
1273 * Check if the postmaster has signaled us to exit, and abort with an
1274 * error in that case. The error handler further up will call
1275 * do_pg_abort_backup() for us. Also check that if the backup was
1276 * started while still in recovery, the server wasn't promoted.
1277 * do_pg_backup_stop() will check that too, but it's better to stop
1278 * the backup early than continue to the end and fail there.
1280 CHECK_FOR_INTERRUPTS();
1281 if (RecoveryInProgress() != backup_started_in_recovery
)
1283 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1284 errmsg("the standby was promoted during online backup"),
1285 errhint("This means that the backup being taken is corrupt "
1286 "and should not be used. "
1287 "Try taking another online backup.")));
1289 /* Scan for files that should be excluded */
1290 excludeFound
= false;
1291 for (excludeIdx
= 0; excludeFiles
[excludeIdx
].name
!= NULL
; excludeIdx
++)
1293 int cmplen
= strlen(excludeFiles
[excludeIdx
].name
);
1295 if (!excludeFiles
[excludeIdx
].match_prefix
)
1297 if (strncmp(de
->d_name
, excludeFiles
[excludeIdx
].name
, cmplen
) == 0)
1299 elog(DEBUG1
, "file \"%s\" excluded from backup", de
->d_name
);
1300 excludeFound
= true;
1309 * If there could be non-temporary relation files in this directory,
1310 * try to parse the filename.
1314 parse_filename_for_nontemp_relation(de
->d_name
,
1316 &relForkNum
, &segno
);
1318 /* Exclude all forks for unlogged tables except the init fork */
1319 if (isRelationFile
&& relForkNum
!= INIT_FORKNUM
)
1321 char initForkFile
[MAXPGPATH
];
1324 * If any other type of fork, check if there is an init fork with
1325 * the same RelFileNumber. If so, the file can be excluded.
1327 snprintf(initForkFile
, sizeof(initForkFile
), "%s/%u_init",
1328 path
, relfilenumber
);
1330 if (lstat(initForkFile
, &statbuf
) == 0)
1333 "unlogged relation file \"%s\" excluded from backup",
1340 /* Exclude temporary relations */
1341 if (OidIsValid(dboid
) && looks_like_temp_rel_name(de
->d_name
))
1344 "temporary relation file \"%s\" excluded from backup",
1350 snprintf(pathbuf
, sizeof(pathbuf
), "%s/%s", path
, de
->d_name
);
1352 /* Skip pg_control here to back up it last */
1353 if (strcmp(pathbuf
, "./global/pg_control") == 0)
1356 if (lstat(pathbuf
, &statbuf
) != 0)
1358 if (errno
!= ENOENT
)
1360 (errcode_for_file_access(),
1361 errmsg("could not stat file or directory \"%s\": %m",
1364 /* If the file went away while scanning, it's not an error. */
1368 /* Scan for directories whose contents should be excluded */
1369 excludeFound
= false;
1370 for (excludeIdx
= 0; excludeDirContents
[excludeIdx
] != NULL
; excludeIdx
++)
1372 if (strcmp(de
->d_name
, excludeDirContents
[excludeIdx
]) == 0)
1374 elog(DEBUG1
, "contents of directory \"%s\" excluded from backup", de
->d_name
);
1375 convert_link_to_directory(pathbuf
, &statbuf
);
1376 size
+= _tarWriteHeader(sink
, pathbuf
+ basepathlen
+ 1, NULL
,
1377 &statbuf
, sizeonly
);
1378 excludeFound
= true;
1387 * We can skip pg_wal, the WAL segments need to be fetched from the
1388 * WAL archive anyway. But include it as an empty directory anyway, so
1389 * we get permissions right.
1391 if (strcmp(pathbuf
, "./pg_wal") == 0)
1393 /* If pg_wal is a symlink, write it as a directory anyway */
1394 convert_link_to_directory(pathbuf
, &statbuf
);
1395 size
+= _tarWriteHeader(sink
, pathbuf
+ basepathlen
+ 1, NULL
,
1396 &statbuf
, sizeonly
);
1399 * Also send archive_status and summaries directories (by
1400 * hackishly reusing statbuf from above ...).
1402 size
+= _tarWriteHeader(sink
, "./pg_wal/archive_status", NULL
,
1403 &statbuf
, sizeonly
);
1404 size
+= _tarWriteHeader(sink
, "./pg_wal/summaries", NULL
,
1405 &statbuf
, sizeonly
);
1407 continue; /* don't recurse into pg_wal */
1410 /* Allow symbolic links in pg_tblspc only */
1411 if (strcmp(path
, "./pg_tblspc") == 0 && S_ISLNK(statbuf
.st_mode
))
1413 char linkpath
[MAXPGPATH
];
1416 rllen
= readlink(pathbuf
, linkpath
, sizeof(linkpath
));
1419 (errcode_for_file_access(),
1420 errmsg("could not read symbolic link \"%s\": %m",
1422 if (rllen
>= sizeof(linkpath
))
1424 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
1425 errmsg("symbolic link \"%s\" target is too long",
1427 linkpath
[rllen
] = '\0';
1429 size
+= _tarWriteHeader(sink
, pathbuf
+ basepathlen
+ 1, linkpath
,
1430 &statbuf
, sizeonly
);
1432 else if (S_ISDIR(statbuf
.st_mode
))
1434 bool skip_this_dir
= false;
1438 * Store a directory entry in the tar file so we can get the
1439 * permissions right.
1441 size
+= _tarWriteHeader(sink
, pathbuf
+ basepathlen
+ 1, NULL
, &statbuf
,
1445 * Call ourselves recursively for a directory, unless it happens
1446 * to be a separate tablespace located within PGDATA.
1448 foreach(lc
, tablespaces
)
1450 tablespaceinfo
*ti
= (tablespaceinfo
*) lfirst(lc
);
1453 * ti->rpath is the tablespace relative path within PGDATA, or
1454 * NULL if the tablespace has been properly located somewhere
1457 * Skip past the leading "./" in pathbuf when comparing.
1459 if (ti
->rpath
&& strcmp(ti
->rpath
, pathbuf
+ 2) == 0)
1461 skip_this_dir
= true;
1467 * skip sending directories inside pg_tblspc, if not required.
1469 if (strcmp(pathbuf
, "./pg_tblspc") == 0 && !sendtblspclinks
)
1470 skip_this_dir
= true;
1473 size
+= sendDir(sink
, pathbuf
, basepathlen
, sizeonly
, tablespaces
,
1474 sendtblspclinks
, manifest
, spcoid
, ib
);
1476 else if (S_ISREG(statbuf
.st_mode
))
1479 unsigned num_blocks_required
= 0;
1480 unsigned truncation_block_length
= 0;
1481 char tarfilenamebuf
[MAXPGPATH
* 2];
1482 char *tarfilename
= pathbuf
+ basepathlen
+ 1;
1483 FileBackupMethod method
= BACK_UP_FILE_FULLY
;
1485 if (ib
!= NULL
&& isRelationFile
)
1490 if (OidIsValid(spcoid
))
1493 lookup_path
= psprintf("%s/%u/%s", PG_TBLSPC_DIR
, spcoid
,
1499 relspcoid
= GLOBALTABLESPACE_OID
;
1501 relspcoid
= DEFAULTTABLESPACE_OID
;
1502 lookup_path
= pstrdup(tarfilename
);
1505 method
= GetFileBackupMethod(ib
, lookup_path
, dboid
, relspcoid
,
1506 relfilenumber
, relForkNum
,
1507 segno
, statbuf
.st_size
,
1508 &num_blocks_required
,
1509 relative_block_numbers
,
1510 &truncation_block_length
);
1511 if (method
== BACK_UP_FILE_INCREMENTALLY
)
1514 GetIncrementalFileSize(num_blocks_required
);
1515 snprintf(tarfilenamebuf
, sizeof(tarfilenamebuf
),
1516 "%s/INCREMENTAL.%s",
1517 path
+ basepathlen
+ 1,
1519 tarfilename
= tarfilenamebuf
;
1526 sent
= sendFile(sink
, pathbuf
, tarfilename
, &statbuf
,
1527 true, dboid
, spcoid
,
1528 relfilenumber
, segno
, manifest
,
1529 num_blocks_required
,
1530 method
== BACK_UP_FILE_INCREMENTALLY
? relative_block_numbers
: NULL
,
1531 truncation_block_length
);
1533 if (sent
|| sizeonly
)
1536 size
+= statbuf
.st_size
;
1538 /* Pad to a multiple of the tar block size. */
1539 size
+= tarPaddingBytesRequired(statbuf
.st_size
);
1541 /* Size of the header for the file. */
1542 size
+= TAR_BLOCK_SIZE
;
1547 (errmsg("skipping special file \"%s\"", pathbuf
)));
1550 if (relative_block_numbers
!= NULL
)
1551 pfree(relative_block_numbers
);
1558 * Given the member, write the TAR header & send the file.
1560 * If 'missing_ok' is true, will not throw an error if the file is not found.
1562 * If dboid is anything other than InvalidOid then any checksum failures
1563 * detected will get reported to the cumulative stats system.
1565 * If the file is to be sent incrementally, then num_incremental_blocks
1566 * should be the number of blocks to be sent, and incremental_blocks
1567 * an array of block numbers relative to the start of the current segment.
1568 * If the whole file is to be sent, then incremental_blocks should be NULL,
1569 * and num_incremental_blocks can have any value, as it will be ignored.
1571 * Returns true if the file was successfully sent, false if 'missing_ok',
1572 * and the file did not exist.
1575 sendFile(bbsink
*sink
, const char *readfilename
, const char *tarfilename
,
1576 struct stat
*statbuf
, bool missing_ok
, Oid dboid
, Oid spcoid
,
1577 RelFileNumber relfilenumber
, unsigned segno
,
1578 backup_manifest_info
*manifest
, unsigned num_incremental_blocks
,
1579 BlockNumber
*incremental_blocks
, unsigned truncation_block_length
)
1582 BlockNumber blkno
= 0;
1583 int checksum_failures
= 0;
1585 pgoff_t bytes_done
= 0;
1586 bool verify_checksum
= false;
1587 pg_checksum_context checksum_ctx
;
1590 if (pg_checksum_init(&checksum_ctx
, manifest
->checksum_type
) < 0)
1591 elog(ERROR
, "could not initialize checksum of file \"%s\"",
1594 fd
= OpenTransientFile(readfilename
, O_RDONLY
| PG_BINARY
);
1597 if (errno
== ENOENT
&& missing_ok
)
1600 (errcode_for_file_access(),
1601 errmsg("could not open file \"%s\": %m", readfilename
)));
1604 _tarWriteHeader(sink
, tarfilename
, NULL
, statbuf
, false);
1607 * Checksums are verified in multiples of BLCKSZ, so the buffer length
1608 * should be a multiple of the block size as well.
1610 Assert((sink
->bbs_buffer_length
% BLCKSZ
) == 0);
1613 * If we weren't told not to verify checksums, and if checksums are
1614 * enabled for this cluster, and if this is a relation file, then verify
1617 if (!noverify_checksums
&& DataChecksumsEnabled() &&
1618 RelFileNumberIsValid(relfilenumber
))
1619 verify_checksum
= true;
1622 * If we're sending an incremental file, write the file header.
1624 if (incremental_blocks
!= NULL
)
1626 unsigned magic
= INCREMENTAL_MAGIC
;
1627 size_t header_bytes_done
= 0;
1628 char padding
[BLCKSZ
];
1631 /* Emit header data. */
1632 push_to_sink(sink
, &checksum_ctx
, &header_bytes_done
,
1633 &magic
, sizeof(magic
));
1634 push_to_sink(sink
, &checksum_ctx
, &header_bytes_done
,
1635 &num_incremental_blocks
, sizeof(num_incremental_blocks
));
1636 push_to_sink(sink
, &checksum_ctx
, &header_bytes_done
,
1637 &truncation_block_length
, sizeof(truncation_block_length
));
1638 push_to_sink(sink
, &checksum_ctx
, &header_bytes_done
,
1640 sizeof(BlockNumber
) * num_incremental_blocks
);
1643 * Add padding to align header to a multiple of BLCKSZ, but only if
1644 * the incremental file has some blocks, and the alignment is actually
1645 * needed (i.e. header is not already a multiple of BLCKSZ). If there
1646 * are no blocks we don't want to make the file unnecessarily large,
1647 * as that might make some filesystem optimizations impossible.
1649 if ((num_incremental_blocks
> 0) && (header_bytes_done
% BLCKSZ
!= 0))
1651 paddinglen
= (BLCKSZ
- (header_bytes_done
% BLCKSZ
));
1653 memset(padding
, 0, paddinglen
);
1654 bytes_done
+= paddinglen
;
1656 push_to_sink(sink
, &checksum_ctx
, &header_bytes_done
,
1657 padding
, paddinglen
);
1660 /* Flush out any data still in the buffer so it's again empty. */
1661 if (header_bytes_done
> 0)
1663 bbsink_archive_contents(sink
, header_bytes_done
);
1664 if (pg_checksum_update(&checksum_ctx
,
1665 (uint8
*) sink
->bbs_buffer
,
1666 header_bytes_done
) < 0)
1667 elog(ERROR
, "could not update checksum of base backup");
1670 /* Update our notion of file position. */
1671 bytes_done
+= sizeof(magic
);
1672 bytes_done
+= sizeof(num_incremental_blocks
);
1673 bytes_done
+= sizeof(truncation_block_length
);
1674 bytes_done
+= sizeof(BlockNumber
) * num_incremental_blocks
;
1678 * Loop until we read the amount of data the caller told us to expect. The
1679 * file could be longer, if it was extended while we were sending it, but
1680 * for a base backup we can ignore such extended data. It will be restored
1686 * Determine whether we've read all the data that we need, and if not,
1689 if (incremental_blocks
== NULL
)
1691 size_t remaining
= statbuf
->st_size
- bytes_done
;
1694 * If we've read the required number of bytes, then it's time to
1697 if (bytes_done
>= statbuf
->st_size
)
1701 * Read as many bytes as will fit in the buffer, or however many
1702 * are left to read, whichever is less.
1704 cnt
= read_file_data_into_buffer(sink
, readfilename
, fd
,
1705 bytes_done
, remaining
,
1706 blkno
+ segno
* RELSEG_SIZE
,
1708 &checksum_failures
);
1712 BlockNumber relative_blkno
;
1715 * If we've read all the blocks, then it's time to stop.
1717 if (ibindex
>= num_incremental_blocks
)
1721 * Read just one block, whichever one is the next that we're
1722 * supposed to include.
1724 relative_blkno
= incremental_blocks
[ibindex
++];
1725 cnt
= read_file_data_into_buffer(sink
, readfilename
, fd
,
1726 relative_blkno
* BLCKSZ
,
1728 relative_blkno
+ segno
* RELSEG_SIZE
,
1730 &checksum_failures
);
1733 * If we get a partial read, that must mean that the relation is
1734 * being truncated. Ultimately, it should be truncated to a
1735 * multiple of BLCKSZ, since this path should only be reached for
1736 * relation files, but we might transiently observe an
1737 * intermediate value.
1739 * It should be fine to treat this just as if the entire block had
1740 * been truncated away - i.e. fill this and all later blocks with
1741 * zeroes. WAL replay will fix things up.
1748 * If the amount of data we were able to read was not a multiple of
1749 * BLCKSZ, we cannot verify checksums, which are block-level.
1751 if (verify_checksum
&& (cnt
% BLCKSZ
!= 0))
1754 (errmsg("could not verify checksum in file \"%s\", block "
1755 "%u: read buffer size %d and page size %d "
1757 readfilename
, blkno
, (int) cnt
, BLCKSZ
)));
1758 verify_checksum
= false;
1762 * If we hit end-of-file, a concurrent truncation must have occurred.
1763 * That's not an error condition, because WAL replay will fix things
1769 /* Update block number and # of bytes done for next loop iteration. */
1770 blkno
+= cnt
/ BLCKSZ
;
1774 * Make sure incremental files with block data are properly aligned
1775 * (header is a multiple of BLCKSZ, blocks are BLCKSZ too).
1777 Assert(!((incremental_blocks
!= NULL
&& num_incremental_blocks
> 0) &&
1778 (bytes_done
% BLCKSZ
!= 0)));
1780 /* Archive the data we just read. */
1781 bbsink_archive_contents(sink
, cnt
);
1783 /* Also feed it to the checksum machinery. */
1784 if (pg_checksum_update(&checksum_ctx
,
1785 (uint8
*) sink
->bbs_buffer
, cnt
) < 0)
1786 elog(ERROR
, "could not update checksum of base backup");
1789 /* If the file was truncated while we were sending it, pad it with zeros */
1790 while (bytes_done
< statbuf
->st_size
)
1792 size_t remaining
= statbuf
->st_size
- bytes_done
;
1793 size_t nbytes
= Min(sink
->bbs_buffer_length
, remaining
);
1795 MemSet(sink
->bbs_buffer
, 0, nbytes
);
1796 if (pg_checksum_update(&checksum_ctx
,
1797 (uint8
*) sink
->bbs_buffer
,
1799 elog(ERROR
, "could not update checksum of base backup");
1800 bbsink_archive_contents(sink
, nbytes
);
1801 bytes_done
+= nbytes
;
1805 * Pad to a block boundary, per tar format requirements. (This small piece
1806 * of data is probably not worth throttling, and is not checksummed
1807 * because it's not actually part of the file.)
1809 _tarWritePadding(sink
, bytes_done
);
1811 CloseTransientFile(fd
);
1813 if (checksum_failures
> 1)
1816 (errmsg_plural("file \"%s\" has a total of %d checksum verification failure",
1817 "file \"%s\" has a total of %d checksum verification failures",
1819 readfilename
, checksum_failures
)));
1821 pgstat_report_checksum_failures_in_db(dboid
, checksum_failures
);
1824 total_checksum_failures
+= checksum_failures
;
1826 AddFileToBackupManifest(manifest
, spcoid
, tarfilename
, statbuf
->st_size
,
1827 (pg_time_t
) statbuf
->st_mtime
, &checksum_ctx
);
1833 * Read some more data from the file into the bbsink's buffer, verifying
1834 * checksums as required.
1836 * 'offset' is the file offset from which we should begin to read, and
1837 * 'length' is the amount of data that should be read. The actual amount
1838 * of data read will be less than the requested amount if the bbsink's
1839 * buffer isn't big enough to hold it all, or if the underlying file has
1840 * been truncated. The return value is the number of bytes actually read.
1842 * 'blkno' is the block number of the first page in the bbsink's buffer
1843 * relative to the start of the relation.
1845 * 'verify_checksum' indicates whether we should try to verify checksums
1846 * for the blocks we read. If we do this, we'll update *checksum_failures
1847 * and issue warnings as appropriate.
1850 read_file_data_into_buffer(bbsink
*sink
, const char *readfilename
, int fd
,
1851 off_t offset
, size_t length
, BlockNumber blkno
,
1852 bool verify_checksum
, int *checksum_failures
)
1858 /* Try to read some more data. */
1859 cnt
= basebackup_read_file(fd
, sink
->bbs_buffer
,
1860 Min(sink
->bbs_buffer_length
, length
),
1861 offset
, readfilename
, true);
1863 /* Can't verify checksums if read length is not a multiple of BLCKSZ. */
1864 if (!verify_checksum
|| (cnt
% BLCKSZ
) != 0)
1867 /* Verify checksum for each block. */
1868 for (i
= 0; i
< cnt
/ BLCKSZ
; i
++)
1871 uint16 expected_checksum
;
1873 page
= sink
->bbs_buffer
+ BLCKSZ
* i
;
1875 /* If the page is OK, go on to the next one. */
1876 if (verify_page_checksum(page
, sink
->bbs_state
->startptr
, blkno
+ i
,
1877 &expected_checksum
))
1881 * Retry the block on the first failure. It's possible that we read
1882 * the first 4K page of the block just before postgres updated the
1883 * entire block so it ends up looking torn to us. If, before we retry
1884 * the read, the concurrent write of the block finishes, the page LSN
1885 * will be updated and we'll realize that we should ignore this block.
1887 * There's no guarantee that this will actually happen, though: the
1888 * torn write could take an arbitrarily long time to complete.
1889 * Retrying multiple times wouldn't fix this problem, either, though
1890 * it would reduce the chances of it happening in practice. The only
1891 * real fix here seems to be to have some kind of interlock that
1892 * allows us to wait until we can be certain that no write to the
1893 * block is in progress. Since we don't have any such thing right now,
1894 * we just do this and hope for the best.
1897 basebackup_read_file(fd
, sink
->bbs_buffer
+ BLCKSZ
* i
,
1898 BLCKSZ
, offset
+ BLCKSZ
* i
,
1899 readfilename
, false);
1900 if (reread_cnt
== 0)
1903 * If we hit end-of-file, a concurrent truncation must have
1904 * occurred, so reduce cnt to reflect only the blocks already
1905 * processed and break out of this loop.
1911 /* If the page now looks OK, go on to the next one. */
1912 if (verify_page_checksum(page
, sink
->bbs_state
->startptr
, blkno
+ i
,
1913 &expected_checksum
))
1916 /* Handle checksum failure. */
1917 (*checksum_failures
)++;
1918 if (*checksum_failures
<= 5)
1920 (errmsg("checksum verification failed in "
1921 "file \"%s\", block %u: calculated "
1922 "%X but expected %X",
1923 readfilename
, blkno
+ i
, expected_checksum
,
1924 ((PageHeader
) page
)->pd_checksum
)));
1925 if (*checksum_failures
== 5)
1927 (errmsg("further checksum verification "
1928 "failures in file \"%s\" will not "
1929 "be reported", readfilename
)));
1936 * Push data into a bbsink.
1938 * It's better, when possible, to read data directly into the bbsink's buffer,
1939 * rather than using this function to copy it into the buffer; this function is
1940 * for cases where that approach is not practical.
1942 * bytes_done should point to a count of the number of bytes that are
1943 * currently used in the bbsink's buffer. Upon return, the bytes identified by
1944 * data and length will have been copied into the bbsink's buffer, flushing
1945 * as required, and *bytes_done will have been updated accordingly. If the
1946 * buffer was flushed, the previous contents will also have been fed to
1949 * Note that after one or more calls to this function it is the caller's
1950 * responsibility to perform any required final flush.
1953 push_to_sink(bbsink
*sink
, pg_checksum_context
*checksum_ctx
,
1954 size_t *bytes_done
, void *data
, size_t length
)
1958 size_t bytes_to_copy
;
1961 * We use < here rather than <= so that if the data exactly fills the
1962 * remaining buffer space, we trigger a flush now.
1964 if (length
< sink
->bbs_buffer_length
- *bytes_done
)
1966 /* Append remaining data to buffer. */
1967 memcpy(sink
->bbs_buffer
+ *bytes_done
, data
, length
);
1968 *bytes_done
+= length
;
1972 /* Copy until buffer is full and flush it. */
1973 bytes_to_copy
= sink
->bbs_buffer_length
- *bytes_done
;
1974 memcpy(sink
->bbs_buffer
+ *bytes_done
, data
, bytes_to_copy
);
1975 data
= ((char *) data
) + bytes_to_copy
;
1976 length
-= bytes_to_copy
;
1977 bbsink_archive_contents(sink
, sink
->bbs_buffer_length
);
1978 if (pg_checksum_update(checksum_ctx
, (uint8
*) sink
->bbs_buffer
,
1979 sink
->bbs_buffer_length
) < 0)
1980 elog(ERROR
, "could not update checksum");
1986 * Try to verify the checksum for the provided page, if it seems appropriate
1989 * Returns true if verification succeeds or if we decide not to check it,
1990 * and false if verification fails. When return false, it also sets
1991 * *expected_checksum to the computed value.
1994 verify_page_checksum(Page page
, XLogRecPtr start_lsn
, BlockNumber blkno
,
1995 uint16
*expected_checksum
)
2001 * Only check pages which have not been modified since the start of the
2002 * base backup. Otherwise, they might have been written only halfway and
2003 * the checksum would not be valid. However, replaying WAL would
2004 * reinstate the correct page in this case. We also skip completely new
2005 * pages, since they don't have a checksum yet.
2007 if (PageIsNew(page
) || PageGetLSN(page
) >= start_lsn
)
2010 /* Perform the actual checksum calculation. */
2011 checksum
= pg_checksum_page(page
, blkno
);
2013 /* See whether it matches the value from the page. */
2014 phdr
= (PageHeader
) page
;
2015 if (phdr
->pd_checksum
== checksum
)
2017 *expected_checksum
= checksum
;
2022 _tarWriteHeader(bbsink
*sink
, const char *filename
, const char *linktarget
,
2023 struct stat
*statbuf
, bool sizeonly
)
2030 * As of this writing, the smallest supported block size is 1kB, which
2031 * is twice TAR_BLOCK_SIZE. Since the buffer size is required to be a
2032 * multiple of BLCKSZ, it should be safe to assume that the buffer is
2033 * large enough to fit an entire tar block. We double-check by means
2034 * of these assertions.
2036 StaticAssertDecl(TAR_BLOCK_SIZE
<= BLCKSZ
,
2037 "BLCKSZ too small for tar block");
2038 Assert(sink
->bbs_buffer_length
>= TAR_BLOCK_SIZE
);
2040 rc
= tarCreateHeader(sink
->bbs_buffer
, filename
, linktarget
,
2041 statbuf
->st_size
, statbuf
->st_mode
,
2042 statbuf
->st_uid
, statbuf
->st_gid
,
2049 case TAR_NAME_TOO_LONG
:
2051 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
2052 errmsg("file name too long for tar format: \"%s\"",
2055 case TAR_SYMLINK_TOO_LONG
:
2057 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
2058 errmsg("symbolic link target too long for tar format: "
2059 "file name \"%s\", target \"%s\"",
2060 filename
, linktarget
)));
2063 elog(ERROR
, "unrecognized tar error: %d", rc
);
2066 bbsink_archive_contents(sink
, TAR_BLOCK_SIZE
);
2069 return TAR_BLOCK_SIZE
;
2073 * Pad with zero bytes out to a multiple of TAR_BLOCK_SIZE.
2076 _tarWritePadding(bbsink
*sink
, int len
)
2078 int pad
= tarPaddingBytesRequired(len
);
2081 * As in _tarWriteHeader, it should be safe to assume that the buffer is
2082 * large enough that we don't need to do this in multiple chunks.
2084 Assert(sink
->bbs_buffer_length
>= TAR_BLOCK_SIZE
);
2085 Assert(pad
<= TAR_BLOCK_SIZE
);
2089 MemSet(sink
->bbs_buffer
, 0, pad
);
2090 bbsink_archive_contents(sink
, pad
);
2095 * If the entry in statbuf is a link, then adjust statbuf to make it look like a
2096 * directory, so that it will be written that way.
2099 convert_link_to_directory(const char *pathbuf
, struct stat
*statbuf
)
2101 /* If symlink, write it as a directory anyway */
2102 if (S_ISLNK(statbuf
->st_mode
))
2103 statbuf
->st_mode
= S_IFDIR
| pg_dir_create_mode
;
2107 * Read some data from a file, setting a wait event and reporting any error
2110 * If partial_read_ok is false, also report an error if the number of bytes
2111 * read is not equal to the number of bytes requested.
2113 * Returns the number of bytes read.
2116 basebackup_read_file(int fd
, char *buf
, size_t nbytes
, off_t offset
,
2117 const char *filename
, bool partial_read_ok
)
2121 pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ
);
2122 rc
= pg_pread(fd
, buf
, nbytes
, offset
);
2123 pgstat_report_wait_end();
2127 (errcode_for_file_access(),
2128 errmsg("could not read file \"%s\": %m", filename
)));
2129 if (!partial_read_ok
&& rc
> 0 && rc
!= nbytes
)
2131 (errcode_for_file_access(),
2132 errmsg("could not read file \"%s\": read %zd of %zu",
2133 filename
, rc
, nbytes
)));