nbtree: fix read page recheck typo.
[pgsql.git] / src / backend / backup / basebackup.c
blob0f8cddcbeeba0d0ae84b2dcc942c09823467948e
/*-------------------------------------------------------------------------
 *
 * basebackup.c
 *	  code for taking a base backup and streaming it to a standby
 *
 * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/backup/basebackup.c
 *
 *-------------------------------------------------------------------------
 */
13 #include "postgres.h"
15 #include <sys/stat.h>
16 #include <unistd.h>
17 #include <time.h>
19 #include "access/xlog_internal.h"
20 #include "access/xlogbackup.h"
21 #include "backup/backup_manifest.h"
22 #include "backup/basebackup.h"
23 #include "backup/basebackup_incremental.h"
24 #include "backup/basebackup_sink.h"
25 #include "backup/basebackup_target.h"
26 #include "catalog/pg_tablespace_d.h"
27 #include "commands/defrem.h"
28 #include "common/compression.h"
29 #include "common/file_perm.h"
30 #include "common/file_utils.h"
31 #include "lib/stringinfo.h"
32 #include "miscadmin.h"
33 #include "nodes/pg_list.h"
34 #include "pgstat.h"
35 #include "pgtar.h"
36 #include "port.h"
37 #include "postmaster/syslogger.h"
38 #include "postmaster/walsummarizer.h"
39 #include "replication/slot.h"
40 #include "replication/walsender.h"
41 #include "replication/walsender_private.h"
42 #include "storage/bufpage.h"
43 #include "storage/checksum.h"
44 #include "storage/dsm_impl.h"
45 #include "storage/ipc.h"
46 #include "storage/reinit.h"
47 #include "utils/builtins.h"
48 #include "utils/guc.h"
49 #include "utils/ps_status.h"
50 #include "utils/relcache.h"
51 #include "utils/resowner.h"
/*
 * How much data do we want to send in one CopyData message? Note that
 * this may also result in reading the underlying files in chunks of this
 * size.
 *
 * NB: The buffer size is required to be a multiple of the system block
 * size, so use that value instead if it's bigger than our preference.
 */
#define SINK_BUFFER_LENGTH			Max(32768, BLCKSZ)
63 typedef struct
65 const char *label;
66 bool progress;
67 bool fastcheckpoint;
68 bool nowait;
69 bool includewal;
70 bool incremental;
71 uint32 maxrate;
72 bool sendtblspcmapfile;
73 bool send_to_client;
74 bool use_copytblspc;
75 BaseBackupTargetHandle *target_handle;
76 backup_manifest_option manifest;
77 pg_compress_algorithm compression;
78 pg_compress_specification compression_specification;
79 pg_checksum_type manifest_checksum_type;
80 } basebackup_options;
82 static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly,
83 struct backup_manifest_info *manifest,
84 IncrementalBackupInfo *ib);
85 static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
86 List *tablespaces, bool sendtblspclinks,
87 backup_manifest_info *manifest, Oid spcoid,
88 IncrementalBackupInfo *ib);
89 static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
90 struct stat *statbuf, bool missing_ok,
91 Oid dboid, Oid spcoid, RelFileNumber relfilenumber,
92 unsigned segno,
93 backup_manifest_info *manifest,
94 unsigned num_incremental_blocks,
95 BlockNumber *incremental_blocks,
96 unsigned truncation_block_length);
97 static off_t read_file_data_into_buffer(bbsink *sink,
98 const char *readfilename, int fd,
99 off_t offset, size_t length,
100 BlockNumber blkno,
101 bool verify_checksum,
102 int *checksum_failures);
103 static void push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx,
104 size_t *bytes_done, void *data, size_t length);
105 static bool verify_page_checksum(Page page, XLogRecPtr start_lsn,
106 BlockNumber blkno,
107 uint16 *expected_checksum);
108 static void sendFileWithContent(bbsink *sink, const char *filename,
109 const char *content, int len,
110 backup_manifest_info *manifest);
111 static int64 _tarWriteHeader(bbsink *sink, const char *filename,
112 const char *linktarget, struct stat *statbuf,
113 bool sizeonly);
114 static void _tarWritePadding(bbsink *sink, int len);
115 static void convert_link_to_directory(const char *pathbuf, struct stat *statbuf);
116 static void perform_base_backup(basebackup_options *opt, bbsink *sink,
117 IncrementalBackupInfo *ib);
118 static void parse_basebackup_options(List *options, basebackup_options *opt);
119 static int compareWalFileNames(const ListCell *a, const ListCell *b);
120 static ssize_t basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
121 const char *filename, bool partial_read_ok);
/* Was the backup currently in-progress initiated in recovery mode? */
static bool backup_started_in_recovery = false;

/* Total number of checksum failures during base backup. */
static long long int total_checksum_failures;

/* Do not verify checksums. */
static bool noverify_checksums = false;
/*
 * Definition of one element part of an exclusion list, used for paths part
 * of checksum validation or base backups.  "name" is the name of the file
 * or path to check for exclusion.  If "match_prefix" is true, any items
 * matching the name as prefix are excluded.
 */
struct exclude_list_item
{
	const char *name;
	bool		match_prefix;
};
145 * The contents of these directories are removed or recreated during server
146 * start so they are not included in backups. The directories themselves are
147 * kept and included as empty to preserve access permissions.
149 * Note: this list should be kept in sync with the filter lists in pg_rewind's
150 * filemap.c.
152 static const char *const excludeDirContents[] =
155 * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped
156 * because extensions like pg_stat_statements store data there.
158 PG_STAT_TMP_DIR,
161 * It is generally not useful to backup the contents of this directory
162 * even if the intention is to restore to another primary. See backup.sgml
163 * for a more detailed description.
165 PG_REPLSLOT_DIR,
167 /* Contents removed on startup, see dsm_cleanup_for_mmap(). */
168 PG_DYNSHMEM_DIR,
170 /* Contents removed on startup, see AsyncShmemInit(). */
171 "pg_notify",
174 * Old contents are loaded for possible debugging but are not required for
175 * normal operation, see SerialInit().
177 "pg_serial",
179 /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
180 "pg_snapshots",
182 /* Contents zeroed on startup, see StartupSUBTRANS(). */
183 "pg_subtrans",
185 /* end of list */
186 NULL
190 * List of files excluded from backups.
192 static const struct exclude_list_item excludeFiles[] =
194 /* Skip auto conf temporary file. */
195 {PG_AUTOCONF_FILENAME ".tmp", false},
197 /* Skip current log file temporary file */
198 {LOG_METAINFO_DATAFILE_TMP, false},
201 * Skip relation cache because it is rebuilt on startup. This includes
202 * temporary files.
204 {RELCACHE_INIT_FILENAME, true},
207 * backup_label and tablespace_map should not exist in a running cluster
208 * capable of doing an online backup, but exclude them just in case.
210 {BACKUP_LABEL_FILE, false},
211 {TABLESPACE_MAP, false},
214 * If there's a backup_manifest, it belongs to a backup that was used to
215 * start this server. It is *not* correct for this backup. Our
216 * backup_manifest is injected into the backup separately if users want
217 * it.
219 {"backup_manifest", false},
221 {"postmaster.pid", false},
222 {"postmaster.opts", false},
224 /* end of list */
225 {NULL, false}
229 * Actually do a base backup for the specified tablespaces.
231 * This is split out mainly to avoid complaints about "variable might be
232 * clobbered by longjmp" from stupider versions of gcc.
234 static void
235 perform_base_backup(basebackup_options *opt, bbsink *sink,
236 IncrementalBackupInfo *ib)
238 bbsink_state state;
239 XLogRecPtr endptr;
240 TimeLineID endtli;
241 backup_manifest_info manifest;
242 BackupState *backup_state;
243 StringInfo tablespace_map;
245 /* Initial backup state, insofar as we know it now. */
246 state.tablespaces = NIL;
247 state.tablespace_num = 0;
248 state.bytes_done = 0;
249 state.bytes_total = 0;
250 state.bytes_total_is_valid = false;
252 /* we're going to use a BufFile, so we need a ResourceOwner */
253 Assert(AuxProcessResourceOwner != NULL);
254 Assert(CurrentResourceOwner == AuxProcessResourceOwner ||
255 CurrentResourceOwner == NULL);
256 CurrentResourceOwner = AuxProcessResourceOwner;
258 backup_started_in_recovery = RecoveryInProgress();
260 InitializeBackupManifest(&manifest, opt->manifest,
261 opt->manifest_checksum_type);
263 total_checksum_failures = 0;
265 /* Allocate backup related variables. */
266 backup_state = (BackupState *) palloc0(sizeof(BackupState));
267 tablespace_map = makeStringInfo();
269 basebackup_progress_wait_checkpoint();
270 do_pg_backup_start(opt->label, opt->fastcheckpoint, &state.tablespaces,
271 backup_state, tablespace_map);
273 state.startptr = backup_state->startpoint;
274 state.starttli = backup_state->starttli;
277 * Once do_pg_backup_start has been called, ensure that any failure causes
278 * us to abort the backup so we don't "leak" a backup counter. For this
279 * reason, *all* functionality between do_pg_backup_start() and the end of
280 * do_pg_backup_stop() should be inside the error cleanup block!
283 PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
285 ListCell *lc;
286 tablespaceinfo *newti;
288 /* If this is an incremental backup, execute preparatory steps. */
289 if (ib != NULL)
290 PrepareForIncrementalBackup(ib, backup_state);
292 /* Add a node for the base directory at the end */
293 newti = palloc0(sizeof(tablespaceinfo));
294 newti->size = -1;
295 state.tablespaces = lappend(state.tablespaces, newti);
298 * Calculate the total backup size by summing up the size of each
299 * tablespace
301 if (opt->progress)
303 basebackup_progress_estimate_backup_size();
305 foreach(lc, state.tablespaces)
307 tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc);
309 if (tmp->path == NULL)
310 tmp->size = sendDir(sink, ".", 1, true, state.tablespaces,
311 true, NULL, InvalidOid, NULL);
312 else
313 tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true,
314 NULL, NULL);
315 state.bytes_total += tmp->size;
317 state.bytes_total_is_valid = true;
320 /* notify basebackup sink about start of backup */
321 bbsink_begin_backup(sink, &state, SINK_BUFFER_LENGTH);
323 /* Send off our tablespaces one by one */
324 foreach(lc, state.tablespaces)
326 tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
328 if (ti->path == NULL)
330 struct stat statbuf;
331 bool sendtblspclinks = true;
332 char *backup_label;
334 bbsink_begin_archive(sink, "base.tar");
336 /* In the main tar, include the backup_label first... */
337 backup_label = build_backup_content(backup_state, false);
338 sendFileWithContent(sink, BACKUP_LABEL_FILE,
339 backup_label, -1, &manifest);
340 pfree(backup_label);
342 /* Then the tablespace_map file, if required... */
343 if (opt->sendtblspcmapfile)
345 sendFileWithContent(sink, TABLESPACE_MAP,
346 tablespace_map->data, -1, &manifest);
347 sendtblspclinks = false;
350 /* Then the bulk of the files... */
351 sendDir(sink, ".", 1, false, state.tablespaces,
352 sendtblspclinks, &manifest, InvalidOid, ib);
354 /* ... and pg_control after everything else. */
355 if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
356 ereport(ERROR,
357 (errcode_for_file_access(),
358 errmsg("could not stat file \"%s\": %m",
359 XLOG_CONTROL_FILE)));
360 sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
361 false, InvalidOid, InvalidOid,
362 InvalidRelFileNumber, 0, &manifest, 0, NULL, 0);
364 else
366 char *archive_name = psprintf("%u.tar", ti->oid);
368 bbsink_begin_archive(sink, archive_name);
370 sendTablespace(sink, ti->path, ti->oid, false, &manifest, ib);
374 * If we're including WAL, and this is the main data directory we
375 * don't treat this as the end of the tablespace. Instead, we will
376 * include the xlog files below and stop afterwards. This is safe
377 * since the main data directory is always sent *last*.
379 if (opt->includewal && ti->path == NULL)
381 Assert(lnext(state.tablespaces, lc) == NULL);
383 else
385 /* Properly terminate the tarfile. */
386 StaticAssertDecl(2 * TAR_BLOCK_SIZE <= BLCKSZ,
387 "BLCKSZ too small for 2 tar blocks");
388 memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
389 bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);
391 /* OK, that's the end of the archive. */
392 bbsink_end_archive(sink);
396 basebackup_progress_wait_wal_archive(&state);
397 do_pg_backup_stop(backup_state, !opt->nowait);
399 endptr = backup_state->stoppoint;
400 endtli = backup_state->stoptli;
402 /* Deallocate backup-related variables. */
403 destroyStringInfo(tablespace_map);
404 pfree(backup_state);
406 PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
409 if (opt->includewal)
412 * We've left the last tar file "open", so we can now append the
413 * required WAL files to it.
415 char pathbuf[MAXPGPATH];
416 XLogSegNo segno;
417 XLogSegNo startsegno;
418 XLogSegNo endsegno;
419 struct stat statbuf;
420 List *historyFileList = NIL;
421 List *walFileList = NIL;
422 char firstoff[MAXFNAMELEN];
423 char lastoff[MAXFNAMELEN];
424 DIR *dir;
425 struct dirent *de;
426 ListCell *lc;
427 TimeLineID tli;
429 basebackup_progress_transfer_wal();
432 * I'd rather not worry about timelines here, so scan pg_wal and
433 * include all WAL files in the range between 'startptr' and 'endptr',
434 * regardless of the timeline the file is stamped with. If there are
435 * some spurious WAL files belonging to timelines that don't belong in
436 * this server's history, they will be included too. Normally there
437 * shouldn't be such files, but if there are, there's little harm in
438 * including them.
440 XLByteToSeg(state.startptr, startsegno, wal_segment_size);
441 XLogFileName(firstoff, state.starttli, startsegno, wal_segment_size);
442 XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
443 XLogFileName(lastoff, endtli, endsegno, wal_segment_size);
445 dir = AllocateDir("pg_wal");
446 while ((de = ReadDir(dir, "pg_wal")) != NULL)
448 /* Does it look like a WAL segment, and is it in the range? */
449 if (IsXLogFileName(de->d_name) &&
450 strcmp(de->d_name + 8, firstoff + 8) >= 0 &&
451 strcmp(de->d_name + 8, lastoff + 8) <= 0)
453 walFileList = lappend(walFileList, pstrdup(de->d_name));
455 /* Does it look like a timeline history file? */
456 else if (IsTLHistoryFileName(de->d_name))
458 historyFileList = lappend(historyFileList, pstrdup(de->d_name));
461 FreeDir(dir);
464 * Before we go any further, check that none of the WAL segments we
465 * need were removed.
467 CheckXLogRemoved(startsegno, state.starttli);
470 * Sort the WAL filenames. We want to send the files in order from
471 * oldest to newest, to reduce the chance that a file is recycled
472 * before we get a chance to send it over.
474 list_sort(walFileList, compareWalFileNames);
477 * There must be at least one xlog file in the pg_wal directory, since
478 * we are doing backup-including-xlog.
480 if (walFileList == NIL)
481 ereport(ERROR,
482 (errmsg("could not find any WAL files")));
485 * Sanity check: the first and last segment should cover startptr and
486 * endptr, with no gaps in between.
488 XLogFromFileName((char *) linitial(walFileList),
489 &tli, &segno, wal_segment_size);
490 if (segno != startsegno)
492 char startfname[MAXFNAMELEN];
494 XLogFileName(startfname, state.starttli, startsegno,
495 wal_segment_size);
496 ereport(ERROR,
497 (errmsg("could not find WAL file \"%s\"", startfname)));
499 foreach(lc, walFileList)
501 char *walFileName = (char *) lfirst(lc);
502 XLogSegNo currsegno = segno;
503 XLogSegNo nextsegno = segno + 1;
505 XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
506 if (!(nextsegno == segno || currsegno == segno))
508 char nextfname[MAXFNAMELEN];
510 XLogFileName(nextfname, tli, nextsegno, wal_segment_size);
511 ereport(ERROR,
512 (errmsg("could not find WAL file \"%s\"", nextfname)));
515 if (segno != endsegno)
517 char endfname[MAXFNAMELEN];
519 XLogFileName(endfname, endtli, endsegno, wal_segment_size);
520 ereport(ERROR,
521 (errmsg("could not find WAL file \"%s\"", endfname)));
524 /* Ok, we have everything we need. Send the WAL files. */
525 foreach(lc, walFileList)
527 char *walFileName = (char *) lfirst(lc);
528 int fd;
529 ssize_t cnt;
530 pgoff_t len = 0;
532 snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName);
533 XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
535 fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY);
536 if (fd < 0)
538 int save_errno = errno;
541 * Most likely reason for this is that the file was already
542 * removed by a checkpoint, so check for that to get a better
543 * error message.
545 CheckXLogRemoved(segno, tli);
547 errno = save_errno;
548 ereport(ERROR,
549 (errcode_for_file_access(),
550 errmsg("could not open file \"%s\": %m", pathbuf)));
553 if (fstat(fd, &statbuf) != 0)
554 ereport(ERROR,
555 (errcode_for_file_access(),
556 errmsg("could not stat file \"%s\": %m",
557 pathbuf)));
558 if (statbuf.st_size != wal_segment_size)
560 CheckXLogRemoved(segno, tli);
561 ereport(ERROR,
562 (errcode_for_file_access(),
563 errmsg("unexpected WAL file size \"%s\"", walFileName)));
566 /* send the WAL file itself */
567 _tarWriteHeader(sink, pathbuf, NULL, &statbuf, false);
569 while ((cnt = basebackup_read_file(fd, sink->bbs_buffer,
570 Min(sink->bbs_buffer_length,
571 wal_segment_size - len),
572 len, pathbuf, true)) > 0)
574 CheckXLogRemoved(segno, tli);
575 bbsink_archive_contents(sink, cnt);
577 len += cnt;
579 if (len == wal_segment_size)
580 break;
583 if (len != wal_segment_size)
585 CheckXLogRemoved(segno, tli);
586 ereport(ERROR,
587 (errcode_for_file_access(),
588 errmsg("unexpected WAL file size \"%s\"", walFileName)));
592 * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need
593 * for padding.
595 Assert(wal_segment_size % TAR_BLOCK_SIZE == 0);
597 CloseTransientFile(fd);
600 * Mark file as archived, otherwise files can get archived again
601 * after promotion of a new node. This is in line with
602 * walreceiver.c always doing an XLogArchiveForceDone() after a
603 * complete segment.
605 StatusFilePath(pathbuf, walFileName, ".done");
606 sendFileWithContent(sink, pathbuf, "", -1, &manifest);
610 * Send timeline history files too. Only the latest timeline history
611 * file is required for recovery, and even that only if there happens
612 * to be a timeline switch in the first WAL segment that contains the
613 * checkpoint record, or if we're taking a base backup from a standby
614 * server and the target timeline changes while the backup is taken.
615 * But they are small and highly useful for debugging purposes, so
616 * better include them all, always.
618 foreach(lc, historyFileList)
620 char *fname = lfirst(lc);
622 snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);
624 if (lstat(pathbuf, &statbuf) != 0)
625 ereport(ERROR,
626 (errcode_for_file_access(),
627 errmsg("could not stat file \"%s\": %m", pathbuf)));
629 sendFile(sink, pathbuf, pathbuf, &statbuf, false,
630 InvalidOid, InvalidOid, InvalidRelFileNumber, 0,
631 &manifest, 0, NULL, 0);
633 /* unconditionally mark file as archived */
634 StatusFilePath(pathbuf, fname, ".done");
635 sendFileWithContent(sink, pathbuf, "", -1, &manifest);
638 /* Properly terminate the tar file. */
639 StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ,
640 "BLCKSZ too small for 2 tar blocks");
641 memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
642 bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);
644 /* OK, that's the end of the archive. */
645 bbsink_end_archive(sink);
648 AddWALInfoToBackupManifest(&manifest, state.startptr, state.starttli,
649 endptr, endtli);
651 SendBackupManifest(&manifest, sink);
653 bbsink_end_backup(sink, endptr, endtli);
655 if (total_checksum_failures)
657 if (total_checksum_failures > 1)
658 ereport(WARNING,
659 (errmsg_plural("%lld total checksum verification failure",
660 "%lld total checksum verification failures",
661 total_checksum_failures,
662 total_checksum_failures)));
664 ereport(ERROR,
665 (errcode(ERRCODE_DATA_CORRUPTED),
666 errmsg("checksum verification failure during base backup")));
670 * Make sure to free the manifest before the resource owners as manifests
671 * use cryptohash contexts that may depend on resource owners (like
672 * OpenSSL).
674 FreeBackupManifest(&manifest);
676 /* clean up the resource owner we created */
677 ReleaseAuxProcessResources(true);
679 basebackup_progress_done();
683 * list_sort comparison function, to compare log/seg portion of WAL segment
684 * filenames, ignoring the timeline portion.
686 static int
687 compareWalFileNames(const ListCell *a, const ListCell *b)
689 char *fna = (char *) lfirst(a);
690 char *fnb = (char *) lfirst(b);
692 return strcmp(fna + 8, fnb + 8);
696 * Parse the base backup options passed down by the parser
698 static void
699 parse_basebackup_options(List *options, basebackup_options *opt)
701 ListCell *lopt;
702 bool o_label = false;
703 bool o_progress = false;
704 bool o_checkpoint = false;
705 bool o_nowait = false;
706 bool o_wal = false;
707 bool o_incremental = false;
708 bool o_maxrate = false;
709 bool o_tablespace_map = false;
710 bool o_noverify_checksums = false;
711 bool o_manifest = false;
712 bool o_manifest_checksums = false;
713 bool o_target = false;
714 bool o_target_detail = false;
715 char *target_str = NULL;
716 char *target_detail_str = NULL;
717 bool o_compression = false;
718 bool o_compression_detail = false;
719 char *compression_detail_str = NULL;
721 MemSet(opt, 0, sizeof(*opt));
722 opt->manifest = MANIFEST_OPTION_NO;
723 opt->manifest_checksum_type = CHECKSUM_TYPE_CRC32C;
724 opt->compression = PG_COMPRESSION_NONE;
725 opt->compression_specification.algorithm = PG_COMPRESSION_NONE;
727 foreach(lopt, options)
729 DefElem *defel = (DefElem *) lfirst(lopt);
731 if (strcmp(defel->defname, "label") == 0)
733 if (o_label)
734 ereport(ERROR,
735 (errcode(ERRCODE_SYNTAX_ERROR),
736 errmsg("duplicate option \"%s\"", defel->defname)));
737 opt->label = defGetString(defel);
738 o_label = true;
740 else if (strcmp(defel->defname, "progress") == 0)
742 if (o_progress)
743 ereport(ERROR,
744 (errcode(ERRCODE_SYNTAX_ERROR),
745 errmsg("duplicate option \"%s\"", defel->defname)));
746 opt->progress = defGetBoolean(defel);
747 o_progress = true;
749 else if (strcmp(defel->defname, "checkpoint") == 0)
751 char *optval = defGetString(defel);
753 if (o_checkpoint)
754 ereport(ERROR,
755 (errcode(ERRCODE_SYNTAX_ERROR),
756 errmsg("duplicate option \"%s\"", defel->defname)));
757 if (pg_strcasecmp(optval, "fast") == 0)
758 opt->fastcheckpoint = true;
759 else if (pg_strcasecmp(optval, "spread") == 0)
760 opt->fastcheckpoint = false;
761 else
762 ereport(ERROR,
763 (errcode(ERRCODE_SYNTAX_ERROR),
764 errmsg("unrecognized checkpoint type: \"%s\"",
765 optval)));
766 o_checkpoint = true;
768 else if (strcmp(defel->defname, "wait") == 0)
770 if (o_nowait)
771 ereport(ERROR,
772 (errcode(ERRCODE_SYNTAX_ERROR),
773 errmsg("duplicate option \"%s\"", defel->defname)));
774 opt->nowait = !defGetBoolean(defel);
775 o_nowait = true;
777 else if (strcmp(defel->defname, "wal") == 0)
779 if (o_wal)
780 ereport(ERROR,
781 (errcode(ERRCODE_SYNTAX_ERROR),
782 errmsg("duplicate option \"%s\"", defel->defname)));
783 opt->includewal = defGetBoolean(defel);
784 o_wal = true;
786 else if (strcmp(defel->defname, "incremental") == 0)
788 if (o_incremental)
789 ereport(ERROR,
790 (errcode(ERRCODE_SYNTAX_ERROR),
791 errmsg("duplicate option \"%s\"", defel->defname)));
792 opt->incremental = defGetBoolean(defel);
793 if (opt->incremental && !summarize_wal)
794 ereport(ERROR,
795 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
796 errmsg("incremental backups cannot be taken unless WAL summarization is enabled")));
797 o_incremental = true;
799 else if (strcmp(defel->defname, "max_rate") == 0)
801 int64 maxrate;
803 if (o_maxrate)
804 ereport(ERROR,
805 (errcode(ERRCODE_SYNTAX_ERROR),
806 errmsg("duplicate option \"%s\"", defel->defname)));
808 maxrate = defGetInt64(defel);
809 if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
810 ereport(ERROR,
811 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
812 errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
813 (int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER)));
815 opt->maxrate = (uint32) maxrate;
816 o_maxrate = true;
818 else if (strcmp(defel->defname, "tablespace_map") == 0)
820 if (o_tablespace_map)
821 ereport(ERROR,
822 (errcode(ERRCODE_SYNTAX_ERROR),
823 errmsg("duplicate option \"%s\"", defel->defname)));
824 opt->sendtblspcmapfile = defGetBoolean(defel);
825 o_tablespace_map = true;
827 else if (strcmp(defel->defname, "verify_checksums") == 0)
829 if (o_noverify_checksums)
830 ereport(ERROR,
831 (errcode(ERRCODE_SYNTAX_ERROR),
832 errmsg("duplicate option \"%s\"", defel->defname)));
833 noverify_checksums = !defGetBoolean(defel);
834 o_noverify_checksums = true;
836 else if (strcmp(defel->defname, "manifest") == 0)
838 char *optval = defGetString(defel);
839 bool manifest_bool;
841 if (o_manifest)
842 ereport(ERROR,
843 (errcode(ERRCODE_SYNTAX_ERROR),
844 errmsg("duplicate option \"%s\"", defel->defname)));
845 if (parse_bool(optval, &manifest_bool))
847 if (manifest_bool)
848 opt->manifest = MANIFEST_OPTION_YES;
849 else
850 opt->manifest = MANIFEST_OPTION_NO;
852 else if (pg_strcasecmp(optval, "force-encode") == 0)
853 opt->manifest = MANIFEST_OPTION_FORCE_ENCODE;
854 else
855 ereport(ERROR,
856 (errcode(ERRCODE_SYNTAX_ERROR),
857 errmsg("unrecognized manifest option: \"%s\"",
858 optval)));
859 o_manifest = true;
861 else if (strcmp(defel->defname, "manifest_checksums") == 0)
863 char *optval = defGetString(defel);
865 if (o_manifest_checksums)
866 ereport(ERROR,
867 (errcode(ERRCODE_SYNTAX_ERROR),
868 errmsg("duplicate option \"%s\"", defel->defname)));
869 if (!pg_checksum_parse_type(optval,
870 &opt->manifest_checksum_type))
871 ereport(ERROR,
872 (errcode(ERRCODE_SYNTAX_ERROR),
873 errmsg("unrecognized checksum algorithm: \"%s\"",
874 optval)));
875 o_manifest_checksums = true;
877 else if (strcmp(defel->defname, "target") == 0)
879 if (o_target)
880 ereport(ERROR,
881 (errcode(ERRCODE_SYNTAX_ERROR),
882 errmsg("duplicate option \"%s\"", defel->defname)));
883 target_str = defGetString(defel);
884 o_target = true;
886 else if (strcmp(defel->defname, "target_detail") == 0)
888 char *optval = defGetString(defel);
890 if (o_target_detail)
891 ereport(ERROR,
892 (errcode(ERRCODE_SYNTAX_ERROR),
893 errmsg("duplicate option \"%s\"", defel->defname)));
894 target_detail_str = optval;
895 o_target_detail = true;
897 else if (strcmp(defel->defname, "compression") == 0)
899 char *optval = defGetString(defel);
901 if (o_compression)
902 ereport(ERROR,
903 (errcode(ERRCODE_SYNTAX_ERROR),
904 errmsg("duplicate option \"%s\"", defel->defname)));
905 if (!parse_compress_algorithm(optval, &opt->compression))
906 ereport(ERROR,
907 (errcode(ERRCODE_SYNTAX_ERROR),
908 errmsg("unrecognized compression algorithm: \"%s\"",
909 optval)));
910 o_compression = true;
912 else if (strcmp(defel->defname, "compression_detail") == 0)
914 if (o_compression_detail)
915 ereport(ERROR,
916 (errcode(ERRCODE_SYNTAX_ERROR),
917 errmsg("duplicate option \"%s\"", defel->defname)));
918 compression_detail_str = defGetString(defel);
919 o_compression_detail = true;
921 else
922 ereport(ERROR,
923 (errcode(ERRCODE_SYNTAX_ERROR),
924 errmsg("unrecognized base backup option: \"%s\"",
925 defel->defname)));
928 if (opt->label == NULL)
929 opt->label = "base backup";
930 if (opt->manifest == MANIFEST_OPTION_NO)
932 if (o_manifest_checksums)
933 ereport(ERROR,
934 (errcode(ERRCODE_SYNTAX_ERROR),
935 errmsg("manifest checksums require a backup manifest")));
936 opt->manifest_checksum_type = CHECKSUM_TYPE_NONE;
939 if (target_str == NULL)
941 if (target_detail_str != NULL)
942 ereport(ERROR,
943 (errcode(ERRCODE_SYNTAX_ERROR),
944 errmsg("target detail cannot be used without target")));
945 opt->use_copytblspc = true;
946 opt->send_to_client = true;
948 else if (strcmp(target_str, "client") == 0)
950 if (target_detail_str != NULL)
951 ereport(ERROR,
952 (errcode(ERRCODE_SYNTAX_ERROR),
953 errmsg("target \"%s\" does not accept a target detail",
954 target_str)));
955 opt->send_to_client = true;
957 else
958 opt->target_handle =
959 BaseBackupGetTargetHandle(target_str, target_detail_str);
961 if (o_compression_detail && !o_compression)
962 ereport(ERROR,
963 (errcode(ERRCODE_SYNTAX_ERROR),
964 errmsg("compression detail cannot be specified unless compression is enabled")));
966 if (o_compression)
968 char *error_detail;
970 parse_compress_specification(opt->compression, compression_detail_str,
971 &opt->compression_specification);
972 error_detail =
973 validate_compress_specification(&opt->compression_specification);
974 if (error_detail != NULL)
975 ereport(ERROR,
976 errcode(ERRCODE_SYNTAX_ERROR),
977 errmsg("invalid compression specification: %s",
978 error_detail));
984 * SendBaseBackup() - send a complete base backup.
986 * The function will put the system into backup mode like pg_backup_start()
987 * does, so that the backup is consistent even though we read directly from
988 * the filesystem, bypassing the buffer cache.
990 void
991 SendBaseBackup(BaseBackupCmd *cmd, IncrementalBackupInfo *ib)
993 basebackup_options opt;
994 bbsink *sink;
995 SessionBackupState status = get_backup_status();
997 if (status == SESSION_BACKUP_RUNNING)
998 ereport(ERROR,
999 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1000 errmsg("a backup is already in progress in this session")));
1002 parse_basebackup_options(cmd->options, &opt);
1004 WalSndSetState(WALSNDSTATE_BACKUP);
1006 if (update_process_title)
1008 char activitymsg[50];
1010 snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"",
1011 opt.label);
1012 set_ps_display(activitymsg);
1016 * If we're asked to perform an incremental backup and the user has not
1017 * supplied a manifest, that's an ERROR.
1019 * If we're asked to perform a full backup and the user did supply a
1020 * manifest, just ignore it.
1022 if (!opt.incremental)
1023 ib = NULL;
1024 else if (ib == NULL)
1025 ereport(ERROR,
1026 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1027 errmsg("must UPLOAD_MANIFEST before performing an incremental BASE_BACKUP")));
1030 * If the target is specifically 'client' then set up to stream the backup
1031 * to the client; otherwise, it's being sent someplace else and should not
1032 * be sent to the client. BaseBackupGetSink has the job of setting up a
1033 * sink to send the backup data wherever it needs to go.
1035 sink = bbsink_copystream_new(opt.send_to_client);
1036 if (opt.target_handle != NULL)
1037 sink = BaseBackupGetSink(opt.target_handle, sink);
1039 /* Set up network throttling, if client requested it */
1040 if (opt.maxrate > 0)
1041 sink = bbsink_throttle_new(sink, opt.maxrate);
1043 /* Set up server-side compression, if client requested it */
1044 if (opt.compression == PG_COMPRESSION_GZIP)
1045 sink = bbsink_gzip_new(sink, &opt.compression_specification);
1046 else if (opt.compression == PG_COMPRESSION_LZ4)
1047 sink = bbsink_lz4_new(sink, &opt.compression_specification);
1048 else if (opt.compression == PG_COMPRESSION_ZSTD)
1049 sink = bbsink_zstd_new(sink, &opt.compression_specification);
1051 /* Set up progress reporting. */
1052 sink = bbsink_progress_new(sink, opt.progress);
1055 * Perform the base backup, but make sure we clean up the bbsink even if
1056 * an error occurs.
1058 PG_TRY();
1060 perform_base_backup(&opt, sink, ib);
1062 PG_FINALLY();
1064 bbsink_cleanup(sink);
1066 PG_END_TRY();
1070 * Inject a file with given name and content in the output tar stream.
1072 * "len" can optionally be set to an arbitrary length of data sent. If set
1073 * to -1, the content sent is treated as a string with strlen() as length.
1075 static void
1076 sendFileWithContent(bbsink *sink, const char *filename, const char *content,
1077 int len, backup_manifest_info *manifest)
1079 struct stat statbuf;
1080 int bytes_done = 0;
1081 pg_checksum_context checksum_ctx;
1083 if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
1084 elog(ERROR, "could not initialize checksum of file \"%s\"",
1085 filename);
1087 if (len < 0)
1088 len = strlen(content);
1091 * Construct a stat struct for the file we're injecting in the tar.
1094 /* Windows doesn't have the concept of uid and gid */
1095 #ifdef WIN32
1096 statbuf.st_uid = 0;
1097 statbuf.st_gid = 0;
1098 #else
1099 statbuf.st_uid = geteuid();
1100 statbuf.st_gid = getegid();
1101 #endif
1102 statbuf.st_mtime = time(NULL);
1103 statbuf.st_mode = pg_file_create_mode;
1104 statbuf.st_size = len;
1106 _tarWriteHeader(sink, filename, NULL, &statbuf, false);
1108 if (pg_checksum_update(&checksum_ctx, (uint8 *) content, len) < 0)
1109 elog(ERROR, "could not update checksum of file \"%s\"",
1110 filename);
1112 while (bytes_done < len)
1114 size_t remaining = len - bytes_done;
1115 size_t nbytes = Min(sink->bbs_buffer_length, remaining);
1117 memcpy(sink->bbs_buffer, content, nbytes);
1118 bbsink_archive_contents(sink, nbytes);
1119 bytes_done += nbytes;
1120 content += nbytes;
1123 _tarWritePadding(sink, len);
1125 AddFileToBackupManifest(manifest, InvalidOid, filename, len,
1126 (pg_time_t) statbuf.st_mtime, &checksum_ctx);
1130 * Include the tablespace directory pointed to by 'path' in the output tar
1131 * stream. If 'sizeonly' is true, we just calculate a total length and return
1132 * it, without actually sending anything.
1134 * Only used to send auxiliary tablespaces, not PGDATA.
1136 static int64
1137 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly,
1138 backup_manifest_info *manifest, IncrementalBackupInfo *ib)
1140 int64 size;
1141 char pathbuf[MAXPGPATH];
1142 struct stat statbuf;
1145 * 'path' points to the tablespace location, but we only want to include
1146 * the version directory in it that belongs to us.
1148 snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path,
1149 TABLESPACE_VERSION_DIRECTORY);
1152 * Store a directory entry in the tar file so we get the permissions
1153 * right.
1155 if (lstat(pathbuf, &statbuf) != 0)
1157 if (errno != ENOENT)
1158 ereport(ERROR,
1159 (errcode_for_file_access(),
1160 errmsg("could not stat file or directory \"%s\": %m",
1161 pathbuf)));
1163 /* If the tablespace went away while scanning, it's no error. */
1164 return 0;
1167 size = _tarWriteHeader(sink, TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf,
1168 sizeonly);
1170 /* Send all the files in the tablespace version directory */
1171 size += sendDir(sink, pathbuf, strlen(path), sizeonly, NIL, true, manifest,
1172 spcoid, ib);
1174 return size;
1178 * Include all files from the given directory in the output tar stream. If
1179 * 'sizeonly' is true, we just calculate a total length and return it, without
1180 * actually sending anything.
1182 * Omit any directory in the tablespaces list, to avoid backing up
1183 * tablespaces twice when they were created inside PGDATA.
1185 * If sendtblspclinks is true, we need to include symlink
1186 * information in the tar file. If not, we can skip that
1187 * as it will be sent separately in the tablespace_map file.
1189 static int64
1190 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
1191 List *tablespaces, bool sendtblspclinks, backup_manifest_info *manifest,
1192 Oid spcoid, IncrementalBackupInfo *ib)
1194 DIR *dir;
1195 struct dirent *de;
1196 char pathbuf[MAXPGPATH * 2];
1197 struct stat statbuf;
1198 int64 size = 0;
1199 const char *lastDir; /* Split last dir from parent path. */
1200 bool isRelationDir = false; /* Does directory contain relations? */
1201 bool isGlobalDir = false;
1202 Oid dboid = InvalidOid;
1203 BlockNumber *relative_block_numbers = NULL;
1206 * Since this array is relatively large, avoid putting it on the stack.
1207 * But we don't need it at all if this is not an incremental backup.
1209 if (ib != NULL)
1210 relative_block_numbers = palloc(sizeof(BlockNumber) * RELSEG_SIZE);
1213 * Determine if the current path is a database directory that can contain
1214 * relations.
1216 * Start by finding the location of the delimiter between the parent path
1217 * and the current path.
1219 lastDir = last_dir_separator(path);
1221 /* Does this path look like a database path (i.e. all digits)? */
1222 if (lastDir != NULL &&
1223 strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
1225 /* Part of path that contains the parent directory. */
1226 int parentPathLen = lastDir - path;
1229 * Mark path as a database directory if the parent path is either
1230 * $PGDATA/base or a tablespace version path.
1232 if (strncmp(path, "./base", parentPathLen) == 0 ||
1233 (parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) &&
1234 strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1),
1235 TABLESPACE_VERSION_DIRECTORY,
1236 sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
1238 isRelationDir = true;
1239 dboid = atooid(lastDir + 1);
1242 else if (strcmp(path, "./global") == 0)
1244 isRelationDir = true;
1245 isGlobalDir = true;
1248 dir = AllocateDir(path);
1249 while ((de = ReadDir(dir, path)) != NULL)
1251 int excludeIdx;
1252 bool excludeFound;
1253 RelFileNumber relfilenumber = InvalidRelFileNumber;
1254 ForkNumber relForkNum = InvalidForkNumber;
1255 unsigned segno = 0;
1256 bool isRelationFile = false;
1258 /* Skip special stuff */
1259 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
1260 continue;
1262 /* Skip temporary files */
1263 if (strncmp(de->d_name,
1264 PG_TEMP_FILE_PREFIX,
1265 strlen(PG_TEMP_FILE_PREFIX)) == 0)
1266 continue;
1268 /* Skip macOS system files */
1269 if (strcmp(de->d_name, ".DS_Store") == 0)
1270 continue;
1273 * Check if the postmaster has signaled us to exit, and abort with an
1274 * error in that case. The error handler further up will call
1275 * do_pg_abort_backup() for us. Also check that if the backup was
1276 * started while still in recovery, the server wasn't promoted.
1277 * do_pg_backup_stop() will check that too, but it's better to stop
1278 * the backup early than continue to the end and fail there.
1280 CHECK_FOR_INTERRUPTS();
1281 if (RecoveryInProgress() != backup_started_in_recovery)
1282 ereport(ERROR,
1283 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1284 errmsg("the standby was promoted during online backup"),
1285 errhint("This means that the backup being taken is corrupt "
1286 "and should not be used. "
1287 "Try taking another online backup.")));
1289 /* Scan for files that should be excluded */
1290 excludeFound = false;
1291 for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++)
1293 int cmplen = strlen(excludeFiles[excludeIdx].name);
1295 if (!excludeFiles[excludeIdx].match_prefix)
1296 cmplen++;
1297 if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0)
1299 elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
1300 excludeFound = true;
1301 break;
1305 if (excludeFound)
1306 continue;
1309 * If there could be non-temporary relation files in this directory,
1310 * try to parse the filename.
1312 if (isRelationDir)
1313 isRelationFile =
1314 parse_filename_for_nontemp_relation(de->d_name,
1315 &relfilenumber,
1316 &relForkNum, &segno);
1318 /* Exclude all forks for unlogged tables except the init fork */
1319 if (isRelationFile && relForkNum != INIT_FORKNUM)
1321 char initForkFile[MAXPGPATH];
1324 * If any other type of fork, check if there is an init fork with
1325 * the same RelFileNumber. If so, the file can be excluded.
1327 snprintf(initForkFile, sizeof(initForkFile), "%s/%u_init",
1328 path, relfilenumber);
1330 if (lstat(initForkFile, &statbuf) == 0)
1332 elog(DEBUG2,
1333 "unlogged relation file \"%s\" excluded from backup",
1334 de->d_name);
1336 continue;
1340 /* Exclude temporary relations */
1341 if (OidIsValid(dboid) && looks_like_temp_rel_name(de->d_name))
1343 elog(DEBUG2,
1344 "temporary relation file \"%s\" excluded from backup",
1345 de->d_name);
1347 continue;
1350 snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);
1352 /* Skip pg_control here to back up it last */
1353 if (strcmp(pathbuf, "./global/pg_control") == 0)
1354 continue;
1356 if (lstat(pathbuf, &statbuf) != 0)
1358 if (errno != ENOENT)
1359 ereport(ERROR,
1360 (errcode_for_file_access(),
1361 errmsg("could not stat file or directory \"%s\": %m",
1362 pathbuf)));
1364 /* If the file went away while scanning, it's not an error. */
1365 continue;
1368 /* Scan for directories whose contents should be excluded */
1369 excludeFound = false;
1370 for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
1372 if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
1374 elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
1375 convert_link_to_directory(pathbuf, &statbuf);
1376 size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
1377 &statbuf, sizeonly);
1378 excludeFound = true;
1379 break;
1383 if (excludeFound)
1384 continue;
1387 * We can skip pg_wal, the WAL segments need to be fetched from the
1388 * WAL archive anyway. But include it as an empty directory anyway, so
1389 * we get permissions right.
1391 if (strcmp(pathbuf, "./pg_wal") == 0)
1393 /* If pg_wal is a symlink, write it as a directory anyway */
1394 convert_link_to_directory(pathbuf, &statbuf);
1395 size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
1396 &statbuf, sizeonly);
1399 * Also send archive_status and summaries directories (by
1400 * hackishly reusing statbuf from above ...).
1402 size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL,
1403 &statbuf, sizeonly);
1404 size += _tarWriteHeader(sink, "./pg_wal/summaries", NULL,
1405 &statbuf, sizeonly);
1407 continue; /* don't recurse into pg_wal */
1410 /* Allow symbolic links in pg_tblspc only */
1411 if (strcmp(path, "./pg_tblspc") == 0 && S_ISLNK(statbuf.st_mode))
1413 char linkpath[MAXPGPATH];
1414 int rllen;
1416 rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
1417 if (rllen < 0)
1418 ereport(ERROR,
1419 (errcode_for_file_access(),
1420 errmsg("could not read symbolic link \"%s\": %m",
1421 pathbuf)));
1422 if (rllen >= sizeof(linkpath))
1423 ereport(ERROR,
1424 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1425 errmsg("symbolic link \"%s\" target is too long",
1426 pathbuf)));
1427 linkpath[rllen] = '\0';
1429 size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, linkpath,
1430 &statbuf, sizeonly);
1432 else if (S_ISDIR(statbuf.st_mode))
1434 bool skip_this_dir = false;
1435 ListCell *lc;
1438 * Store a directory entry in the tar file so we can get the
1439 * permissions right.
1441 size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL, &statbuf,
1442 sizeonly);
1445 * Call ourselves recursively for a directory, unless it happens
1446 * to be a separate tablespace located within PGDATA.
1448 foreach(lc, tablespaces)
1450 tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
1453 * ti->rpath is the tablespace relative path within PGDATA, or
1454 * NULL if the tablespace has been properly located somewhere
1455 * else.
1457 * Skip past the leading "./" in pathbuf when comparing.
1459 if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
1461 skip_this_dir = true;
1462 break;
1467 * skip sending directories inside pg_tblspc, if not required.
1469 if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
1470 skip_this_dir = true;
1472 if (!skip_this_dir)
1473 size += sendDir(sink, pathbuf, basepathlen, sizeonly, tablespaces,
1474 sendtblspclinks, manifest, spcoid, ib);
1476 else if (S_ISREG(statbuf.st_mode))
1478 bool sent = false;
1479 unsigned num_blocks_required = 0;
1480 unsigned truncation_block_length = 0;
1481 char tarfilenamebuf[MAXPGPATH * 2];
1482 char *tarfilename = pathbuf + basepathlen + 1;
1483 FileBackupMethod method = BACK_UP_FILE_FULLY;
1485 if (ib != NULL && isRelationFile)
1487 Oid relspcoid;
1488 char *lookup_path;
1490 if (OidIsValid(spcoid))
1492 relspcoid = spcoid;
1493 lookup_path = psprintf("%s/%u/%s", PG_TBLSPC_DIR, spcoid,
1494 tarfilename);
1496 else
1498 if (isGlobalDir)
1499 relspcoid = GLOBALTABLESPACE_OID;
1500 else
1501 relspcoid = DEFAULTTABLESPACE_OID;
1502 lookup_path = pstrdup(tarfilename);
1505 method = GetFileBackupMethod(ib, lookup_path, dboid, relspcoid,
1506 relfilenumber, relForkNum,
1507 segno, statbuf.st_size,
1508 &num_blocks_required,
1509 relative_block_numbers,
1510 &truncation_block_length);
1511 if (method == BACK_UP_FILE_INCREMENTALLY)
1513 statbuf.st_size =
1514 GetIncrementalFileSize(num_blocks_required);
1515 snprintf(tarfilenamebuf, sizeof(tarfilenamebuf),
1516 "%s/INCREMENTAL.%s",
1517 path + basepathlen + 1,
1518 de->d_name);
1519 tarfilename = tarfilenamebuf;
1522 pfree(lookup_path);
1525 if (!sizeonly)
1526 sent = sendFile(sink, pathbuf, tarfilename, &statbuf,
1527 true, dboid, spcoid,
1528 relfilenumber, segno, manifest,
1529 num_blocks_required,
1530 method == BACK_UP_FILE_INCREMENTALLY ? relative_block_numbers : NULL,
1531 truncation_block_length);
1533 if (sent || sizeonly)
1535 /* Add size. */
1536 size += statbuf.st_size;
1538 /* Pad to a multiple of the tar block size. */
1539 size += tarPaddingBytesRequired(statbuf.st_size);
1541 /* Size of the header for the file. */
1542 size += TAR_BLOCK_SIZE;
1545 else
1546 ereport(WARNING,
1547 (errmsg("skipping special file \"%s\"", pathbuf)));
1550 if (relative_block_numbers != NULL)
1551 pfree(relative_block_numbers);
1553 FreeDir(dir);
1554 return size;
1558 * Given the member, write the TAR header & send the file.
1560 * If 'missing_ok' is true, will not throw an error if the file is not found.
1562 * If dboid is anything other than InvalidOid then any checksum failures
1563 * detected will get reported to the cumulative stats system.
1565 * If the file is to be sent incrementally, then num_incremental_blocks
1566 * should be the number of blocks to be sent, and incremental_blocks
1567 * an array of block numbers relative to the start of the current segment.
1568 * If the whole file is to be sent, then incremental_blocks should be NULL,
1569 * and num_incremental_blocks can have any value, as it will be ignored.
1571 * Returns true if the file was successfully sent, false if 'missing_ok',
1572 * and the file did not exist.
1574 static bool
1575 sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1576 struct stat *statbuf, bool missing_ok, Oid dboid, Oid spcoid,
1577 RelFileNumber relfilenumber, unsigned segno,
1578 backup_manifest_info *manifest, unsigned num_incremental_blocks,
1579 BlockNumber *incremental_blocks, unsigned truncation_block_length)
1581 int fd;
1582 BlockNumber blkno = 0;
1583 int checksum_failures = 0;
1584 off_t cnt;
1585 pgoff_t bytes_done = 0;
1586 bool verify_checksum = false;
1587 pg_checksum_context checksum_ctx;
1588 int ibindex = 0;
1590 if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
1591 elog(ERROR, "could not initialize checksum of file \"%s\"",
1592 readfilename);
1594 fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY);
1595 if (fd < 0)
1597 if (errno == ENOENT && missing_ok)
1598 return false;
1599 ereport(ERROR,
1600 (errcode_for_file_access(),
1601 errmsg("could not open file \"%s\": %m", readfilename)));
1604 _tarWriteHeader(sink, tarfilename, NULL, statbuf, false);
1607 * Checksums are verified in multiples of BLCKSZ, so the buffer length
1608 * should be a multiple of the block size as well.
1610 Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
1613 * If we weren't told not to verify checksums, and if checksums are
1614 * enabled for this cluster, and if this is a relation file, then verify
1615 * the checksum.
1617 if (!noverify_checksums && DataChecksumsEnabled() &&
1618 RelFileNumberIsValid(relfilenumber))
1619 verify_checksum = true;
1622 * If we're sending an incremental file, write the file header.
1624 if (incremental_blocks != NULL)
1626 unsigned magic = INCREMENTAL_MAGIC;
1627 size_t header_bytes_done = 0;
1628 char padding[BLCKSZ];
1629 size_t paddinglen;
1631 /* Emit header data. */
1632 push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1633 &magic, sizeof(magic));
1634 push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1635 &num_incremental_blocks, sizeof(num_incremental_blocks));
1636 push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1637 &truncation_block_length, sizeof(truncation_block_length));
1638 push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1639 incremental_blocks,
1640 sizeof(BlockNumber) * num_incremental_blocks);
1643 * Add padding to align header to a multiple of BLCKSZ, but only if
1644 * the incremental file has some blocks, and the alignment is actually
1645 * needed (i.e. header is not already a multiple of BLCKSZ). If there
1646 * are no blocks we don't want to make the file unnecessarily large,
1647 * as that might make some filesystem optimizations impossible.
1649 if ((num_incremental_blocks > 0) && (header_bytes_done % BLCKSZ != 0))
1651 paddinglen = (BLCKSZ - (header_bytes_done % BLCKSZ));
1653 memset(padding, 0, paddinglen);
1654 bytes_done += paddinglen;
1656 push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1657 padding, paddinglen);
1660 /* Flush out any data still in the buffer so it's again empty. */
1661 if (header_bytes_done > 0)
1663 bbsink_archive_contents(sink, header_bytes_done);
1664 if (pg_checksum_update(&checksum_ctx,
1665 (uint8 *) sink->bbs_buffer,
1666 header_bytes_done) < 0)
1667 elog(ERROR, "could not update checksum of base backup");
1670 /* Update our notion of file position. */
1671 bytes_done += sizeof(magic);
1672 bytes_done += sizeof(num_incremental_blocks);
1673 bytes_done += sizeof(truncation_block_length);
1674 bytes_done += sizeof(BlockNumber) * num_incremental_blocks;
1678 * Loop until we read the amount of data the caller told us to expect. The
1679 * file could be longer, if it was extended while we were sending it, but
1680 * for a base backup we can ignore such extended data. It will be restored
1681 * from WAL.
1683 while (1)
1686 * Determine whether we've read all the data that we need, and if not,
1687 * read some more.
1689 if (incremental_blocks == NULL)
1691 size_t remaining = statbuf->st_size - bytes_done;
1694 * If we've read the required number of bytes, then it's time to
1695 * stop.
1697 if (bytes_done >= statbuf->st_size)
1698 break;
1701 * Read as many bytes as will fit in the buffer, or however many
1702 * are left to read, whichever is less.
1704 cnt = read_file_data_into_buffer(sink, readfilename, fd,
1705 bytes_done, remaining,
1706 blkno + segno * RELSEG_SIZE,
1707 verify_checksum,
1708 &checksum_failures);
1710 else
1712 BlockNumber relative_blkno;
1715 * If we've read all the blocks, then it's time to stop.
1717 if (ibindex >= num_incremental_blocks)
1718 break;
1721 * Read just one block, whichever one is the next that we're
1722 * supposed to include.
1724 relative_blkno = incremental_blocks[ibindex++];
1725 cnt = read_file_data_into_buffer(sink, readfilename, fd,
1726 relative_blkno * BLCKSZ,
1727 BLCKSZ,
1728 relative_blkno + segno * RELSEG_SIZE,
1729 verify_checksum,
1730 &checksum_failures);
1733 * If we get a partial read, that must mean that the relation is
1734 * being truncated. Ultimately, it should be truncated to a
1735 * multiple of BLCKSZ, since this path should only be reached for
1736 * relation files, but we might transiently observe an
1737 * intermediate value.
1739 * It should be fine to treat this just as if the entire block had
1740 * been truncated away - i.e. fill this and all later blocks with
1741 * zeroes. WAL replay will fix things up.
1743 if (cnt < BLCKSZ)
1744 break;
1748 * If the amount of data we were able to read was not a multiple of
1749 * BLCKSZ, we cannot verify checksums, which are block-level.
1751 if (verify_checksum && (cnt % BLCKSZ != 0))
1753 ereport(WARNING,
1754 (errmsg("could not verify checksum in file \"%s\", block "
1755 "%u: read buffer size %d and page size %d "
1756 "differ",
1757 readfilename, blkno, (int) cnt, BLCKSZ)));
1758 verify_checksum = false;
1762 * If we hit end-of-file, a concurrent truncation must have occurred.
1763 * That's not an error condition, because WAL replay will fix things
1764 * up.
1766 if (cnt == 0)
1767 break;
1769 /* Update block number and # of bytes done for next loop iteration. */
1770 blkno += cnt / BLCKSZ;
1771 bytes_done += cnt;
1774 * Make sure incremental files with block data are properly aligned
1775 * (header is a multiple of BLCKSZ, blocks are BLCKSZ too).
1777 Assert(!((incremental_blocks != NULL && num_incremental_blocks > 0) &&
1778 (bytes_done % BLCKSZ != 0)));
1780 /* Archive the data we just read. */
1781 bbsink_archive_contents(sink, cnt);
1783 /* Also feed it to the checksum machinery. */
1784 if (pg_checksum_update(&checksum_ctx,
1785 (uint8 *) sink->bbs_buffer, cnt) < 0)
1786 elog(ERROR, "could not update checksum of base backup");
1789 /* If the file was truncated while we were sending it, pad it with zeros */
1790 while (bytes_done < statbuf->st_size)
1792 size_t remaining = statbuf->st_size - bytes_done;
1793 size_t nbytes = Min(sink->bbs_buffer_length, remaining);
1795 MemSet(sink->bbs_buffer, 0, nbytes);
1796 if (pg_checksum_update(&checksum_ctx,
1797 (uint8 *) sink->bbs_buffer,
1798 nbytes) < 0)
1799 elog(ERROR, "could not update checksum of base backup");
1800 bbsink_archive_contents(sink, nbytes);
1801 bytes_done += nbytes;
1805 * Pad to a block boundary, per tar format requirements. (This small piece
1806 * of data is probably not worth throttling, and is not checksummed
1807 * because it's not actually part of the file.)
1809 _tarWritePadding(sink, bytes_done);
1811 CloseTransientFile(fd);
1813 if (checksum_failures > 1)
1815 ereport(WARNING,
1816 (errmsg_plural("file \"%s\" has a total of %d checksum verification failure",
1817 "file \"%s\" has a total of %d checksum verification failures",
1818 checksum_failures,
1819 readfilename, checksum_failures)));
1821 pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
1824 total_checksum_failures += checksum_failures;
1826 AddFileToBackupManifest(manifest, spcoid, tarfilename, statbuf->st_size,
1827 (pg_time_t) statbuf->st_mtime, &checksum_ctx);
1829 return true;
1833 * Read some more data from the file into the bbsink's buffer, verifying
1834 * checksums as required.
1836 * 'offset' is the file offset from which we should begin to read, and
1837 * 'length' is the amount of data that should be read. The actual amount
1838 * of data read will be less than the requested amount if the bbsink's
1839 * buffer isn't big enough to hold it all, or if the underlying file has
1840 * been truncated. The return value is the number of bytes actually read.
1842 * 'blkno' is the block number of the first page in the bbsink's buffer
1843 * relative to the start of the relation.
1845 * 'verify_checksum' indicates whether we should try to verify checksums
1846 * for the blocks we read. If we do this, we'll update *checksum_failures
1847 * and issue warnings as appropriate.
1849 static off_t
1850 read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd,
1851 off_t offset, size_t length, BlockNumber blkno,
1852 bool verify_checksum, int *checksum_failures)
1854 off_t cnt;
1855 int i;
1856 char *page;
1858 /* Try to read some more data. */
1859 cnt = basebackup_read_file(fd, sink->bbs_buffer,
1860 Min(sink->bbs_buffer_length, length),
1861 offset, readfilename, true);
1863 /* Can't verify checksums if read length is not a multiple of BLCKSZ. */
1864 if (!verify_checksum || (cnt % BLCKSZ) != 0)
1865 return cnt;
1867 /* Verify checksum for each block. */
1868 for (i = 0; i < cnt / BLCKSZ; i++)
1870 int reread_cnt;
1871 uint16 expected_checksum;
1873 page = sink->bbs_buffer + BLCKSZ * i;
1875 /* If the page is OK, go on to the next one. */
1876 if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1877 &expected_checksum))
1878 continue;
1881 * Retry the block on the first failure. It's possible that we read
1882 * the first 4K page of the block just before postgres updated the
1883 * entire block so it ends up looking torn to us. If, before we retry
1884 * the read, the concurrent write of the block finishes, the page LSN
1885 * will be updated and we'll realize that we should ignore this block.
1887 * There's no guarantee that this will actually happen, though: the
1888 * torn write could take an arbitrarily long time to complete.
1889 * Retrying multiple times wouldn't fix this problem, either, though
1890 * it would reduce the chances of it happening in practice. The only
1891 * real fix here seems to be to have some kind of interlock that
1892 * allows us to wait until we can be certain that no write to the
1893 * block is in progress. Since we don't have any such thing right now,
1894 * we just do this and hope for the best.
1896 reread_cnt =
1897 basebackup_read_file(fd, sink->bbs_buffer + BLCKSZ * i,
1898 BLCKSZ, offset + BLCKSZ * i,
1899 readfilename, false);
1900 if (reread_cnt == 0)
1903 * If we hit end-of-file, a concurrent truncation must have
1904 * occurred, so reduce cnt to reflect only the blocks already
1905 * processed and break out of this loop.
1907 cnt = BLCKSZ * i;
1908 break;
1911 /* If the page now looks OK, go on to the next one. */
1912 if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1913 &expected_checksum))
1914 continue;
1916 /* Handle checksum failure. */
1917 (*checksum_failures)++;
1918 if (*checksum_failures <= 5)
1919 ereport(WARNING,
1920 (errmsg("checksum verification failed in "
1921 "file \"%s\", block %u: calculated "
1922 "%X but expected %X",
1923 readfilename, blkno + i, expected_checksum,
1924 ((PageHeader) page)->pd_checksum)));
1925 if (*checksum_failures == 5)
1926 ereport(WARNING,
1927 (errmsg("further checksum verification "
1928 "failures in file \"%s\" will not "
1929 "be reported", readfilename)));
1932 return cnt;
1936 * Push data into a bbsink.
1938 * It's better, when possible, to read data directly into the bbsink's buffer,
1939 * rather than using this function to copy it into the buffer; this function is
1940 * for cases where that approach is not practical.
1942 * bytes_done should point to a count of the number of bytes that are
1943 * currently used in the bbsink's buffer. Upon return, the bytes identified by
1944 * data and length will have been copied into the bbsink's buffer, flushing
1945 * as required, and *bytes_done will have been updated accordingly. If the
1946 * buffer was flushed, the previous contents will also have been fed to
1947 * checksum_ctx.
1949 * Note that after one or more calls to this function it is the caller's
1950 * responsibility to perform any required final flush.
1952 static void
1953 push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx,
1954 size_t *bytes_done, void *data, size_t length)
1956 while (length > 0)
1958 size_t bytes_to_copy;
1961 * We use < here rather than <= so that if the data exactly fills the
1962 * remaining buffer space, we trigger a flush now.
1964 if (length < sink->bbs_buffer_length - *bytes_done)
1966 /* Append remaining data to buffer. */
1967 memcpy(sink->bbs_buffer + *bytes_done, data, length);
1968 *bytes_done += length;
1969 return;
1972 /* Copy until buffer is full and flush it. */
1973 bytes_to_copy = sink->bbs_buffer_length - *bytes_done;
1974 memcpy(sink->bbs_buffer + *bytes_done, data, bytes_to_copy);
1975 data = ((char *) data) + bytes_to_copy;
1976 length -= bytes_to_copy;
1977 bbsink_archive_contents(sink, sink->bbs_buffer_length);
1978 if (pg_checksum_update(checksum_ctx, (uint8 *) sink->bbs_buffer,
1979 sink->bbs_buffer_length) < 0)
1980 elog(ERROR, "could not update checksum");
1981 *bytes_done = 0;
1986 * Try to verify the checksum for the provided page, if it seems appropriate
1987 * to do so.
1989 * Returns true if verification succeeds or if we decide not to check it,
1990 * and false if verification fails. When return false, it also sets
1991 * *expected_checksum to the computed value.
1993 static bool
1994 verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno,
1995 uint16 *expected_checksum)
1997 PageHeader phdr;
1998 uint16 checksum;
2001 * Only check pages which have not been modified since the start of the
2002 * base backup. Otherwise, they might have been written only halfway and
2003 * the checksum would not be valid. However, replaying WAL would
2004 * reinstate the correct page in this case. We also skip completely new
2005 * pages, since they don't have a checksum yet.
2007 if (PageIsNew(page) || PageGetLSN(page) >= start_lsn)
2008 return true;
2010 /* Perform the actual checksum calculation. */
2011 checksum = pg_checksum_page(page, blkno);
2013 /* See whether it matches the value from the page. */
2014 phdr = (PageHeader) page;
2015 if (phdr->pd_checksum == checksum)
2016 return true;
2017 *expected_checksum = checksum;
2018 return false;
2021 static int64
2022 _tarWriteHeader(bbsink *sink, const char *filename, const char *linktarget,
2023 struct stat *statbuf, bool sizeonly)
2025 enum tarError rc;
2027 if (!sizeonly)
2030 * As of this writing, the smallest supported block size is 1kB, which
2031 * is twice TAR_BLOCK_SIZE. Since the buffer size is required to be a
2032 * multiple of BLCKSZ, it should be safe to assume that the buffer is
2033 * large enough to fit an entire tar block. We double-check by means
2034 * of these assertions.
2036 StaticAssertDecl(TAR_BLOCK_SIZE <= BLCKSZ,
2037 "BLCKSZ too small for tar block");
2038 Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
2040 rc = tarCreateHeader(sink->bbs_buffer, filename, linktarget,
2041 statbuf->st_size, statbuf->st_mode,
2042 statbuf->st_uid, statbuf->st_gid,
2043 statbuf->st_mtime);
2045 switch (rc)
2047 case TAR_OK:
2048 break;
2049 case TAR_NAME_TOO_LONG:
2050 ereport(ERROR,
2051 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2052 errmsg("file name too long for tar format: \"%s\"",
2053 filename)));
2054 break;
2055 case TAR_SYMLINK_TOO_LONG:
2056 ereport(ERROR,
2057 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2058 errmsg("symbolic link target too long for tar format: "
2059 "file name \"%s\", target \"%s\"",
2060 filename, linktarget)));
2061 break;
2062 default:
2063 elog(ERROR, "unrecognized tar error: %d", rc);
2066 bbsink_archive_contents(sink, TAR_BLOCK_SIZE);
2069 return TAR_BLOCK_SIZE;
2073 * Pad with zero bytes out to a multiple of TAR_BLOCK_SIZE.
2075 static void
2076 _tarWritePadding(bbsink *sink, int len)
2078 int pad = tarPaddingBytesRequired(len);
2081 * As in _tarWriteHeader, it should be safe to assume that the buffer is
2082 * large enough that we don't need to do this in multiple chunks.
2084 Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
2085 Assert(pad <= TAR_BLOCK_SIZE);
2087 if (pad > 0)
2089 MemSet(sink->bbs_buffer, 0, pad);
2090 bbsink_archive_contents(sink, pad);
2095 * If the entry in statbuf is a link, then adjust statbuf to make it look like a
2096 * directory, so that it will be written that way.
2098 static void
2099 convert_link_to_directory(const char *pathbuf, struct stat *statbuf)
2101 /* If symlink, write it as a directory anyway */
2102 if (S_ISLNK(statbuf->st_mode))
2103 statbuf->st_mode = S_IFDIR | pg_dir_create_mode;
2107 * Read some data from a file, setting a wait event and reporting any error
2108 * encountered.
2110 * If partial_read_ok is false, also report an error if the number of bytes
2111 * read is not equal to the number of bytes requested.
2113 * Returns the number of bytes read.
2115 static ssize_t
2116 basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
2117 const char *filename, bool partial_read_ok)
2119 ssize_t rc;
2121 pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ);
2122 rc = pg_pread(fd, buf, nbytes, offset);
2123 pgstat_report_wait_end();
2125 if (rc < 0)
2126 ereport(ERROR,
2127 (errcode_for_file_access(),
2128 errmsg("could not read file \"%s\": %m", filename)));
2129 if (!partial_read_ok && rc > 0 && rc != nbytes)
2130 ereport(ERROR,
2131 (errcode_for_file_access(),
2132 errmsg("could not read file \"%s\": read %zd of %zu",
2133 filename, rc, nbytes)));
2135 return rc;