Prevent BASE_BACKUP in the middle of another backup in the same session.
[pgsql.git] / src / backend / replication / basebackup.c
blob637c0ce459766a3aaa24a9b0fd9468a60cb7244c
1 /*-------------------------------------------------------------------------
3 * basebackup.c
4 * code for taking a base backup and streaming it to a standby
6 * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group
8 * IDENTIFICATION
9 * src/backend/replication/basebackup.c
11 *-------------------------------------------------------------------------
13 #include "postgres.h"
15 #include <sys/stat.h>
16 #include <unistd.h>
17 #include <time.h>
19 #include "access/xlog_internal.h" /* for pg_backup_start/stop */
20 #include "common/compression.h"
21 #include "common/file_perm.h"
22 #include "commands/defrem.h"
23 #include "lib/stringinfo.h"
24 #include "miscadmin.h"
25 #include "nodes/pg_list.h"
26 #include "pgstat.h"
27 #include "pgtar.h"
28 #include "port.h"
29 #include "postmaster/syslogger.h"
30 #include "replication/basebackup.h"
31 #include "replication/basebackup_sink.h"
32 #include "replication/basebackup_target.h"
33 #include "replication/backup_manifest.h"
34 #include "replication/walsender.h"
35 #include "replication/walsender_private.h"
36 #include "storage/bufpage.h"
37 #include "storage/checksum.h"
38 #include "storage/dsm_impl.h"
39 #include "storage/fd.h"
40 #include "storage/ipc.h"
41 #include "storage/reinit.h"
42 #include "utils/builtins.h"
43 #include "utils/ps_status.h"
44 #include "utils/relcache.h"
45 #include "utils/resowner.h"
46 #include "utils/timestamp.h"
/*
 * How much data do we want to send in one CopyData message? Note that
 * this may also result in reading the underlying files in chunks of this
 * size.
 *
 * NB: The buffer size is required to be a multiple of the system block
 * size, so use that value instead if it's bigger than our preference.
 */
56 #define SINK_BUFFER_LENGTH Max(32768, BLCKSZ)
/*
 * Options for a single BASE_BACKUP command, as filled in by
 * parse_basebackup_options().  The struct is zeroed first, so any field
 * not mentioned by the client starts out as 0/false/NULL unless noted.
 */
58 typedef struct
60 const char *label; /* "label" option; defaults to "base backup" */
61 bool progress; /* "progress" option; enables the up-front size estimate */
62 bool fastcheckpoint; /* "checkpoint" option: true for "fast", false for "spread" */
63 bool nowait; /* inverse of the "wait" option */
64 bool includewal; /* "wal" option; WAL files are appended to the last archive */
65 uint32 maxrate; /* "max_rate" option; throttling is set up only when > 0 */
66 bool sendtblspcmapfile; /* "tablespace_map" option */
67 bool send_to_client; /* set when no target is given or the target is "client" */
68 bool use_copytblspc; /* set only when no "target" option was given */
69 BaseBackupTargetHandle *target_handle; /* set for targets other than "client" */
70 backup_manifest_option manifest; /* "manifest" option; defaults to MANIFEST_OPTION_NO */
71 pg_compress_algorithm compression; /* "compression" option; defaults to PG_COMPRESSION_NONE */
72 pg_compress_specification compression_specification; /* parsed from "compression_detail" */
73 pg_checksum_type manifest_checksum_type; /* "manifest_checksums"; forced to NONE without a manifest */
74 } basebackup_options;
/*
 * Prototypes for this file's static helpers.
 *
 * NOTE(review): the third parameter of sendTablespace() is named "oid" here
 * but "spcoid" in its definition; harmless in C, but worth aligning.
 */
76 static int64 sendTablespace(bbsink *sink, char *path, char *oid, bool sizeonly,
77 struct backup_manifest_info *manifest);
78 static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
79 List *tablespaces, bool sendtblspclinks,
80 backup_manifest_info *manifest, const char *spcoid);
81 static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
82 struct stat *statbuf, bool missing_ok, Oid dboid,
83 backup_manifest_info *manifest, const char *spcoid);
84 static void sendFileWithContent(bbsink *sink, const char *filename,
85 const char *content,
86 backup_manifest_info *manifest);
87 static int64 _tarWriteHeader(bbsink *sink, const char *filename,
88 const char *linktarget, struct stat *statbuf,
89 bool sizeonly);
90 static void _tarWritePadding(bbsink *sink, int len);
91 static void convert_link_to_directory(const char *pathbuf, struct stat *statbuf);
92 static void perform_base_backup(basebackup_options *opt, bbsink *sink);
93 static void parse_basebackup_options(List *options, basebackup_options *opt);
94 static int compareWalFileNames(const ListCell *a, const ListCell *b);
95 static bool is_checksummed_file(const char *fullpath, const char *filename);
96 static int basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
97 const char *filename, bool partial_read_ok);
/*
 * File-level state shared by the functions in this file.
 */
99 /* Was the backup currently in-progress initiated in recovery mode? */
/* Assigned from RecoveryInProgress() at the start of perform_base_backup(). */
100 static bool backup_started_in_recovery = false;
102 /* Total number of checksum failures during base backup. */
/* Reset to 0 by perform_base_backup() before the backup starts. */
103 static long long int total_checksum_failures;
105 /* Do not verify checksums. */
/* Written by parse_basebackup_options() when "verify_checksums" is given. */
106 static bool noverify_checksums = false;
109 * Definition of one element part of an exclusion list, used for paths part
110 * of checksum validation or base backups. "name" is the name of the file
111 * or path to check for exclusion. If "match_prefix" is true, any items
112 * matching the name as prefix are excluded.
/* One entry of an exclusion list; the lists below end with a NULL name. */
114 struct exclude_list_item
116 const char *name; /* file or path name to exclude */
117 bool match_prefix; /* true: exclude anything starting with "name" */
121 * The contents of these directories are removed or recreated during server
122 * start so they are not included in backups. The directories themselves are
123 * kept and included as empty to preserve access permissions.
125 * Note: this list should be kept in sync with the filter lists in pg_rewind's
126 * filemap.c.
/* NULL-terminated array of directory names whose contents are skipped. */
128 static const char *const excludeDirContents[] =
131 * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped
132 * because extensions like pg_stat_statements store data there.
134 PG_STAT_TMP_DIR,
137 * It is generally not useful to backup the contents of this directory
138 * even if the intention is to restore to another primary. See backup.sgml
139 * for a more detailed description.
141 "pg_replslot",
143 /* Contents removed on startup, see dsm_cleanup_for_mmap(). */
144 PG_DYNSHMEM_DIR,
146 /* Contents removed on startup, see AsyncShmemInit(). */
147 "pg_notify",
150 * Old contents are loaded for possible debugging but are not required for
151 * normal operation, see SerialInit().
153 "pg_serial",
155 /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
156 "pg_snapshots",
158 /* Contents zeroed on startup, see StartupSUBTRANS(). */
159 "pg_subtrans",
161 /* end of list */
162 NULL
166 * List of files excluded from backups.
/* Entries are {name, match_prefix}; the list ends with a {NULL, false} entry. */
168 static const struct exclude_list_item excludeFiles[] =
170 /* Skip auto conf temporary file. */
171 {PG_AUTOCONF_FILENAME ".tmp", false},
173 /* Skip current log file temporary file */
174 {LOG_METAINFO_DATAFILE_TMP, false},
177 * Skip relation cache because it is rebuilt on startup. This includes
178 * temporary files.
/* match_prefix is true here, so names that merely start with it are skipped too */
180 {RELCACHE_INIT_FILENAME, true},
183 * backup_label and tablespace_map should not exist in a running cluster
184 * capable of doing an online backup, but exclude them just in case.
186 {BACKUP_LABEL_FILE, false},
187 {TABLESPACE_MAP, false},
190 * If there's a backup_manifest, it belongs to a backup that was used to
191 * start this server. It is *not* correct for this backup. Our
192 * backup_manifest is injected into the backup separately if users want
193 * it.
195 {"backup_manifest", false},
/* Exact-name exclusions for the postmaster's pid and options files. */
197 {"postmaster.pid", false},
198 {"postmaster.opts", false},
200 /* end of list */
201 {NULL, false}
205 * List of files excluded from checksum validation.
207 * Note: this list should be kept in sync with what pg_checksums.c
208 * includes.
/*
 * Entries are {name, match_prefix}; the list ends with a {NULL, false} entry.
 * The "config_exec_params" entry is compiled in only under EXEC_BACKEND.
 */
210 static const struct exclude_list_item noChecksumFiles[] = {
211 {"pg_control", false},
212 {"pg_filenode.map", false},
213 {"pg_internal.init", true},
214 {"PG_VERSION", false},
215 #ifdef EXEC_BACKEND
216 {"config_exec_params", true},
217 #endif
218 {NULL, false}
222 * Actually do a base backup for the specified tablespaces.
224 * This is split out mainly to avoid complaints about "variable might be
225 * clobbered by longjmp" from stupider versions of gcc.
/*
 * perform_base_backup
 *
 * Runs one base backup described by "opt", pushing all output through "sink".
 * Outline, as visible below:
 *   1. create a resource owner and call do_pg_backup_start();
 *   2. inside an error-cleanup region: optionally estimate sizes, send each
 *      tablespace as its own tar archive with the main data directory
 *      ("base.tar") last, then call do_pg_backup_stop();
 *   3. if opt->includewal, append the required WAL segments and all timeline
 *      history files to the still-open last archive;
 *   4. send the backup manifest, end the backup on the sink, and report any
 *      checksum failures (a failure raises ERROR after the data was sent).
 */
227 static void
228 perform_base_backup(basebackup_options *opt, bbsink *sink)
230 bbsink_state state;
231 XLogRecPtr endptr;
232 TimeLineID endtli;
233 StringInfo labelfile;
234 StringInfo tblspc_map_file;
235 backup_manifest_info manifest;
237 /* Initial backup state, insofar as we know it now. */
238 state.tablespaces = NIL;
239 state.tablespace_num = 0;
240 state.bytes_done = 0;
241 state.bytes_total = 0;
242 state.bytes_total_is_valid = false;
244 /* we're going to use a BufFile, so we need a ResourceOwner */
245 Assert(CurrentResourceOwner == NULL);
246 CurrentResourceOwner = ResourceOwnerCreate(NULL, "base backup");
248 backup_started_in_recovery = RecoveryInProgress();
/* Both buffers are handed to do_pg_backup_start() and later sent as files. */
250 labelfile = makeStringInfo();
251 tblspc_map_file = makeStringInfo();
252 InitializeBackupManifest(&manifest, opt->manifest,
253 opt->manifest_checksum_type);
255 total_checksum_failures = 0;
257 basebackup_progress_wait_checkpoint();
258 state.startptr = do_pg_backup_start(opt->label, opt->fastcheckpoint,
259 &state.starttli,
260 labelfile, &state.tablespaces,
261 tblspc_map_file);
264 * Once do_pg_backup_start has been called, ensure that any failure causes
265 * us to abort the backup so we don't "leak" a backup counter. For this
266 * reason, *all* functionality between do_pg_backup_start() and the end of
267 * do_pg_backup_stop() should be inside the error cleanup block!
270 PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
272 ListCell *lc;
273 tablespaceinfo *ti;
275 /* Add a node for the base directory at the end */
/* The entry is zero-allocated; the loops below recognize it by path == NULL. */
276 ti = palloc0(sizeof(tablespaceinfo));
277 ti->size = -1;
278 state.tablespaces = lappend(state.tablespaces, ti);
281 * Calculate the total backup size by summing up the size of each
282 * tablespace
284 if (opt->progress)
286 basebackup_progress_estimate_backup_size();
288 foreach(lc, state.tablespaces)
290 tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc);
/* sizeonly = true: only lengths are computed, with no manifest passed */
292 if (tmp->path == NULL)
293 tmp->size = sendDir(sink, ".", 1, true, state.tablespaces,
294 true, NULL, NULL);
295 else
296 tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true,
297 NULL);
298 state.bytes_total += tmp->size;
300 state.bytes_total_is_valid = true;
303 /* notify basebackup sink about start of backup */
304 bbsink_begin_backup(sink, &state, SINK_BUFFER_LENGTH);
306 /* Send off our tablespaces one by one */
307 foreach(lc, state.tablespaces)
309 tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
311 if (ti->path == NULL)
313 struct stat statbuf;
314 bool sendtblspclinks = true;
316 bbsink_begin_archive(sink, "base.tar");
318 /* In the main tar, include the backup_label first... */
319 sendFileWithContent(sink, BACKUP_LABEL_FILE, labelfile->data,
320 &manifest);
322 /* Then the tablespace_map file, if required... */
323 if (opt->sendtblspcmapfile)
325 sendFileWithContent(sink, TABLESPACE_MAP, tblspc_map_file->data,
326 &manifest);
327 sendtblspclinks = false;
330 /* Then the bulk of the files... */
331 sendDir(sink, ".", 1, false, state.tablespaces,
332 sendtblspclinks, &manifest, NULL);
334 /* ... and pg_control after everything else. */
335 if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
336 ereport(ERROR,
337 (errcode_for_file_access(),
338 errmsg("could not stat file \"%s\": %m",
339 XLOG_CONTROL_FILE)));
340 sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
341 false, InvalidOid, &manifest, NULL);
343 else
/* Non-default tablespace: it gets its own archive named "<oid>.tar". */
345 char *archive_name = psprintf("%s.tar", ti->oid);
347 bbsink_begin_archive(sink, archive_name);
349 sendTablespace(sink, ti->path, ti->oid, false, &manifest);
353 * If we're including WAL, and this is the main data directory we
354 * don't treat this as the end of the tablespace. Instead, we will
355 * include the xlog files below and stop afterwards. This is safe
356 * since the main data directory is always sent *last*.
358 if (opt->includewal && ti->path == NULL)
360 Assert(lnext(state.tablespaces, lc) == NULL);
362 else
364 /* Properly terminate the tarfile. */
365 StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ,
366 "BLCKSZ too small for 2 tar blocks");
367 memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
368 bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);
370 /* OK, that's the end of the archive. */
371 bbsink_end_archive(sink);
375 basebackup_progress_wait_wal_archive(&state);
/* The second argument is "wait for archiving", hence the negated nowait. */
376 endptr = do_pg_backup_stop(labelfile->data, !opt->nowait, &endtli);
378 PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
381 if (opt->includewal)
384 * We've left the last tar file "open", so we can now append the
385 * required WAL files to it.
387 char pathbuf[MAXPGPATH];
388 XLogSegNo segno;
389 XLogSegNo startsegno;
390 XLogSegNo endsegno;
391 struct stat statbuf;
392 List *historyFileList = NIL;
393 List *walFileList = NIL;
394 char firstoff[MAXFNAMELEN];
395 char lastoff[MAXFNAMELEN];
396 DIR *dir;
397 struct dirent *de;
398 ListCell *lc;
399 TimeLineID tli;
401 basebackup_progress_transfer_wal();
404 * I'd rather not worry about timelines here, so scan pg_wal and
405 * include all WAL files in the range between 'startptr' and 'endptr',
406 * regardless of the timeline the file is stamped with. If there are
407 * some spurious WAL files belonging to timelines that don't belong in
408 * this server's history, they will be included too. Normally there
409 * shouldn't be such files, but if there are, there's little harm in
410 * including them.
412 XLByteToSeg(state.startptr, startsegno, wal_segment_size);
413 XLogFileName(firstoff, state.starttli, startsegno, wal_segment_size);
414 XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
415 XLogFileName(lastoff, endtli, endsegno, wal_segment_size);
417 dir = AllocateDir("pg_wal");
418 while ((de = ReadDir(dir, "pg_wal")) != NULL)
420 /* Does it look like a WAL segment, and is it in the range? */
/* "+ 8" skips the leading timeline portion of each file name. */
421 if (IsXLogFileName(de->d_name) &&
422 strcmp(de->d_name + 8, firstoff + 8) >= 0 &&
423 strcmp(de->d_name + 8, lastoff + 8) <= 0)
425 walFileList = lappend(walFileList, pstrdup(de->d_name));
427 /* Does it look like a timeline history file? */
428 else if (IsTLHistoryFileName(de->d_name))
430 historyFileList = lappend(historyFileList, pstrdup(de->d_name));
433 FreeDir(dir);
436 * Before we go any further, check that none of the WAL segments we
437 * need were removed.
439 CheckXLogRemoved(startsegno, state.starttli);
442 * Sort the WAL filenames. We want to send the files in order from
443 * oldest to newest, to reduce the chance that a file is recycled
444 * before we get a chance to send it over.
446 list_sort(walFileList, compareWalFileNames);
449 * There must be at least one xlog file in the pg_wal directory, since
450 * we are doing backup-including-xlog.
452 if (walFileList == NIL)
453 ereport(ERROR,
454 (errmsg("could not find any WAL files")));
457 * Sanity check: the first and last segment should cover startptr and
458 * endptr, with no gaps in between.
460 XLogFromFileName((char *) linitial(walFileList),
461 &tli, &segno, wal_segment_size);
462 if (segno != startsegno)
464 char startfname[MAXFNAMELEN];
466 XLogFileName(startfname, state.starttli, startsegno,
467 wal_segment_size);
468 ereport(ERROR,
469 (errmsg("could not find WAL file \"%s\"", startfname)));
471 foreach(lc, walFileList)
473 char *walFileName = (char *) lfirst(lc);
474 XLogSegNo currsegno = segno;
475 XLogSegNo nextsegno = segno + 1;
/*
 * Each file must carry either the next segment number or the same one as
 * the previous file (presumably the same segment on another timeline).
 */
477 XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
478 if (!(nextsegno == segno || currsegno == segno))
480 char nextfname[MAXFNAMELEN];
482 XLogFileName(nextfname, tli, nextsegno, wal_segment_size);
483 ereport(ERROR,
484 (errmsg("could not find WAL file \"%s\"", nextfname)));
487 if (segno != endsegno)
489 char endfname[MAXFNAMELEN];
491 XLogFileName(endfname, endtli, endsegno, wal_segment_size);
492 ereport(ERROR,
493 (errmsg("could not find WAL file \"%s\"", endfname)));
496 /* Ok, we have everything we need. Send the WAL files. */
497 foreach(lc, walFileList)
499 char *walFileName = (char *) lfirst(lc);
500 int fd;
501 size_t cnt;
502 pgoff_t len = 0;
504 snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName);
505 XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
507 fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY);
508 if (fd < 0)
/* errno is saved because the check below may overwrite it */
510 int save_errno = errno;
513 * Most likely reason for this is that the file was already
514 * removed by a checkpoint, so check for that to get a better
515 * error message.
517 CheckXLogRemoved(segno, tli);
519 errno = save_errno;
520 ereport(ERROR,
521 (errcode_for_file_access(),
522 errmsg("could not open file \"%s\": %m", pathbuf)));
525 if (fstat(fd, &statbuf) != 0)
526 ereport(ERROR,
527 (errcode_for_file_access(),
528 errmsg("could not stat file \"%s\": %m",
529 pathbuf)));
530 if (statbuf.st_size != wal_segment_size)
532 CheckXLogRemoved(segno, tli);
533 ereport(ERROR,
534 (errcode_for_file_access(),
535 errmsg("unexpected WAL file size \"%s\"", walFileName)));
538 /* send the WAL file itself */
539 _tarWriteHeader(sink, pathbuf, NULL, &statbuf, false);
/* Read at most one sink buffer at a time, never past the segment size. */
541 while ((cnt = basebackup_read_file(fd, sink->bbs_buffer,
542 Min(sink->bbs_buffer_length,
543 wal_segment_size - len),
544 len, pathbuf, true)) > 0)
546 CheckXLogRemoved(segno, tli);
547 bbsink_archive_contents(sink, cnt);
549 len += cnt;
551 if (len == wal_segment_size)
552 break;
555 if (len != wal_segment_size)
557 CheckXLogRemoved(segno, tli);
558 ereport(ERROR,
559 (errcode_for_file_access(),
560 errmsg("unexpected WAL file size \"%s\"", walFileName)));
564 * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need
565 * for padding.
567 Assert(wal_segment_size % TAR_BLOCK_SIZE == 0);
569 CloseTransientFile(fd);
572 * Mark file as archived, otherwise files can get archived again
573 * after promotion of a new node. This is in line with
574 * walreceiver.c always doing an XLogArchiveForceDone() after a
575 * complete segment.
/* The ".done" status file is injected into the archive with empty content. */
577 StatusFilePath(pathbuf, walFileName, ".done");
578 sendFileWithContent(sink, pathbuf, "", &manifest);
582 * Send timeline history files too. Only the latest timeline history
583 * file is required for recovery, and even that only if there happens
584 * to be a timeline switch in the first WAL segment that contains the
585 * checkpoint record, or if we're taking a base backup from a standby
586 * server and the target timeline changes while the backup is taken.
587 * But they are small and highly useful for debugging purposes, so
588 * better include them all, always.
590 foreach(lc, historyFileList)
592 char *fname = lfirst(lc);
594 snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);
596 if (lstat(pathbuf, &statbuf) != 0)
597 ereport(ERROR,
598 (errcode_for_file_access(),
599 errmsg("could not stat file \"%s\": %m", pathbuf)));
601 sendFile(sink, pathbuf, pathbuf, &statbuf, false, InvalidOid,
602 &manifest, NULL);
604 /* unconditionally mark file as archived */
605 StatusFilePath(pathbuf, fname, ".done");
606 sendFileWithContent(sink, pathbuf, "", &manifest);
609 /* Properly terminate the tar file. */
610 StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ,
611 "BLCKSZ too small for 2 tar blocks");
612 memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
613 bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);
615 /* OK, that's the end of the archive. */
616 bbsink_end_archive(sink);
619 AddWALInfoToBackupManifest(&manifest, state.startptr, state.starttli,
620 endptr, endtli);
622 SendBackupManifest(&manifest, sink);
624 bbsink_end_backup(sink, endptr, endtli);
/*
 * Any checksum failure ends in ERROR; the WARNING with the total is only
 * emitted when there was more than one failure.
 */
626 if (total_checksum_failures)
628 if (total_checksum_failures > 1)
629 ereport(WARNING,
630 (errmsg_plural("%lld total checksum verification failure",
631 "%lld total checksum verification failures",
632 total_checksum_failures,
633 total_checksum_failures)));
635 ereport(ERROR,
636 (errcode(ERRCODE_DATA_CORRUPTED),
637 errmsg("checksum verification failure during base backup")));
641 * Make sure to free the manifest before the resource owners as manifests
642 * use cryptohash contexts that may depend on resource owners (like
643 * OpenSSL).
645 FreeBackupManifest(&manifest);
647 /* clean up the resource owner we created */
648 WalSndResourceCleanup(true);
650 basebackup_progress_done();
654 * list_sort comparison function, to compare log/seg portion of WAL segment
655 * filenames, ignoring the timeline portion.
657 static int
658 compareWalFileNames(const ListCell *a, const ListCell *b)
660 char *fna = (char *) lfirst(a);
661 char *fnb = (char *) lfirst(b);
663 return strcmp(fna + 8, fnb + 8);
667 * Parse the base backup options passed down by the parser
669 static void
670 parse_basebackup_options(List *options, basebackup_options *opt)
672 ListCell *lopt;
673 bool o_label = false;
674 bool o_progress = false;
675 bool o_checkpoint = false;
676 bool o_nowait = false;
677 bool o_wal = false;
678 bool o_maxrate = false;
679 bool o_tablespace_map = false;
680 bool o_noverify_checksums = false;
681 bool o_manifest = false;
682 bool o_manifest_checksums = false;
683 bool o_target = false;
684 bool o_target_detail = false;
685 char *target_str = NULL;
686 char *target_detail_str = NULL;
687 bool o_compression = false;
688 bool o_compression_detail = false;
689 char *compression_detail_str = NULL;
691 MemSet(opt, 0, sizeof(*opt));
692 opt->manifest = MANIFEST_OPTION_NO;
693 opt->manifest_checksum_type = CHECKSUM_TYPE_CRC32C;
694 opt->compression = PG_COMPRESSION_NONE;
695 opt->compression_specification.algorithm = PG_COMPRESSION_NONE;
697 foreach(lopt, options)
699 DefElem *defel = (DefElem *) lfirst(lopt);
701 if (strcmp(defel->defname, "label") == 0)
703 if (o_label)
704 ereport(ERROR,
705 (errcode(ERRCODE_SYNTAX_ERROR),
706 errmsg("duplicate option \"%s\"", defel->defname)));
707 opt->label = defGetString(defel);
708 o_label = true;
710 else if (strcmp(defel->defname, "progress") == 0)
712 if (o_progress)
713 ereport(ERROR,
714 (errcode(ERRCODE_SYNTAX_ERROR),
715 errmsg("duplicate option \"%s\"", defel->defname)));
716 opt->progress = defGetBoolean(defel);
717 o_progress = true;
719 else if (strcmp(defel->defname, "checkpoint") == 0)
721 char *optval = defGetString(defel);
723 if (o_checkpoint)
724 ereport(ERROR,
725 (errcode(ERRCODE_SYNTAX_ERROR),
726 errmsg("duplicate option \"%s\"", defel->defname)));
727 if (pg_strcasecmp(optval, "fast") == 0)
728 opt->fastcheckpoint = true;
729 else if (pg_strcasecmp(optval, "spread") == 0)
730 opt->fastcheckpoint = false;
731 else
732 ereport(ERROR,
733 (errcode(ERRCODE_SYNTAX_ERROR),
734 errmsg("unrecognized checkpoint type: \"%s\"",
735 optval)));
736 o_checkpoint = true;
738 else if (strcmp(defel->defname, "wait") == 0)
740 if (o_nowait)
741 ereport(ERROR,
742 (errcode(ERRCODE_SYNTAX_ERROR),
743 errmsg("duplicate option \"%s\"", defel->defname)));
744 opt->nowait = !defGetBoolean(defel);
745 o_nowait = true;
747 else if (strcmp(defel->defname, "wal") == 0)
749 if (o_wal)
750 ereport(ERROR,
751 (errcode(ERRCODE_SYNTAX_ERROR),
752 errmsg("duplicate option \"%s\"", defel->defname)));
753 opt->includewal = defGetBoolean(defel);
754 o_wal = true;
756 else if (strcmp(defel->defname, "max_rate") == 0)
758 int64 maxrate;
760 if (o_maxrate)
761 ereport(ERROR,
762 (errcode(ERRCODE_SYNTAX_ERROR),
763 errmsg("duplicate option \"%s\"", defel->defname)));
765 maxrate = defGetInt64(defel);
766 if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
767 ereport(ERROR,
768 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
769 errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
770 (int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER)));
772 opt->maxrate = (uint32) maxrate;
773 o_maxrate = true;
775 else if (strcmp(defel->defname, "tablespace_map") == 0)
777 if (o_tablespace_map)
778 ereport(ERROR,
779 (errcode(ERRCODE_SYNTAX_ERROR),
780 errmsg("duplicate option \"%s\"", defel->defname)));
781 opt->sendtblspcmapfile = defGetBoolean(defel);
782 o_tablespace_map = true;
784 else if (strcmp(defel->defname, "verify_checksums") == 0)
786 if (o_noverify_checksums)
787 ereport(ERROR,
788 (errcode(ERRCODE_SYNTAX_ERROR),
789 errmsg("duplicate option \"%s\"", defel->defname)));
790 noverify_checksums = !defGetBoolean(defel);
791 o_noverify_checksums = true;
793 else if (strcmp(defel->defname, "manifest") == 0)
795 char *optval = defGetString(defel);
796 bool manifest_bool;
798 if (o_manifest)
799 ereport(ERROR,
800 (errcode(ERRCODE_SYNTAX_ERROR),
801 errmsg("duplicate option \"%s\"", defel->defname)));
802 if (parse_bool(optval, &manifest_bool))
804 if (manifest_bool)
805 opt->manifest = MANIFEST_OPTION_YES;
806 else
807 opt->manifest = MANIFEST_OPTION_NO;
809 else if (pg_strcasecmp(optval, "force-encode") == 0)
810 opt->manifest = MANIFEST_OPTION_FORCE_ENCODE;
811 else
812 ereport(ERROR,
813 (errcode(ERRCODE_SYNTAX_ERROR),
814 errmsg("unrecognized manifest option: \"%s\"",
815 optval)));
816 o_manifest = true;
818 else if (strcmp(defel->defname, "manifest_checksums") == 0)
820 char *optval = defGetString(defel);
822 if (o_manifest_checksums)
823 ereport(ERROR,
824 (errcode(ERRCODE_SYNTAX_ERROR),
825 errmsg("duplicate option \"%s\"", defel->defname)));
826 if (!pg_checksum_parse_type(optval,
827 &opt->manifest_checksum_type))
828 ereport(ERROR,
829 (errcode(ERRCODE_SYNTAX_ERROR),
830 errmsg("unrecognized checksum algorithm: \"%s\"",
831 optval)));
832 o_manifest_checksums = true;
834 else if (strcmp(defel->defname, "target") == 0)
836 if (o_target)
837 ereport(ERROR,
838 (errcode(ERRCODE_SYNTAX_ERROR),
839 errmsg("duplicate option \"%s\"", defel->defname)));
840 target_str = defGetString(defel);
841 o_target = true;
843 else if (strcmp(defel->defname, "target_detail") == 0)
845 char *optval = defGetString(defel);
847 if (o_target_detail)
848 ereport(ERROR,
849 (errcode(ERRCODE_SYNTAX_ERROR),
850 errmsg("duplicate option \"%s\"", defel->defname)));
851 target_detail_str = optval;
852 o_target_detail = true;
854 else if (strcmp(defel->defname, "compression") == 0)
856 char *optval = defGetString(defel);
858 if (o_compression)
859 ereport(ERROR,
860 (errcode(ERRCODE_SYNTAX_ERROR),
861 errmsg("duplicate option \"%s\"", defel->defname)));
862 if (!parse_compress_algorithm(optval, &opt->compression))
863 ereport(ERROR,
864 (errcode(ERRCODE_SYNTAX_ERROR),
865 errmsg("unrecognized compression algorithm \"%s\"",
866 optval)));
867 o_compression = true;
869 else if (strcmp(defel->defname, "compression_detail") == 0)
871 if (o_compression_detail)
872 ereport(ERROR,
873 (errcode(ERRCODE_SYNTAX_ERROR),
874 errmsg("duplicate option \"%s\"", defel->defname)));
875 compression_detail_str = defGetString(defel);
876 o_compression_detail = true;
878 else
879 ereport(ERROR,
880 (errcode(ERRCODE_SYNTAX_ERROR),
881 errmsg("unrecognized base backup option: \"%s\"",
882 defel->defname)));
885 if (opt->label == NULL)
886 opt->label = "base backup";
887 if (opt->manifest == MANIFEST_OPTION_NO)
889 if (o_manifest_checksums)
890 ereport(ERROR,
891 (errcode(ERRCODE_SYNTAX_ERROR),
892 errmsg("manifest checksums require a backup manifest")));
893 opt->manifest_checksum_type = CHECKSUM_TYPE_NONE;
896 if (target_str == NULL)
898 if (target_detail_str != NULL)
899 ereport(ERROR,
900 (errcode(ERRCODE_SYNTAX_ERROR),
901 errmsg("target detail cannot be used without target")));
902 opt->use_copytblspc = true;
903 opt->send_to_client = true;
905 else if (strcmp(target_str, "client") == 0)
907 if (target_detail_str != NULL)
908 ereport(ERROR,
909 (errcode(ERRCODE_SYNTAX_ERROR),
910 errmsg("target '%s' does not accept a target detail",
911 target_str)));
912 opt->send_to_client = true;
914 else
915 opt->target_handle =
916 BaseBackupGetTargetHandle(target_str, target_detail_str);
918 if (o_compression_detail && !o_compression)
919 ereport(ERROR,
920 (errcode(ERRCODE_SYNTAX_ERROR),
921 errmsg("compression detail requires compression")));
923 if (o_compression)
925 char *error_detail;
927 parse_compress_specification(opt->compression, compression_detail_str,
928 &opt->compression_specification);
929 error_detail =
930 validate_compress_specification(&opt->compression_specification);
931 if (error_detail != NULL)
932 ereport(ERROR,
933 errcode(ERRCODE_SYNTAX_ERROR),
934 errmsg("invalid compression specification: %s",
935 error_detail));
941 * SendBaseBackup() - send a complete base backup.
943 * The function will put the system into backup mode like pg_backup_start()
944 * does, so that the backup is consistent even though we read directly from
945 * the filesystem, bypassing the buffer cache.
947 void
948 SendBaseBackup(BaseBackupCmd *cmd)
950 basebackup_options opt;
951 bbsink *sink;
952 SessionBackupState status = get_backup_status();
954 if (status == SESSION_BACKUP_RUNNING)
955 ereport(ERROR,
956 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
957 errmsg("a backup is already in progress in this session")));
959 parse_basebackup_options(cmd->options, &opt);
961 WalSndSetState(WALSNDSTATE_BACKUP);
963 if (update_process_title)
965 char activitymsg[50];
967 snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"",
968 opt.label);
969 set_ps_display(activitymsg);
973 * If the target is specifically 'client' then set up to stream the backup
974 * to the client; otherwise, it's being sent someplace else and should not
975 * be sent to the client. BaseBackupGetSink has the job of setting up a
976 * sink to send the backup data wherever it needs to go.
978 sink = bbsink_copystream_new(opt.send_to_client);
979 if (opt.target_handle != NULL)
980 sink = BaseBackupGetSink(opt.target_handle, sink);
982 /* Set up network throttling, if client requested it */
983 if (opt.maxrate > 0)
984 sink = bbsink_throttle_new(sink, opt.maxrate);
986 /* Set up server-side compression, if client requested it */
987 if (opt.compression == PG_COMPRESSION_GZIP)
988 sink = bbsink_gzip_new(sink, &opt.compression_specification);
989 else if (opt.compression == PG_COMPRESSION_LZ4)
990 sink = bbsink_lz4_new(sink, &opt.compression_specification);
991 else if (opt.compression == PG_COMPRESSION_ZSTD)
992 sink = bbsink_zstd_new(sink, &opt.compression_specification);
994 /* Set up progress reporting. */
995 sink = bbsink_progress_new(sink, opt.progress);
998 * Perform the base backup, but make sure we clean up the bbsink even if
999 * an error occurs.
1001 PG_TRY();
1003 perform_base_backup(&opt, sink);
1005 PG_FINALLY();
1007 bbsink_cleanup(sink);
1009 PG_END_TRY();
1013 * Inject a file with given name and content in the output tar stream.
1015 static void
1016 sendFileWithContent(bbsink *sink, const char *filename, const char *content,
1017 backup_manifest_info *manifest)
1019 struct stat statbuf;
1020 int bytes_done = 0,
1021 len;
1022 pg_checksum_context checksum_ctx;
1024 if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
1025 elog(ERROR, "could not initialize checksum of file \"%s\"",
1026 filename);
1028 len = strlen(content);
1031 * Construct a stat struct for the backup_label file we're injecting in
1032 * the tar.
1034 /* Windows doesn't have the concept of uid and gid */
1035 #ifdef WIN32
1036 statbuf.st_uid = 0;
1037 statbuf.st_gid = 0;
1038 #else
1039 statbuf.st_uid = geteuid();
1040 statbuf.st_gid = getegid();
1041 #endif
1042 statbuf.st_mtime = time(NULL);
1043 statbuf.st_mode = pg_file_create_mode;
1044 statbuf.st_size = len;
1046 _tarWriteHeader(sink, filename, NULL, &statbuf, false);
1048 if (pg_checksum_update(&checksum_ctx, (uint8 *) content, len) < 0)
1049 elog(ERROR, "could not update checksum of file \"%s\"",
1050 filename);
1052 while (bytes_done < len)
1054 size_t remaining = len - bytes_done;
1055 size_t nbytes = Min(sink->bbs_buffer_length, remaining);
1057 memcpy(sink->bbs_buffer, content, nbytes);
1058 bbsink_archive_contents(sink, nbytes);
1059 bytes_done += nbytes;
1062 _tarWritePadding(sink, len);
1064 AddFileToBackupManifest(manifest, NULL, filename, len,
1065 (pg_time_t) statbuf.st_mtime, &checksum_ctx);
1069 * Include the tablespace directory pointed to by 'path' in the output tar
1070 * stream. If 'sizeonly' is true, we just calculate a total length and return
1071 * it, without actually sending anything.
1073 * Only used to send auxiliary tablespaces, not PGDATA.
1075 static int64
1076 sendTablespace(bbsink *sink, char *path, char *spcoid, bool sizeonly,
1077 backup_manifest_info *manifest)
1079 int64 size;
1080 char pathbuf[MAXPGPATH];
1081 struct stat statbuf;
1084 * 'path' points to the tablespace location, but we only want to include
1085 * the version directory in it that belongs to us.
1087 snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path,
1088 TABLESPACE_VERSION_DIRECTORY);
1091 * Store a directory entry in the tar file so we get the permissions
1092 * right.
1094 if (lstat(pathbuf, &statbuf) != 0)
1096 if (errno != ENOENT)
1097 ereport(ERROR,
1098 (errcode_for_file_access(),
1099 errmsg("could not stat file or directory \"%s\": %m",
1100 pathbuf)));
1102 /* If the tablespace went away while scanning, it's no error. */
1103 return 0;
1106 size = _tarWriteHeader(sink, TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf,
1107 sizeonly);
1109 /* Send all the files in the tablespace version directory */
1110 size += sendDir(sink, pathbuf, strlen(path), sizeonly, NIL, true, manifest,
1111 spcoid);
1113 return size;
1117 * Include all files from the given directory in the output tar stream. If
1118 * 'sizeonly' is true, we just calculate a total length and return it, without
1119 * actually sending anything.
1121 * Omit any directory in the tablespaces list, to avoid backing up
1122 * tablespaces twice when they were created inside PGDATA.
1124 * If sendtblspclinks is true, we need to include symlink
1125 * information in the tar file. If not, we can skip that
1126 * as it will be sent separately in the tablespace_map file.
1128 static int64
1129 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
1130 List *tablespaces, bool sendtblspclinks, backup_manifest_info *manifest,
1131 const char *spcoid)
1133 DIR *dir;
1134 struct dirent *de;
1135 char pathbuf[MAXPGPATH * 2];
1136 struct stat statbuf;
1137 int64 size = 0;
1138 const char *lastDir; /* Split last dir from parent path. */
1139 bool isDbDir = false; /* Does this directory contain relations? */
1142 * Determine if the current path is a database directory that can contain
1143 * relations.
1145 * Start by finding the location of the delimiter between the parent path
1146 * and the current path.
1148 lastDir = last_dir_separator(path);
1150 /* Does this path look like a database path (i.e. all digits)? */
1151 if (lastDir != NULL &&
1152 strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
1154 /* Part of path that contains the parent directory. */
1155 int parentPathLen = lastDir - path;
1158 * Mark path as a database directory if the parent path is either
1159 * $PGDATA/base or a tablespace version path.
1161 if (strncmp(path, "./base", parentPathLen) == 0 ||
1162 (parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) &&
1163 strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1),
1164 TABLESPACE_VERSION_DIRECTORY,
1165 sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
1166 isDbDir = true;
1169 dir = AllocateDir(path);
1170 while ((de = ReadDir(dir, path)) != NULL)
1172 int excludeIdx;
1173 bool excludeFound;
1174 ForkNumber relForkNum; /* Type of fork if file is a relation */
1175 int relOidChars; /* Chars in filename that are the rel oid */
1177 /* Skip special stuff */
1178 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
1179 continue;
1181 /* Skip temporary files */
1182 if (strncmp(de->d_name,
1183 PG_TEMP_FILE_PREFIX,
1184 strlen(PG_TEMP_FILE_PREFIX)) == 0)
1185 continue;
1188 * Check if the postmaster has signaled us to exit, and abort with an
1189 * error in that case. The error handler further up will call
1190 * do_pg_abort_backup() for us. Also check that if the backup was
1191 * started while still in recovery, the server wasn't promoted.
1192 * do_pg_backup_stop() will check that too, but it's better to stop
1193 * the backup early than continue to the end and fail there.
1195 CHECK_FOR_INTERRUPTS();
1196 if (RecoveryInProgress() != backup_started_in_recovery)
1197 ereport(ERROR,
1198 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1199 errmsg("the standby was promoted during online backup"),
1200 errhint("This means that the backup being taken is corrupt "
1201 "and should not be used. "
1202 "Try taking another online backup.")));
1204 /* Scan for files that should be excluded */
1205 excludeFound = false;
1206 for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++)
1208 int cmplen = strlen(excludeFiles[excludeIdx].name);
1210 if (!excludeFiles[excludeIdx].match_prefix)
1211 cmplen++;
1212 if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0)
1214 elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
1215 excludeFound = true;
1216 break;
1220 if (excludeFound)
1221 continue;
1223 /* Exclude all forks for unlogged tables except the init fork */
1224 if (isDbDir &&
1225 parse_filename_for_nontemp_relation(de->d_name, &relOidChars,
1226 &relForkNum))
1228 /* Never exclude init forks */
1229 if (relForkNum != INIT_FORKNUM)
1231 char initForkFile[MAXPGPATH];
1232 char relOid[OIDCHARS + 1];
1235 * If any other type of fork, check if there is an init fork
1236 * with the same OID. If so, the file can be excluded.
1238 memcpy(relOid, de->d_name, relOidChars);
1239 relOid[relOidChars] = '\0';
1240 snprintf(initForkFile, sizeof(initForkFile), "%s/%s_init",
1241 path, relOid);
1243 if (lstat(initForkFile, &statbuf) == 0)
1245 elog(DEBUG2,
1246 "unlogged relation file \"%s\" excluded from backup",
1247 de->d_name);
1249 continue;
1254 /* Exclude temporary relations */
1255 if (isDbDir && looks_like_temp_rel_name(de->d_name))
1257 elog(DEBUG2,
1258 "temporary relation file \"%s\" excluded from backup",
1259 de->d_name);
1261 continue;
1264 snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);
1266 /* Skip pg_control here to back up it last */
1267 if (strcmp(pathbuf, "./global/pg_control") == 0)
1268 continue;
1270 if (lstat(pathbuf, &statbuf) != 0)
1272 if (errno != ENOENT)
1273 ereport(ERROR,
1274 (errcode_for_file_access(),
1275 errmsg("could not stat file or directory \"%s\": %m",
1276 pathbuf)));
1278 /* If the file went away while scanning, it's not an error. */
1279 continue;
1282 /* Scan for directories whose contents should be excluded */
1283 excludeFound = false;
1284 for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
1286 if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
1288 elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
1289 convert_link_to_directory(pathbuf, &statbuf);
1290 size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
1291 &statbuf, sizeonly);
1292 excludeFound = true;
1293 break;
1297 if (excludeFound)
1298 continue;
1301 * We can skip pg_wal, the WAL segments need to be fetched from the
1302 * WAL archive anyway. But include it as an empty directory anyway, so
1303 * we get permissions right.
1305 if (strcmp(pathbuf, "./pg_wal") == 0)
1307 /* If pg_wal is a symlink, write it as a directory anyway */
1308 convert_link_to_directory(pathbuf, &statbuf);
1309 size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
1310 &statbuf, sizeonly);
1313 * Also send archive_status directory (by hackishly reusing
1314 * statbuf from above ...).
1316 size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL,
1317 &statbuf, sizeonly);
1319 continue; /* don't recurse into pg_wal */
1322 /* Allow symbolic links in pg_tblspc only */
1323 if (strcmp(path, "./pg_tblspc") == 0 &&
1324 #ifndef WIN32
1325 S_ISLNK(statbuf.st_mode)
1326 #else
1327 pgwin32_is_junction(pathbuf)
1328 #endif
1331 #if defined(HAVE_READLINK) || defined(WIN32)
1332 char linkpath[MAXPGPATH];
1333 int rllen;
1335 rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
1336 if (rllen < 0)
1337 ereport(ERROR,
1338 (errcode_for_file_access(),
1339 errmsg("could not read symbolic link \"%s\": %m",
1340 pathbuf)));
1341 if (rllen >= sizeof(linkpath))
1342 ereport(ERROR,
1343 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1344 errmsg("symbolic link \"%s\" target is too long",
1345 pathbuf)));
1346 linkpath[rllen] = '\0';
1348 size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, linkpath,
1349 &statbuf, sizeonly);
1350 #else
1353 * If the platform does not have symbolic links, it should not be
1354 * possible to have tablespaces - clearly somebody else created
1355 * them. Warn about it and ignore.
1357 ereport(WARNING,
1358 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1359 errmsg("tablespaces are not supported on this platform")));
1360 continue;
1361 #endif /* HAVE_READLINK */
1363 else if (S_ISDIR(statbuf.st_mode))
1365 bool skip_this_dir = false;
1366 ListCell *lc;
1369 * Store a directory entry in the tar file so we can get the
1370 * permissions right.
1372 size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL, &statbuf,
1373 sizeonly);
1376 * Call ourselves recursively for a directory, unless it happens
1377 * to be a separate tablespace located within PGDATA.
1379 foreach(lc, tablespaces)
1381 tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
1384 * ti->rpath is the tablespace relative path within PGDATA, or
1385 * NULL if the tablespace has been properly located somewhere
1386 * else.
1388 * Skip past the leading "./" in pathbuf when comparing.
1390 if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
1392 skip_this_dir = true;
1393 break;
1398 * skip sending directories inside pg_tblspc, if not required.
1400 if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
1401 skip_this_dir = true;
1403 if (!skip_this_dir)
1404 size += sendDir(sink, pathbuf, basepathlen, sizeonly, tablespaces,
1405 sendtblspclinks, manifest, spcoid);
1407 else if (S_ISREG(statbuf.st_mode))
1409 bool sent = false;
1411 if (!sizeonly)
1412 sent = sendFile(sink, pathbuf, pathbuf + basepathlen + 1, &statbuf,
1413 true, isDbDir ? atooid(lastDir + 1) : InvalidOid,
1414 manifest, spcoid);
1416 if (sent || sizeonly)
1418 /* Add size. */
1419 size += statbuf.st_size;
1421 /* Pad to a multiple of the tar block size. */
1422 size += tarPaddingBytesRequired(statbuf.st_size);
1424 /* Size of the header for the file. */
1425 size += TAR_BLOCK_SIZE;
1428 else
1429 ereport(WARNING,
1430 (errmsg("skipping special file \"%s\"", pathbuf)));
1432 FreeDir(dir);
1433 return size;
1437 * Check if a file should have its checksum validated.
1438 * We validate checksums on files in regular tablespaces
1439 * (including global and default) only, and in those there
1440 * are some files that are explicitly excluded.
1442 static bool
1443 is_checksummed_file(const char *fullpath, const char *filename)
1445 /* Check that the file is in a tablespace */
1446 if (strncmp(fullpath, "./global/", 9) == 0 ||
1447 strncmp(fullpath, "./base/", 7) == 0 ||
1448 strncmp(fullpath, "/", 1) == 0)
1450 int excludeIdx;
1452 /* Compare file against noChecksumFiles skip list */
1453 for (excludeIdx = 0; noChecksumFiles[excludeIdx].name != NULL; excludeIdx++)
1455 int cmplen = strlen(noChecksumFiles[excludeIdx].name);
1457 if (!noChecksumFiles[excludeIdx].match_prefix)
1458 cmplen++;
1459 if (strncmp(filename, noChecksumFiles[excludeIdx].name,
1460 cmplen) == 0)
1461 return false;
1464 return true;
1466 else
1467 return false;
1470 /*****
1471 * Functions for handling tar file format
1473 * Copied from pg_dump, but modified to work with libpq for sending
1478 * Given the member, write the TAR header & send the file.
1480 * If 'missing_ok' is true, will not throw an error if the file is not found.
1482 * If dboid is anything other than InvalidOid then any checksum failures
1483 * detected will get reported to the cumulative stats system.
1485 * Returns true if the file was successfully sent, false if 'missing_ok',
1486 * and the file did not exist.
1488 static bool
1489 sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1490 struct stat *statbuf, bool missing_ok, Oid dboid,
1491 backup_manifest_info *manifest, const char *spcoid)
1493 int fd;
1494 BlockNumber blkno = 0;
1495 bool block_retry = false;
1496 uint16 checksum;
1497 int checksum_failures = 0;
1498 off_t cnt;
1499 int i;
1500 pgoff_t len = 0;
1501 char *page;
1502 PageHeader phdr;
1503 int segmentno = 0;
1504 char *segmentpath;
1505 bool verify_checksum = false;
1506 pg_checksum_context checksum_ctx;
1508 if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
1509 elog(ERROR, "could not initialize checksum of file \"%s\"",
1510 readfilename);
1512 fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY);
1513 if (fd < 0)
1515 if (errno == ENOENT && missing_ok)
1516 return false;
1517 ereport(ERROR,
1518 (errcode_for_file_access(),
1519 errmsg("could not open file \"%s\": %m", readfilename)));
1522 _tarWriteHeader(sink, tarfilename, NULL, statbuf, false);
1524 if (!noverify_checksums && DataChecksumsEnabled())
1526 char *filename;
1529 * Get the filename (excluding path). As last_dir_separator()
1530 * includes the last directory separator, we chop that off by
1531 * incrementing the pointer.
1533 filename = last_dir_separator(readfilename) + 1;
1535 if (is_checksummed_file(readfilename, filename))
1537 verify_checksum = true;
1540 * Cut off at the segment boundary (".") to get the segment number
1541 * in order to mix it into the checksum.
1543 segmentpath = strstr(filename, ".");
1544 if (segmentpath != NULL)
1546 segmentno = atoi(segmentpath + 1);
1547 if (segmentno == 0)
1548 ereport(ERROR,
1549 (errmsg("invalid segment number %d in file \"%s\"",
1550 segmentno, filename)));
1556 * Loop until we read the amount of data the caller told us to expect. The
1557 * file could be longer, if it was extended while we were sending it, but
1558 * for a base backup we can ignore such extended data. It will be restored
1559 * from WAL.
1561 while (len < statbuf->st_size)
1563 size_t remaining = statbuf->st_size - len;
1565 /* Try to read some more data. */
1566 cnt = basebackup_read_file(fd, sink->bbs_buffer,
1567 Min(sink->bbs_buffer_length, remaining),
1568 len, readfilename, true);
1571 * If we hit end-of-file, a concurrent truncation must have occurred.
1572 * That's not an error condition, because WAL replay will fix things
1573 * up.
1575 if (cnt == 0)
1576 break;
1579 * The checksums are verified at block level, so we iterate over the
1580 * buffer in chunks of BLCKSZ, after making sure that
1581 * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
1582 * BLCKSZ bytes.
1584 Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
1586 if (verify_checksum && (cnt % BLCKSZ != 0))
1588 ereport(WARNING,
1589 (errmsg("could not verify checksum in file \"%s\", block "
1590 "%u: read buffer size %d and page size %d "
1591 "differ",
1592 readfilename, blkno, (int) cnt, BLCKSZ)));
1593 verify_checksum = false;
1596 if (verify_checksum)
1598 for (i = 0; i < cnt / BLCKSZ; i++)
1600 page = sink->bbs_buffer + BLCKSZ * i;
1603 * Only check pages which have not been modified since the
1604 * start of the base backup. Otherwise, they might have been
1605 * written only halfway and the checksum would not be valid.
1606 * However, replaying WAL would reinstate the correct page in
1607 * this case. We also skip completely new pages, since they
1608 * don't have a checksum yet.
1610 if (!PageIsNew(page) && PageGetLSN(page) < sink->bbs_state->startptr)
1612 checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
1613 phdr = (PageHeader) page;
1614 if (phdr->pd_checksum != checksum)
1617 * Retry the block on the first failure. It's
1618 * possible that we read the first 4K page of the
1619 * block just before postgres updated the entire block
1620 * so it ends up looking torn to us. We only need to
1621 * retry once because the LSN should be updated to
1622 * something we can ignore on the next pass. If the
1623 * error happens again then it is a true validation
1624 * failure.
1626 if (block_retry == false)
1628 int reread_cnt;
1630 /* Reread the failed block */
1631 reread_cnt =
1632 basebackup_read_file(fd,
1633 sink->bbs_buffer + BLCKSZ * i,
1634 BLCKSZ, len + BLCKSZ * i,
1635 readfilename,
1636 false);
1637 if (reread_cnt == 0)
1640 * If we hit end-of-file, a concurrent
1641 * truncation must have occurred, so break out
1642 * of this loop just as if the initial fread()
1643 * returned 0. We'll drop through to the same
1644 * code that handles that case. (We must fix
1645 * up cnt first, though.)
1647 cnt = BLCKSZ * i;
1648 break;
1651 /* Set flag so we know a retry was attempted */
1652 block_retry = true;
1654 /* Reset loop to validate the block again */
1655 i--;
1656 continue;
1659 checksum_failures++;
1661 if (checksum_failures <= 5)
1662 ereport(WARNING,
1663 (errmsg("checksum verification failed in "
1664 "file \"%s\", block %u: calculated "
1665 "%X but expected %X",
1666 readfilename, blkno, checksum,
1667 phdr->pd_checksum)));
1668 if (checksum_failures == 5)
1669 ereport(WARNING,
1670 (errmsg("further checksum verification "
1671 "failures in file \"%s\" will not "
1672 "be reported", readfilename)));
1675 block_retry = false;
1676 blkno++;
1680 bbsink_archive_contents(sink, cnt);
1682 /* Also feed it to the checksum machinery. */
1683 if (pg_checksum_update(&checksum_ctx,
1684 (uint8 *) sink->bbs_buffer, cnt) < 0)
1685 elog(ERROR, "could not update checksum of base backup");
1687 len += cnt;
1690 /* If the file was truncated while we were sending it, pad it with zeros */
1691 while (len < statbuf->st_size)
1693 size_t remaining = statbuf->st_size - len;
1694 size_t nbytes = Min(sink->bbs_buffer_length, remaining);
1696 MemSet(sink->bbs_buffer, 0, nbytes);
1697 if (pg_checksum_update(&checksum_ctx,
1698 (uint8 *) sink->bbs_buffer,
1699 nbytes) < 0)
1700 elog(ERROR, "could not update checksum of base backup");
1701 bbsink_archive_contents(sink, nbytes);
1702 len += nbytes;
1706 * Pad to a block boundary, per tar format requirements. (This small piece
1707 * of data is probably not worth throttling, and is not checksummed
1708 * because it's not actually part of the file.)
1710 _tarWritePadding(sink, len);
1712 CloseTransientFile(fd);
1714 if (checksum_failures > 1)
1716 ereport(WARNING,
1717 (errmsg_plural("file \"%s\" has a total of %d checksum verification failure",
1718 "file \"%s\" has a total of %d checksum verification failures",
1719 checksum_failures,
1720 readfilename, checksum_failures)));
1722 pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
1725 total_checksum_failures += checksum_failures;
1727 AddFileToBackupManifest(manifest, spcoid, tarfilename, statbuf->st_size,
1728 (pg_time_t) statbuf->st_mtime, &checksum_ctx);
1730 return true;
1733 static int64
1734 _tarWriteHeader(bbsink *sink, const char *filename, const char *linktarget,
1735 struct stat *statbuf, bool sizeonly)
1737 enum tarError rc;
1739 if (!sizeonly)
1742 * As of this writing, the smallest supported block size is 1kB, which
1743 * is twice TAR_BLOCK_SIZE. Since the buffer size is required to be a
1744 * multiple of BLCKSZ, it should be safe to assume that the buffer is
1745 * large enough to fit an entire tar block. We double-check by means
1746 * of these assertions.
1748 StaticAssertStmt(TAR_BLOCK_SIZE <= BLCKSZ,
1749 "BLCKSZ too small for tar block");
1750 Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
1752 rc = tarCreateHeader(sink->bbs_buffer, filename, linktarget,
1753 statbuf->st_size, statbuf->st_mode,
1754 statbuf->st_uid, statbuf->st_gid,
1755 statbuf->st_mtime);
1757 switch (rc)
1759 case TAR_OK:
1760 break;
1761 case TAR_NAME_TOO_LONG:
1762 ereport(ERROR,
1763 (errmsg("file name too long for tar format: \"%s\"",
1764 filename)));
1765 break;
1766 case TAR_SYMLINK_TOO_LONG:
1767 ereport(ERROR,
1768 (errmsg("symbolic link target too long for tar format: "
1769 "file name \"%s\", target \"%s\"",
1770 filename, linktarget)));
1771 break;
1772 default:
1773 elog(ERROR, "unrecognized tar error: %d", rc);
1776 bbsink_archive_contents(sink, TAR_BLOCK_SIZE);
1779 return TAR_BLOCK_SIZE;
1783 * Pad with zero bytes out to a multiple of TAR_BLOCK_SIZE.
1785 static void
1786 _tarWritePadding(bbsink *sink, int len)
1788 int pad = tarPaddingBytesRequired(len);
1791 * As in _tarWriteHeader, it should be safe to assume that the buffer is
1792 * large enough that we don't need to do this in multiple chunks.
1794 Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
1795 Assert(pad <= TAR_BLOCK_SIZE);
1797 if (pad > 0)
1799 MemSet(sink->bbs_buffer, 0, pad);
1800 bbsink_archive_contents(sink, pad);
1805 * If the entry in statbuf is a link, then adjust statbuf to make it look like a
1806 * directory, so that it will be written that way.
1808 static void
1809 convert_link_to_directory(const char *pathbuf, struct stat *statbuf)
1811 /* If symlink, write it as a directory anyway */
1812 #ifndef WIN32
1813 if (S_ISLNK(statbuf->st_mode))
1814 #else
1815 if (pgwin32_is_junction(pathbuf))
1816 #endif
1817 statbuf->st_mode = S_IFDIR | pg_dir_create_mode;
1821 * Read some data from a file, setting a wait event and reporting any error
1822 * encountered.
1824 * If partial_read_ok is false, also report an error if the number of bytes
1825 * read is not equal to the number of bytes requested.
1827 * Returns the number of bytes read.
1829 static int
1830 basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
1831 const char *filename, bool partial_read_ok)
1833 int rc;
1835 pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ);
1836 rc = pg_pread(fd, buf, nbytes, offset);
1837 pgstat_report_wait_end();
1839 if (rc < 0)
1840 ereport(ERROR,
1841 (errcode_for_file_access(),
1842 errmsg("could not read file \"%s\": %m", filename)));
1843 if (!partial_read_ok && rc > 0 && rc != nbytes)
1844 ereport(ERROR,
1845 (errcode_for_file_access(),
1846 errmsg("could not read file \"%s\": read %d of %zu",
1847 filename, rc, nbytes)));
1849 return rc;