1 /*-------------------------------------------------------------------------
4 * Functions for WAL recovery, standby mode
6 * This source file contains functions controlling WAL recovery.
7 * InitWalRecovery() initializes the system for crash or archive recovery,
8 * or standby mode, depending on configuration options and the state of
9 * the control file and possible backup label file. PerformWalRecovery()
10 * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 * and prepares information needed to initialize the WAL for writes. In
13 * addition to these three main functions, there are a bunch of functions
14 * for interrogating recovery state and controlling the recovery process.
17 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
20 * src/backend/access/transam/xlogrecovery.c
22 *-------------------------------------------------------------------------
34 #include "access/timeline.h"
35 #include "access/transam.h"
36 #include "access/xact.h"
37 #include "access/xlog_internal.h"
38 #include "access/xlogarchive.h"
39 #include "access/xlogprefetcher.h"
40 #include "access/xlogreader.h"
41 #include "access/xlogrecovery.h"
42 #include "access/xlogutils.h"
43 #include "backup/basebackup.h"
44 #include "catalog/pg_control.h"
45 #include "commands/tablespace.h"
46 #include "common/file_utils.h"
47 #include "miscadmin.h"
49 #include "postmaster/bgwriter.h"
50 #include "postmaster/startup.h"
51 #include "replication/slot.h"
52 #include "replication/slotsync.h"
53 #include "replication/walreceiver.h"
54 #include "storage/fd.h"
55 #include "storage/ipc.h"
56 #include "storage/latch.h"
57 #include "storage/pmsignal.h"
58 #include "storage/procarray.h"
59 #include "storage/spin.h"
60 #include "utils/datetime.h"
61 #include "utils/fmgrprotos.h"
62 #include "utils/guc_hooks.h"
63 #include "utils/pg_lsn.h"
64 #include "utils/ps_status.h"
65 #include "utils/pg_rusage.h"
67 /* Unsupported old recovery command file names (relative to $PGDATA) */
68 #define RECOVERY_COMMAND_FILE "recovery.conf"
69 #define RECOVERY_COMMAND_DONE "recovery.done"
/*
 * Lookup table pairing each recovery target action name ("pause",
 * "promote", "shutdown") with its RECOVERY_TARGET_ACTION_* code.
 * NOTE(review): the third field is false in every visible entry; its
 * meaning is defined by struct config_enum_entry, which is declared
 * elsewhere -- confirm there.
 */
74 const struct config_enum_entry recovery_target_action_options
[] = {
75 {"pause", RECOVERY_TARGET_ACTION_PAUSE
, false},
76 {"promote", RECOVERY_TARGET_ACTION_PROMOTE
, false},
77 {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN
, false},
81 /* options formerly taken from recovery.conf for archive recovery */
82 char *recoveryRestoreCommand
= NULL
;
83 char *recoveryEndCommand
= NULL
;
84 char *archiveCleanupCommand
= NULL
;
85 RecoveryTargetType recoveryTarget
= RECOVERY_TARGET_UNSET
;
86 bool recoveryTargetInclusive
= true;
87 int recoveryTargetAction
= RECOVERY_TARGET_ACTION_PAUSE
;
88 TransactionId recoveryTargetXid
;
89 char *recovery_target_time_string
;
90 TimestampTz recoveryTargetTime
;
91 const char *recoveryTargetName
;
92 XLogRecPtr recoveryTargetLSN
;
93 int recovery_min_apply_delay
= 0;
95 /* options formerly taken from recovery.conf for XLOG streaming */
96 char *PrimaryConnInfo
= NULL
;
97 char *PrimarySlotName
= NULL
;
98 bool wal_receiver_create_temp_slot
= false;
101 * recoveryTargetTimeLineGoal: what the user requested, if any
103 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
105 * recoveryTargetTLI: the currently understood target timeline; changes
107 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
108 * the timelines of its known parents, newest first (so recoveryTargetTLI is
109 * always the first list member). Only these TLIs are expected to be seen in
110 * the WAL segments we read, and indeed only these TLIs will be considered as
111 * candidate WAL files to open at all.
113 * curFileTLI: the TLI appearing in the name of the current input WAL file.
114 * (This is not necessarily the same as the timeline from which we are
115 * replaying WAL, which StartupXLOG calls replayTLI, because we could be
116 * scanning data that was copied from an ancestor timeline when the current
117 * file was created.) During a sequential scan we do not allow this value
120 RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal
= RECOVERY_TARGET_TIMELINE_LATEST
;
121 TimeLineID recoveryTargetTLIRequested
= 0;
122 TimeLineID recoveryTargetTLI
= 0;
123 static List
*expectedTLEs
;
124 static TimeLineID curFileTLI
;
127 * When ArchiveRecoveryRequested is set, archive recovery was requested,
128 * i.e. signal files were present. When InArchiveRecovery is set, we are
129 * currently recovering using offline XLOG archives. These variables are only
130 * valid in the startup process.
132 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
133 * currently performing crash recovery using only XLOG files in pg_wal, but
134 * will switch to using offline XLOG archives as soon as we reach the end of
137 bool ArchiveRecoveryRequested
= false;
138 bool InArchiveRecovery
= false;
141 * When StandbyModeRequested is set, standby mode was requested, i.e.
142 * standby.signal file was present. When StandbyMode is set, we are currently
143 * in standby mode. These variables are only valid in the startup process.
144 * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
146 static bool StandbyModeRequested
= false;
147 bool StandbyMode
= false;
149 /* was a signal file present at startup? */
150 static bool standby_signal_file_found
= false;
151 static bool recovery_signal_file_found
= false;
154 * CheckPointLoc is the position of the checkpoint record that determines
155 * where to start the replay. It comes from the backup label file or the
158 * RedoStartLSN is the checkpoint's REDO location, also from the backup label
159 * file or the control file. In standby mode, XLOG streaming usually starts
160 * from the position where an invalid record was found. But if we fail to
161 * read even the initial checkpoint record, we use the REDO location instead
162 * of the checkpoint location as the start position of XLOG streaming.
163 * Otherwise we would have to jump backwards to the REDO location after
164 * reading the checkpoint record, because the REDO record can precede the
167 static XLogRecPtr CheckPointLoc
= InvalidXLogRecPtr
;
168 static TimeLineID CheckPointTLI
= 0;
169 static XLogRecPtr RedoStartLSN
= InvalidXLogRecPtr
;
170 static TimeLineID RedoStartTLI
= 0;
173 * Local copy of SharedHotStandbyActive variable. False actually means "not
174 * known, need to check the shared state".
176 static bool LocalHotStandbyActive
= false;
179 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
180 * known, need to check the shared state".
182 static bool LocalPromoteIsTriggered
= false;
184 /* Has the recovery code requested a walreceiver wakeup? */
185 static bool doRequestWalReceiverReply
;
187 /* XLogReader object used to parse the WAL records */
188 static XLogReaderState
*xlogreader
= NULL
;
190 /* XLogPrefetcher object used to consume WAL records with read-ahead */
191 static XLogPrefetcher
*xlogprefetcher
= NULL
;
193 /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
/*
 * The two members visible here carry the same names and types as the
 * fetching_ckpt and replayTLI arguments in the ReadRecord() prototype.
 * NOTE(review): further members may exist that are not visible in this
 * view of the struct -- confirm against the full definition.
 */
194 typedef struct XLogPageReadPrivate
197 bool fetching_ckpt
; /* are we fetching a checkpoint record? */
199 TimeLineID replayTLI
; /* timeline ID handed down by the caller; presumably the timeline being replayed -- confirm */
200 } XLogPageReadPrivate
;
202 /* flag to tell XLogPageRead that we have started replaying */
203 static bool InRedo
= false;
206 * Codes indicating where we got a WAL file from during recovery, or where
207 * to attempt to get one.
211 XLOG_FROM_ANY
= 0, /* request to read WAL from any source */
212 XLOG_FROM_ARCHIVE
, /* restored using restore_command */
213 XLOG_FROM_PG_WAL
, /* existing file in pg_wal */
214 XLOG_FROM_STREAM
, /* streamed from primary */
217 /* human-readable names for XLogSources, for debugging output */
/*
 * The strings appear in the same order as the XLOG_FROM_* codes
 * (XLOG_FROM_ANY = 0, then ARCHIVE, PG_WAL, STREAM), so the array is
 * presumably indexed by an XLogSource value; keep the two lists in sync
 * when adding a source.
 */
218 static const char *const xlogSourceNames
[] = {"any", "archive", "pg_wal", "stream"};
221 * readFile is -1 or a kernel FD for the log file segment that's currently
222 * open for reading. readSegNo identifies the segment. readOff is the offset
223 * of the page just read, readLen indicates how much of it has been read into
224 * readBuf, and readSource indicates where we got the currently open file from.
226 * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
227 * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
228 * worthwhile, since the XLOG is not read by general-purpose sessions.
230 static int readFile
= -1;
231 static XLogSegNo readSegNo
= 0;
232 static uint32 readOff
= 0;
233 static uint32 readLen
= 0;
234 static XLogSource readSource
= XLOG_FROM_ANY
;
237 * Keeps track of which source we're currently reading from. This is
238 * different from readSource in that this is always set, even when we don't
239 * currently have a WAL file open. If lastSourceFailed is set, our last
240 * attempt to read from currentSource failed, and we should try another source
243 * pendingWalRcvRestart is set when a config change occurs that requires a
244 * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
246 static XLogSource currentSource
= XLOG_FROM_ANY
;
247 static bool lastSourceFailed
= false;
248 static bool pendingWalRcvRestart
= false;
251 * These variables track when we last obtained some WAL data to process,
252 * and where we got it from. (XLogReceiptSource is initially the same as
253 * readSource, but readSource gets reset to zero when we don't have data
254 * to process right now. It is also different from currentSource, which
255 * also changes when we try to read from a source and fail, while
256 * XLogReceiptSource tracks where we last successfully read some WAL.)
258 static TimestampTz XLogReceiptTime
= 0;
259 static XLogSource XLogReceiptSource
= XLOG_FROM_ANY
;
261 /* Local copy of WalRcv->flushedUpto */
262 static XLogRecPtr flushedUpto
= 0;
263 static TimeLineID receiveTLI
= 0;
266 * Copy of minRecoveryPoint and backupEndPoint from the control file.
268 * In order to reach consistency, we must replay the WAL up to
269 * minRecoveryPoint. If backupEndRequired is true, we must also reach
270 * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
271 * to backupStartPoint.
273 * Note: In archive recovery, after consistency has been reached, the
274 * functions in xlog.c will start updating minRecoveryPoint in the control
275 * file. But this copy of minRecoveryPoint variable reflects the value at the
276 * beginning of recovery, and is *not* updated after consistency is reached.
278 static XLogRecPtr minRecoveryPoint
;
279 static TimeLineID minRecoveryPointTLI
;
281 static XLogRecPtr backupStartPoint
;
282 static XLogRecPtr backupEndPoint
;
283 static bool backupEndRequired
= false;
286 * Have we reached a consistent database state? In crash recovery, we have
287 * to replay all the WAL, so reachedConsistency is never set. During archive
288 * recovery, the database is consistent once minRecoveryPoint is reached.
290 * Consistent state means that the system is internally consistent, all
291 * the WAL has been replayed up to a certain point, and importantly, there
292 * is no trace of later actions on disk.
294 bool reachedConsistency
= false;
296 /* Buffers dedicated to consistency checks of size BLCKSZ */
297 static char *replay_image_masked
= NULL
;
298 static char *primary_image_masked
= NULL
;
302 * Shared-memory state for WAL recovery.
304 typedef struct XLogRecoveryCtlData
307 * SharedHotStandbyActive indicates if we allow hot standby queries to be
308 * run. Protected by info_lck.
310 bool SharedHotStandbyActive
;
313 * SharedPromoteIsTriggered indicates if a standby promotion has been
314 * triggered. Protected by info_lck.
316 bool SharedPromoteIsTriggered
;
319 * recoveryWakeupLatch is used to wake up the startup process to continue
320 * WAL replay, if it is waiting for WAL to arrive or promotion to be
323 * Note that the startup process also uses another latch, its procLatch,
324 * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
325 * for signaling the startup process in favor of using its procLatch, which
326 * would comport better with possible generic signal handlers using that
327 * latch. But we should not do that, because the startup process doesn't
328 * expect to be woken up by the walreceiver process or the SIGHUP signal
329 * handler while it's waiting for recovery conflict. The separate latches,
330 * recoveryWakeupLatch and procLatch, should be used for inter-process
331 * communication for WAL replay and recovery conflict, respectively.
333 Latch recoveryWakeupLatch
;
336 * Last record successfully replayed.
338 XLogRecPtr lastReplayedReadRecPtr
; /* start position */
339 XLogRecPtr lastReplayedEndRecPtr
; /* end+1 position */
340 TimeLineID lastReplayedTLI
; /* timeline */
343 * When we're currently replaying a record, i.e. in a redo function,
344 * replayEndRecPtr points to the end+1 of the record being replayed,
345 * otherwise it's equal to lastReplayedEndRecPtr.
347 XLogRecPtr replayEndRecPtr
;
348 TimeLineID replayEndTLI
;
349 /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
350 TimestampTz recoveryLastXTime
;
353 * timestamp of when we started replaying the current chunk of WAL data,
354 * only relevant for replication or archive recovery
356 TimestampTz currentChunkStartTime
;
357 /* Recovery pause state */
358 RecoveryPauseState recoveryPauseState
;
359 ConditionVariable recoveryNotPausedCV
;
361 slock_t info_lck
; /* locks shared variables shown above */
362 } XLogRecoveryCtlData
;
364 static XLogRecoveryCtlData
*XLogRecoveryCtl
= NULL
;
367 * abortedRecPtr is the start pointer of a broken record at end of WAL when
368 * recovery completes; missingContrecPtr is the location of the first
369 * contrecord that went missing. See CreateOverwriteContrecordRecord for
372 static XLogRecPtr abortedRecPtr
;
373 static XLogRecPtr missingContrecPtr
;
376 * if recoveryStopsBefore/After returns true, it saves information of the stop
379 static TransactionId recoveryStopXid
;
380 static TimestampTz recoveryStopTime
;
381 static XLogRecPtr recoveryStopLSN
;
382 static char recoveryStopName
[MAXFNAMELEN
];
383 static bool recoveryStopAfter
;
385 /* prototypes for local functions */
386 static void ApplyWalRecord(XLogReaderState
*xlogreader
, XLogRecord
*record
, TimeLineID
*replayTLI
);
388 static void EnableStandbyMode(void);
389 static void readRecoverySignalFile(void);
390 static void validateRecoveryParameters(void);
391 static bool read_backup_label(XLogRecPtr
*checkPointLoc
,
392 TimeLineID
*backupLabelTLI
,
393 bool *backupEndRequired
, bool *backupFromStandby
);
394 static bool read_tablespace_map(List
**tablespaces
);
396 static void xlogrecovery_redo(XLogReaderState
*record
, TimeLineID replayTLI
);
397 static void CheckRecoveryConsistency(void);
398 static void rm_redo_error_callback(void *arg
);
400 static void xlog_outrec(StringInfo buf
, XLogReaderState
*record
);
402 static void xlog_block_info(StringInfo buf
, XLogReaderState
*record
);
403 static void checkTimeLineSwitch(XLogRecPtr lsn
, TimeLineID newTLI
,
404 TimeLineID prevTLI
, TimeLineID replayTLI
);
405 static bool getRecordTimestamp(XLogReaderState
*record
, TimestampTz
*recordXtime
);
406 static void verifyBackupPageConsistency(XLogReaderState
*record
);
408 static bool recoveryStopsBefore(XLogReaderState
*record
);
409 static bool recoveryStopsAfter(XLogReaderState
*record
);
410 static char *getRecoveryStopReason(void);
411 static void recoveryPausesHere(bool endOfRecovery
);
412 static bool recoveryApplyDelay(XLogReaderState
*record
);
413 static void ConfirmRecoveryPaused(void);
415 static XLogRecord
*ReadRecord(XLogPrefetcher
*xlogprefetcher
,
416 int emode
, bool fetching_ckpt
,
417 TimeLineID replayTLI
);
419 static int XLogPageRead(XLogReaderState
*xlogreader
, XLogRecPtr targetPagePtr
,
420 int reqLen
, XLogRecPtr targetRecPtr
, char *readBuf
);
421 static XLogPageReadResult
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr
,
424 XLogRecPtr tliRecPtr
,
425 TimeLineID replayTLI
,
426 XLogRecPtr replayLSN
,
428 static int emode_for_corrupt_record(int emode
, XLogRecPtr RecPtr
);
429 static XLogRecord
*ReadCheckpointRecord(XLogPrefetcher
*xlogprefetcher
,
430 XLogRecPtr RecPtr
, TimeLineID replayTLI
);
431 static bool rescanLatestTimeLine(TimeLineID replayTLI
, XLogRecPtr replayLSN
);
432 static int XLogFileRead(XLogSegNo segno
, TimeLineID tli
,
433 XLogSource source
, bool notfoundOk
);
434 static int XLogFileReadAnyTLI(XLogSegNo segno
, XLogSource source
);
436 static bool CheckForStandbyTrigger(void);
437 static void SetPromoteIsTriggered(void);
438 static bool HotStandbyActiveInReplay(void);
440 static void SetCurrentChunkStartTime(TimestampTz xtime
);
441 static void SetLatestXTime(TimestampTz xtime
);
444 * Initialization of shared memory for WAL recovery
/*
 * XLogRecoveryShmemSize: computes the amount of shared memory needed for
 * the WAL recovery state. The only contribution visible here is
 * sizeof(XLogRecoveryCtlData); the result is presumably returned to the
 * caller (XLogRecoveryShmemInit passes it to ShmemInitStruct).
 */
447 XLogRecoveryShmemSize(void)
451 /* XLogRecoveryCtl */
452 size
= sizeof(XLogRecoveryCtlData
);
/*
 * XLogRecoveryShmemInit: obtains the "XLOG Recovery Ctl" shared-memory
 * struct, sized by XLogRecoveryShmemSize(), and stores its address in
 * XLogRecoveryCtl. It then zeroes the struct and initializes its spinlock
 * (info_lck), shared latch (recoveryWakeupLatch) and condition variable
 * (recoveryNotPausedCV).
 */
458 XLogRecoveryShmemInit(void)
462 XLogRecoveryCtl
= (XLogRecoveryCtlData
*)
463 ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found
);
/*
 * NOTE(review): ShmemInitStruct fills in "found", but no test of it is
 * visible before the memset below -- confirm that an already-initialized
 * struct is not wiped.
 */
466 memset(XLogRecoveryCtl
, 0, sizeof(XLogRecoveryCtlData
));
/* Set up the synchronization primitives embedded in the struct. */
468 SpinLockInit(&XLogRecoveryCtl
->info_lck
);
469 InitSharedLatch(&XLogRecoveryCtl
->recoveryWakeupLatch
);
470 ConditionVariableInit(&XLogRecoveryCtl
->recoveryNotPausedCV
);
474 * A thin wrapper to enable StandbyMode and do other preparatory work as
/*
 * EnableStandbyMode: takes no arguments. The only call visible here is
 * disable_startup_progress_timeout(), which turns off startup progress
 * reporting for the reasons given below.
 * NOTE(review): the statement that actually sets StandbyMode is not
 * visible in this view -- confirm it precedes the call.
 */
478 EnableStandbyMode(void)
483 * To avoid server log bloat, we don't report recovery progress in a
484 * standby as it will always be in recovery unless promoted. We disable
485 * startup progress timeout in standby mode to avoid calling
486 * startup_progress_timeout_handler() unnecessarily.
488 disable_startup_progress_timeout();
492 * Prepare the system for WAL recovery, if needed.
494 * This is called by StartupXLOG() which coordinates the server startup
495 * sequence. This function analyzes the control file and the backup label
496 * file, if any, and figures out whether we need to perform crash recovery or
497 * archive recovery, and how far we need to replay the WAL to reach a
500 * This doesn't yet change the on-disk state, except for creating the symlinks
501 * from table space map file if any, and for fetching WAL files needed to find
502 * the checkpoint record. On entry, the caller has already read the control
503 * file into memory, and passes it as argument. This function updates it to
504 * reflect the recovery state, and the caller is expected to write it back to
505 * disk after initializing other subsystems, but before calling
506 * PerformWalRecovery().
508 * This initializes some global variables like ArchiveRecoveryRequested, and
509 * StandbyModeRequested and InRecovery.
512 InitWalRecovery(ControlFileData
*ControlFile
, bool *wasShutdown_ptr
,
513 bool *haveBackupLabel_ptr
, bool *haveTblspcMap_ptr
)
515 XLogPageReadPrivate
*private;
519 DBState dbstate_at_startup
;
520 bool haveTblspcMap
= false;
521 bool haveBackupLabel
= false;
522 CheckPoint checkPoint
;
523 bool backupFromStandby
= false;
525 dbstate_at_startup
= ControlFile
->state
;
528 * Initialize on the assumption we want to recover to the latest timeline
529 * that's active according to pg_control.
531 if (ControlFile
->minRecoveryPointTLI
>
532 ControlFile
->checkPointCopy
.ThisTimeLineID
)
533 recoveryTargetTLI
= ControlFile
->minRecoveryPointTLI
;
535 recoveryTargetTLI
= ControlFile
->checkPointCopy
.ThisTimeLineID
;
538 * Check for signal files, and if so set up state for offline recovery
540 readRecoverySignalFile();
541 validateRecoveryParameters();
544 * Take ownership of the wakeup latch if we're going to sleep during
545 * recovery, i.e. if archive recovery was requested.
547 if (ArchiveRecoveryRequested
)
548 OwnLatch(&XLogRecoveryCtl
->recoveryWakeupLatch
);
551 * Set the WAL reading processor now, as it will be needed when reading
552 * the checkpoint record required (backup_label or not).
554 private = palloc0(sizeof(XLogPageReadPrivate
));
556 XLogReaderAllocate(wal_segment_size
, NULL
,
557 XL_ROUTINE(.page_read
= &XLogPageRead
,
558 .segment_open
= NULL
,
559 .segment_close
= wal_segment_close
),
563 (errcode(ERRCODE_OUT_OF_MEMORY
),
564 errmsg("out of memory"),
565 errdetail("Failed while allocating a WAL reading processor.")));
566 xlogreader
->system_identifier
= ControlFile
->system_identifier
;
569 * Set the WAL decode buffer size. This limits how far ahead we can read
572 XLogReaderSetDecodeBuffer(xlogreader
, NULL
, wal_decode_buffer_size
);
574 /* Create a WAL prefetcher. */
575 xlogprefetcher
= XLogPrefetcherAllocate(xlogreader
);
578 * Allocate two page buffers dedicated to WAL consistency checks. We do
579 * it this way, rather than just making static arrays, for two reasons:
580 * (1) no need to waste the storage in most instantiations of the backend;
581 * (2) a static char array isn't guaranteed to have any particular
582 * alignment, whereas palloc() will provide MAXALIGN'd storage.
584 replay_image_masked
= (char *) palloc(BLCKSZ
);
585 primary_image_masked
= (char *) palloc(BLCKSZ
);
588 * Read the backup_label file. We want to run this part of the recovery
589 * process after checking for signal files and after performing validation
590 * of the recovery parameters.
592 if (read_backup_label(&CheckPointLoc
, &CheckPointTLI
, &backupEndRequired
,
595 List
*tablespaces
= NIL
;
598 * Archive recovery was requested, and thanks to the backup label
599 * file, we know how far we need to replay to reach consistency. Enter
600 * archive recovery directly.
602 InArchiveRecovery
= true;
603 if (StandbyModeRequested
)
607 * Omitting backup_label when creating a new replica, PITR node etc.
608 * unfortunately is a common cause of corruption. Logging that
609 * backup_label was used makes it a bit easier to exclude that as the
610 * cause of observed corruption.
612 * Do so before we try to read the checkpoint record (which can fail),
613 * as otherwise it can be hard to understand why a checkpoint other
614 * than ControlFile->checkPoint is used.
617 (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
618 LSN_FORMAT_ARGS(RedoStartLSN
),
619 LSN_FORMAT_ARGS(CheckPointLoc
),
623 * When a backup_label file is present, we want to roll forward from
624 * the checkpoint it identifies, rather than using pg_control.
626 record
= ReadCheckpointRecord(xlogprefetcher
, CheckPointLoc
,
630 memcpy(&checkPoint
, XLogRecGetData(xlogreader
), sizeof(CheckPoint
));
631 wasShutdown
= ((record
->xl_info
& ~XLR_INFO_MASK
) == XLOG_CHECKPOINT_SHUTDOWN
);
633 (errmsg_internal("checkpoint record is at %X/%X",
634 LSN_FORMAT_ARGS(CheckPointLoc
))));
635 InRecovery
= true; /* force recovery even if SHUTDOWNED */
638 * Make sure that REDO location exists. This may not be the case
639 * if there was a crash during an online backup, which left a
640 * backup_label around that references a WAL segment that's
641 * already been archived.
643 if (checkPoint
.redo
< CheckPointLoc
)
645 XLogPrefetcherBeginRead(xlogprefetcher
, checkPoint
.redo
);
646 if (!ReadRecord(xlogprefetcher
, LOG
, false,
647 checkPoint
.ThisTimeLineID
))
649 (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
650 LSN_FORMAT_ARGS(checkPoint
.redo
), LSN_FORMAT_ARGS(CheckPointLoc
)),
651 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
652 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
653 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
654 DataDir
, DataDir
, DataDir
, DataDir
)));
660 (errmsg("could not locate required checkpoint record at %X/%X",
661 LSN_FORMAT_ARGS(CheckPointLoc
)),
662 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
663 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
664 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
665 DataDir
, DataDir
, DataDir
, DataDir
)));
666 wasShutdown
= false; /* keep compiler quiet */
669 /* Read the tablespace_map file if present and create symlinks. */
670 if (read_tablespace_map(&tablespaces
))
674 foreach(lc
, tablespaces
)
676 tablespaceinfo
*ti
= lfirst(lc
);
679 linkloc
= psprintf("%s/%u", PG_TBLSPC_DIR
, ti
->oid
);
682 * Remove the existing symlink if any and Create the symlink
685 remove_tablespace_symlink(linkloc
);
687 if (symlink(ti
->path
, linkloc
) < 0)
689 (errcode_for_file_access(),
690 errmsg("could not create symbolic link \"%s\": %m",
697 /* tell the caller to delete it later */
698 haveTblspcMap
= true;
701 /* tell the caller to delete it later */
702 haveBackupLabel
= true;
706 /* No backup_label file has been found if we are here. */
709 * If tablespace_map file is present without backup_label file, there
710 * is no use of such file. There is no harm in retaining it, but it
711 * is better to get rid of the map file so that we don't have any
712 * redundant file in data directory and it will avoid any sort of
713 * confusion. It seems prudent though to just rename the file out of
714 * the way rather than delete it completely, also we ignore any error
715 * that occurs in rename operation as even if map file is present
716 * without backup_label file, it is harmless.
718 if (stat(TABLESPACE_MAP
, &st
) == 0)
720 unlink(TABLESPACE_MAP_OLD
);
721 if (durable_rename(TABLESPACE_MAP
, TABLESPACE_MAP_OLD
, DEBUG1
) == 0)
723 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
724 TABLESPACE_MAP
, BACKUP_LABEL_FILE
),
725 errdetail("File \"%s\" was renamed to \"%s\".",
726 TABLESPACE_MAP
, TABLESPACE_MAP_OLD
)));
729 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
730 TABLESPACE_MAP
, BACKUP_LABEL_FILE
),
731 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
732 TABLESPACE_MAP
, TABLESPACE_MAP_OLD
)));
736 * It's possible that archive recovery was requested, but we don't
737 * know how far we need to replay the WAL before we reach consistency.
738 * This can happen for example if a base backup is taken from a
739 * running server using an atomic filesystem snapshot, without calling
740 * pg_backup_start/stop. Or if you just kill a running primary server
741 * and put it into archive recovery by creating a recovery signal
744 * Our strategy in that case is to perform crash recovery first,
745 * replaying all the WAL present in pg_wal, and only enter archive
746 * recovery after that.
748 * But usually we already know how far we need to replay the WAL (up
749 * to minRecoveryPoint, up to backupEndPoint, or until we see an
750 * end-of-backup record), and we can enter archive recovery directly.
752 if (ArchiveRecoveryRequested
&&
753 (ControlFile
->minRecoveryPoint
!= InvalidXLogRecPtr
||
754 ControlFile
->backupEndRequired
||
755 ControlFile
->backupEndPoint
!= InvalidXLogRecPtr
||
756 ControlFile
->state
== DB_SHUTDOWNED
))
758 InArchiveRecovery
= true;
759 if (StandbyModeRequested
)
764 * For the same reason as when starting up with backup_label present,
765 * emit a log message when we continue initializing from a base
768 if (!XLogRecPtrIsInvalid(ControlFile
->backupStartPoint
))
770 (errmsg("restarting backup recovery with redo LSN %X/%X",
771 LSN_FORMAT_ARGS(ControlFile
->backupStartPoint
))));
773 /* Get the last valid checkpoint record. */
774 CheckPointLoc
= ControlFile
->checkPoint
;
775 CheckPointTLI
= ControlFile
->checkPointCopy
.ThisTimeLineID
;
776 RedoStartLSN
= ControlFile
->checkPointCopy
.redo
;
777 RedoStartTLI
= ControlFile
->checkPointCopy
.ThisTimeLineID
;
778 record
= ReadCheckpointRecord(xlogprefetcher
, CheckPointLoc
,
783 (errmsg_internal("checkpoint record is at %X/%X",
784 LSN_FORMAT_ARGS(CheckPointLoc
))));
789 * We used to attempt to go back to a secondary checkpoint record
790 * here, but only when not in standby mode. We now just fail if we
791 * can't read the last checkpoint because this allows us to
792 * simplify processing around checkpoints.
795 (errmsg("could not locate a valid checkpoint record at %X/%X",
796 LSN_FORMAT_ARGS(CheckPointLoc
))));
798 memcpy(&checkPoint
, XLogRecGetData(xlogreader
), sizeof(CheckPoint
));
799 wasShutdown
= ((record
->xl_info
& ~XLR_INFO_MASK
) == XLOG_CHECKPOINT_SHUTDOWN
);
802 if (ArchiveRecoveryRequested
)
804 if (StandbyModeRequested
)
806 (errmsg("entering standby mode")));
807 else if (recoveryTarget
== RECOVERY_TARGET_XID
)
809 (errmsg("starting point-in-time recovery to XID %u",
810 recoveryTargetXid
)));
811 else if (recoveryTarget
== RECOVERY_TARGET_TIME
)
813 (errmsg("starting point-in-time recovery to %s",
814 timestamptz_to_str(recoveryTargetTime
))));
815 else if (recoveryTarget
== RECOVERY_TARGET_NAME
)
817 (errmsg("starting point-in-time recovery to \"%s\"",
818 recoveryTargetName
)));
819 else if (recoveryTarget
== RECOVERY_TARGET_LSN
)
821 (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
822 LSN_FORMAT_ARGS(recoveryTargetLSN
))));
823 else if (recoveryTarget
== RECOVERY_TARGET_IMMEDIATE
)
825 (errmsg("starting point-in-time recovery to earliest consistent point")));
828 (errmsg("starting archive recovery")));
832 * If the location of the checkpoint record is not on the expected
833 * timeline in the history of the requested timeline, we cannot proceed:
834 * the backup is not part of the history of the requested timeline.
836 Assert(expectedTLEs
); /* was initialized by reading checkpoint
838 if (tliOfPointInHistory(CheckPointLoc
, expectedTLEs
) !=
841 XLogRecPtr switchpoint
;
844 * tliSwitchPoint will throw an error if the checkpoint's timeline is
845 * not in expectedTLEs at all.
847 switchpoint
= tliSwitchPoint(ControlFile
->checkPointCopy
.ThisTimeLineID
, expectedTLEs
, NULL
);
849 (errmsg("requested timeline %u is not a child of this server's history",
851 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
852 LSN_FORMAT_ARGS(ControlFile
->checkPoint
),
853 ControlFile
->checkPointCopy
.ThisTimeLineID
,
854 LSN_FORMAT_ARGS(switchpoint
))));
858 * The min recovery point should be part of the requested timeline's
861 if (!XLogRecPtrIsInvalid(ControlFile
->minRecoveryPoint
) &&
862 tliOfPointInHistory(ControlFile
->minRecoveryPoint
- 1, expectedTLEs
) !=
863 ControlFile
->minRecoveryPointTLI
)
865 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
867 LSN_FORMAT_ARGS(ControlFile
->minRecoveryPoint
),
868 ControlFile
->minRecoveryPointTLI
)));
871 (errmsg_internal("redo record is at %X/%X; shutdown %s",
872 LSN_FORMAT_ARGS(checkPoint
.redo
),
873 wasShutdown
? "true" : "false")));
875 (errmsg_internal("next transaction ID: " UINT64_FORMAT
"; next OID: %u",
876 U64FromFullTransactionId(checkPoint
.nextXid
),
877 checkPoint
.nextOid
)));
879 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
880 checkPoint
.nextMulti
, checkPoint
.nextMultiOffset
)));
882 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
883 checkPoint
.oldestXid
, checkPoint
.oldestXidDB
)));
885 (errmsg_internal("oldest MultiXactId: %u, in database %u",
886 checkPoint
.oldestMulti
, checkPoint
.oldestMultiDB
)));
888 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
889 checkPoint
.oldestCommitTsXid
,
890 checkPoint
.newestCommitTsXid
)));
891 if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint
.nextXid
)))
893 (errmsg("invalid next transaction ID")));
896 if (checkPoint
.redo
> CheckPointLoc
)
898 (errmsg("invalid redo in checkpoint record")));
901 * Check whether we need to force recovery from WAL. If it appears to
902 * have been a clean shutdown and we did not have a recovery signal file,
903 * then assume no recovery needed.
905 if (checkPoint
.redo
< CheckPointLoc
)
909 (errmsg("invalid redo record in shutdown checkpoint")));
912 else if (ControlFile
->state
!= DB_SHUTDOWNED
)
914 else if (ArchiveRecoveryRequested
)
916 /* force recovery due to presence of recovery signal file */
921 * If recovery is needed, update our in-memory copy of pg_control to show
922 * that we are recovering and to show the selected checkpoint as the place
923 * we are starting from. We also mark pg_control with any minimum recovery
924 * stop point obtained from a backup history file.
926 * We don't write the changes to disk yet, though. Only do that after
927 * initializing various subsystems.
931 if (InArchiveRecovery
)
933 ControlFile
->state
= DB_IN_ARCHIVE_RECOVERY
;
938 (errmsg("database system was not properly shut down; "
939 "automatic recovery in progress")));
940 if (recoveryTargetTLI
> ControlFile
->checkPointCopy
.ThisTimeLineID
)
942 (errmsg("crash recovery starts in timeline %u "
943 "and has target timeline %u",
944 ControlFile
->checkPointCopy
.ThisTimeLineID
,
945 recoveryTargetTLI
)));
946 ControlFile
->state
= DB_IN_CRASH_RECOVERY
;
948 ControlFile
->checkPoint
= CheckPointLoc
;
949 ControlFile
->checkPointCopy
= checkPoint
;
950 if (InArchiveRecovery
)
952 /* initialize minRecoveryPoint if not set yet */
953 if (ControlFile
->minRecoveryPoint
< checkPoint
.redo
)
955 ControlFile
->minRecoveryPoint
= checkPoint
.redo
;
956 ControlFile
->minRecoveryPointTLI
= checkPoint
.ThisTimeLineID
;
961 * Set backupStartPoint if we're starting recovery from a base backup.
963 * Also set backupEndPoint and use minRecoveryPoint as the backup end
964 * location if we're starting recovery from a base backup which was
965 * taken from a standby. In this case, the database system status in
966 * pg_control must indicate that the database was already in recovery.
967 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
968 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
969 * before reaching this point; e.g. because restore_command or
970 * primary_conninfo were faulty.
972 * Any other state indicates that the backup somehow became corrupted
973 * and we can't sensibly continue with recovery.
977 ControlFile
->backupStartPoint
= checkPoint
.redo
;
978 ControlFile
->backupEndRequired
= backupEndRequired
;
980 if (backupFromStandby
)
982 if (dbstate_at_startup
!= DB_IN_ARCHIVE_RECOVERY
&&
983 dbstate_at_startup
!= DB_SHUTDOWNED_IN_RECOVERY
)
985 (errmsg("backup_label contains data inconsistent with control file"),
986 errhint("This means that the backup is corrupted and you will "
987 "have to use another backup for recovery.")));
988 ControlFile
->backupEndPoint
= ControlFile
->minRecoveryPoint
;
993 /* remember these, so that we know when we have reached consistency */
994 backupStartPoint
= ControlFile
->backupStartPoint
;
995 backupEndRequired
= ControlFile
->backupEndRequired
;
996 backupEndPoint
= ControlFile
->backupEndPoint
;
997 if (InArchiveRecovery
)
999 minRecoveryPoint
= ControlFile
->minRecoveryPoint
;
1000 minRecoveryPointTLI
= ControlFile
->minRecoveryPointTLI
;
1004 minRecoveryPoint
= InvalidXLogRecPtr
;
1005 minRecoveryPointTLI
= 0;
1009 * Start recovery assuming that the final record isn't lost.
1011 abortedRecPtr
= InvalidXLogRecPtr
;
1012 missingContrecPtr
= InvalidXLogRecPtr
;
1014 *wasShutdown_ptr
= wasShutdown
;
1015 *haveBackupLabel_ptr
= haveBackupLabel
;
1016 *haveTblspcMap_ptr
= haveTblspcMap
;
1020 * See if there are any recovery signal files and if so, set state for
1023 * See if there is a recovery command file (recovery.conf), and if so
1024 * throw an ERROR since as of PG12 we no longer recognize that.
1027 readRecoverySignalFile(void)
1029 struct stat stat_buf
;
1031 if (IsBootstrapProcessingMode())
1035 * Check for old recovery API file: recovery.conf
1037 if (stat(RECOVERY_COMMAND_FILE
, &stat_buf
) == 0)
1039 (errcode_for_file_access(),
1040 errmsg("using recovery command file \"%s\" is not supported",
1041 RECOVERY_COMMAND_FILE
)));
1044 * Remove unused .done file, if present. Ignore if absent.
1046 unlink(RECOVERY_COMMAND_DONE
);
1049 * Check for recovery signal files and if found, fsync them since they
1050 * represent server state information. We don't sweat too much about the
1051 * possibility of fsync failure, however.
1053 * If present, standby signal file takes precedence. If neither is present
1054 * then we won't enter archive recovery.
1056 if (stat(STANDBY_SIGNAL_FILE
, &stat_buf
) == 0)
1060 fd
= BasicOpenFilePerm(STANDBY_SIGNAL_FILE
, O_RDWR
| PG_BINARY
,
1064 (void) pg_fsync(fd
);
1067 standby_signal_file_found
= true;
1069 else if (stat(RECOVERY_SIGNAL_FILE
, &stat_buf
) == 0)
1073 fd
= BasicOpenFilePerm(RECOVERY_SIGNAL_FILE
, O_RDWR
| PG_BINARY
,
1077 (void) pg_fsync(fd
);
1080 recovery_signal_file_found
= true;
1083 StandbyModeRequested
= false;
1084 ArchiveRecoveryRequested
= false;
1085 if (standby_signal_file_found
)
1087 StandbyModeRequested
= true;
1088 ArchiveRecoveryRequested
= true;
1090 else if (recovery_signal_file_found
)
1092 StandbyModeRequested
= false;
1093 ArchiveRecoveryRequested
= true;
1099 * We don't support standby mode in standalone backends; that requires
1100 * other processes such as the WAL receiver to be alive.
1102 if (StandbyModeRequested
&& !IsUnderPostmaster
)
1104 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED
),
1105 errmsg("standby mode is not supported by single-user servers")));
1109 validateRecoveryParameters(void)
1111 if (!ArchiveRecoveryRequested
)
1115 * Check for compulsory parameters
1117 if (StandbyModeRequested
)
1119 if ((PrimaryConnInfo
== NULL
|| strcmp(PrimaryConnInfo
, "") == 0) &&
1120 (recoveryRestoreCommand
== NULL
|| strcmp(recoveryRestoreCommand
, "") == 0))
1122 (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1123 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1127 if (recoveryRestoreCommand
== NULL
||
1128 strcmp(recoveryRestoreCommand
, "") == 0)
1130 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
1131 errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1135 * Override any inconsistent requests. Note that this is a change of
1136 * behaviour in 9.5; prior to this we simply ignored a request to pause if
1137 * hot_standby = off, which was surprising behaviour.
1139 if (recoveryTargetAction
== RECOVERY_TARGET_ACTION_PAUSE
&&
1141 recoveryTargetAction
= RECOVERY_TARGET_ACTION_SHUTDOWN
;
1144 * Final parsing of recovery_target_time string; see also
1145 * check_recovery_target_time().
1147 if (recoveryTarget
== RECOVERY_TARGET_TIME
)
1149 recoveryTargetTime
= DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in
,
1150 CStringGetDatum(recovery_target_time_string
),
1151 ObjectIdGetDatum(InvalidOid
),
1152 Int32GetDatum(-1)));
1156 * If user specified recovery_target_timeline, validate it or compute the
1157 * "latest" value. We can't do this until after we've gotten the restore
1158 * command and set InArchiveRecovery, because we need to fetch timeline
1159 * history files from the archive.
1161 if (recoveryTargetTimeLineGoal
== RECOVERY_TARGET_TIMELINE_NUMERIC
)
1163 TimeLineID rtli
= recoveryTargetTLIRequested
;
1165 /* Timeline 1 does not have a history file, all else should */
1166 if (rtli
!= 1 && !existsTimeLineHistory(rtli
))
1168 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
1169 errmsg("recovery target timeline %u does not exist",
1171 recoveryTargetTLI
= rtli
;
1173 else if (recoveryTargetTimeLineGoal
== RECOVERY_TARGET_TIMELINE_LATEST
)
1175 /* We start the "latest" search from pg_control's timeline */
1176 recoveryTargetTLI
= findNewestTimeLine(recoveryTargetTLI
);
1181 * else we just use the recoveryTargetTLI as already read from
1184 Assert(recoveryTargetTimeLineGoal
== RECOVERY_TARGET_TIMELINE_CONTROLFILE
);
1189 * read_backup_label: check to see if a backup_label file is present
1191 * If we see a backup_label during recovery, we assume that we are recovering
1192 * from a backup dump file, and we therefore roll forward from the checkpoint
1193 * identified by the label file, NOT what pg_control says. This avoids the
1194 * problem that pg_control might have been archived one or more checkpoints
1195 * later than the start of the dump, and so if we rely on it as the start
1196 * point, we will fail to restore a consistent database state.
1198 * Returns true if a backup_label was found (and fills the checkpoint
1199 * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1200 * returns false if not. If this backup_label came from a streamed backup,
1201 * *backupEndRequired is set to true. If this backup_label was created during
1202 * recovery, *backupFromStandby is set to true.
1204 * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1205 * and TLI read from the backup file.
1208 read_backup_label(XLogRecPtr
*checkPointLoc
, TimeLineID
*backupLabelTLI
,
1209 bool *backupEndRequired
, bool *backupFromStandby
)
1211 char startxlogfilename
[MAXFNAMELEN
];
1212 TimeLineID tli_from_walseg
,
1216 char backuptype
[20];
1217 char backupfrom
[20];
1218 char backuplabel
[MAXPGPATH
];
1219 char backuptime
[128];
1223 /* suppress possible uninitialized-variable warnings */
1224 *checkPointLoc
= InvalidXLogRecPtr
;
1225 *backupLabelTLI
= 0;
1226 *backupEndRequired
= false;
1227 *backupFromStandby
= false;
1230 * See if label file is present
1232 lfp
= AllocateFile(BACKUP_LABEL_FILE
, "r");
1235 if (errno
!= ENOENT
)
1237 (errcode_for_file_access(),
1238 errmsg("could not read file \"%s\": %m",
1239 BACKUP_LABEL_FILE
)));
1240 return false; /* it's not there, all is fine */
1244 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1245 * is pretty crude, but we are not expecting any variability in the file
1248 if (fscanf(lfp
, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1249 &hi
, &lo
, &tli_from_walseg
, startxlogfilename
, &ch
) != 5 || ch
!= '\n')
1251 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1252 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE
)));
1253 RedoStartLSN
= ((uint64
) hi
) << 32 | lo
;
1254 RedoStartTLI
= tli_from_walseg
;
1255 if (fscanf(lfp
, "CHECKPOINT LOCATION: %X/%X%c",
1256 &hi
, &lo
, &ch
) != 3 || ch
!= '\n')
1258 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1259 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE
)));
1260 *checkPointLoc
= ((uint64
) hi
) << 32 | lo
;
1261 *backupLabelTLI
= tli_from_walseg
;
1264 * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1265 * which could mean either pg_basebackup or the pg_backup_start/stop
1266 * method was used) or if this label came from somewhere else (the only
1267 * other option today being from pg_rewind). If this was a streamed
1268 * backup then we know that we need to play through until we get to the
1269 * end of the WAL which was generated during the backup (at which point we
1270 * will have reached consistency and backupEndRequired will be reset to be
1273 if (fscanf(lfp
, "BACKUP METHOD: %19s\n", backuptype
) == 1)
1275 if (strcmp(backuptype
, "streamed") == 0)
1276 *backupEndRequired
= true;
1280 * BACKUP FROM lets us know if this was from a primary or a standby. If
1281 * it was from a standby, we'll double-check that the control file state
1282 * matches that of a standby.
1284 if (fscanf(lfp
, "BACKUP FROM: %19s\n", backupfrom
) == 1)
1286 if (strcmp(backupfrom
, "standby") == 0)
1287 *backupFromStandby
= true;
1291 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1292 * but checking for their presence is useful for debugging and the next
1293 * sanity checks. Cope also with the fact that the result buffers have a
1294 * pre-allocated size, hence if the backup_label file has been generated
1295 * with strings longer than the maximum assumed here an incorrect parsing
1296 * happens. That's fine as only minor consistency checks are done
1299 if (fscanf(lfp
, "START TIME: %127[^\n]\n", backuptime
) == 1)
1301 (errmsg_internal("backup time %s in file \"%s\"",
1302 backuptime
, BACKUP_LABEL_FILE
)));
1304 if (fscanf(lfp
, "LABEL: %1023[^\n]\n", backuplabel
) == 1)
1306 (errmsg_internal("backup label %s in file \"%s\"",
1307 backuplabel
, BACKUP_LABEL_FILE
)));
1310 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1311 * it as a sanity check if present.
1313 if (fscanf(lfp
, "START TIMELINE: %u\n", &tli_from_file
) == 1)
1315 if (tli_from_walseg
!= tli_from_file
)
1317 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1318 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE
),
1319 errdetail("Timeline ID parsed is %u, but expected %u.",
1320 tli_from_file
, tli_from_walseg
)));
1323 (errmsg_internal("backup timeline %u in file \"%s\"",
1324 tli_from_file
, BACKUP_LABEL_FILE
)));
1327 if (fscanf(lfp
, "INCREMENTAL FROM LSN: %X/%X\n", &hi
, &lo
) > 0)
1329 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1330 errmsg("this is an incremental backup, not a data directory"),
1331 errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1333 if (ferror(lfp
) || FreeFile(lfp
))
1335 (errcode_for_file_access(),
1336 errmsg("could not read file \"%s\": %m",
1337 BACKUP_LABEL_FILE
)));
1343 * read_tablespace_map: check to see if a tablespace_map file is present
1345 * If we see a tablespace_map file during recovery, we assume that we are
1346 * recovering from a backup dump file, and we therefore need to create symlinks
1347 * as per the information present in tablespace_map file.
1349 * Returns true if a tablespace_map file was found (and fills *tablespaces
1350 * with a tablespaceinfo struct for each tablespace listed in the file);
1351 * returns false if not.
1354 read_tablespace_map(List
**tablespaces
)
1358 char str
[MAXPGPATH
];
1365 * See if tablespace_map file is present
1367 lfp
= AllocateFile(TABLESPACE_MAP
, "r");
1370 if (errno
!= ENOENT
)
1372 (errcode_for_file_access(),
1373 errmsg("could not read file \"%s\": %m",
1375 return false; /* it's not there, all is fine */
1379 * Read and parse the link name and path lines from tablespace_map file
1380 * (this code is pretty crude, but we are not expecting any variability in
1381 * the file format). De-escape any backslashes that were inserted.
1384 was_backslash
= false;
1385 while ((ch
= fgetc(lfp
)) != EOF
)
1387 if (!was_backslash
&& (ch
== '\n' || ch
== '\r'))
1392 continue; /* \r immediately followed by \n */
1395 * The de-escaped line should contain an OID followed by exactly
1396 * one space followed by a path. The path might start with
1397 * spaces, so don't be too liberal about parsing.
1401 while (str
[n
] && str
[n
] != ' ')
1403 if (n
< 1 || n
>= i
- 1)
1405 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1406 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP
)));
1409 ti
= palloc0(sizeof(tablespaceinfo
));
1411 ti
->oid
= strtoul(str
, &endp
, 10);
1412 if (*endp
!= '\0' || errno
== EINVAL
|| errno
== ERANGE
)
1414 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1415 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP
)));
1416 ti
->path
= pstrdup(str
+ n
);
1417 *tablespaces
= lappend(*tablespaces
, ti
);
1422 else if (!was_backslash
&& ch
== '\\')
1423 was_backslash
= true;
1426 if (i
< sizeof(str
) - 1)
1428 was_backslash
= false;
1432 if (i
!= 0 || was_backslash
) /* last line not terminated? */
1434 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE
),
1435 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP
)));
1437 if (ferror(lfp
) || FreeFile(lfp
))
1439 (errcode_for_file_access(),
1440 errmsg("could not read file \"%s\": %m",
1447 * Finish WAL recovery.
1449 * This does not close the 'xlogreader' yet, because in some cases the caller
1450 * still wants to re-read the last checkpoint record by calling
1451 * ReadCheckpointRecord().
1453 * Returns the position of the last valid or applied record, after which new
1454 * WAL should be appended, information about why recovery was ended, and some
1455 * other things. See the EndOfWalRecoveryInfo struct for details.
1457 EndOfWalRecoveryInfo
*
1458 FinishWalRecovery(void)
1460 EndOfWalRecoveryInfo
*result
= palloc(sizeof(EndOfWalRecoveryInfo
));
1462 TimeLineID lastRecTLI
;
1463 XLogRecPtr endOfLog
;
1466 * Kill WAL receiver, if it's still running, before we continue to write
1467 * the startup checkpoint and aborted-contrecord records. It will trump
1468 * over these records and subsequent ones if it's still alive when we
1469 * start writing WAL.
1471 XLogShutdownWalRcv();
1474 * Shutdown the slot sync worker to drop any temporary slots acquired by
1475 * it and to prevent it from keep trying to fetch the failover slots.
1477 * We do not update the 'synced' column in 'pg_replication_slots' system
1478 * view from true to false here, as any failed update could leave 'synced'
1479 * column false for some slots. This could cause issues during slot sync
1480 * after restarting the server as a standby. While updating the 'synced'
1481 * column after switching to the new timeline is an option, it does not
1482 * simplify the handling for the 'synced' column. Therefore, we retain the
1483 * 'synced' column as true after promotion as it may provide useful
1484 * information about the slot origin.
1489 * We are now done reading the xlog from stream. Turn off streaming
1490 * recovery to force fetching the files (which would be required at end of
1491 * recovery, e.g., timeline history file) from archive or pg_wal.
1493 * Note that standby mode must be turned off after killing WAL receiver,
1494 * i.e., calling XLogShutdownWalRcv().
1496 Assert(!WalRcvStreaming());
1497 StandbyMode
= false;
1500 * Determine where to start writing WAL next.
1502 * Re-fetch the last valid or last applied record, so we can identify the
1503 * exact endpoint of what we consider the valid portion of WAL. There may
1504 * be an incomplete continuation record after that, in which case
1505 * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1506 * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1507 * it is intentionally missing. See CreateOverwriteContrecordRecord().
1509 * An important side-effect of this is to load the last page into
1510 * xlogreader. The caller uses it to initialize the WAL for writing.
1514 lastRec
= CheckPointLoc
;
1515 lastRecTLI
= CheckPointTLI
;
1519 lastRec
= XLogRecoveryCtl
->lastReplayedReadRecPtr
;
1520 lastRecTLI
= XLogRecoveryCtl
->lastReplayedTLI
;
1522 XLogPrefetcherBeginRead(xlogprefetcher
, lastRec
);
1523 (void) ReadRecord(xlogprefetcher
, PANIC
, false, lastRecTLI
);
1524 endOfLog
= xlogreader
->EndRecPtr
;
1527 * Remember the TLI in the filename of the XLOG segment containing the
1528 * end-of-log. It could be different from the timeline that endOfLog
1529 * nominally belongs to, if there was a timeline switch in that segment,
1530 * and we were reading the old WAL from a segment belonging to a higher
1533 result
->endOfLogTLI
= xlogreader
->seg
.ws_tli
;
1535 if (ArchiveRecoveryRequested
)
1538 * We are no longer in archive recovery state.
1540 * We are now done reading the old WAL. Turn off archive fetching if
1543 Assert(InArchiveRecovery
);
1544 InArchiveRecovery
= false;
1547 * If the ending log segment is still open, close it (to avoid
1548 * problems on Windows with trying to rename or delete an open file).
1558 * Copy the last partial block to the caller, for initializing the WAL
1559 * buffer for appending new WAL.
1561 if (endOfLog
% XLOG_BLCKSZ
!= 0)
1565 XLogRecPtr pageBeginPtr
;
1567 pageBeginPtr
= endOfLog
- (endOfLog
% XLOG_BLCKSZ
);
1568 Assert(readOff
== XLogSegmentOffset(pageBeginPtr
, wal_segment_size
));
1570 /* Copy the valid part of the last block */
1571 len
= endOfLog
% XLOG_BLCKSZ
;
1573 memcpy(page
, xlogreader
->readBuf
, len
);
1575 result
->lastPageBeginPtr
= pageBeginPtr
;
1576 result
->lastPage
= page
;
1580 /* There is no partial block to copy. */
1581 result
->lastPageBeginPtr
= endOfLog
;
1582 result
->lastPage
= NULL
;
1586 * Create a comment for the history file to explain why and where timeline
1589 result
->recoveryStopReason
= getRecoveryStopReason();
1591 result
->lastRec
= lastRec
;
1592 result
->lastRecTLI
= lastRecTLI
;
1593 result
->endOfLog
= endOfLog
;
1595 result
->abortedRecPtr
= abortedRecPtr
;
1596 result
->missingContrecPtr
= missingContrecPtr
;
1598 result
->standby_signal_file_found
= standby_signal_file_found
;
1599 result
->recovery_signal_file_found
= recovery_signal_file_found
;
1605 * Clean up the WAL reader and leftovers from restoring WAL from archive
1608 ShutdownWalRecovery(void)
1610 char recoveryPath
[MAXPGPATH
];
1612 /* Final update of pg_stat_recovery_prefetch. */
1613 XLogPrefetcherComputeStats(xlogprefetcher
);
1615 /* Shut down xlogreader */
1621 XLogReaderFree(xlogreader
);
1622 XLogPrefetcherFree(xlogprefetcher
);
1624 if (ArchiveRecoveryRequested
)
1627 * Since there might be a partial WAL segment named RECOVERYXLOG, get
1630 snprintf(recoveryPath
, MAXPGPATH
, XLOGDIR
"/RECOVERYXLOG");
1631 unlink(recoveryPath
); /* ignore any error */
1633 /* Get rid of any remaining recovered timeline-history file, too */
1634 snprintf(recoveryPath
, MAXPGPATH
, XLOGDIR
"/RECOVERYHISTORY");
1635 unlink(recoveryPath
); /* ignore any error */
1639 * We don't need the latch anymore. It's not strictly necessary to disown
1640 * it, but let's do it for the sake of tidiness.
1642 if (ArchiveRecoveryRequested
)
1643 DisownLatch(&XLogRecoveryCtl
->recoveryWakeupLatch
);
1647 * Perform WAL recovery.
1649 * If the system was shut down cleanly, this is never called.
1652 PerformWalRecovery(void)
1655 bool reachedRecoveryTarget
= false;
1656 TimeLineID replayTLI
;
1659 * Initialize shared variables for tracking progress of WAL replay, as if
1660 * we had just replayed the record before the REDO location (or the
1661 * checkpoint record itself, if it's a shutdown checkpoint).
1663 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
1664 if (RedoStartLSN
< CheckPointLoc
)
1666 XLogRecoveryCtl
->lastReplayedReadRecPtr
= InvalidXLogRecPtr
;
1667 XLogRecoveryCtl
->lastReplayedEndRecPtr
= RedoStartLSN
;
1668 XLogRecoveryCtl
->lastReplayedTLI
= RedoStartTLI
;
1672 XLogRecoveryCtl
->lastReplayedReadRecPtr
= xlogreader
->ReadRecPtr
;
1673 XLogRecoveryCtl
->lastReplayedEndRecPtr
= xlogreader
->EndRecPtr
;
1674 XLogRecoveryCtl
->lastReplayedTLI
= CheckPointTLI
;
1676 XLogRecoveryCtl
->replayEndRecPtr
= XLogRecoveryCtl
->lastReplayedEndRecPtr
;
1677 XLogRecoveryCtl
->replayEndTLI
= XLogRecoveryCtl
->lastReplayedTLI
;
1678 XLogRecoveryCtl
->recoveryLastXTime
= 0;
1679 XLogRecoveryCtl
->currentChunkStartTime
= 0;
1680 XLogRecoveryCtl
->recoveryPauseState
= RECOVERY_NOT_PAUSED
;
1681 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
1683 /* Also ensure XLogReceiptTime has a sane value */
1684 XLogReceiptTime
= GetCurrentTimestamp();
1687 * Let postmaster know we've started redo now, so that it can launch the
1688 * archiver if necessary.
1690 if (IsUnderPostmaster
)
1691 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED
);
1694 * Allow read-only connections immediately if we're consistent already.
1696 CheckRecoveryConsistency();
1699 * Find the first record that logically follows the checkpoint --- it
1700 * might physically precede it, though.
1702 if (RedoStartLSN
< CheckPointLoc
)
1704 /* back up to find the record */
1705 replayTLI
= RedoStartTLI
;
1706 XLogPrefetcherBeginRead(xlogprefetcher
, RedoStartLSN
);
1707 record
= ReadRecord(xlogprefetcher
, PANIC
, false, replayTLI
);
1710 * If a checkpoint record's redo pointer points back to an earlier
1711 * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1714 if (record
->xl_rmid
!= RM_XLOG_ID
||
1715 (record
->xl_info
& ~XLR_INFO_MASK
) != XLOG_CHECKPOINT_REDO
)
1717 (errmsg("unexpected record type found at redo point %X/%X",
1718 LSN_FORMAT_ARGS(xlogreader
->ReadRecPtr
))));
1722 /* just have to read next record after CheckPoint */
1723 Assert(xlogreader
->ReadRecPtr
== CheckPointLoc
);
1724 replayTLI
= CheckPointTLI
;
1725 record
= ReadRecord(xlogprefetcher
, LOG
, false, replayTLI
);
1733 pg_rusage_init(&ru0
);
1740 (errmsg("redo starts at %X/%X",
1741 LSN_FORMAT_ARGS(xlogreader
->ReadRecPtr
))));
1743 /* Prepare to report progress of the redo phase. */
1745 begin_startup_progress_phase();
1748 * main redo apply loop
1753 ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1754 LSN_FORMAT_ARGS(xlogreader
->ReadRecPtr
));
1761 initStringInfo(&buf
);
1762 appendStringInfo(&buf
, "REDO @ %X/%X; LSN %X/%X: ",
1763 LSN_FORMAT_ARGS(xlogreader
->ReadRecPtr
),
1764 LSN_FORMAT_ARGS(xlogreader
->EndRecPtr
));
1765 xlog_outrec(&buf
, xlogreader
);
1766 appendStringInfoString(&buf
, " - ");
1767 xlog_outdesc(&buf
, xlogreader
);
1768 elog(LOG
, "%s", buf
.data
);
1773 /* Handle interrupt signals of startup process */
1774 HandleStartupProcInterrupts();
1777 * Pause WAL replay, if requested by a hot-standby session via
1778 * SetRecoveryPause().
1780 * Note that we intentionally don't take the info_lck spinlock
1781 * here. We might therefore read a slightly stale value of the
1782 * recoveryPause flag, but it can't be very stale (no worse than
1783 * the last spinlock we did acquire). Since a pause request is a
1784 * pretty asynchronous thing anyway, possibly responding to it one
1785 * WAL record later than we otherwise would is a minor issue, so
1786 * it doesn't seem worth adding another spinlock cycle to prevent
1789 if (((volatile XLogRecoveryCtlData
*) XLogRecoveryCtl
)->recoveryPauseState
!=
1790 RECOVERY_NOT_PAUSED
)
1791 recoveryPausesHere(false);
1794 * Have we reached our recovery target?
1796 if (recoveryStopsBefore(xlogreader
))
1798 reachedRecoveryTarget
= true;
1803 * If we've been asked to lag the primary, wait on latch until
1804 * enough time has passed.
1806 if (recoveryApplyDelay(xlogreader
))
1809 * We test for paused recovery again here. If user sets
1810 * delayed apply, it may be because they expect to pause
1811 * recovery in case of problems, so we must test again here
1812 * otherwise pausing during the delay-wait wouldn't work.
1814 if (((volatile XLogRecoveryCtlData
*) XLogRecoveryCtl
)->recoveryPauseState
!=
1815 RECOVERY_NOT_PAUSED
)
1816 recoveryPausesHere(false);
1822 ApplyWalRecord(xlogreader
, record
, &replayTLI
);
1824 /* Exit loop if we reached inclusive recovery target */
1825 if (recoveryStopsAfter(xlogreader
))
1827 reachedRecoveryTarget
= true;
1831 /* Else, try to fetch the next WAL record */
1832 record
= ReadRecord(xlogprefetcher
, LOG
, false, replayTLI
);
1833 } while (record
!= NULL
);
1836 * end of main redo apply loop
1839 if (reachedRecoveryTarget
)
1841 if (!reachedConsistency
)
1843 (errmsg("requested recovery stop point is before consistent recovery point")));
1846 * This is the last point where we can restart recovery with a new
1847 * recovery target, if we shutdown and begin again. After this,
1848 * Resource Managers may choose to do permanent corrective actions
1849 * at end of recovery.
1851 switch (recoveryTargetAction
)
1853 case RECOVERY_TARGET_ACTION_SHUTDOWN
:
1856 * exit with special return code to request shutdown of
1857 * postmaster. Log messages issued from postmaster.
1861 case RECOVERY_TARGET_ACTION_PAUSE
:
1862 SetRecoveryPause(true);
1863 recoveryPausesHere(true);
1865 /* drop into promote */
1867 case RECOVERY_TARGET_ACTION_PROMOTE
:
1875 (errmsg("redo done at %X/%X system usage: %s",
1876 LSN_FORMAT_ARGS(xlogreader
->ReadRecPtr
),
1877 pg_rusage_show(&ru0
))));
1878 xtime
= GetLatestXTime();
1881 (errmsg("last completed transaction was at log time %s",
1882 timestamptz_to_str(xtime
))));
1888 /* there are no WAL records following the checkpoint */
1890 (errmsg("redo is not required")));
1894 * This check is intentionally after the above log messages that indicate
1895 * how far recovery went.
1897 if (ArchiveRecoveryRequested
&&
1898 recoveryTarget
!= RECOVERY_TARGET_UNSET
&&
1899 !reachedRecoveryTarget
)
1901 (errcode(ERRCODE_CONFIG_FILE_ERROR
),
1902 errmsg("recovery ended before configured recovery target was reached")));
1906 * Subroutine of PerformWalRecovery, to apply one WAL record.
1909 ApplyWalRecord(XLogReaderState
*xlogreader
, XLogRecord
*record
, TimeLineID
*replayTLI
)
1911 ErrorContextCallback errcallback
;
1912 bool switchedTLI
= false;
1914 /* Setup error traceback support for ereport() */
1915 errcallback
.callback
= rm_redo_error_callback
;
1916 errcallback
.arg
= xlogreader
;
1917 errcallback
.previous
= error_context_stack
;
1918 error_context_stack
= &errcallback
;
1921 * TransamVariables->nextXid must be beyond record's xid.
1923 AdvanceNextFullTransactionIdPastXid(record
->xl_xid
);
1926 * Before replaying this record, check if this record causes the current
1927 * timeline to change. The record is already considered to be part of the
1928 * new timeline, so we update replayTLI before replaying it. That's
1929 * important so that replayEndTLI, which is recorded as the minimum
1930 * recovery point's TLI if recovery stops after this record, is set
1933 if (record
->xl_rmid
== RM_XLOG_ID
)
1935 TimeLineID newReplayTLI
= *replayTLI
;
1936 TimeLineID prevReplayTLI
= *replayTLI
;
1937 uint8 info
= record
->xl_info
& ~XLR_INFO_MASK
;
1939 if (info
== XLOG_CHECKPOINT_SHUTDOWN
)
1941 CheckPoint checkPoint
;
1943 memcpy(&checkPoint
, XLogRecGetData(xlogreader
), sizeof(CheckPoint
));
1944 newReplayTLI
= checkPoint
.ThisTimeLineID
;
1945 prevReplayTLI
= checkPoint
.PrevTimeLineID
;
1947 else if (info
== XLOG_END_OF_RECOVERY
)
1949 xl_end_of_recovery xlrec
;
1951 memcpy(&xlrec
, XLogRecGetData(xlogreader
), sizeof(xl_end_of_recovery
));
1952 newReplayTLI
= xlrec
.ThisTimeLineID
;
1953 prevReplayTLI
= xlrec
.PrevTimeLineID
;
1956 if (newReplayTLI
!= *replayTLI
)
1958 /* Check that it's OK to switch to this TLI */
1959 checkTimeLineSwitch(xlogreader
->EndRecPtr
,
1960 newReplayTLI
, prevReplayTLI
, *replayTLI
);
1962 /* Following WAL records should be run with new TLI */
1963 *replayTLI
= newReplayTLI
;
1969 * Update shared replayEndRecPtr before replaying this record, so that
1970 * XLogFlush will update minRecoveryPoint correctly.
1972 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
1973 XLogRecoveryCtl
->replayEndRecPtr
= xlogreader
->EndRecPtr
;
1974 XLogRecoveryCtl
->replayEndTLI
= *replayTLI
;
1975 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
1978 * If we are attempting to enter Hot Standby mode, process XIDs we see
1980 if (standbyState
>= STANDBY_INITIALIZED
&&
1981 TransactionIdIsValid(record
->xl_xid
))
1982 RecordKnownAssignedTransactionIds(record
->xl_xid
);
1985 * Some XLOG record types that are related to recovery are processed
1986 * directly here, rather than in xlog_redo()
1988 if (record
->xl_rmid
== RM_XLOG_ID
)
1989 xlogrecovery_redo(xlogreader
, *replayTLI
);
1991 /* Now apply the WAL record itself */
1992 GetRmgr(record
->xl_rmid
).rm_redo(xlogreader
);
1995 * After redo, check whether the backup pages associated with the WAL
1996 * record are consistent with the existing pages. This check is done only
1997 * if consistency check is enabled for this record.
1999 if ((record
->xl_info
& XLR_CHECK_CONSISTENCY
) != 0)
2000 verifyBackupPageConsistency(xlogreader
);
2002 /* Pop the error context stack */
2003 error_context_stack
= errcallback
.previous
;
2006 * Update lastReplayedEndRecPtr after this record has been successfully
2009 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
2010 XLogRecoveryCtl
->lastReplayedReadRecPtr
= xlogreader
->ReadRecPtr
;
2011 XLogRecoveryCtl
->lastReplayedEndRecPtr
= xlogreader
->EndRecPtr
;
2012 XLogRecoveryCtl
->lastReplayedTLI
= *replayTLI
;
2013 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
2016 * Wakeup walsenders:
2018 * On the standby, the WAL is flushed first (which will only wake up
2019 * physical walsenders) and then applied, which will only wake up logical
2022 * Indeed, logical walsenders on standby can't decode and send data until
2023 * it's been applied.
2025 * Physical walsenders don't need to be woken up during replay unless
2026 * cascading replication is allowed and time line change occurred (so that
2027 * they can notice that they are on a new time line).
2029 * That's why the wake up conditions are for:
2031 * - physical walsenders in case of new time line and cascade
2032 * replication is allowed
2033 * - logical walsenders in case cascade replication is allowed (could not
2034 * be created otherwise)
2037 if (AllowCascadeReplication())
2038 WalSndWakeup(switchedTLI
, true);
2041 * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2042 * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2043 * a reply to the primary.
2045 if (doRequestWalReceiverReply
)
2047 doRequestWalReceiverReply
= false;
2051 /* Allow read-only connections if we're consistent now */
2052 CheckRecoveryConsistency();
2054 /* Is this a timeline switch? */
2058 * Before we continue on the new timeline, clean up any (possibly
2059 * bogus) future WAL segments on the old timeline.
2061 RemoveNonParentXlogFiles(xlogreader
->EndRecPtr
, *replayTLI
);
2063 /* Reset the prefetcher. */
2064 XLogPrefetchReconfigure();
2069 * Some XLOG RM record types that are directly related to WAL recovery are
2070 * handled here rather than in the xlog_redo()
2073 xlogrecovery_redo(XLogReaderState
*record
, TimeLineID replayTLI
)
2075 uint8 info
= XLogRecGetInfo(record
) & ~XLR_INFO_MASK
;
2076 XLogRecPtr lsn
= record
->EndRecPtr
;
2078 Assert(XLogRecGetRmid(record
) == RM_XLOG_ID
);
2080 if (info
== XLOG_OVERWRITE_CONTRECORD
)
2082 /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2083 xl_overwrite_contrecord xlrec
;
2085 memcpy(&xlrec
, XLogRecGetData(record
), sizeof(xl_overwrite_contrecord
));
2086 if (xlrec
.overwritten_lsn
!= record
->overwrittenRecPtr
)
2087 elog(FATAL
, "mismatching overwritten LSN %X/%X -> %X/%X",
2088 LSN_FORMAT_ARGS(xlrec
.overwritten_lsn
),
2089 LSN_FORMAT_ARGS(record
->overwrittenRecPtr
));
2091 /* We have safely skipped the aborted record */
2092 abortedRecPtr
= InvalidXLogRecPtr
;
2093 missingContrecPtr
= InvalidXLogRecPtr
;
2096 (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2097 LSN_FORMAT_ARGS(xlrec
.overwritten_lsn
),
2098 timestamptz_to_str(xlrec
.overwrite_time
))));
2100 /* Verifying the record should only happen once */
2101 record
->overwrittenRecPtr
= InvalidXLogRecPtr
;
2103 else if (info
== XLOG_BACKUP_END
)
2105 XLogRecPtr startpoint
;
2107 memcpy(&startpoint
, XLogRecGetData(record
), sizeof(startpoint
));
2109 if (backupStartPoint
== startpoint
)
2112 * We have reached the end of base backup, the point where
2113 * pg_backup_stop() was done. The data on disk is now consistent
2114 * (assuming we have also reached minRecoveryPoint). Set
2115 * backupEndPoint to the current LSN, so that the next call to
2116 * CheckRecoveryConsistency() will notice it and do the
2117 * end-of-backup processing.
2119 elog(DEBUG1
, "end of backup record reached");
2121 backupEndPoint
= lsn
;
2124 elog(DEBUG1
, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2125 LSN_FORMAT_ARGS(startpoint
), LSN_FORMAT_ARGS(backupStartPoint
));
2130 * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2133 * Replay of database creation XLOG records for databases that were later
2134 * dropped can create fake directories in pg_tblspc. By the time consistency
2135 * is reached these directories should have been removed; here we verify
2136 * that this did indeed happen. This is to be called at the point where
2137 * consistent state is reached.
2139 * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2140 * useful for testing purposes, and also allows for an escape hatch in case
2144 CheckTablespaceDirectory(void)
2149 dir
= AllocateDir(PG_TBLSPC_DIR
);
2150 while ((de
= ReadDir(dir
, PG_TBLSPC_DIR
)) != NULL
)
2152 char path
[MAXPGPATH
+ sizeof(PG_TBLSPC_DIR
)];
2154 /* Skip entries of non-oid names */
2155 if (strspn(de
->d_name
, "0123456789") != strlen(de
->d_name
))
2158 snprintf(path
, sizeof(path
), "%s/%s", PG_TBLSPC_DIR
, de
->d_name
);
2160 if (get_dirent_type(path
, de
, false, ERROR
) != PGFILETYPE_LNK
)
2161 ereport(allow_in_place_tablespaces
? WARNING
: PANIC
,
2162 (errcode(ERRCODE_DATA_CORRUPTED
),
2163 errmsg("unexpected directory entry \"%s\" found in %s",
2164 de
->d_name
, PG_TBLSPC_DIR
),
2165 errdetail("All directory entries in %s/ should be symbolic links.",
2167 errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2172 * Checks if recovery has reached a consistent state. When consistency is
2173 * reached and we have a valid starting standby snapshot, tell postmaster
2174 * that it can start accepting read-only connections.
2177 CheckRecoveryConsistency(void)
2179 XLogRecPtr lastReplayedEndRecPtr
;
2180 TimeLineID lastReplayedTLI
;
2183 * During crash recovery, we don't reach a consistent state until we've
2184 * replayed all the WAL.
2186 if (XLogRecPtrIsInvalid(minRecoveryPoint
))
2189 Assert(InArchiveRecovery
);
2192 * assume that we are called in the startup process, and hence don't need
2193 * a lock to read lastReplayedEndRecPtr
2195 lastReplayedEndRecPtr
= XLogRecoveryCtl
->lastReplayedEndRecPtr
;
2196 lastReplayedTLI
= XLogRecoveryCtl
->lastReplayedTLI
;
2199 * Have we reached the point where our base backup was completed?
2201 if (!XLogRecPtrIsInvalid(backupEndPoint
) &&
2202 backupEndPoint
<= lastReplayedEndRecPtr
)
2204 XLogRecPtr saveBackupStartPoint
= backupStartPoint
;
2205 XLogRecPtr saveBackupEndPoint
= backupEndPoint
;
2207 elog(DEBUG1
, "end of backup reached");
2210 * We have reached the end of base backup, as indicated by pg_control.
2211 * Update the control file accordingly.
2213 ReachedEndOfBackup(lastReplayedEndRecPtr
, lastReplayedTLI
);
2214 backupStartPoint
= InvalidXLogRecPtr
;
2215 backupEndPoint
= InvalidXLogRecPtr
;
2216 backupEndRequired
= false;
2219 (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2220 LSN_FORMAT_ARGS(saveBackupStartPoint
),
2221 LSN_FORMAT_ARGS(saveBackupEndPoint
))));
2225 * Have we passed our safe starting point? Note that minRecoveryPoint is
2226 * known to be incorrectly set if recovering from a backup, until the
2227 * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2228 * All we know prior to that is that we're not consistent yet.
2230 if (!reachedConsistency
&& !backupEndRequired
&&
2231 minRecoveryPoint
<= lastReplayedEndRecPtr
)
2234 * Check to see if the XLOG sequence contained any unresolved
2235 * references to uninitialized pages.
2237 XLogCheckInvalidPages();
2240 * Check that pg_tblspc doesn't contain any real directories. Replay
2241 * of Database/CREATE_* records may have created fictitious tablespace
2242 * directories that should have been removed by the time consistency
2245 CheckTablespaceDirectory();
2247 reachedConsistency
= true;
2249 (errmsg("consistent recovery state reached at %X/%X",
2250 LSN_FORMAT_ARGS(lastReplayedEndRecPtr
))));
2254 * Have we got a valid starting snapshot that will allow queries to be
2255 * run? If so, we can tell postmaster that the database is consistent now,
2256 * enabling connections.
2258 if (standbyState
== STANDBY_SNAPSHOT_READY
&&
2259 !LocalHotStandbyActive
&&
2260 reachedConsistency
&&
2263 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
2264 XLogRecoveryCtl
->SharedHotStandbyActive
= true;
2265 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
2267 LocalHotStandbyActive
= true;
2269 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY
);
2274 * Error context callback for errors occurring during rm_redo().
2277 rm_redo_error_callback(void *arg
)
2279 XLogReaderState
*record
= (XLogReaderState
*) arg
;
2282 initStringInfo(&buf
);
2283 xlog_outdesc(&buf
, record
);
2284 xlog_block_info(&buf
, record
);
2286 /* translator: %s is a WAL record description */
2287 errcontext("WAL redo at %X/%X for %s",
2288 LSN_FORMAT_ARGS(record
->ReadRecPtr
),
2295 * Returns a string describing an XLogRecord, consisting of its identity
2296 * optionally followed by a colon, a space, and a further description.
2299 xlog_outdesc(StringInfo buf
, XLogReaderState
*record
)
2301 RmgrData rmgr
= GetRmgr(XLogRecGetRmid(record
));
2302 uint8 info
= XLogRecGetInfo(record
);
2305 appendStringInfoString(buf
, rmgr
.rm_name
);
2306 appendStringInfoChar(buf
, '/');
2308 id
= rmgr
.rm_identify(info
);
2310 appendStringInfo(buf
, "UNKNOWN (%X): ", info
& ~XLR_INFO_MASK
);
2312 appendStringInfo(buf
, "%s: ", id
);
2314 rmgr
.rm_desc(buf
, record
);
#ifdef WAL_DEBUG

/*
 * Append to 'buf' the record's previous-record pointer, its xid, its data
 * length, and its block references.  Only built when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	XLogRecPtr	prevptr = XLogRecGetPrev(record);

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 LSN_FORMAT_ARGS(prevptr),
					 XLogRecGetXid(record));

	appendStringInfo(buf, "; len %u",
					 XLogRecGetDataLen(record));

	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2334 * Returns a string giving information about all the blocks in an
2338 xlog_block_info(StringInfo buf
, XLogReaderState
*record
)
2342 /* decode block references */
2343 for (block_id
= 0; block_id
<= XLogRecMaxBlockId(record
); block_id
++)
2345 RelFileLocator rlocator
;
2349 if (!XLogRecGetBlockTagExtended(record
, block_id
,
2350 &rlocator
, &forknum
, &blk
, NULL
))
2353 if (forknum
!= MAIN_FORKNUM
)
2354 appendStringInfo(buf
, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2356 rlocator
.spcOid
, rlocator
.dbOid
,
2361 appendStringInfo(buf
, "; blkref #%d: rel %u/%u/%u, blk %u",
2363 rlocator
.spcOid
, rlocator
.dbOid
,
2366 if (XLogRecHasBlockImage(record
, block_id
))
2367 appendStringInfoString(buf
, " FPW");
2373 * Check that it's OK to switch to new timeline during recovery.
2375 * 'lsn' is the address of the shutdown checkpoint record we're about to
2376 * replay. (Currently, timeline can only change at a shutdown checkpoint).
2379 checkTimeLineSwitch(XLogRecPtr lsn
, TimeLineID newTLI
, TimeLineID prevTLI
,
2380 TimeLineID replayTLI
)
2382 /* Check that the record agrees on what the current (old) timeline is */
2383 if (prevTLI
!= replayTLI
)
2385 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2386 prevTLI
, replayTLI
)));
2389 * The new timeline better be in the list of timelines we expect to see,
2390 * according to the timeline history. It should also not decrease.
2392 if (newTLI
< replayTLI
|| !tliInHistory(newTLI
, expectedTLEs
))
2394 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2395 newTLI
, replayTLI
)));
2398 * If we have not yet reached min recovery point, and we're about to
2399 * switch to a timeline greater than the timeline of the min recovery
2400 * point: trouble. After switching to the new timeline, we could not
2401 * possibly visit the min recovery point on the correct timeline anymore.
2402 * This can happen if there is a newer timeline in the archive that
2403 * branched before the timeline the min recovery point is on, and you
2404 * attempt to do PITR to the new timeline.
2406 if (!XLogRecPtrIsInvalid(minRecoveryPoint
) &&
2407 lsn
< minRecoveryPoint
&&
2408 newTLI
> minRecoveryPointTLI
)
2410 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2412 LSN_FORMAT_ARGS(minRecoveryPoint
),
2413 minRecoveryPointTLI
)));
2420 * Extract timestamp from WAL record.
2422 * If the record contains a timestamp, returns true, and saves the timestamp
2423 * in *recordXtime. If the record type has no timestamp, returns false.
2424 * Currently, only transaction commit/abort records and restore points contain
2428 getRecordTimestamp(XLogReaderState
*record
, TimestampTz
*recordXtime
)
2430 uint8 info
= XLogRecGetInfo(record
) & ~XLR_INFO_MASK
;
2431 uint8 xact_info
= info
& XLOG_XACT_OPMASK
;
2432 uint8 rmid
= XLogRecGetRmid(record
);
2434 if (rmid
== RM_XLOG_ID
&& info
== XLOG_RESTORE_POINT
)
2436 *recordXtime
= ((xl_restore_point
*) XLogRecGetData(record
))->rp_time
;
2439 if (rmid
== RM_XACT_ID
&& (xact_info
== XLOG_XACT_COMMIT
||
2440 xact_info
== XLOG_XACT_COMMIT_PREPARED
))
2442 *recordXtime
= ((xl_xact_commit
*) XLogRecGetData(record
))->xact_time
;
2445 if (rmid
== RM_XACT_ID
&& (xact_info
== XLOG_XACT_ABORT
||
2446 xact_info
== XLOG_XACT_ABORT_PREPARED
))
2448 *recordXtime
= ((xl_xact_abort
*) XLogRecGetData(record
))->xact_time
;
2455 * Checks whether the current buffer page and backup page stored in the
2456 * WAL record are consistent or not. Before comparing the two pages, a
2457 * masking can be applied to the pages to ignore certain areas like hint bits,
2458 * unused space between pd_lower and pd_upper among other things. This
2459 * function should be called once WAL replay has been completed for a
2463 verifyBackupPageConsistency(XLogReaderState
*record
)
2465 RmgrData rmgr
= GetRmgr(XLogRecGetRmid(record
));
2466 RelFileLocator rlocator
;
2471 /* Records with no backup blocks have no need for consistency checks. */
2472 if (!XLogRecHasAnyBlockRefs(record
))
2475 Assert((XLogRecGetInfo(record
) & XLR_CHECK_CONSISTENCY
) != 0);
2477 for (block_id
= 0; block_id
<= XLogRecMaxBlockId(record
); block_id
++)
2482 if (!XLogRecGetBlockTagExtended(record
, block_id
,
2483 &rlocator
, &forknum
, &blkno
, NULL
))
2486 * WAL record doesn't contain a block reference with the given id.
2492 Assert(XLogRecHasBlockImage(record
, block_id
));
2494 if (XLogRecBlockImageApply(record
, block_id
))
2497 * WAL record has already applied the page, so bypass the
2498 * consistency check as that would result in comparing the full
2499 * page stored in the record with itself.
2505 * Read the contents from the current buffer and store it in a
2508 buf
= XLogReadBufferExtended(rlocator
, forknum
, blkno
,
2511 if (!BufferIsValid(buf
))
2514 LockBuffer(buf
, BUFFER_LOCK_EXCLUSIVE
);
2515 page
= BufferGetPage(buf
);
2518 * Take a copy of the local page where WAL has been applied to have a
2519 * comparison base before masking it...
2521 memcpy(replay_image_masked
, page
, BLCKSZ
);
2523 /* No need for this page anymore now that a copy is in. */
2524 UnlockReleaseBuffer(buf
);
2527 * If the block LSN is already ahead of this WAL record, we can't
2528 * expect contents to match. This can happen if recovery is
2531 if (PageGetLSN(replay_image_masked
) > record
->EndRecPtr
)
2535 * Read the contents from the backup copy, stored in WAL record and
2536 * store it in a temporary page. There is no need to allocate a new
2537 * page here, a local buffer is fine to hold its contents and a mask
2538 * can be directly applied on it.
2540 if (!RestoreBlockImage(record
, block_id
, primary_image_masked
))
2542 (errcode(ERRCODE_INTERNAL_ERROR
),
2543 errmsg_internal("%s", record
->errormsg_buf
)));
2546 * If masking function is defined, mask both the primary and replay
2549 if (rmgr
.rm_mask
!= NULL
)
2551 rmgr
.rm_mask(replay_image_masked
, blkno
);
2552 rmgr
.rm_mask(primary_image_masked
, blkno
);
2555 /* Time to compare the primary and replay images. */
2556 if (memcmp(replay_image_masked
, primary_image_masked
, BLCKSZ
) != 0)
2559 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2560 rlocator
.spcOid
, rlocator
.dbOid
, rlocator
.relNumber
,
2567 * For point-in-time recovery, this function decides whether we want to
2568 * stop applying the XLOG before the current record.
2570 * Returns true if we are stopping, false otherwise. If stopping, some
2571 * information is saved in recoveryStopXid et al for use in annotating the
2572 * new timeline's history file.
2575 recoveryStopsBefore(XLogReaderState
*record
)
2577 bool stopsHere
= false;
2580 TimestampTz recordXtime
= 0;
2581 TransactionId recordXid
;
2584 * Ignore recovery target settings when not in archive recovery (meaning
2585 * we are in crash recovery).
2587 if (!ArchiveRecoveryRequested
)
2590 /* Check if we should stop as soon as reaching consistency */
2591 if (recoveryTarget
== RECOVERY_TARGET_IMMEDIATE
&& reachedConsistency
)
2594 (errmsg("recovery stopping after reaching consistency")));
2596 recoveryStopAfter
= false;
2597 recoveryStopXid
= InvalidTransactionId
;
2598 recoveryStopLSN
= InvalidXLogRecPtr
;
2599 recoveryStopTime
= 0;
2600 recoveryStopName
[0] = '\0';
2604 /* Check if target LSN has been reached */
2605 if (recoveryTarget
== RECOVERY_TARGET_LSN
&&
2606 !recoveryTargetInclusive
&&
2607 record
->ReadRecPtr
>= recoveryTargetLSN
)
2609 recoveryStopAfter
= false;
2610 recoveryStopXid
= InvalidTransactionId
;
2611 recoveryStopLSN
= record
->ReadRecPtr
;
2612 recoveryStopTime
= 0;
2613 recoveryStopName
[0] = '\0';
2615 (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2616 LSN_FORMAT_ARGS(recoveryStopLSN
))));
2620 /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2621 if (XLogRecGetRmid(record
) != RM_XACT_ID
)
2624 xact_info
= XLogRecGetInfo(record
) & XLOG_XACT_OPMASK
;
2626 if (xact_info
== XLOG_XACT_COMMIT
)
2629 recordXid
= XLogRecGetXid(record
);
2631 else if (xact_info
== XLOG_XACT_COMMIT_PREPARED
)
2633 xl_xact_commit
*xlrec
= (xl_xact_commit
*) XLogRecGetData(record
);
2634 xl_xact_parsed_commit parsed
;
2637 ParseCommitRecord(XLogRecGetInfo(record
),
2640 recordXid
= parsed
.twophase_xid
;
2642 else if (xact_info
== XLOG_XACT_ABORT
)
2645 recordXid
= XLogRecGetXid(record
);
2647 else if (xact_info
== XLOG_XACT_ABORT_PREPARED
)
2649 xl_xact_abort
*xlrec
= (xl_xact_abort
*) XLogRecGetData(record
);
2650 xl_xact_parsed_abort parsed
;
2653 ParseAbortRecord(XLogRecGetInfo(record
),
2656 recordXid
= parsed
.twophase_xid
;
2661 if (recoveryTarget
== RECOVERY_TARGET_XID
&& !recoveryTargetInclusive
)
2664 * There can be only one transaction end record with this exact
2667 * when testing for an xid, we MUST test for equality only, since
2668 * transactions are numbered in the order they start, not the order
2669 * they complete. A higher numbered xid will complete before you about
2670 * 50% of the time...
2672 stopsHere
= (recordXid
== recoveryTargetXid
);
2676 * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2677 * We don't expect getRecordTimestamp ever to fail, since we already know
2678 * this is a commit or abort record; but test its result anyway.
2680 if (getRecordTimestamp(record
, &recordXtime
) &&
2681 recoveryTarget
== RECOVERY_TARGET_TIME
)
2684 * There can be many transactions that share the same commit time, so
2685 * we stop after the last one, if we are inclusive, or stop at the
2686 * first one if we are exclusive
2688 if (recoveryTargetInclusive
)
2689 stopsHere
= (recordXtime
> recoveryTargetTime
);
2691 stopsHere
= (recordXtime
>= recoveryTargetTime
);
2696 recoveryStopAfter
= false;
2697 recoveryStopXid
= recordXid
;
2698 recoveryStopTime
= recordXtime
;
2699 recoveryStopLSN
= InvalidXLogRecPtr
;
2700 recoveryStopName
[0] = '\0';
2705 (errmsg("recovery stopping before commit of transaction %u, time %s",
2707 timestamptz_to_str(recoveryStopTime
))));
2712 (errmsg("recovery stopping before abort of transaction %u, time %s",
2714 timestamptz_to_str(recoveryStopTime
))));
2722 * Same as recoveryStopsBefore, but called after applying the record.
2724 * We also track the timestamp of the latest applied COMMIT/ABORT
2725 * record in XLogRecoveryCtl->recoveryLastXTime.
2728 recoveryStopsAfter(XLogReaderState
*record
)
2733 TimestampTz recordXtime
= 0;
2736 * Ignore recovery target settings when not in archive recovery (meaning
2737 * we are in crash recovery).
2739 if (!ArchiveRecoveryRequested
)
2742 info
= XLogRecGetInfo(record
) & ~XLR_INFO_MASK
;
2743 rmid
= XLogRecGetRmid(record
);
2746 * There can be many restore points that share the same name; we stop at
2749 if (recoveryTarget
== RECOVERY_TARGET_NAME
&&
2750 rmid
== RM_XLOG_ID
&& info
== XLOG_RESTORE_POINT
)
2752 xl_restore_point
*recordRestorePointData
;
2754 recordRestorePointData
= (xl_restore_point
*) XLogRecGetData(record
);
2756 if (strcmp(recordRestorePointData
->rp_name
, recoveryTargetName
) == 0)
2758 recoveryStopAfter
= true;
2759 recoveryStopXid
= InvalidTransactionId
;
2760 recoveryStopLSN
= InvalidXLogRecPtr
;
2761 (void) getRecordTimestamp(record
, &recoveryStopTime
);
2762 strlcpy(recoveryStopName
, recordRestorePointData
->rp_name
, MAXFNAMELEN
);
2765 (errmsg("recovery stopping at restore point \"%s\", time %s",
2767 timestamptz_to_str(recoveryStopTime
))));
2772 /* Check if the target LSN has been reached */
2773 if (recoveryTarget
== RECOVERY_TARGET_LSN
&&
2774 recoveryTargetInclusive
&&
2775 record
->ReadRecPtr
>= recoveryTargetLSN
)
2777 recoveryStopAfter
= true;
2778 recoveryStopXid
= InvalidTransactionId
;
2779 recoveryStopLSN
= record
->ReadRecPtr
;
2780 recoveryStopTime
= 0;
2781 recoveryStopName
[0] = '\0';
2783 (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2784 LSN_FORMAT_ARGS(recoveryStopLSN
))));
2788 if (rmid
!= RM_XACT_ID
)
2791 xact_info
= info
& XLOG_XACT_OPMASK
;
2793 if (xact_info
== XLOG_XACT_COMMIT
||
2794 xact_info
== XLOG_XACT_COMMIT_PREPARED
||
2795 xact_info
== XLOG_XACT_ABORT
||
2796 xact_info
== XLOG_XACT_ABORT_PREPARED
)
2798 TransactionId recordXid
;
2800 /* Update the last applied transaction timestamp */
2801 if (getRecordTimestamp(record
, &recordXtime
))
2802 SetLatestXTime(recordXtime
);
2804 /* Extract the XID of the committed/aborted transaction */
2805 if (xact_info
== XLOG_XACT_COMMIT_PREPARED
)
2807 xl_xact_commit
*xlrec
= (xl_xact_commit
*) XLogRecGetData(record
);
2808 xl_xact_parsed_commit parsed
;
2810 ParseCommitRecord(XLogRecGetInfo(record
),
2813 recordXid
= parsed
.twophase_xid
;
2815 else if (xact_info
== XLOG_XACT_ABORT_PREPARED
)
2817 xl_xact_abort
*xlrec
= (xl_xact_abort
*) XLogRecGetData(record
);
2818 xl_xact_parsed_abort parsed
;
2820 ParseAbortRecord(XLogRecGetInfo(record
),
2823 recordXid
= parsed
.twophase_xid
;
2826 recordXid
= XLogRecGetXid(record
);
2829 * There can be only one transaction end record with this exact
2832 * when testing for an xid, we MUST test for equality only, since
2833 * transactions are numbered in the order they start, not the order
2834 * they complete. A higher numbered xid will complete before you about
2835 * 50% of the time...
2837 if (recoveryTarget
== RECOVERY_TARGET_XID
&& recoveryTargetInclusive
&&
2838 recordXid
== recoveryTargetXid
)
2840 recoveryStopAfter
= true;
2841 recoveryStopXid
= recordXid
;
2842 recoveryStopTime
= recordXtime
;
2843 recoveryStopLSN
= InvalidXLogRecPtr
;
2844 recoveryStopName
[0] = '\0';
2846 if (xact_info
== XLOG_XACT_COMMIT
||
2847 xact_info
== XLOG_XACT_COMMIT_PREPARED
)
2850 (errmsg("recovery stopping after commit of transaction %u, time %s",
2852 timestamptz_to_str(recoveryStopTime
))));
2854 else if (xact_info
== XLOG_XACT_ABORT
||
2855 xact_info
== XLOG_XACT_ABORT_PREPARED
)
2858 (errmsg("recovery stopping after abort of transaction %u, time %s",
2860 timestamptz_to_str(recoveryStopTime
))));
2866 /* Check if we should stop as soon as reaching consistency */
2867 if (recoveryTarget
== RECOVERY_TARGET_IMMEDIATE
&& reachedConsistency
)
2870 (errmsg("recovery stopping after reaching consistency")));
2872 recoveryStopAfter
= true;
2873 recoveryStopXid
= InvalidTransactionId
;
2874 recoveryStopTime
= 0;
2875 recoveryStopLSN
= InvalidXLogRecPtr
;
2876 recoveryStopName
[0] = '\0';
2884 * Create a comment for the history file to explain why and where
2888 getRecoveryStopReason(void)
2892 if (recoveryTarget
== RECOVERY_TARGET_XID
)
2893 snprintf(reason
, sizeof(reason
),
2894 "%s transaction %u",
2895 recoveryStopAfter
? "after" : "before",
2897 else if (recoveryTarget
== RECOVERY_TARGET_TIME
)
2898 snprintf(reason
, sizeof(reason
),
2900 recoveryStopAfter
? "after" : "before",
2901 timestamptz_to_str(recoveryStopTime
));
2902 else if (recoveryTarget
== RECOVERY_TARGET_LSN
)
2903 snprintf(reason
, sizeof(reason
),
2905 recoveryStopAfter
? "after" : "before",
2906 LSN_FORMAT_ARGS(recoveryStopLSN
));
2907 else if (recoveryTarget
== RECOVERY_TARGET_NAME
)
2908 snprintf(reason
, sizeof(reason
),
2909 "at restore point \"%s\"",
2911 else if (recoveryTarget
== RECOVERY_TARGET_IMMEDIATE
)
2912 snprintf(reason
, sizeof(reason
), "reached consistency");
2914 snprintf(reason
, sizeof(reason
), "no recovery target specified");
2916 return pstrdup(reason
);
2920 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2922 * endOfRecovery is true if the recovery target is reached and
2923 * the paused state starts at the end of recovery because of
2924 * recovery_target_action=pause, and false otherwise.
2927 recoveryPausesHere(bool endOfRecovery
)
2929 /* Don't pause unless users can connect! */
2930 if (!LocalHotStandbyActive
)
2933 /* Don't pause after standby promotion has been triggered */
2934 if (LocalPromoteIsTriggered
)
2939 (errmsg("pausing at the end of recovery"),
2940 errhint("Execute pg_wal_replay_resume() to promote.")));
2943 (errmsg("recovery has paused"),
2944 errhint("Execute pg_wal_replay_resume() to continue.")));
2946 /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2947 while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED
)
2949 HandleStartupProcInterrupts();
2950 if (CheckForStandbyTrigger())
2954 * If recovery pause is requested then set it paused. While we are in
2955 * the loop, user might resume and pause again so set this every time.
2957 ConfirmRecoveryPaused();
2960 * We wait on a condition variable that will wake us as soon as the
2961 * pause ends, but we use a timeout so we can check the above exit
2962 * condition periodically too.
2964 ConditionVariableTimedSleep(&XLogRecoveryCtl
->recoveryNotPausedCV
, 1000,
2965 WAIT_EVENT_RECOVERY_PAUSE
);
2967 ConditionVariableCancelSleep();
2971 * When recovery_min_apply_delay is set, we wait long enough to make sure
2972 * certain record types are applied at least that interval behind the primary.
2974 * Returns true if we waited.
2976 * Note that the delay is calculated between the WAL record log time and
2977 * the current time on standby. We would prefer to keep track of when this
2978 * standby received each WAL record, which would allow a more consistent
2979 * approach and one not affected by time synchronisation issues, but that
2980 * is significantly more effort and complexity for little actual gain in
2984 recoveryApplyDelay(XLogReaderState
*record
)
2988 TimestampTz delayUntil
;
2991 /* nothing to do if no delay configured */
2992 if (recovery_min_apply_delay
<= 0)
2995 /* no delay is applied on a database not yet consistent */
2996 if (!reachedConsistency
)
2999 /* nothing to do if crash recovery is requested */
3000 if (!ArchiveRecoveryRequested
)
3004 * Is it a COMMIT record?
3006 * We deliberately choose not to delay aborts since they have no effect on
3007 * MVCC. We already allow replay of records that don't have a timestamp,
3008 * so there is already opportunity for issues caused by early conflicts on
3011 if (XLogRecGetRmid(record
) != RM_XACT_ID
)
3014 xact_info
= XLogRecGetInfo(record
) & XLOG_XACT_OPMASK
;
3016 if (xact_info
!= XLOG_XACT_COMMIT
&&
3017 xact_info
!= XLOG_XACT_COMMIT_PREPARED
)
3020 if (!getRecordTimestamp(record
, &xtime
))
3023 delayUntil
= TimestampTzPlusMilliseconds(xtime
, recovery_min_apply_delay
);
3026 * Exit without arming the latch if it's already past time to apply this
3029 msecs
= TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil
);
3035 ResetLatch(&XLogRecoveryCtl
->recoveryWakeupLatch
);
3037 /* This might change recovery_min_apply_delay. */
3038 HandleStartupProcInterrupts();
3040 if (CheckForStandbyTrigger())
3044 * Recalculate delayUntil as recovery_min_apply_delay could have
3045 * changed while waiting in this loop.
3047 delayUntil
= TimestampTzPlusMilliseconds(xtime
, recovery_min_apply_delay
);
3050 * Wait for difference between GetCurrentTimestamp() and delayUntil.
3052 msecs
= TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3058 elog(DEBUG2
, "recovery apply delay %ld milliseconds", msecs
);
3060 (void) WaitLatch(&XLogRecoveryCtl
->recoveryWakeupLatch
,
3061 WL_LATCH_SET
| WL_TIMEOUT
| WL_EXIT_ON_PM_DEATH
,
3063 WAIT_EVENT_RECOVERY_APPLY_DELAY
);
3069 * Get the current state of the recovery pause request.
3072 GetRecoveryPauseState(void)
3074 RecoveryPauseState state
;
3076 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
3077 state
= XLogRecoveryCtl
->recoveryPauseState
;
3078 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
3084 * Set the recovery pause state.
3086 * If recovery pause is requested then sets the recovery pause state to
3087 * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3088 * to 'not paused' to resume the recovery. The recovery pause will be
3089 * confirmed by the ConfirmRecoveryPaused.
3092 SetRecoveryPause(bool recoveryPause
)
3094 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
3097 XLogRecoveryCtl
->recoveryPauseState
= RECOVERY_NOT_PAUSED
;
3098 else if (XLogRecoveryCtl
->recoveryPauseState
== RECOVERY_NOT_PAUSED
)
3099 XLogRecoveryCtl
->recoveryPauseState
= RECOVERY_PAUSE_REQUESTED
;
3101 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
3104 ConditionVariableBroadcast(&XLogRecoveryCtl
->recoveryNotPausedCV
);
3108 * Confirm the recovery pause by setting the recovery pause state to
3112 ConfirmRecoveryPaused(void)
3114 /* If recovery pause is requested then set it paused */
3115 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
3116 if (XLogRecoveryCtl
->recoveryPauseState
== RECOVERY_PAUSE_REQUESTED
)
3117 XLogRecoveryCtl
->recoveryPauseState
= RECOVERY_PAUSED
;
3118 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
3123 * Attempt to read the next XLOG record.
3125 * Before first call, the reader needs to be positioned to the first record
3126 * by calling XLogPrefetcherBeginRead().
3128 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3129 * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3130 * record is available.
3133 ReadRecord(XLogPrefetcher
*xlogprefetcher
, int emode
,
3134 bool fetching_ckpt
, TimeLineID replayTLI
)
3137 XLogReaderState
*xlogreader
= XLogPrefetcherGetReader(xlogprefetcher
);
3138 XLogPageReadPrivate
*private = (XLogPageReadPrivate
*) xlogreader
->private_data
;
3140 /* Pass through parameters to XLogPageRead */
3141 private->fetching_ckpt
= fetching_ckpt
;
3142 private->emode
= emode
;
3143 private->randAccess
= (xlogreader
->ReadRecPtr
== InvalidXLogRecPtr
);
3144 private->replayTLI
= replayTLI
;
3146 /* This is the first attempt to read this page. */
3147 lastSourceFailed
= false;
3153 record
= XLogPrefetcherReadRecord(xlogprefetcher
, &errormsg
);
3157 * When we find that WAL ends in an incomplete record, keep track
3158 * of that record. After recovery is done, we'll write a record
3159 * to indicate to downstream WAL readers that that portion is to
3162 * However, when ArchiveRecoveryRequested = true, we're going to
3163 * switch to a new timeline at the end of recovery. We will only
3164 * copy WAL over to the new timeline up to the end of the last
3165 * complete record, so if we did this, we would later create an
3166 * overwrite contrecord in the wrong place, breaking everything.
3168 if (!ArchiveRecoveryRequested
&&
3169 !XLogRecPtrIsInvalid(xlogreader
->abortedRecPtr
))
3171 abortedRecPtr
= xlogreader
->abortedRecPtr
;
3172 missingContrecPtr
= xlogreader
->missingContrecPtr
;
3182 * We only end up here without a message when XLogPageRead()
3183 * failed - in that case we already logged something. In
3184 * StandbyMode that only happens if we have been triggered, so we
3185 * shouldn't loop anymore in that case.
3188 ereport(emode_for_corrupt_record(emode
, xlogreader
->EndRecPtr
),
3189 (errmsg_internal("%s", errormsg
) /* already translated */ ));
3193 * Check page TLI is one of the expected values.
3195 else if (!tliInHistory(xlogreader
->latestPageTLI
, expectedTLEs
))
3197 char fname
[MAXFNAMELEN
];
3201 XLByteToSeg(xlogreader
->latestPagePtr
, segno
, wal_segment_size
);
3202 offset
= XLogSegmentOffset(xlogreader
->latestPagePtr
,
3204 XLogFileName(fname
, xlogreader
->seg
.ws_tli
, segno
,
3206 ereport(emode_for_corrupt_record(emode
, xlogreader
->EndRecPtr
),
3207 (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3208 xlogreader
->latestPageTLI
,
3210 LSN_FORMAT_ARGS(xlogreader
->latestPagePtr
),
3217 /* Great, got a record */
3222 /* No valid record available from this source */
3223 lastSourceFailed
= true;
3226 * If archive recovery was requested, but we were still doing
3227 * crash recovery, switch to archive recovery and retry using the
3228 * offline archive. We have now replayed all the valid WAL in
3229 * pg_wal, so we are presumably now consistent.
3231 * We require that there's at least some valid WAL present in
3232 * pg_wal, however (!fetching_ckpt). We could recover using the
3233 * WAL from the archive, even if pg_wal is completely empty, but
3234 * we'd have no idea how far we'd have to replay to reach
3235 * consistency. So err on the safe side and give up.
3237 if (!InArchiveRecovery
&& ArchiveRecoveryRequested
&&
3241 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3242 InArchiveRecovery
= true;
3243 if (StandbyModeRequested
)
3244 EnableStandbyMode();
3246 SwitchIntoArchiveRecovery(xlogreader
->EndRecPtr
, replayTLI
);
3247 minRecoveryPoint
= xlogreader
->EndRecPtr
;
3248 minRecoveryPointTLI
= replayTLI
;
3250 CheckRecoveryConsistency();
3253 * Before we retry, reset lastSourceFailed and currentSource
3254 * so that we will check the archive next.
3256 lastSourceFailed
= false;
3257 currentSource
= XLOG_FROM_ANY
;
3262 /* In standby mode, loop back to retry. Otherwise, give up. */
3263 if (StandbyMode
&& !CheckForStandbyTrigger())
3272 * Read the XLOG page containing targetPagePtr into readBuf (if not read
3273 * already). Returns number of bytes read, if the page is read successfully,
3274 * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3275 * but only if they have not been previously reported.
3277 * See XLogReaderRoutine.page_read for more details.
3279 * While prefetching, xlogreader->nonblocking may be set. In that case,
3280 * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3282 * This is responsible for restoring files from archive as needed, as well
3283 * as for waiting for the requested WAL record to arrive in standby mode.
3285 * xlogreader->private_data->emode specifies the log level used for reporting
3286 * "file not found" or "end of WAL" situations in archive recovery, or in
3287 * standby mode when promotion is triggered. If set to WARNING or below,
3288 * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3289 * levels the ereport() won't return.
3291 * In standby mode, if after a successful return of XLogPageRead() the
3292 * caller finds the record it's interested in to be broken, it should
3293 * ereport the error with the level determined by
3294 * emode_for_corrupt_record(), and then set lastSourceFailed
3295 * and call XLogPageRead() again with the same arguments. This lets
3296 * XLogPageRead() try fetching the record from another source, or to
3300 XLogPageRead(XLogReaderState
*xlogreader
, XLogRecPtr targetPagePtr
, int reqLen
,
3301 XLogRecPtr targetRecPtr
, char *readBuf
)
3303 XLogPageReadPrivate
*private =
3304 (XLogPageReadPrivate
*) xlogreader
->private_data
;
3305 int emode
= private->emode
;
3306 uint32 targetPageOff
;
3307 XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY
;
3310 XLByteToSeg(targetPagePtr
, targetSegNo
, wal_segment_size
);
3311 targetPageOff
= XLogSegmentOffset(targetPagePtr
, wal_segment_size
);
3314 * See if we need to switch to a new segment because the requested record
3315 * is not in the currently open one.
3317 if (readFile
>= 0 &&
3318 !XLByteInSeg(targetPagePtr
, readSegNo
, wal_segment_size
))
3321 * Request a restartpoint if we've replayed too much xlog since the
3324 if (ArchiveRecoveryRequested
&& IsUnderPostmaster
)
3326 if (XLogCheckpointNeeded(readSegNo
))
3328 (void) GetRedoRecPtr();
3329 if (XLogCheckpointNeeded(readSegNo
))
3330 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG
);
3336 readSource
= XLOG_FROM_ANY
;
3339 XLByteToSeg(targetPagePtr
, readSegNo
, wal_segment_size
);
3342 /* See if we need to retrieve more data */
3344 (readSource
== XLOG_FROM_STREAM
&&
3345 flushedUpto
< targetPagePtr
+ reqLen
))
3347 if (readFile
>= 0 &&
3348 xlogreader
->nonblocking
&&
3349 readSource
== XLOG_FROM_STREAM
&&
3350 flushedUpto
< targetPagePtr
+ reqLen
)
3351 return XLREAD_WOULDBLOCK
;
3353 switch (WaitForWALToBecomeAvailable(targetPagePtr
+ reqLen
,
3354 private->randAccess
,
3355 private->fetching_ckpt
,
3358 xlogreader
->EndRecPtr
,
3359 xlogreader
->nonblocking
))
3361 case XLREAD_WOULDBLOCK
:
3362 return XLREAD_WOULDBLOCK
;
3368 readSource
= XLOG_FROM_ANY
;
3370 case XLREAD_SUCCESS
:
3376 * At this point, we have the right segment open and if we're streaming we
3377 * know the requested record is in it.
3379 Assert(readFile
!= -1);
3382 * If the current segment is being streamed from the primary, calculate
3383 * how much of the current page we have received already. We know the
3384 * requested record has been received, but this is for the benefit of
3385 * future calls, to allow quick exit at the top of this function.
3387 if (readSource
== XLOG_FROM_STREAM
)
3389 if (((targetPagePtr
) / XLOG_BLCKSZ
) != (flushedUpto
/ XLOG_BLCKSZ
))
3390 readLen
= XLOG_BLCKSZ
;
3392 readLen
= XLogSegmentOffset(flushedUpto
, wal_segment_size
) -
3396 readLen
= XLOG_BLCKSZ
;
3398 /* Read the requested page */
3399 readOff
= targetPageOff
;
3401 pgstat_report_wait_start(WAIT_EVENT_WAL_READ
);
3402 r
= pg_pread(readFile
, readBuf
, XLOG_BLCKSZ
, (off_t
) readOff
);
3403 if (r
!= XLOG_BLCKSZ
)
3405 char fname
[MAXFNAMELEN
];
3406 int save_errno
= errno
;
3408 pgstat_report_wait_end();
3409 XLogFileName(fname
, curFileTLI
, readSegNo
, wal_segment_size
);
3413 ereport(emode_for_corrupt_record(emode
, targetPagePtr
+ reqLen
),
3414 (errcode_for_file_access(),
3415 errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3416 fname
, LSN_FORMAT_ARGS(targetPagePtr
),
3420 ereport(emode_for_corrupt_record(emode
, targetPagePtr
+ reqLen
),
3421 (errcode(ERRCODE_DATA_CORRUPTED
),
3422 errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3423 fname
, LSN_FORMAT_ARGS(targetPagePtr
),
3424 readOff
, r
, (Size
) XLOG_BLCKSZ
)));
3425 goto next_record_is_invalid
;
3427 pgstat_report_wait_end();
3429 Assert(targetSegNo
== readSegNo
);
3430 Assert(targetPageOff
== readOff
);
3431 Assert(reqLen
<= readLen
);
3433 xlogreader
->seg
.ws_tli
= curFileTLI
;
3436 * Check the page header immediately, so that we can retry immediately if
3437 * it's not valid. This may seem unnecessary, because ReadPageInternal()
3438 * validates the page header anyway, and would propagate the failure up to
3439 * ReadRecord(), which would retry. However, there's a corner case with
3440 * continuation records, if a record is split across two pages such that
3441 * we would need to read the two pages from different sources across two
3444 * The first page is only available locally, in pg_wal, because it's
3445 * already been recycled on the primary. The second page, however, is not
3446 * present in pg_wal, and we should stream it from the primary. There is a
3447 * recycled WAL segment present in pg_wal, with garbage contents, however.
3448 * We would read the first page from the local WAL segment, but when
3449 * reading the second page, we would read the bogus, recycled, WAL
3450 * segment. If we didn't catch that case here, we would never recover,
3451 * because ReadRecord() would retry reading the whole record from the
3454 * Of course, this only catches errors in the page header, which is what
3455 * happens in the case of a recycled WAL segment. Other kinds of errors or
3456 * corruption still have the same problem. But this at least fixes the
3457 * common case, which can happen as part of normal operation.
3459 * Validating the page header is cheap enough that doing it twice
3460 * shouldn't be a big deal from a performance point of view.
3462 * When not in standby mode, an invalid page header should cause recovery
3463 * to end, not retry reading the page, so we don't need to validate the
3464 * page header here for the retry. Instead, ReadPageInternal() is
3465 * responsible for the validation.
3468 (targetPagePtr
% wal_segment_size
) == 0 &&
3469 !XLogReaderValidatePageHeader(xlogreader
, targetPagePtr
, readBuf
))
3472 * Emit this error right now then retry this page immediately. Use
3473 * errmsg_internal() because the message was already translated.
3475 if (xlogreader
->errormsg_buf
[0])
3476 ereport(emode_for_corrupt_record(emode
, xlogreader
->EndRecPtr
),
3477 (errmsg_internal("%s", xlogreader
->errormsg_buf
)));
3479 /* reset any error XLogReaderValidatePageHeader() might have set */
3480 XLogReaderResetError(xlogreader
);
3481 goto next_record_is_invalid
;
3486 next_record_is_invalid
:
3489 * If we're reading ahead, give up fast. Retries and error reporting will
3490 * be handled by a later read when recovery catches up to this point.
3492 if (xlogreader
->nonblocking
)
3493 return XLREAD_WOULDBLOCK
;
3495 lastSourceFailed
= true;
3501 readSource
= XLOG_FROM_ANY
;
3503 /* In standby-mode, keep trying */
3511 * Open the WAL segment containing WAL location 'RecPtr'.
3513 * The segment can be fetched via restore_command, or via walreceiver having
3514 * streamed the record, or it can already be present in pg_wal. Checking
3515 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3516 * too, in case someone copies a new segment directly to pg_wal. That is not
3517 * documented or recommended, though.
3519 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3520 * prepare to read WAL starting from RedoStartLSN after this.
3522 * 'RecPtr' might not point to the beginning of the record we're interested
3523 * in, it might also point to the page or segment header. In that case,
3524 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3525 * used to decide which timeline to stream the requested WAL from.
3527 * 'replayLSN' is the current replay LSN, so that if we scan for new
3528 * timelines, we can reject a switch to a timeline that branched off before
3531 * If the record is not immediately available, the function returns XLREAD_FAIL
3532 * if we're not in standby mode. In standby mode, waits for it to become
3535 * When the requested record becomes available, the function opens the file
3536 * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3537 * of standby mode is triggered by the user, and there is no more WAL
3538 * available, returns XLREAD_FAIL.
3540 * If nonblocking is true, then give up immediately if we can't satisfy the
3541 * request, returning XLREAD_WOULDBLOCK instead of waiting.
3543 static XLogPageReadResult
3544 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr
, bool randAccess
,
3545 bool fetching_ckpt
, XLogRecPtr tliRecPtr
,
3546 TimeLineID replayTLI
, XLogRecPtr replayLSN
,
3549 static TimestampTz last_fail_time
= 0;
3551 bool streaming_reply_sent
= false;
3554 * Standby mode is implemented by a state machine:
3556 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3557 * pg_wal (XLOG_FROM_PG_WAL)
3558 * 2. Check for promotion trigger request
3559 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3560 * 4. Rescan timelines
3561 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3563 * Failure to read from the current source advances the state machine to
3566 * 'currentSource' indicates the current state. There are no currentSource
3567 * values for "check trigger", "rescan timelines", and "sleep" states,
3568 * those actions are taken when reading from the previous source fails, as
3569 * part of advancing to the next state.
3571 * If standby mode is turned off while reading WAL from stream, we move
3572 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3573 * the files (which would be required at end of recovery, e.g., timeline
3574 * history file) from archive or pg_wal. We don't need to kill WAL receiver
3575 * here because it's already stopped when standby mode is turned off at
3576 * the end of recovery.
3579 if (!InArchiveRecovery
)
3580 currentSource
= XLOG_FROM_PG_WAL
;
3581 else if (currentSource
== XLOG_FROM_ANY
||
3582 (!StandbyMode
&& currentSource
== XLOG_FROM_STREAM
))
3584 lastSourceFailed
= false;
3585 currentSource
= XLOG_FROM_ARCHIVE
;
3590 XLogSource oldSource
= currentSource
;
3591 bool startWalReceiver
= false;
3594 * First check if we failed to read from the current source, and
3595 * advance the state machine if so. The failure to read might've
3596 * happened outside this function, e.g. when a CRC check fails on a
3597 * record, or within this loop.
3599 if (lastSourceFailed
)
3602 * Don't allow any retry loops to occur during nonblocking
3603 * readahead. Let the caller process everything that has been
3604 * decoded already first.
3607 return XLREAD_WOULDBLOCK
;
3609 switch (currentSource
)
3611 case XLOG_FROM_ARCHIVE
:
3612 case XLOG_FROM_PG_WAL
:
3615 * Check to see if promotion is requested. Note that we do
3616 * this only after failure, so when you promote, we still
3617 * finish replaying as much as we can from archive and
3618 * pg_wal before failover.
3620 if (StandbyMode
&& CheckForStandbyTrigger())
3622 XLogShutdownWalRcv();
3627 * Not in standby mode, and we've now tried the archive
3634 * Move to XLOG_FROM_STREAM state, and set to start a
3635 * walreceiver if necessary.
3637 currentSource
= XLOG_FROM_STREAM
;
3638 startWalReceiver
= true;
3641 case XLOG_FROM_STREAM
:
3644 * Failure while streaming. Most likely, we got here
3645 * because streaming replication was terminated, or
3646 * promotion was triggered. But we also get here if we
3647 * find an invalid record in the WAL streamed from the
3648 * primary, in which case something is seriously wrong.
3649 * There's little chance that the problem will just go
3650 * away, but PANIC is not good for availability either,
3651 * especially in hot standby mode. So, we treat that the
3652 * same as disconnection, and retry from archive/pg_wal
3653 * again. The WAL in the archive should be identical to
3654 * what was streamed, so it's unlikely that it helps, but
3659 * We should be able to move to XLOG_FROM_STREAM only in
3662 Assert(StandbyMode
);
3665 * Before we leave XLOG_FROM_STREAM state, make sure that
3666 * walreceiver is not active, so that it won't overwrite
3667 * WAL that we restore from archive.
3669 XLogShutdownWalRcv();
3672 * Before we sleep, re-scan for possible new timelines if
3673 * we were requested to recover to the latest timeline.
3675 if (recoveryTargetTimeLineGoal
== RECOVERY_TARGET_TIMELINE_LATEST
)
3677 if (rescanLatestTimeLine(replayTLI
, replayLSN
))
3679 currentSource
= XLOG_FROM_ARCHIVE
;
3685 * XLOG_FROM_STREAM is the last state in our state
3686 * machine, so we've exhausted all the options for
3687 * obtaining the requested WAL. We're going to loop back
3688 * and retry from the archive, but if it hasn't been long
3689 * since last attempt, sleep wal_retrieve_retry_interval
3690 * milliseconds to avoid busy-waiting.
3692 now
= GetCurrentTimestamp();
3693 if (!TimestampDifferenceExceeds(last_fail_time
, now
,
3694 wal_retrieve_retry_interval
))
3698 wait_time
= wal_retrieve_retry_interval
-
3699 TimestampDifferenceMilliseconds(last_fail_time
, now
);
3701 elog(LOG
, "waiting for WAL to become available at %X/%X",
3702 LSN_FORMAT_ARGS(RecPtr
));
3704 /* Do background tasks that might benefit us later. */
3705 KnownAssignedTransactionIdsIdleMaintenance();
3707 (void) WaitLatch(&XLogRecoveryCtl
->recoveryWakeupLatch
,
3708 WL_LATCH_SET
| WL_TIMEOUT
|
3709 WL_EXIT_ON_PM_DEATH
,
3711 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL
);
3712 ResetLatch(&XLogRecoveryCtl
->recoveryWakeupLatch
);
3713 now
= GetCurrentTimestamp();
3715 /* Handle interrupt signals of startup process */
3716 HandleStartupProcInterrupts();
3718 last_fail_time
= now
;
3719 currentSource
= XLOG_FROM_ARCHIVE
;
3723 elog(ERROR
, "unexpected WAL source %d", currentSource
);
3726 else if (currentSource
== XLOG_FROM_PG_WAL
)
3729 * We just successfully read a file in pg_wal. We prefer files in
3730 * the archive over ones in pg_wal, so try the next file again
3731 * from the archive first.
3733 if (InArchiveRecovery
)
3734 currentSource
= XLOG_FROM_ARCHIVE
;
3737 if (currentSource
!= oldSource
)
3738 elog(DEBUG2
, "switched WAL source from %s to %s after %s",
3739 xlogSourceNames
[oldSource
], xlogSourceNames
[currentSource
],
3740 lastSourceFailed
? "failure" : "success");
3743 * We've now handled possible failure. Try to read from the chosen
3746 lastSourceFailed
= false;
3748 switch (currentSource
)
3750 case XLOG_FROM_ARCHIVE
:
3751 case XLOG_FROM_PG_WAL
:
3754 * WAL receiver must not be running when reading WAL from
3755 * archive or pg_wal.
3757 Assert(!WalRcvStreaming());
3759 /* Close any old file we might have open. */
3765 /* Reset curFileTLI if random fetch. */
3770 * Try to restore the file from archive, or read an existing
3773 readFile
= XLogFileReadAnyTLI(readSegNo
,
3774 currentSource
== XLOG_FROM_ARCHIVE
? XLOG_FROM_ANY
:
3777 return XLREAD_SUCCESS
; /* success! */
3780 * Nope, not found in archive or pg_wal.
3782 lastSourceFailed
= true;
3785 case XLOG_FROM_STREAM
:
3790 * We should be able to move to XLOG_FROM_STREAM only in
3793 Assert(StandbyMode
);
3796 * First, shutdown walreceiver if its restart has been
3797 * requested -- but no point if we're already slated for
3800 if (pendingWalRcvRestart
&& !startWalReceiver
)
3802 XLogShutdownWalRcv();
3805 * Re-scan for possible new timelines if we were
3806 * requested to recover to the latest timeline.
3808 if (recoveryTargetTimeLineGoal
==
3809 RECOVERY_TARGET_TIMELINE_LATEST
)
3810 rescanLatestTimeLine(replayTLI
, replayLSN
);
3812 startWalReceiver
= true;
3814 pendingWalRcvRestart
= false;
3817 * Launch walreceiver if needed.
3819 * If fetching_ckpt is true, RecPtr points to the initial
3820 * checkpoint location. In that case, we use RedoStartLSN
3821 * as the streaming start position instead of RecPtr, so
3822 * that when we later jump backwards to start redo at
3823 * RedoStartLSN, we will have the logs streamed already.
3825 if (startWalReceiver
&&
3826 PrimaryConnInfo
&& strcmp(PrimaryConnInfo
, "") != 0)
3841 * Use the record begin position to determine the
3842 * TLI, rather than the position we're reading.
3844 tli
= tliOfPointInHistory(tliRecPtr
, expectedTLEs
);
3846 if (curFileTLI
> 0 && tli
< curFileTLI
)
3847 elog(ERROR
, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3848 LSN_FORMAT_ARGS(tliRecPtr
),
3852 SetInstallXLogFileSegmentActive();
3853 RequestXLogStreaming(tli
, ptr
, PrimaryConnInfo
,
3855 wal_receiver_create_temp_slot
);
3860 * Check if WAL receiver is active or wait to start up.
3862 if (!WalRcvStreaming())
3864 lastSourceFailed
= true;
3869 * Walreceiver is active, so see if new data has arrived.
3871 * We only advance XLogReceiptTime when we obtain fresh
3872 * WAL from walreceiver and observe that we had already
3873 * processed everything before the most recent "chunk"
3874 * that it flushed to disk. In steady state where we are
3875 * keeping up with the incoming data, XLogReceiptTime will
3876 * be updated on each cycle. When we are behind,
3877 * XLogReceiptTime will not advance, so the grace time
3878 * allotted to conflicting queries will decrease.
3880 if (RecPtr
< flushedUpto
)
3884 XLogRecPtr latestChunkStart
;
3886 flushedUpto
= GetWalRcvFlushRecPtr(&latestChunkStart
, &receiveTLI
);
3887 if (RecPtr
< flushedUpto
&& receiveTLI
== curFileTLI
)
3890 if (latestChunkStart
<= RecPtr
)
3892 XLogReceiptTime
= GetCurrentTimestamp();
3893 SetCurrentChunkStartTime(XLogReceiptTime
);
3902 * Great, streamed far enough. Open the file if it's
3903 * not open already. Also read the timeline history
3904 * file if we haven't initialized timeline history
3905 * yet; it should be streamed over and present in
3906 * pg_wal by now. Use XLOG_FROM_STREAM so that source
3907 * info is set correctly and XLogReceiptTime isn't
3910 * NB: We must set readTimeLineHistory based on
3911 * recoveryTargetTLI, not receiveTLI. Normally they'll
3912 * be the same, but if recovery_target_timeline is
3913 * 'latest' and archiving is configured, then it's
3914 * possible that we managed to retrieve one or more
3915 * new timeline history files from the archive,
3916 * updating recoveryTargetTLI.
3921 expectedTLEs
= readTimeLineHistory(recoveryTargetTLI
);
3922 readFile
= XLogFileRead(readSegNo
, receiveTLI
,
3923 XLOG_FROM_STREAM
, false);
3924 Assert(readFile
>= 0);
3928 /* just make sure source info is correct... */
3929 readSource
= XLOG_FROM_STREAM
;
3930 XLogReceiptSource
= XLOG_FROM_STREAM
;
3931 return XLREAD_SUCCESS
;
3936 /* In nonblocking mode, return rather than sleeping. */
3938 return XLREAD_WOULDBLOCK
;
3941 * Data not here yet. Check for trigger, then wait for
3942 * walreceiver to wake us up when new WAL arrives.
3944 if (CheckForStandbyTrigger())
3947 * Note that we don't return XLREAD_FAIL immediately
3948 * here. After being triggered, we still want to
3949 * replay all the WAL that was already streamed. It's
3950 * in pg_wal now, so we just treat this as a failure,
3951 * and the state machine will move on to replay the
3952 * streamed WAL from pg_wal, and then recheck the
3953 * trigger and exit replay.
3955 lastSourceFailed
= true;
3960 * Since we have replayed everything we have received so
3961 * far and are about to start waiting for more WAL, let's
3962 * tell the upstream server our replay location now so
3963 * that pg_stat_replication doesn't show stale
3966 if (!streaming_reply_sent
)
3969 streaming_reply_sent
= true;
3972 /* Do any background tasks that might benefit us later. */
3973 KnownAssignedTransactionIdsIdleMaintenance();
3975 /* Update pg_stat_recovery_prefetch before sleeping. */
3976 XLogPrefetcherComputeStats(xlogprefetcher
);
3979 * Wait for more WAL to arrive, when we will be woken
3980 * immediately by the WAL receiver.
3982 (void) WaitLatch(&XLogRecoveryCtl
->recoveryWakeupLatch
,
3983 WL_LATCH_SET
| WL_EXIT_ON_PM_DEATH
,
3985 WAIT_EVENT_RECOVERY_WAL_STREAM
);
3986 ResetLatch(&XLogRecoveryCtl
->recoveryWakeupLatch
);
3991 elog(ERROR
, "unexpected WAL source %d", currentSource
);
3995 * Check for recovery pause here so that we can confirm more quickly
3996 * that a requested pause has actually taken effect.
3998 if (((volatile XLogRecoveryCtlData
*) XLogRecoveryCtl
)->recoveryPauseState
!=
3999 RECOVERY_NOT_PAUSED
)
4000 recoveryPausesHere(false);
4003 * This possibly-long loop needs to handle interrupts of startup
4006 HandleStartupProcInterrupts();
4009 return XLREAD_FAIL
; /* not reached */
4014 * Determine what log level should be used to report a corrupt WAL record
4015 * in the current WAL page, previously read by XLogPageRead().
4017 * 'emode' is the error mode that would be used to report a file-not-found
4018 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4019 * we're retrying the exact same record that we've tried previously, only
4020 * complain the first time to keep the noise down. However, we only do so when
4021 * reading from pg_wal, because we don't expect any invalid records in archive
4022 * or in records streamed from the primary. Files in the archive should be complete,
4023 * and we should never hit the end of WAL because we stop and wait for more WAL
4024 * to arrive before replaying it.
4026 * NOTE: This function remembers the RecPtr value it was last called with,
4027 * to suppress repeated messages about the same record. Only call this when
4028 * you are about to ereport(), or you might cause a later message to be
4029 * erroneously suppressed.
4032 emode_for_corrupt_record(int emode
, XLogRecPtr RecPtr
)
4034 static XLogRecPtr lastComplaint
= 0;
4036 if (readSource
== XLOG_FROM_PG_WAL
&& emode
== LOG
)
4038 if (RecPtr
== lastComplaint
)
4041 lastComplaint
= RecPtr
;
4048 * Subroutine to try to fetch and validate a prior checkpoint record.
4051 ReadCheckpointRecord(XLogPrefetcher
*xlogprefetcher
, XLogRecPtr RecPtr
,
4052 TimeLineID replayTLI
)
4057 Assert(xlogreader
!= NULL
);
4059 if (!XRecOffIsValid(RecPtr
))
4062 (errmsg("invalid checkpoint location")));
4066 XLogPrefetcherBeginRead(xlogprefetcher
, RecPtr
);
4067 record
= ReadRecord(xlogprefetcher
, LOG
, true, replayTLI
);
4072 (errmsg("invalid checkpoint record")));
4075 if (record
->xl_rmid
!= RM_XLOG_ID
)
4078 (errmsg("invalid resource manager ID in checkpoint record")));
4081 info
= record
->xl_info
& ~XLR_INFO_MASK
;
4082 if (info
!= XLOG_CHECKPOINT_SHUTDOWN
&&
4083 info
!= XLOG_CHECKPOINT_ONLINE
)
4086 (errmsg("invalid xl_info in checkpoint record")));
4089 if (record
->xl_tot_len
!= SizeOfXLogRecord
+ SizeOfXLogRecordDataHeaderShort
+ sizeof(CheckPoint
))
4092 (errmsg("invalid length of checkpoint record")));
4099 * Scan for new timelines that might have appeared in the archive since we
4102 * If there are any, the function changes recovery target TLI to the latest
4103 * one and returns 'true'.
4106 rescanLatestTimeLine(TimeLineID replayTLI
, XLogRecPtr replayLSN
)
4108 List
*newExpectedTLEs
;
4111 TimeLineID newtarget
;
4112 TimeLineID oldtarget
= recoveryTargetTLI
;
4113 TimeLineHistoryEntry
*currentTle
= NULL
;
4115 newtarget
= findNewestTimeLine(recoveryTargetTLI
);
4116 if (newtarget
== recoveryTargetTLI
)
4118 /* No new timelines found */
4123 * Determine the list of expected TLIs for the new TLI
4126 newExpectedTLEs
= readTimeLineHistory(newtarget
);
4129 * If the current timeline is not part of the history of the new timeline,
4130 * we cannot proceed to it.
4133 foreach(cell
, newExpectedTLEs
)
4135 currentTle
= (TimeLineHistoryEntry
*) lfirst(cell
);
4137 if (currentTle
->tli
== recoveryTargetTLI
)
4146 (errmsg("new timeline %u is not a child of database system timeline %u",
4153 * The current timeline was found in the history file, but check that the
4154 * next timeline was forked off from it *after* the current recovery
4157 if (currentTle
->end
< replayLSN
)
4160 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4163 LSN_FORMAT_ARGS(replayLSN
))));
4167 /* The new timeline history seems valid. Switch target */
4168 recoveryTargetTLI
= newtarget
;
4169 list_free_deep(expectedTLEs
);
4170 expectedTLEs
= newExpectedTLEs
;
4173 * As in StartupXLOG(), try to ensure we have all the history files
4174 * between the old target and new target in pg_wal.
4176 restoreTimeLineHistoryFiles(oldtarget
+ 1, newtarget
);
4179 (errmsg("new target timeline is %u",
4180 recoveryTargetTLI
)));
4187 * Open a logfile segment for reading (during recovery).
4189 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4190 * Otherwise, it's assumed to be already available in pg_wal.
4193 XLogFileRead(XLogSegNo segno
, TimeLineID tli
,
4194 XLogSource source
, bool notfoundOk
)
4196 char xlogfname
[MAXFNAMELEN
];
4197 char activitymsg
[MAXFNAMELEN
+ 16];
4198 char path
[MAXPGPATH
];
4201 XLogFileName(xlogfname
, tli
, segno
, wal_segment_size
);
4205 case XLOG_FROM_ARCHIVE
:
4206 /* Report recovery progress in PS display */
4207 snprintf(activitymsg
, sizeof(activitymsg
), "waiting for %s",
4209 set_ps_display(activitymsg
);
4211 if (!RestoreArchivedFile(path
, xlogfname
,
4218 case XLOG_FROM_PG_WAL
:
4219 case XLOG_FROM_STREAM
:
4220 XLogFilePath(path
, tli
, segno
, wal_segment_size
);
4224 elog(ERROR
, "invalid XLogFileRead source %d", source
);
4228 * If the segment was fetched from archival storage, replace the existing
4229 * xlog segment (if any) with the archival version.
4231 if (source
== XLOG_FROM_ARCHIVE
)
4233 Assert(!IsInstallXLogFileSegmentActive());
4234 KeepFileRestoredFromArchive(path
, xlogfname
);
4237 * Set path to point at the new file in pg_wal.
4239 snprintf(path
, MAXPGPATH
, XLOGDIR
"/%s", xlogfname
);
4242 fd
= BasicOpenFile(path
, O_RDONLY
| PG_BINARY
);
4248 /* Report recovery progress in PS display */
4249 snprintf(activitymsg
, sizeof(activitymsg
), "recovering %s",
4251 set_ps_display(activitymsg
);
4253 /* Track source of data in assorted state variables */
4254 readSource
= source
;
4255 XLogReceiptSource
= source
;
4256 /* In FROM_STREAM case, caller tracks receipt time, not me */
4257 if (source
!= XLOG_FROM_STREAM
)
4258 XLogReceiptTime
= GetCurrentTimestamp();
4262 if (errno
!= ENOENT
|| !notfoundOk
) /* unexpected failure? */
4264 (errcode_for_file_access(),
4265 errmsg("could not open file \"%s\": %m", path
)));
4270 * Open a logfile segment for reading (during recovery).
4272 * This version searches for the segment with any TLI listed in expectedTLEs.
4275 XLogFileReadAnyTLI(XLogSegNo segno
, XLogSource source
)
4277 char path
[MAXPGPATH
];
4283 * Loop looking for a suitable timeline ID: we might need to read any of
4284 * the timelines listed in expectedTLEs.
4286 * We expect curFileTLI on entry to be the TLI of the preceding file in
4287 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4288 * to go backwards; this prevents us from picking up the wrong file when a
4289 * parent timeline extends to higher segment numbers than the child we
4292 * If we haven't read the timeline history file yet, read it now, so that
4293 * we know which TLIs to scan. We don't save the list in expectedTLEs,
4294 * however, unless we actually find a valid segment. That way if there is
4295 * neither a timeline history file nor a WAL segment in the archive, and
4296 * streaming replication is set up, we'll read the timeline history file
4297 * streamed from the primary when we start streaming, instead of
4298 * recovering with a dummy history generated here.
4301 tles
= expectedTLEs
;
4303 tles
= readTimeLineHistory(recoveryTargetTLI
);
4307 TimeLineHistoryEntry
*hent
= (TimeLineHistoryEntry
*) lfirst(cell
);
4308 TimeLineID tli
= hent
->tli
;
4310 if (tli
< curFileTLI
)
4311 break; /* don't bother looking at too-old TLIs */
4314 * Skip scanning the timeline ID that the logfile segment to read
4317 if (hent
->begin
!= InvalidXLogRecPtr
)
4319 XLogSegNo beginseg
= 0;
4321 XLByteToSeg(hent
->begin
, beginseg
, wal_segment_size
);
4324 * The logfile segment that doesn't belong to the timeline is
4325 * older or newer than the segment that the timeline started or
4326 * ended at, respectively. It's sufficient to check only the
4327 * starting segment of the timeline here. Since the timelines are
4328 * scanned in descending order in this loop, any segments newer
4329 * than the ending segment should belong to a newer timeline and
4330 * have already been read before. So it's not necessary to check
4331 * the ending segment of the timeline here.
4333 if (segno
< beginseg
)
4337 if (source
== XLOG_FROM_ANY
|| source
== XLOG_FROM_ARCHIVE
)
4339 fd
= XLogFileRead(segno
, tli
, XLOG_FROM_ARCHIVE
, true);
4342 elog(DEBUG1
, "got WAL segment from archive");
4344 expectedTLEs
= tles
;
4349 if (source
== XLOG_FROM_ANY
|| source
== XLOG_FROM_PG_WAL
)
4351 fd
= XLogFileRead(segno
, tli
, XLOG_FROM_PG_WAL
, true);
4355 expectedTLEs
= tles
;
4361 /* Couldn't find it. For simplicity, complain about front timeline */
4362 XLogFilePath(path
, recoveryTargetTLI
, segno
, wal_segment_size
);
4365 (errcode_for_file_access(),
4366 errmsg("could not open file \"%s\": %m", path
)));
4371 * Set flag to signal the walreceiver to restart. (The startup process calls
4372 * this on noticing a relevant configuration change.)
4375 StartupRequestWalReceiverRestart(void)
4377 if (currentSource
== XLOG_FROM_STREAM
&& WalRcvRunning())
4380 (errmsg("WAL receiver process shutdown requested")));
4382 pendingWalRcvRestart
= true;
4388 * Has a standby promotion already been triggered?
4390 * Unlike CheckForStandbyTrigger(), this works in any process
4391 * that's connected to shared memory.
4394 PromoteIsTriggered(void)
4397 * We check shared state each time only until a standby promotion is
4398 * triggered. We can't trigger a promotion again, so there's no need to
4399 * keep checking after the shared variable has once been seen true.
4401 if (LocalPromoteIsTriggered
)
4404 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
4405 LocalPromoteIsTriggered
= XLogRecoveryCtl
->SharedPromoteIsTriggered
;
4406 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
4408 return LocalPromoteIsTriggered
;
4412 SetPromoteIsTriggered(void)
4414 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
4415 XLogRecoveryCtl
->SharedPromoteIsTriggered
= true;
4416 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
4419 * Mark the recovery pause state as 'not paused' because the paused state
4420 * ends and promotion continues if a promotion is triggered while recovery
4421 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4422 * return 'paused' while a promotion is ongoing.
4424 SetRecoveryPause(false);
4426 LocalPromoteIsTriggered
= true;
4430 * Check whether a promote request has arrived.
4433 CheckForStandbyTrigger(void)
4435 if (LocalPromoteIsTriggered
)
4438 if (IsPromoteSignaled() && CheckPromoteSignal())
4440 ereport(LOG
, (errmsg("received promote request")));
4441 RemovePromoteSignalFiles();
4442 ResetPromoteSignaled();
4443 SetPromoteIsTriggered();
4451 * Remove the files signaling a standby promotion request.
4454 RemovePromoteSignalFiles(void)
4456 unlink(PROMOTE_SIGNAL_FILE
);
4460 * Check to see if a promote request has arrived.
4463 CheckPromoteSignal(void)
4465 struct stat stat_buf
;
4467 if (stat(PROMOTE_SIGNAL_FILE
, &stat_buf
) == 0)
4474 * Wake up startup process to replay newly arrived WAL, or to notice that
4475 * failover has been requested.
4478 WakeupRecovery(void)
4480 SetLatch(&XLogRecoveryCtl
->recoveryWakeupLatch
);
4484 * Schedule a walreceiver wakeup in the main recovery loop.
4487 XLogRequestWalReceiverReply(void)
4489 doRequestWalReceiverReply
= true;
4493 * Is HotStandby active yet? This is only important in special backends
4494 * since normal backends won't ever be able to connect until this returns
4495 * true. Postmaster knows this by way of signal, not via shared memory.
4497 * Unlike testing standbyState, this works in any process that's connected to
4498 * shared memory. (And note that standbyState alone doesn't tell the truth
4502 HotStandbyActive(void)
4505 * We check shared state each time only until Hot Standby is active. We
4506 * can't de-activate Hot Standby, so there's no need to keep checking
4507 * after the shared variable has once been seen true.
4509 if (LocalHotStandbyActive
)
4513 /* spinlock is essential on machines with weak memory ordering! */
4514 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
4515 LocalHotStandbyActive
= XLogRecoveryCtl
->SharedHotStandbyActive
;
4516 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
4518 return LocalHotStandbyActive
;
4523 * Like HotStandbyActive(), but to be used only in WAL replay code,
4524 * where we don't need to ask any other process what the state is.
4527 HotStandbyActiveInReplay(void)
4529 Assert(AmStartupProcess() || !IsPostmasterEnvironment
);
4530 return LocalHotStandbyActive
;
4534 * Get latest redo apply position.
4536 * Exported to allow WALReceiver to read the pointer directly.
4539 GetXLogReplayRecPtr(TimeLineID
*replayTLI
)
4544 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
4545 recptr
= XLogRecoveryCtl
->lastReplayedEndRecPtr
;
4546 tli
= XLogRecoveryCtl
->lastReplayedTLI
;
4547 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
4556 * Get position of last applied, or the record being applied.
4558 * This is different from GetXLogReplayRecPtr() in that if a WAL
4559 * record is currently being applied, this includes that record.
4562 GetCurrentReplayRecPtr(TimeLineID
*replayEndTLI
)
4567 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
4568 recptr
= XLogRecoveryCtl
->replayEndRecPtr
;
4569 tli
= XLogRecoveryCtl
->replayEndTLI
;
4570 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
4573 *replayEndTLI
= tli
;
4578 * Save timestamp of latest processed commit/abort record.
4580 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4581 * seen by processes other than the startup process. Note in particular
4582 * that CreateRestartPoint is executed in the checkpointer.
4585 SetLatestXTime(TimestampTz xtime
)
4587 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
4588 XLogRecoveryCtl
->recoveryLastXTime
= xtime
;
4589 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
4593 * Fetch timestamp of latest processed commit/abort record.
4596 GetLatestXTime(void)
4600 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
4601 xtime
= XLogRecoveryCtl
->recoveryLastXTime
;
4602 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
4608 * Save timestamp of the next chunk of WAL records to apply.
4610 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4611 * seen by all backends.
4614 SetCurrentChunkStartTime(TimestampTz xtime
)
4616 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
4617 XLogRecoveryCtl
->currentChunkStartTime
= xtime
;
4618 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
4622 * Fetch timestamp of latest processed commit/abort record.
4623 * Startup process maintains an accurate local copy in XLogReceiptTime
4626 GetCurrentChunkReplayStartTime(void)
4630 SpinLockAcquire(&XLogRecoveryCtl
->info_lck
);
4631 xtime
= XLogRecoveryCtl
->currentChunkStartTime
;
4632 SpinLockRelease(&XLogRecoveryCtl
->info_lck
);
4638 * Returns time of receipt of current chunk of XLOG data, as well as
4639 * whether it was received from streaming replication or from archives.
4642 GetXLogReceiptTime(TimestampTz
*rtime
, bool *fromStream
)
4645 * This must be executed in the startup process, since we don't export the
4646 * relevant state to shared memory.
4650 *rtime
= XLogReceiptTime
;
4651 *fromStream
= (XLogReceiptSource
== XLOG_FROM_STREAM
);
4655 * Note that text field supplied is a parameter name and does not require
4659 RecoveryRequiresIntParameter(const char *param_name
, int currValue
, int minValue
)
4661 if (currValue
< minValue
)
4663 if (HotStandbyActiveInReplay())
4665 bool warned_for_promote
= false;
4668 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
4669 errmsg("hot standby is not possible because of insufficient parameter settings"),
4670 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4675 SetRecoveryPause(true);
4678 (errmsg("recovery has paused"),
4679 errdetail("If recovery is unpaused, the server will shut down."),
4680 errhint("You can then restart the server after making the necessary configuration changes.")));
4682 while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED
)
4684 HandleStartupProcInterrupts();
4686 if (CheckForStandbyTrigger())
4688 if (!warned_for_promote
)
4690 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
4691 errmsg("promotion is not possible because of insufficient parameter settings"),
4694 * Repeat the detail from above so it's easy to find
4697 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4701 errhint("Restart the server after making the necessary configuration changes.")));
4702 warned_for_promote
= true;
4706 * If recovery pause is requested then set it paused. While
4707 * we are in the loop, user might resume and pause again so
4708 * set this every time.
4710 ConfirmRecoveryPaused();
4713 * We wait on a condition variable that will wake us as soon
4714 * as the pause ends, but we use a timeout so we can check the
4715 * above conditions periodically too.
4717 ConditionVariableTimedSleep(&XLogRecoveryCtl
->recoveryNotPausedCV
, 1000,
4718 WAIT_EVENT_RECOVERY_PAUSE
);
4720 ConditionVariableCancelSleep();
4724 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
4725 errmsg("recovery aborted because of insufficient parameter settings"),
4726 /* Repeat the detail from above so it's easy to find in the log. */
4727 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4731 errhint("You can restart the server after making the necessary configuration changes.")));
4737 * GUC check_hook for primary_slot_name
4740 check_primary_slot_name(char **newval
, void **extra
, GucSource source
)
4742 if (*newval
&& strcmp(*newval
, "") != 0 &&
4743 !ReplicationSlotValidateName(*newval
, WARNING
))
/*
 * Recovery target settings: Only one of the several recovery_target* settings
 * may be set.  Setting a second one results in an error.  The global variable
 * recoveryTarget tracks which kind of recovery target was chosen.  Other
 * variables store the actual target value (for example a string or a xid).
 * The assign functions of the parameters check whether a competing parameter
 * was already set.  But we want to allow setting the same parameter multiple
 * times.  We also want to allow unsetting a parameter and setting a different
 * one, so we unset recoveryTarget when the parameter is set to an empty
 * string.
 *
 * XXX this code is broken by design.  Throwing an error from a GUC assign
 * hook breaks fundamental assumptions of guc.c.  So long as all the variables
 * for which this can happen are PGC_POSTMASTER, the consequences are limited,
 * since we'd just abort postmaster startup anyway.  Nonetheless it's likely
 * that we have odd behaviors such as unexpected GUC ordering dependencies.
 */
4768 pg_attribute_noreturn()
4769 error_multiple_recovery_targets(void)
4772 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
4773 errmsg("multiple recovery targets specified"),
4774 errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4778 * GUC check_hook for recovery_target
4781 check_recovery_target(char **newval
, void **extra
, GucSource source
)
4783 if (strcmp(*newval
, "immediate") != 0 && strcmp(*newval
, "") != 0)
4785 GUC_check_errdetail("The only allowed value is \"immediate\".");
4792 * GUC assign_hook for recovery_target
4795 assign_recovery_target(const char *newval
, void *extra
)
4797 if (recoveryTarget
!= RECOVERY_TARGET_UNSET
&&
4798 recoveryTarget
!= RECOVERY_TARGET_IMMEDIATE
)
4799 error_multiple_recovery_targets();
4801 if (newval
&& strcmp(newval
, "") != 0)
4802 recoveryTarget
= RECOVERY_TARGET_IMMEDIATE
;
4804 recoveryTarget
= RECOVERY_TARGET_UNSET
;
4808 * GUC check_hook for recovery_target_lsn
4811 check_recovery_target_lsn(char **newval
, void **extra
, GucSource source
)
4813 if (strcmp(*newval
, "") != 0)
4816 XLogRecPtr
*myextra
;
4817 bool have_error
= false;
4819 lsn
= pg_lsn_in_internal(*newval
, &have_error
);
4823 myextra
= (XLogRecPtr
*) guc_malloc(ERROR
, sizeof(XLogRecPtr
));
4831 * GUC assign_hook for recovery_target_lsn
4834 assign_recovery_target_lsn(const char *newval
, void *extra
)
4836 if (recoveryTarget
!= RECOVERY_TARGET_UNSET
&&
4837 recoveryTarget
!= RECOVERY_TARGET_LSN
)
4838 error_multiple_recovery_targets();
4840 if (newval
&& strcmp(newval
, "") != 0)
4842 recoveryTarget
= RECOVERY_TARGET_LSN
;
4843 recoveryTargetLSN
= *((XLogRecPtr
*) extra
);
4846 recoveryTarget
= RECOVERY_TARGET_UNSET
;
4850 * GUC check_hook for recovery_target_name
4853 check_recovery_target_name(char **newval
, void **extra
, GucSource source
)
4855 /* Use the value of newval directly */
4856 if (strlen(*newval
) >= MAXFNAMELEN
)
4858 GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4859 "recovery_target_name", MAXFNAMELEN
- 1);
4866 * GUC assign_hook for recovery_target_name
4869 assign_recovery_target_name(const char *newval
, void *extra
)
4871 if (recoveryTarget
!= RECOVERY_TARGET_UNSET
&&
4872 recoveryTarget
!= RECOVERY_TARGET_NAME
)
4873 error_multiple_recovery_targets();
4875 if (newval
&& strcmp(newval
, "") != 0)
4877 recoveryTarget
= RECOVERY_TARGET_NAME
;
4878 recoveryTargetName
= newval
;
4881 recoveryTarget
= RECOVERY_TARGET_UNSET
;
4885 * GUC check_hook for recovery_target_time
4887 * The interpretation of the recovery_target_time string can depend on the
4888 * time zone setting, so we need to wait until after all GUC processing is
4889 * done before we can do the final parsing of the string. This check function
4890 * only does a parsing pass to catch syntax errors, but we store the string
4891 * and parse it again when we need to use it.
4894 check_recovery_target_time(char **newval
, void **extra
, GucSource source
)
4896 if (strcmp(*newval
, "") != 0)
4898 /* reject some special values */
4899 if (strcmp(*newval
, "now") == 0 ||
4900 strcmp(*newval
, "today") == 0 ||
4901 strcmp(*newval
, "tomorrow") == 0 ||
4902 strcmp(*newval
, "yesterday") == 0)
4908 * parse timestamp value (see also timestamptz_in())
4911 char *str
= *newval
;
4919 char *field
[MAXDATEFIELDS
];
4920 int ftype
[MAXDATEFIELDS
];
4921 char workbuf
[MAXDATELEN
+ MAXDATEFIELDS
];
4922 DateTimeErrorExtra dtextra
;
4923 TimestampTz timestamp
;
4925 dterr
= ParseDateTime(str
, workbuf
, sizeof(workbuf
),
4926 field
, ftype
, MAXDATEFIELDS
, &nf
);
4928 dterr
= DecodeDateTime(field
, ftype
, nf
,
4929 &dtype
, tm
, &fsec
, &tz
, &dtextra
);
4932 if (dtype
!= DTK_DATE
)
4935 if (tm2timestamp(tm
, fsec
, &tz
, ×tamp
) != 0)
4937 GUC_check_errdetail("Timestamp out of range: \"%s\".", str
);
4946 * GUC assign_hook for recovery_target_time
4949 assign_recovery_target_time(const char *newval
, void *extra
)
4951 if (recoveryTarget
!= RECOVERY_TARGET_UNSET
&&
4952 recoveryTarget
!= RECOVERY_TARGET_TIME
)
4953 error_multiple_recovery_targets();
4955 if (newval
&& strcmp(newval
, "") != 0)
4956 recoveryTarget
= RECOVERY_TARGET_TIME
;
4958 recoveryTarget
= RECOVERY_TARGET_UNSET
;
4962 * GUC check_hook for recovery_target_timeline
4965 check_recovery_target_timeline(char **newval
, void **extra
, GucSource source
)
4967 RecoveryTargetTimeLineGoal rttg
;
4968 RecoveryTargetTimeLineGoal
*myextra
;
4970 if (strcmp(*newval
, "current") == 0)
4971 rttg
= RECOVERY_TARGET_TIMELINE_CONTROLFILE
;
4972 else if (strcmp(*newval
, "latest") == 0)
4973 rttg
= RECOVERY_TARGET_TIMELINE_LATEST
;
4976 rttg
= RECOVERY_TARGET_TIMELINE_NUMERIC
;
4979 strtoul(*newval
, NULL
, 0);
4980 if (errno
== EINVAL
|| errno
== ERANGE
)
4982 GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
4987 myextra
= (RecoveryTargetTimeLineGoal
*) guc_malloc(ERROR
, sizeof(RecoveryTargetTimeLineGoal
));
4995 * GUC assign_hook for recovery_target_timeline
4998 assign_recovery_target_timeline(const char *newval
, void *extra
)
5000 recoveryTargetTimeLineGoal
= *((RecoveryTargetTimeLineGoal
*) extra
);
5001 if (recoveryTargetTimeLineGoal
== RECOVERY_TARGET_TIMELINE_NUMERIC
)
5002 recoveryTargetTLIRequested
= (TimeLineID
) strtoul(newval
, NULL
, 0);
5004 recoveryTargetTLIRequested
= 0;
5008 * GUC check_hook for recovery_target_xid
5011 check_recovery_target_xid(char **newval
, void **extra
, GucSource source
)
5013 if (strcmp(*newval
, "") != 0)
5016 TransactionId
*myextra
;
5019 xid
= (TransactionId
) strtou64(*newval
, NULL
, 0);
5020 if (errno
== EINVAL
|| errno
== ERANGE
)
5023 myextra
= (TransactionId
*) guc_malloc(ERROR
, sizeof(TransactionId
));
5031 * GUC assign_hook for recovery_target_xid
5034 assign_recovery_target_xid(const char *newval
, void *extra
)
5036 if (recoveryTarget
!= RECOVERY_TARGET_UNSET
&&
5037 recoveryTarget
!= RECOVERY_TARGET_XID
)
5038 error_multiple_recovery_targets();
5040 if (newval
&& strcmp(newval
, "") != 0)
5042 recoveryTarget
= RECOVERY_TARGET_XID
;
5043 recoveryTargetXid
= *((TransactionId
*) extra
);
5046 recoveryTarget
= RECOVERY_TARGET_UNSET
;