src/backend/access/transam/xlogrecovery.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlogrecovery.c
   4  *              Functions for WAL recovery, standby mode
   5  *
   6  * This source file contains functions controlling WAL recovery.
   7  * InitWalRecovery() initializes the system for crash or archive recovery,
   8  * or standby mode, depending on configuration options and the state of
   9  * the control file and possible backup label file.  PerformWalRecovery()
  10  * performs the actual WAL replay, calling the rmgr-specific redo routines.
  11  * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
  12  * and prepares information needed to initialize the WAL for writes.  In
  13  * addition to these three main functions, there are a bunch of functions
  14  * for interrogating recovery state and controlling the recovery process.
  15  *
  16  *
  17  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
  18  * Portions Copyright (c) 1994, Regents of the University of California
  19  *
  20  * src/backend/access/transam/xlogrecovery.c
  21  *
  22  *-------------------------------------------------------------------------
  23  */
  24
  25 #include "postgres.h"
  26
  27 #include <ctype.h>
  28 #include <math.h>
  29 #include <time.h>
  30 #include <sys/stat.h>
  31 #include <sys/time.h>
  32 #include <unistd.h>
  33
  34 #include "access/timeline.h"
  35 #include "access/transam.h"
  36 #include "access/xact.h"
  37 #include "access/xlog_internal.h"
  38 #include "access/xlogarchive.h"
  39 #include "access/xlogprefetcher.h"
  40 #include "access/xlogreader.h"
  41 #include "access/xlogrecovery.h"
  42 #include "access/xlogutils.h"
  43 #include "backup/basebackup.h"
  44 #include "catalog/pg_control.h"
  45 #include "commands/tablespace.h"
  46 #include "common/file_utils.h"
  47 #include "miscadmin.h"
  48 #include "pgstat.h"
  49 #include "postmaster/bgwriter.h"
  50 #include "postmaster/startup.h"
  51 #include "replication/slot.h"
  52 #include "replication/slotsync.h"
  53 #include "replication/walreceiver.h"
  54 #include "storage/fd.h"
  55 #include "storage/ipc.h"
  56 #include "storage/latch.h"
  57 #include "storage/pmsignal.h"
  58 #include "storage/procarray.h"
  59 #include "storage/spin.h"
  60 #include "utils/datetime.h"
  61 #include "utils/fmgrprotos.h"
  62 #include "utils/guc_hooks.h"
  63 #include "utils/pg_lsn.h"
  64 #include "utils/ps_status.h"
  65 #include "utils/pg_rusage.h"
  66
  67 /* Unsupported old recovery command file names (relative to $PGDATA) */
  68 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  69 #define RECOVERY_COMMAND_DONE   "recovery.done"
  70
  71 /*
  72  * GUC support: option names for the recovery target action and their codes
  73  */
  74 const struct config_enum_entry recovery_target_action_options[] = {
  75         {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
  76         {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
  77         {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
  78         {NULL, 0, false}
  79 };
  80
  81 /* options formerly taken from recovery.conf for archive recovery */
  82 char       *recoveryRestoreCommand = NULL;
  83 char       *recoveryEndCommand = NULL;
  84 char       *archiveCleanupCommand = NULL;
  85 RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
  86 bool            recoveryTargetInclusive = true;
  87 int                     recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;    /* a RECOVERY_TARGET_ACTION_* code */
  88 TransactionId recoveryTargetXid;
  89 char       *recovery_target_time_string;
  90 TimestampTz recoveryTargetTime;
  91 const char *recoveryTargetName;
  92 XLogRecPtr      recoveryTargetLSN;
  93 int                     recovery_min_apply_delay = 0;
  94
  95 /* options formerly taken from recovery.conf for XLOG streaming */
  96 char       *PrimaryConnInfo = NULL;
  97 char       *PrimarySlotName = NULL;
  98 bool            wal_receiver_create_temp_slot = false;
  99
 100 /*
 101  * recoveryTargetTimeLineGoal: what the user requested, if any
 102  *
 103  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
 104  *
 105  * recoveryTargetTLI: the currently understood target timeline; may change later
 106  *
 107  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
 108  * the timelines of its known parents, newest first (so recoveryTargetTLI is
 109  * always the first list member).  Only these TLIs are expected to be seen in
 110  * the WAL segments we read, and indeed only these TLIs will be considered as
 111  * candidate WAL files to open at all.
 112  *
 113  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 114  * (This is not necessarily the same as the timeline from which we are
 115  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
 116  * scanning data that was copied from an ancestor timeline when the current
 117  * file was created.)  During a sequential scan we do not allow this value
 118  * to decrease.
 119  */
 120 RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
 121 TimeLineID      recoveryTargetTLIRequested = 0;
 122 TimeLineID      recoveryTargetTLI = 0;
 123 static List *expectedTLEs;
 124 static TimeLineID curFileTLI;
 125
 126 /*
 127  * When ArchiveRecoveryRequested is set, archive recovery was requested,
 128  * i.e. signal files were present.  When InArchiveRecovery is set, we are
 129  * currently recovering using offline XLOG archives.  These variables are only
 130  * valid in the startup process.
 131  *
 132  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 133  * currently performing crash recovery using only XLOG files in pg_wal, but
 134  * will switch to using offline XLOG archives as soon as we reach the end of
 135  * WAL in pg_wal.
 136  */
 137 bool            ArchiveRecoveryRequested = false;
 138 bool            InArchiveRecovery = false;
 139
 140 /*
 141  * When StandbyModeRequested is set, standby mode was requested, i.e. the
 142  * standby.signal file was present.  When StandbyMode is set, we are currently
 143  * in standby mode.  These variables are only valid in the startup process.
 144  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
 145  */
 146 static bool StandbyModeRequested = false;
 147 bool            StandbyMode = false;
 148
 149 /* Were the respective signal files present at startup? */
 150 static bool standby_signal_file_found = false;
 151 static bool recovery_signal_file_found = false;
 152
 153 /*
 154  * CheckPointLoc is the position of the checkpoint record that determines
 155  * where to start the replay.  It comes from the backup label file or the
 156  * control file.
 157  *
 158  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
 159  * file or the control file.  In standby mode, XLOG streaming usually starts
 160  * from the position where an invalid record was found.  But if we fail to
 161  * read even the initial checkpoint record, we use the REDO location instead
 162  * of the checkpoint location as the start position of XLOG streaming.
 163  * Otherwise we would have to jump backwards to the REDO location after
 164  * reading the checkpoint record, because the REDO record can precede the
 165  * checkpoint record.
 166  */
 167 static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
 168 static TimeLineID CheckPointTLI = 0;
 169 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
 170 static TimeLineID RedoStartTLI = 0;
 171
 172 /*
 173  * Local copy of SharedHotStandbyActive variable. False actually means "not
 174  * known, need to check the shared state".
 175  */
 176 static bool LocalHotStandbyActive = false;
 177
 178 /*
 179  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
 180  * known, need to check the shared state".
 181  */
 182 static bool LocalPromoteIsTriggered = false;
 183
 184 /* Has the recovery code requested a walreceiver wakeup? */
 185 static bool doRequestWalReceiverReply;
 186
 187 /* XLogReader object used to parse the WAL records */
 188 static XLogReaderState *xlogreader = NULL;
 189
 190 /* XLogPrefetcher object used to consume WAL records with read-ahead */
 191 static XLogPrefetcher *xlogprefetcher = NULL;
 192
 193 /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
 194 typedef struct XLogPageReadPrivate
 195 {
 196         int                     emode;          /* presumably ReadRecord's emode (a log level such as LOG) -- TODO confirm */
 197         bool            fetching_ckpt;  /* are we fetching a checkpoint record? */
 198         bool            randAccess;     /* NOTE(review): meaning not visible in this part of the file */
 199         TimeLineID      replayTLI;      /* presumably ReadRecord's replayTLI argument -- TODO confirm */
 200 } XLogPageReadPrivate;
 201
 202 /* flag to tell XLogPageRead that we have started replaying */
 203 static bool InRedo = false;
 204
 205 /*
 206  * Codes indicating where we got a WAL file from during recovery, or where
 207  * to attempt to get one.
 208  */
 209 typedef enum
 210 {
 211         XLOG_FROM_ANY = 0,                      /* request to read WAL from any source */
 212         XLOG_FROM_ARCHIVE,                      /* restored using restore_command */
 213         XLOG_FROM_PG_WAL,                       /* existing file in pg_wal */
 214         XLOG_FROM_STREAM,                       /* streamed from primary */
 215 } XLogSource;
 216
 217 /* human-readable names for XLogSources, in enum order, for debugging output */
 218 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
 219
 220 /*
 221  * readFile is -1 or a kernel FD for the log file segment that's currently
 222  * open for reading.  readSegNo identifies the segment.  readOff is the offset
 223  * of the page just read, readLen indicates how much of it has been read into
 224  * readBuf, and readSource indicates where we got the currently open file from.
 225  *
 226  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
 227  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
 228  * worthwhile, since the XLOG is not read by general-purpose sessions.
 229  */
 230 static int      readFile = -1;
 231 static XLogSegNo readSegNo = 0;
 232 static uint32 readOff = 0;
 233 static uint32 readLen = 0;
 234 static XLogSource readSource = XLOG_FROM_ANY;
 235
 236 /*
 237  * Keeps track of which source we're currently reading from. This is
 238  * different from readSource in that this is always set, even when we don't
 239  * currently have a WAL file open. If lastSourceFailed is set, our last
 240  * attempt to read from currentSource failed, and we should try another source
 241  * next.
 242  *
 243  * pendingWalRcvRestart is set when a config change occurs that requires a
 244  * walreceiver restart.  This is only valid in XLOG_FROM_STREAM state.
 245  */
 246 static XLogSource currentSource = XLOG_FROM_ANY;
 247 static bool lastSourceFailed = false;
 248 static bool pendingWalRcvRestart = false;
 249
 250 /*
 251  * These variables track when we last obtained some WAL data to process,
 252  * and where we got it from.  (XLogReceiptSource is initially the same as
 253  * readSource, but readSource gets reset to zero, i.e. XLOG_FROM_ANY, when
 254  * we don't have data to process right now.  It is also different from
 255  * currentSource, which also changes when we try to read from a source and
 256  * fail, while XLogReceiptSource tracks where we last successfully read WAL.)
 257  */
 258 static TimestampTz XLogReceiptTime = 0;
 259 static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
 260
 261 /* Local copy of WalRcv->flushedUpto */
 262 static XLogRecPtr flushedUpto = 0;
 263 static TimeLineID receiveTLI = 0;
 264
 265 /*
 266  * Copy of minRecoveryPoint and backupEndPoint from the control file.
 267  *
 268  * In order to reach consistency, we must replay the WAL up to
 269  * minRecoveryPoint.  If backupEndRequired is true, we must also reach
 270  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
 271  * to backupStartPoint.
 272  *
 273  * Note: In archive recovery, after consistency has been reached, the
 274  * functions in xlog.c will start updating minRecoveryPoint in the control
 275  * file.  But this copy of minRecoveryPoint variable reflects the value at the
 276  * beginning of recovery, and is *not* updated after consistency is reached.
 277  */
 278 static XLogRecPtr minRecoveryPoint;
 279 static TimeLineID minRecoveryPointTLI;
 280
 281 static XLogRecPtr backupStartPoint;
 282 static XLogRecPtr backupEndPoint;
 283 static bool backupEndRequired = false;
 284
 285 /*
 286  * Have we reached a consistent database state?  In crash recovery, we have
 287  * to replay all the WAL, so reachedConsistency is never set.  During archive
 288  * recovery, the database is consistent once minRecoveryPoint is reached.
 289  *
 290  * Consistent state means that the system is internally consistent, all
 291  * the WAL has been replayed up to a certain point, and importantly, there
 292  * is no trace of later actions on disk.
 293  */
 294 bool            reachedConsistency = false;
 295
 296 /* BLCKSZ-sized buffers for WAL consistency checks; palloc'd in InitWalRecovery() */
 297 static char *replay_image_masked = NULL;
 298 static char *primary_image_masked = NULL;
 299
 300
 301 /*
 302  * Shared-memory state for WAL recovery.
 303  */
 304 typedef struct XLogRecoveryCtlData
 305 {
 306         /*
 307          * SharedHotStandbyActive indicates if we allow hot standby queries to be
 308          * run.  Protected by info_lck.
 309          */
 310         bool            SharedHotStandbyActive;
 311
 312         /*
 313          * SharedPromoteIsTriggered indicates if a standby promotion has been
 314          * triggered.  Protected by info_lck.
 315          */
 316         bool            SharedPromoteIsTriggered;
 317
 318         /*
 319          * recoveryWakeupLatch is used to wake up the startup process to continue
 320          * WAL replay, if it is waiting for WAL to arrive or promotion to be
 321          * requested.
 322          *
 323          * Note that the startup process also uses another latch, its procLatch,
 324          * to wait for recovery conflict.  We could get rid of recoveryWakeupLatch
 325          * and signal the startup process through its procLatch instead, which
 326          * would comport better with possible generic signal handlers using that
 327          * latch.  But we should not do that, because the startup process doesn't
 328          * expect to be woken up by the walreceiver process or the SIGHUP signal
 329          * handler while it's waiting for recovery conflict.  The separate latches,
 330          * recoveryWakeupLatch and procLatch, should be used for inter-process
 331          * communication for WAL replay and recovery conflict, respectively.
 332          */
 333         Latch           recoveryWakeupLatch;
 334
 335         /*
 336          * Last record successfully replayed.
 337          */
 338         XLogRecPtr      lastReplayedReadRecPtr; /* start position */
 339         XLogRecPtr      lastReplayedEndRecPtr;  /* end+1 position */
 340         TimeLineID      lastReplayedTLI;        /* timeline */
 341
 342         /*
 343          * When we're currently replaying a record, i.e. in a redo function,
 344          * replayEndRecPtr points to the end+1 of the record being replayed,
 345          * otherwise it's equal to lastReplayedEndRecPtr.
 346          */
 347         XLogRecPtr      replayEndRecPtr;
 348         TimeLineID      replayEndTLI;
 349         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
 350         TimestampTz recoveryLastXTime;
 351
 352         /*
 353          * timestamp of when we started replaying the current chunk of WAL data,
 354          * only relevant for replication or archive recovery
 355          */
 356         TimestampTz currentChunkStartTime;
 357         /* Recovery pause state */
 358         RecoveryPauseState recoveryPauseState;
 359         ConditionVariable recoveryNotPausedCV;
 360
 361         slock_t         info_lck;               /* locks shared variables shown above */
 362 } XLogRecoveryCtlData;
 363
 364 static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
 365
 366 /*
 367  * abortedRecPtr is the start pointer of a broken record at end of WAL when
 368  * recovery completes; missingContrecPtr is the location of the first
 369  * contrecord that went missing.  See CreateOverwriteContrecordRecord for
 370  * details.
 371  */
 372 static XLogRecPtr abortedRecPtr;
 373 static XLogRecPtr missingContrecPtr;
 374
 375 /*
 376  * If recoveryStopsBefore() or recoveryStopsAfter() returns true, it saves
 377  * information about the stop point here.
 378  */
 379 static TransactionId recoveryStopXid;
 380 static TimestampTz recoveryStopTime;
 381 static XLogRecPtr recoveryStopLSN;
 382 static char recoveryStopName[MAXFNAMELEN];
 383 static bool recoveryStopAfter;
 384
 385 /* prototypes for local functions */
 386 static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
 387
 388 static void EnableStandbyMode(void);
 389 static void readRecoverySignalFile(void);
 390 static void validateRecoveryParameters(void);
 391 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 392                                                           TimeLineID *backupLabelTLI,
 393                                                           bool *backupEndRequired, bool *backupFromStandby);
 394 static bool read_tablespace_map(List **tablespaces);
 395
 396 static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
 397 static void CheckRecoveryConsistency(void);
 398 static void rm_redo_error_callback(void *arg);
 399 #ifdef WAL_DEBUG
 400 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
 401 #endif
 402 static void xlog_block_info(StringInfo buf, XLogReaderState *record);
 403 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 404                                                                 TimeLineID prevTLI, TimeLineID replayTLI);
 405 static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
 406 static void verifyBackupPageConsistency(XLogReaderState *record);
 407
 408 static bool recoveryStopsBefore(XLogReaderState *record);
 409 static bool recoveryStopsAfter(XLogReaderState *record);
 410 static char *getRecoveryStopReason(void);
 411 static void recoveryPausesHere(bool endOfRecovery);
 412 static bool recoveryApplyDelay(XLogReaderState *record);
 413 static void ConfirmRecoveryPaused(void);
 414
 415 static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
 416                                                           int emode, bool fetching_ckpt,
 417                                                           TimeLineID replayTLI);
 418
 419 static int      XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
 420                                                  int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
 421 static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
 422                                                                                                           bool randAccess,
 423                                                                                                           bool fetching_ckpt,
 424                                                                                                           XLogRecPtr tliRecPtr,
 425                                                                                                           TimeLineID replayTLI,
 426                                                                                                           XLogRecPtr replayLSN,
 427                                                                                                           bool nonblocking);
 428 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 429 static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
 430                                                                                 XLogRecPtr RecPtr, TimeLineID replayTLI);
 431 static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
 432 static int      XLogFileRead(XLogSegNo segno, TimeLineID tli,
 433                                                  XLogSource source, bool notfoundOk);
 434 static int      XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
 435
 436 static bool CheckForStandbyTrigger(void);
 437 static void SetPromoteIsTriggered(void);
 438 static bool HotStandbyActiveInReplay(void);
 439
 440 static void SetCurrentChunkStartTime(TimestampTz xtime);
 441 static void SetLatestXTime(TimestampTz xtime);
 442
 443 /*
 444  * Initialization of shared memory for WAL recovery
 445  */
 446 Size
 447 XLogRecoveryShmemSize(void)
 448 {
 449         /*
 450          * The request is exactly one XLogRecoveryCtlData struct, which is
 451          * what XLogRecoveryShmemInit() passes on to ShmemInitStruct();
 452          * nothing else is added to it here.
 453          */
 454         return sizeof(XLogRecoveryCtlData);
 455 }
 456
 457 void
 458 XLogRecoveryShmemInit(void)
 459 {
 460         bool            was_found;
 461
 462         XLogRecoveryCtl = (XLogRecoveryCtlData *)
 463                 ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &was_found);
 464         if (!was_found)         /* zero the struct, then set up its lock, latch and CV */
 465         {
 466                 memset(XLogRecoveryCtl, 0, sizeof(*XLogRecoveryCtl));
 467                 SpinLockInit(&XLogRecoveryCtl->info_lck);
 468                 InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
 469                 ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
 470         }
 471 }
 472
 473 /*
 474  * Thin wrapper that enters standby mode: it sets StandbyMode and disables
 475  * the startup progress timeout.  Takes no arguments and returns nothing.
 476  */
 477 static void
 478 EnableStandbyMode(void)
 479 {
 480         StandbyMode = true;
 481
 482         /*
 483          * To avoid server log bloat, we don't report recovery progress in a
 484          * standby, as it will always be in recovery unless promoted.  We disable
 485          * the startup progress timeout in standby mode to avoid calling
 486          * startup_progress_timeout_handler() unnecessarily.
 487          */
 488         disable_startup_progress_timeout();
 489 }
 490
 491 /*
 492  * Prepare the system for WAL recovery, if needed.
 493  *
 494  * This is called by StartupXLOG() which coordinates the server startup
 495  * sequence.  This function analyzes the control file and the backup label
 496  * file, if any, and figures out whether we need to perform crash recovery or
 497  * archive recovery, and how far we need to replay the WAL to reach a
 498  * consistent state.
 499  *
 500  * This doesn't yet change the on-disk state, except for creating the symlinks
 501  * from table space map file if any, and for fetching WAL files needed to find
 502  * the checkpoint record.  On entry, the caller has already read the control
 503  * file into memory, and passes it as argument.  This function updates it to
 504  * reflect the recovery state, and the caller is expected to write it back to
 505  * disk after initializing other subsystems, but before calling
 506  * PerformWalRecovery().
 507  *
 508  * This initializes some global variables such as ArchiveRecoveryRequested,
 509  * StandbyModeRequested and InRecovery.
 510  */
 511 void
 512 InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 513                                 bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
 514 {
 515         XLogPageReadPrivate *private;
 516         struct stat st;
 517         bool            wasShutdown;
 518         XLogRecord *record;
 519         DBState         dbstate_at_startup;
 520         bool            haveTblspcMap = false;
 521         bool            haveBackupLabel = false;
 522         CheckPoint      checkPoint;
 523         bool            backupFromStandby = false;
 524
 525         dbstate_at_startup = ControlFile->state;
 526
 527         /*
 528          * Initialize on the assumption we want to recover to the latest timeline
 529          * that's active according to pg_control.
 530          */
 531         if (ControlFile->minRecoveryPointTLI >
 532                 ControlFile->checkPointCopy.ThisTimeLineID)
 533                 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
 534         else
 535                 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
 536
 537         /*
 538          * Check for signal files, and if so set up state for offline recovery
 539          */
 540         readRecoverySignalFile();
 541         validateRecoveryParameters();
 542
 543         /*
 544          * Take ownership of the wakeup latch if we're going to sleep during
 545          * recovery, if required.
 546          */
 547         if (ArchiveRecoveryRequested)
 548                 OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
 549
 550         /*
 551          * Set the WAL reading processor now, as it will be needed when reading
 552          * the checkpoint record required (backup_label or not).
 553          */
 554         private = palloc0(sizeof(XLogPageReadPrivate));
 555         xlogreader =
 556                 XLogReaderAllocate(wal_segment_size, NULL,
 557                                                    XL_ROUTINE(.page_read = &XLogPageRead,
 558                                                                           .segment_open = NULL,
 559                                                                           .segment_close = wal_segment_close),
 560                                                    private);
 561         if (!xlogreader)
 562                 ereport(ERROR,
 563                                 (errcode(ERRCODE_OUT_OF_MEMORY),
 564                                  errmsg("out of memory"),
 565                                  errdetail("Failed while allocating a WAL reading processor.")));
 566         xlogreader->system_identifier = ControlFile->system_identifier;
 567
 568         /*
 569          * Set the WAL decode buffer size.  This limits how far ahead we can read
 570          * in the WAL.
 571          */
 572         XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
 573
 574         /* Create a WAL prefetcher. */
 575         xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
 576
 577         /*
 578          * Allocate two page buffers dedicated to WAL consistency checks.  We do
 579          * it this way, rather than just making static arrays, for two reasons:
 580          * (1) no need to waste the storage in most instantiations of the backend;
 581          * (2) a static char array isn't guaranteed to have any particular
 582          * alignment, whereas palloc() will provide MAXALIGN'd storage.
 583          */
 584         replay_image_masked = (char *) palloc(BLCKSZ);
 585         primary_image_masked = (char *) palloc(BLCKSZ);
 586
 587         /*
 588          * Read the backup_label file.  We want to run this part of the recovery
 589          * process after checking for signal files and after performing validation
 590          * of the recovery parameters.
 591          */
 592         if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
 593                                                   &backupFromStandby))
 594         {
 595                 List       *tablespaces = NIL;
 596
 597                 /*
 598                  * Archive recovery was requested, and thanks to the backup label
 599                  * file, we know how far we need to replay to reach consistency. Enter
 600                  * archive recovery directly.
 601                  */
 602                 InArchiveRecovery = true;
 603                 if (StandbyModeRequested)
 604                         EnableStandbyMode();
 605
 606                 /*
 607                  * Omitting backup_label when creating a new replica, PITR node etc.
 608                  * unfortunately is a common cause of corruption.  Logging that
 609                  * backup_label was used makes it a bit easier to exclude that as the
 610                  * cause of observed corruption.
 611                  *
 612                  * Do so before we try to read the checkpoint record (which can fail),
 613                  * as otherwise it can be hard to understand why a checkpoint other
 614                  * than ControlFile->checkPoint is used.
 615                  */
 616                 ereport(LOG,
 617                                 (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
 618                                                 LSN_FORMAT_ARGS(RedoStartLSN),
 619                                                 LSN_FORMAT_ARGS(CheckPointLoc),
 620                                                 CheckPointTLI)));
 621
 622                 /*
 623                  * When a backup_label file is present, we want to roll forward from
 624                  * the checkpoint it identifies, rather than using pg_control.
 625                  */
 626                 record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
 627                                                                           CheckPointTLI);
 628                 if (record != NULL)
 629                 {
 630                         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
 631                         wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
 632                         ereport(DEBUG1,
 633                                         (errmsg_internal("checkpoint record is at %X/%X",
 634                                                                          LSN_FORMAT_ARGS(CheckPointLoc))));
 635                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
 636
 637                         /*
 638                          * Make sure that REDO location exists. This may not be the case
 639                          * if there was a crash during an online backup, which left a
 640                          * backup_label around that references a WAL segment that's
 641                          * already been archived.
 642                          */
 643                         if (checkPoint.redo < CheckPointLoc)
 644                         {
 645                                 XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
 646                                 if (!ReadRecord(xlogprefetcher, LOG, false,
 647                                                                 checkPoint.ThisTimeLineID))
 648                                         ereport(FATAL,
 649                                                         (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
 650                                                                         LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
 651                                                          errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
 652                                                                          "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
 653                                                                          "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
 654                                                                          DataDir, DataDir, DataDir, DataDir)));
 655                         }
 656                 }
 657                 else
 658                 {
 659                         ereport(FATAL,
 660                                         (errmsg("could not locate required checkpoint record at %X/%X",
 661                                                         LSN_FORMAT_ARGS(CheckPointLoc)),
 662                                          errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
 663                                                          "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
 664                                                          "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
 665                                                          DataDir, DataDir, DataDir, DataDir)));
 666                         wasShutdown = false;    /* keep compiler quiet */
 667                 }
 668
 669                 /* Read the tablespace_map file if present and create symlinks. */
 670                 if (read_tablespace_map(&tablespaces))
 671                 {
 672                         ListCell   *lc;
 673
 674                         foreach(lc, tablespaces)
 675                         {
 676                                 tablespaceinfo *ti = lfirst(lc);
 677                                 char       *linkloc;
 678
 679                                 linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
 680
 681                                 /*
 682                                  * Remove the existing symlink if any and Create the symlink
 683                                  * under PGDATA.
 684                                  */
 685                                 remove_tablespace_symlink(linkloc);
 686
 687                                 if (symlink(ti->path, linkloc) < 0)
 688                                         ereport(ERROR,
 689                                                         (errcode_for_file_access(),
 690                                                          errmsg("could not create symbolic link \"%s\": %m",
 691                                                                         linkloc)));
 692
 693                                 pfree(ti->path);
 694                                 pfree(ti);
 695                         }
 696
 697                         /* tell the caller to delete it later */
 698                         haveTblspcMap = true;
 699                 }
 700
 701                 /* tell the caller to delete it later */
 702                 haveBackupLabel = true;
 703         }
 704         else
 705         {
 706                 /* No backup_label file has been found if we are here. */
 707
 708                 /*
 709                  * If tablespace_map file is present without backup_label file, there
 710                  * is no use of such file.  There is no harm in retaining it, but it
 711                  * is better to get rid of the map file so that we don't have any
 712                  * redundant file in data directory and it will avoid any sort of
 713                  * confusion.  It seems prudent though to just rename the file out of
 714                  * the way rather than delete it completely, also we ignore any error
 715                  * that occurs in rename operation as even if map file is present
 716                  * without backup_label file, it is harmless.
 717                  */
 718                 if (stat(TABLESPACE_MAP, &st) == 0)
 719                 {
 720                         unlink(TABLESPACE_MAP_OLD);
 721                         if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
 722                                 ereport(LOG,
 723                                                 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
 724                                                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
 725                                                  errdetail("File \"%s\" was renamed to \"%s\".",
 726                                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
 727                         else
 728                                 ereport(LOG,
 729                                                 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
 730                                                                 TABLESPACE_MAP, BACKUP_LABEL_FILE),
 731                                                  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
 732                                                                    TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
 733                 }
 734
 735                 /*
 736                  * It's possible that archive recovery was requested, but we don't
 737                  * know how far we need to replay the WAL before we reach consistency.
 738                  * This can happen for example if a base backup is taken from a
 739                  * running server using an atomic filesystem snapshot, without calling
 740                  * pg_backup_start/stop. Or if you just kill a running primary server
 741                  * and put it into archive recovery by creating a recovery signal
 742                  * file.
 743                  *
 744                  * Our strategy in that case is to perform crash recovery first,
 745                  * replaying all the WAL present in pg_wal, and only enter archive
 746                  * recovery after that.
 747                  *
 748                  * But usually we already know how far we need to replay the WAL (up
 749                  * to minRecoveryPoint, up to backupEndPoint, or until we see an
 750                  * end-of-backup record), and we can enter archive recovery directly.
 751                  */
 752                 if (ArchiveRecoveryRequested &&
 753                         (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
 754                          ControlFile->backupEndRequired ||
 755                          ControlFile->backupEndPoint != InvalidXLogRecPtr ||
 756                          ControlFile->state == DB_SHUTDOWNED))
 757                 {
 758                         InArchiveRecovery = true;
 759                         if (StandbyModeRequested)
 760                                 EnableStandbyMode();
 761                 }
 762
 763                 /*
 764                  * For the same reason as when starting up with backup_label present,
 765                  * emit a log message when we continue initializing from a base
 766                  * backup.
 767                  */
 768                 if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
 769                         ereport(LOG,
 770                                         (errmsg("restarting backup recovery with redo LSN %X/%X",
 771                                                         LSN_FORMAT_ARGS(ControlFile->backupStartPoint))));
 772
 773                 /* Get the last valid checkpoint record. */
 774                 CheckPointLoc = ControlFile->checkPoint;
 775                 CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
 776                 RedoStartLSN = ControlFile->checkPointCopy.redo;
 777                 RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
 778                 record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
 779                                                                           CheckPointTLI);
 780                 if (record != NULL)
 781                 {
 782                         ereport(DEBUG1,
 783                                         (errmsg_internal("checkpoint record is at %X/%X",
 784                                                                          LSN_FORMAT_ARGS(CheckPointLoc))));
 785                 }
 786                 else
 787                 {
 788                         /*
 789                          * We used to attempt to go back to a secondary checkpoint record
 790                          * here, but only when not in standby mode. We now just fail if we
 791                          * can't read the last checkpoint because this allows us to
 792                          * simplify processing around checkpoints.
 793                          */
 794                         ereport(PANIC,
 795                                         (errmsg("could not locate a valid checkpoint record at %X/%X",
 796                                                         LSN_FORMAT_ARGS(CheckPointLoc))));
 797                 }
 798                 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
 799                 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
 800         }
 801
 802         if (ArchiveRecoveryRequested)
 803         {
 804                 if (StandbyModeRequested)
 805                         ereport(LOG,
 806                                         (errmsg("entering standby mode")));
 807                 else if (recoveryTarget == RECOVERY_TARGET_XID)
 808                         ereport(LOG,
 809                                         (errmsg("starting point-in-time recovery to XID %u",
 810                                                         recoveryTargetXid)));
 811                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
 812                         ereport(LOG,
 813                                         (errmsg("starting point-in-time recovery to %s",
 814                                                         timestamptz_to_str(recoveryTargetTime))));
 815                 else if (recoveryTarget == RECOVERY_TARGET_NAME)
 816                         ereport(LOG,
 817                                         (errmsg("starting point-in-time recovery to \"%s\"",
 818                                                         recoveryTargetName)));
 819                 else if (recoveryTarget == RECOVERY_TARGET_LSN)
 820                         ereport(LOG,
 821                                         (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
 822                                                         LSN_FORMAT_ARGS(recoveryTargetLSN))));
 823                 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
 824                         ereport(LOG,
 825                                         (errmsg("starting point-in-time recovery to earliest consistent point")));
 826                 else
 827                         ereport(LOG,
 828                                         (errmsg("starting archive recovery")));
 829         }
 830
 831         /*
 832          * If the location of the checkpoint record is not on the expected
 833          * timeline in the history of the requested timeline, we cannot proceed:
 834          * the backup is not part of the history of the requested timeline.
 835          */
 836         Assert(expectedTLEs);           /* was initialized by reading checkpoint
 837                                                                  * record */
 838         if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
 839                 CheckPointTLI)
 840         {
 841                 XLogRecPtr      switchpoint;
 842
 843                 /*
 844                  * tliSwitchPoint will throw an error if the checkpoint's timeline is
 845                  * not in expectedTLEs at all.
 846                  */
 847                 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
 848                 ereport(FATAL,
 849                                 (errmsg("requested timeline %u is not a child of this server's history",
 850                                                 recoveryTargetTLI),
 851                                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
 852                                                    LSN_FORMAT_ARGS(ControlFile->checkPoint),
 853                                                    ControlFile->checkPointCopy.ThisTimeLineID,
 854                                                    LSN_FORMAT_ARGS(switchpoint))));
 855         }
 856
 857         /*
 858          * The min recovery point should be part of the requested timeline's
 859          * history, too.
 860          */
 861         if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
 862                 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
 863                 ControlFile->minRecoveryPointTLI)
 864                 ereport(FATAL,
 865                                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
 866                                                 recoveryTargetTLI,
 867                                                 LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
 868                                                 ControlFile->minRecoveryPointTLI)));
 869
 870         ereport(DEBUG1,
 871                         (errmsg_internal("redo record is at %X/%X; shutdown %s",
 872                                                          LSN_FORMAT_ARGS(checkPoint.redo),
 873                                                          wasShutdown ? "true" : "false")));
 874         ereport(DEBUG1,
 875                         (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
 876                                                          U64FromFullTransactionId(checkPoint.nextXid),
 877                                                          checkPoint.nextOid)));
 878         ereport(DEBUG1,
 879                         (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
 880                                                          checkPoint.nextMulti, checkPoint.nextMultiOffset)));
 881         ereport(DEBUG1,
 882                         (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
 883                                                          checkPoint.oldestXid, checkPoint.oldestXidDB)));
 884         ereport(DEBUG1,
 885                         (errmsg_internal("oldest MultiXactId: %u, in database %u",
 886                                                          checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
 887         ereport(DEBUG1,
 888                         (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
 889                                                          checkPoint.oldestCommitTsXid,
 890                                                          checkPoint.newestCommitTsXid)));
 891         if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
 892                 ereport(PANIC,
 893                                 (errmsg("invalid next transaction ID")));
 894
 895         /* sanity check */
 896         if (checkPoint.redo > CheckPointLoc)
 897                 ereport(PANIC,
 898                                 (errmsg("invalid redo in checkpoint record")));
 899
 900         /*
 901          * Check whether we need to force recovery from WAL.  If it appears to
 902          * have been a clean shutdown and we did not have a recovery signal file,
 903          * then assume no recovery needed.
 904          */
 905         if (checkPoint.redo < CheckPointLoc)
 906         {
 907                 if (wasShutdown)
 908                         ereport(PANIC,
 909                                         (errmsg("invalid redo record in shutdown checkpoint")));
 910                 InRecovery = true;
 911         }
 912         else if (ControlFile->state != DB_SHUTDOWNED)
 913                 InRecovery = true;
 914         else if (ArchiveRecoveryRequested)
 915         {
 916                 /* force recovery due to presence of recovery signal file */
 917                 InRecovery = true;
 918         }
 919
 920         /*
 921          * If recovery is needed, update our in-memory copy of pg_control to show
 922          * that we are recovering and to show the selected checkpoint as the place
 923          * we are starting from. We also mark pg_control with any minimum recovery
 924          * stop point obtained from a backup history file.
 925          *
 926          * We don't write the changes to disk yet, though. Only do that after
 927          * initializing various subsystems.
 928          */
 929         if (InRecovery)
 930         {
 931                 if (InArchiveRecovery)
 932                 {
 933                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
 934                 }
 935                 else
 936                 {
 937                         ereport(LOG,
 938                                         (errmsg("database system was not properly shut down; "
 939                                                         "automatic recovery in progress")));
 940                         if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
 941                                 ereport(LOG,
 942                                                 (errmsg("crash recovery starts in timeline %u "
 943                                                                 "and has target timeline %u",
 944                                                                 ControlFile->checkPointCopy.ThisTimeLineID,
 945                                                                 recoveryTargetTLI)));
 946                         ControlFile->state = DB_IN_CRASH_RECOVERY;
 947                 }
 948                 ControlFile->checkPoint = CheckPointLoc;
 949                 ControlFile->checkPointCopy = checkPoint;
 950                 if (InArchiveRecovery)
 951                 {
 952                         /* initialize minRecoveryPoint if not set yet */
 953                         if (ControlFile->minRecoveryPoint < checkPoint.redo)
 954                         {
 955                                 ControlFile->minRecoveryPoint = checkPoint.redo;
 956                                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
 957                         }
 958                 }
 959
 960                 /*
 961                  * Set backupStartPoint if we're starting recovery from a base backup.
 962                  *
 963                  * Also set backupEndPoint and use minRecoveryPoint as the backup end
 964                  * location if we're starting recovery from a base backup which was
 965                  * taken from a standby. In this case, the database system status in
 966                  * pg_control must indicate that the database was already in recovery.
 967                  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
 968                  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
 969                  * before reaching this point; e.g. because restore_command or
 970                  * primary_conninfo were faulty.
 971                  *
 972                  * Any other state indicates that the backup somehow became corrupted
 973                  * and we can't sensibly continue with recovery.
 974                  */
 975                 if (haveBackupLabel)
 976                 {
 977                         ControlFile->backupStartPoint = checkPoint.redo;
 978                         ControlFile->backupEndRequired = backupEndRequired;
 979
 980                         if (backupFromStandby)
 981                         {
 982                                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
 983                                         dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
 984                                         ereport(FATAL,
 985                                                         (errmsg("backup_label contains data inconsistent with control file"),
 986                                                          errhint("This means that the backup is corrupted and you will "
 987                                                                          "have to use another backup for recovery.")));
 988                                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
 989                         }
 990                 }
 991         }
 992
 993         /* remember these, so that we know when we have reached consistency */
 994         backupStartPoint = ControlFile->backupStartPoint;
 995         backupEndRequired = ControlFile->backupEndRequired;
 996         backupEndPoint = ControlFile->backupEndPoint;
 997         if (InArchiveRecovery)
 998         {
 999                 minRecoveryPoint = ControlFile->minRecoveryPoint;
1000                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1001         }
1002         else
1003         {
1004                 minRecoveryPoint = InvalidXLogRecPtr;
1005                 minRecoveryPointTLI = 0;
1006         }
1007
1008         /*
1009          * Start recovery assuming that the final record isn't lost.
1010          */
1011         abortedRecPtr = InvalidXLogRecPtr;
1012         missingContrecPtr = InvalidXLogRecPtr;
1013
1014         *wasShutdown_ptr = wasShutdown;
1015         *haveBackupLabel_ptr = haveBackupLabel;
1016         *haveTblspcMap_ptr = haveTblspcMap;
1017 }
1018
1019 /*
1020  * See if there are any recovery signal files and if so, set state for
1021  * recovery.
1022  *
1023  * See if there is a recovery command file (recovery.conf), and if so
1024  * throw an ERROR since as of PG12 we no longer recognize that.
1025  */
1026 static void
1027 readRecoverySignalFile(void)
1028 {
1029         struct stat stat_buf;
1030
1031         if (IsBootstrapProcessingMode())
1032                 return;
1033
1034         /*
1035          * Check for old recovery API file: recovery.conf
1036          */
1037         if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1038                 ereport(FATAL,
1039                                 (errcode_for_file_access(),
1040                                  errmsg("using recovery command file \"%s\" is not supported",
1041                                                 RECOVERY_COMMAND_FILE)));
1042
1043         /*
1044          * Remove unused .done file, if present. Ignore if absent.
1045          */
1046         unlink(RECOVERY_COMMAND_DONE);
1047
1048         /*
1049          * Check for recovery signal files and if found, fsync them since they
1050          * represent server state information.  We don't sweat too much about the
1051          * possibility of fsync failure, however.
1052          *
1053          * If present, standby signal file takes precedence. If neither is present
1054          * then we won't enter archive recovery.
1055          */
1056         if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1057         {
1058                 int                     fd;
1059
1060                 fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1061                                                            S_IRUSR | S_IWUSR);
1062                 if (fd >= 0)
1063                 {
1064                         (void) pg_fsync(fd);
1065                         close(fd);
1066                 }
1067                 standby_signal_file_found = true;
1068         }
1069         else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1070         {
1071                 int                     fd;
1072
1073                 fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1074                                                            S_IRUSR | S_IWUSR);
1075                 if (fd >= 0)
1076                 {
1077                         (void) pg_fsync(fd);
1078                         close(fd);
1079                 }
1080                 recovery_signal_file_found = true;
1081         }
1082
1083         StandbyModeRequested = false;
1084         ArchiveRecoveryRequested = false;
1085         if (standby_signal_file_found)
1086         {
1087                 StandbyModeRequested = true;
1088                 ArchiveRecoveryRequested = true;
1089         }
1090         else if (recovery_signal_file_found)
1091         {
1092                 StandbyModeRequested = false;
1093                 ArchiveRecoveryRequested = true;
1094         }
1095         else
1096                 return;
1097
1098         /*
1099          * We don't support standby mode in standalone backends; that requires
1100          * other processes such as the WAL receiver to be alive.
1101          */
1102         if (StandbyModeRequested && !IsUnderPostmaster)
1103                 ereport(FATAL,
1104                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1105                                  errmsg("standby mode is not supported by single-user servers")));
1106 }
1107
1108 static void
1109 validateRecoveryParameters(void)
1110 {
1111         if (!ArchiveRecoveryRequested)
1112                 return;
1113
1114         /*
1115          * Check for compulsory parameters
1116          */
1117         if (StandbyModeRequested)
1118         {
1119                 if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1120                         (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1121                         ereport(WARNING,
1122                                         (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1123                                          errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1124         }
1125         else
1126         {
1127                 if (recoveryRestoreCommand == NULL ||
1128                         strcmp(recoveryRestoreCommand, "") == 0)
1129                         ereport(FATAL,
1130                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1131                                          errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1132         }
1133
1134         /*
1135          * Override any inconsistent requests. Note that this is a change of
1136          * behaviour in 9.5; prior to this we simply ignored a request to pause if
1137          * hot_standby = off, which was surprising behaviour.
1138          */
1139         if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1140                 !EnableHotStandby)
1141                 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1142
1143         /*
1144          * Final parsing of recovery_target_time string; see also
1145          * check_recovery_target_time().
1146          */
1147         if (recoveryTarget == RECOVERY_TARGET_TIME)
1148         {
1149                 recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1150                                                                                                                                          CStringGetDatum(recovery_target_time_string),
1151                                                                                                                                          ObjectIdGetDatum(InvalidOid),
1152                                                                                                                                          Int32GetDatum(-1)));
1153         }
1154
1155         /*
1156          * If user specified recovery_target_timeline, validate it or compute the
1157          * "latest" value.  We can't do this until after we've gotten the restore
1158          * command and set InArchiveRecovery, because we need to fetch timeline
1159          * history files from the archive.
1160          */
1161         if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1162         {
1163                 TimeLineID      rtli = recoveryTargetTLIRequested;
1164
1165                 /* Timeline 1 does not have a history file, all else should */
1166                 if (rtli != 1 && !existsTimeLineHistory(rtli))
1167                         ereport(FATAL,
1168                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1169                                          errmsg("recovery target timeline %u does not exist",
1170                                                         rtli)));
1171                 recoveryTargetTLI = rtli;
1172         }
1173         else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1174         {
1175                 /* We start the "latest" search from pg_control's timeline */
1176                 recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1177         }
1178         else
1179         {
1180                 /*
1181                  * else we just use the recoveryTargetTLI as already read from
1182                  * ControlFile
1183                  */
1184                 Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1185         }
1186 }
1187
1188 /*
1189  * read_backup_label: check to see if a backup_label file is present
1190  *
1191  * If we see a backup_label during recovery, we assume that we are recovering
1192  * from a backup dump file, and we therefore roll forward from the checkpoint
1193  * identified by the label file, NOT what pg_control says.  This avoids the
1194  * problem that pg_control might have been archived one or more checkpoints
1195  * later than the start of the dump, and so if we rely on it as the start
1196  * point, we will fail to restore a consistent database state.
1197  *
1198  * Returns true if a backup_label was found (and fills the checkpoint
1199  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1200  * returns false if not. If this backup_label came from a streamed backup,
1201  * *backupEndRequired is set to true. If this backup_label was created during
1202  * recovery, *backupFromStandby is set to true.
1203  *
1204  * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1205  * and TLI read from the backup file.
1206  */
1207 static bool
1208 read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1209                                   bool *backupEndRequired, bool *backupFromStandby)
1210 {
1211         char            startxlogfilename[MAXFNAMELEN];
1212         TimeLineID      tli_from_walseg,
1213                                 tli_from_file;
1214         FILE       *lfp;
1215         char            ch;
1216         char            backuptype[20];
1217         char            backupfrom[20];
1218         char            backuplabel[MAXPGPATH];
1219         char            backuptime[128];
1220         uint32          hi,
1221                                 lo;
1222
1223         /* suppress possible uninitialized-variable warnings */
1224         *checkPointLoc = InvalidXLogRecPtr;
1225         *backupLabelTLI = 0;
1226         *backupEndRequired = false;
1227         *backupFromStandby = false;
1228
1229         /*
1230          * See if label file is present
1231          */
1232         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1233         if (!lfp)
1234         {
1235                 if (errno != ENOENT)
1236                         ereport(FATAL,
1237                                         (errcode_for_file_access(),
1238                                          errmsg("could not read file \"%s\": %m",
1239                                                         BACKUP_LABEL_FILE)));
1240                 return false;                   /* it's not there, all is fine */
1241         }
1242
1243         /*
1244          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1245          * is pretty crude, but we are not expecting any variability in the file
1246          * format).
1247          */
1248         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1249                            &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1250                 ereport(FATAL,
1251                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1252                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1253         RedoStartLSN = ((uint64) hi) << 32 | lo;
1254         RedoStartTLI = tli_from_walseg;
1255         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1256                            &hi, &lo, &ch) != 3 || ch != '\n')
1257                 ereport(FATAL,
1258                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1259                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1260         *checkPointLoc = ((uint64) hi) << 32 | lo;
1261         *backupLabelTLI = tli_from_walseg;
1262
1263         /*
1264          * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1265          * which could mean either pg_basebackup or the pg_backup_start/stop
1266          * method was used) or if this label came from somewhere else (the only
1267          * other option today being from pg_rewind).  If this was a streamed
1268          * backup then we know that we need to play through until we get to the
1269          * end of the WAL which was generated during the backup (at which point we
1270          * will have reached consistency and backupEndRequired will be reset to be
1271          * false).
1272          */
1273         if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1274         {
1275                 if (strcmp(backuptype, "streamed") == 0)
1276                         *backupEndRequired = true;
1277         }
1278
1279         /*
1280          * BACKUP FROM lets us know if this was from a primary or a standby.  If
1281          * it was from a standby, we'll double-check that the control file state
1282          * matches that of a standby.
1283          */
1284         if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1285         {
1286                 if (strcmp(backupfrom, "standby") == 0)
1287                         *backupFromStandby = true;
1288         }
1289
1290         /*
1291          * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1292          * but checking for their presence is useful for debugging and the next
1293          * sanity checks. Cope also with the fact that the result buffers have a
1294          * pre-allocated size, hence if the backup_label file has been generated
1295          * with strings longer than the maximum assumed here an incorrect parsing
1296          * happens. That's fine as only minor consistency checks are done
1297          * afterwards.
1298          */
1299         if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1300                 ereport(DEBUG1,
1301                                 (errmsg_internal("backup time %s in file \"%s\"",
1302                                                                  backuptime, BACKUP_LABEL_FILE)));
1303
1304         if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1305                 ereport(DEBUG1,
1306                                 (errmsg_internal("backup label %s in file \"%s\"",
1307                                                                  backuplabel, BACKUP_LABEL_FILE)));
1308
1309         /*
1310          * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1311          * it as a sanity check if present.
1312          */
1313         if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1314         {
1315                 if (tli_from_walseg != tli_from_file)
1316                         ereport(FATAL,
1317                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1318                                          errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1319                                          errdetail("Timeline ID parsed is %u, but expected %u.",
1320                                                            tli_from_file, tli_from_walseg)));
1321
1322                 ereport(DEBUG1,
1323                                 (errmsg_internal("backup timeline %u in file \"%s\"",
1324                                                                  tli_from_file, BACKUP_LABEL_FILE)));
1325         }
1326
1327         if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1328                 ereport(FATAL,
1329                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1330                                  errmsg("this is an incremental backup, not a data directory"),
1331                                  errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1332
1333         if (ferror(lfp) || FreeFile(lfp))
1334                 ereport(FATAL,
1335                                 (errcode_for_file_access(),
1336                                  errmsg("could not read file \"%s\": %m",
1337                                                 BACKUP_LABEL_FILE)));
1338
1339         return true;
1340 }
1341
1342 /*
1343  * read_tablespace_map: check to see if a tablespace_map file is present
1344  *
1345  * If we see a tablespace_map file during recovery, we assume that we are
1346  * recovering from a backup dump file, and we therefore need to create symlinks
1347  * as per the information present in tablespace_map file.
1348  *
1349  * Returns true if a tablespace_map file was found (and fills *tablespaces
1350  * with a tablespaceinfo struct for each tablespace listed in the file);
1351  * returns false if not.
1352  */
1353 static bool
1354 read_tablespace_map(List **tablespaces)
1355 {
1356         tablespaceinfo *ti;
1357         FILE       *lfp;
1358         char            str[MAXPGPATH];
1359         int                     ch,
1360                                 i,
1361                                 n;
1362         bool            was_backslash;
1363
1364         /*
1365          * See if tablespace_map file is present
1366          */
1367         lfp = AllocateFile(TABLESPACE_MAP, "r");
1368         if (!lfp)
1369         {
1370                 if (errno != ENOENT)
1371                         ereport(FATAL,
1372                                         (errcode_for_file_access(),
1373                                          errmsg("could not read file \"%s\": %m",
1374                                                         TABLESPACE_MAP)));
1375                 return false;                   /* it's not there, all is fine */
1376         }
1377
1378         /*
1379          * Read and parse the link name and path lines from tablespace_map file
1380          * (this code is pretty crude, but we are not expecting any variability in
1381          * the file format).  De-escape any backslashes that were inserted.
1382          */
1383         i = 0;
1384         was_backslash = false;
1385         while ((ch = fgetc(lfp)) != EOF)
1386         {
1387                 if (!was_backslash && (ch == '\n' || ch == '\r'))
1388                 {
1389                         char       *endp;
1390
1391                         if (i == 0)
1392                                 continue;               /* \r immediately followed by \n */
1393
1394                         /*
1395                          * The de-escaped line should contain an OID followed by exactly
1396                          * one space followed by a path.  The path might start with
1397                          * spaces, so don't be too liberal about parsing.
1398                          */
1399                         str[i] = '\0';
1400                         n = 0;
1401                         while (str[n] && str[n] != ' ')
1402                                 n++;
1403                         if (n < 1 || n >= i - 1)
1404                                 ereport(FATAL,
1405                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1406                                                  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1407                         str[n++] = '\0';
1408
1409                         ti = palloc0(sizeof(tablespaceinfo));
1410                         errno = 0;
1411                         ti->oid = strtoul(str, &endp, 10);
1412                         if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1413                                 ereport(FATAL,
1414                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1415                                                  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1416                         ti->path = pstrdup(str + n);
1417                         *tablespaces = lappend(*tablespaces, ti);
1418
1419                         i = 0;
1420                         continue;
1421                 }
1422                 else if (!was_backslash && ch == '\\')
1423                         was_backslash = true;
1424                 else
1425                 {
1426                         if (i < sizeof(str) - 1)
1427                                 str[i++] = ch;
1428                         was_backslash = false;
1429                 }
1430         }
1431
1432         if (i != 0 || was_backslash)    /* last line not terminated? */
1433                 ereport(FATAL,
1434                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1435                                  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1436
1437         if (ferror(lfp) || FreeFile(lfp))
1438                 ereport(FATAL,
1439                                 (errcode_for_file_access(),
1440                                  errmsg("could not read file \"%s\": %m",
1441                                                 TABLESPACE_MAP)));
1442
1443         return true;
1444 }
1445
1446 /*
1447  * Finish WAL recovery.
1448  *
1449  * This does not close the 'xlogreader' yet, because in some cases the caller
1450  * still wants to re-read the last checkpoint record by calling
1451  * ReadCheckpointRecord().
1452  *
1453  * Returns the position of the last valid or applied record, after which new
1454  * WAL should be appended, information about why recovery was ended, and some
1455  * other things. See the EndOfWalRecoveryInfo struct for details.
1456  */
1457 EndOfWalRecoveryInfo *
1458 FinishWalRecovery(void)
1459 {
1460         EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo));
1461         XLogRecPtr      lastRec;
1462         TimeLineID      lastRecTLI;
1463         XLogRecPtr      endOfLog;
1464
1465         /*
1466          * Kill WAL receiver, if it's still running, before we continue to write
1467          * the startup checkpoint and aborted-contrecord records. It will trump
1468          * over these records and subsequent ones if it's still alive when we
1469          * start writing WAL.
1470          */
1471         XLogShutdownWalRcv();
1472
1473         /*
1474          * Shutdown the slot sync worker to drop any temporary slots acquired by
1475          * it and to prevent it from keep trying to fetch the failover slots.
1476          *
1477          * We do not update the 'synced' column in 'pg_replication_slots' system
1478          * view from true to false here, as any failed update could leave 'synced'
1479          * column false for some slots. This could cause issues during slot sync
1480          * after restarting the server as a standby. While updating the 'synced'
1481          * column after switching to the new timeline is an option, it does not
1482          * simplify the handling for the 'synced' column. Therefore, we retain the
1483          * 'synced' column as true after promotion as it may provide useful
1484          * information about the slot origin.
1485          */
1486         ShutDownSlotSync();
1487
1488         /*
1489          * We are now done reading the xlog from stream. Turn off streaming
1490          * recovery to force fetching the files (which would be required at end of
1491          * recovery, e.g., timeline history file) from archive or pg_wal.
1492          *
1493          * Note that standby mode must be turned off after killing WAL receiver,
1494          * i.e., calling XLogShutdownWalRcv().
1495          */
1496         Assert(!WalRcvStreaming());
1497         StandbyMode = false;
1498
1499         /*
1500          * Determine where to start writing WAL next.
1501          *
1502          * Re-fetch the last valid or last applied record, so we can identify the
1503          * exact endpoint of what we consider the valid portion of WAL.  There may
1504          * be an incomplete continuation record after that, in which case
1505          * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1506          * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1507          * it is intentionally missing.  See CreateOverwriteContrecordRecord().
1508          *
1509          * An important side-effect of this is to load the last page into
1510          * xlogreader. The caller uses it to initialize the WAL for writing.
1511          */
1512         if (!InRecovery)
1513         {
1514                 lastRec = CheckPointLoc;
1515                 lastRecTLI = CheckPointTLI;
1516         }
1517         else
1518         {
1519                 lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1520                 lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1521         }
1522         XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1523         (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1524         endOfLog = xlogreader->EndRecPtr;
1525
1526         /*
1527          * Remember the TLI in the filename of the XLOG segment containing the
1528          * end-of-log.  It could be different from the timeline that endOfLog
1529          * nominally belongs to, if there was a timeline switch in that segment,
1530          * and we were reading the old WAL from a segment belonging to a higher
1531          * timeline.
1532          */
1533         result->endOfLogTLI = xlogreader->seg.ws_tli;
1534
1535         if (ArchiveRecoveryRequested)
1536         {
1537                 /*
1538                  * We are no longer in archive recovery state.
1539                  *
1540                  * We are now done reading the old WAL.  Turn off archive fetching if
1541                  * it was active.
1542                  */
1543                 Assert(InArchiveRecovery);
1544                 InArchiveRecovery = false;
1545
1546                 /*
1547                  * If the ending log segment is still open, close it (to avoid
1548                  * problems on Windows with trying to rename or delete an open file).
1549                  */
1550                 if (readFile >= 0)
1551                 {
1552                         close(readFile);
1553                         readFile = -1;
1554                 }
1555         }
1556
1557         /*
1558          * Copy the last partial block to the caller, for initializing the WAL
1559          * buffer for appending new WAL.
1560          */
1561         if (endOfLog % XLOG_BLCKSZ != 0)
1562         {
1563                 char       *page;
1564                 int                     len;
1565                 XLogRecPtr      pageBeginPtr;
1566
1567                 pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1568                 Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1569
1570                 /* Copy the valid part of the last block */
1571                 len = endOfLog % XLOG_BLCKSZ;
1572                 page = palloc(len);
1573                 memcpy(page, xlogreader->readBuf, len);
1574
1575                 result->lastPageBeginPtr = pageBeginPtr;
1576                 result->lastPage = page;
1577         }
1578         else
1579         {
1580                 /* There is no partial block to copy. */
1581                 result->lastPageBeginPtr = endOfLog;
1582                 result->lastPage = NULL;
1583         }
1584
1585         /*
1586          * Create a comment for the history file to explain why and where timeline
1587          * changed.
1588          */
1589         result->recoveryStopReason = getRecoveryStopReason();
1590
1591         result->lastRec = lastRec;
1592         result->lastRecTLI = lastRecTLI;
1593         result->endOfLog = endOfLog;
1594
1595         result->abortedRecPtr = abortedRecPtr;
1596         result->missingContrecPtr = missingContrecPtr;
1597
1598         result->standby_signal_file_found = standby_signal_file_found;
1599         result->recovery_signal_file_found = recovery_signal_file_found;
1600
1601         return result;
1602 }
1603
1604 /*
1605  * Clean up the WAL reader and leftovers from restoring WAL from archive
1606  */
1607 void
1608 ShutdownWalRecovery(void)
1609 {
1610         char            recoveryPath[MAXPGPATH];
1611
1612         /* Final update of pg_stat_recovery_prefetch. */
1613         XLogPrefetcherComputeStats(xlogprefetcher);
1614
1615         /* Shut down xlogreader */
1616         if (readFile >= 0)
1617         {
1618                 close(readFile);
1619                 readFile = -1;
1620         }
1621         XLogReaderFree(xlogreader);
1622         XLogPrefetcherFree(xlogprefetcher);
1623
1624         if (ArchiveRecoveryRequested)
1625         {
1626                 /*
1627                  * Since there might be a partial WAL segment named RECOVERYXLOG, get
1628                  * rid of it.
1629                  */
1630                 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1631                 unlink(recoveryPath);   /* ignore any error */
1632
1633                 /* Get rid of any remaining recovered timeline-history file, too */
1634                 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1635                 unlink(recoveryPath);   /* ignore any error */
1636         }
1637
1638         /*
1639          * We don't need the latch anymore. It's not strictly necessary to disown
1640          * it, but let's do it for the sake of tidiness.
1641          */
1642         if (ArchiveRecoveryRequested)
1643                 DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1644 }
1645
1646 /*
1647  * Perform WAL recovery.
1648  *
1649  * If the system was shut down cleanly, this is never called.
1650  */
1651 void
1652 PerformWalRecovery(void)
1653 {
1654         XLogRecord *record;
1655         bool            reachedRecoveryTarget = false;
1656         TimeLineID      replayTLI;
1657
1658         /*
1659          * Initialize shared variables for tracking progress of WAL replay, as if
1660          * we had just replayed the record before the REDO location (or the
1661          * checkpoint record itself, if it's a shutdown checkpoint).
1662          */
1663         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1664         if (RedoStartLSN < CheckPointLoc)
1665         {
1666                 XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1667                 XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1668                 XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1669         }
1670         else
1671         {
1672                 XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1673                 XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1674                 XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1675         }
1676         XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1677         XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1678         XLogRecoveryCtl->recoveryLastXTime = 0;
1679         XLogRecoveryCtl->currentChunkStartTime = 0;
1680         XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1681         SpinLockRelease(&XLogRecoveryCtl->info_lck);
1682
1683         /* Also ensure XLogReceiptTime has a sane value */
1684         XLogReceiptTime = GetCurrentTimestamp();
1685
1686         /*
1687          * Let postmaster know we've started redo now, so that it can launch the
1688          * archiver if necessary.
1689          */
1690         if (IsUnderPostmaster)
1691                 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1692
1693         /*
1694          * Allow read-only connections immediately if we're consistent already.
1695          */
1696         CheckRecoveryConsistency();
1697
1698         /*
1699          * Find the first record that logically follows the checkpoint --- it
1700          * might physically precede it, though.
1701          */
1702         if (RedoStartLSN < CheckPointLoc)
1703         {
1704                 /* back up to find the record */
1705                 replayTLI = RedoStartTLI;
1706                 XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
1707                 record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1708
1709                 /*
1710                  * If a checkpoint record's redo pointer points back to an earlier
1711                  * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1712                  * record.
1713                  */
1714                 if (record->xl_rmid != RM_XLOG_ID ||
1715                         (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1716                         ereport(FATAL,
1717                                         (errmsg("unexpected record type found at redo point %X/%X",
1718                                                         LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
1719         }
1720         else
1721         {
1722                 /* just have to read next record after CheckPoint */
1723                 Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1724                 replayTLI = CheckPointTLI;
1725                 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1726         }
1727
1728         if (record != NULL)
1729         {
1730                 TimestampTz xtime;
1731                 PGRUsage        ru0;
1732
1733                 pg_rusage_init(&ru0);
1734
1735                 InRedo = true;
1736
1737                 RmgrStartup();
1738
1739                 ereport(LOG,
1740                                 (errmsg("redo starts at %X/%X",
1741                                                 LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
1742
1743                 /* Prepare to report progress of the redo phase. */
1744                 if (!StandbyMode)
1745                         begin_startup_progress_phase();
1746
1747                 /*
1748                  * main redo apply loop
1749                  */
1750                 do
1751                 {
1752                         if (!StandbyMode)
1753                                 ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1754                                                                                  LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1755
1756 #ifdef WAL_DEBUG
1757                         if (XLOG_DEBUG)
1758                         {
1759                                 StringInfoData buf;
1760
1761                                 initStringInfo(&buf);
1762                                 appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1763                                                                  LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1764                                                                  LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1765                                 xlog_outrec(&buf, xlogreader);
1766                                 appendStringInfoString(&buf, " - ");
1767                                 xlog_outdesc(&buf, xlogreader);
1768                                 elog(LOG, "%s", buf.data);
1769                                 pfree(buf.data);
1770                         }
1771 #endif
1772
1773                         /* Handle interrupt signals of startup process */
1774                         HandleStartupProcInterrupts();
1775
1776                         /*
1777                          * Pause WAL replay, if requested by a hot-standby session via
1778                          * SetRecoveryPause().
1779                          *
1780                          * Note that we intentionally don't take the info_lck spinlock
1781                          * here.  We might therefore read a slightly stale value of the
1782                          * recoveryPause flag, but it can't be very stale (no worse than
1783                          * the last spinlock we did acquire).  Since a pause request is a
1784                          * pretty asynchronous thing anyway, possibly responding to it one
1785                          * WAL record later than we otherwise would is a minor issue, so
1786                          * it doesn't seem worth adding another spinlock cycle to prevent
1787                          * that.
1788                          */
1789                         if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1790                                 RECOVERY_NOT_PAUSED)
1791                                 recoveryPausesHere(false);
1792
1793                         /*
1794                          * Have we reached our recovery target?
1795                          */
1796                         if (recoveryStopsBefore(xlogreader))
1797                         {
1798                                 reachedRecoveryTarget = true;
1799                                 break;
1800                         }
1801
1802                         /*
1803                          * If we've been asked to lag the primary, wait on latch until
1804                          * enough time has passed.
1805                          */
1806                         if (recoveryApplyDelay(xlogreader))
1807                         {
1808                                 /*
1809                                  * We test for paused recovery again here. If user sets
1810                                  * delayed apply, it may be because they expect to pause
1811                                  * recovery in case of problems, so we must test again here
1812                                  * otherwise pausing during the delay-wait wouldn't work.
1813                                  */
1814                                 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1815                                         RECOVERY_NOT_PAUSED)
1816                                         recoveryPausesHere(false);
1817                         }
1818
1819                         /*
1820                          * Apply the record
1821                          */
1822                         ApplyWalRecord(xlogreader, record, &replayTLI);
1823
1824                         /* Exit loop if we reached inclusive recovery target */
1825                         if (recoveryStopsAfter(xlogreader))
1826                         {
1827                                 reachedRecoveryTarget = true;
1828                                 break;
1829                         }
1830
1831                         /* Else, try to fetch the next WAL record */
1832                         record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1833                 } while (record != NULL);
1834
1835                 /*
1836                  * end of main redo apply loop
1837                  */
1838
1839                 if (reachedRecoveryTarget)
1840                 {
1841                         if (!reachedConsistency)
1842                                 ereport(FATAL,
1843                                                 (errmsg("requested recovery stop point is before consistent recovery point")));
1844
1845                         /*
1846                          * This is the last point where we can restart recovery with a new
1847                          * recovery target, if we shutdown and begin again. After this,
1848                          * Resource Managers may choose to do permanent corrective actions
1849                          * at end of recovery.
1850                          */
1851                         switch (recoveryTargetAction)
1852                         {
1853                                 case RECOVERY_TARGET_ACTION_SHUTDOWN:
1854
1855                                         /*
1856                                          * exit with special return code to request shutdown of
1857                                          * postmaster.  Log messages issued from postmaster.
1858                                          */
1859                                         proc_exit(3);
1860
1861                                 case RECOVERY_TARGET_ACTION_PAUSE:
1862                                         SetRecoveryPause(true);
1863                                         recoveryPausesHere(true);
1864
1865                                         /* drop into promote */
1866
1867                                 case RECOVERY_TARGET_ACTION_PROMOTE:
1868                                         break;
1869                         }
1870                 }
1871
1872                 RmgrCleanup();
1873
1874                 ereport(LOG,
1875                                 (errmsg("redo done at %X/%X system usage: %s",
1876                                                 LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1877                                                 pg_rusage_show(&ru0))));
1878                 xtime = GetLatestXTime();
1879                 if (xtime)
1880                         ereport(LOG,
1881                                         (errmsg("last completed transaction was at log time %s",
1882                                                         timestamptz_to_str(xtime))));
1883
1884                 InRedo = false;
1885         }
1886         else
1887         {
1888                 /* there are no WAL records following the checkpoint */
1889                 ereport(LOG,
1890                                 (errmsg("redo is not required")));
1891         }
1892
1893         /*
1894          * This check is intentionally after the above log messages that indicate
1895          * how far recovery went.
1896          */
1897         if (ArchiveRecoveryRequested &&
1898                 recoveryTarget != RECOVERY_TARGET_UNSET &&
1899                 !reachedRecoveryTarget)
1900                 ereport(FATAL,
1901                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1902                                  errmsg("recovery ended before configured recovery target was reached")));
1903 }
1904
1905 /*
1906  * Subroutine of PerformWalRecovery, to apply one WAL record.
1907  */
1908 static void
1909 ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1910 {
1911         ErrorContextCallback errcallback;
1912         bool            switchedTLI = false;
1913
1914         /* Setup error traceback support for ereport() */
1915         errcallback.callback = rm_redo_error_callback;
1916         errcallback.arg = xlogreader;
1917         errcallback.previous = error_context_stack;
1918         error_context_stack = &errcallback;
1919
1920         /*
1921          * TransamVariables->nextXid must be beyond record's xid.
1922          */
1923         AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1924
1925         /*
1926          * Before replaying this record, check if this record causes the current
1927          * timeline to change. The record is already considered to be part of the
1928          * new timeline, so we update replayTLI before replaying it. That's
1929          * important so that replayEndTLI, which is recorded as the minimum
1930          * recovery point's TLI if recovery stops after this record, is set
1931          * correctly.
1932          */
1933         if (record->xl_rmid == RM_XLOG_ID)
1934         {
1935                 TimeLineID      newReplayTLI = *replayTLI;
1936                 TimeLineID      prevReplayTLI = *replayTLI;
1937                 uint8           info = record->xl_info & ~XLR_INFO_MASK;
1938
1939                 if (info == XLOG_CHECKPOINT_SHUTDOWN)
1940                 {
1941                         CheckPoint      checkPoint;
1942
1943                         memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1944                         newReplayTLI = checkPoint.ThisTimeLineID;
1945                         prevReplayTLI = checkPoint.PrevTimeLineID;
1946                 }
1947                 else if (info == XLOG_END_OF_RECOVERY)
1948                 {
1949                         xl_end_of_recovery xlrec;
1950
1951                         memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1952                         newReplayTLI = xlrec.ThisTimeLineID;
1953                         prevReplayTLI = xlrec.PrevTimeLineID;
1954                 }
1955
1956                 if (newReplayTLI != *replayTLI)
1957                 {
1958                         /* Check that it's OK to switch to this TLI */
1959                         checkTimeLineSwitch(xlogreader->EndRecPtr,
1960                                                                 newReplayTLI, prevReplayTLI, *replayTLI);
1961
1962                         /* Following WAL records should be run with new TLI */
1963                         *replayTLI = newReplayTLI;
1964                         switchedTLI = true;
1965                 }
1966         }
1967
1968         /*
1969          * Update shared replayEndRecPtr before replaying this record, so that
1970          * XLogFlush will update minRecoveryPoint correctly.
1971          */
1972         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1973         XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1974         XLogRecoveryCtl->replayEndTLI = *replayTLI;
1975         SpinLockRelease(&XLogRecoveryCtl->info_lck);
1976
1977         /*
1978          * If we are attempting to enter Hot Standby mode, process XIDs we see
1979          */
1980         if (standbyState >= STANDBY_INITIALIZED &&
1981                 TransactionIdIsValid(record->xl_xid))
1982                 RecordKnownAssignedTransactionIds(record->xl_xid);
1983
1984         /*
1985          * Some XLOG record types that are related to recovery are processed
1986          * directly here, rather than in xlog_redo()
1987          */
1988         if (record->xl_rmid == RM_XLOG_ID)
1989                 xlogrecovery_redo(xlogreader, *replayTLI);
1990
1991         /* Now apply the WAL record itself */
1992         GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1993
1994         /*
1995          * After redo, check whether the backup pages associated with the WAL
1996          * record are consistent with the existing pages. This check is done only
1997          * if consistency check is enabled for this record.
1998          */
1999         if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2000                 verifyBackupPageConsistency(xlogreader);
2001
2002         /* Pop the error context stack */
2003         error_context_stack = errcallback.previous;
2004
2005         /*
2006          * Update lastReplayedEndRecPtr after this record has been successfully
2007          * replayed.
2008          */
2009         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2010         XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
2011         XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
2012         XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2013         SpinLockRelease(&XLogRecoveryCtl->info_lck);
2014
2015         /* ------
2016          * Wakeup walsenders:
2017          *
2018          * On the standby, the WAL is flushed first (which will only wake up
2019          * physical walsenders) and then applied, which will only wake up logical
2020          * walsenders.
2021          *
2022          * Indeed, logical walsenders on standby can't decode and send data until
2023          * it's been applied.
2024          *
2025          * Physical walsenders don't need to be woken up during replay unless
2026          * cascading replication is allowed and time line change occurred (so that
2027          * they can notice that they are on a new time line).
2028          *
2029          * That's why the wake up conditions are for:
2030          *
2031          *  - physical walsenders in case of new time line and cascade
2032          *    replication is allowed
2033          *  - logical walsenders in case cascade replication is allowed (could not
2034          *    be created otherwise)
2035          * ------
2036          */
2037         if (AllowCascadeReplication())
2038                 WalSndWakeup(switchedTLI, true);
2039
2040         /*
2041          * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2042          * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2043          * a reply to the primary.
2044          */
2045         if (doRequestWalReceiverReply)
2046         {
2047                 doRequestWalReceiverReply = false;
2048                 WalRcvForceReply();
2049         }
2050
2051         /* Allow read-only connections if we're consistent now */
2052         CheckRecoveryConsistency();
2053
2054         /* Is this a timeline switch? */
2055         if (switchedTLI)
2056         {
2057                 /*
2058                  * Before we continue on the new timeline, clean up any (possibly
2059                  * bogus) future WAL segments on the old timeline.
2060                  */
2061                 RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2062
2063                 /* Reset the prefetcher. */
2064                 XLogPrefetchReconfigure();
2065         }
2066 }
2067
2068 /*
2069  * Some XLOG RM record types that are directly related to WAL recovery are
2070  * handled here rather than in the xlog_redo()
2071  */
2072 static void
2073 xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2074 {
2075         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2076         XLogRecPtr      lsn = record->EndRecPtr;
2077
2078         Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2079
2080         if (info == XLOG_OVERWRITE_CONTRECORD)
2081         {
2082                 /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2083                 xl_overwrite_contrecord xlrec;
2084
2085                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2086                 if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2087                         elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2088                                  LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2089                                  LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2090
2091                 /* We have safely skipped the aborted record */
2092                 abortedRecPtr = InvalidXLogRecPtr;
2093                 missingContrecPtr = InvalidXLogRecPtr;
2094
2095                 ereport(LOG,
2096                                 (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2097                                                 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2098                                                 timestamptz_to_str(xlrec.overwrite_time))));
2099
2100                 /* Verifying the record should only happen once */
2101                 record->overwrittenRecPtr = InvalidXLogRecPtr;
2102         }
2103         else if (info == XLOG_BACKUP_END)
2104         {
2105                 XLogRecPtr      startpoint;
2106
2107                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2108
2109                 if (backupStartPoint == startpoint)
2110                 {
2111                         /*
2112                          * We have reached the end of base backup, the point where
2113                          * pg_backup_stop() was done.  The data on disk is now consistent
2114                          * (assuming we have also reached minRecoveryPoint).  Set
2115                          * backupEndPoint to the current LSN, so that the next call to
2116                          * CheckRecoveryConsistency() will notice it and do the
2117                          * end-of-backup processing.
2118                          */
2119                         elog(DEBUG1, "end of backup record reached");
2120
2121                         backupEndPoint = lsn;
2122                 }
2123                 else
2124                         elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2125                                  LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2126         }
2127 }
2128
2129 /*
2130  * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2131  * directories.
2132  *
2133  * Replay of database creation XLOG records for databases that were later
2134  * dropped can create fake directories in pg_tblspc.  By the time consistency
2135  * is reached these directories should have been removed; here we verify
2136  * that this did indeed happen.  This is to be called at the point where
2137  * consistent state is reached.
2138  *
2139  * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2140  * useful for testing purposes, and also allows for an escape hatch in case
2141  * things go south.
2142  */
2143 static void
2144 CheckTablespaceDirectory(void)
2145 {
2146         DIR                *dir;
2147         struct dirent *de;
2148
2149         dir = AllocateDir(PG_TBLSPC_DIR);
2150         while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2151         {
2152                 char            path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2153
2154                 /* Skip entries of non-oid names */
2155                 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2156                         continue;
2157
2158                 snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2159
2160                 if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2161                         ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2162                                         (errcode(ERRCODE_DATA_CORRUPTED),
2163                                          errmsg("unexpected directory entry \"%s\" found in %s",
2164                                                         de->d_name, PG_TBLSPC_DIR),
2165                                          errdetail("All directory entries in %s/ should be symbolic links.",
2166                                                            PG_TBLSPC_DIR),
2167                                          errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2168         }
2169 }
2170
2171 /*
2172  * Checks if recovery has reached a consistent state. When consistency is
2173  * reached and we have a valid starting standby snapshot, tell postmaster
2174  * that it can start accepting read-only connections.
2175  */
2176 static void
2177 CheckRecoveryConsistency(void)
2178 {
2179         XLogRecPtr      lastReplayedEndRecPtr;
2180         TimeLineID      lastReplayedTLI;
2181
2182         /*
2183          * During crash recovery, we don't reach a consistent state until we've
2184          * replayed all the WAL.
2185          */
2186         if (XLogRecPtrIsInvalid(minRecoveryPoint))
2187                 return;
2188
2189         Assert(InArchiveRecovery);
2190
2191         /*
2192          * assume that we are called in the startup process, and hence don't need
2193          * a lock to read lastReplayedEndRecPtr
2194          */
2195         lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2196         lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2197
2198         /*
2199          * Have we reached the point where our base backup was completed?
2200          */
2201         if (!XLogRecPtrIsInvalid(backupEndPoint) &&
2202                 backupEndPoint <= lastReplayedEndRecPtr)
2203         {
2204                 XLogRecPtr      saveBackupStartPoint = backupStartPoint;
2205                 XLogRecPtr      saveBackupEndPoint = backupEndPoint;
2206
2207                 elog(DEBUG1, "end of backup reached");
2208
2209                 /*
2210                  * We have reached the end of base backup, as indicated by pg_control.
2211                  * Update the control file accordingly.
2212                  */
2213                 ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2214                 backupStartPoint = InvalidXLogRecPtr;
2215                 backupEndPoint = InvalidXLogRecPtr;
2216                 backupEndRequired = false;
2217
2218                 ereport(LOG,
2219                                 (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2220                                                 LSN_FORMAT_ARGS(saveBackupStartPoint),
2221                                                 LSN_FORMAT_ARGS(saveBackupEndPoint))));
2222         }
2223
2224         /*
2225          * Have we passed our safe starting point? Note that minRecoveryPoint is
2226          * known to be incorrectly set if recovering from a backup, until the
2227          * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2228          * All we know prior to that is that we're not consistent yet.
2229          */
2230         if (!reachedConsistency && !backupEndRequired &&
2231                 minRecoveryPoint <= lastReplayedEndRecPtr)
2232         {
2233                 /*
2234                  * Check to see if the XLOG sequence contained any unresolved
2235                  * references to uninitialized pages.
2236                  */
2237                 XLogCheckInvalidPages();
2238
2239                 /*
2240                  * Check that pg_tblspc doesn't contain any real directories. Replay
2241                  * of Database/CREATE_* records may have created fictitious tablespace
2242                  * directories that should have been removed by the time consistency
2243                  * was reached.
2244                  */
2245                 CheckTablespaceDirectory();
2246
2247                 reachedConsistency = true;
2248                 ereport(LOG,
2249                                 (errmsg("consistent recovery state reached at %X/%X",
2250                                                 LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2251         }
2252
2253         /*
2254          * Have we got a valid starting snapshot that will allow queries to be
2255          * run? If so, we can tell postmaster that the database is consistent now,
2256          * enabling connections.
2257          */
2258         if (standbyState == STANDBY_SNAPSHOT_READY &&
2259                 !LocalHotStandbyActive &&
2260                 reachedConsistency &&
2261                 IsUnderPostmaster)
2262         {
2263                 SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2264                 XLogRecoveryCtl->SharedHotStandbyActive = true;
2265                 SpinLockRelease(&XLogRecoveryCtl->info_lck);
2266
2267                 LocalHotStandbyActive = true;
2268
2269                 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2270         }
2271 }
2272
2273 /*
2274  * Error context callback for errors occurring during rm_redo().
2275  */
2276 static void
2277 rm_redo_error_callback(void *arg)
2278 {
2279         XLogReaderState *record = (XLogReaderState *) arg;
2280         StringInfoData buf;
2281
2282         initStringInfo(&buf);
2283         xlog_outdesc(&buf, record);
2284         xlog_block_info(&buf, record);
2285
2286         /* translator: %s is a WAL record description */
2287         errcontext("WAL redo at %X/%X for %s",
2288                            LSN_FORMAT_ARGS(record->ReadRecPtr),
2289                            buf.data);
2290
2291         pfree(buf.data);
2292 }
2293
2294 /*
2295  * Returns a string describing an XLogRecord, consisting of its identity
2296  * optionally followed by a colon, a space, and a further description.
2297  */
2298 void
2299 xlog_outdesc(StringInfo buf, XLogReaderState *record)
2300 {
2301         RmgrData        rmgr = GetRmgr(XLogRecGetRmid(record));
2302         uint8           info = XLogRecGetInfo(record);
2303         const char *id;
2304
2305         appendStringInfoString(buf, rmgr.rm_name);
2306         appendStringInfoChar(buf, '/');
2307
2308         id = rmgr.rm_identify(info);
2309         if (id == NULL)
2310                 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2311         else
2312                 appendStringInfo(buf, "%s: ", id);
2313
2314         rmgr.rm_desc(buf, record);
2315 }
2316
#ifdef WAL_DEBUG

/*
 * xlog_outrec
 *		Append the record's prev-link, xid and data length to 'buf',
 *		followed by information about the blocks it references.
 *
 * Only compiled when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	/* header fields first: previous record pointer, xid, data length */
	appendStringInfo(buf,
					 "prev %X/%X; xid %u"
					 "; len %u",
					 LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
					 XLogRecGetXid(record),
					 XLogRecGetDataLen(record));

	/* then one entry per block reference */
	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2332
2333 /*
2334  * Returns a string giving information about all the blocks in an
2335  * XLogRecord.
2336  */
2337 static void
2338 xlog_block_info(StringInfo buf, XLogReaderState *record)
2339 {
2340         int                     block_id;
2341
2342         /* decode block references */
2343         for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2344         {
2345                 RelFileLocator rlocator;
2346                 ForkNumber      forknum;
2347                 BlockNumber blk;
2348
2349                 if (!XLogRecGetBlockTagExtended(record, block_id,
2350                                                                                 &rlocator, &forknum, &blk, NULL))
2351                         continue;
2352
2353                 if (forknum != MAIN_FORKNUM)
2354                         appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2355                                                          block_id,
2356                                                          rlocator.spcOid, rlocator.dbOid,
2357                                                          rlocator.relNumber,
2358                                                          forknum,
2359                                                          blk);
2360                 else
2361                         appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2362                                                          block_id,
2363                                                          rlocator.spcOid, rlocator.dbOid,
2364                                                          rlocator.relNumber,
2365                                                          blk);
2366                 if (XLogRecHasBlockImage(record, block_id))
2367                         appendStringInfoString(buf, " FPW");
2368         }
2369 }
2370
2371
2372 /*
2373  * Check that it's OK to switch to new timeline during recovery.
2374  *
2375  * 'lsn' is the address of the shutdown checkpoint record we're about to
2376  * replay. (Currently, timeline can only change at a shutdown checkpoint).
2377  */
2378 static void
2379 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2380                                         TimeLineID replayTLI)
2381 {
2382         /* Check that the record agrees on what the current (old) timeline is */
2383         if (prevTLI != replayTLI)
2384                 ereport(PANIC,
2385                                 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2386                                                 prevTLI, replayTLI)));
2387
2388         /*
2389          * The new timeline better be in the list of timelines we expect to see,
2390          * according to the timeline history. It should also not decrease.
2391          */
2392         if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2393                 ereport(PANIC,
2394                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2395                                                 newTLI, replayTLI)));
2396
2397         /*
2398          * If we have not yet reached min recovery point, and we're about to
2399          * switch to a timeline greater than the timeline of the min recovery
2400          * point: trouble. After switching to the new timeline, we could not
2401          * possibly visit the min recovery point on the correct timeline anymore.
2402          * This can happen if there is a newer timeline in the archive that
2403          * branched before the timeline the min recovery point is on, and you
2404          * attempt to do PITR to the new timeline.
2405          */
2406         if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
2407                 lsn < minRecoveryPoint &&
2408                 newTLI > minRecoveryPointTLI)
2409                 ereport(PANIC,
2410                                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2411                                                 newTLI,
2412                                                 LSN_FORMAT_ARGS(minRecoveryPoint),
2413                                                 minRecoveryPointTLI)));
2414
2415         /* Looks good */
2416 }
2417
2418
2419 /*
2420  * Extract timestamp from WAL record.
2421  *
2422  * If the record contains a timestamp, returns true, and saves the timestamp
2423  * in *recordXtime. If the record type has no timestamp, returns false.
2424  * Currently, only transaction commit/abort records and restore points contain
2425  * timestamps.
2426  */
2427 static bool
2428 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2429 {
2430         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2431         uint8           xact_info = info & XLOG_XACT_OPMASK;
2432         uint8           rmid = XLogRecGetRmid(record);
2433
2434         if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2435         {
2436                 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2437                 return true;
2438         }
2439         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2440                                                            xact_info == XLOG_XACT_COMMIT_PREPARED))
2441         {
2442                 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2443                 return true;
2444         }
2445         if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2446                                                            xact_info == XLOG_XACT_ABORT_PREPARED))
2447         {
2448                 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2449                 return true;
2450         }
2451         return false;
2452 }
2453
2454 /*
2455  * Checks whether the current buffer page and backup page stored in the
2456  * WAL record are consistent or not. Before comparing the two pages, a
2457  * masking can be applied to the pages to ignore certain areas like hint bits,
2458  * unused space between pd_lower and pd_upper among other things. This
2459  * function should be called once WAL replay has been completed for a
2460  * given record.
2461  */
2462 static void
2463 verifyBackupPageConsistency(XLogReaderState *record)
2464 {
2465         RmgrData        rmgr = GetRmgr(XLogRecGetRmid(record));
2466         RelFileLocator rlocator;
2467         ForkNumber      forknum;
2468         BlockNumber blkno;
2469         int                     block_id;
2470
2471         /* Records with no backup blocks have no need for consistency checks. */
2472         if (!XLogRecHasAnyBlockRefs(record))
2473                 return;
2474
2475         Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2476
2477         for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2478         {
2479                 Buffer          buf;
2480                 Page            page;
2481
2482                 if (!XLogRecGetBlockTagExtended(record, block_id,
2483                                                                                 &rlocator, &forknum, &blkno, NULL))
2484                 {
2485                         /*
2486                          * WAL record doesn't contain a block reference with the given id.
2487                          * Do nothing.
2488                          */
2489                         continue;
2490                 }
2491
2492                 Assert(XLogRecHasBlockImage(record, block_id));
2493
2494                 if (XLogRecBlockImageApply(record, block_id))
2495                 {
2496                         /*
2497                          * WAL record has already applied the page, so bypass the
2498                          * consistency check as that would result in comparing the full
2499                          * page stored in the record with itself.
2500                          */
2501                         continue;
2502                 }
2503
2504                 /*
2505                  * Read the contents from the current buffer and store it in a
2506                  * temporary page.
2507                  */
2508                 buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2509                                                                          RBM_NORMAL_NO_LOG,
2510                                                                          InvalidBuffer);
2511                 if (!BufferIsValid(buf))
2512                         continue;
2513
2514                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2515                 page = BufferGetPage(buf);
2516
2517                 /*
2518                  * Take a copy of the local page where WAL has been applied to have a
2519                  * comparison base before masking it...
2520                  */
2521                 memcpy(replay_image_masked, page, BLCKSZ);
2522
2523                 /* No need for this page anymore now that a copy is in. */
2524                 UnlockReleaseBuffer(buf);
2525
2526                 /*
2527                  * If the block LSN is already ahead of this WAL record, we can't
2528                  * expect contents to match.  This can happen if recovery is
2529                  * restarted.
2530                  */
2531                 if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2532                         continue;
2533
2534                 /*
2535                  * Read the contents from the backup copy, stored in WAL record and
2536                  * store it in a temporary page. There is no need to allocate a new
2537                  * page here, a local buffer is fine to hold its contents and a mask
2538                  * can be directly applied on it.
2539                  */
2540                 if (!RestoreBlockImage(record, block_id, primary_image_masked))
2541                         ereport(ERROR,
2542                                         (errcode(ERRCODE_INTERNAL_ERROR),
2543                                          errmsg_internal("%s", record->errormsg_buf)));
2544
2545                 /*
2546                  * If masking function is defined, mask both the primary and replay
2547                  * images
2548                  */
2549                 if (rmgr.rm_mask != NULL)
2550                 {
2551                         rmgr.rm_mask(replay_image_masked, blkno);
2552                         rmgr.rm_mask(primary_image_masked, blkno);
2553                 }
2554
2555                 /* Time to compare the primary and replay images. */
2556                 if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2557                 {
2558                         elog(FATAL,
2559                                  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2560                                  rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2561                                  forknum, blkno);
2562                 }
2563         }
2564 }
2565
2566 /*
2567  * For point-in-time recovery, this function decides whether we want to
2568  * stop applying the XLOG before the current record.
2569  *
2570  * Returns true if we are stopping, false otherwise. If stopping, some
2571  * information is saved in recoveryStopXid et al for use in annotating the
2572  * new timeline's history file.
2573  */
2574 static bool
2575 recoveryStopsBefore(XLogReaderState *record)
2576 {
2577         bool            stopsHere = false;
2578         uint8           xact_info;
2579         bool            isCommit;
2580         TimestampTz recordXtime = 0;
2581         TransactionId recordXid;
2582
2583         /*
2584          * Ignore recovery target settings when not in archive recovery (meaning
2585          * we are in crash recovery).
2586          */
2587         if (!ArchiveRecoveryRequested)
2588                 return false;
2589
2590         /* Check if we should stop as soon as reaching consistency */
2591         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2592         {
2593                 ereport(LOG,
2594                                 (errmsg("recovery stopping after reaching consistency")));
2595
2596                 recoveryStopAfter = false;
2597                 recoveryStopXid = InvalidTransactionId;
2598                 recoveryStopLSN = InvalidXLogRecPtr;
2599                 recoveryStopTime = 0;
2600                 recoveryStopName[0] = '\0';
2601                 return true;
2602         }
2603
2604         /* Check if target LSN has been reached */
2605         if (recoveryTarget == RECOVERY_TARGET_LSN &&
2606                 !recoveryTargetInclusive &&
2607                 record->ReadRecPtr >= recoveryTargetLSN)
2608         {
2609                 recoveryStopAfter = false;
2610                 recoveryStopXid = InvalidTransactionId;
2611                 recoveryStopLSN = record->ReadRecPtr;
2612                 recoveryStopTime = 0;
2613                 recoveryStopName[0] = '\0';
2614                 ereport(LOG,
2615                                 (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2616                                                 LSN_FORMAT_ARGS(recoveryStopLSN))));
2617                 return true;
2618         }
2619
2620         /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2621         if (XLogRecGetRmid(record) != RM_XACT_ID)
2622                 return false;
2623
2624         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2625
2626         if (xact_info == XLOG_XACT_COMMIT)
2627         {
2628                 isCommit = true;
2629                 recordXid = XLogRecGetXid(record);
2630         }
2631         else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2632         {
2633                 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2634                 xl_xact_parsed_commit parsed;
2635
2636                 isCommit = true;
2637                 ParseCommitRecord(XLogRecGetInfo(record),
2638                                                   xlrec,
2639                                                   &parsed);
2640                 recordXid = parsed.twophase_xid;
2641         }
2642         else if (xact_info == XLOG_XACT_ABORT)
2643         {
2644                 isCommit = false;
2645                 recordXid = XLogRecGetXid(record);
2646         }
2647         else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2648         {
2649                 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2650                 xl_xact_parsed_abort parsed;
2651
2652                 isCommit = false;
2653                 ParseAbortRecord(XLogRecGetInfo(record),
2654                                                  xlrec,
2655                                                  &parsed);
2656                 recordXid = parsed.twophase_xid;
2657         }
2658         else
2659                 return false;
2660
2661         if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2662         {
2663                 /*
2664                  * There can be only one transaction end record with this exact
2665                  * transactionid
2666                  *
2667                  * when testing for an xid, we MUST test for equality only, since
2668                  * transactions are numbered in the order they start, not the order
2669                  * they complete. A higher numbered xid will complete before you about
2670                  * 50% of the time...
2671                  */
2672                 stopsHere = (recordXid == recoveryTargetXid);
2673         }
2674
2675         /*
2676          * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2677          * We don't expect getRecordTimestamp ever to fail, since we already know
2678          * this is a commit or abort record; but test its result anyway.
2679          */
2680         if (getRecordTimestamp(record, &recordXtime) &&
2681                 recoveryTarget == RECOVERY_TARGET_TIME)
2682         {
2683                 /*
2684                  * There can be many transactions that share the same commit time, so
2685                  * we stop after the last one, if we are inclusive, or stop at the
2686                  * first one if we are exclusive
2687                  */
2688                 if (recoveryTargetInclusive)
2689                         stopsHere = (recordXtime > recoveryTargetTime);
2690                 else
2691                         stopsHere = (recordXtime >= recoveryTargetTime);
2692         }
2693
2694         if (stopsHere)
2695         {
2696                 recoveryStopAfter = false;
2697                 recoveryStopXid = recordXid;
2698                 recoveryStopTime = recordXtime;
2699                 recoveryStopLSN = InvalidXLogRecPtr;
2700                 recoveryStopName[0] = '\0';
2701
2702                 if (isCommit)
2703                 {
2704                         ereport(LOG,
2705                                         (errmsg("recovery stopping before commit of transaction %u, time %s",
2706                                                         recoveryStopXid,
2707                                                         timestamptz_to_str(recoveryStopTime))));
2708                 }
2709                 else
2710                 {
2711                         ereport(LOG,
2712                                         (errmsg("recovery stopping before abort of transaction %u, time %s",
2713                                                         recoveryStopXid,
2714                                                         timestamptz_to_str(recoveryStopTime))));
2715                 }
2716         }
2717
2718         return stopsHere;
2719 }
2720
2721 /*
2722  * Same as recoveryStopsBefore, but called after applying the record.
2723  *
2724  * We also track the timestamp of the latest applied COMMIT/ABORT
2725  * record in XLogRecoveryCtl->recoveryLastXTime.
2726  */
2727 static bool
2728 recoveryStopsAfter(XLogReaderState *record)
2729 {
2730         uint8           info;
2731         uint8           xact_info;
2732         uint8           rmid;
2733         TimestampTz recordXtime = 0;
2734
2735         /*
2736          * Ignore recovery target settings when not in archive recovery (meaning
2737          * we are in crash recovery).
2738          */
2739         if (!ArchiveRecoveryRequested)
2740                 return false;
2741
2742         info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2743         rmid = XLogRecGetRmid(record);
2744
2745         /*
2746          * There can be many restore points that share the same name; we stop at
2747          * the first one.
2748          */
2749         if (recoveryTarget == RECOVERY_TARGET_NAME &&
2750                 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2751         {
2752                 xl_restore_point *recordRestorePointData;
2753
2754                 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2755
2756                 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2757                 {
2758                         recoveryStopAfter = true;
2759                         recoveryStopXid = InvalidTransactionId;
2760                         recoveryStopLSN = InvalidXLogRecPtr;
2761                         (void) getRecordTimestamp(record, &recoveryStopTime);
2762                         strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2763
2764                         ereport(LOG,
2765                                         (errmsg("recovery stopping at restore point \"%s\", time %s",
2766                                                         recoveryStopName,
2767                                                         timestamptz_to_str(recoveryStopTime))));
2768                         return true;
2769                 }
2770         }
2771
2772         /* Check if the target LSN has been reached */
2773         if (recoveryTarget == RECOVERY_TARGET_LSN &&
2774                 recoveryTargetInclusive &&
2775                 record->ReadRecPtr >= recoveryTargetLSN)
2776         {
2777                 recoveryStopAfter = true;
2778                 recoveryStopXid = InvalidTransactionId;
2779                 recoveryStopLSN = record->ReadRecPtr;
2780                 recoveryStopTime = 0;
2781                 recoveryStopName[0] = '\0';
2782                 ereport(LOG,
2783                                 (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2784                                                 LSN_FORMAT_ARGS(recoveryStopLSN))));
2785                 return true;
2786         }
2787
2788         if (rmid != RM_XACT_ID)
2789                 return false;
2790
2791         xact_info = info & XLOG_XACT_OPMASK;
2792
2793         if (xact_info == XLOG_XACT_COMMIT ||
2794                 xact_info == XLOG_XACT_COMMIT_PREPARED ||
2795                 xact_info == XLOG_XACT_ABORT ||
2796                 xact_info == XLOG_XACT_ABORT_PREPARED)
2797         {
2798                 TransactionId recordXid;
2799
2800                 /* Update the last applied transaction timestamp */
2801                 if (getRecordTimestamp(record, &recordXtime))
2802                         SetLatestXTime(recordXtime);
2803
2804                 /* Extract the XID of the committed/aborted transaction */
2805                 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2806                 {
2807                         xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2808                         xl_xact_parsed_commit parsed;
2809
2810                         ParseCommitRecord(XLogRecGetInfo(record),
2811                                                           xlrec,
2812                                                           &parsed);
2813                         recordXid = parsed.twophase_xid;
2814                 }
2815                 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2816                 {
2817                         xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2818                         xl_xact_parsed_abort parsed;
2819
2820                         ParseAbortRecord(XLogRecGetInfo(record),
2821                                                          xlrec,
2822                                                          &parsed);
2823                         recordXid = parsed.twophase_xid;
2824                 }
2825                 else
2826                         recordXid = XLogRecGetXid(record);
2827
2828                 /*
2829                  * There can be only one transaction end record with this exact
2830                  * transactionid
2831                  *
2832                  * when testing for an xid, we MUST test for equality only, since
2833                  * transactions are numbered in the order they start, not the order
2834                  * they complete. A higher numbered xid will complete before you about
2835                  * 50% of the time...
2836                  */
2837                 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2838                         recordXid == recoveryTargetXid)
2839                 {
2840                         recoveryStopAfter = true;
2841                         recoveryStopXid = recordXid;
2842                         recoveryStopTime = recordXtime;
2843                         recoveryStopLSN = InvalidXLogRecPtr;
2844                         recoveryStopName[0] = '\0';
2845
2846                         if (xact_info == XLOG_XACT_COMMIT ||
2847                                 xact_info == XLOG_XACT_COMMIT_PREPARED)
2848                         {
2849                                 ereport(LOG,
2850                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
2851                                                                 recoveryStopXid,
2852                                                                 timestamptz_to_str(recoveryStopTime))));
2853                         }
2854                         else if (xact_info == XLOG_XACT_ABORT ||
2855                                          xact_info == XLOG_XACT_ABORT_PREPARED)
2856                         {
2857                                 ereport(LOG,
2858                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
2859                                                                 recoveryStopXid,
2860                                                                 timestamptz_to_str(recoveryStopTime))));
2861                         }
2862                         return true;
2863                 }
2864         }
2865
2866         /* Check if we should stop as soon as reaching consistency */
2867         if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2868         {
2869                 ereport(LOG,
2870                                 (errmsg("recovery stopping after reaching consistency")));
2871
2872                 recoveryStopAfter = true;
2873                 recoveryStopXid = InvalidTransactionId;
2874                 recoveryStopTime = 0;
2875                 recoveryStopLSN = InvalidXLogRecPtr;
2876                 recoveryStopName[0] = '\0';
2877                 return true;
2878         }
2879
2880         return false;
2881 }
2882
2883 /*
2884  * Create a comment for the history file to explain why and where
2885  * timeline changed.
2886  */
2887 static char *
2888 getRecoveryStopReason(void)
2889 {
2890         char            reason[200];
2891
2892         if (recoveryTarget == RECOVERY_TARGET_XID)
2893                 snprintf(reason, sizeof(reason),
2894                                  "%s transaction %u",
2895                                  recoveryStopAfter ? "after" : "before",
2896                                  recoveryStopXid);
2897         else if (recoveryTarget == RECOVERY_TARGET_TIME)
2898                 snprintf(reason, sizeof(reason),
2899                                  "%s %s\n",
2900                                  recoveryStopAfter ? "after" : "before",
2901                                  timestamptz_to_str(recoveryStopTime));
2902         else if (recoveryTarget == RECOVERY_TARGET_LSN)
2903                 snprintf(reason, sizeof(reason),
2904                                  "%s LSN %X/%X\n",
2905                                  recoveryStopAfter ? "after" : "before",
2906                                  LSN_FORMAT_ARGS(recoveryStopLSN));
2907         else if (recoveryTarget == RECOVERY_TARGET_NAME)
2908                 snprintf(reason, sizeof(reason),
2909                                  "at restore point \"%s\"",
2910                                  recoveryStopName);
2911         else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2912                 snprintf(reason, sizeof(reason), "reached consistency");
2913         else
2914                 snprintf(reason, sizeof(reason), "no recovery target specified");
2915
2916         return pstrdup(reason);
2917 }
2918
2919 /*
2920  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2921  *
2922  * endOfRecovery is true if the recovery target is reached and
2923  * the paused state starts at the end of recovery because of
2924  * recovery_target_action=pause, and false otherwise.
2925  */
2926 static void
2927 recoveryPausesHere(bool endOfRecovery)
2928 {
2929         /* Don't pause unless users can connect! */
2930         if (!LocalHotStandbyActive)
2931                 return;
2932
2933         /* Don't pause after standby promotion has been triggered */
2934         if (LocalPromoteIsTriggered)
2935                 return;
2936
2937         if (endOfRecovery)
2938                 ereport(LOG,
2939                                 (errmsg("pausing at the end of recovery"),
2940                                  errhint("Execute pg_wal_replay_resume() to promote.")));
2941         else
2942                 ereport(LOG,
2943                                 (errmsg("recovery has paused"),
2944                                  errhint("Execute pg_wal_replay_resume() to continue.")));
2945
2946         /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2947         while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2948         {
2949                 HandleStartupProcInterrupts();
2950                 if (CheckForStandbyTrigger())
2951                         return;
2952
2953                 /*
2954                  * If recovery pause is requested then set it paused.  While we are in
2955                  * the loop, user might resume and pause again so set this every time.
2956                  */
2957                 ConfirmRecoveryPaused();
2958
2959                 /*
2960                  * We wait on a condition variable that will wake us as soon as the
2961                  * pause ends, but we use a timeout so we can check the above exit
2962                  * condition periodically too.
2963                  */
2964                 ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2965                                                                         WAIT_EVENT_RECOVERY_PAUSE);
2966         }
2967         ConditionVariableCancelSleep();
2968 }
2969
2970 /*
2971  * When recovery_min_apply_delay is set, we wait long enough to make sure
2972  * certain record types are applied at least that interval behind the primary.
2973  *
2974  * Returns true if we waited.
2975  *
2976  * Note that the delay is calculated between the WAL record log time and
2977  * the current time on standby. We would prefer to keep track of when this
2978  * standby received each WAL record, which would allow a more consistent
2979  * approach and one not affected by time synchronisation issues, but that
2980  * is significantly more effort and complexity for little actual gain in
2981  * usability.
2982  */
2983 static bool
2984 recoveryApplyDelay(XLogReaderState *record)
2985 {
2986         uint8           xact_info;
2987         TimestampTz xtime;
2988         TimestampTz delayUntil;
2989         long            msecs;
2990
2991         /* nothing to do if no delay configured */
2992         if (recovery_min_apply_delay <= 0)
2993                 return false;
2994
2995         /* no delay is applied on a database not yet consistent */
2996         if (!reachedConsistency)
2997                 return false;
2998
2999         /* nothing to do if crash recovery is requested */
3000         if (!ArchiveRecoveryRequested)
3001                 return false;
3002
3003         /*
3004          * Is it a COMMIT record?
3005          *
3006          * We deliberately choose not to delay aborts since they have no effect on
3007          * MVCC. We already allow replay of records that don't have a timestamp,
3008          * so there is already opportunity for issues caused by early conflicts on
3009          * standbys.
3010          */
3011         if (XLogRecGetRmid(record) != RM_XACT_ID)
3012                 return false;
3013
3014         xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3015
3016         if (xact_info != XLOG_XACT_COMMIT &&
3017                 xact_info != XLOG_XACT_COMMIT_PREPARED)
3018                 return false;
3019
3020         if (!getRecordTimestamp(record, &xtime))
3021                 return false;
3022
3023         delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3024
3025         /*
3026          * Exit without arming the latch if it's already past time to apply this
3027          * record
3028          */
3029         msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3030         if (msecs <= 0)
3031                 return false;
3032
3033         while (true)
3034         {
3035                 ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3036
3037                 /* This might change recovery_min_apply_delay. */
3038                 HandleStartupProcInterrupts();
3039
3040                 if (CheckForStandbyTrigger())
3041                         break;
3042
3043                 /*
3044                  * Recalculate delayUntil as recovery_min_apply_delay could have
3045                  * changed while waiting in this loop.
3046                  */
3047                 delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3048
3049                 /*
3050                  * Wait for difference between GetCurrentTimestamp() and delayUntil.
3051                  */
3052                 msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3053                                                                                                 delayUntil);
3054
3055                 if (msecs <= 0)
3056                         break;
3057
3058                 elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3059
3060                 (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3061                                                  WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3062                                                  msecs,
3063                                                  WAIT_EVENT_RECOVERY_APPLY_DELAY);
3064         }
3065         return true;
3066 }
3067
3068 /*
3069  * Get the current state of the recovery pause request.
3070  */
3071 RecoveryPauseState
3072 GetRecoveryPauseState(void)
3073 {
3074         RecoveryPauseState state;
3075
3076         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3077         state = XLogRecoveryCtl->recoveryPauseState;
3078         SpinLockRelease(&XLogRecoveryCtl->info_lck);
3079
3080         return state;
3081 }
3082
3083 /*
3084  * Set the recovery pause state.
3085  *
3086  * If recovery pause is requested then sets the recovery pause state to
3087  * 'pause requested' if it is not already 'paused'.  Otherwise, sets it
3088  * to 'not paused' to resume the recovery.  The recovery pause will be
3089  * confirmed by the ConfirmRecoveryPaused.
3090  */
3091 void
3092 SetRecoveryPause(bool recoveryPause)
3093 {
3094         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3095
3096         if (!recoveryPause)
3097                 XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3098         else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3099                 XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3100
3101         SpinLockRelease(&XLogRecoveryCtl->info_lck);
3102
3103         if (!recoveryPause)
3104                 ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3105 }
3106
3107 /*
3108  * Confirm the recovery pause by setting the recovery pause state to
3109  * RECOVERY_PAUSED.
3110  */
3111 static void
3112 ConfirmRecoveryPaused(void)
3113 {
3114         /* If recovery pause is requested then set it paused */
3115         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3116         if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3117                 XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3118         SpinLockRelease(&XLogRecoveryCtl->info_lck);
3119 }
3120
3121
3122 /*
3123  * Attempt to read the next XLOG record.
3124  *
3125  * Before first call, the reader needs to be positioned to the first record
3126  * by calling XLogPrefetcherBeginRead().
3127  *
3128  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3129  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3130  * record is available.
3131  */
3132 static XLogRecord *
3133 ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3134                    bool fetching_ckpt, TimeLineID replayTLI)
3135 {
3136         XLogRecord *record;
3137         XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3138         XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3139
3140         /* Pass through parameters to XLogPageRead */
3141         private->fetching_ckpt = fetching_ckpt;
3142         private->emode = emode;
3143         private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3144         private->replayTLI = replayTLI;
3145
3146         /* This is the first attempt to read this page. */
3147         lastSourceFailed = false;
3148
3149         for (;;)
3150         {
3151                 char       *errormsg;
3152
3153                 record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3154                 if (record == NULL)
3155                 {
3156                         /*
3157                          * When we find that WAL ends in an incomplete record, keep track
3158                          * of that record.  After recovery is done, we'll write a record
3159                          * to indicate to downstream WAL readers that that portion is to
3160                          * be ignored.
3161                          *
3162                          * However, when ArchiveRecoveryRequested = true, we're going to
3163                          * switch to a new timeline at the end of recovery. We will only
3164                          * copy WAL over to the new timeline up to the end of the last
3165                          * complete record, so if we did this, we would later create an
3166                          * overwrite contrecord in the wrong place, breaking everything.
3167                          */
3168                         if (!ArchiveRecoveryRequested &&
3169                                 !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
3170                         {
3171                                 abortedRecPtr = xlogreader->abortedRecPtr;
3172                                 missingContrecPtr = xlogreader->missingContrecPtr;
3173                         }
3174
3175                         if (readFile >= 0)
3176                         {
3177                                 close(readFile);
3178                                 readFile = -1;
3179                         }
3180
3181                         /*
3182                          * We only end up here without a message when XLogPageRead()
3183                          * failed - in that case we already logged something. In
3184                          * StandbyMode that only happens if we have been triggered, so we
3185                          * shouldn't loop anymore in that case.
3186                          */
3187                         if (errormsg)
3188                                 ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3189                                                 (errmsg_internal("%s", errormsg) /* already translated */ ));
3190                 }
3191
3192                 /*
3193                  * Check page TLI is one of the expected values.
3194                  */
3195                 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3196                 {
3197                         char            fname[MAXFNAMELEN];
3198                         XLogSegNo       segno;
3199                         int32           offset;
3200
3201                         XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3202                         offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3203                                                                            wal_segment_size);
3204                         XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3205                                                  wal_segment_size);
3206                         ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3207                                         (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3208                                                         xlogreader->latestPageTLI,
3209                                                         fname,
3210                                                         LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3211                                                         offset)));
3212                         record = NULL;
3213                 }
3214
3215                 if (record)
3216                 {
3217                         /* Great, got a record */
3218                         return record;
3219                 }
3220                 else
3221                 {
3222                         /* No valid record available from this source */
3223                         lastSourceFailed = true;
3224
3225                         /*
3226                          * If archive recovery was requested, but we were still doing
3227                          * crash recovery, switch to archive recovery and retry using the
3228                          * offline archive. We have now replayed all the valid WAL in
3229                          * pg_wal, so we are presumably now consistent.
3230                          *
3231                          * We require that there's at least some valid WAL present in
3232                          * pg_wal, however (!fetching_ckpt).  We could recover using the
3233                          * WAL from the archive, even if pg_wal is completely empty, but
3234                          * we'd have no idea how far we'd have to replay to reach
3235                          * consistency.  So err on the safe side and give up.
3236                          */
3237                         if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3238                                 !fetching_ckpt)
3239                         {
3240                                 ereport(DEBUG1,
3241                                                 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3242                                 InArchiveRecovery = true;
3243                                 if (StandbyModeRequested)
3244                                         EnableStandbyMode();
3245
3246                                 SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3247                                 minRecoveryPoint = xlogreader->EndRecPtr;
3248                                 minRecoveryPointTLI = replayTLI;
3249
3250                                 CheckRecoveryConsistency();
3251
3252                                 /*
3253                                  * Before we retry, reset lastSourceFailed and currentSource
3254                                  * so that we will check the archive next.
3255                                  */
3256                                 lastSourceFailed = false;
3257                                 currentSource = XLOG_FROM_ANY;
3258
3259                                 continue;
3260                         }
3261
3262                         /* In standby mode, loop back to retry. Otherwise, give up. */
3263                         if (StandbyMode && !CheckForStandbyTrigger())
3264                                 continue;
3265                         else
3266                                 return NULL;
3267                 }
3268         }
3269 }
3270
3271 /*
3272  * Read the XLOG page containing targetPagePtr into readBuf (if not read
3273  * already).  Returns number of bytes read, if the page is read successfully,
3274  * or XLREAD_FAIL in case of errors.  When errors occur, they are ereport'ed,
3275  * but only if they have not been previously reported.
3276  *
3277  * See XLogReaderRoutine.page_read for more details.
3278  *
3279  * While prefetching, xlogreader->nonblocking may be set.  In that case,
3280  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3281  *
3282  * This is responsible for restoring files from archive as needed, as well
3283  * as for waiting for the requested WAL record to arrive in standby mode.
3284  *
3285  * xlogreader->private_data->emode specifies the log level used for reporting
3286  * "file not found" or "end of WAL" situations in archive recovery, or in
3287  * standby mode when promotion is triggered. If set to WARNING or below,
3288  * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3289  * levels the ereport() won't return.
3290  *
3291  * In standby mode, if after a successful return of XLogPageRead() the
3292  * caller finds the record it's interested in to be broken, it should
3293  * ereport the error with the level determined by
3294  * emode_for_corrupt_record(), and then set lastSourceFailed
3295  * and call XLogPageRead() again with the same arguments. This lets
3296  * XLogPageRead() try fetching the record from another source, or
3297  * sleep and retry.
3298  */
3299 static int
3300 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3301                          XLogRecPtr targetRecPtr, char *readBuf)
3302 {
3303         XLogPageReadPrivate *private =
3304                 (XLogPageReadPrivate *) xlogreader->private_data;
3305         int                     emode = private->emode;
3306         uint32          targetPageOff;
3307         XLogSegNo       targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3308         int                     r;
3309
3310         XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3311         targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3312
3313         /*
3314          * See if we need to switch to a new segment because the requested record
3315          * is not in the currently open one.
3316          */
3317         if (readFile >= 0 &&
3318                 !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3319         {
3320                 /*
3321                  * Request a restartpoint if we've replayed too much xlog since the
3322                  * last one.
3323                  */
3324                 if (ArchiveRecoveryRequested && IsUnderPostmaster)
3325                 {
3326                         if (XLogCheckpointNeeded(readSegNo))
3327                         {
3328                                 (void) GetRedoRecPtr();
3329                                 if (XLogCheckpointNeeded(readSegNo))
3330                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3331                         }
3332                 }
3333
3334                 close(readFile);
3335                 readFile = -1;
3336                 readSource = XLOG_FROM_ANY;
3337         }
3338
3339         XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3340
3341 retry:
3342         /* See if we need to retrieve more data */
3343         if (readFile < 0 ||
3344                 (readSource == XLOG_FROM_STREAM &&
3345                  flushedUpto < targetPagePtr + reqLen))
3346         {
3347                 if (readFile >= 0 &&
3348                         xlogreader->nonblocking &&
3349                         readSource == XLOG_FROM_STREAM &&
3350                         flushedUpto < targetPagePtr + reqLen)
3351                         return XLREAD_WOULDBLOCK;
3352
3353                 switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3354                                                                                         private->randAccess,
3355                                                                                         private->fetching_ckpt,
3356                                                                                         targetRecPtr,
3357                                                                                         private->replayTLI,
3358                                                                                         xlogreader->EndRecPtr,
3359                                                                                         xlogreader->nonblocking))
3360                 {
3361                         case XLREAD_WOULDBLOCK:
3362                                 return XLREAD_WOULDBLOCK;
3363                         case XLREAD_FAIL:
3364                                 if (readFile >= 0)
3365                                         close(readFile);
3366                                 readFile = -1;
3367                                 readLen = 0;
3368                                 readSource = XLOG_FROM_ANY;
3369                                 return XLREAD_FAIL;
3370                         case XLREAD_SUCCESS:
3371                                 break;
3372                 }
3373         }
3374
3375         /*
3376          * At this point, we have the right segment open and if we're streaming we
3377          * know the requested record is in it.
3378          */
3379         Assert(readFile != -1);
3380
3381         /*
3382          * If the current segment is being streamed from the primary, calculate
3383          * how much of the current page we have received already. We know the
3384          * requested record has been received, but this is for the benefit of
3385          * future calls, to allow quick exit at the top of this function.
3386          */
3387         if (readSource == XLOG_FROM_STREAM)
3388         {
3389                 if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3390                         readLen = XLOG_BLCKSZ;
3391                 else
3392                         readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3393                                 targetPageOff;
3394         }
3395         else
3396                 readLen = XLOG_BLCKSZ;
3397
3398         /* Read the requested page */
3399         readOff = targetPageOff;
3400
3401         pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3402         r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3403         if (r != XLOG_BLCKSZ)
3404         {
3405                 char            fname[MAXFNAMELEN];
3406                 int                     save_errno = errno;
3407
3408                 pgstat_report_wait_end();
3409                 XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3410                 if (r < 0)
3411                 {
3412                         errno = save_errno;
3413                         ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3414                                         (errcode_for_file_access(),
3415                                          errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3416                                                         fname, LSN_FORMAT_ARGS(targetPagePtr),
3417                                                         readOff)));
3418                 }
3419                 else
3420                         ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3421                                         (errcode(ERRCODE_DATA_CORRUPTED),
3422                                          errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3423                                                         fname, LSN_FORMAT_ARGS(targetPagePtr),
3424                                                         readOff, r, (Size) XLOG_BLCKSZ)));
3425                 goto next_record_is_invalid;
3426         }
3427         pgstat_report_wait_end();
3428
3429         Assert(targetSegNo == readSegNo);
3430         Assert(targetPageOff == readOff);
3431         Assert(reqLen <= readLen);
3432
3433         xlogreader->seg.ws_tli = curFileTLI;
3434
3435         /*
3436          * Check the page header immediately, so that we can retry immediately if
3437          * it's not valid. This may seem unnecessary, because ReadPageInternal()
3438          * validates the page header anyway, and would propagate the failure up to
3439          * ReadRecord(), which would retry. However, there's a corner case with
3440          * continuation records, if a record is split across two pages such that
3441          * we would need to read the two pages from different sources across two
3442          * WAL segments.
3443          *
3444          * The first page is only available locally, in pg_wal, because it's
3445          * already been recycled on the primary. The second page, however, is not
3446          * present in pg_wal, and we should stream it from the primary. There is a
3447          * recycled WAL segment present in pg_wal, with garbage contents, however.
3448          * We would read the first page from the local WAL segment, but when
3449          * reading the second page, we would read the bogus, recycled, WAL
3450          * segment. If we didn't catch that case here, we would never recover,
3451          * because ReadRecord() would retry reading the whole record from the
3452          * beginning.
3453          *
3454          * Of course, this only catches errors in the page header, which is what
3455          * happens in the case of a recycled WAL segment. Other kinds of errors or
3456          * corruption still has the same problem. But this at least fixes the
3457          * common case, which can happen as part of normal operation.
3458          *
3459          * Validating the page header is cheap enough that doing it twice
3460          * shouldn't be a big deal from a performance point of view.
3461          *
3462          * When not in standby mode, an invalid page header should cause recovery
3463          * to end, not retry reading the page, so we don't need to validate the
3464          * page header here for the retry. Instead, ReadPageInternal() is
3465          * responsible for the validation.
3466          */
3467         if (StandbyMode &&
3468                 (targetPagePtr % wal_segment_size) == 0 &&
3469                 !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3470         {
3471                 /*
3472                  * Emit this error right now then retry this page immediately. Use
3473                  * errmsg_internal() because the message was already translated.
3474                  */
3475                 if (xlogreader->errormsg_buf[0])
3476                         ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3477                                         (errmsg_internal("%s", xlogreader->errormsg_buf)));
3478
3479                 /* reset any error XLogReaderValidatePageHeader() might have set */
3480                 XLogReaderResetError(xlogreader);
3481                 goto next_record_is_invalid;
3482         }
3483
3484         return readLen;
3485
3486 next_record_is_invalid:
3487
3488         /*
3489          * If we're reading ahead, give up fast.  Retries and error reporting will
3490          * be handled by a later read when recovery catches up to this point.
3491          */
3492         if (xlogreader->nonblocking)
3493                 return XLREAD_WOULDBLOCK;
3494
3495         lastSourceFailed = true;
3496
3497         if (readFile >= 0)
3498                 close(readFile);
3499         readFile = -1;
3500         readLen = 0;
3501         readSource = XLOG_FROM_ANY;
3502
3503         /* In standby-mode, keep trying */
3504         if (StandbyMode)
3505                 goto retry;
3506         else
3507                 return XLREAD_FAIL;
3508 }
3509
3510 /*
3511  * Open the WAL segment containing WAL location 'RecPtr'.
3512  *
3513  * The segment can be fetched via restore_command, or via walreceiver having
3514  * streamed the record, or it can already be present in pg_wal. Checking
3515  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3516  * too, in case someone copies a new segment directly to pg_wal. That is not
3517  * documented or recommended, though.
3518  *
3519  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3520  * prepare to read WAL starting from RedoStartLSN after this.
3521  *
3522  * 'RecPtr' might not point to the beginning of the record we're interested
3523  * in, it might also point to the page or segment header. In that case,
3524  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3525  * used to decide which timeline to stream the requested WAL from.
3526  *
3527  * 'replayLSN' is the current replay LSN, so that if we scan for new
3528  * timelines, we can reject a switch to a timeline that branched off before
3529  * this point.
3530  *
3531  * If the record is not immediately available, the function returns
3532  * XLREAD_FAIL if we're not in standby mode. In standby mode, it waits for
3533  * the record to become available.
3534  *
3535  * When the requested record becomes available, the function opens the file
3536  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3537  * of standby mode is triggered by the user, and there is no more WAL
3538  * available, returns XLREAD_FAIL.
3539  *
3540  * If nonblocking is true, then give up immediately if we can't satisfy the
3541  * request, returning XLREAD_WOULDBLOCK instead of waiting.
3542  */
3543 static XLogPageReadResult
3544 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3545                                                         bool fetching_ckpt, XLogRecPtr tliRecPtr,
3546                                                         TimeLineID replayTLI, XLogRecPtr replayLSN,
3547                                                         bool nonblocking)
3548 {
3549         static TimestampTz last_fail_time = 0;
3550         TimestampTz now;
3551         bool            streaming_reply_sent = false;
3552
3553         /*-------
3554          * Standby mode is implemented by a state machine:
3555          *
3556          * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3557          *        pg_wal (XLOG_FROM_PG_WAL)
3558          * 2. Check for promotion trigger request
3559          * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3560          * 4. Rescan timelines
3561          * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3562          *
3563          * Failure to read from the current source advances the state machine to
3564          * the next state.
3565          *
3566          * 'currentSource' indicates the current state. There are no currentSource
3567          * values for "check trigger", "rescan timelines", and "sleep" states,
3568          * those actions are taken when reading from the previous source fails, as
3569          * part of advancing to the next state.
3570          *
3571          * If standby mode is turned off while reading WAL from stream, we move
3572          * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3573          * the files (which would be required at end of recovery, e.g., timeline
3574          * history file) from archive or pg_wal. We don't need to kill WAL receiver
3575          * here because it's already stopped when standby mode is turned off at
3576          * the end of recovery.
3577          *-------
3578          */
3579         if (!InArchiveRecovery)
3580                 currentSource = XLOG_FROM_PG_WAL;
3581         else if (currentSource == XLOG_FROM_ANY ||
3582                          (!StandbyMode && currentSource == XLOG_FROM_STREAM))
3583         {
3584                 lastSourceFailed = false;
3585                 currentSource = XLOG_FROM_ARCHIVE;
3586         }
3587
3588         for (;;)
3589         {
3590                 XLogSource      oldSource = currentSource;
3591                 bool            startWalReceiver = false;
3592
3593                 /*
3594                  * First check if we failed to read from the current source, and
3595                  * advance the state machine if so. The failure to read might've
3596                  * happened outside this function, e.g. when a CRC check fails on a
3597                  * record, or within this loop.
3598                  */
3599                 if (lastSourceFailed)
3600                 {
3601                         /*
3602                          * Don't allow any retry loops to occur during nonblocking
3603                          * readahead.  Let the caller process everything that has been
3604                          * decoded already first.
3605                          */
3606                         if (nonblocking)
3607                                 return XLREAD_WOULDBLOCK;
3608
3609                         switch (currentSource)
3610                         {
3611                                 case XLOG_FROM_ARCHIVE:
3612                                 case XLOG_FROM_PG_WAL:
3613
3614                                         /*
3615                                          * Check to see if promotion is requested. Note that we do
3616                                          * this only after failure, so when you promote, we still
3617                                          * finish replaying as much as we can from archive and
3618                                          * pg_wal before failover.
3619                                          */
3620                                         if (StandbyMode && CheckForStandbyTrigger())
3621                                         {
3622                                                 XLogShutdownWalRcv();
3623                                                 return XLREAD_FAIL;
3624                                         }
3625
3626                                         /*
3627                                          * Not in standby mode, and we've now tried the archive
3628                                          * and pg_wal.
3629                                          */
3630                                         if (!StandbyMode)
3631                                                 return XLREAD_FAIL;
3632
3633                                         /*
3634                                          * Move to XLOG_FROM_STREAM state, and set to start a
3635                                          * walreceiver if necessary.
3636                                          */
3637                                         currentSource = XLOG_FROM_STREAM;
3638                                         startWalReceiver = true;
3639                                         break;
3640
3641                                 case XLOG_FROM_STREAM:
3642
3643                                         /*
3644                                          * Failure while streaming. Most likely, we got here
3645                                          * because streaming replication was terminated, or
3646                                          * promotion was triggered. But we also get here if we
3647                                          * find an invalid record in the WAL streamed from the
3648                                          * primary, in which case something is seriously wrong.
3649                                          * There's little chance that the problem will just go
3650                                          * away, but PANIC is not good for availability either,
3651                                          * especially in hot standby mode. So, we treat that the
3652                                          * same as disconnection, and retry from archive/pg_wal
3653                                          * again. The WAL in the archive should be identical to
3654                                          * what was streamed, so it's unlikely that it helps, but
3655                                          * one can hope...
3656                                          */
3657
3658                                         /*
3659                                          * We should be able to move to XLOG_FROM_STREAM only in
3660                                          * standby mode.
3661                                          */
3662                                         Assert(StandbyMode);
3663
3664                                         /*
3665                                          * Before we leave XLOG_FROM_STREAM state, make sure that
3666                                          * walreceiver is not active, so that it won't overwrite
3667                                          * WAL that we restore from archive.
3668                                          */
3669                                         XLogShutdownWalRcv();
3670
3671                                         /*
3672                                          * Before we sleep, re-scan for possible new timelines if
3673                                          * we were requested to recover to the latest timeline.
3674                                          */
3675                                         if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
3676                                         {
3677                                                 if (rescanLatestTimeLine(replayTLI, replayLSN))
3678                                                 {
3679                                                         currentSource = XLOG_FROM_ARCHIVE;
3680                                                         break;
3681                                                 }
3682                                         }
3683
3684                                         /*
3685                                          * XLOG_FROM_STREAM is the last state in our state
3686                                          * machine, so we've exhausted all the options for
3687                                          * obtaining the requested WAL. We're going to loop back
3688                                          * and retry from the archive, but if it hasn't been long
3689                                          * since last attempt, sleep wal_retrieve_retry_interval
3690                                          * milliseconds to avoid busy-waiting.
3691                                          */
3692                                         now = GetCurrentTimestamp();
3693                                         if (!TimestampDifferenceExceeds(last_fail_time, now,
3694                                                                                                         wal_retrieve_retry_interval))
3695                                         {
3696                                                 long            wait_time;
3697
3698                                                 wait_time = wal_retrieve_retry_interval -
3699                                                         TimestampDifferenceMilliseconds(last_fail_time, now);
3700
3701                                                 elog(LOG, "waiting for WAL to become available at %X/%X",
3702                                                          LSN_FORMAT_ARGS(RecPtr));
3703
3704                                                 /* Do background tasks that might benefit us later. */
3705                                                 KnownAssignedTransactionIdsIdleMaintenance();
3706
3707                                                 (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3708                                                                                  WL_LATCH_SET | WL_TIMEOUT |
3709                                                                                  WL_EXIT_ON_PM_DEATH,
3710                                                                                  wait_time,
3711                                                                                  WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3712                                                 ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3713                                                 now = GetCurrentTimestamp();
3714
3715                                                 /* Handle interrupt signals of startup process */
3716                                                 HandleStartupProcInterrupts();
3717                                         }
3718                                         last_fail_time = now;
3719                                         currentSource = XLOG_FROM_ARCHIVE;
3720                                         break;
3721
3722                                 default:
3723                                         elog(ERROR, "unexpected WAL source %d", currentSource);
3724                         }
3725                 }
3726                 else if (currentSource == XLOG_FROM_PG_WAL)
3727                 {
3728                         /*
3729                          * We just successfully read a file in pg_wal. We prefer files in
3730                          * the archive over ones in pg_wal, so try the next file again
3731                          * from the archive first.
3732                          */
3733                         if (InArchiveRecovery)
3734                                 currentSource = XLOG_FROM_ARCHIVE;
3735                 }
3736
3737                 if (currentSource != oldSource)
3738                         elog(DEBUG2, "switched WAL source from %s to %s after %s",
3739                                  xlogSourceNames[oldSource], xlogSourceNames[currentSource],
3740                                  lastSourceFailed ? "failure" : "success");
3741
3742                 /*
3743                  * We've now handled possible failure. Try to read from the chosen
3744                  * source.
3745                  */
3746                 lastSourceFailed = false;
3747
3748                 switch (currentSource)
3749                 {
3750                         case XLOG_FROM_ARCHIVE:
3751                         case XLOG_FROM_PG_WAL:
3752
3753                                 /*
3754                                  * WAL receiver must not be running when reading WAL from
3755                                  * archive or pg_wal.
3756                                  */
3757                                 Assert(!WalRcvStreaming());
3758
3759                                 /* Close any old file we might have open. */
3760                                 if (readFile >= 0)
3761                                 {
3762                                         close(readFile);
3763                                         readFile = -1;
3764                                 }
3765                                 /* Reset curFileTLI if random fetch. */
3766                                 if (randAccess)
3767                                         curFileTLI = 0;
3768
3769                                 /*
3770                                  * Try to restore the file from archive, or read an existing
3771                                  * file from pg_wal.
3772                                  */
3773                                 readFile = XLogFileReadAnyTLI(readSegNo,
3774                                                                                           currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
3775                                                                                           currentSource);
3776                                 if (readFile >= 0)
3777                                         return XLREAD_SUCCESS;  /* success! */
3778
3779                                 /*
3780                                  * Nope, not found in archive or pg_wal.
3781                                  */
3782                                 lastSourceFailed = true;
3783                                 break;
3784
3785                         case XLOG_FROM_STREAM:
3786                                 {
3787                                         bool            havedata;
3788
3789                                         /*
3790                                          * We should be able to move to XLOG_FROM_STREAM only in
3791                                          * standby mode.
3792                                          */
3793                                         Assert(StandbyMode);
3794
3795                                         /*
3796                                          * First, shutdown walreceiver if its restart has been
3797                                          * requested -- but no point if we're already slated for
3798                                          * starting it.
3799                                          */
3800                                         if (pendingWalRcvRestart && !startWalReceiver)
3801                                         {
3802                                                 XLogShutdownWalRcv();
3803
3804                                                 /*
3805                                                  * Re-scan for possible new timelines if we were
3806                                                  * requested to recover to the latest timeline.
3807                                                  */
3808                                                 if (recoveryTargetTimeLineGoal ==
3809                                                         RECOVERY_TARGET_TIMELINE_LATEST)
3810                                                         rescanLatestTimeLine(replayTLI, replayLSN);
3811
3812                                                 startWalReceiver = true;
3813                                         }
3814                                         pendingWalRcvRestart = false;
3815
3816                                         /*
3817                                          * Launch walreceiver if needed.
3818                                          *
3819                                          * If fetching_ckpt is true, RecPtr points to the initial
3820                                          * checkpoint location. In that case, we use RedoStartLSN
3821                                          * as the streaming start position instead of RecPtr, so
3822                                          * that when we later jump backwards to start redo at
3823                                          * RedoStartLSN, we will have the logs streamed already.
3824                                          */
3825                                         if (startWalReceiver &&
3826                                                 PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3827                                         {
3828                                                 XLogRecPtr      ptr;
3829                                                 TimeLineID      tli;
3830
3831                                                 if (fetching_ckpt)
3832                                                 {
3833                                                         ptr = RedoStartLSN;
3834                                                         tli = RedoStartTLI;
3835                                                 }
3836                                                 else
3837                                                 {
3838                                                         ptr = RecPtr;
3839
3840                                                         /*
3841                                                          * Use the record begin position to determine the
3842                                                          * TLI, rather than the position we're reading.
3843                                                          */
3844                                                         tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3845
3846                                                         if (curFileTLI > 0 && tli < curFileTLI)
3847                                                                 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3848                                                                          LSN_FORMAT_ARGS(tliRecPtr),
3849                                                                          tli, curFileTLI);
3850                                                 }
3851                                                 curFileTLI = tli;
3852                                                 SetInstallXLogFileSegmentActive();
3853                                                 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
3854                                                                                          PrimarySlotName,
3855                                                                                          wal_receiver_create_temp_slot);
3856                                                 flushedUpto = 0;
3857                                         }
3858
3859                                         /*
3860                                          * Check if WAL receiver is active or wait to start up.
3861                                          */
3862                                         if (!WalRcvStreaming())
3863                                         {
3864                                                 lastSourceFailed = true;
3865                                                 break;
3866                                         }
3867
3868                                         /*
3869                                          * Walreceiver is active, so see if new data has arrived.
3870                                          *
3871                                          * We only advance XLogReceiptTime when we obtain fresh
3872                                          * WAL from walreceiver and observe that we had already
3873                                          * processed everything before the most recent "chunk"
3874                                          * that it flushed to disk.  In steady state where we are
3875                                          * keeping up with the incoming data, XLogReceiptTime will
3876                                          * be updated on each cycle. When we are behind,
3877                                          * XLogReceiptTime will not advance, so the grace time
3878                                          * allotted to conflicting queries will decrease.
3879                                          */
3880                                         if (RecPtr < flushedUpto)
3881                                                 havedata = true;
3882                                         else
3883                                         {
3884                                                 XLogRecPtr      latestChunkStart;
3885
3886                                                 flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3887                                                 if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3888                                                 {
3889                                                         havedata = true;
3890                                                         if (latestChunkStart <= RecPtr)
3891                                                         {
3892                                                                 XLogReceiptTime = GetCurrentTimestamp();
3893                                                                 SetCurrentChunkStartTime(XLogReceiptTime);
3894                                                         }
3895                                                 }
3896                                                 else
3897                                                         havedata = false;
3898                                         }
3899                                         if (havedata)
3900                                         {
3901                                                 /*
3902                                                  * Great, streamed far enough.  Open the file if it's
3903                                                  * not open already.  Also read the timeline history
3904                                                  * file if we haven't initialized timeline history
3905                                                  * yet; it should be streamed over and present in
3906                                                  * pg_wal by now.  Use XLOG_FROM_STREAM so that source
3907                                                  * info is set correctly and XLogReceiptTime isn't
3908                                                  * changed.
3909                                                  *
3910                                                  * NB: We must set readTimeLineHistory based on
3911                                                  * recoveryTargetTLI, not receiveTLI. Normally they'll
3912                                                  * be the same, but if recovery_target_timeline is
3913                                                  * 'latest' and archiving is configured, then it's
3914                                                  * possible that we managed to retrieve one or more
3915                                                  * new timeline history files from the archive,
3916                                                  * updating recoveryTargetTLI.
3917                                                  */
3918                                                 if (readFile < 0)
3919                                                 {
3920                                                         if (!expectedTLEs)
3921                                                                 expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
3922                                                         readFile = XLogFileRead(readSegNo, receiveTLI,
3923                                                                                                         XLOG_FROM_STREAM, false);
3924                                                         Assert(readFile >= 0);
3925                                                 }
3926                                                 else
3927                                                 {
3928                                                         /* just make sure source info is correct... */
3929                                                         readSource = XLOG_FROM_STREAM;
3930                                                         XLogReceiptSource = XLOG_FROM_STREAM;
3931                                                         return XLREAD_SUCCESS;
3932                                                 }
3933                                                 break;
3934                                         }
3935
3936                                         /* In nonblocking mode, return rather than sleeping. */
3937                                         if (nonblocking)
3938                                                 return XLREAD_WOULDBLOCK;
3939
3940                                         /*
3941                                          * Data not here yet. Check for trigger, then wait for
3942                                          * walreceiver to wake us up when new WAL arrives.
3943                                          */
3944                                         if (CheckForStandbyTrigger())
3945                                         {
3946                                                 /*
3947                                                  * Note that we don't return XLREAD_FAIL immediately
3948                                                  * here. After being triggered, we still want to
3949                                                  * replay all the WAL that was already streamed. It's
3950                                                  * in pg_wal now, so we just treat this as a failure,
3951                                                  * and the state machine will move on to replay the
3952                                                  * streamed WAL from pg_wal, and then recheck the
3953                                                  * trigger and exit replay.
3954                                                  */
3955                                                 lastSourceFailed = true;
3956                                                 break;
3957                                         }
3958
3959                                         /*
3960                                          * Since we have replayed everything we have received so
3961                                          * far and are about to start waiting for more WAL, let's
3962                                          * tell the upstream server our replay location now so
3963                                          * that pg_stat_replication doesn't show stale
3964                                          * information.
3965                                          */
3966                                         if (!streaming_reply_sent)
3967                                         {
3968                                                 WalRcvForceReply();
3969                                                 streaming_reply_sent = true;
3970                                         }
3971
3972                                         /* Do any background tasks that might benefit us later. */
3973                                         KnownAssignedTransactionIdsIdleMaintenance();
3974
3975                                         /* Update pg_stat_recovery_prefetch before sleeping. */
3976                                         XLogPrefetcherComputeStats(xlogprefetcher);
3977
3978                                         /*
3979                                          * Wait for more WAL to arrive, when we will be woken
3980                                          * immediately by the WAL receiver.
3981                                          */
3982                                         (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3983                                                                          WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
3984                                                                          -1L,
3985                                                                          WAIT_EVENT_RECOVERY_WAL_STREAM);
3986                                         ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3987                                         break;
3988                                 }
3989
3990                         default:
3991                                 elog(ERROR, "unexpected WAL source %d", currentSource);
3992                 }
3993
3994                 /*
3995                  * Check for recovery pause here so that we can confirm more quickly
3996                  * that a requested pause has actually taken effect.
3997                  */
3998                 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
3999                         RECOVERY_NOT_PAUSED)
4000                         recoveryPausesHere(false);
4001
4002                 /*
4003                  * This possibly-long loop needs to handle interrupts of startup
4004                  * process.
4005                  */
4006                 HandleStartupProcInterrupts();
4007         }
4008
4009         return XLREAD_FAIL;                     /* not reached */
4010 }
4011
4012
4013 /*
4014  * Determine what log level should be used to report a corrupt WAL record
4015  * in the current WAL page, previously read by XLogPageRead().
4016  *
4017  * 'emode' is the error mode that would be used to report a file-not-found
4018  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
4019  * we're retrying the exact same record that we've tried previously, only
4020  * complain the first time to keep the noise down.  However, we only do when
4021  * reading from pg_wal, because we don't expect any invalid records in archive
4022  * or in records streamed from the primary. Files in the archive should be complete,
4023  * and we should never hit the end of WAL because we stop and wait for more WAL
4024  * to arrive before replaying it.
4025  *
4026  * NOTE: This function remembers the RecPtr value it was last called with,
4027  * to suppress repeated messages about the same record. Only call this when
4028  * you are about to ereport(), or you might cause a later message to be
4029  * erroneously suppressed.
4030  */
4031 static int
4032 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4033 {
4034         static XLogRecPtr lastComplaint = 0;
4035
4036         if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4037         {
4038                 if (RecPtr == lastComplaint)
4039                         emode = DEBUG1;
4040                 else
4041                         lastComplaint = RecPtr;
4042         }
4043         return emode;
4044 }
4045
4046
4047 /*
4048  * Subroutine to try to fetch and validate a prior checkpoint record.
4049  */
4050 static XLogRecord *
4051 ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4052                                          TimeLineID replayTLI)
4053 {
4054         XLogRecord *record;
4055         uint8           info;
4056
4057         Assert(xlogreader != NULL);
4058
4059         if (!XRecOffIsValid(RecPtr))
4060         {
4061                 ereport(LOG,
4062                                 (errmsg("invalid checkpoint location")));
4063                 return NULL;
4064         }
4065
4066         XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4067         record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4068
4069         if (record == NULL)
4070         {
4071                 ereport(LOG,
4072                                 (errmsg("invalid checkpoint record")));
4073                 return NULL;
4074         }
4075         if (record->xl_rmid != RM_XLOG_ID)
4076         {
4077                 ereport(LOG,
4078                                 (errmsg("invalid resource manager ID in checkpoint record")));
4079                 return NULL;
4080         }
4081         info = record->xl_info & ~XLR_INFO_MASK;
4082         if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4083                 info != XLOG_CHECKPOINT_ONLINE)
4084         {
4085                 ereport(LOG,
4086                                 (errmsg("invalid xl_info in checkpoint record")));
4087                 return NULL;
4088         }
4089         if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4090         {
4091                 ereport(LOG,
4092                                 (errmsg("invalid length of checkpoint record")));
4093                 return NULL;
4094         }
4095         return record;
4096 }
4097
4098 /*
4099  * Scan for new timelines that might have appeared in the archive since we
4100  * started recovery.
4101  *
4102  * If there are any, the function changes recovery target TLI to the latest
4103  * one and returns 'true'.
4104  */
4105 static bool
4106 rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4107 {
4108         List       *newExpectedTLEs;
4109         bool            found;
4110         ListCell   *cell;
4111         TimeLineID      newtarget;
4112         TimeLineID      oldtarget = recoveryTargetTLI;
4113         TimeLineHistoryEntry *currentTle = NULL;
4114
4115         newtarget = findNewestTimeLine(recoveryTargetTLI);
4116         if (newtarget == recoveryTargetTLI)
4117         {
4118                 /* No new timelines found */
4119                 return false;
4120         }
4121
4122         /*
4123          * Determine the list of expected TLIs for the new TLI
4124          */
4125
4126         newExpectedTLEs = readTimeLineHistory(newtarget);
4127
4128         /*
4129          * If the current timeline is not part of the history of the new timeline,
4130          * we cannot proceed to it.
4131          */
4132         found = false;
4133         foreach(cell, newExpectedTLEs)
4134         {
4135                 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4136
4137                 if (currentTle->tli == recoveryTargetTLI)
4138                 {
4139                         found = true;
4140                         break;
4141                 }
4142         }
4143         if (!found)
4144         {
4145                 ereport(LOG,
4146                                 (errmsg("new timeline %u is not a child of database system timeline %u",
4147                                                 newtarget,
4148                                                 replayTLI)));
4149                 return false;
4150         }
4151
4152         /*
4153          * The current timeline was found in the history file, but check that the
4154          * next timeline was forked off from it *after* the current recovery
4155          * location.
4156          */
4157         if (currentTle->end < replayLSN)
4158         {
4159                 ereport(LOG,
4160                                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4161                                                 newtarget,
4162                                                 replayTLI,
4163                                                 LSN_FORMAT_ARGS(replayLSN))));
4164                 return false;
4165         }
4166
4167         /* The new timeline history seems valid. Switch target */
4168         recoveryTargetTLI = newtarget;
4169         list_free_deep(expectedTLEs);
4170         expectedTLEs = newExpectedTLEs;
4171
4172         /*
4173          * As in StartupXLOG(), try to ensure we have all the history files
4174          * between the old target and new target in pg_wal.
4175          */
4176         restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4177
4178         ereport(LOG,
4179                         (errmsg("new target timeline is %u",
4180                                         recoveryTargetTLI)));
4181
4182         return true;
4183 }
4184
4185
4186 /*
4187  * Open a logfile segment for reading (during recovery).
4188  *
4189  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4190  * Otherwise, it's assumed to be already available in pg_wal.
4191  */
4192 static int
4193 XLogFileRead(XLogSegNo segno, TimeLineID tli,
4194                          XLogSource source, bool notfoundOk)
4195 {
4196         char            xlogfname[MAXFNAMELEN];
4197         char            activitymsg[MAXFNAMELEN + 16];
4198         char            path[MAXPGPATH];
4199         int                     fd;
4200
4201         XLogFileName(xlogfname, tli, segno, wal_segment_size);
4202
4203         switch (source)
4204         {
4205                 case XLOG_FROM_ARCHIVE:
4206                         /* Report recovery progress in PS display */
4207                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4208                                          xlogfname);
4209                         set_ps_display(activitymsg);
4210
4211                         if (!RestoreArchivedFile(path, xlogfname,
4212                                                                          "RECOVERYXLOG",
4213                                                                          wal_segment_size,
4214                                                                          InRedo))
4215                                 return -1;
4216                         break;
4217
4218                 case XLOG_FROM_PG_WAL:
4219                 case XLOG_FROM_STREAM:
4220                         XLogFilePath(path, tli, segno, wal_segment_size);
4221                         break;
4222
4223                 default:
4224                         elog(ERROR, "invalid XLogFileRead source %d", source);
4225         }
4226
4227         /*
4228          * If the segment was fetched from archival storage, replace the existing
4229          * xlog segment (if any) with the archival version.
4230          */
4231         if (source == XLOG_FROM_ARCHIVE)
4232         {
4233                 Assert(!IsInstallXLogFileSegmentActive());
4234                 KeepFileRestoredFromArchive(path, xlogfname);
4235
4236                 /*
4237                  * Set path to point at the new file in pg_wal.
4238                  */
4239                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4240         }
4241
4242         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4243         if (fd >= 0)
4244         {
4245                 /* Success! */
4246                 curFileTLI = tli;
4247
4248                 /* Report recovery progress in PS display */
4249                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4250                                  xlogfname);
4251                 set_ps_display(activitymsg);
4252
4253                 /* Track source of data in assorted state variables */
4254                 readSource = source;
4255                 XLogReceiptSource = source;
4256                 /* In FROM_STREAM case, caller tracks receipt time, not me */
4257                 if (source != XLOG_FROM_STREAM)
4258                         XLogReceiptTime = GetCurrentTimestamp();
4259
4260                 return fd;
4261         }
4262         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4263                 ereport(PANIC,
4264                                 (errcode_for_file_access(),
4265                                  errmsg("could not open file \"%s\": %m", path)));
4266         return -1;
4267 }
4268
4269 /*
4270  * Open a logfile segment for reading (during recovery).
4271  *
4272  * This version searches for the segment with any TLI listed in expectedTLEs.
4273  */
4274 static int
4275 XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4276 {
4277         char            path[MAXPGPATH];
4278         ListCell   *cell;
4279         int                     fd;
4280         List       *tles;
4281
4282         /*
4283          * Loop looking for a suitable timeline ID: we might need to read any of
4284          * the timelines listed in expectedTLEs.
4285          *
4286          * We expect curFileTLI on entry to be the TLI of the preceding file in
4287          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
4288          * to go backwards; this prevents us from picking up the wrong file when a
4289          * parent timeline extends to higher segment numbers than the child we
4290          * want to read.
4291          *
4292          * If we haven't read the timeline history file yet, read it now, so that
4293          * we know which TLIs to scan.  We don't save the list in expectedTLEs,
4294          * however, unless we actually find a valid segment.  That way if there is
4295          * neither a timeline history file nor a WAL segment in the archive, and
4296          * streaming replication is set up, we'll read the timeline history file
4297          * streamed from the primary when we start streaming, instead of
4298          * recovering with a dummy history generated here.
4299          */
4300         if (expectedTLEs)
4301                 tles = expectedTLEs;
4302         else
4303                 tles = readTimeLineHistory(recoveryTargetTLI);
4304
4305         foreach(cell, tles)
4306         {
4307                 TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4308                 TimeLineID      tli = hent->tli;
4309
4310                 if (tli < curFileTLI)
4311                         break;                          /* don't bother looking at too-old TLIs */
4312
4313                 /*
4314                  * Skip scanning the timeline ID that the logfile segment to read
4315                  * doesn't belong to
4316                  */
4317                 if (hent->begin != InvalidXLogRecPtr)
4318                 {
4319                         XLogSegNo       beginseg = 0;
4320
4321                         XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4322
4323                         /*
4324                          * The logfile segment that doesn't belong to the timeline is
4325                          * older or newer than the segment that the timeline started or
4326                          * ended at, respectively. It's sufficient to check only the
4327                          * starting segment of the timeline here. Since the timelines are
4328                          * scanned in descending order in this loop, any segments newer
4329                          * than the ending segment should belong to newer timeline and
4330                          * have already been read before. So it's not necessary to check
4331                          * the ending segment of the timeline here.
4332                          */
4333                         if (segno < beginseg)
4334                                 continue;
4335                 }
4336
4337                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4338                 {
4339                         fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4340                         if (fd != -1)
4341                         {
4342                                 elog(DEBUG1, "got WAL segment from archive");
4343                                 if (!expectedTLEs)
4344                                         expectedTLEs = tles;
4345                                 return fd;
4346                         }
4347                 }
4348
4349                 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4350                 {
4351                         fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4352                         if (fd != -1)
4353                         {
4354                                 if (!expectedTLEs)
4355                                         expectedTLEs = tles;
4356                                 return fd;
4357                         }
4358                 }
4359         }
4360
4361         /* Couldn't find it.  For simplicity, complain about front timeline */
4362         XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4363         errno = ENOENT;
4364         ereport(DEBUG2,
4365                         (errcode_for_file_access(),
4366                          errmsg("could not open file \"%s\": %m", path)));
4367         return -1;
4368 }
4369
4370 /*
4371  * Set flag to signal the walreceiver to restart.  (The startup process calls
4372  * this on noticing a relevant configuration change.)
4373  */
4374 void
4375 StartupRequestWalReceiverRestart(void)
4376 {
4377         if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4378         {
4379                 ereport(LOG,
4380                                 (errmsg("WAL receiver process shutdown requested")));
4381
4382                 pendingWalRcvRestart = true;
4383         }
4384 }
4385
4386
4387 /*
4388  * Has a standby promotion already been triggered?
4389  *
4390  * Unlike CheckForStandbyTrigger(), this works in any process
4391  * that's connected to shared memory.
4392  */
4393 bool
4394 PromoteIsTriggered(void)
4395 {
4396         /*
4397          * We check shared state each time only until a standby promotion is
4398          * triggered. We can't trigger a promotion again, so there's no need to
4399          * keep checking after the shared variable has once been seen true.
4400          */
4401         if (LocalPromoteIsTriggered)
4402                 return true;
4403
4404         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4405         LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4406         SpinLockRelease(&XLogRecoveryCtl->info_lck);
4407
4408         return LocalPromoteIsTriggered;
4409 }
4410
4411 static void
4412 SetPromoteIsTriggered(void)
4413 {
4414         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4415         XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4416         SpinLockRelease(&XLogRecoveryCtl->info_lck);
4417
4418         /*
4419          * Mark the recovery pause state as 'not paused' because the paused state
4420          * ends and promotion continues if a promotion is triggered while recovery
4421          * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4422          * return 'paused' while a promotion is ongoing.
4423          */
4424         SetRecoveryPause(false);
4425
4426         LocalPromoteIsTriggered = true;
4427 }
4428
4429 /*
4430  * Check whether a promote request has arrived.
4431  */
4432 static bool
4433 CheckForStandbyTrigger(void)
4434 {
4435         if (LocalPromoteIsTriggered)
4436                 return true;
4437
4438         if (IsPromoteSignaled() && CheckPromoteSignal())
4439         {
4440                 ereport(LOG, (errmsg("received promote request")));
4441                 RemovePromoteSignalFiles();
4442                 ResetPromoteSignaled();
4443                 SetPromoteIsTriggered();
4444                 return true;
4445         }
4446
4447         return false;
4448 }
4449
4450 /*
4451  * Remove the files signaling a standby promotion request.
4452  */
4453 void
4454 RemovePromoteSignalFiles(void)
4455 {
4456         unlink(PROMOTE_SIGNAL_FILE);
4457 }
4458
4459 /*
4460  * Check to see if a promote request has arrived.
4461  */
4462 bool
4463 CheckPromoteSignal(void)
4464 {
4465         struct stat stat_buf;
4466
4467         if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4468                 return true;
4469
4470         return false;
4471 }
4472
4473 /*
4474  * Wake up startup process to replay newly arrived WAL, or to notice that
4475  * failover has been requested.
4476  */
4477 void
4478 WakeupRecovery(void)
4479 {
4480         SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4481 }
4482
4483 /*
4484  * Schedule a walreceiver wakeup in the main recovery loop.
4485  */
4486 void
4487 XLogRequestWalReceiverReply(void)
4488 {
4489         doRequestWalReceiverReply = true;
4490 }
4491
4492 /*
4493  * Is HotStandby active yet? This is only important in special backends
4494  * since normal backends won't ever be able to connect until this returns
4495  * true. Postmaster knows this by way of signal, not via shared memory.
4496  *
4497  * Unlike testing standbyState, this works in any process that's connected to
4498  * shared memory.  (And note that standbyState alone doesn't tell the truth
4499  * anyway.)
4500  */
4501 bool
4502 HotStandbyActive(void)
4503 {
4504         /*
4505          * We check shared state each time only until Hot Standby is active. We
4506          * can't de-activate Hot Standby, so there's no need to keep checking
4507          * after the shared variable has once been seen true.
4508          */
4509         if (LocalHotStandbyActive)
4510                 return true;
4511         else
4512         {
4513                 /* spinlock is essential on machines with weak memory ordering! */
4514                 SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4515                 LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4516                 SpinLockRelease(&XLogRecoveryCtl->info_lck);
4517
4518                 return LocalHotStandbyActive;
4519         }
4520 }
4521
4522 /*
4523  * Like HotStandbyActive(), but to be used only in WAL replay code,
4524  * where we don't need to ask any other process what the state is.
4525  */
4526 static bool
4527 HotStandbyActiveInReplay(void)
4528 {
4529         Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4530         return LocalHotStandbyActive;
4531 }
4532
4533 /*
4534  * Get latest redo apply position.
4535  *
4536  * Exported to allow WALReceiver to read the pointer directly.
4537  */
4538 XLogRecPtr
4539 GetXLogReplayRecPtr(TimeLineID *replayTLI)
4540 {
4541         XLogRecPtr      recptr;
4542         TimeLineID      tli;
4543
4544         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4545         recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4546         tli = XLogRecoveryCtl->lastReplayedTLI;
4547         SpinLockRelease(&XLogRecoveryCtl->info_lck);
4548
4549         if (replayTLI)
4550                 *replayTLI = tli;
4551         return recptr;
4552 }
4553
4554
4555 /*
4556  * Get position of last applied, or the record being applied.
4557  *
4558  * This is different from GetXLogReplayRecPtr() in that if a WAL
4559  * record is currently being applied, this includes that record.
4560  */
4561 XLogRecPtr
4562 GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4563 {
4564         XLogRecPtr      recptr;
4565         TimeLineID      tli;
4566
4567         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4568         recptr = XLogRecoveryCtl->replayEndRecPtr;
4569         tli = XLogRecoveryCtl->replayEndTLI;
4570         SpinLockRelease(&XLogRecoveryCtl->info_lck);
4571
4572         if (replayEndTLI)
4573                 *replayEndTLI = tli;
4574         return recptr;
4575 }
4576
4577 /*
4578  * Save timestamp of latest processed commit/abort record.
4579  *
4580  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4581  * seen by processes other than the startup process.  Note in particular
4582  * that CreateRestartPoint is executed in the checkpointer.
4583  */
4584 static void
4585 SetLatestXTime(TimestampTz xtime)
4586 {
4587         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4588         XLogRecoveryCtl->recoveryLastXTime = xtime;
4589         SpinLockRelease(&XLogRecoveryCtl->info_lck);
4590 }
4591
4592 /*
4593  * Fetch timestamp of latest processed commit/abort record.
4594  */
4595 TimestampTz
4596 GetLatestXTime(void)
4597 {
4598         TimestampTz xtime;
4599
4600         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4601         xtime = XLogRecoveryCtl->recoveryLastXTime;
4602         SpinLockRelease(&XLogRecoveryCtl->info_lck);
4603
4604         return xtime;
4605 }
4606
4607 /*
4608  * Save timestamp of the next chunk of WAL records to apply.
4609  *
4610  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4611  * seen by all backends.
4612  */
4613 static void
4614 SetCurrentChunkStartTime(TimestampTz xtime)
4615 {
4616         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4617         XLogRecoveryCtl->currentChunkStartTime = xtime;
4618         SpinLockRelease(&XLogRecoveryCtl->info_lck);
4619 }
4620
/*
 * Fetch timestamp of the start of the current chunk of WAL records being
 * replayed.  Startup process maintains an accurate local copy in
 * XLogReceiptTime.
 */
4625 TimestampTz
4626 GetCurrentChunkReplayStartTime(void)
4627 {
4628         TimestampTz xtime;
4629
4630         SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4631         xtime = XLogRecoveryCtl->currentChunkStartTime;
4632         SpinLockRelease(&XLogRecoveryCtl->info_lck);
4633
4634         return xtime;
4635 }
4636
4637 /*
4638  * Returns time of receipt of current chunk of XLOG data, as well as
4639  * whether it was received from streaming replication or from archives.
4640  */
4641 void
4642 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4643 {
4644         /*
4645          * This must be executed in the startup process, since we don't export the
4646          * relevant state to shared memory.
4647          */
4648         Assert(InRecovery);
4649
4650         *rtime = XLogReceiptTime;
4651         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4652 }
4653
4654 /*
4655  * Note that text field supplied is a parameter name and does not require
4656  * translation
4657  */
4658 void
4659 RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4660 {
4661         if (currValue < minValue)
4662         {
4663                 if (HotStandbyActiveInReplay())
4664                 {
4665                         bool            warned_for_promote = false;
4666
4667                         ereport(WARNING,
4668                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4669                                          errmsg("hot standby is not possible because of insufficient parameter settings"),
4670                                          errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4671                                                            param_name,
4672                                                            currValue,
4673                                                            minValue)));
4674
4675                         SetRecoveryPause(true);
4676
4677                         ereport(LOG,
4678                                         (errmsg("recovery has paused"),
4679                                          errdetail("If recovery is unpaused, the server will shut down."),
4680                                          errhint("You can then restart the server after making the necessary configuration changes.")));
4681
4682                         while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4683                         {
4684                                 HandleStartupProcInterrupts();
4685
4686                                 if (CheckForStandbyTrigger())
4687                                 {
4688                                         if (!warned_for_promote)
4689                                                 ereport(WARNING,
4690                                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4691                                                                  errmsg("promotion is not possible because of insufficient parameter settings"),
4692
4693                                                 /*
4694                                                  * Repeat the detail from above so it's easy to find
4695                                                  * in the log.
4696                                                  */
4697                                                                  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4698                                                                                    param_name,
4699                                                                                    currValue,
4700                                                                                    minValue),
4701                                                                  errhint("Restart the server after making the necessary configuration changes.")));
4702                                         warned_for_promote = true;
4703                                 }
4704
4705                                 /*
4706                                  * If recovery pause is requested then set it paused.  While
4707                                  * we are in the loop, user might resume and pause again so
4708                                  * set this every time.
4709                                  */
4710                                 ConfirmRecoveryPaused();
4711
4712                                 /*
4713                                  * We wait on a condition variable that will wake us as soon
4714                                  * as the pause ends, but we use a timeout so we can check the
4715                                  * above conditions periodically too.
4716                                  */
4717                                 ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4718                                                                                         WAIT_EVENT_RECOVERY_PAUSE);
4719                         }
4720                         ConditionVariableCancelSleep();
4721                 }
4722
4723                 ereport(FATAL,
4724                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4725                                  errmsg("recovery aborted because of insufficient parameter settings"),
4726                 /* Repeat the detail from above so it's easy to find in the log. */
4727                                  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4728                                                    param_name,
4729                                                    currValue,
4730                                                    minValue),
4731                                  errhint("You can restart the server after making the necessary configuration changes.")));
4732         }
4733 }
4734
4735
4736 /*
4737  * GUC check_hook for primary_slot_name
4738  */
4739 bool
4740 check_primary_slot_name(char **newval, void **extra, GucSource source)
4741 {
4742         if (*newval && strcmp(*newval, "") != 0 &&
4743                 !ReplicationSlotValidateName(*newval, WARNING))
4744                 return false;
4745
4746         return true;
4747 }
4748
4749 /*
4750  * Recovery target settings: Only one of the several recovery_target* settings
4751  * may be set.  Setting a second one results in an error.  The global variable
4752  * recoveryTarget tracks which kind of recovery target was chosen.  Other
4753  * variables store the actual target value (for example a string or a xid).
4754  * The assign functions of the parameters check whether a competing parameter
4755  * was already set.  But we want to allow setting the same parameter multiple
4756  * times.  We also want to allow unsetting a parameter and setting a different
4757  * one, so we unset recoveryTarget when the parameter is set to an empty
4758  * string.
4759  *
4760  * XXX this code is broken by design.  Throwing an error from a GUC assign
4761  * hook breaks fundamental assumptions of guc.c.  So long as all the variables
4762  * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4763  * since we'd just abort postmaster startup anyway.  Nonetheless it's likely
4764  * that we have odd behaviors such as unexpected GUC ordering dependencies.
4765  */
4766
4767 static void
4768 pg_attribute_noreturn()
4769 error_multiple_recovery_targets(void)
4770 {
4771         ereport(ERROR,
4772                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4773                          errmsg("multiple recovery targets specified"),
4774                          errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4775 }
4776
4777 /*
4778  * GUC check_hook for recovery_target
4779  */
4780 bool
4781 check_recovery_target(char **newval, void **extra, GucSource source)
4782 {
4783         if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4784         {
4785                 GUC_check_errdetail("The only allowed value is \"immediate\".");
4786                 return false;
4787         }
4788         return true;
4789 }
4790
4791 /*
4792  * GUC assign_hook for recovery_target
4793  */
4794 void
4795 assign_recovery_target(const char *newval, void *extra)
4796 {
4797         if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4798                 recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4799                 error_multiple_recovery_targets();
4800
4801         if (newval && strcmp(newval, "") != 0)
4802                 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4803         else
4804                 recoveryTarget = RECOVERY_TARGET_UNSET;
4805 }
4806
4807 /*
4808  * GUC check_hook for recovery_target_lsn
4809  */
4810 bool
4811 check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4812 {
4813         if (strcmp(*newval, "") != 0)
4814         {
4815                 XLogRecPtr      lsn;
4816                 XLogRecPtr *myextra;
4817                 bool            have_error = false;
4818
4819                 lsn = pg_lsn_in_internal(*newval, &have_error);
4820                 if (have_error)
4821                         return false;
4822
4823                 myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4824                 *myextra = lsn;
4825                 *extra = myextra;
4826         }
4827         return true;
4828 }
4829
4830 /*
4831  * GUC assign_hook for recovery_target_lsn
4832  */
4833 void
4834 assign_recovery_target_lsn(const char *newval, void *extra)
4835 {
4836         if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4837                 recoveryTarget != RECOVERY_TARGET_LSN)
4838                 error_multiple_recovery_targets();
4839
4840         if (newval && strcmp(newval, "") != 0)
4841         {
4842                 recoveryTarget = RECOVERY_TARGET_LSN;
4843                 recoveryTargetLSN = *((XLogRecPtr *) extra);
4844         }
4845         else
4846                 recoveryTarget = RECOVERY_TARGET_UNSET;
4847 }
4848
4849 /*
4850  * GUC check_hook for recovery_target_name
4851  */
4852 bool
4853 check_recovery_target_name(char **newval, void **extra, GucSource source)
4854 {
4855         /* Use the value of newval directly */
4856         if (strlen(*newval) >= MAXFNAMELEN)
4857         {
4858                 GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4859                                                         "recovery_target_name", MAXFNAMELEN - 1);
4860                 return false;
4861         }
4862         return true;
4863 }
4864
4865 /*
4866  * GUC assign_hook for recovery_target_name
4867  */
4868 void
4869 assign_recovery_target_name(const char *newval, void *extra)
4870 {
4871         if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4872                 recoveryTarget != RECOVERY_TARGET_NAME)
4873                 error_multiple_recovery_targets();
4874
4875         if (newval && strcmp(newval, "") != 0)
4876         {
4877                 recoveryTarget = RECOVERY_TARGET_NAME;
4878                 recoveryTargetName = newval;
4879         }
4880         else
4881                 recoveryTarget = RECOVERY_TARGET_UNSET;
4882 }
4883
4884 /*
4885  * GUC check_hook for recovery_target_time
4886  *
4887  * The interpretation of the recovery_target_time string can depend on the
4888  * time zone setting, so we need to wait until after all GUC processing is
4889  * done before we can do the final parsing of the string.  This check function
4890  * only does a parsing pass to catch syntax errors, but we store the string
4891  * and parse it again when we need to use it.
4892  */
4893 bool
4894 check_recovery_target_time(char **newval, void **extra, GucSource source)
4895 {
4896         if (strcmp(*newval, "") != 0)
4897         {
4898                 /* reject some special values */
4899                 if (strcmp(*newval, "now") == 0 ||
4900                         strcmp(*newval, "today") == 0 ||
4901                         strcmp(*newval, "tomorrow") == 0 ||
4902                         strcmp(*newval, "yesterday") == 0)
4903                 {
4904                         return false;
4905                 }
4906
4907                 /*
4908                  * parse timestamp value (see also timestamptz_in())
4909                  */
4910                 {
4911                         char       *str = *newval;
4912                         fsec_t          fsec;
4913                         struct pg_tm tt,
4914                                            *tm = &tt;
4915                         int                     tz;
4916                         int                     dtype;
4917                         int                     nf;
4918                         int                     dterr;
4919                         char       *field[MAXDATEFIELDS];
4920                         int                     ftype[MAXDATEFIELDS];
4921                         char            workbuf[MAXDATELEN + MAXDATEFIELDS];
4922                         DateTimeErrorExtra dtextra;
4923                         TimestampTz timestamp;
4924
4925                         dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4926                                                                   field, ftype, MAXDATEFIELDS, &nf);
4927                         if (dterr == 0)
4928                                 dterr = DecodeDateTime(field, ftype, nf,
4929                                                                            &dtype, tm, &fsec, &tz, &dtextra);
4930                         if (dterr != 0)
4931                                 return false;
4932                         if (dtype != DTK_DATE)
4933                                 return false;
4934
4935                         if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4936                         {
4937                                 GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4938                                 return false;
4939                         }
4940                 }
4941         }
4942         return true;
4943 }
4944
4945 /*
4946  * GUC assign_hook for recovery_target_time
4947  */
4948 void
4949 assign_recovery_target_time(const char *newval, void *extra)
4950 {
4951         if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4952                 recoveryTarget != RECOVERY_TARGET_TIME)
4953                 error_multiple_recovery_targets();
4954
4955         if (newval && strcmp(newval, "") != 0)
4956                 recoveryTarget = RECOVERY_TARGET_TIME;
4957         else
4958                 recoveryTarget = RECOVERY_TARGET_UNSET;
4959 }
4960
4961 /*
4962  * GUC check_hook for recovery_target_timeline
4963  */
4964 bool
4965 check_recovery_target_timeline(char **newval, void **extra, GucSource source)
4966 {
4967         RecoveryTargetTimeLineGoal rttg;
4968         RecoveryTargetTimeLineGoal *myextra;
4969
4970         if (strcmp(*newval, "current") == 0)
4971                 rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
4972         else if (strcmp(*newval, "latest") == 0)
4973                 rttg = RECOVERY_TARGET_TIMELINE_LATEST;
4974         else
4975         {
4976                 rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
4977
4978                 errno = 0;
4979                 strtoul(*newval, NULL, 0);
4980                 if (errno == EINVAL || errno == ERANGE)
4981                 {
4982                         GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
4983                         return false;
4984                 }
4985         }
4986
4987         myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(ERROR, sizeof(RecoveryTargetTimeLineGoal));
4988         *myextra = rttg;
4989         *extra = myextra;
4990
4991         return true;
4992 }
4993
4994 /*
4995  * GUC assign_hook for recovery_target_timeline
4996  */
4997 void
4998 assign_recovery_target_timeline(const char *newval, void *extra)
4999 {
5000         recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5001         if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5002                 recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5003         else
5004                 recoveryTargetTLIRequested = 0;
5005 }
5006
5007 /*
5008  * GUC check_hook for recovery_target_xid
5009  */
5010 bool
5011 check_recovery_target_xid(char **newval, void **extra, GucSource source)
5012 {
5013         if (strcmp(*newval, "") != 0)
5014         {
5015                 TransactionId xid;
5016                 TransactionId *myextra;
5017
5018                 errno = 0;
5019                 xid = (TransactionId) strtou64(*newval, NULL, 0);
5020                 if (errno == EINVAL || errno == ERANGE)
5021                         return false;
5022
5023                 myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
5024                 *myextra = xid;
5025                 *extra = myextra;
5026         }
5027         return true;
5028 }
5029
5030 /*
5031  * GUC assign_hook for recovery_target_xid
5032  */
5033 void
5034 assign_recovery_target_xid(const char *newval, void *extra)
5035 {
5036         if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5037                 recoveryTarget != RECOVERY_TARGET_XID)
5038                 error_multiple_recovery_targets();
5039
5040         if (newval && strcmp(newval, "") != 0)
5041         {
5042                 recoveryTarget = RECOVERY_TARGET_XID;
5043                 recoveryTargetXid = *((TransactionId *) extra);
5044         }
5045         else
5046                 recoveryTarget = RECOVERY_TARGET_UNSET;
5047 }