src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL write-ahead log manager
   5  *
   6  * The Write-Ahead Log (WAL) functionality is split into several source
   7  * files, in addition to this one:
   8  *
   9  * xloginsert.c - Functions for constructing WAL records
  10  * xlogrecovery.c - WAL recovery and standby code
  11  * xlogreader.c - Facility for reading WAL files and parsing WAL records
  12  * xlogutils.c - Helper functions for WAL redo routines
  13  *
  14  * This file contains functions for coordinating database startup and
  15  * checkpointing, and managing the write-ahead log buffers when the
  16  * system is running.
  17  *
  18  * StartupXLOG() is the main entry point of the startup process.  It
  19  * coordinates database startup, performing WAL recovery, and the
  20  * transition from WAL recovery into normal operations.
  21  *
  22  * XLogInsertRecord() inserts a WAL record into the WAL buffers.  Most
  23  * callers should not call this directly, but use the functions in
  24  * xloginsert.c to construct the WAL record.  XLogFlush() can be used
  25  * to force the WAL to disk.
  26  *
  27  * In addition to those, there are many other functions for interrogating
  28  * the current system state, and for starting/stopping backups.
  29  *
  30  *
  31  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
  32  * Portions Copyright (c) 1994, Regents of the University of California
  33  *
  34  * src/backend/access/transam/xlog.c
  35  *
  36  *-------------------------------------------------------------------------
  37  */
  38
  39 #include "postgres.h"
  40
  41 #include <ctype.h>
  42 #include <math.h>
  43 #include <time.h>
  44 #include <fcntl.h>
  45 #include <sys/stat.h>
  46 #include <sys/time.h>
  47 #include <unistd.h>
  48
  49 #include "access/clog.h"
  50 #include "access/commit_ts.h"
  51 #include "access/heaptoast.h"
  52 #include "access/multixact.h"
  53 #include "access/rewriteheap.h"
  54 #include "access/subtrans.h"
  55 #include "access/timeline.h"
  56 #include "access/transam.h"
  57 #include "access/twophase.h"
  58 #include "access/xact.h"
  59 #include "access/xlog_internal.h"
  60 #include "access/xlogarchive.h"
  61 #include "access/xloginsert.h"
  62 #include "access/xlogreader.h"
  63 #include "access/xlogrecovery.h"
  64 #include "access/xlogutils.h"
  65 #include "backup/basebackup.h"
  66 #include "catalog/catversion.h"
  67 #include "catalog/pg_control.h"
  68 #include "catalog/pg_database.h"
  69 #include "common/controldata_utils.h"
  70 #include "common/file_utils.h"
  71 #include "executor/instrument.h"
  72 #include "miscadmin.h"
  73 #include "pg_trace.h"
  74 #include "pgstat.h"
  75 #include "port/atomics.h"
  76 #include "postmaster/bgwriter.h"
  77 #include "postmaster/startup.h"
  78 #include "postmaster/walsummarizer.h"
  79 #include "postmaster/walwriter.h"
  80 #include "replication/origin.h"
  81 #include "replication/slot.h"
  82 #include "replication/snapbuild.h"
  83 #include "replication/walreceiver.h"
  84 #include "replication/walsender.h"
  85 #include "storage/bufmgr.h"
  86 #include "storage/fd.h"
  87 #include "storage/ipc.h"
  88 #include "storage/large_object.h"
  89 #include "storage/latch.h"
  90 #include "storage/predicate.h"
  91 #include "storage/proc.h"
  92 #include "storage/procarray.h"
  93 #include "storage/reinit.h"
  94 #include "storage/spin.h"
  95 #include "storage/sync.h"
  96 #include "utils/guc_hooks.h"
  97 #include "utils/guc_tables.h"
  98 #include "utils/injection_point.h"
  99 #include "utils/ps_status.h"
 100 #include "utils/relmapper.h"
 101 #include "utils/snapmgr.h"
 102 #include "utils/timeout.h"
 103 #include "utils/timestamp.h"
 104 #include "utils/varlena.h"
 105
 106 #ifdef WAL_DEBUG
 107 #include "utils/memutils.h"
 108 #endif
 109
 110 /* timeline ID to be used when bootstrapping */
 111 #define BootstrapTimeLineID             1
 112
 113 /* User-settable parameters */
 114 int                     max_wal_size_mb = 1024; /* 1 GB */
 115 int                     min_wal_size_mb = 80;   /* 80 MB */
 116 int                     wal_keep_size_mb = 0;
 117 int                     XLOGbuffers = -1;
 118 int                     XLogArchiveTimeout = 0;
 119 int                     XLogArchiveMode = ARCHIVE_MODE_OFF;
 120 char       *XLogArchiveCommand = NULL;
 121 bool            EnableHotStandby = false;
 122 bool            fullPageWrites = true;
 123 bool            wal_log_hints = false;
 124 int                     wal_compression = WAL_COMPRESSION_NONE;
 125 char       *wal_consistency_checking_string = NULL;
 126 bool       *wal_consistency_checking = NULL;
 127 bool            wal_init_zero = true;
 128 bool            wal_recycle = true;
 129 bool            log_checkpoints = true;
 130 int                     wal_sync_method = DEFAULT_WAL_SYNC_METHOD;
 131 int                     wal_level = WAL_LEVEL_REPLICA;
 132 int                     CommitDelay = 0;        /* precommit delay in microseconds */
 133 int                     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 134 int                     wal_retrieve_retry_interval = 5000;
 135 int                     max_slot_wal_keep_size_mb = -1;
 136 int                     wal_decode_buffer_size = 512 * 1024;
 137 bool            track_wal_io_timing = false;
 138
 139 #ifdef WAL_DEBUG
 140 bool            XLOG_DEBUG = false;
 141 #endif
 142
 143 int                     wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
 144
 145 /*
 146  * Number of WAL insertion locks to use. A higher value allows more insertions
 147  * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 148  * which needs to iterate all the locks.
 149  */
 150 #define NUM_XLOGINSERT_LOCKS  8
 151
 152 /*
 153  * Max distance from last checkpoint, before triggering a new xlog-based
 154  * checkpoint.
 155  */
 156 int                     CheckPointSegments;
 157
 158 /* Estimated distance between checkpoints, in bytes */
 159 static double CheckPointDistanceEstimate = 0;
 160 static double PrevCheckPointDistance = 0;
 161
 162 /*
 163  * Track whether there were any deferred checks for custom resource managers
 164  * specified in wal_consistency_checking.
 165  */
 166 static bool check_wal_consistency_checking_deferred = false;
 167
  168 /*
  169  * GUC support: names accepted for wal_sync_method (some only if the guarding macro is defined)
  170  */
  171 const struct config_enum_entry wal_sync_method_options[] = {
  172         {"fsync", WAL_SYNC_METHOD_FSYNC, false},
  173 #ifdef HAVE_FSYNC_WRITETHROUGH
  174         {"fsync_writethrough", WAL_SYNC_METHOD_FSYNC_WRITETHROUGH, false},
  175 #endif
  176         {"fdatasync", WAL_SYNC_METHOD_FDATASYNC, false},
  177 #ifdef O_SYNC
  178         {"open_sync", WAL_SYNC_METHOD_OPEN, false},
  179 #endif
  180 #ifdef O_DSYNC
  181         {"open_datasync", WAL_SYNC_METHOD_OPEN_DSYNC, false},
  182 #endif
  183         {NULL, 0, false}                /* NULL name: presumably the end-of-list marker -- confirm */
  184 };
 185
 186
  187 /*
  188  * Although only "on", "off", and "always" are documented,
  189  * we accept all the likely variants of "on" and "off".
  190  */
  191 const struct config_enum_entry archive_mode_options[] = {
  192         {"always", ARCHIVE_MODE_ALWAYS, false},
  193         {"on", ARCHIVE_MODE_ON, false},
  194         {"off", ARCHIVE_MODE_OFF, false},
  195         {"true", ARCHIVE_MODE_ON, true},        /* undocumented variants start here */
  196         {"false", ARCHIVE_MODE_OFF, true},
  197         {"yes", ARCHIVE_MODE_ON, true},
  198         {"no", ARCHIVE_MODE_OFF, true},
  199         {"1", ARCHIVE_MODE_ON, true},
  200         {"0", ARCHIVE_MODE_OFF, true},
  201         {NULL, 0, false}                /* NULL name: presumably the end-of-list marker -- confirm */
  202 };
 203
 204 /*
 205  * Statistics for current checkpoint are collected in this global struct.
 206  * Because only the checkpointer or a stand-alone backend can perform
 207  * checkpoints, this will be unused in normal backends.
 208  */
 209 CheckpointStatsData CheckpointStats;
 210
 211 /*
 212  * During recovery, lastFullPageWrites keeps track of full_page_writes that
 213  * the replayed WAL records indicate. It's initialized with full_page_writes
 214  * that the recovery starting checkpoint record indicates, and then updated
 215  * each time XLOG_FPW_CHANGE record is replayed.
 216  */
 217 static bool lastFullPageWrites;
 218
 219 /*
  220  * Local copy of the state tracked by SharedRecoveryState in shared memory.
 221  * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
 222  * means "not known, need to check the shared state".
 223  */
 224 static bool LocalRecoveryInProgress = true;
 225
 226 /*
 227  * Local state for XLogInsertAllowed():
 228  *              1: unconditionally allowed to insert XLOG
 229  *              0: unconditionally not allowed to insert XLOG
 230  *              -1: must check RecoveryInProgress(); disallow until it is false
 231  * Most processes start with -1 and transition to 1 after seeing that recovery
 232  * is not in progress.  But we can also force the value for special cases.
 233  * The coding in XLogInsertAllowed() depends on the first two of these states
 234  * being numerically the same as bool true and false.
 235  */
 236 static int      LocalXLogInsertAllowed = -1;
 237
 238 /*
 239  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 240  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 241  * end+1 of the last record, and is reset when we end a top-level transaction,
 242  * or start a new one; so it can be used to tell if the current transaction has
 243  * created any XLOG records.
 244  *
 245  * While in parallel mode, this may not be fully up to date.  When committing,
 246  * a transaction can assume this covers all xlog records written either by the
 247  * user backend or by any parallel worker which was present at any point during
 248  * the transaction.  But when aborting, or when still in parallel mode, other
 249  * parallel backends may have written WAL records at later LSNs than the value
 250  * stored here.  The parallel leader advances its own copy, when necessary,
 251  * in WaitForParallelWorkersToFinish.
 252  */
 253 XLogRecPtr      ProcLastRecPtr = InvalidXLogRecPtr;
 254 XLogRecPtr      XactLastRecEnd = InvalidXLogRecPtr;
 255 XLogRecPtr      XactLastCommitEnd = InvalidXLogRecPtr;
 256
 257 /*
 258  * RedoRecPtr is this backend's local copy of the REDO record pointer
 259  * (which is almost but not quite the same as a pointer to the most recent
 260  * CHECKPOINT record).  We update this from the shared-memory copy,
 261  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 262  * hold an insertion lock).  See XLogInsertRecord for details.  We are also
 263  * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 264  * see GetRedoRecPtr.
 265  *
 266  * NB: Code that uses this variable must be prepared not only for the
 267  * possibility that it may be arbitrarily out of date, but also for the
 268  * possibility that it might be set to InvalidXLogRecPtr. We used to
 269  * initialize it as a side effect of the first call to RecoveryInProgress(),
 270  * which meant that most code that might use it could assume that it had a
 271  * real if perhaps stale value. That's no longer the case.
 272  */
 273 static XLogRecPtr RedoRecPtr;
 274
 275 /*
 276  * doPageWrites is this backend's local copy of (fullPageWrites ||
 277  * runningBackups > 0).  It is used together with RedoRecPtr to decide whether
  278  * a full-page image of a page needs to be taken.
 279  *
 280  * NB: Initially this is false, and there's no guarantee that it will be
 281  * initialized to any other value before it is first used. Any code that
 282  * makes use of it must recheck the value after obtaining a WALInsertLock,
 283  * and respond appropriately if it turns out that the previous value wasn't
 284  * accurate.
 285  */
 286 static bool doPageWrites;
 287
 288 /*----------
 289  * Shared-memory data structures for XLOG control
 290  *
 291  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 292  * the log up to (all records before that point must be written or fsynced).
 293  * The positions already written/fsynced are maintained in logWriteResult
 294  * and logFlushResult using atomic access.
 295  * In addition to the shared variable, each backend has a private copy of
 296  * both in LogwrtResult, which is updated when convenient.
 297  *
 298  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 299  * (protected by info_lck), but we don't need to cache any copies of it.
 300  *
 301  * info_lck is only held long enough to read/update the protected variables,
 302  * so it's a plain spinlock.  The other locks are held longer (potentially
 303  * over I/O operations), so we use LWLocks for them.  These locks are:
 304  *
 305  * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 306  * It is only held while initializing and changing the mapping.  If the
 307  * contents of the buffer being replaced haven't been written yet, the mapping
 308  * lock is released while the write is done, and reacquired afterwards.
 309  *
 310  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 311  * XLogFlush).
 312  *
 313  * ControlFileLock: must be held to read/update control file or create
 314  * new log file.
 315  *
 316  *----------
 317  */
 318
  319 typedef struct XLogwrtRqst              /* positions the log must be written/flushed up to */
  320 {
  321         XLogRecPtr      Write;                  /* last byte + 1 to write out */
  322         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
  323 } XLogwrtRqst;
 324
  325 typedef struct XLogwrtResult            /* positions already written out/flushed */
  326 {
  327         XLogRecPtr      Write;                  /* last byte + 1 written out */
  328         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
  329 } XLogwrtResult;
 330
  331 /*
  332  * Inserting to WAL is protected by a small fixed number of WAL insertion
  333  * locks. To insert to the WAL, you must hold one of the locks - it doesn't
  334  * matter which one. To lock out other concurrent insertions, you must hold
  335  * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
  336  * indicator of how far the insertion has progressed (insertingAt).
  337  *
  338  * The insertingAt values are read when a process wants to flush WAL from
  339  * the in-memory buffers to disk, to check that all the insertions to the
  340  * region the process is about to write out have finished. You could simply
  341  * wait for all currently in-progress insertions to finish, but the
  342  * insertingAt indicator allows you to ignore insertions to later in the WAL,
  343  * so that you only wait for the insertions that are modifying the buffers
  344  * you're about to write out.
  345  *
  346  * This isn't just an optimization. If all the WAL buffers are dirty, an
  347  * inserter that's holding a WAL insert lock might need to evict an old WAL
  348  * buffer, which requires flushing the WAL. If it's possible for an inserter
  349  * to block on another inserter unnecessarily, deadlock can arise when two
  350  * inserters holding a WAL insert lock wait for each other to finish their
  351  * insertion.
  352  *
  353  * Small WAL records that don't cross a page boundary never update the value,
  354  * the WAL record is just copied to the page and the lock is released. But
  355  * to avoid the deadlock-scenario explained above, the indicator is always
  356  * updated before sleeping while holding an insertion lock.
  357  *
  358  * lastImportantAt contains the LSN of the last important WAL record inserted
  359  * using a given lock. This value is used to detect if there has been
  360  * important WAL activity since the last time some action, like a checkpoint,
  361  * was performed - so that the action can be skipped if there was none. The LSN is
  362  * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
  363  * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
  364  * records.  Tracking the WAL activity directly in WALInsertLock has the
  365  * advantage of not needing any additional locks to update the value.
  366  */
  367 typedef struct
  368 {
  369         LWLock          lock;                   /* the lightweight lock itself */
  370         pg_atomic_uint64 insertingAt;   /* how far the insertion has progressed */
  371         XLogRecPtr      lastImportantAt;        /* LSN of last important record inserted with this lock */
  372 } WALInsertLock;
 373
  374 /*
  375  * All the WAL insertion locks are allocated as an array in shared memory. We
  376  * force the array stride to be a power of 2, which saves a few cycles in
  377  * indexing, but more importantly also ensures that individual slots don't
  378  * cross cache line boundaries. (Of course, we have to also ensure that the
  379  * array start address is suitably aligned.)
  380  */
  381 typedef union WALInsertLockPadded
  382 {
  383         WALInsertLock l;                        /* the actual lock state */
  384         char            pad[PG_CACHE_LINE_SIZE];        /* makes the union at least PG_CACHE_LINE_SIZE bytes */
  385 } WALInsertLockPadded;
 386
 387 /*
 388  * Session status of running backup, used for sanity checks in SQL-callable
 389  * functions to start and stop backups.
 390  */
 391 static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
 392
  393 /*
  394  * Shared state data for WAL insertion.
  395  */
  396 typedef struct XLogCtlInsert
  397 {
  398         slock_t         insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
  399
  400         /*
  401          * CurrBytePos is the end of reserved WAL. The next record will be
  402          * inserted at that position. PrevBytePos is the start position of the
  403          * previously inserted (or rather, reserved) record - it is copied to the
  404          * prev-link of the next record. These are stored as "usable byte
  405          * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
  406          */
  407         uint64          CurrBytePos;
  408         uint64          PrevBytePos;
  409
  410         /*
  411          * Make sure the above heavily-contended spinlock and byte positions are
  412          * on their own cache line. In particular, the RedoRecPtr and full page
  413          * write variables below should be on a different cache line. They are
  414          * read on every WAL insertion, but updated rarely, and we don't want
  415          * those reads to steal the cache line containing Curr/PrevBytePos.
  416          */
  417         char            pad[PG_CACHE_LINE_SIZE];
  418
  419         /*
  420          * fullPageWrites is the authoritative value used by all backends to
  421          * determine whether to write full-page image to WAL. This shared value,
  422          * instead of the process-local fullPageWrites, is required because, when
  423          * full_page_writes is changed by SIGHUP, we must WAL-log it before it
  424          * actually affects WAL-logging by backends.  Checkpointer sets it at startup
  425          * or after SIGHUP.
  426          *
  427          * To read these fields, you must hold an insertion lock. To modify them,
  428          * you must hold ALL the locks.
  429          */
  430         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
  431         bool            fullPageWrites;
  432
  433         /*
  434          * runningBackups is a counter indicating the number of backups currently
  435          * in progress. lastBackupStart is the latest checkpoint redo location
  436          * used as a starting point for an online backup.
  437          */
  438         int                     runningBackups;
  439         XLogRecPtr      lastBackupStart;
  440
  441         /*
  442          * WAL insertion locks (pointer to the array of WALInsertLockPadded slots).
  443          */
  444         WALInsertLockPadded *WALInsertLocks;
  445 } XLogCtlInsert;
 446
  447 /*
  448  * Total shared-memory state for XLOG.
  449  */
  450 typedef struct XLogCtlData
  451 {
  452         XLogCtlInsert Insert;           /* shared WAL insertion state */
  453
  454         /* Protected by info_lck: */
  455         XLogwrtRqst LogwrtRqst;
  456         XLogRecPtr      RedoRecPtr;             /* a recent copy of Insert->RedoRecPtr */
  457         FullTransactionId ckptFullXid;  /* nextXid of latest checkpoint */
  458         XLogRecPtr      asyncXactLSN;   /* LSN of newest async commit/abort */
  459         XLogRecPtr      replicationSlotMinLSN;  /* oldest LSN needed by any slot */
  460
  461         XLogSegNo       lastRemovedSegNo;       /* latest removed/recycled XLOG segment */
  462
  463         /* Fake LSN counter, for unlogged relations. */
  464         pg_atomic_uint64 unloggedLSN;
  465
  466         /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
  467         pg_time_t       lastSegSwitchTime;
  468         XLogRecPtr      lastSegSwitchLSN;
  469
  470         /* These are accessed using atomics -- info_lck not needed */
  471         pg_atomic_uint64 logInsertResult;       /* last byte + 1 inserted to buffers */
  472         pg_atomic_uint64 logWriteResult;        /* last byte + 1 written out */
  473         pg_atomic_uint64 logFlushResult;        /* last byte + 1 flushed */
  474
  475         /*
  476          * Latest initialized page in the cache (last byte position + 1).
  477          *
  478          * To change the identity of a buffer (and InitializedUpTo), you need to
  479          * hold WALBufMappingLock.  To change the identity of a buffer that's
  480          * still dirty, the old page needs to be written out first, and for that
  481          * you need WALWriteLock, and you need to ensure that there are no
  482          * in-progress insertions to the page by calling
  483          * WaitXLogInsertionsToFinish().
  484          */
  485         XLogRecPtr      InitializedUpTo;
  486
  487         /*
  488          * These values do not change after startup, although the pointed-to pages
  489          * and xlblocks values certainly do.  xlblocks values are protected by
  490          * WALBufMappingLock.
  491          */
  492         char       *pages;                      /* buffers for unwritten XLOG pages */
  493         pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
  494         int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
  495
  496         /*
  497          * InsertTimeLineID is the timeline into which new WAL is being inserted
  498          * and flushed. It is zero during recovery, and does not change once set.
  499          *
  500          * If we create a new timeline when the system was started up,
  501          * PrevTimeLineID is the old timeline's ID that we forked off from.
  502          * Otherwise it's equal to InsertTimeLineID.
  503          *
  504          * We set these fields while holding info_lck. Most code that reads these
  505          * values knows that recovery is no longer in progress and so can safely
  506          * read the value without a lock, but code that could be run either during
  507          * or after recovery can take info_lck while reading these values.
  508          */
  509         TimeLineID      InsertTimeLineID;
  510         TimeLineID      PrevTimeLineID;
  511
  512         /*
  513          * SharedRecoveryState indicates if we're still in crash or archive
  514          * recovery.  Protected by info_lck.
  515          */
  516         RecoveryState SharedRecoveryState;
  517
  518         /*
  519          * InstallXLogFileSegmentActive indicates whether the checkpointer should
  520          * arrange for future segments by recycling and/or PreallocXlogFiles().
  521          * Protected by ControlFileLock.  Only the startup process changes it.  If
  522          * true, anyone can use InstallXLogFileSegment().  If false, the startup
  523          * process owns the exclusive right to install segments, by reading from
  524          * the archive and possibly replacing existing files.
  525          */
  526         bool            InstallXLogFileSegmentActive;
  527
  528         /*
  529          * WalWriterSleeping indicates whether the WAL writer is currently in
  530          * low-power mode (and hence should be nudged if an async commit occurs).
  531          * Protected by info_lck.
  532          */
  533         bool            WalWriterSleeping;
  534
  535         /*
  536          * During recovery, we keep a copy of the latest checkpoint record here.
  537          * lastCheckPointRecPtr points to start of checkpoint record and
  538          * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
  539          * checkpointer when it wants to create a restartpoint.
  540          *
  541          * Protected by info_lck.
  542          */
  543         XLogRecPtr      lastCheckPointRecPtr;
  544         XLogRecPtr      lastCheckPointEndPtr;
  545         CheckPoint      lastCheckPoint;
  546
  547         /*
  548          * lastFpwDisableRecPtr points to the start of the last replayed
  549          * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
  550          */
  551         XLogRecPtr      lastFpwDisableRecPtr;
  552
  553         slock_t         info_lck;               /* locks shared variables shown above */
  554 } XLogCtlData;
 555
  556 /*
  557  * Classification of XLogInsertRecord operations.
  558  */
  559 typedef enum
  560 {
  561         WALINSERT_NORMAL,                       /* no special handling */
  562         WALINSERT_SPECIAL_SWITCH,       /* presumably an xlog-switch record (cf. ReserveXLogSwitch) -- confirm */
  563         WALINSERT_SPECIAL_CHECKPOINT    /* presumably a checkpoint-related record -- confirm */
  564 } WalInsertClass;
 565
 566 static XLogCtlData *XLogCtl = NULL;
 567
 568 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
 569 static WALInsertLockPadded *WALInsertLocks = NULL;
 570
 571 /*
 572  * We maintain an image of pg_control in shared memory.
 573  */
 574 static ControlFileData *ControlFile = NULL;
 575
  576 /*
  577  * Calculate the amount of space left on the page after 'endptr'. Beware
  578  * multiple evaluation!  Yields 0 when 'endptr' is exactly on a page boundary.
  579  */
  580 #define INSERT_FREESPACE(endptr)        \
  581         (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
 582
  583 /* Macro to advance to next buffer index, wrapping to 0 after XLogCtl->XLogCacheBlck. */
  584 #define NextBufIdx(idx)         \
  585                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 586
  587 /*
  588  * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
  589  * would hold if it was in cache, the page containing 'recptr': page number modulo (XLogCacheBlck + 1).
  590  */
  591 #define XLogRecPtrToBufIdx(recptr)      \
  592         (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
 593
  594 /*
  595  * This is the number of bytes in a WAL page usable for WAL data (XLOG_BLCKSZ minus SizeOfXLogShortPHD).
  596  */
  597 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
 598
  599 /*
  600  * Convert values of GUCs measured in megabytes to equiv. segment count.
  601  * Rounds down.  Simply forwards both arguments to XLogMBVarToSegs().
  602  */
  603 #define ConvertToXSegs(x, segsize)      XLogMBVarToSegs((x), (segsize))
 604
 605 /* The number of bytes in a WAL segment usable for WAL data. */
 606 static int      UsableBytesInSegment;
 607
 608 /*
 609  * Private, possibly out-of-date copy of shared LogwrtResult.
 610  * See discussion above.
 611  */
 612 static XLogwrtResult LogwrtResult = {0, 0};
 613
 614 /*
 615  * Update local copy of shared XLogCtl->log{Write,Flush}Result
 616  *
 617  * It's critical that Flush always trails Write, so the order of the reads is
 618  * important, as is the barrier.  See also XLogWrite.
 619  */
 620 #define RefreshXLogWriteResult(_target) \
 621         do { \
 622                 _target.Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult); \
 623                 pg_read_barrier(); \
 624                 _target.Write = pg_atomic_read_u64(&XLogCtl->logWriteResult); \
 625         } while (0)
 626
 627 /*
 628  * openLogFile is -1 or a kernel FD for an open log file segment.
 629  * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
 630  * These variables are only used to write the XLOG, and so will normally refer
 631  * to the active segment.
 632  *
 633  * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
 634  */
 635 static int      openLogFile = -1;
 636 static XLogSegNo openLogSegNo = 0;
 637 static TimeLineID openLogTLI = 0;
 638
 639 /*
 640  * Local copies of equivalent fields in the control file.  When running
 641  * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
 642  * expect to replay all the WAL available, and updateMinRecoveryPoint is
 643  * switched to false to prevent any updates while replaying records.
 644  * Those values are kept consistent as long as crash recovery runs.
 645  */
 646 static XLogRecPtr LocalMinRecoveryPoint;
 647 static TimeLineID LocalMinRecoveryPointTLI;
 648 static bool updateMinRecoveryPoint = true;
 649
 650 /* For WALInsertLockAcquire/Release functions */
 651 static int      MyLockNo = 0;
 652 static bool holdingAllLocks = false;
 653
 654 #ifdef WAL_DEBUG
 655 static MemoryContext walDebugCxt = NULL;
 656 #endif
 657
 658 static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
 659                                                                                 XLogRecPtr EndOfLog,
 660                                                                                 TimeLineID newTLI);
 661 static void CheckRequiredParameterValues(void);
 662 static void XLogReportParameters(void);
 663 static int      LocalSetXLogInsertAllowed(void);
 664 static void CreateEndOfRecoveryRecord(void);
 665 static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
 666                                                                                                   XLogRecPtr pagePtr,
 667                                                                                                   TimeLineID newTLI);
 668 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 669 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 670 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 671
 672 static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
 673                                                                   bool opportunistic);
static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
                                                                   bool find_free, XLogSegNo max_segno,
                                                                   TimeLineID tli);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
static void RemoveTempXlogFiles(void);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
                                                           XLogRecPtr endptr, TimeLineID insertTLI);
static void RemoveXlogFile(const struct dirent *segment_de,
                                                   XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
                                                   TimeLineID insertTLI);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static bool PerformRecoveryXLogAction(void);
static void InitControlFile(uint64 sysidentifier, uint32 data_checksum_version);
static void WriteControlFile(void);
static void ReadControlFile(void);
static void UpdateControlFile(void);
static char *str_time(pg_time_t tnow);

static int      get_sync_bit(int method);

/*
 * Helpers on the record-insertion path.  XLogInsertRecord() reserves space
 * with ReserveXLogInsertLocation() or ReserveXLogSwitch() and then copies the
 * record with CopyXLogRecordToWAL(), which obtains buffer pointers from
 * GetXLogBuffer().  The XLogBytePos*()/XLogRecPtrToBytePos() functions convert
 * between "usable byte positions" and XLogRecPtrs.
 */
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
                                                                XLogRecData *rdata,
                                                                XLogRecPtr StartPos, XLogRecPtr EndPos,
                                                                TimeLineID tli);
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
                                                                          XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
                                                          XLogRecPtr *PrevPtr);
static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);

/*
 * WAL insertion lock primitives.  XLogInsertRecord() takes a single lock via
 * WALInsertLockAcquire() for ordinary records and all of them via
 * WALInsertLockAcquireExclusive() for XLOG_SWITCH and XLOG_CHECKPOINT_REDO;
 * either way the locks are dropped with WALInsertLockRelease().
 * NOTE(review): WALInsertLockUpdateInsertingAt() presumably publishes how far
 * the holder has inserted -- confirm against its definition.
 */
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);

 717
/*
 * Insert an XLOG record represented by an already-constructed chain of data
 * chunks.  This is a low-level routine; to construct the WAL record header
 * and data, use the higher-level routines in xloginsert.c.
 *
 * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 * WAL record applies to, that were not included in the record as full page
 * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
 * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
 * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
 * record is always inserted.  (The same InvalidXLogRecPtr "retry" result is
 * also returned when full-page writes are required now but were not when
 * this function was entered.)
 *
 * 'flags' gives more in-depth control on the record being inserted. See
 * XLogSetRecordFlags() for details.
 *
 * 'num_fpi' is added to the pgWalUsage.wal_fpi counter when the record is
 * actually inserted; presumably it is the number of full-page images the
 * caller put into the record.
 *
 * 'topxid_included' tells whether the top-transaction id is logged along with
 * current subtransaction. See XLogRecordAssemble().
 *
 * The first XLogRecData in the chain must be for the record header, and its
 * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 * xl_crc fields in the header, the rest of the header must already be filled
 * by the caller.  Note that the value found in xl_crc on entry is used as the
 * starting point of the CRC calculation, which is then extended over the
 * header bytes that precede xl_crc and finalized.
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * For an XLOG_SWITCH record, the returned pointer is the end of the switch
 * record itself rather than the end of the segment that was reserved for it.
 * If the insert position was already at the start of a segment, nothing is
 * inserted and the current insert position is returned.
 */
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata,
                                 XLogRecPtr fpw_lsn,
                                 uint8 flags,
                                 int num_fpi,
                                 bool topxid_included)
{
        XLogCtlInsert *Insert = &XLogCtl->Insert;
        pg_crc32c       rdata_crc;
        bool            inserted;
        XLogRecord *rechdr = (XLogRecord *) rdata->data;
        uint8           info = rechdr->xl_info & ~XLR_INFO_MASK;
        WalInsertClass class = WALINSERT_NORMAL;
        XLogRecPtr      StartPos;
        XLogRecPtr      EndPos;
        /* doPageWrites as of entry, before it is refreshed under the lock */
        bool            prevDoPageWrites = doPageWrites;
        TimeLineID      insertTLI;

        /* Does this record type require special handling? */
        if (unlikely(rechdr->xl_rmid == RM_XLOG_ID))
        {
                if (info == XLOG_SWITCH)
                        class = WALINSERT_SPECIAL_SWITCH;
                else if (info == XLOG_CHECKPOINT_REDO)
                        class = WALINSERT_SPECIAL_CHECKPOINT;
        }

        /* we assume that all of the record header is in the first chunk */
        Assert(rdata->len >= SizeOfXLogRecord);

        /* cross-check on whether we should be here or not */
        if (!XLogInsertAllowed())
                elog(ERROR, "cannot make new WAL entries during recovery");

        /*
         * Given that we're not in recovery, InsertTimeLineID is set and can't
         * change, so we can read it without a lock.
         */
        insertTLI = XLogCtl->InsertTimeLineID;

        /*----------
         *
         * We have now done all the preparatory work we can without holding a
         * lock or modifying shared state. From here on, inserting the new WAL
         * record to the shared WAL buffer cache is a two-step process:
         *
         * 1. Reserve the right amount of space from the WAL. The current head of
         *        reserved space is kept in Insert->CurrBytePos, and is protected by
         *        insertpos_lck.
         *
         * 2. Copy the record to the reserved WAL space. This involves finding the
         *        correct WAL buffer containing the reserved space, and copying the
         *        record in place. This can be done concurrently in multiple processes.
         *
         * To keep track of which insertions are still in-progress, each concurrent
         * inserter acquires an insertion lock. In addition to just indicating that
         * an insertion is in progress, the lock tells others how far the inserter
         * has progressed. There is a small fixed number of insertion locks,
         * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
         * boundary, it updates the value stored in the lock to how far it has
         * inserted, to allow the previous buffer to be flushed.
         *
         * Holding onto an insertion lock also protects RedoRecPtr and
         * fullPageWrites from changing until the insertion is finished.
         *
         * Step 2 can usually be done completely in parallel. If the required WAL
         * page is not initialized yet, you have to grab WALBufMappingLock to
         * initialize it, but the WAL writer tries to do that ahead of insertions
         * to avoid that from happening in the critical path.
         *
         *----------
         */
        START_CRIT_SECTION();

        if (likely(class == WALINSERT_NORMAL))
        {
                WALInsertLockAcquire();

                /*
                 * Check to see if my copy of RedoRecPtr is out of date. If so, may
                 * have to go back and have the caller recompute everything. This can
                 * only happen just after a checkpoint, so it's better to be slow in
                 * this case and fast otherwise.
                 *
                 * Also check to see if fullPageWrites was just turned on or there's a
                 * running backup (which forces full-page writes); if we weren't
                 * already doing full-page writes then go back and recompute.
                 *
                 * If we aren't doing full-page writes then RedoRecPtr doesn't
                 * actually affect the contents of the XLOG record, so we'll update
                 * our local copy but not force a recomputation.  (If doPageWrites was
                 * just turned off, we could recompute the record without full pages,
                 * but we choose not to bother.)
                 */
                if (RedoRecPtr != Insert->RedoRecPtr)
                {
                        Assert(RedoRecPtr < Insert->RedoRecPtr);
                        RedoRecPtr = Insert->RedoRecPtr;
                }
                doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0);

                if (doPageWrites &&
                        (!prevDoPageWrites ||
                         (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
                {
                        /*
                         * Oops, some buffer now needs to be backed up that the caller
                         * didn't back up.  Start over.  Nothing has been reserved yet,
                         * so it is enough to drop the lock and leave the critical
                         * section before returning.
                         */
                        WALInsertLockRelease();
                        END_CRIT_SECTION();
                        return InvalidXLogRecPtr;
                }

                /*
                 * Reserve space for the record in the WAL. This also sets the xl_prev
                 * pointer.
                 */
                ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
                                                                  &rechdr->xl_prev);

                /* Normal records are always inserted. */
                inserted = true;
        }
        else if (class == WALINSERT_SPECIAL_SWITCH)
        {
                /*
                 * In order to insert an XLOG_SWITCH record, we need to hold all of
                 * the WAL insertion locks, not just one, so that no one else can
                 * begin inserting a record until we've figured out how much space
                 * remains in the current WAL segment and claimed all of it.
                 *
                 * Nonetheless, this case is simpler than the normal case handled
                 * above, which must check for changes in doPageWrites and RedoRecPtr.
                 * Those checks are only needed for records that can contain buffer
                 * references, and an XLOG_SWITCH record never does.
                 */
                Assert(fpw_lsn == InvalidXLogRecPtr);
                WALInsertLockAcquireExclusive();
                inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
        }
        else
        {
                Assert(class == WALINSERT_SPECIAL_CHECKPOINT);

                /*
                 * We need to update both the local and shared copies of RedoRecPtr,
                 * which means that we need to hold all the WAL insertion locks.
                 * However, there can't be any buffer references, so as above, we need
                 * not check RedoRecPtr before inserting the record; we just need to
                 * update it afterwards.
                 */
                Assert(fpw_lsn == InvalidXLogRecPtr);
                WALInsertLockAcquireExclusive();
                ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
                                                                  &rechdr->xl_prev);
                RedoRecPtr = Insert->RedoRecPtr = StartPos;
                inserted = true;
        }

        if (inserted)
        {
                /*
                 * Now that xl_prev has been filled in, calculate CRC of the record
                 * header.  The computation continues from the value already stored
                 * in xl_crc and covers the header bytes up to (not including) the
                 * xl_crc field itself.
                 */
                rdata_crc = rechdr->xl_crc;
                COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
                FIN_CRC32C(rdata_crc);
                rechdr->xl_crc = rdata_crc;

                /*
                 * All the record data, including the header, is now ready to be
                 * inserted. Copy the record in the space reserved.
                 */
                CopyXLogRecordToWAL(rechdr->xl_tot_len,
                                                        class == WALINSERT_SPECIAL_SWITCH, rdata,
                                                        StartPos, EndPos, insertTLI);

                /*
                 * Unless record is flagged as not important, update LSN of last
                 * important record in the current slot. When holding all locks, just
                 * update the first one.
                 */
                if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
                {
                        int                     lockno = holdingAllLocks ? 0 : MyLockNo;

                        WALInsertLocks[lockno].l.lastImportantAt = StartPos;
                }
        }
        else
        {
                /*
                 * This was an xlog-switch record, but the current insert location was
                 * already exactly at the beginning of a segment, so there was no need
                 * to do anything.
                 */
        }

        /*
         * Done! Let others know that we're finished.
         */
        WALInsertLockRelease();

        END_CRIT_SECTION();

        MarkCurrentTransactionIdLoggedIfAny();

        /*
         * Mark top transaction id is logged (if needed) so that we should not try
         * to log it again with the next WAL record in the current subtransaction.
         */
        if (topxid_included)
                MarkSubxactTopXidLogged();

        /*
         * Update shared LogwrtRqst.Write, if we crossed page boundary.
         */
        if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
        {
                SpinLockAcquire(&XLogCtl->info_lck);
                /* advance global request to include new block(s) */
                if (XLogCtl->LogwrtRqst.Write < EndPos)
                        XLogCtl->LogwrtRqst.Write = EndPos;
                SpinLockRelease(&XLogCtl->info_lck);
                RefreshXLogWriteResult(LogwrtResult);
        }

        /*
         * If this was an XLOG_SWITCH record, flush the record and the empty
         * padding space that fills the rest of the segment, and perform
         * end-of-segment actions (eg, notifying archiver).
         */
        if (class == WALINSERT_SPECIAL_SWITCH)
        {
                TRACE_POSTGRESQL_WAL_SWITCH();
                XLogFlush(EndPos);

                /*
                 * Even though we reserved the rest of the segment for us, which is
                 * reflected in EndPos, we return a pointer to just the end of the
                 * xlog-switch record.  If the record header spilled onto the next
                 * page, the size of that page's header (long for the first page of
                 * a segment, short otherwise) is added as well.
                 */
                if (inserted)
                {
                        EndPos = StartPos + SizeOfXLogRecord;
                        if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
                        {
                                uint64          offset = XLogSegmentOffset(EndPos, wal_segment_size);

                                if (offset == EndPos % XLOG_BLCKSZ)
                                        EndPos += SizeOfXLogLongPHD;
                                else
                                        EndPos += SizeOfXLogShortPHD;
                        }
                }
        }

#ifdef WAL_DEBUG
        if (XLOG_DEBUG)
        {
                /* the reader is allocated once and then reused by later calls */
                static XLogReaderState *debug_reader = NULL;
                XLogRecord *record;
                DecodedXLogRecord *decoded;
                StringInfoData buf;
                StringInfoData recordBuf;
                char       *errormsg = NULL;
                MemoryContext oldCxt;

                oldCxt = MemoryContextSwitchTo(walDebugCxt);

                initStringInfo(&buf);
                appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));

                /*
                 * We have to piece together the WAL record data from the XLogRecData
                 * entries, so that we can pass it to the rm_desc function as one
                 * contiguous chunk.
                 *
                 * Note that this walk advances 'rdata' itself to the end of the
                 * chain; that is harmless because nothing after this block reads
                 * rdata again.
                 */
                initStringInfo(&recordBuf);
                for (; rdata != NULL; rdata = rdata->next)
                        appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);

                /* We also need temporary space to decode the record. */
                record = (XLogRecord *) recordBuf.data;
                decoded = (DecodedXLogRecord *)
                        palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));

                if (!debug_reader)
                        debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
                                                                                          XL_ROUTINE(.page_read = NULL,
                                                                                                                 .segment_open = NULL,
                                                                                                                 .segment_close = NULL),
                                                                                          NULL);
                if (!debug_reader)
                {
                        appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
                }
                else if (!DecodeXLogRecord(debug_reader,
                                                                   decoded,
                                                                   record,
                                                                   EndPos,
                                                                   &errormsg))
                {
                        appendStringInfo(&buf, "error decoding record: %s",
                                                         errormsg ? errormsg : "no error message");
                }
                else
                {
                        appendStringInfoString(&buf, " - ");

                        /* attach the decoded record only while describing it */
                        debug_reader->record = decoded;
                        xlog_outdesc(&buf, debug_reader);
                        debug_reader->record = NULL;
                }
                elog(LOG, "%s", buf.data);

                pfree(decoded);
                pfree(buf.data);
                pfree(recordBuf.data);
                MemoryContextSwitchTo(oldCxt);
        }
#endif

        /*
         * Update our global variables
         */
        ProcLastRecPtr = StartPos;
        XactLastRecEnd = EndPos;

        /* Report WAL traffic to the instrumentation. */
        if (inserted)
        {
                pgWalUsage.wal_bytes += rechdr->xl_tot_len;
                pgWalUsage.wal_records++;
                pgWalUsage.wal_fpi += num_fpi;
        }

        return EndPos;
}
1088
1089 /*
1090  * Reserves the right amount of space for a record of given size from the WAL.
1091  * *StartPos is set to the beginning of the reserved section, *EndPos to
1092  * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1093  * used to set the xl_prev of this record.
1094  *
1095  * This is the performance critical part of XLogInsert that must be serialized
1096  * across backends. The rest can happen mostly in parallel. Try to keep this
1097  * section as short as possible, insertpos_lck can be heavily contended on a
1098  * busy system.
1099  *
1100  * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1101  * where we actually copy the record to the reserved space.
1102  *
1103  * NB: Testing shows that XLogInsertRecord runs faster if this code is inlined;
1104  * however, because there are two call sites, the compiler is reluctant to
1105  * inline. We use pg_attribute_always_inline here to try to convince it.
1106  */
1107 static pg_attribute_always_inline void
1108 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1109                                                   XLogRecPtr *PrevPtr)
1110 {
1111         XLogCtlInsert *Insert = &XLogCtl->Insert;
1112         uint64          startbytepos;
1113         uint64          endbytepos;
1114         uint64          prevbytepos;
1115
1116         size = MAXALIGN(size);
1117
1118         /* All (non xlog-switch) records should contain data. */
1119         Assert(size > SizeOfXLogRecord);
1120
1121         /*
1122          * The duration the spinlock needs to be held is minimized by minimizing
1123          * the calculations that have to be done while holding the lock. The
1124          * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1125          * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1126          * page headers. The mapping between "usable" byte positions and physical
1127          * positions (XLogRecPtrs) can be done outside the locked region, and
1128          * because the usable byte position doesn't include any headers, reserving
1129          * X bytes from WAL is almost as simple as "CurrBytePos += X".
1130          */
1131         SpinLockAcquire(&Insert->insertpos_lck);
1132
1133         startbytepos = Insert->CurrBytePos;
1134         endbytepos = startbytepos + size;
1135         prevbytepos = Insert->PrevBytePos;
1136         Insert->CurrBytePos = endbytepos;
1137         Insert->PrevBytePos = startbytepos;
1138
1139         SpinLockRelease(&Insert->insertpos_lck);
1140
1141         *StartPos = XLogBytePosToRecPtr(startbytepos);
1142         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1143         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1144
1145         /*
1146          * Check that the conversions between "usable byte positions" and
1147          * XLogRecPtrs work consistently in both directions.
1148          */
1149         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1150         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1151         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1152 }
1153
1154 /*
1155  * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1156  *
1157  * A log-switch record is handled slightly differently. The rest of the
1158  * segment will be reserved for this insertion, as indicated by the returned
1159  * *EndPos value. However, if we are already at the beginning of the current
1160  * segment, *StartPos and *EndPos are set to the current location without
1161  * reserving any space, and the function returns false.
1162 */
1163 static bool
1164 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1165 {
1166         XLogCtlInsert *Insert = &XLogCtl->Insert;
1167         uint64          startbytepos;
1168         uint64          endbytepos;
1169         uint64          prevbytepos;
1170         uint32          size = MAXALIGN(SizeOfXLogRecord);
1171         XLogRecPtr      ptr;
1172         uint32          segleft;
1173
1174         /*
1175          * These calculations are a bit heavy-weight to be done while holding a
1176          * spinlock, but since we're holding all the WAL insertion locks, there
1177          * are no other inserters competing for it. GetXLogInsertRecPtr() does
1178          * compete for it, but that's not called very frequently.
1179          */
1180         SpinLockAcquire(&Insert->insertpos_lck);
1181
1182         startbytepos = Insert->CurrBytePos;
1183
1184         ptr = XLogBytePosToEndRecPtr(startbytepos);
1185         if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
1186         {
1187                 SpinLockRelease(&Insert->insertpos_lck);
1188                 *EndPos = *StartPos = ptr;
1189                 return false;
1190         }
1191
1192         endbytepos = startbytepos + size;
1193         prevbytepos = Insert->PrevBytePos;
1194
1195         *StartPos = XLogBytePosToRecPtr(startbytepos);
1196         *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1197
1198         segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
1199         if (segleft != wal_segment_size)
1200         {
1201                 /* consume the rest of the segment */
1202                 *EndPos += segleft;
1203                 endbytepos = XLogRecPtrToBytePos(*EndPos);
1204         }
1205         Insert->CurrBytePos = endbytepos;
1206         Insert->PrevBytePos = startbytepos;
1207
1208         SpinLockRelease(&Insert->insertpos_lck);
1209
1210         *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1211
1212         Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
1213         Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1214         Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1215         Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1216
1217         return true;
1218 }
1219
1220 /*
1221  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
1222  * area in the WAL.
1223  */
1224 static void
1225 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1226                                         XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
1227 {
1228         char       *currpos;
1229         int                     freespace;
1230         int                     written;
1231         XLogRecPtr      CurrPos;
1232         XLogPageHeader pagehdr;
1233
1234         /*
1235          * Get a pointer to the right place in the right WAL buffer to start
1236          * inserting to.
1237          */
1238         CurrPos = StartPos;
1239         currpos = GetXLogBuffer(CurrPos, tli);
1240         freespace = INSERT_FREESPACE(CurrPos);
1241
1242         /*
1243          * there should be enough space for at least the first field (xl_tot_len)
1244          * on this page.
1245          */
1246         Assert(freespace >= sizeof(uint32));
1247
1248         /* Copy record data */
1249         written = 0;
1250         while (rdata != NULL)
1251         {
1252                 const char *rdata_data = rdata->data;
1253                 int                     rdata_len = rdata->len;
1254
1255                 while (rdata_len > freespace)
1256                 {
1257                         /*
1258                          * Write what fits on this page, and continue on the next page.
1259                          */
1260                         Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1261                         memcpy(currpos, rdata_data, freespace);
1262                         rdata_data += freespace;
1263                         rdata_len -= freespace;
1264                         written += freespace;
1265                         CurrPos += freespace;
1266
1267                         /*
1268                          * Get pointer to beginning of next page, and set the xlp_rem_len
1269                          * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1270                          *
1271                          * It's safe to set the contrecord flag and xlp_rem_len without a
1272                          * lock on the page. All the other flags were already set when the
1273                          * page was initialized, in AdvanceXLInsertBuffer, and we're the
1274                          * only backend that needs to set the contrecord flag.
1275                          */
1276                         currpos = GetXLogBuffer(CurrPos, tli);
1277                         pagehdr = (XLogPageHeader) currpos;
1278                         pagehdr->xlp_rem_len = write_len - written;
1279                         pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1280
1281                         /* skip over the page header */
1282                         if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
1283                         {
1284                                 CurrPos += SizeOfXLogLongPHD;
1285                                 currpos += SizeOfXLogLongPHD;
1286                         }
1287                         else
1288                         {
1289                                 CurrPos += SizeOfXLogShortPHD;
1290                                 currpos += SizeOfXLogShortPHD;
1291                         }
1292                         freespace = INSERT_FREESPACE(CurrPos);
1293                 }
1294
1295                 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1296                 memcpy(currpos, rdata_data, rdata_len);
1297                 currpos += rdata_len;
1298                 CurrPos += rdata_len;
1299                 freespace -= rdata_len;
1300                 written += rdata_len;
1301
1302                 rdata = rdata->next;
1303         }
1304         Assert(written == write_len);
1305
1306         /*
1307          * If this was an xlog-switch, it's not enough to write the switch record,
1308          * we also have to consume all the remaining space in the WAL segment.  We
1309          * have already reserved that space, but we need to actually fill it.
1310          */
1311         if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
1312         {
1313                 /* An xlog-switch record doesn't contain any data besides the header */
1314                 Assert(write_len == SizeOfXLogRecord);
1315
1316                 /* Assert that we did reserve the right amount of space */
1317                 Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
1318
1319                 /* Use up all the remaining space on the current page */
1320                 CurrPos += freespace;
1321
1322                 /*
1323                  * Cause all remaining pages in the segment to be flushed, leaving the
1324                  * XLog position where it should be, at the start of the next segment.
1325                  * We do this one page at a time, to make sure we don't deadlock
1326                  * against ourselves if wal_buffers < wal_segment_size.
1327                  */
1328                 while (CurrPos < EndPos)
1329                 {
1330                         /*
1331                          * The minimal action to flush the page would be to call
1332                          * WALInsertLockUpdateInsertingAt(CurrPos) followed by
1333                          * AdvanceXLInsertBuffer(...).  The page would be left initialized
1334                          * mostly to zeros, except for the page header (always the short
1335                          * variant, as this is never a segment's first page).
1336                          *
1337                          * The large vistas of zeros are good for compressibility, but the
1338                          * headers interrupting them every XLOG_BLCKSZ (with values that
1339                          * differ from page to page) are not.  The effect varies with
1340                          * compression tool, but bzip2 for instance compresses about an
1341                          * order of magnitude worse if those headers are left in place.
1342                          *
1343                          * Rather than complicating AdvanceXLInsertBuffer itself (which is
1344                          * called in heavily-loaded circumstances as well as this lightly-
1345                          * loaded one) with variant behavior, we just use GetXLogBuffer
1346                          * (which itself calls the two methods we need) to get the pointer
1347                          * and zero most of the page.  Then we just zero the page header.
1348                          */
1349                         currpos = GetXLogBuffer(CurrPos, tli);
1350                         MemSet(currpos, 0, SizeOfXLogShortPHD);
1351
1352                         CurrPos += XLOG_BLCKSZ;
1353                 }
1354         }
1355         else
1356         {
1357                 /* Align the end position, so that the next record starts aligned */
1358                 CurrPos = MAXALIGN64(CurrPos);
1359         }
1360
1361         if (CurrPos != EndPos)
1362                 ereport(PANIC,
1363                                 errcode(ERRCODE_DATA_CORRUPTED),
1364                                 errmsg_internal("space reserved for WAL record does not match what was written"));
1365 }
1366
1367 /*
1368  * Acquire a WAL insertion lock, for inserting to WAL.
1369  */
1370 static void
1371 WALInsertLockAcquire(void)
1372 {
1373         bool            immed;
1374
1375         /*
1376          * It doesn't matter which of the WAL insertion locks we acquire, so try
1377          * the one we used last time.  If the system isn't particularly busy, it's
1378          * a good bet that it's still available, and it's good to have some
1379          * affinity to a particular lock so that you don't unnecessarily bounce
1380          * cache lines between processes when there's no contention.
1381          *
1382          * If this is the first time through in this backend, pick a lock
1383          * (semi-)randomly.  This allows the locks to be used evenly if you have a
1384          * lot of very short connections.
1385          */
1386         static int      lockToTry = -1;
1387
1388         if (lockToTry == -1)
1389                 lockToTry = MyProcNumber % NUM_XLOGINSERT_LOCKS;
1390         MyLockNo = lockToTry;
1391
1392         /*
1393          * The insertingAt value is initially set to 0, as we don't know our
1394          * insert location yet.
1395          */
1396         immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1397         if (!immed)
1398         {
1399                 /*
1400                  * If we couldn't get the lock immediately, try another lock next
1401                  * time.  On a system with more insertion locks than concurrent
1402                  * inserters, this causes all the inserters to eventually migrate to a
1403                  * lock that no-one else is using.  On a system with more inserters
1404                  * than locks, it still helps to distribute the inserters evenly
1405                  * across the locks.
1406                  */
1407                 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1408         }
1409 }
1410
1411 /*
1412  * Acquire all WAL insertion locks, to prevent other backends from inserting
1413  * to WAL.
1414  */
1415 static void
1416 WALInsertLockAcquireExclusive(void)
1417 {
1418         int                     i;
1419
1420         /*
1421          * When holding all the locks, all but the last lock's insertingAt
1422          * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1423          * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1424          */
1425         for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1426         {
1427                 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1428                 LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1429                                                 &WALInsertLocks[i].l.insertingAt,
1430                                                 PG_UINT64_MAX);
1431         }
1432         /* Variable value reset to 0 at release */
1433         LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1434
1435         holdingAllLocks = true;
1436 }
1437
1438 /*
1439  * Release our insertion lock (or locks, if we're holding them all).
1440  *
1441  * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1442  * next time the lock is acquired.
1443  */
1444 static void
1445 WALInsertLockRelease(void)
1446 {
1447         if (holdingAllLocks)
1448         {
1449                 int                     i;
1450
1451                 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1452                         LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1453                                                                   &WALInsertLocks[i].l.insertingAt,
1454                                                                   0);
1455
1456                 holdingAllLocks = false;
1457         }
1458         else
1459         {
1460                 LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1461                                                           &WALInsertLocks[MyLockNo].l.insertingAt,
1462                                                           0);
1463         }
1464 }
1465
1466 /*
1467  * Update our insertingAt value, to let others know that we've finished
1468  * inserting up to that point.
1469  */
1470 static void
1471 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1472 {
1473         if (holdingAllLocks)
1474         {
1475                 /*
1476                  * We use the last lock to mark our actual position, see comments in
1477                  * WALInsertLockAcquireExclusive.
1478                  */
1479                 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1480                                                 &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1481                                                 insertingAt);
1482         }
1483         else
1484                 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1485                                                 &WALInsertLocks[MyLockNo].l.insertingAt,
1486                                                 insertingAt);
1487 }
1488
1489 /*
1490  * Wait for any WAL insertions < upto to finish.
1491  *
1492  * Returns the location of the oldest insertion that is still in-progress.
1493  * Any WAL prior to that point has been fully copied into WAL buffers, and
1494  * can be flushed out to disk. Because this waits for any insertions older
1495  * than 'upto' to finish, the return value is always >= 'upto'.
1496  *
1497  * Note: When you are about to write out WAL, you must call this function
1498  * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1499  * need to wait for an insertion to finish (or at least advance to next
1500  * uninitialized page), and the inserter might need to evict an old WAL buffer
1501  * to make room for a new one, which in turn requires WALWriteLock.
1502  */
1503 static XLogRecPtr
1504 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1505 {
1506         uint64          bytepos;
1507         XLogRecPtr      inserted;
1508         XLogRecPtr      reservedUpto;
1509         XLogRecPtr      finishedUpto;
1510         XLogCtlInsert *Insert = &XLogCtl->Insert;
1511         int                     i;
1512
1513         if (MyProc == NULL)
1514                 elog(PANIC, "cannot wait without a PGPROC structure");
1515
1516         /*
1517          * Check if there's any work to do.  Use a barrier to ensure we get the
1518          * freshest value.
1519          */
1520         inserted = pg_atomic_read_membarrier_u64(&XLogCtl->logInsertResult);
1521         if (upto <= inserted)
1522                 return inserted;
1523
1524         /* Read the current insert position */
1525         SpinLockAcquire(&Insert->insertpos_lck);
1526         bytepos = Insert->CurrBytePos;
1527         SpinLockRelease(&Insert->insertpos_lck);
1528         reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1529
1530         /*
1531          * No-one should request to flush a piece of WAL that hasn't even been
1532          * reserved yet. However, it can happen if there is a block with a bogus
1533          * LSN on disk, for example. XLogFlush checks for that situation and
1534          * complains, but only after the flush. Here we just assume that to mean
1535          * that all WAL that has been reserved needs to be finished. In this
1536          * corner-case, the return value can be smaller than 'upto' argument.
1537          */
1538         if (upto > reservedUpto)
1539         {
1540                 ereport(LOG,
1541                                 (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
1542                                                 LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
1543                 upto = reservedUpto;
1544         }
1545
1546         /*
1547          * Loop through all the locks, sleeping on any in-progress insert older
1548          * than 'upto'.
1549          *
1550          * finishedUpto is our return value, indicating the point upto which all
1551          * the WAL insertions have been finished. Initialize it to the head of
1552          * reserved WAL, and as we iterate through the insertion locks, back it
1553          * out for any insertion that's still in progress.
1554          */
1555         finishedUpto = reservedUpto;
1556         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1557         {
1558                 XLogRecPtr      insertingat = InvalidXLogRecPtr;
1559
1560                 do
1561                 {
1562                         /*
1563                          * See if this insertion is in progress.  LWLockWaitForVar will
1564                          * wait for the lock to be released, or for the 'value' to be set
1565                          * by a LWLockUpdateVar call.  When a lock is initially acquired,
1566                          * its value is 0 (InvalidXLogRecPtr), which means that we don't
1567                          * know where it's inserting yet.  We will have to wait for it. If
1568                          * it's a small insertion, the record will most likely fit on the
1569                          * same page and the inserter will release the lock without ever
1570                          * calling LWLockUpdateVar.  But if it has to sleep, it will
1571                          * advertise the insertion point with LWLockUpdateVar before
1572                          * sleeping.
1573                          *
1574                          * In this loop we are only waiting for insertions that started
1575                          * before WaitXLogInsertionsToFinish was called.  The lack of
1576                          * memory barriers in the loop means that we might see locks as
1577                          * "unused" that have since become used.  This is fine because
1578                          * they only can be used for later insertions that we would not
1579                          * want to wait on anyway.  Not taking a lock to acquire the
1580                          * current insertingAt value means that we might see older
1581                          * insertingAt values.  This is also fine, because if we read a
1582                          * value too old, we will add ourselves to the wait queue, which
1583                          * contains atomic operations.
1584                          */
1585                         if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1586                                                                  &WALInsertLocks[i].l.insertingAt,
1587                                                                  insertingat, &insertingat))
1588                         {
1589                                 /* the lock was free, so no insertion in progress */
1590                                 insertingat = InvalidXLogRecPtr;
1591                                 break;
1592                         }
1593
1594                         /*
1595                          * This insertion is still in progress. Have to wait, unless the
1596                          * inserter has proceeded past 'upto'.
1597                          */
1598                 } while (insertingat < upto);
1599
1600                 if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1601                         finishedUpto = insertingat;
1602         }
1603
1604         /*
1605          * Advance the limit we know to have been inserted and return the freshest
1606          * value we know of, which might be beyond what we requested if somebody
1607          * is concurrently doing this with an 'upto' pointer ahead of us.
1608          */
1609         finishedUpto = pg_atomic_monotonic_advance_u64(&XLogCtl->logInsertResult,
1610                                                                                                    finishedUpto);
1611
1612         return finishedUpto;
1613 }
1614
1615 /*
1616  * Get a pointer to the right location in the WAL buffer containing the
1617  * given XLogRecPtr.
1618  *
1619  * If the page is not initialized yet, it is initialized. That might require
1620  * evicting an old dirty buffer from the buffer cache, which means I/O.
1621  *
1622  * The caller must ensure that the page containing the requested location
1623  * isn't evicted yet, and won't be evicted. The way to ensure that is to
1624  * hold onto a WAL insertion lock with the insertingAt position set to
1625  * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1626  * to evict an old page from the buffer. (This means that once you call
1627  * GetXLogBuffer() with a given 'ptr', you must not access anything before
1628  * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1629  * later, because older buffers might be recycled already)
1630  */
1631 static char *
1632 GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
1633 {
1634         int                     idx;
1635         XLogRecPtr      endptr;
1636         static uint64 cachedPage = 0;
1637         static char *cachedPos = NULL;
1638         XLogRecPtr      expectedEndPtr;
1639
1640         /*
1641          * Fast path for the common case that we need to access again the same
1642          * page as last time.
1643          */
1644         if (ptr / XLOG_BLCKSZ == cachedPage)
1645         {
1646                 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1647                 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1648                 return cachedPos + ptr % XLOG_BLCKSZ;
1649         }
1650
1651         /*
1652          * The XLog buffer cache is organized so that a page is always loaded to a
1653          * particular buffer.  That way we can easily calculate the buffer a given
1654          * page must be loaded into, from the XLogRecPtr alone.
1655          */
1656         idx = XLogRecPtrToBufIdx(ptr);
1657
1658         /*
1659          * See what page is loaded in the buffer at the moment. It could be the
1660          * page we're looking for, or something older. It can't be anything newer
1661          * - that would imply the page we're looking for has already been written
1662          * out to disk and evicted, and the caller is responsible for making sure
1663          * that doesn't happen.
1664          *
1665          * We don't hold a lock while we read the value. If someone is just about
1666          * to initialize or has just initialized the page, it's possible that we
1667          * get InvalidXLogRecPtr. That's ok, we'll grab the mapping lock (in
1668          * AdvanceXLInsertBuffer) and retry if we see anything other than the page
1669          * we're looking for.
1670          */
1671         expectedEndPtr = ptr;
1672         expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1673
1674         endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1675         if (expectedEndPtr != endptr)
1676         {
1677                 XLogRecPtr      initializedUpto;
1678
1679                 /*
1680                  * Before calling AdvanceXLInsertBuffer(), which can block, let others
1681                  * know how far we're finished with inserting the record.
1682                  *
1683                  * NB: If 'ptr' points to just after the page header, advertise a
1684                  * position at the beginning of the page rather than 'ptr' itself. If
1685                  * there are no other insertions running, someone might try to flush
1686                  * up to our advertised location. If we advertised a position after
1687                  * the page header, someone might try to flush the page header, even
1688                  * though page might actually not be initialized yet. As the first
1689                  * inserter on the page, we are effectively responsible for making
1690                  * sure that it's initialized, before we let insertingAt to move past
1691                  * the page header.
1692                  */
1693                 if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
1694                         XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
1695                         initializedUpto = ptr - SizeOfXLogShortPHD;
1696                 else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
1697                                  XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
1698                         initializedUpto = ptr - SizeOfXLogLongPHD;
1699                 else
1700                         initializedUpto = ptr;
1701
1702                 WALInsertLockUpdateInsertingAt(initializedUpto);
1703
1704                 AdvanceXLInsertBuffer(ptr, tli, false);
1705                 endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1706
1707                 if (expectedEndPtr != endptr)
1708                         elog(PANIC, "could not find WAL buffer for %X/%X",
1709                                  LSN_FORMAT_ARGS(ptr));
1710         }
1711         else
1712         {
1713                 /*
1714                  * Make sure the initialization of the page is visible to us, and
1715                  * won't arrive later to overwrite the WAL data we write on the page.
1716                  */
1717                 pg_memory_barrier();
1718         }
1719
1720         /*
1721          * Found the buffer holding this page. Return a pointer to the right
1722          * offset within the page.
1723          */
1724         cachedPage = ptr / XLOG_BLCKSZ;
1725         cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1726
1727         Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1728         Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1729
1730         return cachedPos + ptr % XLOG_BLCKSZ;
1731 }
1732
1733 /*
1734  * Read WAL data directly from WAL buffers, if available. Returns the number
1735  * of bytes read successfully.
1736  *
1737  * Fewer than 'count' bytes may be read if some of the requested WAL data has
1738  * already been evicted.
1739  *
1740  * No locks are taken.
1741  *
1742  * Caller should ensure that it reads no further than LogwrtResult.Write
1743  * (which should have been updated by the caller when determining how far to
1744  * read). The 'tli' argument is only used as a convenient safety check so that
1745  * callers do not read from WAL buffers on a historical timeline.
1746  */
1747 Size
1748 WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
1749                                    TimeLineID tli)
1750 {
1751         char       *pdst = dstbuf;
1752         XLogRecPtr      recptr = startptr;
1753         XLogRecPtr      inserted;
1754         Size            nbytes = count;
1755
1756         if (RecoveryInProgress() || tli != GetWALInsertionTimeLine())
1757                 return 0;
1758
1759         Assert(!XLogRecPtrIsInvalid(startptr));
1760
1761         /*
1762          * Caller should ensure that the requested data has been inserted into WAL
1763          * buffers before we try to read it.
1764          */
1765         inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult);
1766         if (startptr + count > inserted)
1767                 ereport(ERROR,
1768                                 errmsg("cannot read past end of generated WAL: requested %X/%X, current position %X/%X",
1769                                            LSN_FORMAT_ARGS(startptr + count),
1770                                            LSN_FORMAT_ARGS(inserted)));
1771
1772         /*
1773          * Loop through the buffers without a lock. For each buffer, atomically
1774          * read and verify the end pointer, then copy the data out, and finally
1775          * re-read and re-verify the end pointer.
1776          *
1777          * Once a page is evicted, it never returns to the WAL buffers, so if the
1778          * end pointer matches the expected end pointer before and after we copy
1779          * the data, then the right page must have been present during the data
1780          * copy. Read barriers are necessary to ensure that the data copy actually
1781          * happens between the two verification steps.
1782          *
1783          * If either verification fails, we simply terminate the loop and return
1784          * with the data that had been already copied out successfully.
1785          */
1786         while (nbytes > 0)
1787         {
1788                 uint32          offset = recptr % XLOG_BLCKSZ;
1789                 int                     idx = XLogRecPtrToBufIdx(recptr);
1790                 XLogRecPtr      expectedEndPtr;
1791                 XLogRecPtr      endptr;
1792                 const char *page;
1793                 const char *psrc;
1794                 Size            npagebytes;
1795
1796                 /*
1797                  * Calculate the end pointer we expect in the xlblocks array if the
1798                  * correct page is present.
1799                  */
1800                 expectedEndPtr = recptr + (XLOG_BLCKSZ - offset);
1801
1802                 /*
1803                  * First verification step: check that the correct page is present in
1804                  * the WAL buffers.
1805                  */
1806                 endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1807                 if (expectedEndPtr != endptr)
1808                         break;
1809
1810                 /*
1811                  * The correct page is present (or was at the time the endptr was
1812                  * read; must re-verify later). Calculate pointer to source data and
1813                  * determine how much data to read from this page.
1814                  */
1815                 page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1816                 psrc = page + offset;
1817                 npagebytes = Min(nbytes, XLOG_BLCKSZ - offset);
1818
1819                 /*
1820                  * Ensure that the data copy and the first verification step are not
1821                  * reordered.
1822                  */
1823                 pg_read_barrier();
1824
1825                 /* data copy */
1826                 memcpy(pdst, psrc, npagebytes);
1827
1828                 /*
1829                  * Ensure that the data copy and the second verification step are not
1830                  * reordered.
1831                  */
1832                 pg_read_barrier();
1833
1834                 /*
1835                  * Second verification step: check that the page we read from wasn't
1836                  * evicted while we were copying the data.
1837                  */
1838                 endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1839                 if (expectedEndPtr != endptr)
1840                         break;
1841
1842                 pdst += npagebytes;
1843                 recptr += npagebytes;
1844                 nbytes -= npagebytes;
1845         }
1846
1847         Assert(pdst - dstbuf <= count);
1848
1849         return pdst - dstbuf;
1850 }
1851
1852 /*
1853  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1854  * is the position starting from the beginning of WAL, excluding all WAL
1855  * page headers.
1856  */
1857 static XLogRecPtr
1858 XLogBytePosToRecPtr(uint64 bytepos)
1859 {
1860         uint64          fullsegs;
1861         uint64          fullpages;
1862         uint64          bytesleft;
1863         uint32          seg_offset;
1864         XLogRecPtr      result;
1865
1866         fullsegs = bytepos / UsableBytesInSegment;
1867         bytesleft = bytepos % UsableBytesInSegment;
1868
1869         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1870         {
1871                 /* fits on first page of segment */
1872                 seg_offset = bytesleft + SizeOfXLogLongPHD;
1873         }
1874         else
1875         {
1876                 /* account for the first page on segment with long header */
1877                 seg_offset = XLOG_BLCKSZ;
1878                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1879
1880                 fullpages = bytesleft / UsableBytesInPage;
1881                 bytesleft = bytesleft % UsableBytesInPage;
1882
1883                 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1884         }
1885
1886         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1887
1888         return result;
1889 }
1890
1891 /*
1892  * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1893  * returns a pointer to the beginning of the page (ie. before page header),
1894  * not to where the first xlog record on that page would go to. This is used
1895  * when converting a pointer to the end of a record.
1896  */
1897 static XLogRecPtr
1898 XLogBytePosToEndRecPtr(uint64 bytepos)
1899 {
1900         uint64          fullsegs;
1901         uint64          fullpages;
1902         uint64          bytesleft;
1903         uint32          seg_offset;
1904         XLogRecPtr      result;
1905
1906         fullsegs = bytepos / UsableBytesInSegment;
1907         bytesleft = bytepos % UsableBytesInSegment;
1908
1909         if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1910         {
1911                 /* fits on first page of segment */
1912                 if (bytesleft == 0)
1913                         seg_offset = 0;
1914                 else
1915                         seg_offset = bytesleft + SizeOfXLogLongPHD;
1916         }
1917         else
1918         {
1919                 /* account for the first page on segment with long header */
1920                 seg_offset = XLOG_BLCKSZ;
1921                 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1922
1923                 fullpages = bytesleft / UsableBytesInPage;
1924                 bytesleft = bytesleft % UsableBytesInPage;
1925
1926                 if (bytesleft == 0)
1927                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
1928                 else
1929                         seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1930         }
1931
1932         XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1933
1934         return result;
1935 }
1936
1937 /*
1938  * Convert an XLogRecPtr to a "usable byte position".
1939  */
1940 static uint64
1941 XLogRecPtrToBytePos(XLogRecPtr ptr)
1942 {
1943         uint64          fullsegs;
1944         uint32          fullpages;
1945         uint32          offset;
1946         uint64          result;
1947
1948         XLByteToSeg(ptr, fullsegs, wal_segment_size);
1949
1950         fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
1951         offset = ptr % XLOG_BLCKSZ;
1952
1953         if (fullpages == 0)
1954         {
1955                 result = fullsegs * UsableBytesInSegment;
1956                 if (offset > 0)
1957                 {
1958                         Assert(offset >= SizeOfXLogLongPHD);
1959                         result += offset - SizeOfXLogLongPHD;
1960                 }
1961         }
1962         else
1963         {
1964                 result = fullsegs * UsableBytesInSegment +
1965                         (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
1966                         (fullpages - 1) * UsableBytesInPage;    /* full pages */
1967                 if (offset > 0)
1968                 {
1969                         Assert(offset >= SizeOfXLogShortPHD);
1970                         result += offset - SizeOfXLogShortPHD;
1971                 }
1972         }
1973
1974         return result;
1975 }
1976
1977 /*
1978  * Initialize XLOG buffers, writing out old buffers if they still contain
1979  * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
1980  * true, initialize as many pages as we can without having to write out
1981  * unwritten data. Any new pages are initialized to zeros, with pages headers
1982  * initialized properly.
1983  */
1984 static void
1985 AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
1986 {
1987         XLogCtlInsert *Insert = &XLogCtl->Insert;
1988         int                     nextidx;
1989         XLogRecPtr      OldPageRqstPtr;
1990         XLogwrtRqst WriteRqst;
1991         XLogRecPtr      NewPageEndPtr = InvalidXLogRecPtr;
1992         XLogRecPtr      NewPageBeginPtr;
1993         XLogPageHeader NewPage;
1994         int                     npages pg_attribute_unused() = 0;
1995
1996         LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
1997
1998         /*
1999          * Now that we have the lock, check if someone initialized the page
2000          * already.
2001          */
2002         while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2003         {
2004                 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2005
2006                 /*
2007                  * Get ending-offset of the buffer page we need to replace (this may
2008                  * be zero if the buffer hasn't been used yet).  Fall through if it's
2009                  * already written out.
2010                  */
2011                 OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
2012                 if (LogwrtResult.Write < OldPageRqstPtr)
2013                 {
2014                         /*
2015                          * Nope, got work to do. If we just want to pre-initialize as much
2016                          * as we can without flushing, give up now.
2017                          */
2018                         if (opportunistic)
2019                                 break;
2020
2021                         /* Advance shared memory write request position */
2022                         SpinLockAcquire(&XLogCtl->info_lck);
2023                         if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
2024                                 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
2025                         SpinLockRelease(&XLogCtl->info_lck);
2026
2027                         /*
2028                          * Acquire an up-to-date LogwrtResult value and see if we still
2029                          * need to write it or if someone else already did.
2030                          */
2031                         RefreshXLogWriteResult(LogwrtResult);
2032                         if (LogwrtResult.Write < OldPageRqstPtr)
2033                         {
2034                                 /*
2035                                  * Must acquire write lock. Release WALBufMappingLock first,
2036                                  * to make sure that all insertions that we need to wait for
2037                                  * can finish (up to this same position). Otherwise we risk
2038                                  * deadlock.
2039                                  */
2040                                 LWLockRelease(WALBufMappingLock);
2041
2042                                 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2043
2044                                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2045
2046                                 RefreshXLogWriteResult(LogwrtResult);
2047                                 if (LogwrtResult.Write >= OldPageRqstPtr)
2048                                 {
2049                                         /* OK, someone wrote it already */
2050                                         LWLockRelease(WALWriteLock);
2051                                 }
2052                                 else
2053                                 {
2054                                         /* Have to write it ourselves */
2055                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2056                                         WriteRqst.Write = OldPageRqstPtr;
2057                                         WriteRqst.Flush = 0;
2058                                         XLogWrite(WriteRqst, tli, false);
2059                                         LWLockRelease(WALWriteLock);
2060                                         PendingWalStats.wal_buffers_full++;
2061                                         TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2062                                 }
2063                                 /* Re-acquire WALBufMappingLock and retry */
2064                                 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2065                                 continue;
2066                         }
2067                 }
2068
2069                 /*
2070                  * Now the next buffer slot is free and we can set it up to be the
2071                  * next output page.
2072                  */
2073                 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2074                 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2075
2076                 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2077
2078                 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2079
2080                 /*
2081                  * Mark the xlblock with InvalidXLogRecPtr and issue a write barrier
2082                  * before initializing. Otherwise, the old page may be partially
2083                  * zeroed but look valid.
2084                  */
2085                 pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], InvalidXLogRecPtr);
2086                 pg_write_barrier();
2087
2088                 /*
2089                  * Be sure to re-zero the buffer so that bytes beyond what we've
2090                  * written will look like zeroes and not valid XLOG records...
2091                  */
2092                 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2093
2094                 /*
2095                  * Fill the new page's header
2096                  */
2097                 NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2098
2099                 /* NewPage->xlp_info = 0; */    /* done by memset */
2100                 NewPage->xlp_tli = tli;
2101                 NewPage->xlp_pageaddr = NewPageBeginPtr;
2102
2103                 /* NewPage->xlp_rem_len = 0; */ /* done by memset */
2104
2105                 /*
2106                  * If online backup is not in progress, mark the header to indicate
2107                  * that WAL records beginning in this page have removable backup
2108                  * blocks.  This allows the WAL archiver to know whether it is safe to
2109                  * compress archived WAL data by transforming full-block records into
2110                  * the non-full-block format.  It is sufficient to record this at the
2111                  * page level because we force a page switch (in fact a segment
2112                  * switch) when starting a backup, so the flag will be off before any
2113                  * records can be written during the backup.  At the end of a backup,
2114                  * the last page will be marked as all unsafe when perhaps only part
2115                  * is unsafe, but at worst the archiver would miss the opportunity to
2116                  * compress a few records.
2117                  */
2118                 if (Insert->runningBackups == 0)
2119                         NewPage->xlp_info |= XLP_BKP_REMOVABLE;
2120
2121                 /*
2122                  * If first page of an XLOG segment file, make it a long header.
2123                  */
2124                 if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
2125                 {
2126                         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2127
2128                         NewLongPage->xlp_sysid = ControlFile->system_identifier;
2129                         NewLongPage->xlp_seg_size = wal_segment_size;
2130                         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2131                         NewPage->xlp_info |= XLP_LONG_HEADER;
2132                 }
2133
2134                 /*
2135                  * Make sure the initialization of the page becomes visible to others
2136                  * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2137                  * holding a lock.
2138                  */
2139                 pg_write_barrier();
2140
2141                 pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
2142                 XLogCtl->InitializedUpTo = NewPageEndPtr;
2143
2144                 npages++;
2145         }
2146         LWLockRelease(WALBufMappingLock);
2147
2148 #ifdef WAL_DEBUG
2149         if (XLOG_DEBUG && npages > 0)
2150         {
2151                 elog(DEBUG1, "initialized %d pages, up to %X/%X",
2152                          npages, LSN_FORMAT_ARGS(NewPageEndPtr));
2153         }
2154 #endif
2155 }
2156
2157 /*
2158  * Calculate CheckPointSegments based on max_wal_size_mb and
2159  * checkpoint_completion_target.
2160  */
2161 static void
2162 CalculateCheckpointSegments(void)
2163 {
2164         double          target;
2165
2166         /*-------
2167          * Calculate the distance at which to trigger a checkpoint, to avoid
2168          * exceeding max_wal_size_mb. This is based on two assumptions:
2169          *
2170          * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2171          *    WAL for two checkpoint cycles to allow us to recover from the
2172          *    secondary checkpoint if the first checkpoint failed, though we
2173          *    only did this on the primary anyway, not on standby. Keeping just
2174          *    one checkpoint simplifies processing and reduces disk space in
2175          *    many smaller databases.)
2176          * b) during checkpoint, we consume checkpoint_completion_target *
2177          *        number of segments consumed between checkpoints.
2178          *-------
2179          */
2180         target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2181                 (1.0 + CheckPointCompletionTarget);
2182
2183         /* round down */
2184         CheckPointSegments = (int) target;
2185
2186         if (CheckPointSegments < 1)
2187                 CheckPointSegments = 1;
2188 }
2189
2190 void
2191 assign_max_wal_size(int newval, void *extra)
2192 {
2193         max_wal_size_mb = newval;
2194         CalculateCheckpointSegments();
2195 }
2196
2197 void
2198 assign_checkpoint_completion_target(double newval, void *extra)
2199 {
2200         CheckPointCompletionTarget = newval;
2201         CalculateCheckpointSegments();
2202 }
2203
2204 bool
2205 check_wal_segment_size(int *newval, void **extra, GucSource source)
2206 {
2207         if (!IsValidWalSegSize(*newval))
2208         {
2209                 GUC_check_errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.");
2210                 return false;
2211         }
2212
2213         return true;
2214 }
2215
2216 /*
2217  * GUC check_hook for max_slot_wal_keep_size
2218  *
2219  * We don't allow the value of max_slot_wal_keep_size other than -1 during the
2220  * binary upgrade. See start_postmaster() in pg_upgrade for more details.
2221  */
2222 bool
2223 check_max_slot_wal_keep_size(int *newval, void **extra, GucSource source)
2224 {
2225         if (IsBinaryUpgrade && *newval != -1)
2226         {
2227                 GUC_check_errdetail("\"%s\" must be set to -1 during binary upgrade mode.",
2228                                                         "max_slot_wal_keep_size");
2229                 return false;
2230         }
2231
2232         return true;
2233 }
2234
2235 /*
2236  * At a checkpoint, how many WAL segments to recycle as preallocated future
2237  * XLOG segments? Returns the highest segment that should be preallocated.
2238  */
2239 static XLogSegNo
2240 XLOGfileslop(XLogRecPtr lastredoptr)
2241 {
2242         XLogSegNo       minSegNo;
2243         XLogSegNo       maxSegNo;
2244         double          distance;
2245         XLogSegNo       recycleSegNo;
2246
2247         /*
2248          * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2249          * correspond to. Always recycle enough segments to meet the minimum, and
2250          * remove enough segments to stay below the maximum.
2251          */
2252         minSegNo = lastredoptr / wal_segment_size +
2253                 ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2254         maxSegNo = lastredoptr / wal_segment_size +
2255                 ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2256
2257         /*
2258          * Between those limits, recycle enough segments to get us through to the
2259          * estimated end of next checkpoint.
2260          *
2261          * To estimate where the next checkpoint will finish, assume that the
2262          * system runs steadily consuming CheckPointDistanceEstimate bytes between
2263          * every checkpoint.
2264          */
2265         distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2266         /* add 10% for good measure. */
2267         distance *= 1.10;
2268
2269         recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
2270                                                                         wal_segment_size);
2271
2272         if (recycleSegNo < minSegNo)
2273                 recycleSegNo = minSegNo;
2274         if (recycleSegNo > maxSegNo)
2275                 recycleSegNo = maxSegNo;
2276
2277         return recycleSegNo;
2278 }
2279
2280 /*
2281  * Check whether we've consumed enough xlog space that a checkpoint is needed.
2282  *
2283  * new_segno indicates a log file that has just been filled up (or read
2284  * during recovery). We measure the distance from RedoRecPtr to new_segno
2285  * and see if that exceeds CheckPointSegments.
2286  *
2287  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2288  */
2289 bool
2290 XLogCheckpointNeeded(XLogSegNo new_segno)
2291 {
2292         XLogSegNo       old_segno;
2293
2294         XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2295
2296         if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2297                 return true;
2298         return false;
2299 }
2300
2301 /*
2302  * Write and/or fsync the log at least as far as WriteRqst indicates.
2303  *
2304  * If flexible == true, we don't have to write as far as WriteRqst, but
2305  * may stop at any convenient boundary (such as a cache or logfile boundary).
2306  * This option allows us to avoid uselessly issuing multiple writes when a
2307  * single one would do.
2308  *
2309  * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2310  * must be called before grabbing the lock, to make sure the data is ready to
2311  * write.
      *
      * WriteRqst.Write is the position to write up to, and WriteRqst.Flush the
      * position to fsync up to.  'tli' is the timeline passed along when a
      * segment file is created or opened, when the file name for a write-failure
      * report is built, and to the fsync and archiver-notification calls.
      *
      * A write that fails for any reason other than EINTR is reported at PANIC
      * level.  Before returning, the shared request pointers and the shared
      * write/flush result values are brought up to date with the local
      * LogwrtResult.
2312  */
2313 static void
2314 XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
2315 {
2316         bool            ispartialpage;
2317         bool            last_iteration;
2318         bool            finishing_seg;
2319         int                     curridx;
2320         int                     npages;
2321         int                     startidx;
2322         uint32          startoffset;
2323
2324         /* We should always be inside a critical section here */
2325         Assert(CritSectionCount > 0);
2326
2327         /*
2328          * Update local LogwrtResult (caller probably did this already, but...)
2329          */
2330         RefreshXLogWriteResult(LogwrtResult);
2331
2332         /*
2333          * Since successive pages in the xlog cache are consecutively allocated,
2334          * we can usually gather multiple pages together and issue just one
2335          * write() call.  npages is the number of pages we have determined can be
2336          * written together; startidx is the cache block index of the first one,
2337          * and startoffset is the file offset at which it should go. The latter
2338          * two variables are only valid when npages > 0, but we must initialize
2339          * all of them to keep the compiler quiet.
2340          */
2341         npages = 0;
2342         startidx = 0;
2343         startoffset = 0;
2344
2345         /*
2346          * Within the loop, curridx is the cache block index of the page to
2347          * consider writing.  Begin at the buffer containing the next unwritten
2348          * page, or last partially written page.
2349          */
2350         curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2351
2352         while (LogwrtResult.Write < WriteRqst.Write)
2353         {
2354                 /*
2355                  * Make sure we're not ahead of the insert process.  This could happen
2356                  * if we're passed a bogus WriteRqst.Write that is past the end of the
2357                  * last page that's been initialized by AdvanceXLInsertBuffer.
2358                  */
2359                 XLogRecPtr      EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
2360
2361                 if (LogwrtResult.Write >= EndPtr)
2362                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2363                                  LSN_FORMAT_ARGS(LogwrtResult.Write),
2364                                  LSN_FORMAT_ARGS(EndPtr));
2365
2366                 /* Advance LogwrtResult.Write to end of current buffer page */
2367                 LogwrtResult.Write = EndPtr;
                     /* true when the request stops short of this page's end */
2368                 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2369
2370                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2371                                                          wal_segment_size))
2372                 {
2373                         /*
2374                          * Switch to new logfile segment.  We cannot have any pending
2375                          * pages here (since we dump what we have at segment end).
2376                          */
2377                         Assert(npages == 0);
2378                         if (openLogFile >= 0)
2379                                 XLogFileClose();
2380                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2381                                                         wal_segment_size);
2382                         openLogTLI = tli;
2383
2384                         /* create/use new log file */
2385                         openLogFile = XLogFileInit(openLogSegNo, tli);
2386                         ReserveExternalFD();
2387                 }
2388
2389                 /* Make sure we have the current logfile open */
2390                 if (openLogFile < 0)
2391                 {
2392                         XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2393                                                         wal_segment_size);
2394                         openLogTLI = tli;
2395                         openLogFile = XLogFileOpen(openLogSegNo, tli);
2396                         ReserveExternalFD();
2397                 }
2398
2399                 /* Add current page to the set of pending pages-to-dump */
2400                 if (npages == 0)
2401                 {
2402                         /* first of group */
2403                         startidx = curridx;
                             /*
                              * LogwrtResult.Write now points at the end of this page, so
                              * step back one block to get the page's own offset within the
                              * segment.
                              */
2404                         startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
2405                                                                                         wal_segment_size);
2406                 }
2407                 npages++;
2408
2409                 /*
2410                  * Dump the set if this will be the last loop iteration, or if we are
2411                  * at the last page of the cache area (since the next page won't be
2412                  * contiguous in memory), or if we are at the end of the logfile
2413                  * segment.
2414                  */
2415                 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2416
2417                 finishing_seg = !ispartialpage &&
2418                         (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
2419
2420                 if (last_iteration ||
2421                         curridx == XLogCtl->XLogCacheBlck ||
2422                         finishing_seg)
2423                 {
2424                         char       *from;
2425                         Size            nbytes;
2426                         Size            nleft;
2427                         ssize_t         written;
2428                         instr_time      start;
2429
2430                         /* OK to write the page(s) */
2431                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2432                         nbytes = npages * (Size) XLOG_BLCKSZ;
2433                         nleft = nbytes;
2434                         do
2435                         {
2436                                 errno = 0;
2437
2438                                 /*
2439                                  * Measure I/O timing to write WAL data, for pg_stat_io and/or
2440                                  * pg_stat_wal.
2441                                  */
2442                                 start = pgstat_prepare_io_time(track_io_timing || track_wal_io_timing);
2443
2444                                 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
2445                                 written = pg_pwrite(openLogFile, from, nleft, startoffset);
2446                                 pgstat_report_wait_end();
2447
2448                                 pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL,
2449                                                                                 IOOP_WRITE, start, 1, written);
2450
2451                                 /*
2452                                  * Increment the I/O timing and the number of times WAL data
2453                                  * were written out to disk.
2454                                  */
2455                                 if (track_wal_io_timing)
2456                                 {
2457                                         instr_time      end;
2458
2459                                         INSTR_TIME_SET_CURRENT(end);
2460                                         INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_write_time, end, start);
2461                                 }
2462
2463                                 PendingWalStats.wal_write++;
2464
2465                                 if (written <= 0)
2466                                 {
2467                                         char            xlogfname[MAXFNAMELEN];
2468                                         int                     save_errno;
2469
                                             /*
                                              * Nothing was written.  On EINTR, go around again
                                              * with the same buffer position, length and offset;
                                              * anything else (including a zero-byte write) is
                                              * fatal.
                                              */
2470                                         if (errno == EINTR)
2471                                                 continue;
2472
                                             /* keep errno intact for %m across XLogFileName() */
2473                                         save_errno = errno;
2474                                         XLogFileName(xlogfname, tli, openLogSegNo,
2475                                                                  wal_segment_size);
2476                                         errno = save_errno;
2477                                         ereport(PANIC,
2478                                                         (errcode_for_file_access(),
2479                                                          errmsg("could not write to log file \"%s\" at offset %u, length %zu: %m",
2480                                                                         xlogfname, startoffset, nleft)));
2481                                 }
                                     /*
                                      * Account for what was written; if that was less than
                                      * requested, the loop continues with the remainder.
                                      */
2482                                 nleft -= written;
2483                                 from += written;
2484                                 startoffset += written;
2485                         } while (nleft > 0);
2486
                             /* the pending set has been written out; start a fresh one */
2487                         npages = 0;
2488
2489                         /*
2490                          * If we just wrote the whole last page of a logfile segment,
2491                          * fsync the segment immediately.  This avoids having to go back
2492                          * and re-open prior segments when an fsync request comes along
2493                          * later. Doing it here ensures that one and only one backend will
2494                          * perform this fsync.
2495                          *
2496                          * This is also the right place to notify the Archiver that the
2497                          * segment is ready to copy to archival storage, and to update the
2498                          * timer for archive_timeout, and to signal for a checkpoint if
2499                          * too many logfile segments have been used since the last
2500                          * checkpoint.
2501                          */
2502                         if (finishing_seg)
2503                         {
2504                                 issue_xlog_fsync(openLogFile, openLogSegNo, tli);
2505
2506                                 /* signal that we need to wakeup walsenders later */
2507                                 WalSndWakeupRequest();
2508
2509                                 LogwrtResult.Flush = LogwrtResult.Write;        /* end of page */
2510
2511                                 if (XLogArchivingActive())
2512                                         XLogArchiveNotifySeg(openLogSegNo, tli);
2513
2514                                 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2515                                 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
2516
2517                                 /*
2518                                  * Request a checkpoint if we've consumed too much xlog since
2519                                  * the last one.  For speed, we first check using the local
2520                                  * copy of RedoRecPtr, which might be out of date; if it looks
2521                                  * like a checkpoint is needed, forcibly update RedoRecPtr and
2522                                  * recheck.
2523                                  */
2524                                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2525                                 {
2526                                         (void) GetRedoRecPtr();
2527                                         if (XLogCheckpointNeeded(openLogSegNo))
2528                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2529                                 }
2530                         }
2531                 }
2532
2533                 if (ispartialpage)
2534                 {
2535                         /* Only asked to write a partial page */
2536                         LogwrtResult.Write = WriteRqst.Write;
2537                         break;
2538                 }
2539                 curridx = NextBufIdx(curridx);
2540
2541                 /* If flexible, break out of loop as soon as we wrote something */
2542                 if (flexible && npages == 0)
2543                         break;
2544         }
2545
2546         Assert(npages == 0);
2547
2548         /*
2549          * If asked to flush, do so
2550          */
2551         if (LogwrtResult.Flush < WriteRqst.Flush &&
2552                 LogwrtResult.Flush < LogwrtResult.Write)
2553         {
2554                 /*
2555                  * Could get here without iterating above loop, in which case we might
2556                  * have no open file or the wrong one.  However, we do not need to
2557                  * fsync more than one file.
2558                  */
2559                 if (wal_sync_method != WAL_SYNC_METHOD_OPEN &&
2560                         wal_sync_method != WAL_SYNC_METHOD_OPEN_DSYNC)
2561                 {
2562                         if (openLogFile >= 0 &&
2563                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2564                                                                  wal_segment_size))
2565                                 XLogFileClose();
2566                         if (openLogFile < 0)
2567                         {
2568                                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2569                                                                 wal_segment_size);
2570                                 openLogTLI = tli;
2571                                 openLogFile = XLogFileOpen(openLogSegNo, tli);
2572                                 ReserveExternalFD();
2573                         }
2574
2575                         issue_xlog_fsync(openLogFile, openLogSegNo, tli);
2576                 }
2577
2578                 /* signal that we need to wakeup walsenders later */
2579                 WalSndWakeupRequest();
2580
2581                 LogwrtResult.Flush = LogwrtResult.Write;
2582         }
2583
2584         /*
2585          * Update shared-memory status
2586          *
2587          * We make sure that the shared 'request' values do not fall behind the
2588          * 'result' values.  This is not absolutely essential, but it saves some
2589          * code in a couple of places.
2590          */
2591         SpinLockAcquire(&XLogCtl->info_lck);
2592         if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2593                 XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2594         if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2595                 XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2596         SpinLockRelease(&XLogCtl->info_lck);
2597
2598         /*
2599          * We write Write first, bar, then Flush.  When reading, the opposite must
2600          * be done (with a matching barrier in between), so that we always see a
2601          * Flush value that trails behind the Write value seen.
2602          */
2603         pg_atomic_write_u64(&XLogCtl->logWriteResult, LogwrtResult.Write);
2604         pg_write_barrier();
2605         pg_atomic_write_u64(&XLogCtl->logFlushResult, LogwrtResult.Flush);
2606
2607 #ifdef USE_ASSERT_CHECKING
2608         {
2609                 XLogRecPtr      Flush;
2610                 XLogRecPtr      Write;
2611                 XLogRecPtr      Insert;
2612
                     /* re-read the shared values in the reader's order: Flush, Write, Insert */
2613                 Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult);
2614                 pg_read_barrier();
2615                 Write = pg_atomic_read_u64(&XLogCtl->logWriteResult);
2616                 pg_read_barrier();
2617                 Insert = pg_atomic_read_u64(&XLogCtl->logInsertResult);
2618
2619                 /* WAL written to disk is always ahead of WAL flushed */
2620                 Assert(Write >= Flush);
2621
2622                 /* WAL inserted to buffers is always ahead of WAL written */
2623                 Assert(Insert >= Write);
2624         }
2625 #endif
2626 }
2627
2628 /*
2629  * Record the LSN for an asynchronous transaction commit/abort
2630  * and nudge the WALWriter if there is work for it to do.
2631  * (This should not be called for synchronous commits.)
2632  */
2633 void
2634 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2635 {
2636         XLogRecPtr      WriteRqstPtr = asyncXactLSN;
2637         bool            sleeping;
2638         bool            wakeup = false;
2639         XLogRecPtr      prevAsyncXactLSN;
2640
2641         SpinLockAcquire(&XLogCtl->info_lck);
2642         sleeping = XLogCtl->WalWriterSleeping;
2643         prevAsyncXactLSN = XLogCtl->asyncXactLSN;
2644         if (XLogCtl->asyncXactLSN < asyncXactLSN)
2645                 XLogCtl->asyncXactLSN = asyncXactLSN;
2646         SpinLockRelease(&XLogCtl->info_lck);
2647
2648         /*
2649          * If somebody else already called this function with a more aggressive
2650          * LSN, they will have done what we needed (and perhaps more).
2651          */
2652         if (asyncXactLSN <= prevAsyncXactLSN)
2653                 return;
2654
2655         /*
2656          * If the WALWriter is sleeping, kick it to make it come out of low-power
2657          * mode, so that this async commit will reach disk within the expected
2658          * amount of time.  Otherwise, determine whether it has enough WAL
2659          * available to flush, the same way that XLogBackgroundFlush() does.
2660          */
2661         if (sleeping)
2662                 wakeup = true;
2663         else
2664         {
2665                 int                     flushblocks;
2666
2667                 RefreshXLogWriteResult(LogwrtResult);
2668
2669                 flushblocks =
2670                         WriteRqstPtr / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
2671
2672                 if (WalWriterFlushAfter == 0 || flushblocks >= WalWriterFlushAfter)
2673                         wakeup = true;
2674         }
2675
2676         if (wakeup)
2677         {
2678                 volatile PROC_HDR *procglobal = ProcGlobal;
2679                 ProcNumber      walwriterProc = procglobal->walwriterProc;
2680
2681                 if (walwriterProc != INVALID_PROC_NUMBER)
2682                         SetLatch(&GetPGProcByNumber(walwriterProc)->procLatch);
2683         }
2684 }
2685
2686 /*
2687  * Record the LSN up to which we can remove WAL because it's not required by
2688  * any replication slot.
2689  */
2690 void
2691 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2692 {
2693         SpinLockAcquire(&XLogCtl->info_lck);
2694         XLogCtl->replicationSlotMinLSN = lsn;
2695         SpinLockRelease(&XLogCtl->info_lck);
2696 }
2697
2698
2699 /*
2700  * Return the oldest LSN we must retain to satisfy the needs of some
2701  * replication slot.
2702  */
2703 static XLogRecPtr
2704 XLogGetReplicationSlotMinimumLSN(void)
2705 {
2706         XLogRecPtr      retval;
2707
2708         SpinLockAcquire(&XLogCtl->info_lck);
2709         retval = XLogCtl->replicationSlotMinLSN;
2710         SpinLockRelease(&XLogCtl->info_lck);
2711
2712         return retval;
2713 }
2714
2715 /*
2716  * Advance minRecoveryPoint in control file.
2717  *
2718  * If we crash during recovery, we must reach this point again before the
2719  * database is consistent.
2720  *
2721  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2722  * is only updated if it's not already greater than or equal to 'lsn'.
2723  */
2724 static void
2725 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2726 {
2727         /* Quick check using our local copy of the variable */
2728         if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
2729                 return;
2730
2731         /*
2732          * An invalid minRecoveryPoint means that we need to recover all the WAL,
2733          * i.e., we're doing crash recovery.  We never modify the control file's
2734          * value in that case, so we can short-circuit future checks here too. The
2735          * local values of minRecoveryPoint and minRecoveryPointTLI should not be
2736          * updated until crash recovery finishes.  We only do this for the startup
2737          * process as it should not update its own reference of minRecoveryPoint
2738          * until it has finished crash recovery to make sure that all WAL
2739          * available is replayed in this case.  This also saves from extra locks
2740          * taken on the control file from the startup process.
2741          */
2742         if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
2743         {
2744                 updateMinRecoveryPoint = false;
2745                 return;
2746         }
2747
2748         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2749
2750         /* update local copy */
2751         LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
2752         LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2753
2754         if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
2755                 updateMinRecoveryPoint = false;
2756         else if (force || LocalMinRecoveryPoint < lsn)
2757         {
2758                 XLogRecPtr      newMinRecoveryPoint;
2759                 TimeLineID      newMinRecoveryPointTLI;
2760
2761                 /*
2762                  * To avoid having to update the control file too often, we update it
2763                  * all the way to the last record being replayed, even though 'lsn'
2764                  * would suffice for correctness.  This also allows the 'force' case
2765                  * to not need a valid 'lsn' value.
2766                  *
2767                  * Another important reason for doing it this way is that the passed
2768                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2769                  * the caller got it from a corrupted heap page.  Accepting such a
2770                  * value as the min recovery point would prevent us from coming up at
2771                  * all.  Instead, we just log a warning and continue with recovery.
2772                  * (See also the comments about corrupt LSNs in XLogFlush.)
2773                  */
2774                 newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
2775                 if (!force && newMinRecoveryPoint < lsn)
2776                         elog(WARNING,
2777                                  "xlog min recovery request %X/%X is past current point %X/%X",
2778                                  LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
2779
2780                 /* update control file */
2781                 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2782                 {
2783                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2784                         ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2785                         UpdateControlFile();
2786                         LocalMinRecoveryPoint = newMinRecoveryPoint;
2787                         LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
2788
2789                         ereport(DEBUG2,
2790                                         (errmsg_internal("updated min recovery point to %X/%X on timeline %u",
2791                                                                          LSN_FORMAT_ARGS(newMinRecoveryPoint),
2792                                                                          newMinRecoveryPointTLI)));
2793                 }
2794         }
2795         LWLockRelease(ControlFileLock);
2796 }
2797
2798 /*
2799  * Ensure that all XLOG data through the given position is flushed to disk.
2800  *
2801  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2802  * already held, and we try to avoid acquiring it if possible.
2803  */
2804 void
2805 XLogFlush(XLogRecPtr record)
2806 {
2807         XLogRecPtr      WriteRqstPtr;
2808         XLogwrtRqst WriteRqst;
2809         TimeLineID      insertTLI = XLogCtl->InsertTimeLineID;
2810
2811         /*
2812          * During REDO, we are reading not writing WAL.  Therefore, instead of
2813          * trying to flush the WAL, we should update minRecoveryPoint instead. We
2814          * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2815          * to act this way too, and because when it tries to write the
2816          * end-of-recovery checkpoint, it should indeed flush.
2817          */
2818         if (!XLogInsertAllowed())
2819         {
2820                 UpdateMinRecoveryPoint(record, false);
2821                 return;
2822         }
2823
2824         /* Quick exit if already known flushed */
2825         if (record <= LogwrtResult.Flush)
2826                 return;
2827
2828 #ifdef WAL_DEBUG
2829         if (XLOG_DEBUG)
2830                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2831                          LSN_FORMAT_ARGS(record),
2832                          LSN_FORMAT_ARGS(LogwrtResult.Write),
2833                          LSN_FORMAT_ARGS(LogwrtResult.Flush));
2834 #endif
2835
2836         START_CRIT_SECTION();
2837
2838         /*
2839          * Since fsync is usually a horribly expensive operation, we try to
2840          * piggyback as much data as we can on each fsync: if we see any more data
2841          * entered into the xlog buffer, we'll write and fsync that too, so that
2842          * the final value of LogwrtResult.Flush is as large as possible. This
2843          * gives us some chance of avoiding another fsync immediately after.
2844          */
2845
2846         /* initialize to given target; may increase below */
2847         WriteRqstPtr = record;
2848
2849         /*
2850          * Now wait until we get the write lock, or someone else does the flush
2851          * for us.
2852          */
2853         for (;;)
2854         {
2855                 XLogRecPtr      insertpos;
2856
2857                 /* done already? */
2858                 RefreshXLogWriteResult(LogwrtResult);
2859                 if (record <= LogwrtResult.Flush)
2860                         break;
2861
2862                 /*
2863                  * Before actually performing the write, wait for all in-flight
2864                  * insertions to the pages we're about to write to finish.
2865                  */
2866                 SpinLockAcquire(&XLogCtl->info_lck);
2867                 if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2868                         WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2869                 SpinLockRelease(&XLogCtl->info_lck);
2870                 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2871
2872                 /*
2873                  * Try to get the write lock. If we can't get it immediately, wait
2874                  * until it's released, and recheck if we still need to do the flush
2875                  * or if the backend that held the lock did it for us already. This
2876                  * helps to maintain a good rate of group committing when the system
2877                  * is bottlenecked by the speed of fsyncing.
2878                  */
2879                 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2880                 {
2881                         /*
2882                          * The lock is now free, but we didn't acquire it yet. Before we
2883                          * do, loop back to check if someone else flushed the record for
2884                          * us already.
2885                          */
2886                         continue;
2887                 }
2888
2889                 /* Got the lock; recheck whether request is satisfied */
2890                 RefreshXLogWriteResult(LogwrtResult);
2891                 if (record <= LogwrtResult.Flush)
2892                 {
2893                         LWLockRelease(WALWriteLock);
2894                         break;
2895                 }
2896
2897                 /*
2898                  * Sleep before flush! By adding a delay here, we may give further
2899                  * backends the opportunity to join the backlog of group commit
2900                  * followers; this can significantly improve transaction throughput,
2901                  * at the risk of increasing transaction latency.
2902                  *
2903                  * We do not sleep if enableFsync is not turned on, nor if there are
2904                  * fewer than CommitSiblings other backends with active transactions.
2905                  */
2906                 if (CommitDelay > 0 && enableFsync &&
2907                         MinimumActiveBackends(CommitSiblings))
2908                 {
2909                         pg_usleep(CommitDelay);
2910
2911                         /*
2912                          * Re-check how far we can now flush the WAL. It's generally not
2913                          * safe to call WaitXLogInsertionsToFinish while holding
2914                          * WALWriteLock, because an in-progress insertion might need to
2915                          * also grab WALWriteLock to make progress. But we know that all
2916                          * the insertions up to insertpos have already finished, because
2917                          * that's what the earlier WaitXLogInsertionsToFinish() returned.
2918                          * We're only calling it again to allow insertpos to be moved
2919                          * further forward, not to actually wait for anyone.
2920                          */
2921                         insertpos = WaitXLogInsertionsToFinish(insertpos);
2922                 }
2923
2924                 /* try to write/flush later additions to XLOG as well */
2925                 WriteRqst.Write = insertpos;
2926                 WriteRqst.Flush = insertpos;
2927
2928                 XLogWrite(WriteRqst, insertTLI, false);
2929
2930                 LWLockRelease(WALWriteLock);
2931                 /* done */
2932                 break;
2933         }
2934
2935         END_CRIT_SECTION();
2936
2937         /* wake up walsenders now that we've released heavily contended locks */
2938         WalSndWakeupProcessRequests(true, !RecoveryInProgress());
2939
2940         /*
2941          * If we still haven't flushed to the request point then we have a
2942          * problem; most likely, the requested flush point is past end of XLOG.
2943          * This has been seen to occur when a disk page has a corrupted LSN.
2944          *
2945          * Formerly we treated this as a PANIC condition, but that hurts the
2946          * system's robustness rather than helping it: we do not want to take down
2947          * the whole system due to corruption on one data page.  In particular, if
2948          * the bad page is encountered again during recovery then we would be
2949          * unable to restart the database at all!  (This scenario actually
2950          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2951          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2952          * the only time we can reach here during recovery is while flushing the
2953          * end-of-recovery checkpoint record, and we don't expect that to have a
2954          * bad LSN.
2955          *
2956          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2957          * since xact.c calls this routine inside a critical section.  However,
2958          * calls from bufmgr.c are not within critical sections and so we will not
2959          * force a restart for a bad LSN on a data page.
2960          */
2961         if (LogwrtResult.Flush < record)
2962                 elog(ERROR,
2963                          "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2964                          LSN_FORMAT_ARGS(record),
2965                          LSN_FORMAT_ARGS(LogwrtResult.Flush));
2966 }
2967
2968 /*
2969  * Write & flush xlog, but without specifying exactly where to.
2970  *
2971  * We normally write only completed blocks; but if there is nothing to do on
2972  * that basis, we check for unwritten async commits in the current incomplete
2973  * block, and write through the latest one of those.  Thus, if async commits
2974  * are not being used, we will write complete blocks only.
2975  *
2976  * If, based on the above, there's anything to write we do so immediately. But
2977  * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
2978  * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
2979  * more than wal_writer_flush_after unflushed blocks.
2980  *
2981  * We can guarantee that async commits reach disk after at most three
2982  * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
2983  * to write "flexibly", meaning it can stop at the end of the buffer ring;
2984  * this makes a difference only with very high load or long wal_writer_delay,
2985  * but imposes one extra cycle for the worst case for async commits.)
2986  *
2987  * This routine is invoked periodically by the background walwriter process.
2988  *
2989  * Returns true if there was any work to do, even if we skipped flushing due
2990  * to wal_writer_delay/wal_writer_flush_after.
2991  */
2992 bool
2993 XLogBackgroundFlush(void)
2994 {
2995         XLogwrtRqst WriteRqst;
2996         bool            flexible = true;
2997         static TimestampTz lastflush;
2998         TimestampTz now;
2999         int                     flushblocks;
3000         TimeLineID      insertTLI;
3001
3002         /* XLOG doesn't need flushing during recovery */
3003         if (RecoveryInProgress())
3004                 return false;
3005
3006         /*
3007          * Since we're not in recovery, InsertTimeLineID is set and can't change,
3008          * so we can read it without a lock.
3009          */
3010         insertTLI = XLogCtl->InsertTimeLineID;
3011
3012         /* read updated LogwrtRqst */
3013         SpinLockAcquire(&XLogCtl->info_lck);
3014         WriteRqst = XLogCtl->LogwrtRqst;
3015         SpinLockRelease(&XLogCtl->info_lck);
3016
3017         /* back off to last completed page boundary */
3018         WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
3019
3020         /* if we have already flushed that far, consider async commit records */
3021         RefreshXLogWriteResult(LogwrtResult);
3022         if (WriteRqst.Write <= LogwrtResult.Flush)
3023         {
3024                 SpinLockAcquire(&XLogCtl->info_lck);
3025                 WriteRqst.Write = XLogCtl->asyncXactLSN;
3026                 SpinLockRelease(&XLogCtl->info_lck);
3027                 flexible = false;               /* ensure it all gets written */
3028         }
3029
3030         /*
3031          * If already known flushed, we're done. Just need to check if we are
3032          * holding an open file handle to a logfile that's no longer in use,
3033          * preventing the file from being deleted.
3034          */
3035         if (WriteRqst.Write <= LogwrtResult.Flush)
3036         {
3037                 if (openLogFile >= 0)
3038                 {
3039                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
3040                                                                  wal_segment_size))
3041                         {
3042                                 XLogFileClose();
3043                         }
3044                 }
3045                 return false;
3046         }
3047
3048         /*
3049          * Determine how far to flush WAL, based on the wal_writer_delay and
3050          * wal_writer_flush_after GUCs.
3051          *
3052          * Note that XLogSetAsyncXactLSN() performs similar calculation based on
3053          * wal_writer_flush_after, to decide when to wake us up.  Make sure the
3054          * logic is the same in both places if you change this.
3055          */
3056         now = GetCurrentTimestamp();
3057         flushblocks =
3058                 WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
3059
3060         if (WalWriterFlushAfter == 0 || lastflush == 0)
3061         {
3062                 /* first call, or block based limits disabled */
3063                 WriteRqst.Flush = WriteRqst.Write;
3064                 lastflush = now;
3065         }
3066         else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
3067         {
3068                 /*
3069                  * Flush the writes at least every WalWriterDelay ms. This is
3070                  * important to bound the amount of time it takes for an asynchronous
3071                  * commit to hit disk.
3072                  */
3073                 WriteRqst.Flush = WriteRqst.Write;
3074                 lastflush = now;
3075         }
3076         else if (flushblocks >= WalWriterFlushAfter)
3077         {
3078                 /* exceeded wal_writer_flush_after blocks, flush */
3079                 WriteRqst.Flush = WriteRqst.Write;
3080                 lastflush = now;
3081         }
3082         else
3083         {
3084                 /* no flushing, this time round */
3085                 WriteRqst.Flush = 0;
3086         }
3087
3088 #ifdef WAL_DEBUG
3089         if (XLOG_DEBUG)
3090                 elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
3091                          LSN_FORMAT_ARGS(WriteRqst.Write),
3092                          LSN_FORMAT_ARGS(WriteRqst.Flush),
3093                          LSN_FORMAT_ARGS(LogwrtResult.Write),
3094                          LSN_FORMAT_ARGS(LogwrtResult.Flush));
3095 #endif
3096
3097         START_CRIT_SECTION();
3098
3099         /* now wait for any in-progress insertions to finish and get write lock */
3100         WaitXLogInsertionsToFinish(WriteRqst.Write);
3101         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3102         RefreshXLogWriteResult(LogwrtResult);
3103         if (WriteRqst.Write > LogwrtResult.Write ||
3104                 WriteRqst.Flush > LogwrtResult.Flush)
3105         {
3106                 XLogWrite(WriteRqst, insertTLI, flexible);
3107         }
3108         LWLockRelease(WALWriteLock);
3109
3110         END_CRIT_SECTION();
3111
3112         /* wake up walsenders now that we've released heavily contended locks */
3113         WalSndWakeupProcessRequests(true, !RecoveryInProgress());
3114
3115         /*
3116          * Great, done. To take some work off the critical path, try to initialize
3117          * as many of the no-longer-needed WAL buffers for future use as we can.
3118          */
3119         AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
3120
3121         /*
3122          * If we determined that we need to write data, but somebody else
3123          * wrote/flushed already, it should be considered as being active, to
3124          * avoid hibernating too early.
3125          */
3126         return true;
3127 }
3128
3129 /*
3130  * Test whether XLOG data has been flushed up to (at least) the given position.
3131  *
3132  * Returns true if a flush is still needed.  (It may be that someone else
3133  * is already in process of flushing that far, however.)
3134  */
3135 bool
3136 XLogNeedsFlush(XLogRecPtr record)
3137 {
3138         /*
3139          * During recovery, we don't flush WAL but update minRecoveryPoint
3140          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3141          * would need to be updated.
3142          */
3143         if (RecoveryInProgress())
3144         {
3145                 /*
3146                  * An invalid minRecoveryPoint means that we need to recover all the
3147                  * WAL, i.e., we're doing crash recovery.  We never modify the control
3148                  * file's value in that case, so we can short-circuit future checks
3149                  * here too.  This triggers a quick exit path for the startup process,
3150                  * which cannot update its local copy of minRecoveryPoint as long as
3151                  * it has not replayed all WAL available when doing crash recovery.
3152                  */
3153                 if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
3154                         updateMinRecoveryPoint = false;
3155
3156                 /* Quick exit if already known to be updated or cannot be updated */
3157                 if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
3158                         return false;
3159
3160                 /*
3161                  * Update local copy of minRecoveryPoint. But if the lock is busy,
3162                  * just return a conservative guess.
3163                  */
3164                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3165                         return true;
3166                 LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
3167                 LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3168                 LWLockRelease(ControlFileLock);
3169
3170                 /*
3171                  * Check minRecoveryPoint for any other process than the startup
3172                  * process doing crash recovery, which should not update the control
3173                  * file value if crash recovery is still running.
3174                  */
3175                 if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
3176                         updateMinRecoveryPoint = false;
3177
3178                 /* check again */
3179                 if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
3180                         return false;
3181                 else
3182                         return true;
3183         }
3184
3185         /* Quick exit if already known flushed */
3186         if (record <= LogwrtResult.Flush)
3187                 return false;
3188
3189         /* read LogwrtResult and update local state */
3190         RefreshXLogWriteResult(LogwrtResult);
3191
3192         /* check again */
3193         if (record <= LogwrtResult.Flush)
3194                 return false;
3195
3196         return true;
3197 }
3198
3199 /*
3200  * Try to make a given XLOG file segment exist.
3201  *
3202  * logsegno: identify segment.
3203  *
3204  * *added: on return, true if this call raised the number of extant segments.
3205  *
3206  * path: on return, this char[MAXPGPATH] has the path to the logsegno file.
3207  *
3208  * Returns -1 or FD of opened file.  A -1 here is not an error; a caller
3209  * wanting an open segment should attempt to open "path", which usually will
3210  * succeed.  (This is weird, but it's efficient for the callers.)
3211  */
3212 static int
3213 XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
3214                                          bool *added, char *path)
3215 {
3216         char            tmppath[MAXPGPATH];
3217         XLogSegNo       installed_segno;
3218         XLogSegNo       max_segno;
3219         int                     fd;
3220         int                     save_errno;
3221         int                     open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
3222         instr_time      io_start;
3223
3224         Assert(logtli != 0);
3225
3226         XLogFilePath(path, logtli, logsegno, wal_segment_size);
3227
3228         /*
3229          * Try to use existent file (checkpoint maker may have created it already)
3230          */
3231         *added = false;
3232         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3233                                            get_sync_bit(wal_sync_method));
3234         if (fd < 0)
3235         {
3236                 if (errno != ENOENT)
3237                         ereport(ERROR,
3238                                         (errcode_for_file_access(),
3239                                          errmsg("could not open file \"%s\": %m", path)));
3240         }
3241         else
3242                 return fd;
3243
3244         /*
3245          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3246          * another process is doing the same thing.  If so, we will end up
3247          * pre-creating an extra log segment.  That seems OK, and better than
3248          * holding the lock throughout this lengthy process.
3249          */
3250         elog(DEBUG2, "creating and filling new WAL file");
3251
3252         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3253
3254         unlink(tmppath);
3255
3256         if (io_direct_flags & IO_DIRECT_WAL_INIT)
3257                 open_flags |= PG_O_DIRECT;
3258
3259         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3260         fd = BasicOpenFile(tmppath, open_flags);
3261         if (fd < 0)
3262                 ereport(ERROR,
3263                                 (errcode_for_file_access(),
3264                                  errmsg("could not create file \"%s\": %m", tmppath)));
3265
3266         /* Measure I/O timing when initializing segment */
3267         io_start = pgstat_prepare_io_time(track_io_timing);
3268
3269         pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3270         save_errno = 0;
3271         if (wal_init_zero)
3272         {
3273                 ssize_t         rc;
3274
3275                 /*
3276                  * Zero-fill the file.  With this setting, we do this the hard way to
3277                  * ensure that all the file space has really been allocated.  On
3278                  * platforms that allow "holes" in files, just seeking to the end
3279                  * doesn't allocate intermediate space.  This way, we know that we
3280                  * have all the space and (after the fsync below) that all the
3281                  * indirect blocks are down on disk.  Therefore, fdatasync(2) or
3282                  * O_DSYNC will be sufficient to sync future writes to the log file.
3283                  */
3284                 rc = pg_pwrite_zeros(fd, wal_segment_size, 0);
3285
3286                 if (rc < 0)
3287                         save_errno = errno;
3288         }
3289         else
3290         {
3291                 /*
3292                  * Otherwise, seeking to the end and writing a solitary byte is
3293                  * enough.
3294                  */
3295                 errno = 0;
3296                 if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
3297                 {
3298                         /* if write didn't set errno, assume no disk space */
3299                         save_errno = errno ? errno : ENOSPC;
3300                 }
3301         }
3302         pgstat_report_wait_end();
3303
3304         /*
3305          * A full segment worth of data is written when using wal_init_zero. One
3306          * byte is written when not using it.
3307          */
3308         pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE,
3309                                                         io_start, 1,
3310                                                         wal_init_zero ? wal_segment_size : 1);
3311
3312         if (save_errno)
3313         {
3314                 /*
3315                  * If we fail to make the file, delete it to release disk space
3316                  */
3317                 unlink(tmppath);
3318
3319                 close(fd);
3320
3321                 errno = save_errno;
3322
3323                 ereport(ERROR,
3324                                 (errcode_for_file_access(),
3325                                  errmsg("could not write to file \"%s\": %m", tmppath)));
3326         }
3327
3328         /* Measure I/O timing when flushing segment */
3329         io_start = pgstat_prepare_io_time(track_io_timing);
3330
3331         pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3332         if (pg_fsync(fd) != 0)
3333         {
3334                 save_errno = errno;
3335                 close(fd);
3336                 errno = save_errno;
3337                 ereport(ERROR,
3338                                 (errcode_for_file_access(),
3339                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3340         }
3341         pgstat_report_wait_end();
3342
3343         pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT,
3344                                                         IOOP_FSYNC, io_start, 1, 0);
3345
3346         if (close(fd) != 0)
3347                 ereport(ERROR,
3348                                 (errcode_for_file_access(),
3349                                  errmsg("could not close file \"%s\": %m", tmppath)));
3350
3351         /*
3352          * Now move the segment into place with its final name.  Cope with
3353          * possibility that someone else has created the file while we were
3354          * filling ours: if so, use ours to pre-create a future log segment.
3355          */
3356         installed_segno = logsegno;
3357
3358         /*
3359          * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3360          * that was a constant, but that was always a bit dubious: normally, at a
3361          * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3362          * here, it was the offset from the insert location. We can't do the
3363          * normal XLOGfileslop calculation here because we don't have access to
3364          * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3365          * CheckPointSegments.
3366          */
3367         max_segno = logsegno + CheckPointSegments;
3368         if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
3369                                                            logtli))
3370         {
3371                 *added = true;
3372                 elog(DEBUG2, "done creating and filling new WAL file");
3373         }
3374         else
3375         {
3376                 /*
3377                  * No need for any more future segments, or InstallXLogFileSegment()
3378                  * failed to rename the file into place. If the rename failed, a
3379                  * caller opening the file may fail.
3380                  */
3381                 unlink(tmppath);
3382                 elog(DEBUG2, "abandoned new WAL file");
3383         }
3384
3385         return -1;
3386 }
3387
3388 /*
3389  * Create a new XLOG file segment, or open a pre-existing one.
3390  *
3391  * logsegno: identify segment to be created/opened.
3392  *
3393  * Returns FD of opened file.
3394  *
3395  * Note: errors here are ERROR not PANIC because we might or might not be
3396  * inside a critical section (eg, during checkpoint there is no reason to
3397  * take down the system on failure).  They will promote to PANIC if we are
3398  * in a critical section.
3399  */
3400 int
3401 XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
3402 {
3403         bool            ignore_added;
3404         char            path[MAXPGPATH];
3405         int                     fd;
3406
3407         Assert(logtli != 0);
3408
3409         fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
3410         if (fd >= 0)
3411                 return fd;
3412
3413         /* Now open original target segment (might not be file I just made) */
3414         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3415                                            get_sync_bit(wal_sync_method));
3416         if (fd < 0)
3417                 ereport(ERROR,
3418                                 (errcode_for_file_access(),
3419                                  errmsg("could not open file \"%s\": %m", path)));
3420         return fd;
3421 }
3422
3423 /*
3424  * Create a new XLOG file segment by copying a pre-existing one.
3425  *
3426  * destsegno: identify segment to be created.
3427  *
3428  * srcTLI, srcsegno: identify segment to be copied (could be from
3429  *              a different timeline)
3430  *
3431  * upto: how much of the source file to copy (the rest is filled with
3432  *              zeros)
3433  *
3434  * Currently this is only used during recovery, and so there are no locking
3435  * considerations.  But we should be just as tense as XLogFileInit to avoid
3436  * emplacing a bogus file.
3437  */
3438 static void
3439 XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
3440                          TimeLineID srcTLI, XLogSegNo srcsegno,
3441                          int upto)
3442 {
3443         char            path[MAXPGPATH];
3444         char            tmppath[MAXPGPATH];
3445         PGAlignedXLogBlock buffer;
3446         int                     srcfd;
3447         int                     fd;
3448         int                     nbytes;
3449
3450         /*
3451          * Open the source file
3452          */
3453         XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3454         srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
3455         if (srcfd < 0)
3456                 ereport(ERROR,
3457                                 (errcode_for_file_access(),
3458                                  errmsg("could not open file \"%s\": %m", path)));
3459
3460         /*
3461          * Copy into a temp file name.
3462          */
3463         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3464
3465         unlink(tmppath);
3466
3467         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3468         fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3469         if (fd < 0)
3470                 ereport(ERROR,
3471                                 (errcode_for_file_access(),
3472                                  errmsg("could not create file \"%s\": %m", tmppath)));
3473
3474         /*
3475          * Do the data copying.
3476          */
3477         for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3478         {
3479                 int                     nread;
3480
3481                 nread = upto - nbytes;
3482
3483                 /*
3484                  * The part that is not read from the source file is filled with
3485                  * zeros.
3486                  */
3487                 if (nread < sizeof(buffer))
3488                         memset(buffer.data, 0, sizeof(buffer));
3489
3490                 if (nread > 0)
3491                 {
3492                         int                     r;
3493
3494                         if (nread > sizeof(buffer))
3495                                 nread = sizeof(buffer);
3496                         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3497                         r = read(srcfd, buffer.data, nread);
3498                         if (r != nread)
3499                         {
3500                                 if (r < 0)
3501                                         ereport(ERROR,
3502                                                         (errcode_for_file_access(),
3503                                                          errmsg("could not read file \"%s\": %m",
3504                                                                         path)));
3505                                 else
3506                                         ereport(ERROR,
3507                                                         (errcode(ERRCODE_DATA_CORRUPTED),
3508                                                          errmsg("could not read file \"%s\": read %d of %zu",
3509                                                                         path, r, (Size) nread)));
3510                         }
3511                         pgstat_report_wait_end();
3512                 }
3513                 errno = 0;
3514                 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3515                 if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3516                 {
3517                         int                     save_errno = errno;
3518
3519                         /*
3520                          * If we fail to make the file, delete it to release disk space
3521                          */
3522                         unlink(tmppath);
3523                         /* if write didn't set errno, assume problem is no disk space */
3524                         errno = save_errno ? save_errno : ENOSPC;
3525
3526                         ereport(ERROR,
3527                                         (errcode_for_file_access(),
3528                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3529                 }
3530                 pgstat_report_wait_end();
3531         }
3532
3533         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3534         if (pg_fsync(fd) != 0)
3535                 ereport(data_sync_elevel(ERROR),
3536                                 (errcode_for_file_access(),
3537                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3538         pgstat_report_wait_end();
3539
3540         if (CloseTransientFile(fd) != 0)
3541                 ereport(ERROR,
3542                                 (errcode_for_file_access(),
3543                                  errmsg("could not close file \"%s\": %m", tmppath)));
3544
3545         if (CloseTransientFile(srcfd) != 0)
3546                 ereport(ERROR,
3547                                 (errcode_for_file_access(),
3548                                  errmsg("could not close file \"%s\": %m", path)));
3549
3550         /*
3551          * Now move the segment into place with its final name.
3552          */
3553         if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
3554                 elog(ERROR, "InstallXLogFileSegment should not have failed");
3555 }
3556
3557 /*
3558  * Install a new XLOG segment file as a current or future log segment.
3559  *
3560  * This is used both to install a newly-created segment (which has a temp
3561  * filename while it's being created) and to recycle an old segment.
3562  *
3563  * *segno: identify segment to install as (or first possible target).
3564  * When find_free is true, this is modified on return to indicate the
3565  * actual installation location or last segment searched.
3566  *
3567  * tmppath: initial name of file to install.  It will be renamed into place.
3568  *
3569  * find_free: if true, install the new segment at the first empty segno
3570  * number at or after the passed numbers.  If false, install the new segment
3571  * exactly where specified, deleting any existing segment file there.
3572  *
3573  * max_segno: maximum segment number to install the new file as.  Fail if no
3574  * free slot is found between *segno and max_segno. (Ignored when find_free
3575  * is false.)
3576  *
3577  * tli: The timeline on which the new segment should be installed.
3578  *
3579  * Returns true if the file was installed successfully.  false indicates that
3580  * max_segno limit was exceeded, the startup process has disabled this
3581  * function for now, or an error occurred while renaming the file into place.
3582  */
3583 static bool
3584 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3585                                            bool find_free, XLogSegNo max_segno, TimeLineID tli)
3586 {
3587         char            path[MAXPGPATH];
3588         struct stat stat_buf;
3589
3590         Assert(tli != 0);
3591
3592         XLogFilePath(path, tli, *segno, wal_segment_size);
3593
3594         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3595         if (!XLogCtl->InstallXLogFileSegmentActive)
3596         {
3597                 LWLockRelease(ControlFileLock);
3598                 return false;
3599         }
3600
3601         if (!find_free)
3602         {
3603                 /* Force installation: get rid of any pre-existing segment file */
3604                 durable_unlink(path, DEBUG1);
3605         }
3606         else
3607         {
3608                 /* Find a free slot to put it in */
3609                 while (stat(path, &stat_buf) == 0)
3610                 {
3611                         if ((*segno) >= max_segno)
3612                         {
3613                                 /* Failed to find a free slot within specified range */
3614                                 LWLockRelease(ControlFileLock);
3615                                 return false;
3616                         }
3617                         (*segno)++;
3618                         XLogFilePath(path, tli, *segno, wal_segment_size);
3619                 }
3620         }
3621
3622         Assert(access(path, F_OK) != 0 && errno == ENOENT);
3623         if (durable_rename(tmppath, path, LOG) != 0)
3624         {
3625                 LWLockRelease(ControlFileLock);
3626                 /* durable_rename already emitted log message */
3627                 return false;
3628         }
3629
3630         LWLockRelease(ControlFileLock);
3631
3632         return true;
3633 }
3634
3635 /*
3636  * Open a pre-existing logfile segment for writing.
3637  */
3638 int
3639 XLogFileOpen(XLogSegNo segno, TimeLineID tli)
3640 {
3641         char            path[MAXPGPATH];
3642         int                     fd;
3643
3644         XLogFilePath(path, tli, segno, wal_segment_size);
3645
3646         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3647                                            get_sync_bit(wal_sync_method));
3648         if (fd < 0)
3649                 ereport(PANIC,
3650                                 (errcode_for_file_access(),
3651                                  errmsg("could not open file \"%s\": %m", path)));
3652
3653         return fd;
3654 }
3655
3656 /*
3657  * Close the current logfile segment for writing.
3658  */
3659 static void
3660 XLogFileClose(void)
3661 {
3662         Assert(openLogFile >= 0);
3663
3664         /*
3665          * WAL segment files will not be re-read in normal operation, so we advise
3666          * the OS to release any cached pages.  But do not do so if WAL archiving
3667          * or streaming is active, because archiver and walsender process could
3668          * use the cache to read the WAL segment.
3669          */
3670 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3671         if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
3672                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3673 #endif
3674
3675         if (close(openLogFile) != 0)
3676         {
3677                 char            xlogfname[MAXFNAMELEN];
3678                 int                     save_errno = errno;
3679
3680                 XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
3681                 errno = save_errno;
3682                 ereport(PANIC,
3683                                 (errcode_for_file_access(),
3684                                  errmsg("could not close file \"%s\": %m", xlogfname)));
3685         }
3686
3687         openLogFile = -1;
3688         ReleaseExternalFD();
3689 }
3690
3691 /*
3692  * Preallocate log files beyond the specified log endpoint.
3693  *
3694  * XXX this is currently extremely conservative, since it forces only one
3695  * future log segment to exist, and even that only if we are 75% done with
3696  * the current one.  This is only appropriate for very low-WAL-volume systems.
3697  * High-volume systems will be OK once they've built up a sufficient set of
3698  * recycled log segments, but the startup transient is likely to include
3699  * a lot of segment creations by foreground processes, which is not so good.
3700  *
3701  * XLogFileInitInternal() can ereport(ERROR).  All known causes indicate big
3702  * trouble; for example, a full filesystem is one cause.  The checkpoint WAL
3703  * and/or ControlFile updates already completed.  If a RequestCheckpoint()
3704  * initiated the present checkpoint and an ERROR ends this function, the
3705  * command that called RequestCheckpoint() fails.  That's not ideal, but it's
3706  * not worth contorting more functions to use caller-specified elevel values.
3707  * (With or without RequestCheckpoint(), an ERROR forestalls some inessential
3708  * reporting and resource reclamation.)
3709  */
3710 static void
3711 PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
3712 {
3713         XLogSegNo       _logSegNo;
3714         int                     lf;
3715         bool            added;
3716         char            path[MAXPGPATH];
3717         uint64          offset;
3718
3719         if (!XLogCtl->InstallXLogFileSegmentActive)
3720                 return;                                 /* unlocked check says no */
3721
3722         XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3723         offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3724         if (offset >= (uint32) (0.75 * wal_segment_size))
3725         {
3726                 _logSegNo++;
3727                 lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
3728                 if (lf >= 0)
3729                         close(lf);
3730                 if (added)
3731                         CheckpointStats.ckpt_segs_added++;
3732         }
3733 }
3734
3735 /*
3736  * Throws an error if the given log segment has already been removed or
3737  * recycled. The caller should only pass a segment that it knows to have
3738  * existed while the server has been running, as this function always
3739  * succeeds if no WAL segments have been removed since startup.
3740  * 'tli' is only used in the error message.
3741  *
3742  * Note: this function guarantees to keep errno unchanged on return.
3743  * This supports callers that use this to possibly deliver a better
3744  * error message about a missing file, while still being able to throw
3745  * a normal file-access error afterwards, if this does return.
3746  */
3747 void
3748 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3749 {
3750         int                     save_errno = errno;
3751         XLogSegNo       lastRemovedSegNo;
3752
3753         SpinLockAcquire(&XLogCtl->info_lck);
3754         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3755         SpinLockRelease(&XLogCtl->info_lck);
3756
3757         if (segno <= lastRemovedSegNo)
3758         {
3759                 char            filename[MAXFNAMELEN];
3760
3761                 XLogFileName(filename, tli, segno, wal_segment_size);
3762                 errno = save_errno;
3763                 ereport(ERROR,
3764                                 (errcode_for_file_access(),
3765                                  errmsg("requested WAL segment %s has already been removed",
3766                                                 filename)));
3767         }
3768         errno = save_errno;
3769 }
3770
3771 /*
3772  * Return the last WAL segment removed, or 0 if no segment has been removed
3773  * since startup.
3774  *
3775  * NB: the result can be out of date arbitrarily fast, the caller has to deal
3776  * with that.
3777  */
3778 XLogSegNo
3779 XLogGetLastRemovedSegno(void)
3780 {
3781         XLogSegNo       lastRemovedSegNo;
3782
3783         SpinLockAcquire(&XLogCtl->info_lck);
3784         lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3785         SpinLockRelease(&XLogCtl->info_lck);
3786
3787         return lastRemovedSegNo;
3788 }
3789
3790 /*
3791  * Return the oldest WAL segment on the given TLI that still exists in
3792  * XLOGDIR, or 0 if none.
3793  */
3794 XLogSegNo
3795 XLogGetOldestSegno(TimeLineID tli)
3796 {
3797         DIR                *xldir;
3798         struct dirent *xlde;
3799         XLogSegNo       oldest_segno = 0;
3800
3801         xldir = AllocateDir(XLOGDIR);
3802         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3803         {
3804                 TimeLineID      file_tli;
3805                 XLogSegNo       file_segno;
3806
3807                 /* Ignore files that are not XLOG segments. */
3808                 if (!IsXLogFileName(xlde->d_name))
3809                         continue;
3810
3811                 /* Parse filename to get TLI and segno. */
3812                 XLogFromFileName(xlde->d_name, &file_tli, &file_segno,
3813                                                  wal_segment_size);
3814
3815                 /* Ignore anything that's not from the TLI of interest. */
3816                 if (tli != file_tli)
3817                         continue;
3818
3819                 /* If it's the oldest so far, update oldest_segno. */
3820                 if (oldest_segno == 0 || file_segno < oldest_segno)
3821                         oldest_segno = file_segno;
3822         }
3823
3824         FreeDir(xldir);
3825         return oldest_segno;
3826 }
3827
3828 /*
3829  * Update the last removed segno pointer in shared memory, to reflect that the
3830  * given XLOG file has been removed.
3831  */
3832 static void
3833 UpdateLastRemovedPtr(char *filename)
3834 {
3835         uint32          tli;
3836         XLogSegNo       segno;
3837
3838         XLogFromFileName(filename, &tli, &segno, wal_segment_size);
3839
3840         SpinLockAcquire(&XLogCtl->info_lck);
3841         if (segno > XLogCtl->lastRemovedSegNo)
3842                 XLogCtl->lastRemovedSegNo = segno;
3843         SpinLockRelease(&XLogCtl->info_lck);
3844 }
3845
3846 /*
3847  * Remove all temporary log files in pg_wal
3848  *
3849  * This is called at the beginning of recovery after a previous crash,
3850  * at a point where no other processes write fresh WAL data.
3851  */
3852 static void
3853 RemoveTempXlogFiles(void)
3854 {
3855         DIR                *xldir;
3856         struct dirent *xlde;
3857
3858         elog(DEBUG2, "removing all temporary WAL segments");
3859
3860         xldir = AllocateDir(XLOGDIR);
3861         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3862         {
3863                 char            path[MAXPGPATH];
3864
3865                 if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
3866                         continue;
3867
3868                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3869                 unlink(path);
3870                 elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
3871         }
3872         FreeDir(xldir);
3873 }
3874
3875 /*
3876  * Recycle or remove all log files older or equal to passed segno.
3877  *
3878  * endptr is current (or recent) end of xlog, and lastredoptr is the
3879  * redo pointer of the last checkpoint. These are used to determine
3880  * whether we want to recycle rather than delete no-longer-wanted log files.
3881  *
3882  * insertTLI is the current timeline for XLOG insertion. Any recycled
3883  * segments should be reused for this timeline.
3884  */
3885 static void
3886 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
3887                                    TimeLineID insertTLI)
3888 {
3889         DIR                *xldir;
3890         struct dirent *xlde;
3891         char            lastoff[MAXFNAMELEN];
3892         XLogSegNo       endlogSegNo;
3893         XLogSegNo       recycleSegNo;
3894
3895         /* Initialize info about where to try to recycle to */
3896         XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
3897         recycleSegNo = XLOGfileslop(lastredoptr);
3898
3899         /*
3900          * Construct a filename of the last segment to be kept. The timeline ID
3901          * doesn't matter, we ignore that in the comparison. (During recovery,
3902          * InsertTimeLineID isn't set, so we can't use that.)
3903          */
3904         XLogFileName(lastoff, 0, segno, wal_segment_size);
3905
3906         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3907                  lastoff);
3908
3909         xldir = AllocateDir(XLOGDIR);
3910
3911         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3912         {
3913                 /* Ignore files that are not XLOG segments */
3914                 if (!IsXLogFileName(xlde->d_name) &&
3915                         !IsPartialXLogFileName(xlde->d_name))
3916                         continue;
3917
3918                 /*
3919                  * We ignore the timeline part of the XLOG segment identifiers in
3920                  * deciding whether a segment is still needed.  This ensures that we
3921                  * won't prematurely remove a segment from a parent timeline. We could
3922                  * probably be a little more proactive about removing segments of
3923                  * non-parent timelines, but that would be a whole lot more
3924                  * complicated.
3925                  *
3926                  * We use the alphanumeric sorting property of the filenames to decide
3927                  * which ones are earlier than the lastoff segment.
3928                  */
3929                 if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3930                 {
3931                         if (XLogArchiveCheckDone(xlde->d_name))
3932                         {
3933                                 /* Update the last removed location in shared memory first */
3934                                 UpdateLastRemovedPtr(xlde->d_name);
3935
3936                                 RemoveXlogFile(xlde, recycleSegNo, &endlogSegNo, insertTLI);
3937                         }
3938                 }
3939         }
3940
3941         FreeDir(xldir);
3942 }
3943
3944 /*
3945  * Recycle or remove WAL files that are not part of the given timeline's
3946  * history.
3947  *
3948  * This is called during recovery, whenever we switch to follow a new
3949  * timeline, and at the end of recovery when we create a new timeline. We
3950  * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
3951  * might be leftover pre-allocated or recycled WAL segments on the old timeline
3952  * that we haven't used yet, and contain garbage. If we just leave them in
3953  * pg_wal, they will eventually be archived, and we can't let that happen.
3954  * Files that belong to our timeline history are valid, because we have
3955  * successfully replayed them, but from others we can't be sure.
3956  *
3957  * 'switchpoint' is the current point in WAL where we switch to new timeline,
3958  * and 'newTLI' is the new timeline we switch to.
3959  */
3960 void
3961 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
3962 {
3963         DIR                *xldir;
3964         struct dirent *xlde;
3965         char            switchseg[MAXFNAMELEN];
3966         XLogSegNo       endLogSegNo;
3967         XLogSegNo       switchLogSegNo;
3968         XLogSegNo       recycleSegNo;
3969
3970         /*
3971          * Initialize info about where to begin the work.  This will recycle,
3972          * somewhat arbitrarily, 10 future segments.
3973          */
3974         XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
3975         XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
3976         recycleSegNo = endLogSegNo + 10;
3977
3978         /*
3979          * Construct a filename of the last segment to be kept.
3980          */
3981         XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
3982
3983         elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
3984                  switchseg);
3985
3986         xldir = AllocateDir(XLOGDIR);
3987
3988         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3989         {
3990                 /* Ignore files that are not XLOG segments */
3991                 if (!IsXLogFileName(xlde->d_name))
3992                         continue;
3993
3994                 /*
3995                  * Remove files that are on a timeline older than the new one we're
3996                  * switching to, but with a segment number >= the first segment on the
3997                  * new timeline.
3998                  */
3999                 if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
4000                         strcmp(xlde->d_name + 8, switchseg + 8) > 0)
4001                 {
4002                         /*
4003                          * If the file has already been marked as .ready, however, don't
4004                          * remove it yet. It should be OK to remove it - files that are
4005                          * not part of our timeline history are not required for recovery
4006                          * - but seems safer to let them be archived and removed later.
4007                          */
4008                         if (!XLogArchiveIsReady(xlde->d_name))
4009                                 RemoveXlogFile(xlde, recycleSegNo, &endLogSegNo, newTLI);
4010                 }
4011         }
4012
4013         FreeDir(xldir);
4014 }
4015
4016 /*
4017  * Recycle or remove a log file that's no longer needed.
4018  *
4019  * segment_de is the dirent structure of the segment to recycle or remove.
4020  * recycleSegNo is the segment number to recycle up to.  endlogSegNo is
4021  * the segment number of the current (or recent) end of WAL.
4022  *
4023  * endlogSegNo gets incremented if the segment is recycled so as it is not
4024  * checked again with future callers of this function.
4025  *
4026  * insertTLI is the current timeline for XLOG insertion. Any recycled segments
4027  * should be used for this timeline.
4028  */
4029 static void
4030 RemoveXlogFile(const struct dirent *segment_de,
4031                            XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
4032                            TimeLineID insertTLI)
4033 {
4034         char            path[MAXPGPATH];
4035 #ifdef WIN32
4036         char            newpath[MAXPGPATH];
4037 #endif
4038         const char *segname = segment_de->d_name;
4039
4040         snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
4041
4042         /*
4043          * Before deleting the file, see if it can be recycled as a future log
4044          * segment. Only recycle normal files, because we don't want to recycle
4045          * symbolic links pointing to a separate archive directory.
4046          */
4047         if (wal_recycle &&
4048                 *endlogSegNo <= recycleSegNo &&
4049                 XLogCtl->InstallXLogFileSegmentActive &&        /* callee rechecks this */
4050                 get_dirent_type(path, segment_de, false, DEBUG2) == PGFILETYPE_REG &&
4051                 InstallXLogFileSegment(endlogSegNo, path,
4052                                                            true, recycleSegNo, insertTLI))
4053         {
4054                 ereport(DEBUG2,
4055                                 (errmsg_internal("recycled write-ahead log file \"%s\"",
4056                                                                  segname)));
4057                 CheckpointStats.ckpt_segs_recycled++;
4058                 /* Needn't recheck that slot on future iterations */
4059                 (*endlogSegNo)++;
4060         }
4061         else
4062         {
4063                 /* No need for any more future segments, or recycling failed ... */
4064                 int                     rc;
4065
4066                 ereport(DEBUG2,
4067                                 (errmsg_internal("removing write-ahead log file \"%s\"",
4068                                                                  segname)));
4069
4070 #ifdef WIN32
4071
4072                 /*
4073                  * On Windows, if another process (e.g another backend) holds the file
4074                  * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4075                  * will still show up in directory listing until the last handle is
4076                  * closed. To avoid confusing the lingering deleted file for a live
4077                  * WAL file that needs to be archived, rename it before deleting it.
4078                  *
4079                  * If another process holds the file open without FILE_SHARE_DELETE
4080                  * flag, rename will fail. We'll try again at the next checkpoint.
4081                  */
4082                 snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4083                 if (rename(path, newpath) != 0)
4084                 {
4085                         ereport(LOG,
4086                                         (errcode_for_file_access(),
4087                                          errmsg("could not rename file \"%s\": %m",
4088                                                         path)));
4089                         return;
4090                 }
4091                 rc = durable_unlink(newpath, LOG);
4092 #else
4093                 rc = durable_unlink(path, LOG);
4094 #endif
4095                 if (rc != 0)
4096                 {
4097                         /* Message already logged by durable_unlink() */
4098                         return;
4099                 }
4100                 CheckpointStats.ckpt_segs_removed++;
4101         }
4102
4103         XLogArchiveCleanup(segname);
4104 }
4105
4106 /*
4107  * Verify whether pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
4108  * If the latter do not exist, recreate them.
4109  *
4110  * It is not the goal of this function to verify the contents of these
4111  * directories, but to help in cases where someone has performed a cluster
4112  * copy for PITR purposes but omitted pg_wal from the copy.
4113  *
4114  * We could also recreate pg_wal if it doesn't exist, but a deliberate
4115  * policy decision was made not to.  It is fairly common for pg_wal to be
4116  * a symlink, and if that was the DBA's intent then automatically making a
4117  * plain directory would result in degraded performance with no notice.
4118  */
4119 static void
4120 ValidateXLOGDirectoryStructure(void)
4121 {
4122         char            path[MAXPGPATH];
4123         struct stat stat_buf;
4124
4125         /* Check for pg_wal; if it doesn't exist, error out */
4126         if (stat(XLOGDIR, &stat_buf) != 0 ||
4127                 !S_ISDIR(stat_buf.st_mode))
4128                 ereport(FATAL,
4129                                 (errcode_for_file_access(),
4130                                  errmsg("required WAL directory \"%s\" does not exist",
4131                                                 XLOGDIR)));
4132
4133         /* Check for archive_status */
4134         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4135         if (stat(path, &stat_buf) == 0)
4136         {
4137                 /* Check for weird cases where it exists but isn't a directory */
4138                 if (!S_ISDIR(stat_buf.st_mode))
4139                         ereport(FATAL,
4140                                         (errcode_for_file_access(),
4141                                          errmsg("required WAL directory \"%s\" does not exist",
4142                                                         path)));
4143         }
4144         else
4145         {
4146                 ereport(LOG,
4147                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4148                 if (MakePGDirectory(path) < 0)
4149                         ereport(FATAL,
4150                                         (errcode_for_file_access(),
4151                                          errmsg("could not create missing directory \"%s\": %m",
4152                                                         path)));
4153         }
4154
4155         /* Check for summaries */
4156         snprintf(path, MAXPGPATH, XLOGDIR "/summaries");
4157         if (stat(path, &stat_buf) == 0)
4158         {
4159                 /* Check for weird cases where it exists but isn't a directory */
4160                 if (!S_ISDIR(stat_buf.st_mode))
4161                         ereport(FATAL,
4162                                         (errmsg("required WAL directory \"%s\" does not exist",
4163                                                         path)));
4164         }
4165         else
4166         {
4167                 ereport(LOG,
4168                                 (errmsg("creating missing WAL directory \"%s\"", path)));
4169                 if (MakePGDirectory(path) < 0)
4170                         ereport(FATAL,
4171                                         (errmsg("could not create missing directory \"%s\": %m",
4172                                                         path)));
4173         }
4174 }
4175
4176 /*
4177  * Remove previous backup history files.  This also retries creation of
4178  * .ready files for any backup history files for which XLogArchiveNotify
4179  * failed earlier.
4180  */
4181 static void
4182 CleanupBackupHistory(void)
4183 {
4184         DIR                *xldir;
4185         struct dirent *xlde;
4186         char            path[MAXPGPATH + sizeof(XLOGDIR)];
4187
4188         xldir = AllocateDir(XLOGDIR);
4189
4190         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4191         {
4192                 if (IsBackupHistoryFileName(xlde->d_name))
4193                 {
4194                         if (XLogArchiveCheckDone(xlde->d_name))
4195                         {
4196                                 elog(DEBUG2, "removing WAL backup history file \"%s\"",
4197                                          xlde->d_name);
4198                                 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4199                                 unlink(path);
4200                                 XLogArchiveCleanup(xlde->d_name);
4201                         }
4202                 }
4203         }
4204
4205         FreeDir(xldir);
4206 }
4207
4208 /*
4209  * I/O routines for pg_control
4210  *
4211  * *ControlFile is a buffer in shared memory that holds an image of the
4212  * contents of pg_control.  WriteControlFile() initializes pg_control
4213  * given a preloaded buffer, ReadControlFile() loads the buffer from
4214  * the pg_control file (during postmaster or standalone-backend startup),
4215  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4216  * InitControlFile() fills the buffer with initial values.
4217  *
4218  * For simplicity, WriteControlFile() initializes the fields of pg_control
4219  * that are related to checking backend/database compatibility, and
4220  * ReadControlFile() verifies they are correct.  We could split out the
4221  * I/O and compatibility-check functions, but there seems no need currently.
4222  */
4223
4224 static void
4225 InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
4226 {
4227         char            mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
4228
4229         /*
4230          * Generate a random nonce. This is used for authentication requests that
4231          * will fail because the user does not exist. The nonce is used to create
4232          * a genuine-looking password challenge for the non-existent user, in lieu
4233          * of an actual stored password.
4234          */
4235         if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
4236                 ereport(PANIC,
4237                                 (errcode(ERRCODE_INTERNAL_ERROR),
4238                                  errmsg("could not generate secret authorization token")));
4239
4240         memset(ControlFile, 0, sizeof(ControlFileData));
4241         /* Initialize pg_control status fields */
4242         ControlFile->system_identifier = sysidentifier;
4243         memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
4244         ControlFile->state = DB_SHUTDOWNED;
4245         ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
4246
4247         /* Set important parameter values for use when replaying WAL */
4248         ControlFile->MaxConnections = MaxConnections;
4249         ControlFile->max_worker_processes = max_worker_processes;
4250         ControlFile->max_wal_senders = max_wal_senders;
4251         ControlFile->max_prepared_xacts = max_prepared_xacts;
4252         ControlFile->max_locks_per_xact = max_locks_per_xact;
4253         ControlFile->wal_level = wal_level;
4254         ControlFile->wal_log_hints = wal_log_hints;
4255         ControlFile->track_commit_timestamp = track_commit_timestamp;
4256         ControlFile->data_checksum_version = data_checksum_version;
4257 }
4258
4259 static void
4260 WriteControlFile(void)
4261 {
4262         int                     fd;
4263         char            buffer[PG_CONTROL_FILE_SIZE];   /* need not be aligned */
4264
4265         /*
4266          * Initialize version and compatibility-check fields
4267          */
4268         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4269         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4270
4271         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4272         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4273
4274         ControlFile->blcksz = BLCKSZ;
4275         ControlFile->relseg_size = RELSEG_SIZE;
4276         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4277         ControlFile->xlog_seg_size = wal_segment_size;
4278
4279         ControlFile->nameDataLen = NAMEDATALEN;
4280         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4281
4282         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4283         ControlFile->loblksize = LOBLKSIZE;
4284
4285         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4286
4287         /* Contents are protected with a CRC */
4288         INIT_CRC32C(ControlFile->crc);
4289         COMP_CRC32C(ControlFile->crc,
4290                                 (char *) ControlFile,
4291                                 offsetof(ControlFileData, crc));
4292         FIN_CRC32C(ControlFile->crc);
4293
4294         /*
4295          * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
4296          * the excess over sizeof(ControlFileData).  This reduces the odds of
4297          * premature-EOF errors when reading pg_control.  We'll still fail when we
4298          * check the contents of the file, but hopefully with a more specific
4299          * error than "couldn't read pg_control".
4300          */
4301         memset(buffer, 0, PG_CONTROL_FILE_SIZE);
4302         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4303
4304         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4305                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
4306         if (fd < 0)
4307                 ereport(PANIC,
4308                                 (errcode_for_file_access(),
4309                                  errmsg("could not create file \"%s\": %m",
4310                                                 XLOG_CONTROL_FILE)));
4311
4312         errno = 0;
4313         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
4314         if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
4315         {
4316                 /* if write didn't set errno, assume problem is no disk space */
4317                 if (errno == 0)
4318                         errno = ENOSPC;
4319                 ereport(PANIC,
4320                                 (errcode_for_file_access(),
4321                                  errmsg("could not write to file \"%s\": %m",
4322                                                 XLOG_CONTROL_FILE)));
4323         }
4324         pgstat_report_wait_end();
4325
4326         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
4327         if (pg_fsync(fd) != 0)
4328                 ereport(PANIC,
4329                                 (errcode_for_file_access(),
4330                                  errmsg("could not fsync file \"%s\": %m",
4331                                                 XLOG_CONTROL_FILE)));
4332         pgstat_report_wait_end();
4333
4334         if (close(fd) != 0)
4335                 ereport(PANIC,
4336                                 (errcode_for_file_access(),
4337                                  errmsg("could not close file \"%s\": %m",
4338                                                 XLOG_CONTROL_FILE)));
4339 }
4340
4341 static void
4342 ReadControlFile(void)
4343 {
4344         pg_crc32c       crc;
4345         int                     fd;
4346         char            wal_segsz_str[20];
4347         int                     r;
4348
4349         /*
4350          * Read data...
4351          */
4352         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4353                                            O_RDWR | PG_BINARY);
4354         if (fd < 0)
4355                 ereport(PANIC,
4356                                 (errcode_for_file_access(),
4357                                  errmsg("could not open file \"%s\": %m",
4358                                                 XLOG_CONTROL_FILE)));
4359
4360         pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
4361         r = read(fd, ControlFile, sizeof(ControlFileData));
4362         if (r != sizeof(ControlFileData))
4363         {
4364                 if (r < 0)
4365                         ereport(PANIC,
4366                                         (errcode_for_file_access(),
4367                                          errmsg("could not read file \"%s\": %m",
4368                                                         XLOG_CONTROL_FILE)));
4369                 else
4370                         ereport(PANIC,
4371                                         (errcode(ERRCODE_DATA_CORRUPTED),
4372                                          errmsg("could not read file \"%s\": read %d of %zu",
4373                                                         XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
4374         }
4375         pgstat_report_wait_end();
4376
4377         close(fd);
4378
4379         /*
4380          * Check for expected pg_control format version.  If this is wrong, the
4381          * CRC check will likely fail because we'll be checking the wrong number
4382          * of bytes.  Complaining about wrong version will probably be more
4383          * enlightening than complaining about wrong CRC.
4384          */
4385
4386         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4387                 ereport(FATAL,
4388                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4389                                  errmsg("database files are incompatible with server"),
4390                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4391                                                    " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4392                                                    ControlFile->pg_control_version, ControlFile->pg_control_version,
4393                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4394                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4395
4396         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4397                 ereport(FATAL,
4398                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4399                                  errmsg("database files are incompatible with server"),
4400                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4401                                                    " but the server was compiled with PG_CONTROL_VERSION %d.",
4402                                                    ControlFile->pg_control_version, PG_CONTROL_VERSION),
4403                                  errhint("It looks like you need to initdb.")));
4404
4405         /* Now check the CRC. */
4406         INIT_CRC32C(crc);
4407         COMP_CRC32C(crc,
4408                                 (char *) ControlFile,
4409                                 offsetof(ControlFileData, crc));
4410         FIN_CRC32C(crc);
4411
4412         if (!EQ_CRC32C(crc, ControlFile->crc))
4413                 ereport(FATAL,
4414                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4415                                  errmsg("incorrect checksum in control file")));
4416
4417         /*
4418          * Do compatibility checking immediately.  If the database isn't
4419          * compatible with the backend executable, we want to abort before we can
4420          * possibly do any damage.
4421          */
4422         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4423                 ereport(FATAL,
4424                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4425                                  errmsg("database files are incompatible with server"),
4426                 /* translator: %s is a variable name and %d is its value */
4427                                  errdetail("The database cluster was initialized with %s %d,"
4428                                                    " but the server was compiled with %s %d.",
4429                                                    "CATALOG_VERSION_NO", ControlFile->catalog_version_no,
4430                                                    "CATALOG_VERSION_NO", CATALOG_VERSION_NO),
4431                                  errhint("It looks like you need to initdb.")));
4432         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4433                 ereport(FATAL,
4434                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4435                                  errmsg("database files are incompatible with server"),
4436                 /* translator: %s is a variable name and %d is its value */
4437                                  errdetail("The database cluster was initialized with %s %d,"
4438                                                    " but the server was compiled with %s %d.",
4439                                                    "MAXALIGN", ControlFile->maxAlign,
4440                                                    "MAXALIGN", MAXIMUM_ALIGNOF),
4441                                  errhint("It looks like you need to initdb.")));
4442         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4443                 ereport(FATAL,
4444                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4445                                  errmsg("database files are incompatible with server"),
4446                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4447                                  errhint("It looks like you need to initdb.")));
4448         if (ControlFile->blcksz != BLCKSZ)
4449                 ereport(FATAL,
4450                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4451                                  errmsg("database files are incompatible with server"),
4452                 /* translator: %s is a variable name and %d is its value */
4453                                  errdetail("The database cluster was initialized with %s %d,"
4454                                                    " but the server was compiled with %s %d.",
4455                                                    "BLCKSZ", ControlFile->blcksz,
4456                                                    "BLCKSZ", BLCKSZ),
4457                                  errhint("It looks like you need to recompile or initdb.")));
4458         if (ControlFile->relseg_size != RELSEG_SIZE)
4459                 ereport(FATAL,
4460                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4461                                  errmsg("database files are incompatible with server"),
4462                 /* translator: %s is a variable name and %d is its value */
4463                                  errdetail("The database cluster was initialized with %s %d,"
4464                                                    " but the server was compiled with %s %d.",
4465                                                    "RELSEG_SIZE", ControlFile->relseg_size,
4466                                                    "RELSEG_SIZE", RELSEG_SIZE),
4467                                  errhint("It looks like you need to recompile or initdb.")));
4468         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4469                 ereport(FATAL,
4470                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4471                                  errmsg("database files are incompatible with server"),
4472                 /* translator: %s is a variable name and %d is its value */
4473                                  errdetail("The database cluster was initialized with %s %d,"
4474                                                    " but the server was compiled with %s %d.",
4475                                                    "XLOG_BLCKSZ", ControlFile->xlog_blcksz,
4476                                                    "XLOG_BLCKSZ", XLOG_BLCKSZ),
4477                                  errhint("It looks like you need to recompile or initdb.")));
4478         if (ControlFile->nameDataLen != NAMEDATALEN)
4479                 ereport(FATAL,
4480                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4481                                  errmsg("database files are incompatible with server"),
4482                 /* translator: %s is a variable name and %d is its value */
4483                                  errdetail("The database cluster was initialized with %s %d,"
4484                                                    " but the server was compiled with %s %d.",
4485                                                    "NAMEDATALEN", ControlFile->nameDataLen,
4486                                                    "NAMEDATALEN", NAMEDATALEN),
4487                                  errhint("It looks like you need to recompile or initdb.")));
4488         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4489                 ereport(FATAL,
4490                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4491                                  errmsg("database files are incompatible with server"),
4492                 /* translator: %s is a variable name and %d is its value */
4493                                  errdetail("The database cluster was initialized with %s %d,"
4494                                                    " but the server was compiled with %s %d.",
4495                                                    "INDEX_MAX_KEYS", ControlFile->indexMaxKeys,
4496                                                    "INDEX_MAX_KEYS", INDEX_MAX_KEYS),
4497                                  errhint("It looks like you need to recompile or initdb.")));
4498         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4499                 ereport(FATAL,
4500                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4501                                  errmsg("database files are incompatible with server"),
4502                 /* translator: %s is a variable name and %d is its value */
4503                                  errdetail("The database cluster was initialized with %s %d,"
4504                                                    " but the server was compiled with %s %d.",
4505                                                    "TOAST_MAX_CHUNK_SIZE", ControlFile->toast_max_chunk_size,
4506                                                    "TOAST_MAX_CHUNK_SIZE", (int) TOAST_MAX_CHUNK_SIZE),
4507                                  errhint("It looks like you need to recompile or initdb.")));
4508         if (ControlFile->loblksize != LOBLKSIZE)
4509                 ereport(FATAL,
4510                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4511                                  errmsg("database files are incompatible with server"),
4512                 /* translator: %s is a variable name and %d is its value */
4513                                  errdetail("The database cluster was initialized with %s %d,"
4514                                                    " but the server was compiled with %s %d.",
4515                                                    "LOBLKSIZE", ControlFile->loblksize,
4516                                                    "LOBLKSIZE", (int) LOBLKSIZE),
4517                                  errhint("It looks like you need to recompile or initdb.")));
4518
4519 #ifdef USE_FLOAT8_BYVAL
4520         if (ControlFile->float8ByVal != true)
4521                 ereport(FATAL,
4522                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4523                                  errmsg("database files are incompatible with server"),
4524                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4525                                                    " but the server was compiled with USE_FLOAT8_BYVAL."),
4526                                  errhint("It looks like you need to recompile or initdb.")));
4527 #else
4528         if (ControlFile->float8ByVal != false)
4529                 ereport(FATAL,
4530                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4531                                  errmsg("database files are incompatible with server"),
4532                                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4533                                                    " but the server was compiled without USE_FLOAT8_BYVAL."),
4534                                  errhint("It looks like you need to recompile or initdb.")));
4535 #endif
4536
4537         wal_segment_size = ControlFile->xlog_seg_size;
4538
4539         if (!IsValidWalSegSize(wal_segment_size))
4540                 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4541                                                 errmsg_plural("invalid WAL segment size in control file (%d byte)",
4542                                                                           "invalid WAL segment size in control file (%d bytes)",
4543                                                                           wal_segment_size,
4544                                                                           wal_segment_size),
4545                                                 errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.")));
4546
4547         snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
4548         SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
4549                                         PGC_S_DYNAMIC_DEFAULT);
4550
4551         /* check and update variables dependent on wal_segment_size */
4552         if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
4553                 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4554                 /* translator: both %s are GUC names */
4555                                                 errmsg("\"%s\" must be at least twice \"%s\"",
4556                                                            "min_wal_size", "wal_segment_size")));
4557
4558         if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
4559                 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4560                 /* translator: both %s are GUC names */
4561                                                 errmsg("\"%s\" must be at least twice \"%s\"",
4562                                                            "max_wal_size", "wal_segment_size")));
4563
4564         UsableBytesInSegment =
4565                 (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
4566                 (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
4567
4568         CalculateCheckpointSegments();
4569
4570         /* Make the initdb settings visible as GUC variables, too */
4571         SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4572                                         PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
4573 }
4574
4575 /*
4576  * Utility wrapper to update the control file.  Note that the control
4577  * file gets flushed.
4578  */
4579 static void
4580 UpdateControlFile(void)
4581 {
4582         update_controlfile(DataDir, ControlFile, true);
4583 }
4584
4585 /*
4586  * Returns the unique system identifier from control file.
4587  */
4588 uint64
4589 GetSystemIdentifier(void)
4590 {
4591         Assert(ControlFile != NULL);
4592         return ControlFile->system_identifier;
4593 }
4594
4595 /*
4596  * Returns the random nonce from control file.
4597  */
4598 char *
4599 GetMockAuthenticationNonce(void)
4600 {
4601         Assert(ControlFile != NULL);
4602         return ControlFile->mock_authentication_nonce;
4603 }
4604
4605 /*
4606  * Are checksums enabled for data pages?
4607  */
4608 bool
4609 DataChecksumsEnabled(void)
4610 {
4611         Assert(ControlFile != NULL);
4612         return (ControlFile->data_checksum_version > 0);
4613 }
4614
4615 /*
4616  * Returns a fake LSN for unlogged relations.
4617  *
4618  * Each call generates an LSN that is greater than any previous value
4619  * returned. The current counter value is saved and restored across clean
4620  * shutdowns, but like unlogged relations, does not survive a crash. This can
4621  * be used in lieu of real LSN values returned by XLogInsert, if you need an
4622  * LSN-like increasing sequence of numbers without writing any WAL.
4623  */
4624 XLogRecPtr
4625 GetFakeLSNForUnloggedRel(void)
4626 {
4627         return pg_atomic_fetch_add_u64(&XLogCtl->unloggedLSN, 1);
4628 }
4629
4630 /*
4631  * Auto-tune the number of XLOG buffers.
4632  *
4633  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4634  * a maximum of one XLOG segment (there is little reason to think that more
4635  * is helpful, at least so long as we force an fsync when switching log files)
4636  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4637  * 9.1, when auto-tuning was added).
4638  *
4639  * This should not be called until NBuffers has received its final value.
4640  */
4641 static int
4642 XLOGChooseNumBuffers(void)
4643 {
4644         int                     xbuffers;
4645
4646         xbuffers = NBuffers / 32;
4647         if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
4648                 xbuffers = (wal_segment_size / XLOG_BLCKSZ);
4649         if (xbuffers < 8)
4650                 xbuffers = 8;
4651         return xbuffers;
4652 }
4653
4654 /*
4655  * GUC check_hook for wal_buffers
4656  */
4657 bool
4658 check_wal_buffers(int *newval, void **extra, GucSource source)
4659 {
4660         /*
4661          * -1 indicates a request for auto-tune.
4662          */
4663         if (*newval == -1)
4664         {
4665                 /*
4666                  * If we haven't yet changed the boot_val default of -1, just let it
4667                  * be.  We'll fix it when XLOGShmemSize is called.
4668                  */
4669                 if (XLOGbuffers == -1)
4670                         return true;
4671
4672                 /* Otherwise, substitute the auto-tune value */
4673                 *newval = XLOGChooseNumBuffers();
4674         }
4675
4676         /*
4677          * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4678          * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4679          * the case, we just silently treat such values as a request for the
4680          * minimum.  (We could throw an error instead, but that doesn't seem very
4681          * helpful.)
4682          */
4683         if (*newval < 4)
4684                 *newval = 4;
4685
4686         return true;
4687 }
4688
4689 /*
4690  * GUC check_hook for wal_consistency_checking
4691  */
4692 bool
4693 check_wal_consistency_checking(char **newval, void **extra, GucSource source)
4694 {
4695         char       *rawstring;
4696         List       *elemlist;
4697         ListCell   *l;
4698         bool            newwalconsistency[RM_MAX_ID + 1];
4699
4700         /* Initialize the array */
4701         MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool));
4702
4703         /* Need a modifiable copy of string */
4704         rawstring = pstrdup(*newval);
4705
4706         /* Parse string into list of identifiers */
4707         if (!SplitIdentifierString(rawstring, ',', &elemlist))
4708         {
4709                 /* syntax error in list */
4710                 GUC_check_errdetail("List syntax is invalid.");
4711                 pfree(rawstring);
4712                 list_free(elemlist);
4713                 return false;
4714         }
4715
4716         foreach(l, elemlist)
4717         {
4718                 char       *tok = (char *) lfirst(l);
4719                 int                     rmid;
4720
4721                 /* Check for 'all'. */
4722                 if (pg_strcasecmp(tok, "all") == 0)
4723                 {
4724                         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
4725                                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL)
4726                                         newwalconsistency[rmid] = true;
4727                 }
4728                 else
4729                 {
4730                         /* Check if the token matches any known resource manager. */
4731                         bool            found = false;
4732
4733                         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
4734                         {
4735                                 if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL &&
4736                                         pg_strcasecmp(tok, GetRmgr(rmid).rm_name) == 0)
4737                                 {
4738                                         newwalconsistency[rmid] = true;
4739                                         found = true;
4740                                         break;
4741                                 }
4742                         }
4743                         if (!found)
4744                         {
4745                                 /*
4746                                  * During startup, it might be a not-yet-loaded custom
4747                                  * resource manager.  Defer checking until
4748                                  * InitializeWalConsistencyChecking().
4749                                  */
4750                                 if (!process_shared_preload_libraries_done)
4751                                 {
4752                                         check_wal_consistency_checking_deferred = true;
4753                                 }
4754                                 else
4755                                 {
4756                                         GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
4757                                         pfree(rawstring);
4758                                         list_free(elemlist);
4759                                         return false;
4760                                 }
4761                         }
4762                 }
4763         }
4764
4765         pfree(rawstring);
4766         list_free(elemlist);
4767
4768         /* assign new value */
4769         *extra = guc_malloc(ERROR, (RM_MAX_ID + 1) * sizeof(bool));
4770         memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool));
4771         return true;
4772 }
4773
4774 /*
4775  * GUC assign_hook for wal_consistency_checking
4776  */
4777 void
4778 assign_wal_consistency_checking(const char *newval, void *extra)
4779 {
4780         /*
4781          * If some checks were deferred, it's possible that the checks will fail
4782          * later during InitializeWalConsistencyChecking(). But in that case, the
4783          * postmaster will exit anyway, so it's safe to proceed with the
4784          * assignment.
4785          *
4786          * Any built-in resource managers specified are assigned immediately,
4787          * which affects WAL created before shared_preload_libraries are
4788          * processed. Any custom resource managers specified won't be assigned
4789          * until after shared_preload_libraries are processed, but that's OK
4790          * because WAL for a custom resource manager can't be written before the
4791          * module is loaded anyway.
4792          */
4793         wal_consistency_checking = extra;
4794 }
4795
4796 /*
4797  * InitializeWalConsistencyChecking: run after loading custom resource managers
4798  *
4799  * If any unknown resource managers were specified in the
4800  * wal_consistency_checking GUC, processing was deferred.  Now that
4801  * shared_preload_libraries have been loaded, process wal_consistency_checking
4802  * again.
4803  */
4804 void
4805 InitializeWalConsistencyChecking(void)
4806 {
4807         Assert(process_shared_preload_libraries_done);
4808
4809         if (check_wal_consistency_checking_deferred)
4810         {
4811                 struct config_generic *guc;
4812
4813                 guc = find_option("wal_consistency_checking", false, false, ERROR);
4814
4815                 check_wal_consistency_checking_deferred = false;
4816
4817                 set_config_option_ext("wal_consistency_checking",
4818                                                           wal_consistency_checking_string,
4819                                                           guc->scontext, guc->source, guc->srole,
4820                                                           GUC_ACTION_SET, true, ERROR, false);
4821
4822                 /* checking should not be deferred again */
4823                 Assert(!check_wal_consistency_checking_deferred);
4824         }
4825 }
4826
4827 /*
4828  * GUC show_hook for archive_command
4829  */
4830 const char *
4831 show_archive_command(void)
4832 {
4833         if (XLogArchivingActive())
4834                 return XLogArchiveCommand;
4835         else
4836                 return "(disabled)";
4837 }
4838
/*
 * show_in_hot_standby
 *		GUC show_hook for in_hot_standby.
 *
 * The answer is derived from the live recovery state in shared memory, so
 * the GUC shows current information even when inspected in the middle of a
 * query.  The backing variable (in_hot_standby_guc) is only updated when a
 * new value is transmitted to the client.
 */
const char *
show_in_hot_standby(void)
{
	if (RecoveryInProgress())
		return "on";

	return "off";
}
4853
4854 /*
4855  * Read the control file, set respective GUCs.
4856  *
4857  * This is to be called during startup, including a crash recovery cycle,
4858  * unless in bootstrap mode, where no control file yet exists.  As there's no
4859  * usable shared memory yet (its sizing can depend on the contents of the
4860  * control file!), first store the contents in local memory. XLOGShmemInit()
4861  * will then copy it to shared memory later.
4862  *
4863  * reset just controls whether previous contents are to be expected (in the
4864  * reset case, there's a dangling pointer into old shared memory), or not.
4865  */
4866 void
4867 LocalProcessControlFile(bool reset)
4868 {
4869         Assert(reset || ControlFile == NULL);
4870         ControlFile = palloc(sizeof(ControlFileData));
4871         ReadControlFile();
4872 }
4873
4874 /*
4875  * Get the wal_level from the control file. For a standby, this value should be
4876  * considered as its active wal_level, because it may be different from what
4877  * was originally configured on standby.
4878  */
4879 WalLevel
4880 GetActiveWalLevelOnStandby(void)
4881 {
4882         return ControlFile->wal_level;
4883 }
4884
4885 /*
4886  * Initialization of shared memory for XLOG
4887  */
4888 Size
4889 XLOGShmemSize(void)
4890 {
4891         Size            size;
4892
4893         /*
4894          * If the value of wal_buffers is -1, use the preferred auto-tune value.
4895          * This isn't an amazingly clean place to do this, but we must wait till
4896          * NBuffers has received its final value, and must do it before using the
4897          * value of XLOGbuffers to do anything important.
4898          *
4899          * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
4900          * However, if the DBA explicitly set wal_buffers = -1 in the config file,
4901          * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force
4902          * the matter with PGC_S_OVERRIDE.
4903          */
4904         if (XLOGbuffers == -1)
4905         {
4906                 char            buf[32];
4907
4908                 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4909                 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
4910                                                 PGC_S_DYNAMIC_DEFAULT);
4911                 if (XLOGbuffers == -1)  /* failed to apply it? */
4912                         SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
4913                                                         PGC_S_OVERRIDE);
4914         }
4915         Assert(XLOGbuffers > 0);
4916
4917         /* XLogCtl */
4918         size = sizeof(XLogCtlData);
4919
4920         /* WAL insertion locks, plus alignment */
4921         size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
4922         /* xlblocks array */
4923         size = add_size(size, mul_size(sizeof(pg_atomic_uint64), XLOGbuffers));
4924         /* extra alignment padding for XLOG I/O buffers */
4925         size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
4926         /* and the buffers themselves */
4927         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4928
4929         /*
4930          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4931          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4932          * routine again below to compute the actual allocation size.
4933          */
4934
4935         return size;
4936 }
4937
4938 void
4939 XLOGShmemInit(void)
4940 {
4941         bool            foundCFile,
4942                                 foundXLog;
4943         char       *allocptr;
4944         int                     i;
4945         ControlFileData *localControlFile;
4946
4947 #ifdef WAL_DEBUG
4948
4949         /*
4950          * Create a memory context for WAL debugging that's exempt from the normal
4951          * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
4952          * an allocation fails, but wal_debug is not for production use anyway.
4953          */
4954         if (walDebugCxt == NULL)
4955         {
4956                 walDebugCxt = AllocSetContextCreate(TopMemoryContext,
4957                                                                                         "WAL Debug",
4958                                                                                         ALLOCSET_DEFAULT_SIZES);
4959                 MemoryContextAllowInCriticalSection(walDebugCxt, true);
4960         }
4961 #endif
4962
4963
4964         XLogCtl = (XLogCtlData *)
4965                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4966
4967         localControlFile = ControlFile;
4968         ControlFile = (ControlFileData *)
4969                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4970
4971         if (foundCFile || foundXLog)
4972         {
4973                 /* both should be present or neither */
4974                 Assert(foundCFile && foundXLog);
4975
4976                 /* Initialize local copy of WALInsertLocks */
4977                 WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
4978
4979                 if (localControlFile)
4980                         pfree(localControlFile);
4981                 return;
4982         }
4983         memset(XLogCtl, 0, sizeof(XLogCtlData));
4984
4985         /*
4986          * Already have read control file locally, unless in bootstrap mode. Move
4987          * contents into shared memory.
4988          */
4989         if (localControlFile)
4990         {
4991                 memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
4992                 pfree(localControlFile);
4993         }
4994
4995         /*
4996          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4997          * multiple of the alignment for same, so no extra alignment padding is
4998          * needed here.
4999          */
5000         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5001         XLogCtl->xlblocks = (pg_atomic_uint64 *) allocptr;
5002         allocptr += sizeof(pg_atomic_uint64) * XLOGbuffers;
5003
5004         for (i = 0; i < XLOGbuffers; i++)
5005         {
5006                 pg_atomic_init_u64(&XLogCtl->xlblocks[i], InvalidXLogRecPtr);
5007         }
5008
5009         /* WAL insertion locks. Ensure they're aligned to the full padded size */
5010         allocptr += sizeof(WALInsertLockPadded) -
5011                 ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
5012         WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
5013                 (WALInsertLockPadded *) allocptr;
5014         allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
5015
5016         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
5017         {
5018                 LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
5019                 pg_atomic_init_u64(&WALInsertLocks[i].l.insertingAt, InvalidXLogRecPtr);
5020                 WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
5021         }
5022
5023         /*
5024          * Align the start of the page buffers to a full xlog block size boundary.
5025          * This simplifies some calculations in XLOG insertion. It is also
5026          * required for O_DIRECT.
5027          */
5028         allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5029         XLogCtl->pages = allocptr;
5030         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5031
5032         /*
5033          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5034          * in additional info.)
5035          */
5036         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5037         XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
5038         XLogCtl->InstallXLogFileSegmentActive = false;
5039         XLogCtl->WalWriterSleeping = false;
5040
5041         SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5042         SpinLockInit(&XLogCtl->info_lck);
5043         pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr);
5044         pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
5045         pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
5046         pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
5047 }
5048
5049 /*
5050  * This func must be called ONCE on system install.  It creates pg_control
5051  * and the initial XLOG segment.
5052  */
5053 void
5054 BootStrapXLOG(uint32 data_checksum_version)
5055 {
5056         CheckPoint      checkPoint;
5057         char       *buffer;
5058         XLogPageHeader page;
5059         XLogLongPageHeader longpage;
5060         XLogRecord *record;
5061         char       *recptr;
5062         uint64          sysidentifier;
5063         struct timeval tv;
5064         pg_crc32c       crc;
5065
5066         /* allow ordinary WAL segment creation, like StartupXLOG() would */
5067         SetInstallXLogFileSegmentActive();
5068
5069         /*
5070          * Select a hopefully-unique system identifier code for this installation.
5071          * We use the result of gettimeofday(), including the fractional seconds
5072          * field, as being about as unique as we can easily get.  (Think not to
5073          * use random(), since it hasn't been seeded and there's no portable way
5074          * to seed it other than the system clock value...)  The upper half of the
5075          * uint64 value is just the tv_sec part, while the lower half contains the
5076          * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
5077          * PID for a little extra uniqueness.  A person knowing this encoding can
5078          * determine the initialization time of the installation, which could
5079          * perhaps be useful sometimes.
5080          */
5081         gettimeofday(&tv, NULL);
5082         sysidentifier = ((uint64) tv.tv_sec) << 32;
5083         sysidentifier |= ((uint64) tv.tv_usec) << 12;
5084         sysidentifier |= getpid() & 0xFFF;
5085
5086         /* page buffer must be aligned suitably for O_DIRECT */
5087         buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5088         page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5089         memset(page, 0, XLOG_BLCKSZ);
5090
5091         /*
5092          * Set up information for the initial checkpoint record
5093          *
5094          * The initial checkpoint record is written to the beginning of the WAL
5095          * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5096          * used, so that we can use 0/0 to mean "before any valid WAL segment".
5097          */
5098         checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
5099         checkPoint.ThisTimeLineID = BootstrapTimeLineID;
5100         checkPoint.PrevTimeLineID = BootstrapTimeLineID;
5101         checkPoint.fullPageWrites = fullPageWrites;
5102         checkPoint.wal_level = wal_level;
5103         checkPoint.nextXid =
5104                 FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
5105         checkPoint.nextOid = FirstGenbkiObjectId;
5106         checkPoint.nextMulti = FirstMultiXactId;
5107         checkPoint.nextMultiOffset = 0;
5108         checkPoint.oldestXid = FirstNormalTransactionId;
5109         checkPoint.oldestXidDB = Template1DbOid;
5110         checkPoint.oldestMulti = FirstMultiXactId;
5111         checkPoint.oldestMultiDB = Template1DbOid;
5112         checkPoint.oldestCommitTsXid = InvalidTransactionId;
5113         checkPoint.newestCommitTsXid = InvalidTransactionId;
5114         checkPoint.time = (pg_time_t) time(NULL);
5115         checkPoint.oldestActiveXid = InvalidTransactionId;
5116
5117         TransamVariables->nextXid = checkPoint.nextXid;
5118         TransamVariables->nextOid = checkPoint.nextOid;
5119         TransamVariables->oidCount = 0;
5120         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5121         AdvanceOldestClogXid(checkPoint.oldestXid);
5122         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5123         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5124         SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5125
5126         /* Set up the XLOG page header */
5127         page->xlp_magic = XLOG_PAGE_MAGIC;
5128         page->xlp_info = XLP_LONG_HEADER;
5129         page->xlp_tli = BootstrapTimeLineID;
5130         page->xlp_pageaddr = wal_segment_size;
5131         longpage = (XLogLongPageHeader) page;
5132         longpage->xlp_sysid = sysidentifier;
5133         longpage->xlp_seg_size = wal_segment_size;
5134         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5135
5136         /* Insert the initial checkpoint record */
5137         recptr = ((char *) page + SizeOfXLogLongPHD);
5138         record = (XLogRecord *) recptr;
5139         record->xl_prev = 0;
5140         record->xl_xid = InvalidTransactionId;
5141         record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5142         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5143         record->xl_rmid = RM_XLOG_ID;
5144         recptr += SizeOfXLogRecord;
5145         /* fill the XLogRecordDataHeaderShort struct */
5146         *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
5147         *(recptr++) = sizeof(checkPoint);
5148         memcpy(recptr, &checkPoint, sizeof(checkPoint));
5149         recptr += sizeof(checkPoint);
5150         Assert(recptr - (char *) record == record->xl_tot_len);
5151
5152         INIT_CRC32C(crc);
5153         COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5154         COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5155         FIN_CRC32C(crc);
5156         record->xl_crc = crc;
5157
5158         /* Create first XLOG segment file */
5159         openLogTLI = BootstrapTimeLineID;
5160         openLogFile = XLogFileInit(1, BootstrapTimeLineID);
5161
5162         /*
5163          * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
5164          * close the file again in a moment.
5165          */
5166
5167         /* Write the first page with the initial record */
5168         errno = 0;
5169         pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
5170         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5171         {
5172                 /* if write didn't set errno, assume problem is no disk space */
5173                 if (errno == 0)
5174                         errno = ENOSPC;
5175                 ereport(PANIC,
5176                                 (errcode_for_file_access(),
5177                                  errmsg("could not write bootstrap write-ahead log file: %m")));
5178         }
5179         pgstat_report_wait_end();
5180
5181         pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
5182         if (pg_fsync(openLogFile) != 0)
5183                 ereport(PANIC,
5184                                 (errcode_for_file_access(),
5185                                  errmsg("could not fsync bootstrap write-ahead log file: %m")));
5186         pgstat_report_wait_end();
5187
5188         if (close(openLogFile) != 0)
5189                 ereport(PANIC,
5190                                 (errcode_for_file_access(),
5191                                  errmsg("could not close bootstrap write-ahead log file: %m")));
5192
5193         openLogFile = -1;
5194
5195         /* Now create pg_control */
5196         InitControlFile(sysidentifier, data_checksum_version);
5197         ControlFile->time = checkPoint.time;
5198         ControlFile->checkPoint = checkPoint.redo;
5199         ControlFile->checkPointCopy = checkPoint;
5200
5201         /* some additional ControlFile fields are set in WriteControlFile() */
5202         WriteControlFile();
5203
5204         /* Bootstrap the commit log, too */
5205         BootStrapCLOG();
5206         BootStrapCommitTs();
5207         BootStrapSUBTRANS();
5208         BootStrapMultiXact();
5209
5210         pfree(buffer);
5211
5212         /*
5213          * Force control file to be read - in contrast to normal processing we'd
5214          * otherwise never run the checks and GUC related initializations therein.
5215          */
5216         ReadControlFile();
5217 }
5218
5219 static char *
5220 str_time(pg_time_t tnow)
5221 {
5222         char       *buf = palloc(128);
5223
5224         pg_strftime(buf, 128,
5225                                 "%Y-%m-%d %H:%M:%S %Z",
5226                                 pg_localtime(&tnow, log_timezone));
5227
5228         return buf;
5229 }
5230
5231 /*
5232  * Initialize the first WAL segment on new timeline.
5233  */
5234 static void
5235 XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
5236 {
5237         char            xlogfname[MAXFNAMELEN];
5238         XLogSegNo       endLogSegNo;
5239         XLogSegNo       startLogSegNo;
5240
5241         /* we always switch to a new timeline after archive recovery */
5242         Assert(endTLI != newTLI);
5243
5244         /*
5245          * Update min recovery point one last time.
5246          */
5247         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5248
5249         /*
5250          * Calculate the last segment on the old timeline, and the first segment
5251          * on the new timeline. If the switch happens in the middle of a segment,
5252          * they are the same, but if the switch happens exactly at a segment
5253          * boundary, startLogSegNo will be endLogSegNo + 1.
5254          */
5255         XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
5256         XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
5257
5258         /*
5259          * Initialize the starting WAL segment for the new timeline. If the switch
5260          * happens in the middle of a segment, copy data from the last WAL segment
5261          * of the old timeline up to the switch point, to the starting WAL segment
5262          * on the new timeline.
5263          */
5264         if (endLogSegNo == startLogSegNo)
5265         {
5266                 /*
5267                  * Make a copy of the file on the new timeline.
5268                  *
5269                  * Writing WAL isn't allowed yet, so there are no locking
5270                  * considerations. But we should be just as tense as XLogFileInit to
5271                  * avoid emplacing a bogus file.
5272                  */
5273                 XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
5274                                          XLogSegmentOffset(endOfLog, wal_segment_size));
5275         }
5276         else
5277         {
5278                 /*
5279                  * The switch happened at a segment boundary, so just create the next
5280                  * segment on the new timeline.
5281                  */
5282                 int                     fd;
5283
5284                 fd = XLogFileInit(startLogSegNo, newTLI);
5285
5286                 if (close(fd) != 0)
5287                 {
5288                         int                     save_errno = errno;
5289
5290                         XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
5291                         errno = save_errno;
5292                         ereport(ERROR,
5293                                         (errcode_for_file_access(),
5294                                          errmsg("could not close file \"%s\": %m", xlogfname)));
5295                 }
5296         }
5297
5298         /*
5299          * Let's just make real sure there are not .ready or .done flags posted
5300          * for the new segment.
5301          */
5302         XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
5303         XLogArchiveCleanup(xlogfname);
5304 }
5305
5306 /*
5307  * Perform cleanup actions at the conclusion of archive recovery.
5308  */
5309 static void
5310 CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
5311                                                         TimeLineID newTLI)
5312 {
5313         /*
5314          * Execute the recovery_end_command, if any.
5315          */
5316         if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
5317                 ExecuteRecoveryCommand(recoveryEndCommand,
5318                                                            "recovery_end_command",
5319                                                            true,
5320                                                            WAIT_EVENT_RECOVERY_END_COMMAND);
5321
5322         /*
5323          * We switched to a new timeline. Clean up segments on the old timeline.
5324          *
5325          * If there are any higher-numbered segments on the old timeline, remove
5326          * them. They might contain valid WAL, but they might also be
5327          * pre-allocated files containing garbage. In any case, they are not part
5328          * of the new timeline's history so we don't need them.
5329          */
5330         RemoveNonParentXlogFiles(EndOfLog, newTLI);
5331
5332         /*
5333          * If the switch happened in the middle of a segment, what to do with the
5334          * last, partial segment on the old timeline? If we don't archive it, and
5335          * the server that created the WAL never archives it either (e.g. because
5336          * it was hit by a meteor), it will never make it to the archive. That's
5337          * OK from our point of view, because the new segment that we created with
5338          * the new TLI contains all the WAL from the old timeline up to the switch
5339          * point. But if you later try to do PITR to the "missing" WAL on the old
5340          * timeline, recovery won't find it in the archive. It's physically
5341          * present in the new file with new TLI, but recovery won't look there
5342          * when it's recovering to the older timeline. On the other hand, if we
5343          * archive the partial segment, and the original server on that timeline
5344          * is still running and archives the completed version of the same segment
5345          * later, it will fail. (We used to do that in 9.4 and below, and it
5346          * caused such problems).
5347          *
5348          * As a compromise, we rename the last segment with the .partial suffix,
5349          * and archive it. Archive recovery will never try to read .partial
5350          * segments, so they will normally go unused. But in the odd PITR case,
5351          * the administrator can copy them manually to the pg_wal directory
5352          * (removing the suffix). They can be useful in debugging, too.
5353          *
5354          * If a .done or .ready file already exists for the old timeline, however,
5355          * we had already determined that the segment is complete, so we can let
5356          * it be archived normally. (In particular, if it was restored from the
5357          * archive to begin with, it's expected to have a .done file).
5358          */
5359         if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
5360                 XLogArchivingActive())
5361         {
5362                 char            origfname[MAXFNAMELEN];
5363                 XLogSegNo       endLogSegNo;
5364
5365                 XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
5366                 XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
5367
5368                 if (!XLogArchiveIsReadyOrDone(origfname))
5369                 {
5370                         char            origpath[MAXPGPATH];
5371                         char            partialfname[MAXFNAMELEN];
5372                         char            partialpath[MAXPGPATH];
5373
5374                         /*
5375                          * If we're summarizing WAL, we can't rename the partial file
5376                          * until the summarizer finishes with it, else it will fail.
5377                          */
5378                         if (summarize_wal)
5379                                 WaitForWalSummarization(EndOfLog);
5380
5381                         XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
5382                         snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
5383                         snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
5384
5385                         /*
5386                          * Make sure there's no .done or .ready file for the .partial
5387                          * file.
5388                          */
5389                         XLogArchiveCleanup(partialfname);
5390
5391                         durable_rename(origpath, partialpath, ERROR);
5392                         XLogArchiveNotify(partialfname);
5393                 }
5394         }
5395 }
5396
5397 /*
5398  * Check to see if required parameters are set high enough on this server
5399  * for various aspects of recovery operation.
5400  *
5401  * Note that all the parameters which this function tests need to be
5402  * listed in Administrator's Overview section in high-availability.sgml.
5403  * If you change them, don't forget to update the list.
5404  */
5405 static void
5406 CheckRequiredParameterValues(void)
5407 {
5408         /*
5409          * For archive recovery, the WAL must be generated with at least 'replica'
5410          * wal_level.
5411          */
5412         if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5413         {
5414                 ereport(FATAL,
5415                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
5416                                  errmsg("WAL was generated with \"wal_level=minimal\", cannot continue recovering"),
5417                                  errdetail("This happens if you temporarily set \"wal_level=minimal\" on the server."),
5418                                  errhint("Use a backup taken after setting \"wal_level\" to higher than \"minimal\".")));
5419         }
5420
5421         /*
5422          * For Hot Standby, the WAL must be generated with 'replica' mode, and we
5423          * must have at least as many backend slots as the primary.
5424          */
5425         if (ArchiveRecoveryRequested && EnableHotStandby)
5426         {
5427                 /* We ignore autovacuum_worker_slots when we make this test. */
5428                 RecoveryRequiresIntParameter("max_connections",
5429                                                                          MaxConnections,
5430                                                                          ControlFile->MaxConnections);
5431                 RecoveryRequiresIntParameter("max_worker_processes",
5432                                                                          max_worker_processes,
5433                                                                          ControlFile->max_worker_processes);
5434                 RecoveryRequiresIntParameter("max_wal_senders",
5435                                                                          max_wal_senders,
5436                                                                          ControlFile->max_wal_senders);
5437                 RecoveryRequiresIntParameter("max_prepared_transactions",
5438                                                                          max_prepared_xacts,
5439                                                                          ControlFile->max_prepared_xacts);
5440                 RecoveryRequiresIntParameter("max_locks_per_transaction",
5441                                                                          max_locks_per_xact,
5442                                                                          ControlFile->max_locks_per_xact);
5443         }
5444 }
5445
5446 /*
5447  * This must be called ONCE during postmaster or standalone-backend startup
5448  */
5449 void
5450 StartupXLOG(void)
5451 {
5452         XLogCtlInsert *Insert;
5453         CheckPoint      checkPoint;
5454         bool            wasShutdown;
5455         bool            didCrash;
5456         bool            haveTblspcMap;
5457         bool            haveBackupLabel;
5458         XLogRecPtr      EndOfLog;
5459         TimeLineID      EndOfLogTLI;
5460         TimeLineID      newTLI;
5461         bool            performedWalRecovery;
5462         EndOfWalRecoveryInfo *endOfRecoveryInfo;
5463         XLogRecPtr      abortedRecPtr;
5464         XLogRecPtr      missingContrecPtr;
5465         TransactionId oldestActiveXID;
5466         bool            promoted = false;
5467
5468         /*
5469          * We should have an aux process resource owner to use, and we should not
5470          * be in a transaction that's installed some other resowner.
5471          */
5472         Assert(AuxProcessResourceOwner != NULL);
5473         Assert(CurrentResourceOwner == NULL ||
5474                    CurrentResourceOwner == AuxProcessResourceOwner);
5475         CurrentResourceOwner = AuxProcessResourceOwner;
5476
5477         /*
5478          * Check that contents look valid.
5479          */
5480         if (!XRecOffIsValid(ControlFile->checkPoint))
5481                 ereport(FATAL,
5482                                 (errcode(ERRCODE_DATA_CORRUPTED),
5483                                  errmsg("control file contains invalid checkpoint location")));
5484
5485         switch (ControlFile->state)
5486         {
5487                 case DB_SHUTDOWNED:
5488
5489                         /*
5490                          * This is the expected case, so don't be chatty in standalone
5491                          * mode
5492                          */
5493                         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
5494                                         (errmsg("database system was shut down at %s",
5495                                                         str_time(ControlFile->time))));
5496                         break;
5497
5498                 case DB_SHUTDOWNED_IN_RECOVERY:
5499                         ereport(LOG,
5500                                         (errmsg("database system was shut down in recovery at %s",
5501                                                         str_time(ControlFile->time))));
5502                         break;
5503
5504                 case DB_SHUTDOWNING:
5505                         ereport(LOG,
5506                                         (errmsg("database system shutdown was interrupted; last known up at %s",
5507                                                         str_time(ControlFile->time))));
5508                         break;
5509
5510                 case DB_IN_CRASH_RECOVERY:
5511                         ereport(LOG,
5512                                         (errmsg("database system was interrupted while in recovery at %s",
5513                                                         str_time(ControlFile->time)),
5514                                          errhint("This probably means that some data is corrupted and"
5515                                                          " you will have to use the last backup for recovery.")));
5516                         break;
5517
5518                 case DB_IN_ARCHIVE_RECOVERY:
5519                         ereport(LOG,
5520                                         (errmsg("database system was interrupted while in recovery at log time %s",
5521                                                         str_time(ControlFile->checkPointCopy.time)),
5522                                          errhint("If this has occurred more than once some data might be corrupted"
5523                                                          " and you might need to choose an earlier recovery target.")));
5524                         break;
5525
5526                 case DB_IN_PRODUCTION:
5527                         ereport(LOG,
5528                                         (errmsg("database system was interrupted; last known up at %s",
5529                                                         str_time(ControlFile->time))));
5530                         break;
5531
5532                 default:
5533                         ereport(FATAL,
5534                                         (errcode(ERRCODE_DATA_CORRUPTED),
5535                                          errmsg("control file contains invalid database cluster state")));
5536         }
5537
5538         /* This is just to allow attaching to startup process with a debugger */
5539 #ifdef XLOG_REPLAY_DELAY
5540         if (ControlFile->state != DB_SHUTDOWNED)
5541                 pg_usleep(60000000L);
5542 #endif
5543
5544         /*
5545          * Verify that pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
5546          * In cases where someone has performed a copy for PITR, these directories
5547          * may have been excluded and need to be re-created.
5548          */
5549         ValidateXLOGDirectoryStructure();
5550
5551         /* Set up timeout handler needed to report startup progress. */
5552         if (!IsBootstrapProcessingMode())
5553                 RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
5554                                                 startup_progress_timeout_handler);
5555
5556         /*----------
5557          * If we previously crashed, perform a couple of actions:
5558          *
5559          * - The pg_wal directory may still include some temporary WAL segments
5560          *   used when creating a new segment, so perform some clean up to not
5561          *   bloat this path.  This is done first as there is no point to sync
5562          *   this temporary data.
5563          *
5564          * - There might be data which we had written, intending to fsync it, but
5565          *   which we had not actually fsync'd yet.  Therefore, a power failure in
5566          *   the near future might cause earlier unflushed writes to be lost, even
5567          *   though more recent data written to disk from here on would be
5568          *   persisted.  To avoid that, fsync the entire data directory.
5569          */
5570         if (ControlFile->state != DB_SHUTDOWNED &&
5571                 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
5572         {
5573                 RemoveTempXlogFiles();
5574                 SyncDataDirectory();
5575                 didCrash = true;
5576         }
5577         else
5578                 didCrash = false;
5579
5580         /*
5581          * Prepare for WAL recovery if needed.
5582          *
5583          * InitWalRecovery analyzes the control file and the backup label file, if
5584          * any.  It updates the in-memory ControlFile buffer according to the
5585          * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
5586          * It also applies the tablespace map file, if any.
5587          */
5588         InitWalRecovery(ControlFile, &wasShutdown,
5589                                         &haveBackupLabel, &haveTblspcMap);
5590         checkPoint = ControlFile->checkPointCopy;
5591
5592         /* initialize shared memory variables from the checkpoint record */
5593         TransamVariables->nextXid = checkPoint.nextXid;
5594         TransamVariables->nextOid = checkPoint.nextOid;
5595         TransamVariables->oidCount = 0;
5596         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5597         AdvanceOldestClogXid(checkPoint.oldestXid);
5598         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5599         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5600         SetCommitTsLimit(checkPoint.oldestCommitTsXid,
5601                                          checkPoint.newestCommitTsXid);
5602         XLogCtl->ckptFullXid = checkPoint.nextXid;
5603
5604         /*
5605          * Clear out any old relcache cache files.  This is *necessary* if we do
5606          * any WAL replay, since that would probably result in the cache files
5607          * being out of sync with database reality.  In theory we could leave them
5608          * in place if the database had been cleanly shut down, but it seems
5609          * safest to just remove them always and let them be rebuilt during the
5610          * first backend startup.  These files need to be removed from all
5611          * directories including pg_tblspc; however, the symlinks are created
5612          * only after reading the tablespace_map file in case of archive recovery
5613          * from backup, so we need to clear old relcache files here, after
5614          * creating the symlinks.
5615          */
5616         RelationCacheInitFileRemove();
5617
5618         /*
5619          * Initialize replication slots, before there's a chance to remove
5620          * required resources.
5621          */
5622         StartupReplicationSlots();
5623
5624         /*
5625          * Startup logical state, needs to be setup now so we have proper data
5626          * during crash recovery.
5627          */
5628         StartupReorderBuffer();
5629
5630         /*
5631          * Startup CLOG. This must be done after TransamVariables->nextXid has
5632          * been initialized and before we accept connections or begin WAL replay.
5633          */
5634         StartupCLOG();
5635
5636         /*
5637          * Startup MultiXact. We need to do this early to be able to replay
5638          * truncations.
5639          */
5640         StartupMultiXact();
5641
5642         /*
5643          * Ditto for commit timestamps.  Activate the facility if the setting is
5644          * enabled in the control file, as there should be no tracking of commit
5645          * timestamps done when the setting was disabled.  This facility can be
5646          * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
5647          */
5648         if (ControlFile->track_commit_timestamp)
5649                 StartupCommitTs();
5650
5651         /*
5652          * Recover knowledge about replay progress of known replication partners.
5653          */
5654         StartupReplicationOrigin();
5655
5656         /*
5657          * Initialize unlogged LSN. On a clean shutdown, it's restored from the
5658          * control file. On recovery, all unlogged relations are blown away, so
5659          * the unlogged LSN counter can be reset too.
5660          */
5661         if (ControlFile->state == DB_SHUTDOWNED)
5662                 pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
5663                                                                            ControlFile->unloggedLSN);
5664         else
5665                 pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
5666                                                                            FirstNormalUnloggedLSN);
5667
5668         /*
5669          * Copy any missing timeline history files between 'now' and the recovery
5670          * target timeline from archive to pg_wal. While we don't need those files
5671          * ourselves - the history file of the recovery target timeline covers all
5672          * the previous timelines in the history too - a cascading standby server
5673          * might be interested in them. Or, if you archive the WAL from this
5674          * server to a different archive than the primary, it'd be good for all
5675          * the history files to get archived there after failover, so that you can
5676          * use one of the old timelines as a PITR target. Timeline history files
5677          * are small, so it's better to copy them unnecessarily than not copy them
5678          * and regret later.
5679          */
5680         restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
5681
5682         /*
5683          * Before running in recovery, scan pg_twophase and fill in its status to
5684          * be able to work on entries generated by redo.  Doing a scan before
5685          * taking any recovery action has the merit to discard any 2PC files that
5686          * are newer than the first record to replay, saving from any conflicts at
5687          * replay.  This avoids as well any subsequent scans when doing recovery
5688          * of the on-disk two-phase data.
5689          */
5690         restoreTwoPhaseData();
5691
5692         /*
5693          * When starting with crash recovery, reset pgstat data - it might not be
5694          * valid. Otherwise restore pgstat data. It's safe to do this here,
5695          * because postmaster will not yet have started any other processes.
5696          *
5697          * NB: Restoring replication slot stats relies on slot state to have
5698          * already been restored from disk.
5699          *
5700          * TODO: With a bit of extra work we could just start with a pgstat file
5701          * associated with the checkpoint redo location we're starting from.
5702          */
5703         if (didCrash)
5704                 pgstat_discard_stats();
5705         else
5706                 pgstat_restore_stats(checkPoint.redo);
5707
5708         lastFullPageWrites = checkPoint.fullPageWrites;
5709
5710         RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5711         doPageWrites = lastFullPageWrites;
5712
5713         /* REDO */
5714         if (InRecovery)
5715         {
5716                 /* Initialize state for RecoveryInProgress() */
5717                 SpinLockAcquire(&XLogCtl->info_lck);
5718                 if (InArchiveRecovery)
5719                         XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
5720                 else
5721                         XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
5722                 SpinLockRelease(&XLogCtl->info_lck);
5723
5724                 /*
5725                  * Update pg_control to show that we are recovering and to show the
5726                  * selected checkpoint as the place we are starting from. We also mark
5727                  * pg_control with any minimum recovery stop point obtained from a
5728                  * backup history file.
5729                  *
5730                  * No need to hold ControlFileLock yet, we aren't up far enough.
5731                  */
5732                 UpdateControlFile();
5733
5734                 /*
5735                  * If there was a backup label file, it's done its job and the info
5736                  * has now been propagated into pg_control.  We must get rid of the
5737                  * label file so that if we crash during recovery, we'll pick up at
5738                  * the latest recovery restartpoint instead of going all the way back
5739                  * to the backup start point.  It seems prudent though to just rename
5740                  * the file out of the way rather than delete it completely.
5741                  */
5742                 if (haveBackupLabel)
5743                 {
5744                         unlink(BACKUP_LABEL_OLD);
5745                         durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
5746                 }
5747
5748                 /*
5749                  * If there was a tablespace_map file, it's done its job and the
5750                  * symlinks have been created.  We must get rid of the map file so
5751                  * that if we crash during recovery, we don't create symlinks again.
5752                  * It seems prudent though to just rename the file out of the way
5753                  * rather than delete it completely.
5754                  */
5755                 if (haveTblspcMap)
5756                 {
5757                         unlink(TABLESPACE_MAP_OLD);
5758                         durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
5759                 }
5760
5761                 /*
5762                  * Initialize our local copy of minRecoveryPoint.  When doing crash
5763                  * recovery we want to replay up to the end of WAL.  Particularly, in
5764                  * the case of a promoted standby minRecoveryPoint value in the
5765                  * control file is only updated after the first checkpoint.  However,
5766                  * if the instance crashes before the first post-recovery checkpoint
5767                  * is completed then recovery will use a stale location causing the
5768                  * startup process to think that there are still invalid page
5769                  * references when checking for data consistency.
5770                  */
5771                 if (InArchiveRecovery)
5772                 {
5773                         LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
5774                         LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
5775                 }
5776                 else
5777                 {
5778                         LocalMinRecoveryPoint = InvalidXLogRecPtr;
5779                         LocalMinRecoveryPointTLI = 0;
5780                 }
5781
5782                 /* Check that the GUCs used to generate the WAL allow recovery */
5783                 CheckRequiredParameterValues();
5784
5785                 /*
5786                  * We're in recovery, so unlogged relations may be trashed and must be
5787                  * reset.  This should be done BEFORE allowing Hot Standby
5788                  * connections, so that read-only backends don't try to read whatever
5789                  * garbage is left over from before.
5790                  */
5791                 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
5792
5793                 /*
5794                  * Likewise, delete any saved transaction snapshot files that got left
5795                  * behind by crashed backends.
5796                  */
5797                 DeleteAllExportedSnapshotFiles();
5798
5799                 /*
5800                  * Initialize for Hot Standby, if enabled. We won't let backends in
5801                  * yet, not until we've reached the min recovery point specified in
5802                  * control file and we've established a recovery snapshot from a
5803                  * running-xacts WAL record.
5804                  */
5805                 if (ArchiveRecoveryRequested && EnableHotStandby)
5806                 {
5807                         TransactionId *xids;
5808                         int                     nxids;
5809
5810                         ereport(DEBUG1,
5811                                         (errmsg_internal("initializing for hot standby")));
5812
5813                         InitRecoveryTransactionEnvironment();
5814
5815                         if (wasShutdown)
5816                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
5817                         else
5818                                 oldestActiveXID = checkPoint.oldestActiveXid;
5819                         Assert(TransactionIdIsValid(oldestActiveXID));
5820
5821                         /* Tell procarray about the range of xids it has to deal with */
5822                         ProcArrayInitRecovery(XidFromFullTransactionId(TransamVariables->nextXid));
5823
5824                         /*
5825                          * Startup subtrans only.  CLOG, MultiXact and commit timestamp
5826                          * have already been started up and other SLRUs are not maintained
5827                          * during recovery and need not be started yet.
5828                          */
5829                         StartupSUBTRANS(oldestActiveXID);
5830
5831                         /*
5832                          * If we're beginning at a shutdown checkpoint, we know that
5833                          * nothing was running on the primary at this point. So fake-up an
5834                          * empty running-xacts record and use that here and now. Recover
5835                          * additional standby state for prepared transactions.
5836                          */
5837                         if (wasShutdown)
5838                         {
5839                                 RunningTransactionsData running;
5840                                 TransactionId latestCompletedXid;
5841
5842                                 /* Update pg_subtrans entries for any prepared transactions */
5843                                 StandbyRecoverPreparedTransactions();
5844
5845                                 /*
5846                                  * Construct a RunningTransactions snapshot representing a
5847                                  * shut down server, with only prepared transactions still
5848                                  * alive. We're never overflowed at this point because all
5849                                  * subxids are listed with their parent prepared transactions.
5850                                  */
5851                                 running.xcnt = nxids;
5852                                 running.subxcnt = 0;
5853                                 running.subxid_status = SUBXIDS_IN_SUBTRANS;
5854                                 running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
5855                                 running.oldestRunningXid = oldestActiveXID;
5856                                 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
5857                                 TransactionIdRetreat(latestCompletedXid);
5858                                 Assert(TransactionIdIsNormal(latestCompletedXid));
5859                                 running.latestCompletedXid = latestCompletedXid;
5860                                 running.xids = xids;
5861
5862                                 ProcArrayApplyRecoveryInfo(&running);
5863                         }
5864                 }
5865
5866                 /*
5867                  * We're all set for replaying the WAL now. Do it.
5868                  */
5869                 PerformWalRecovery();
5870                 performedWalRecovery = true;
5871         }
5872         else
5873                 performedWalRecovery = false;
5874
5875         /*
5876          * Finish WAL recovery.
5877          */
5878         endOfRecoveryInfo = FinishWalRecovery();
5879         EndOfLog = endOfRecoveryInfo->endOfLog;
5880         EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
5881         abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
5882         missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
5883
5884         /*
5885          * Reset ps status display, so that no information related to recovery
5886          * shows up.
5887          */
5888         set_ps_display("");
5889
5890         /*
5891          * When recovering from a backup (we are in recovery, and archive recovery
5892          * was requested), complain if we did not roll forward far enough to reach
5893          * the point where the database is consistent.  For regular online
5894          * backup-from-primary, that means reaching the end-of-backup WAL record
5895          * (at which point we reset backupStartPoint to be Invalid), for
5896          * backup-from-replica (which can't inject records into the WAL stream),
5897          * that point is when we reach the minRecoveryPoint in pg_control (which
5898          * we purposefully copy last when backing up from a replica).  For
5899          * pg_rewind (which creates a backup_label with a method of "pg_rewind")
5900          * or snapshot-style backups (which don't), backupEndRequired will be set
5901          * to false.
5902          *
5903          * Note: it is indeed okay to look at the local variable
5904          * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
5905          * might be further ahead --- ControlFile->minRecoveryPoint cannot have
5906          * been advanced beyond the WAL we processed.
5907          */
5908         if (InRecovery &&
5909                 (EndOfLog < LocalMinRecoveryPoint ||
5910                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
5911         {
5912                 /*
5913                  * Ran off end of WAL before reaching end-of-backup WAL record, or
5914                  * minRecoveryPoint. That's a bad sign, indicating that you tried to
5915                  * recover from an online backup but never called pg_backup_stop(), or
5916                  * you didn't archive all the WAL needed.
5917                  */
5918                 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
5919                 {
5920                         if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired)
5921                                 ereport(FATAL,
5922                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
5923                                                  errmsg("WAL ends before end of online backup"),
5924                                                  errhint("All WAL generated while online backup was taken must be available at recovery.")));
5925                         else
5926                                 ereport(FATAL,
5927                                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
5928                                                  errmsg("WAL ends before consistent recovery point")));
5929                 }
5930         }
5931
5932         /*
5933          * Reset unlogged relations to the contents of their INIT fork. This is
5934          * done AFTER recovery is complete so as to include any unlogged relations
5935          * created during recovery, but BEFORE recovery is marked as having
5936          * completed successfully. Otherwise we'd not retry if any of the post
5937          * end-of-recovery steps fail.
5938          */
5939         if (InRecovery)
5940                 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
5941
5942         /*
5943          * Pre-scan prepared transactions to find out the range of XIDs present.
5944          * This information is not quite needed yet, but it is positioned here so
5945          * that potential problems are detected before any on-disk change is done.
5946          */
5947         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
5948
5949         /*
5950          * Allow ordinary WAL segment creation before possibly switching to a new
5951          * timeline, which creates a new segment, and after the last ReadRecord().
5952          */
5953         SetInstallXLogFileSegmentActive();
5954
5955         /*
5956          * Consider whether we need to assign a new timeline ID.
5957          *
5958          * If we did archive recovery, we always assign a new ID.  This handles a
5959          * couple of issues.  If we stopped short of the end of WAL during
5960          * recovery, then we are clearly generating a new timeline and must assign
5961          * it a unique new ID.  Even if we ran to the end, modifying the current
5962          * last segment is problematic because it may result in trying to
5963          * overwrite an already-archived copy of that segment, and we encourage
5964          * DBAs to make their archive_commands reject that.  We can dodge the
5965          * problem by making the new active segment have a new timeline ID.
5966          *
5967          * In a normal crash recovery, we can just extend the timeline we were in.
5968          */
5969         newTLI = endOfRecoveryInfo->lastRecTLI;
5970         if (ArchiveRecoveryRequested)
5971         {
5972                 newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
5973                 ereport(LOG,
5974                                 (errmsg("selected new timeline ID: %u", newTLI)));
5975
5976                 /*
5977                  * Make a writable copy of the last WAL segment.  (Note that we also
5978                  * have a copy of the last block of the old WAL in
5979                  * endOfRecoveryInfo->lastPage; we will use that below.)
5980                  */
5981                 XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
5982
5983                 /*
5984                  * Remove the signal files out of the way, so that we don't
5985                  * accidentally re-enter archive recovery mode in a subsequent crash.
5986                  */
5987                 if (endOfRecoveryInfo->standby_signal_file_found)
5988                         durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
5989
5990                 if (endOfRecoveryInfo->recovery_signal_file_found)
5991                         durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
5992
5993                 /*
5994                  * Write the timeline history file, and have it archived. After this
5995                  * point (or rather, as soon as the file is archived), the timeline
5996                  * will appear as "taken" in the WAL archive and to any standby
5997                  * servers.  If we crash before actually switching to the new
5998                  * timeline, standby servers will nevertheless think that we switched
5999                  * to the new timeline, and will try to connect to the new timeline.
6000                  * To minimize the window for that, try to do as little as possible
6001                  * between here and writing the end-of-recovery record.
6002                  */
6003                 writeTimeLineHistory(newTLI, recoveryTargetTLI,
6004                                                          EndOfLog, endOfRecoveryInfo->recoveryStopReason);
6005
6006                 ereport(LOG,
6007                                 (errmsg("archive recovery complete")));
6008         }
6009
6010         /* Save the selected TimeLineID in shared memory, too */
6011         SpinLockAcquire(&XLogCtl->info_lck);
6012         XLogCtl->InsertTimeLineID = newTLI;
6013         XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
6014         SpinLockRelease(&XLogCtl->info_lck);
6015
6016         /*
6017          * Actually, if WAL ended in an incomplete record, skip the parts that
6018          * made it through and start writing after the portion that persisted.
6019          * (It's critical to first write an OVERWRITE_CONTRECORD message, which
6020          * we'll do as soon as we're open for writing new WAL.)
6021          */
6022         if (!XLogRecPtrIsInvalid(missingContrecPtr))
6023         {
6024                 /*
6025                  * We should only have a missingContrecPtr if we're not switching to a
6026                  * new timeline. When a timeline switch occurs, WAL is copied from the
6027                  * old timeline to the new only up to the end of the last complete
6028                  * record, so there can't be an incomplete WAL record that we need to
6029                  * disregard.
6030                  */
6031                 Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
6032                 Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
6033                 EndOfLog = missingContrecPtr;
6034         }
6035
6036         /*
6037          * Prepare to write WAL starting at EndOfLog location, and init xlog
6038          * buffer cache using the block containing the last record from the
6039          * previous incarnation.
6040          */
6041         Insert = &XLogCtl->Insert;
6042         Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
6043         Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
6044
6045         /*
6046          * Tricky point here: lastPage contains the *last* block that the LastRec
6047          * record spans, not the one it starts in.  The last block is indeed the
6048          * one we want to use.
6049          */
6050         if (EndOfLog % XLOG_BLCKSZ != 0)
6051         {
6052                 char       *page;
6053                 int                     len;
6054                 int                     firstIdx;
6055
6056                 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
6057                 len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
6058                 Assert(len < XLOG_BLCKSZ);
6059
6060                 /* Copy the valid part of the last block, and zero the rest */
6061                 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
6062                 memcpy(page, endOfRecoveryInfo->lastPage, len);
6063                 memset(page + len, 0, XLOG_BLCKSZ - len);
6064
6065                 pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
6066                 XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
6067         }
6068         else
6069         {
6070                 /*
6071                  * There is no partial block to copy. Just set InitializedUpTo, and
6072                  * let the first attempt to insert a log record to initialize the next
6073                  * buffer.
6074                  */
6075                 XLogCtl->InitializedUpTo = EndOfLog;
6076         }
6077
6078         /*
6079          * Update local and shared status.  This is OK to do without any locks
6080          * because no other process can be reading or writing WAL yet.
6081          */
6082         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
6083         pg_atomic_write_u64(&XLogCtl->logInsertResult, EndOfLog);
6084         pg_atomic_write_u64(&XLogCtl->logWriteResult, EndOfLog);
6085         pg_atomic_write_u64(&XLogCtl->logFlushResult, EndOfLog);
6086         XLogCtl->LogwrtRqst.Write = EndOfLog;
6087         XLogCtl->LogwrtRqst.Flush = EndOfLog;
6088
6089         /*
6090          * Preallocate additional log files, if wanted.
6091          */
6092         PreallocXlogFiles(EndOfLog, newTLI);
6093
6094         /*
6095          * Okay, we're officially UP.
6096          */
6097         InRecovery = false;
6098
6099         /* start the archive_timeout timer and LSN running */
6100         XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
6101         XLogCtl->lastSegSwitchLSN = EndOfLog;
6102
6103         /* also initialize latestCompletedXid, to nextXid - 1 */
6104         LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
6105         TransamVariables->latestCompletedXid = TransamVariables->nextXid;
6106         FullTransactionIdRetreat(&TransamVariables->latestCompletedXid);
6107         LWLockRelease(ProcArrayLock);
6108
6109         /*
6110          * Start up subtrans, if not already done for hot standby.  (commit
6111          * timestamps are started below, if necessary.)
6112          */
6113         if (standbyState == STANDBY_DISABLED)
6114                 StartupSUBTRANS(oldestActiveXID);
6115
6116         /*
6117          * Perform end of recovery actions for any SLRUs that need it.
6118          */
6119         TrimCLOG();
6120         TrimMultiXact();
6121
6122         /*
6123          * Reload shared-memory state for prepared transactions.  This needs to
6124          * happen before renaming the last partial segment of the old timeline as
6125          * it may be possible that we have to recover some transactions from it.
6126          */
6127         RecoverPreparedTransactions();
6128
6129         /* Shut down xlogreader */
6130         ShutdownWalRecovery();
6131
6132         /* Enable WAL writes for this backend only. */
6133         LocalSetXLogInsertAllowed();
6134
6135         /* If necessary, write overwrite-contrecord before doing anything else */
6136         if (!XLogRecPtrIsInvalid(abortedRecPtr))
6137         {
6138                 Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
6139                 CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
6140         }
6141
6142         /*
6143          * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
6144          * record before resource manager writes cleanup WAL records or checkpoint
6145          * record is written.
6146          */
6147         Insert->fullPageWrites = lastFullPageWrites;
6148         UpdateFullPageWrites();
6149
6150         /*
6151          * Emit checkpoint or end-of-recovery record in XLOG, if required.
6152          */
6153         if (performedWalRecovery)
6154                 promoted = PerformRecoveryXLogAction();
6155
6156         /*
6157          * If any of the critical GUCs have changed, log them before we allow
6158          * backends to write WAL.
6159          */
6160         XLogReportParameters();
6161
6162         /* If this is archive recovery, perform post-recovery cleanup actions. */
6163         if (ArchiveRecoveryRequested)
6164                 CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
6165
6166         /*
6167          * Local WAL inserts enabled, so it's time to finish initialization of
6168          * commit timestamp.
6169          */
6170         CompleteCommitTsInitialization();
6171
6172         /*
6173          * All done with end-of-recovery actions.
6174          *
6175          * Now allow backends to write WAL and update the control file status in
6176          * consequence.  SharedRecoveryState, that controls if backends can write
6177          * WAL, is updated while holding ControlFileLock to prevent other backends
6178          * to look at an inconsistent state of the control file in shared memory.
6179          * There is still a small window during which backends can write WAL and
6180          * the control file is still referring to a system not in DB_IN_PRODUCTION
6181          * state while looking at the on-disk control file.
6182          *
6183          * Also, we use info_lck to update SharedRecoveryState to ensure that
6184          * there are no race conditions concerning visibility of other recent
6185          * updates to shared memory.
6186          */
6187         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6188         ControlFile->state = DB_IN_PRODUCTION;
6189
6190         SpinLockAcquire(&XLogCtl->info_lck);
6191         XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
6192         SpinLockRelease(&XLogCtl->info_lck);
6193
6194         UpdateControlFile();
6195         LWLockRelease(ControlFileLock);
6196
6197         /*
6198          * Shutdown the recovery environment.  This must occur after
6199          * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
6200          * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
6201          * any session building a snapshot will not rely on KnownAssignedXids as
6202          * RecoveryInProgress() would return false at this stage.  This is
6203          * particularly critical for prepared 2PC transactions, that would still
6204          * need to be included in snapshots once recovery has ended.
6205          */
6206         if (standbyState != STANDBY_DISABLED)
6207                 ShutdownRecoveryTransactionEnvironment();
6208
6209         /*
6210          * If there were cascading standby servers connected to us, nudge any wal
6211          * sender processes to notice that we've been promoted.
6212          */
6213         WalSndWakeup(true, true);
6214
6215         /*
6216          * If this was a promotion, request an (online) checkpoint now. This isn't
6217          * required for consistency, but the last restartpoint might be far back,
6218          * and in case of a crash, recovering from it might take longer than is
6219          * appropriate now that we're not in standby mode anymore.
6220          */
6221         if (promoted)
6222                 RequestCheckpoint(CHECKPOINT_FORCE);
6223 }
6224
6225 /*
6226  * Callback from PerformWalRecovery(), called when we switch from crash
6227  * recovery to archive recovery mode.  Updates the control file accordingly.
6228  */
6229 void
6230 SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
6231 {
6232         /* initialize minRecoveryPoint to this record */
6233         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6234         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6235         if (ControlFile->minRecoveryPoint < EndRecPtr)
6236         {
6237                 ControlFile->minRecoveryPoint = EndRecPtr;
6238                 ControlFile->minRecoveryPointTLI = replayTLI;
6239         }
6240         /* update local copy */
6241         LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
6242         LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6243
6244         /*
6245          * The startup process can update its local copy of minRecoveryPoint from
6246          * this point.
6247          */
6248         updateMinRecoveryPoint = true;
6249
6250         UpdateControlFile();
6251
6252         /*
6253          * We update SharedRecoveryState while holding the lock on ControlFileLock
6254          * so both states are consistent in shared memory.
6255          */
6256         SpinLockAcquire(&XLogCtl->info_lck);
6257         XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
6258         SpinLockRelease(&XLogCtl->info_lck);
6259
6260         LWLockRelease(ControlFileLock);
6261 }
6262
6263 /*
6264  * Callback from PerformWalRecovery(), called when we reach the end of backup.
6265  * Updates the control file accordingly.
6266  */
6267 void
6268 ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
6269 {
6270         /*
6271          * We have reached the end of base backup, as indicated by pg_control. The
6272          * data on disk is now consistent (unless minRecoveryPoint is further
6273          * ahead, which can happen if we crashed during previous recovery).  Reset
6274          * backupStartPoint and backupEndPoint, and update minRecoveryPoint to
6275          * make sure we don't allow starting up at an earlier point even if
6276          * recovery is stopped and restarted soon after this.
6277          */
6278         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6279
6280         if (ControlFile->minRecoveryPoint < EndRecPtr)
6281         {
6282                 ControlFile->minRecoveryPoint = EndRecPtr;
6283                 ControlFile->minRecoveryPointTLI = tli;
6284         }
6285
6286         ControlFile->backupStartPoint = InvalidXLogRecPtr;
6287         ControlFile->backupEndPoint = InvalidXLogRecPtr;
6288         ControlFile->backupEndRequired = false;
6289         UpdateControlFile();
6290
6291         LWLockRelease(ControlFileLock);
6292 }
6293
6294 /*
6295  * Perform whatever XLOG actions are necessary at end of REDO.
6296  *
6297  * The goal here is to make sure that we'll be able to recover properly if
6298  * we crash again. If we choose to write a checkpoint, we'll write a shutdown
6299  * checkpoint rather than an on-line one. This is not particularly critical,
6300  * but since we may be assigning a new TLI, using a shutdown checkpoint allows
6301  * us to have the rule that TLI only changes in shutdown checkpoints, which
6302  * allows some extra error checking in xlog_redo.
6303  */
6304 static bool
6305 PerformRecoveryXLogAction(void)
6306 {
6307         bool            promoted = false;
6308
6309         /*
6310          * Perform a checkpoint to update all our recovery activity to disk.
6311          *
6312          * Note that we write a shutdown checkpoint rather than an on-line one.
6313          * This is not particularly critical, but since we may be assigning a new
6314          * TLI, using a shutdown checkpoint allows us to have the rule that TLI
6315          * only changes in shutdown checkpoints, which allows some extra error
6316          * checking in xlog_redo.
6317          *
6318          * In promotion, only create a lightweight end-of-recovery record instead
6319          * of a full checkpoint. A checkpoint is requested later, after we're
6320          * fully out of recovery mode and already accepting queries.
6321          */
6322         if (ArchiveRecoveryRequested && IsUnderPostmaster &&
6323                 PromoteIsTriggered())
6324         {
6325                 promoted = true;
6326
6327                 /*
6328                  * Insert a special WAL record to mark the end of recovery, since we
6329                  * aren't doing a checkpoint. That means that the checkpointer process
6330                  * may likely be in the middle of a time-smoothed restartpoint and
6331                  * could continue to be for minutes after this.  That sounds strange,
6332                  * but the effect is roughly the same and it would be stranger to try
6333                  * to come out of the restartpoint and then checkpoint. We request a
6334                  * checkpoint later anyway, just for safety.
6335                  */
6336                 CreateEndOfRecoveryRecord();
6337         }
6338         else
6339         {
6340                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
6341                                                   CHECKPOINT_IMMEDIATE |
6342                                                   CHECKPOINT_WAIT);
6343         }
6344
6345         return promoted;
6346 }
6347
6348 /*
6349  * Is the system still in recovery?
6350  *
6351  * Unlike testing InRecovery, this works in any process that's connected to
6352  * shared memory.
6353  */
6354 bool
6355 RecoveryInProgress(void)
6356 {
6357         /*
6358          * We check shared state each time only until we leave recovery mode. We
6359          * can't re-enter recovery, so there's no need to keep checking after the
6360          * shared variable has once been seen false.
6361          */
6362         if (!LocalRecoveryInProgress)
6363                 return false;
6364         else
6365         {
6366                 /*
6367                  * use volatile pointer to make sure we make a fresh read of the
6368                  * shared variable.
6369                  */
6370                 volatile XLogCtlData *xlogctl = XLogCtl;
6371
6372                 LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
6373
6374                 /*
6375                  * Note: We don't need a memory barrier when we're still in recovery.
6376                  * We might exit recovery immediately after return, so the caller
6377                  * can't rely on 'true' meaning that we're still in recovery anyway.
6378                  */
6379
6380                 return LocalRecoveryInProgress;
6381         }
6382 }
6383
6384 /*
6385  * Returns current recovery state from shared memory.
6386  *
6387  * This returned state is kept consistent with the contents of the control
6388  * file.  See details about the possible values of RecoveryState in xlog.h.
6389  */
6390 RecoveryState
6391 GetRecoveryState(void)
6392 {
6393         RecoveryState retval;
6394
6395         SpinLockAcquire(&XLogCtl->info_lck);
6396         retval = XLogCtl->SharedRecoveryState;
6397         SpinLockRelease(&XLogCtl->info_lck);
6398
6399         return retval;
6400 }
6401
6402 /*
6403  * Is this process allowed to insert new WAL records?
6404  *
6405  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
6406  * But we also have provisions for forcing the result "true" or "false"
6407  * within specific processes regardless of the global state.
6408  */
6409 bool
6410 XLogInsertAllowed(void)
6411 {
6412         /*
6413          * If value is "unconditionally true" or "unconditionally false", just
6414          * return it.  This provides the normal fast path once recovery is known
6415          * done.
6416          */
6417         if (LocalXLogInsertAllowed >= 0)
6418                 return (bool) LocalXLogInsertAllowed;
6419
6420         /*
6421          * Else, must check to see if we're still in recovery.
6422          */
6423         if (RecoveryInProgress())
6424                 return false;
6425
6426         /*
6427          * On exit from recovery, reset to "unconditionally true", since there is
6428          * no need to keep checking.
6429          */
6430         LocalXLogInsertAllowed = 1;
6431         return true;
6432 }
6433
6434 /*
6435  * Make XLogInsertAllowed() return true in the current process only.
6436  *
6437  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
6438  * and even call LocalSetXLogInsertAllowed() again after that.
6439  *
6440  * Returns the previous value of LocalXLogInsertAllowed.
6441  */
6442 static int
6443 LocalSetXLogInsertAllowed(void)
6444 {
6445         int                     oldXLogAllowed = LocalXLogInsertAllowed;
6446
6447         LocalXLogInsertAllowed = 1;
6448
6449         return oldXLogAllowed;
6450 }
6451
6452 /*
6453  * Return the current Redo pointer from shared memory.
6454  *
6455  * As a side-effect, the local RedoRecPtr copy is updated.
6456  */
6457 XLogRecPtr
6458 GetRedoRecPtr(void)
6459 {
6460         XLogRecPtr      ptr;
6461
6462         /*
6463          * The possibly not up-to-date copy in XlogCtl is enough. Even if we
6464          * grabbed a WAL insertion lock to read the authoritative value in
6465          * Insert->RedoRecPtr, someone might update it just after we've released
6466          * the lock.
6467          */
6468         SpinLockAcquire(&XLogCtl->info_lck);
6469         ptr = XLogCtl->RedoRecPtr;
6470         SpinLockRelease(&XLogCtl->info_lck);
6471
6472         if (RedoRecPtr < ptr)
6473                 RedoRecPtr = ptr;
6474
6475         return RedoRecPtr;
6476 }
6477
6478 /*
6479  * Return information needed to decide whether a modified block needs a
6480  * full-page image to be included in the WAL record.
6481  *
6482  * The returned values are cached copies from backend-private memory, and
6483  * possibly out-of-date or, indeed, uninitialized, in which case they will
6484  * be InvalidXLogRecPtr and false, respectively.  XLogInsertRecord will
6485  * re-check them against up-to-date values, while holding the WAL insert lock.
6486  */
6487 void
6488 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
6489 {
6490         *RedoRecPtr_p = RedoRecPtr;
6491         *doPageWrites_p = doPageWrites;
6492 }
6493
6494 /*
6495  * GetInsertRecPtr -- Returns the current insert position.
6496  *
6497  * NOTE: The value *actually* returned is the position of the last full
6498  * xlog page. It lags behind the real insert position by at most 1 page.
6499  * For that, we don't need to scan through WAL insertion locks, and an
6500  * approximation is enough for the current usage of this function.
6501  */
6502 XLogRecPtr
6503 GetInsertRecPtr(void)
6504 {
6505         XLogRecPtr      recptr;
6506
6507         SpinLockAcquire(&XLogCtl->info_lck);
6508         recptr = XLogCtl->LogwrtRqst.Write;
6509         SpinLockRelease(&XLogCtl->info_lck);
6510
6511         return recptr;
6512 }
6513
6514 /*
6515  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
6516  * position known to be fsync'd to disk. This should only be used on a
6517  * system that is known not to be in recovery.
6518  */
6519 XLogRecPtr
6520 GetFlushRecPtr(TimeLineID *insertTLI)
6521 {
6522         Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
6523
6524         RefreshXLogWriteResult(LogwrtResult);
6525
6526         /*
6527          * If we're writing and flushing WAL, the time line can't be changing, so
6528          * no lock is required.
6529          */
6530         if (insertTLI)
6531                 *insertTLI = XLogCtl->InsertTimeLineID;
6532
6533         return LogwrtResult.Flush;
6534 }
6535
6536 /*
6537  * GetWALInsertionTimeLine -- Returns the current timeline of a system that
6538  * is not in recovery.
6539  */
6540 TimeLineID
6541 GetWALInsertionTimeLine(void)
6542 {
6543         Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
6544
6545         /* Since the value can't be changing, no lock is required. */
6546         return XLogCtl->InsertTimeLineID;
6547 }
6548
6549 /*
6550  * GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
6551  * the WAL insertion timeline; else, returns 0. Wherever possible, use
6552  * GetWALInsertionTimeLine() instead, since it's cheaper. Note that this
6553  * function decides recovery has ended as soon as the insert TLI is set, which
6554  * happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
6555  */
6556 TimeLineID
6557 GetWALInsertionTimeLineIfSet(void)
6558 {
6559         TimeLineID      insertTLI;
6560
6561         SpinLockAcquire(&XLogCtl->info_lck);
6562         insertTLI = XLogCtl->InsertTimeLineID;
6563         SpinLockRelease(&XLogCtl->info_lck);
6564
6565         return insertTLI;
6566 }
6567
6568 /*
6569  * GetLastImportantRecPtr -- Returns the LSN of the last important record
6570  * inserted. All records not explicitly marked as unimportant are considered
6571  * important.
6572  *
6573  * The LSN is determined by computing the maximum of
6574  * WALInsertLocks[i].lastImportantAt.
6575  */
6576 XLogRecPtr
6577 GetLastImportantRecPtr(void)
6578 {
6579         XLogRecPtr      res = InvalidXLogRecPtr;
6580         int                     i;
6581
6582         for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
6583         {
6584                 XLogRecPtr      last_important;
6585
6586                 /*
6587                  * Need to take a lock to prevent torn reads of the LSN, which are
6588                  * possible on some of the supported platforms. WAL insert locks only
6589                  * support exclusive mode, so we have to use that.
6590                  */
6591                 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
6592                 last_important = WALInsertLocks[i].l.lastImportantAt;
6593                 LWLockRelease(&WALInsertLocks[i].l.lock);
6594
6595                 if (res < last_important)
6596                         res = last_important;
6597         }
6598
6599         return res;
6600 }
6601
6602 /*
6603  * Get the time and LSN of the last xlog segment switch
6604  */
6605 pg_time_t
6606 GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
6607 {
6608         pg_time_t       result;
6609
6610         /* Need WALWriteLock, but shared lock is sufficient */
6611         LWLockAcquire(WALWriteLock, LW_SHARED);
6612         result = XLogCtl->lastSegSwitchTime;
6613         *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
6614         LWLockRelease(WALWriteLock);
6615
6616         return result;
6617 }
6618
6619 /*
6620  * This must be called ONCE during postmaster or standalone-backend shutdown
6621  */
6622 void
6623 ShutdownXLOG(int code, Datum arg)
6624 {
6625         /*
6626          * We should have an aux process resource owner to use, and we should not
6627          * be in a transaction that's installed some other resowner.
6628          */
6629         Assert(AuxProcessResourceOwner != NULL);
6630         Assert(CurrentResourceOwner == NULL ||
6631                    CurrentResourceOwner == AuxProcessResourceOwner);
6632         CurrentResourceOwner = AuxProcessResourceOwner;
6633
6634         /* Don't be chatty in standalone mode */
6635         ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6636                         (errmsg("shutting down")));
6637
6638         /*
6639          * Signal walsenders to move to stopping state.
6640          */
6641         WalSndInitStopping();
6642
6643         /*
6644          * Wait for WAL senders to be in stopping state.  This prevents commands
6645          * from writing new WAL.
6646          */
6647         WalSndWaitStopping();
6648
6649         if (RecoveryInProgress())
6650                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6651         else
6652         {
6653                 /*
6654                  * If archiving is enabled, rotate the last XLOG file so that all the
6655                  * remaining records are archived (postmaster wakes up the archiver
6656                  * process one more time at the end of shutdown). The checkpoint
6657                  * record will go to the next XLOG file and won't be archived (yet).
6658                  */
6659                 if (XLogArchivingActive())
6660                         RequestXLogSwitch(false);
6661
6662                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6663         }
6664 }
6665
6666 /*
6667  * Log start of a checkpoint.
6668  */
6669 static void
6670 LogCheckpointStart(int flags, bool restartpoint)
6671 {
6672         if (restartpoint)
6673                 ereport(LOG,
6674                 /* translator: the placeholders show checkpoint options */
6675                                 (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
6676                                                 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6677                                                 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6678                                                 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
6679                                                 (flags & CHECKPOINT_FORCE) ? " force" : "",
6680                                                 (flags & CHECKPOINT_WAIT) ? " wait" : "",
6681                                                 (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
6682                                                 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
6683                                                 (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
6684         else
6685                 ereport(LOG,
6686                 /* translator: the placeholders show checkpoint options */
6687                                 (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
6688                                                 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6689                                                 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6690                                                 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
6691                                                 (flags & CHECKPOINT_FORCE) ? " force" : "",
6692                                                 (flags & CHECKPOINT_WAIT) ? " wait" : "",
6693                                                 (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
6694                                                 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
6695                                                 (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
6696 }
6697
6698 /*
6699  * Log end of a checkpoint.
6700  */
6701 static void
6702 LogCheckpointEnd(bool restartpoint)
6703 {
6704         long            write_msecs,
6705                                 sync_msecs,
6706                                 total_msecs,
6707                                 longest_msecs,
6708                                 average_msecs;
6709         uint64          average_sync_time;
6710
6711         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
6712
6713         write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
6714                                                                                                   CheckpointStats.ckpt_sync_t);
6715
6716         sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
6717                                                                                                  CheckpointStats.ckpt_sync_end_t);
6718
6719         /* Accumulate checkpoint timing summary data, in milliseconds. */
6720         PendingCheckpointerStats.write_time += write_msecs;
6721         PendingCheckpointerStats.sync_time += sync_msecs;
6722
6723         /*
6724          * All of the published timing statistics are accounted for.  Only
6725          * continue if a log message is to be written.
6726          */
6727         if (!log_checkpoints)
6728                 return;
6729
6730         total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
6731                                                                                                   CheckpointStats.ckpt_end_t);
6732
6733         /*
6734          * Timing values returned from CheckpointStats are in microseconds.
6735          * Convert to milliseconds for consistent printing.
6736          */
6737         longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
6738
6739         average_sync_time = 0;
6740         if (CheckpointStats.ckpt_sync_rels > 0)
6741                 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
6742                         CheckpointStats.ckpt_sync_rels;
6743         average_msecs = (long) ((average_sync_time + 999) / 1000);
6744
6745         /*
6746          * ControlFileLock is not required to see ControlFile->checkPoint and
6747          * ->checkPointCopy here as we are the only updator of those variables at
6748          * this moment.
6749          */
6750         if (restartpoint)
6751                 ereport(LOG,
6752                                 (errmsg("restartpoint complete: wrote %d buffers (%.1f%%), "
6753                                                 "wrote %d SLRU buffers; %d WAL file(s) added, "
6754                                                 "%d removed, %d recycled; write=%ld.%03d s, "
6755                                                 "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
6756                                                 "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
6757                                                 "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X",
6758                                                 CheckpointStats.ckpt_bufs_written,
6759                                                 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6760                                                 CheckpointStats.ckpt_slru_written,
6761                                                 CheckpointStats.ckpt_segs_added,
6762                                                 CheckpointStats.ckpt_segs_removed,
6763                                                 CheckpointStats.ckpt_segs_recycled,
6764                                                 write_msecs / 1000, (int) (write_msecs % 1000),
6765                                                 sync_msecs / 1000, (int) (sync_msecs % 1000),
6766                                                 total_msecs / 1000, (int) (total_msecs % 1000),
6767                                                 CheckpointStats.ckpt_sync_rels,
6768                                                 longest_msecs / 1000, (int) (longest_msecs % 1000),
6769                                                 average_msecs / 1000, (int) (average_msecs % 1000),
6770                                                 (int) (PrevCheckPointDistance / 1024.0),
6771                                                 (int) (CheckPointDistanceEstimate / 1024.0),
6772                                                 LSN_FORMAT_ARGS(ControlFile->checkPoint),
6773                                                 LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
6774         else
6775                 ereport(LOG,
6776                                 (errmsg("checkpoint complete: wrote %d buffers (%.1f%%), "
6777                                                 "wrote %d SLRU buffers; %d WAL file(s) added, "
6778                                                 "%d removed, %d recycled; write=%ld.%03d s, "
6779                                                 "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
6780                                                 "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
6781                                                 "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X",
6782                                                 CheckpointStats.ckpt_bufs_written,
6783                                                 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6784                                                 CheckpointStats.ckpt_slru_written,
6785                                                 CheckpointStats.ckpt_segs_added,
6786                                                 CheckpointStats.ckpt_segs_removed,
6787                                                 CheckpointStats.ckpt_segs_recycled,
6788                                                 write_msecs / 1000, (int) (write_msecs % 1000),
6789                                                 sync_msecs / 1000, (int) (sync_msecs % 1000),
6790                                                 total_msecs / 1000, (int) (total_msecs % 1000),
6791                                                 CheckpointStats.ckpt_sync_rels,
6792                                                 longest_msecs / 1000, (int) (longest_msecs % 1000),
6793                                                 average_msecs / 1000, (int) (average_msecs % 1000),
6794                                                 (int) (PrevCheckPointDistance / 1024.0),
6795                                                 (int) (CheckPointDistanceEstimate / 1024.0),
6796                                                 LSN_FORMAT_ARGS(ControlFile->checkPoint),
6797                                                 LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
6798 }
6799
6800 /*
6801  * Refresh the running estimate of the WAL distance between checkpoints
6802  * from 'nbytes', the WAL consumed by the checkpoint cycle that just ended.
6803  * The estimate feeds the calculation of how many WAL segments to keep
6804  * preallocated; see XLOGfileslop().
6805  *
6806  * The estimate is deliberately asymmetric.  A plain moving average would
6807  * suit a steady load, but if the load is bursty, with quiet periods and
6808  * busy periods, we want to cater for the peak.  So a cycle that used more
6809  * WAL than estimated raises the estimate immediately, while a cycle that
6810  * used less only lets it decline slowly.
6811  *
6812  * When checkpoints are triggered by max_wal_size, this should converge to
6813  * CheckpointSegments * wal_segment_size.
6814  *
6815  * The cause of the checkpoint is ignored.  Checkpoints requested manually
6816  * with the CHECKPOINT command, or by e.g. starting a base backup, count the
6817  * same as automatic ones.  If they are infrequent, the slow decline largely
6818  * masks them out.  If they are frequent, counting them is reasonable: with
6819  * a manual checkpoint every 5 minutes and no timed checkpoint ever running,
6820  * preallocation should be based on that 5 minute interval rather than on
6821  * whatever checkpoint_timeout is set to.
6822  */
6823 static void
6824 UpdateCheckPointDistanceEstimate(uint64 nbytes)
6825 {
6826         /* Remember the latest cycle's distance as-is. */
6827         PrevCheckPointDistance = nbytes;
6828         /* Busier than estimated: jump straight to the observed value. */
6829         if (nbytes > CheckPointDistanceEstimate)
6830         {
6831                 CheckPointDistanceEstimate = nbytes;
6832                 return;
6833         }
6834         /* Otherwise decay: 90% of the old estimate plus 10% of the new value. */
6835         CheckPointDistanceEstimate =
6836                 (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
6837 }
6838
6839 /*
6840  * Set or clear the ps display of a process running a checkpoint or
6841  * restartpoint.  This can be called from a critical section, so it must
6842  * not allocate; the message is formatted into a fixed-size local buffer.
6843  *
6844  * Only end-of-recovery and shutdown checkpoints, and shutdown restartpoints,
6845  * are reported; for any other flags this does nothing.  Those are the cases
6846  * where it may not be possible to rely on pg_stat_activity to see the status
6847  * of the checkpointer or the startup process, so the ps display is useful.
6848  */
6849 static void
6850 update_checkpoint_display(int flags, bool restartpoint, bool reset)
6851 {
6852         bool            end_of_recovery = (flags & CHECKPOINT_END_OF_RECOVERY) != 0;
6853         bool            is_shutdown = (flags & CHECKPOINT_IS_SHUTDOWN) != 0;
6854         char            title[128];
6855
6856         if (!end_of_recovery && !is_shutdown)
6857                 return;
6858         if (reset)
6859         {
6860                 set_ps_display("");     /* blank the title */
6861                 return;
6862         }
6863
6864         snprintf(title, sizeof(title), "performing %s%s%s",
6865                          end_of_recovery ? "end-of-recovery " : "",
6866                          is_shutdown ? "shutdown " : "",
6867                          restartpoint ? "restartpoint" : "checkpoint");
6868         set_ps_display(title);
6869 }
6870
6871
6872 /*
6873  * Perform a checkpoint --- either during shutdown, or on-the-fly
6874  *
6875  * flags is a bitwise OR of the following:
6876  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
6877  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
6878  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
6879  *              ignoring checkpoint_completion_target parameter.
6880  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
6881  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
6882  *              CHECKPOINT_END_OF_RECOVERY).
6883  *      CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
6884  *
6885  * Note: flags contains other bits, of interest here only for logging purposes.
6886  * In particular note that this routine is synchronous and does not pay
6887  * attention to CHECKPOINT_WAIT.
6888  *
6889  * If !shutdown then we are writing an online checkpoint. An XLOG_CHECKPOINT_REDO
6890  * record is inserted into WAL at the logical location of the checkpoint, before
6891  * flushing anything to disk; once the checkpoint is eventually completed, it is
6892  * from this point that WAL replay will begin in the case of a recovery from
6893  * this checkpoint. Once everything is written to disk, an
6894  * XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint, and
6895  * points back to the earlier XLOG_CHECKPOINT_REDO record. This mechanism allows
6896  * other write-ahead log records to be written while the checkpoint is in
6897  * progress, but we must be very careful about order of operations. This function
6898  * may take many minutes to execute on a busy system.
6899  *
6900  * On the other hand, when shutdown is true, concurrent insertion into the
6901  * write-ahead log is impossible, so there is no need for two separate records.
6902  * In this case, we only insert an XLOG_CHECKPOINT_SHUTDOWN record, and it's
6903  * both the record marking the completion of the checkpoint and the location
6904  * from which WAL replay would begin if needed.
6905  *
6906  * Returns true if a new checkpoint was performed, or false if it was skipped
6907  * because the system was idle.
6908  */
6909 bool
6910 CreateCheckPoint(int flags)
6911 {
6912         bool            shutdown;       /* shutdown or end-of-recovery checkpoint? */
6913         CheckPoint      checkPoint;     /* payload of the checkpoint record */
6914         XLogRecPtr      recptr;         /* end of the checkpoint record */
6915         XLogSegNo       _logSegNo;      /* old-segment horizon for WAL removal */
6916         XLogCtlInsert *Insert = &XLogCtl->Insert;
6917         uint32          freespace;      /* INSERT_FREESPACE() at the insert position */
6918         XLogRecPtr      PriorRedoPtr;   /* redo ptr of the prior checkpoint */
6919         XLogRecPtr      last_important_lsn; /* from GetLastImportantRecPtr() */
6920         VirtualTransactionId *vxids;    /* vxids currently delaying the checkpoint */
6921         int                     nvxids; /* number of entries in vxids */
6922         int                     oldXLogAllowed = 0; /* LocalSetXLogInsertAllowed() result */
6923
6924         /*
6925          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
6926          * issued at a different time.
6927          */
6928         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
6929                 shutdown = true;
6930         else
6931                 shutdown = false;
6932
6933         /* sanity check */
6934         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
6935                 elog(ERROR, "can't create a checkpoint during recovery");
6936
6937         /*
6938          * Prepare to accumulate statistics.
6939          *
6940          * Note: because it is possible for log_checkpoints to change while a
6941          * checkpoint proceeds, we always accumulate stats, even if
6942          * log_checkpoints is currently off.
6943          */
6944         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
6945         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
6946
6947         /*
6948          * Let smgr prepare for checkpoint; this has to happen outside the
6949          * critical section and before we determine the REDO pointer.  Note that
6950          * smgr must not do anything that'd have to be undone if we decide no
6951          * checkpoint is needed.
6952          */
6953         SyncPreCheckpoint();
6954
6955         /*
6956          * Use a critical section to force system panic if we have trouble.
6957          */
6958         START_CRIT_SECTION();
6959
6960         if (shutdown)
6961         {
6962                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6963                 ControlFile->state = DB_SHUTDOWNING;
6964                 UpdateControlFile();
6965                 LWLockRelease(ControlFileLock);
6966         }
6967
6968         /* Begin filling in the checkpoint WAL record */
6969         MemSet(&checkPoint, 0, sizeof(checkPoint));
6970         checkPoint.time = (pg_time_t) time(NULL);
6971
6972         /*
6973          * For Hot Standby, derive the oldestActiveXid before we fix the redo
6974          * pointer. This allows us to begin accumulating changes to assemble our
6975          * starting snapshot of locks and transactions.
6976          */
6977         if (!shutdown && XLogStandbyInfoActive())
6978                 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
6979         else
6980                 checkPoint.oldestActiveXid = InvalidTransactionId;
6981
6982         /*
6983          * Get location of last important record before acquiring insert locks (as
6984          * GetLastImportantRecPtr() also locks WAL locks).
6985          */
6986         last_important_lsn = GetLastImportantRecPtr();
6987
6988         /*
6989          * If this isn't a shutdown or forced checkpoint, and if there has been no
6990          * WAL activity requiring a checkpoint, skip it.  The idea here is to
6991          * avoid inserting duplicate checkpoints when the system is idle.
6992          */
6993         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
6994                                   CHECKPOINT_FORCE)) == 0)
6995         {
6996                 if (last_important_lsn == ControlFile->checkPoint)
6997                 {
6998                         END_CRIT_SECTION();
6999                         ereport(DEBUG1,
7000                                         (errmsg_internal("checkpoint skipped because system is idle")));
7001                         return false;
7002                 }
7003         }
7004
7005         /*
7006          * An end-of-recovery checkpoint is created before anyone is allowed to
7007          * write WAL. To allow us to write the checkpoint record, temporarily
7008          * enable XLogInsertAllowed.
7009          */
7010         if (flags & CHECKPOINT_END_OF_RECOVERY)
7011                 oldXLogAllowed = LocalSetXLogInsertAllowed();
7012
7013         checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
7014         if (flags & CHECKPOINT_END_OF_RECOVERY)
7015                 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
7016         else
7017                 checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
7018
7019         /*
7020          * We must block concurrent insertions while examining insert state.
7021          */
7022         WALInsertLockAcquireExclusive();
7023
7024         checkPoint.fullPageWrites = Insert->fullPageWrites;
7025         checkPoint.wal_level = wal_level;
7026
7027         if (shutdown)
7028         {
7029                 XLogRecPtr      curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
7030
7031                 /*
7032                  * Compute new REDO record ptr = location of next XLOG record.
7033                  *
7034                  * Since this is a shutdown checkpoint, there can't be any concurrent
7035                  * WAL insertion.
7036                  */
7037                 freespace = INSERT_FREESPACE(curInsert);
7038                 if (freespace == 0)
7039                 {
7040                         if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
7041                                 curInsert += SizeOfXLogLongPHD;
7042                         else
7043                                 curInsert += SizeOfXLogShortPHD;
7044                 }
7045                 checkPoint.redo = curInsert;
7046
7047                 /*
7048                  * Here we update the shared RedoRecPtr for future XLogInsert calls;
7049                  * this must be done while holding all the insertion locks.
7050                  *
7051                  * Note: if we fail to complete the checkpoint, RedoRecPtr will be
7052                  * left pointing past where it really needs to point.  This is okay;
7053                  * the only consequence is that XLogInsert might back up whole buffers
7054                  * that it didn't really need to.  We can't postpone advancing
7055                  * RedoRecPtr because XLogInserts that happen while we are dumping
7056                  * buffers must assume that their buffer changes are not included in
7057                  * the checkpoint.
7058                  */
7059                 RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
7060         }
7061
7062         /*
7063          * Now we can release the WAL insertion locks, allowing other xacts to
7064          * proceed while we are flushing disk buffers.
7065          */
7066         WALInsertLockRelease();
7067
7068         /*
7069          * If this is an online checkpoint, we have not yet determined the redo
7070          * point. We do so now by inserting the special XLOG_CHECKPOINT_REDO
7071          * record; the LSN at which it starts becomes the new redo pointer. We
7072          * don't do this for a shutdown checkpoint, because in that case no WAL
7073          * can be written between the redo point and the insertion of the
7074          * checkpoint record itself, so the checkpoint record itself serves to
7075          * mark the redo point.
7076          */
7077         if (!shutdown)
7078         {
7079                 /* Include WAL level in record for WAL summarizer's benefit. */
7080                 XLogBeginInsert();
7081                 XLogRegisterData((char *) &wal_level, sizeof(wal_level));
7082                 (void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
7083
7084                 /*
7085                  * XLogInsertRecord will have updated XLogCtl->Insert.RedoRecPtr in
7086                  * shared memory and RedoRecPtr in backend-local memory, but we need
7087                  * to copy that into the record that will be inserted when the
7088                  * checkpoint is complete.
7089                  */
7090                 checkPoint.redo = RedoRecPtr;
7091         }
7092
7093         /* Update the info_lck-protected copy of RedoRecPtr as well */
7094         SpinLockAcquire(&XLogCtl->info_lck);
7095         XLogCtl->RedoRecPtr = checkPoint.redo;
7096         SpinLockRelease(&XLogCtl->info_lck);
7097
7098         /*
7099          * If enabled, log checkpoint start.  We postpone this until now so as not
7100          * to log anything if we decided to skip the checkpoint.
7101          */
7102         if (log_checkpoints)
7103                 LogCheckpointStart(flags, false);
7104
7105         /* Update the process title */
7106         update_checkpoint_display(flags, false, false);
7107
7108         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
7109
7110         /*
7111          * Get the other info we need for the checkpoint record.
7112          *
7113          * We don't need to save oldestClogXid in the checkpoint, it only matters
7114          * for the short period in which clog is being truncated, and if we crash
7115          * during that we'll redo the clog truncation and fix up oldestClogXid
7116          * there.
7117          */
7118         LWLockAcquire(XidGenLock, LW_SHARED);
7119         checkPoint.nextXid = TransamVariables->nextXid;
7120         checkPoint.oldestXid = TransamVariables->oldestXid;
7121         checkPoint.oldestXidDB = TransamVariables->oldestXidDB;
7122         LWLockRelease(XidGenLock);
7123
7124         LWLockAcquire(CommitTsLock, LW_SHARED);
7125         checkPoint.oldestCommitTsXid = TransamVariables->oldestCommitTsXid;
7126         checkPoint.newestCommitTsXid = TransamVariables->newestCommitTsXid;
7127         LWLockRelease(CommitTsLock);
7128
7129         LWLockAcquire(OidGenLock, LW_SHARED);
7130         checkPoint.nextOid = TransamVariables->nextOid;
7131         if (!shutdown)
7132                 checkPoint.nextOid += TransamVariables->oidCount;
7133         LWLockRelease(OidGenLock);
7134
7135         MultiXactGetCheckptMulti(shutdown,
7136                                                          &checkPoint.nextMulti,
7137                                                          &checkPoint.nextMultiOffset,
7138                                                          &checkPoint.oldestMulti,
7139                                                          &checkPoint.oldestMultiDB);
7140
7141         /*
7142          * Having constructed the checkpoint record, ensure all shmem disk buffers
7143          * and commit-log buffers are flushed to disk.
7144          *
7145          * This I/O could fail for various reasons.  If so, we will fail to
7146          * complete the checkpoint, but there is no reason to force a system
7147          * panic. Accordingly, exit critical section while doing it.
7148          */
7149         END_CRIT_SECTION();
7150
7151         /*
7152          * In some cases there are groups of actions that must all occur on one
7153          * side or the other of a checkpoint record. Before flushing the
7154          * checkpoint record we must explicitly wait for any backend currently
7155          * performing those groups of actions.
7156          *
7157          * One example is end of transaction, so we must wait for any transactions
7158          * that are currently in commit critical sections.  If an xact inserted
7159          * its commit record into XLOG just before the REDO point, then a crash
7160          * restart from the REDO point would not replay that record, which means
7161          * that our flushing had better include the xact's update of pg_xact.  So
7162          * we wait till it's out of its commit critical section before proceeding.
7163          * See notes in RecordTransactionCommit().
7164          *
7165          * Because we've already released the insertion locks, this test is a bit
7166          * fuzzy: it is possible that we will wait for xacts we didn't really need
7167          * to wait for.  But the delay should be short and it seems better to make
7168          * checkpoint take a bit longer than to hold off insertions longer than
7169          * necessary. (In fact, the whole reason we have this issue is that xact.c
7170          * does commit record XLOG insertion and clog update as two separate steps
7171          * protected by different locks, but again that seems best on grounds of
7172          * minimizing lock contention.)
7173          *
7174          * A transaction that has not yet set delayChkptFlags when we look cannot
7175          * be at risk, since it has not inserted its commit record yet; and one
7176          * that's already cleared it is not at risk either, since it's done fixing
7177          * clog and we will correctly flush the update below.  So we cannot miss
7178          * any xacts we need to wait for.
7179          */
7180         vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
7181         if (nvxids > 0)
7182         {
7183                 do
7184                 {
7185                         /*
7186                          * Keep absorbing fsync requests while we wait. There could even
7187                          * be a deadlock if we don't, if the process that prevents the
7188                          * checkpoint is trying to add a request to the queue.
7189                          */
7190                         AbsorbSyncRequests();
7191
7192                         pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_START);
7193                         pg_usleep(10000L);      /* wait for 10 msec */
7194                         pgstat_report_wait_end();
7195                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
7196                                                                                           DELAY_CHKPT_START));
7197         }
7198         pfree(vxids);
7199
7200         CheckPointGuts(checkPoint.redo, flags);
7201         /* Now wait for any vxids flagged DELAY_CHKPT_COMPLETE to finish too. */
7202         vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
7203         if (nvxids > 0)
7204         {
7205                 do
7206                 {
7207                         AbsorbSyncRequests();
7208
7209                         pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_COMPLETE);
7210                         pg_usleep(10000L);      /* wait for 10 msec */
7211                         pgstat_report_wait_end();
7212                 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
7213                                                                                           DELAY_CHKPT_COMPLETE));
7214         }
7215         pfree(vxids);
7216
7217         /*
7218          * Take a snapshot of running transactions and write this to WAL. This
7219          * allows us to reconstruct the state of running transactions during
7220          * archive recovery, if required. Skip if this info is disabled.
7221          *
7222          * If we are shutting down, or Startup process is completing crash
7223          * recovery we don't need to write running xact data.
7224          */
7225         if (!shutdown && XLogStandbyInfoActive())
7226                 LogStandbySnapshot();
7227
7228         START_CRIT_SECTION();
7229
7230         /*
7231          * Now insert the checkpoint record into XLOG.
7232          */
7233         XLogBeginInsert();
7234         XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
7235         recptr = XLogInsert(RM_XLOG_ID,
7236                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
7237                                                 XLOG_CHECKPOINT_ONLINE);
7238
7239         XLogFlush(recptr);
7240
7241         /*
7242          * We mustn't write any new WAL after a shutdown checkpoint, or it will be
7243          * overwritten at next startup.  No-one should even try, this just allows
7244          * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
7245          * to just temporarily disable writing until the system has exited
7246          * recovery.
7247          */
7248         if (shutdown)
7249         {
7250                 if (flags & CHECKPOINT_END_OF_RECOVERY)
7251                         LocalXLogInsertAllowed = oldXLogAllowed;
7252                 else
7253                         LocalXLogInsertAllowed = 0; /* never again write WAL */
7254         }
7255
7256         /*
7257          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
7258          * = end of actual checkpoint record.
7259          */
7260         if (shutdown && checkPoint.redo != ProcLastRecPtr)
7261                 ereport(PANIC,
7262                                 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
7263
7264         /*
7265          * Remember the prior checkpoint's redo ptr for
7266          * UpdateCheckPointDistanceEstimate()
7267          */
7268         PriorRedoPtr = ControlFile->checkPointCopy.redo;
7269
7270         /*
7271          * Update the control file.
7272          */
7273         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7274         if (shutdown)
7275                 ControlFile->state = DB_SHUTDOWNED;
7276         ControlFile->checkPoint = ProcLastRecPtr;
7277         ControlFile->checkPointCopy = checkPoint;
7278         /* crash recovery should always recover to the end of WAL */
7279         ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
7280         ControlFile->minRecoveryPointTLI = 0;
7281
7282         /*
7283          * Persist unloggedLSN value. It's reset on crash recovery, so this goes
7284          * unused on non-shutdown checkpoints, but seems useful to store it always
7285          * for debugging purposes.
7286          */
7287         ControlFile->unloggedLSN = pg_atomic_read_membarrier_u64(&XLogCtl->unloggedLSN);
7288
7289         UpdateControlFile();
7290         LWLockRelease(ControlFileLock);
7291
7292         /* Update shared-memory copy of checkpoint XID/epoch */
7293         SpinLockAcquire(&XLogCtl->info_lck);
7294         XLogCtl->ckptFullXid = checkPoint.nextXid;
7295         SpinLockRelease(&XLogCtl->info_lck);
7296
7297         /*
7298          * We are now done with critical updates; no need for system panic if we
7299          * have trouble while fooling with old log segments.
7300          */
7301         END_CRIT_SECTION();
7302
7303         /*
7304          * WAL summaries end when the next XLOG_CHECKPOINT_REDO or
7305          * XLOG_CHECKPOINT_SHUTDOWN record is reached. This is the first point
7306          * where (a) we're not inside of a critical section and (b) we can be
7307          * certain that the relevant record has been flushed to disk, which must
7308          * happen before it can be summarized.
7309          *
7310          * If this is a shutdown checkpoint, then this happens reasonably
7311          * promptly: we've only just inserted and flushed the
7312          * XLOG_CHECKPOINT_SHUTDOWN record. If this is not a shutdown checkpoint,
7313          * then this might not be very prompt at all: the XLOG_CHECKPOINT_REDO
7314          * record was written before we began flushing data to disk, and that
7315          * could be many minutes ago at this point. However, we don't XLogFlush()
7316          * after inserting that record, so we're not guaranteed that it's on disk
7317          * until after the above call that flushes the XLOG_CHECKPOINT_ONLINE
7318          * record.
7319          */
7320         WakeupWalSummarizer();
7321
7322         /*
7323          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
7324          */
7325         SyncPostCheckpoint();
7326
7327         /*
7328          * Update the average distance between checkpoints if the prior checkpoint
7329          * exists.
7330          */
7331         if (PriorRedoPtr != InvalidXLogRecPtr)
7332                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
7333
7334         /*
7335          * Delete old log files, those no longer needed for last checkpoint to
7336          * prevent the disk holding the xlog from growing full.
7337          */
7338         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7339         KeepLogSeg(recptr, &_logSegNo);
7340         if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED,
7341                                                                                    _logSegNo, InvalidOid,
7342                                                                                    InvalidTransactionId))
7343         {
7344                 /*
7345                  * Some slots have been invalidated; recalculate the old-segment
7346                  * horizon, starting again from RedoRecPtr.
7347                  */
7348                 XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7349                 KeepLogSeg(recptr, &_logSegNo);
7350         }
7351         _logSegNo--;
7352         RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
7353                                            checkPoint.ThisTimeLineID);
7354
7355         /*
7356          * Make more log segments if needed.  (Do this after recycling old log
7357          * segments, since that may supply some of the needed files.)
7358          */
7359         if (!shutdown)
7360                 PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
7361
7362         /*
7363          * Truncate pg_subtrans if possible.  We can throw away all data before
7364          * the oldest XMIN of any running transaction.  No future transaction will
7365          * attempt to reference any pg_subtrans entry older than that (see Asserts
7366          * in subtrans.c).  During recovery, though, we mustn't do this because
7367          * StartupSUBTRANS hasn't been called yet.
7368          */
7369         if (!RecoveryInProgress())
7370                 TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
7371
7372         /* Real work is done; log and update stats. */
7373         LogCheckpointEnd(false);
7374
7375         /* Reset the process title */
7376         update_checkpoint_display(flags, false, true);
7377
7378         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
7379                                                                          NBuffers,
7380                                                                          CheckpointStats.ckpt_segs_added,
7381                                                                          CheckpointStats.ckpt_segs_removed,
7382                                                                          CheckpointStats.ckpt_segs_recycled);
7383
7384         return true;
7385 }
7386
7387 /*
7388  * Write an XLOG_END_OF_RECOVERY record, marking the end of recovery in WAL
7389  * without running a full checkpoint.  A restartpoint is likely to be in
7390  * progress as we do this, but we are unwilling to wait for it to complete.
7391  *
7392  * CreateRestartPoint() allows for the case where recovery ends before the
7393  * restartpoint completes, so there is no concern about concurrent behaviour.
7394  */
7395 static void
7396 CreateEndOfRecoveryRecord(void)
7397 {
7398         XLogRecPtr      end_lsn;
7399         xl_end_of_recovery eor;
7400
7401         /* This is only meaningful while recovery is still in progress. */
7402         if (!RecoveryInProgress())
7403                 elog(ERROR, "can only be used to end recovery");
7404
7405         eor.end_time = GetCurrentTimestamp();
7406         eor.wal_level = wal_level;
7407
7408         /* Fetch both timeline IDs while holding the WAL insertion locks. */
7409         WALInsertLockAcquireExclusive();
7410         eor.PrevTimeLineID = XLogCtl->PrevTimeLineID;
7411         eor.ThisTimeLineID = XLogCtl->InsertTimeLineID;
7412         WALInsertLockRelease();
7413
7414         START_CRIT_SECTION();
7415         /* Insert the record, then XLogFlush() up to the returned position. */
7416         XLogBeginInsert();
7417         XLogRegisterData((char *) &eor, sizeof(eor));
7418         end_lsn = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
7419         XLogFlush(end_lsn);
7420
7421         /*
7422          * Store the XLogInsert() result and the timeline in the control file, so
7423          * that crash recovery can follow the timeline changes to this point.
7424          */
7425         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7426         ControlFile->minRecoveryPointTLI = eor.ThisTimeLineID;
7427         ControlFile->minRecoveryPoint = end_lsn;
7428         UpdateControlFile();
7429         LWLockRelease(ControlFileLock);
7430
7431         END_CRIT_SECTION();
7432 }
7433
7434 /*
7435  * Write an OVERWRITE_CONTRECORD message.
7436  *
7437  * When WAL replay expects a continuation record at the start of a page but
7438  * finds none, recovery ends and WAL writing resumes at that point.
7439  * But it's wrong to resume writing new WAL back at the start of the record
7440  * that was broken, because downstream consumers of that WAL (physical
7441  * replicas) are not prepared to "rewind".  So the first action after
7442  * finishing replay of all valid WAL must be to write a record of this type
7443  * at the point where the contrecord was missing; to support xlogreader
7444  * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
7445  * to the page header where the record occurs.  xlogreader has an ad-hoc
7446  * mechanism to report metadata about the broken record, which is what we
7447  * use here.
7448  *
7449  * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
7450  * skip the record it was reading, and pass back the LSN of the skipped
7451  * record, so that its caller can verify (on "replay" of that record) that the
7452  * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
7453  *
7454  * 'aborted_lsn' is the beginning position of the record that was incomplete.
7455  * It is included in the WAL record.  'pagePtr' and 'newTLI' point to the
7456  * beginning of the XLOG page where the record is to be inserted.  They must
7457  * match the current WAL insert position, they're passed here just so that we
7458  * can verify that.
7459  */
7460 static XLogRecPtr
7461 CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
7462                                                                 TimeLineID newTLI)
7463 {
7464         xl_overwrite_contrecord xlrec;
7465         XLogRecPtr      recptr;
7466         XLogPageHeader pagehdr;
7467         XLogRecPtr      startPos;
7468
7469         /* sanity checks */
7470         if (!RecoveryInProgress())
7471                 elog(ERROR, "can only be used at end of recovery");
7472         if (pagePtr % XLOG_BLCKSZ != 0)
7473                 elog(ERROR, "invalid position for missing continuation record %X/%X",
7474                          LSN_FORMAT_ARGS(pagePtr));
7475
7476         /* The current WAL insert position should be right after the page header */
7477         startPos = pagePtr;
7478         if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
7479                 startPos += SizeOfXLogLongPHD;
7480         else
7481                 startPos += SizeOfXLogShortPHD;
7482         recptr = GetXLogInsertRecPtr();
7483         if (recptr != startPos)
7484                 elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD",
7485                          LSN_FORMAT_ARGS(recptr));
7486
7487         START_CRIT_SECTION();
7488
7489         /*
7490          * Initialize the XLOG page header (by GetXLogBuffer), and set the
7491          * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
7492          *
7493          * No other backend is allowed to write WAL yet, so acquiring the WAL
7494          * insertion lock is just pro forma.
7495          */
7496         WALInsertLockAcquire();
7497         pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
7498         pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
7499         WALInsertLockRelease();
7500
7501         /*
7502          * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
7503          * page.  We know it becomes the first record, because no other backend is
7504          * allowed to write WAL yet.
7505          */
7506         XLogBeginInsert();
7507         xlrec.overwritten_lsn = aborted_lsn;
7508         xlrec.overwrite_time = GetCurrentTimestamp();
7509         XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));
7510         recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
7511
7512         /* check that the record was inserted to the right place */
7513         if (ProcLastRecPtr != startPos)
7514                 elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X",
7515                          LSN_FORMAT_ARGS(ProcLastRecPtr));
7516
7517         XLogFlush(recptr);
7518
7519         END_CRIT_SECTION();
7520
7521         return recptr;
7522 }
7523
7524 /*
7525  * Flush all data in shared memory to disk, and fsync
7526  *
7527  * This is the common code shared between regular checkpoints and
7528  * recovery restartpoints.
7529  */
7530 static void
7531 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
7532 {
7533         CheckPointRelationMap();
7534         CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN);
7535         CheckPointSnapBuild();
7536         CheckPointLogicalRewriteHeap();
7537         CheckPointReplicationOrigin();
7538
7539         /* Write out all dirty data in SLRUs and the main buffer pool */
7540         TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
7541         CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
7542         CheckPointCLOG();
7543         CheckPointCommitTs();
7544         CheckPointSUBTRANS();
7545         CheckPointMultiXact();
7546         CheckPointPredicate();
7547         CheckPointBuffers(flags);
7548
7549         /* Perform all queued up fsyncs */
7550         TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
7551         CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
7552         ProcessSyncRequests();
7553         CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
7554         TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
7555
7556         /* We deliberately delay 2PC checkpointing as long as possible */
7557         CheckPointTwoPhase(checkPointRedo);
7558 }
7559
7560 /*
7561  * Save a checkpoint for recovery restart if appropriate
7562  *
7563  * This function is called each time a checkpoint record is read from XLOG.
7564  * It must determine whether the checkpoint represents a safe restartpoint or
7565  * not.  If so, the checkpoint record is stashed in shared memory so that
7566  * CreateRestartPoint can consult it.  (Note that the latter function is
7567  * executed by the checkpointer, while this one will be executed by the
7568  * startup process.)
7569  */
7570 static void
7571 RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
7572 {
7573         /*
7574          * Also refrain from creating a restartpoint if we have seen any
7575          * references to non-existent pages. Restarting recovery from the
7576          * restartpoint would not see the references, so we would lose the
7577          * cross-check that the pages belonged to a relation that was dropped
7578          * later.
7579          */
7580         if (XLogHaveInvalidPages())
7581         {
7582                 elog(DEBUG2,
7583                          "could not record restart point at %X/%X because there "
7584                          "are unresolved references to invalid pages",
7585                          LSN_FORMAT_ARGS(checkPoint->redo));
7586                 return;
7587         }
7588
7589         /*
7590          * Copy the checkpoint record to shared memory, so that checkpointer can
7591          * work out the next time it wants to perform a restartpoint.
7592          */
7593         SpinLockAcquire(&XLogCtl->info_lck);
7594         XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
7595         XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
7596         XLogCtl->lastCheckPoint = *checkPoint;
7597         SpinLockRelease(&XLogCtl->info_lck);
7598 }
7599
7600 /*
7601  * Establish a restartpoint if possible.
7602  *
7603  * This is similar to CreateCheckPoint, but is used during WAL recovery
7604  * to establish a point from which recovery can roll forward without
7605  * replaying the entire recovery log.
7606  *
7607  * Returns true if a new restartpoint was established. We can only establish
7608  * a restartpoint if we have replayed a safe checkpoint record since last
7609  * restartpoint.
7610  */
7611 bool
7612 CreateRestartPoint(int flags)
7613 {
7614         XLogRecPtr      lastCheckPointRecPtr;
7615         XLogRecPtr      lastCheckPointEndPtr;
7616         CheckPoint      lastCheckPoint;
7617         XLogRecPtr      PriorRedoPtr;
7618         XLogRecPtr      receivePtr;
7619         XLogRecPtr      replayPtr;
7620         TimeLineID      replayTLI;
7621         XLogRecPtr      endptr;
7622         XLogSegNo       _logSegNo;
7623         TimestampTz xtime;
7624
7625         /* Concurrent checkpoint/restartpoint cannot happen */
7626         Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
7627
7628         /* Get a local copy of the last safe checkpoint record. */
7629         SpinLockAcquire(&XLogCtl->info_lck);
7630         lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
7631         lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
7632         lastCheckPoint = XLogCtl->lastCheckPoint;
7633         SpinLockRelease(&XLogCtl->info_lck);
7634
7635         /*
7636          * Check that we're still in recovery mode. It's ok if we exit recovery
7637          * mode after this check, the restart point is valid anyway.
7638          */
7639         if (!RecoveryInProgress())
7640         {
7641                 ereport(DEBUG2,
7642                                 (errmsg_internal("skipping restartpoint, recovery has already ended")));
7643                 return false;
7644         }
7645
7646         /*
7647          * If the last checkpoint record we've replayed is already our last
7648          * restartpoint, we can't perform a new restart point. We still update
7649          * minRecoveryPoint in that case, so that if this is a shutdown restart
7650          * point, we won't start up earlier than before. That's not strictly
7651          * necessary, but when hot standby is enabled, it would be rather weird if
7652          * the database opened up for read-only connections at a point-in-time
7653          * before the last shutdown. Such time travel is still possible in case of
7654          * immediate shutdown, though.
7655          *
7656          * We don't explicitly advance minRecoveryPoint when we do create a
7657          * restartpoint. It's assumed that flushing the buffers will do that as a
7658          * side-effect.
7659          */
7660         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
7661                 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
7662         {
7663                 ereport(DEBUG2,
7664                                 (errmsg_internal("skipping restartpoint, already performed at %X/%X",
7665                                                                  LSN_FORMAT_ARGS(lastCheckPoint.redo))));
7666
7667                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
7668                 if (flags & CHECKPOINT_IS_SHUTDOWN)
7669                 {
7670                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7671                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7672                         UpdateControlFile();
7673                         LWLockRelease(ControlFileLock);
7674                 }
7675                 return false;
7676         }
7677
7678         /*
7679          * Update the shared RedoRecPtr so that the startup process can calculate
7680          * the number of segments replayed since last restartpoint, and request a
7681          * restartpoint if it exceeds CheckPointSegments.
7682          *
7683          * Like in CreateCheckPoint(), hold off insertions to update it, although
7684          * during recovery this is just pro forma, because no WAL insertions are
7685          * happening.
7686          */
7687         WALInsertLockAcquireExclusive();
7688         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
7689         WALInsertLockRelease();
7690
7691         /* Also update the info_lck-protected copy */
7692         SpinLockAcquire(&XLogCtl->info_lck);
7693         XLogCtl->RedoRecPtr = lastCheckPoint.redo;
7694         SpinLockRelease(&XLogCtl->info_lck);
7695
7696         /*
7697          * Prepare to accumulate statistics.
7698          *
7699          * Note: because it is possible for log_checkpoints to change while a
7700          * checkpoint proceeds, we always accumulate stats, even if
7701          * log_checkpoints is currently off.
7702          */
7703         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7704         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7705
7706         if (log_checkpoints)
7707                 LogCheckpointStart(flags, true);
7708
7709         /* Update the process title */
7710         update_checkpoint_display(flags, true, false);
7711
7712         CheckPointGuts(lastCheckPoint.redo, flags);
7713
7714         /*
7715          * This location needs to be after CheckPointGuts() to ensure that some
7716          * work has already happened during this checkpoint.
7717          */
7718         INJECTION_POINT("create-restart-point");
7719
7720         /*
7721          * Remember the prior checkpoint's redo ptr for
7722          * UpdateCheckPointDistanceEstimate()
7723          */
7724         PriorRedoPtr = ControlFile->checkPointCopy.redo;
7725
7726         /*
7727          * Update pg_control, using current time.  Check that it still shows an
7728          * older checkpoint, else do nothing; this is a quick hack to make sure
7729          * nothing really bad happens if somehow we get here after the
7730          * end-of-recovery checkpoint.
7731          */
7732         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7733         if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
7734         {
7735                 /*
7736                  * Update the checkpoint information.  We do this even if the cluster
7737                  * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL
7738                  * segments recycled below.
7739                  */
7740                 ControlFile->checkPoint = lastCheckPointRecPtr;
7741                 ControlFile->checkPointCopy = lastCheckPoint;
7742
7743                 /*
7744                  * Ensure minRecoveryPoint is past the checkpoint record and update it
7745                  * if the control file still shows DB_IN_ARCHIVE_RECOVERY.  Normally,
7746                  * this will have happened already while writing out dirty buffers,
7747                  * but not necessarily - e.g. because no buffers were dirtied.  We do
7748                  * this because a backup performed in recovery uses minRecoveryPoint
7749                  * to determine which WAL files must be included in the backup, and
7750                  * the file (or files) containing the checkpoint record must be
7751                  * included, at a minimum.  Note that for an ordinary restart of
7752                  * recovery there's no value in having the minimum recovery point any
7753                  * earlier than this anyway, because redo will begin just after the
7754                  * checkpoint record.
7755                  */
7756                 if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
7757                 {
7758                         if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
7759                         {
7760                                 ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
7761                                 ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
7762
7763                                 /* update local copy */
7764                                 LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
7765                                 LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
7766                         }
7767                         if (flags & CHECKPOINT_IS_SHUTDOWN)
7768                                 ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7769                 }
7770                 UpdateControlFile();
7771         }
7772         LWLockRelease(ControlFileLock);
7773
7774         /*
7775          * Update the average distance between checkpoints/restartpoints if the
7776          * prior checkpoint exists.
7777          */
7778         if (PriorRedoPtr != InvalidXLogRecPtr)
7779                 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
7780
7781         /*
7782          * Delete old log files, those no longer needed for last restartpoint to
7783          * prevent the disk holding the xlog from growing full.
7784          */
7785         XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7786
7787         /*
7788          * Retreat _logSegNo using the current end of xlog replayed or received,
7789          * whichever is later.
7790          */
7791         receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
7792         replayPtr = GetXLogReplayRecPtr(&replayTLI);
7793         endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
7794         KeepLogSeg(endptr, &_logSegNo);
7795         if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED,
7796                                                                                    _logSegNo, InvalidOid,
7797                                                                                    InvalidTransactionId))
7798         {
7799                 /*
7800                  * Some slots have been invalidated; recalculate the old-segment
7801                  * horizon, starting again from RedoRecPtr.
7802                  */
7803                 XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7804                 KeepLogSeg(endptr, &_logSegNo);
7805         }
7806         _logSegNo--;
7807
7808         /*
7809          * Try to recycle segments on a useful timeline. If we've been promoted
7810          * since the beginning of this restartpoint, use the new timeline chosen
7811          * at end of recovery.  If we're still in recovery, use the timeline we're
7812          * currently replaying.
7813          *
7814          * There is no guarantee that the WAL segments will be useful on the
7815          * current timeline; if recovery proceeds to a new timeline right after
7816          * this, the pre-allocated WAL segments on this timeline will not be used,
7817          * and will go wasted until recycled on the next restartpoint. We'll live
7818          * with that.
7819          */
7820         if (!RecoveryInProgress())
7821                 replayTLI = XLogCtl->InsertTimeLineID;
7822
7823         RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);
7824
7825         /*
7826          * Make more log segments if needed.  (Do this after recycling old log
7827          * segments, since that may supply some of the needed files.)
7828          */
7829         PreallocXlogFiles(endptr, replayTLI);
7830
7831         /*
7832          * Truncate pg_subtrans if possible.  We can throw away all data before
7833          * the oldest XMIN of any running transaction.  No future transaction will
7834          * attempt to reference any pg_subtrans entry older than that (see Asserts
7835          * in subtrans.c).  When hot standby is disabled, though, we mustn't do
7836          * this because StartupSUBTRANS hasn't been called yet.
7837          */
7838         if (EnableHotStandby)
7839                 TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
7840
7841         /* Real work is done; log and update stats. */
7842         LogCheckpointEnd(true);
7843
7844         /* Reset the process title */
7845         update_checkpoint_display(flags, true, true);
7846
7847         xtime = GetLatestXTime();
7848         ereport((log_checkpoints ? LOG : DEBUG2),
7849                         (errmsg("recovery restart point at %X/%X",
7850                                         LSN_FORMAT_ARGS(lastCheckPoint.redo)),
7851                          xtime ? errdetail("Last completed transaction was at log time %s.",
7852                                                            timestamptz_to_str(xtime)) : 0));
7853
7854         /*
7855          * Finally, execute archive_cleanup_command, if any.
7856          */
7857         if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
7858                 ExecuteRecoveryCommand(archiveCleanupCommand,
7859                                                            "archive_cleanup_command",
7860                                                            false,
7861                                                            WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
7862
7863         return true;
7864 }
7865
7866 /*
7867  * Report availability of WAL for the given target LSN
7868  *              (typically a slot's restart_lsn)
7869  *
7870  * Returns one of the following enum values:
7871  *
7872  * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
7873  *   max_wal_size.
7874  *
7875  * * WALAVAIL_EXTENDED means it is still available by preserving extra
7876  *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
7877  *   than max_wal_size, this state is not returned.
7878  *
7879  * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
7880  *   remove reserved segments. The walsender using this slot may return to the
7881  *   above.
7882  *
7883  * * WALAVAIL_REMOVED means it has been removed. A replication stream on
7884  *   a slot with this LSN cannot continue.  (Any associated walsender
7885  *   processes should have been terminated already.)
7886  *
7887  * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
7888  */
7889 WALAvailability
7890 GetWALAvailability(XLogRecPtr targetLSN)
7891 {
7892         XLogRecPtr      currpos;                /* current write LSN */
7893         XLogSegNo       currSeg;                /* segid of currpos */
7894         XLogSegNo       targetSeg;              /* segid of targetLSN */
7895         XLogSegNo       oldestSeg;              /* actual oldest segid */
7896         XLogSegNo       oldestSegMaxWalSize;    /* oldest segid kept by max_wal_size */
7897         XLogSegNo       oldestSlotSeg;  /* oldest segid kept by slot */
7898         uint64          keepSegs;
7899
7900         /*
7901          * slot does not reserve WAL. Either deactivated, or has never been active
7902          */
7903         if (XLogRecPtrIsInvalid(targetLSN))
7904                 return WALAVAIL_INVALID_LSN;
7905
7906         /*
7907          * Calculate the oldest segment currently reserved by all slots,
7908          * considering wal_keep_size and max_slot_wal_keep_size.  Initialize
7909          * oldestSlotSeg to the current segment.
7910          */
7911         currpos = GetXLogWriteRecPtr();
7912         XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
7913         KeepLogSeg(currpos, &oldestSlotSeg);
7914
7915         /*
7916          * Find the oldest extant segment file. We get 1 until checkpoint removes
7917          * the first WAL segment file since startup, which causes the status being
7918          * wrong under certain abnormal conditions but that doesn't actually harm.
7919          */
7920         oldestSeg = XLogGetLastRemovedSegno() + 1;
7921
7922         /* calculate oldest segment by max_wal_size */
7923         XLByteToSeg(currpos, currSeg, wal_segment_size);
7924         keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
7925
7926         if (currSeg > keepSegs)
7927                 oldestSegMaxWalSize = currSeg - keepSegs;
7928         else
7929                 oldestSegMaxWalSize = 1;
7930
7931         /* the segment we care about */
7932         XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
7933
7934         /*
7935          * No point in returning reserved or extended status values if the
7936          * targetSeg is known to be lost.
7937          */
7938         if (targetSeg >= oldestSlotSeg)
7939         {
7940                 /* show "reserved" when targetSeg is within max_wal_size */
7941                 if (targetSeg >= oldestSegMaxWalSize)
7942                         return WALAVAIL_RESERVED;
7943
7944                 /* being retained by slots exceeding max_wal_size */
7945                 return WALAVAIL_EXTENDED;
7946         }
7947
7948         /* WAL segments are no longer retained but haven't been removed yet */
7949         if (targetSeg >= oldestSeg)
7950                 return WALAVAIL_UNRESERVED;
7951
7952         /* Definitely lost */
7953         return WALAVAIL_REMOVED;
7954 }
7955
7956
7957 /*
7958  * Retreat *logSegNo to the last segment that we need to retain because of
7959  * either wal_keep_size or replication slots.
7960  *
7961  * This is calculated by subtracting wal_keep_size from the given xlog
7962  * location, recptr and by making sure that that result is below the
7963  * requirement of replication slots.  For the latter criterion we do consider
7964  * the effects of max_slot_wal_keep_size: reserve at most that much space back
7965  * from recptr.
7966  *
7967  * Note about replication slots: if this function calculates a value
7968  * that's further ahead than what slots need reserved, then affected
7969  * slots need to be invalidated and this function invoked again.
7970  * XXX it might be a good idea to rewrite this function so that
7971  * invalidation is optionally done here, instead.
7972  */
7973 static void
7974 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
7975 {
7976         XLogSegNo       currSegNo;
7977         XLogSegNo       segno;
7978         XLogRecPtr      keep;
7979
7980         XLByteToSeg(recptr, currSegNo, wal_segment_size);
7981         segno = currSegNo;
7982
7983         /*
7984          * Calculate how many segments are kept by slots first, adjusting for
7985          * max_slot_wal_keep_size.
7986          */
7987         keep = XLogGetReplicationSlotMinimumLSN();
7988         if (keep != InvalidXLogRecPtr && keep < recptr)
7989         {
7990                 XLByteToSeg(keep, segno, wal_segment_size);
7991
7992                 /* Cap by max_slot_wal_keep_size ... */
7993                 if (max_slot_wal_keep_size_mb >= 0)
7994                 {
7995                         uint64          slot_keep_segs;
7996
7997                         slot_keep_segs =
7998                                 ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
7999
8000                         if (currSegNo - segno > slot_keep_segs)
8001                                 segno = currSegNo - slot_keep_segs;
8002                 }
8003         }
8004
8005         /*
8006          * If WAL summarization is in use, don't remove WAL that has yet to be
8007          * summarized.
8008          */
8009         keep = GetOldestUnsummarizedLSN(NULL, NULL);
8010         if (keep != InvalidXLogRecPtr)
8011         {
8012                 XLogSegNo       unsummarized_segno;
8013
8014                 XLByteToSeg(keep, unsummarized_segno, wal_segment_size);
8015                 if (unsummarized_segno < segno)
8016                         segno = unsummarized_segno;
8017         }
8018
8019         /* but, keep at least wal_keep_size if that's set */
8020         if (wal_keep_size_mb > 0)
8021         {
8022                 uint64          keep_segs;
8023
8024                 keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
8025                 if (currSegNo - segno < keep_segs)
8026                 {
8027                         /* avoid underflow, don't go below 1 */
8028                         if (currSegNo <= keep_segs)
8029                                 segno = 1;
8030                         else
8031                                 segno = currSegNo - keep_segs;
8032                 }
8033         }
8034
8035         /* don't delete WAL segments newer than the calculated segment */
8036         if (segno < *logSegNo)
8037                 *logSegNo = segno;
8038 }
8039
8040 /*
8041  * Write a NEXTOID log record
8042  */
8043 void
8044 XLogPutNextOid(Oid nextOid)
8045 {
8046         XLogBeginInsert();
8047         XLogRegisterData((char *) (&nextOid), sizeof(Oid));
8048         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
8049
8050         /*
8051          * We need not flush the NEXTOID record immediately, because any of the
8052          * just-allocated OIDs could only reach disk as part of a tuple insert or
8053          * update that would have its own XLOG record that must follow the NEXTOID
8054          * record.  Therefore, the standard buffer LSN interlock applied to those
8055          * records will ensure no such OID reaches disk before the NEXTOID record
8056          * does.
8057          *
8058          * Note, however, that the above statement only covers state "within" the
8059          * database.  When we use a generated OID as a file or directory name, we
8060          * are in a sense violating the basic WAL rule, because that filesystem
8061          * change may reach disk before the NEXTOID WAL record does.  The impact
8062          * of this is that if a database crash occurs immediately afterward, we
8063          * might after restart re-generate the same OID and find that it conflicts
8064          * with the leftover file or directory.  But since for safety's sake we
8065          * always loop until finding a nonconflicting filename, this poses no real
8066          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8067          */
8068 }
8069
8070 /*
8071  * Write an XLOG SWITCH record.
8072  *
8073  * Here we just blindly issue an XLogInsert request for the record.
8074  * All the magic happens inside XLogInsert.
8075  *
8076  * The return value is either the end+1 address of the switch record,
8077  * or the end+1 address of the prior segment if we did not need to
8078  * write a switch record because we are already at segment start.
8079  */
8080 XLogRecPtr
8081 RequestXLogSwitch(bool mark_unimportant)
8082 {
8083         XLogRecPtr      RecPtr;
8084
8085         /* XLOG SWITCH has no data */
8086         XLogBeginInsert();
8087
8088         if (mark_unimportant)
8089                 XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
8090         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
8091
8092         return RecPtr;
8093 }
8094
8095 /*
8096  * Write a RESTORE POINT record
8097  */
8098 XLogRecPtr
8099 XLogRestorePoint(const char *rpName)
8100 {
8101         XLogRecPtr      RecPtr;
8102         xl_restore_point xlrec;
8103
8104         xlrec.rp_time = GetCurrentTimestamp();
8105         strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
8106
8107         XLogBeginInsert();
8108         XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
8109
8110         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
8111
8112         ereport(LOG,
8113                         (errmsg("restore point \"%s\" created at %X/%X",
8114                                         rpName, LSN_FORMAT_ARGS(RecPtr))));
8115
8116         return RecPtr;
8117 }
8118
8119 /*
8120  * Check if any of the GUC parameters that are critical for hot standby
8121  * have changed, and update the value in pg_control file if necessary.
8122  */
8123 static void
8124 XLogReportParameters(void)
8125 {
8126         if (wal_level != ControlFile->wal_level ||
8127                 wal_log_hints != ControlFile->wal_log_hints ||
8128                 MaxConnections != ControlFile->MaxConnections ||
8129                 max_worker_processes != ControlFile->max_worker_processes ||
8130                 max_wal_senders != ControlFile->max_wal_senders ||
8131                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
8132                 max_locks_per_xact != ControlFile->max_locks_per_xact ||
8133                 track_commit_timestamp != ControlFile->track_commit_timestamp)
8134         {
8135                 /*
8136                  * The change in number of backend slots doesn't need to be WAL-logged
8137                  * if archiving is not enabled, as you can't start archive recovery
8138                  * with wal_level=minimal anyway. We don't really care about the
8139                  * values in pg_control either if wal_level=minimal, but seems better
8140                  * to keep them up-to-date to avoid confusion.
8141                  */
8142                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
8143                 {
8144                         xl_parameter_change xlrec;
8145                         XLogRecPtr      recptr;
8146
8147                         xlrec.MaxConnections = MaxConnections;
8148                         xlrec.max_worker_processes = max_worker_processes;
8149                         xlrec.max_wal_senders = max_wal_senders;
8150                         xlrec.max_prepared_xacts = max_prepared_xacts;
8151                         xlrec.max_locks_per_xact = max_locks_per_xact;
8152                         xlrec.wal_level = wal_level;
8153                         xlrec.wal_log_hints = wal_log_hints;
8154                         xlrec.track_commit_timestamp = track_commit_timestamp;
8155
8156                         XLogBeginInsert();
8157                         XLogRegisterData((char *) &xlrec, sizeof(xlrec));
8158
8159                         recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
8160                         XLogFlush(recptr);
8161                 }
8162
8163                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8164
8165                 ControlFile->MaxConnections = MaxConnections;
8166                 ControlFile->max_worker_processes = max_worker_processes;
8167                 ControlFile->max_wal_senders = max_wal_senders;
8168                 ControlFile->max_prepared_xacts = max_prepared_xacts;
8169                 ControlFile->max_locks_per_xact = max_locks_per_xact;
8170                 ControlFile->wal_level = wal_level;
8171                 ControlFile->wal_log_hints = wal_log_hints;
8172                 ControlFile->track_commit_timestamp = track_commit_timestamp;
8173                 UpdateControlFile();
8174
8175                 LWLockRelease(ControlFileLock);
8176         }
8177 }
8178
8179 /*
8180  * Update full_page_writes in shared memory, and write an
8181  * XLOG_FPW_CHANGE record if necessary.
8182  *
8183  * Note: this function assumes there is no other process running
8184  * concurrently that could update it.
8185  */
8186 void
8187 UpdateFullPageWrites(void)
8188 {
8189         XLogCtlInsert *Insert = &XLogCtl->Insert;
8190         bool            recoveryInProgress;
8191
8192         /*
8193          * Do nothing if full_page_writes has not been changed.
8194          *
8195          * It's safe to check the shared full_page_writes without the lock,
8196          * because we assume that there is no concurrently running process which
8197          * can update it.
8198          */
8199         if (fullPageWrites == Insert->fullPageWrites)
8200                 return;
8201
8202         /*
8203          * Perform this outside critical section so that the WAL insert
8204          * initialization done by RecoveryInProgress() doesn't trigger an
8205          * assertion failure.
8206          */
8207         recoveryInProgress = RecoveryInProgress();
8208
8209         START_CRIT_SECTION();
8210
8211         /*
8212          * It's always safe to take full page images, even when not strictly
8213          * required, but not the other round. So if we're setting full_page_writes
8214          * to true, first set it true and then write the WAL record. If we're
8215          * setting it to false, first write the WAL record and then set the global
8216          * flag.
8217          */
8218         if (fullPageWrites)
8219         {
8220                 WALInsertLockAcquireExclusive();
8221                 Insert->fullPageWrites = true;
8222                 WALInsertLockRelease();
8223         }
8224
8225         /*
8226          * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
8227          * full_page_writes during archive recovery, if required.
8228          */
8229         if (XLogStandbyInfoActive() && !recoveryInProgress)
8230         {
8231                 XLogBeginInsert();
8232                 XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
8233
8234                 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
8235         }
8236
8237         if (!fullPageWrites)
8238         {
8239                 WALInsertLockAcquireExclusive();
8240                 Insert->fullPageWrites = false;
8241                 WALInsertLockRelease();
8242         }
8243         END_CRIT_SECTION();
8244 }
8245
8246 /*
8247  * XLOG resource manager's routines
8248  *
8249  * Definitions of info values are in include/catalog/pg_control.h, though
8250  * not all record types are related to control file updates.
8251  *
8252  * NOTE: Some XLOG record types that are directly related to WAL recovery
8253  * are handled in xlogrecovery_redo().
8254  */
8255 void
8256 xlog_redo(XLogReaderState *record)
8257 {
8258         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
8259         XLogRecPtr      lsn = record->EndRecPtr;
8260
8261         /*
8262          * In XLOG rmgr, backup blocks are only used by XLOG_FPI and
8263          * XLOG_FPI_FOR_HINT records.
8264          */
8265         Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
8266                    !XLogRecHasAnyBlockRefs(record));
8267
8268         if (info == XLOG_NEXTOID)
8269         {
8270                 Oid                     nextOid;
8271
8272                 /*
8273                  * We used to try to take the maximum of TransamVariables->nextOid and
8274                  * the recorded nextOid, but that fails if the OID counter wraps
8275                  * around.  Since no OID allocation should be happening during replay
8276                  * anyway, better to just believe the record exactly.  We still take
8277                  * OidGenLock while setting the variable, just in case.
8278                  */
8279                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
8280                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8281                 TransamVariables->nextOid = nextOid;
8282                 TransamVariables->oidCount = 0;
8283                 LWLockRelease(OidGenLock);
8284         }
8285         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
8286         {
8287                 CheckPoint      checkPoint;
8288                 TimeLineID      replayTLI;
8289
8290                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8291                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
8292                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8293                 TransamVariables->nextXid = checkPoint.nextXid;
8294                 LWLockRelease(XidGenLock);
8295                 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
8296                 TransamVariables->nextOid = checkPoint.nextOid;
8297                 TransamVariables->oidCount = 0;
8298                 LWLockRelease(OidGenLock);
8299                 MultiXactSetNextMXact(checkPoint.nextMulti,
8300                                                           checkPoint.nextMultiOffset);
8301
8302                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
8303                                                            checkPoint.oldestMultiDB);
8304
8305                 /*
8306                  * No need to set oldestClogXid here as well; it'll be set when we
8307                  * redo an xl_clog_truncate if it changed since initialization.
8308                  */
8309                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
8310
8311                 /*
8312                  * If we see a shutdown checkpoint while waiting for an end-of-backup
8313                  * record, the backup was canceled and the end-of-backup record will
8314                  * never arrive.
8315                  */
8316                 if (ArchiveRecoveryRequested &&
8317                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
8318                         XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
8319                         ereport(PANIC,
8320                                         (errmsg("online backup was canceled, recovery cannot continue")));
8321
8322                 /*
8323                  * If we see a shutdown checkpoint, we know that nothing was running
8324                  * on the primary at this point. So fake-up an empty running-xacts
8325                  * record and use that here and now. Recover additional standby state
8326                  * for prepared transactions.
8327                  */
8328                 if (standbyState >= STANDBY_INITIALIZED)
8329                 {
8330                         TransactionId *xids;
8331                         int                     nxids;
8332                         TransactionId oldestActiveXID;
8333                         TransactionId latestCompletedXid;
8334                         RunningTransactionsData running;
8335
8336                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
8337
8338                         /* Update pg_subtrans entries for any prepared transactions */
8339                         StandbyRecoverPreparedTransactions();
8340
8341                         /*
8342                          * Construct a RunningTransactions snapshot representing a shut
8343                          * down server, with only prepared transactions still alive. We're
8344                          * never overflowed at this point because all subxids are listed
8345                          * with their parent prepared transactions.
8346                          */
8347                         running.xcnt = nxids;
8348                         running.subxcnt = 0;
8349                         running.subxid_status = SUBXIDS_IN_SUBTRANS;
8350                         running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
8351                         running.oldestRunningXid = oldestActiveXID;
8352                         latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
8353                         TransactionIdRetreat(latestCompletedXid);
8354                         Assert(TransactionIdIsNormal(latestCompletedXid));
8355                         running.latestCompletedXid = latestCompletedXid;
8356                         running.xids = xids;
8357
8358                         ProcArrayApplyRecoveryInfo(&running);
8359                 }
8360
8361                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8362                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8363                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8364                 LWLockRelease(ControlFileLock);
8365
8366                 /* Update shared-memory copy of checkpoint XID/epoch */
8367                 SpinLockAcquire(&XLogCtl->info_lck);
8368                 XLogCtl->ckptFullXid = checkPoint.nextXid;
8369                 SpinLockRelease(&XLogCtl->info_lck);
8370
8371                 /*
8372                  * We should've already switched to the new TLI before replaying this
8373                  * record.
8374                  */
8375                 (void) GetCurrentReplayRecPtr(&replayTLI);
8376                 if (checkPoint.ThisTimeLineID != replayTLI)
8377                         ereport(PANIC,
8378                                         (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
8379                                                         checkPoint.ThisTimeLineID, replayTLI)));
8380
8381                 RecoveryRestartPoint(&checkPoint, record);
8382         }
8383         else if (info == XLOG_CHECKPOINT_ONLINE)
8384         {
8385                 CheckPoint      checkPoint;
8386                 TimeLineID      replayTLI;
8387
8388                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8389                 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
8390                 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
8391                 if (FullTransactionIdPrecedes(TransamVariables->nextXid,
8392                                                                           checkPoint.nextXid))
8393                         TransamVariables->nextXid = checkPoint.nextXid;
8394                 LWLockRelease(XidGenLock);
8395
8396                 /*
8397                  * We ignore the nextOid counter in an ONLINE checkpoint, preferring
8398                  * to track OID assignment through XLOG_NEXTOID records.  The nextOid
8399                  * counter is from the start of the checkpoint and might well be stale
8400                  * compared to later XLOG_NEXTOID records.  We could try to take the
8401                  * maximum of the nextOid counter and our latest value, but since
8402                  * there's no particular guarantee about the speed with which the OID
8403                  * counter wraps around, that's a risky thing to do.  In any case,
8404                  * users of the nextOid counter are required to avoid assignment of
8405                  * duplicates, so that a somewhat out-of-date value should be safe.
8406                  */
8407
8408                 /* Handle multixact */
8409                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
8410                                                                   checkPoint.nextMultiOffset);
8411
8412                 /*
8413                  * NB: This may perform multixact truncation when replaying WAL
8414                  * generated by an older primary.
8415                  */
8416                 MultiXactAdvanceOldest(checkPoint.oldestMulti,
8417                                                            checkPoint.oldestMultiDB);
8418                 if (TransactionIdPrecedes(TransamVariables->oldestXid,
8419                                                                   checkPoint.oldestXid))
8420                         SetTransactionIdLimit(checkPoint.oldestXid,
8421                                                                   checkPoint.oldestXidDB);
8422                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
8423                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8424                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8425                 LWLockRelease(ControlFileLock);
8426
8427                 /* Update shared-memory copy of checkpoint XID/epoch */
8428                 SpinLockAcquire(&XLogCtl->info_lck);
8429                 XLogCtl->ckptFullXid = checkPoint.nextXid;
8430                 SpinLockRelease(&XLogCtl->info_lck);
8431
8432                 /* TLI should not change in an on-line checkpoint */
8433                 (void) GetCurrentReplayRecPtr(&replayTLI);
8434                 if (checkPoint.ThisTimeLineID != replayTLI)
8435                         ereport(PANIC,
8436                                         (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
8437                                                         checkPoint.ThisTimeLineID, replayTLI)));
8438
8439                 RecoveryRestartPoint(&checkPoint, record);
8440         }
8441         else if (info == XLOG_OVERWRITE_CONTRECORD)
8442         {
8443                 /* nothing to do here, handled in xlogrecovery_redo() */
8444         }
8445         else if (info == XLOG_END_OF_RECOVERY)
8446         {
8447                 xl_end_of_recovery xlrec;
8448                 TimeLineID      replayTLI;
8449
8450                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
8451
8452                 /*
8453                  * For Hot Standby, we could treat this like a Shutdown Checkpoint,
8454                  * but this case is rarer and harder to test, so the benefit doesn't
8455                  * outweigh the potential extra cost of maintenance.
8456                  */
8457
8458                 /*
8459                  * We should've already switched to the new TLI before replaying this
8460                  * record.
8461                  */
8462                 (void) GetCurrentReplayRecPtr(&replayTLI);
8463                 if (xlrec.ThisTimeLineID != replayTLI)
8464                         ereport(PANIC,
8465                                         (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
8466                                                         xlrec.ThisTimeLineID, replayTLI)));
8467         }
8468         else if (info == XLOG_NOOP)
8469         {
8470                 /* nothing to do here */
8471         }
8472         else if (info == XLOG_SWITCH)
8473         {
8474                 /* nothing to do here */
8475         }
8476         else if (info == XLOG_RESTORE_POINT)
8477         {
8478                 /* nothing to do here, handled in xlogrecovery.c */
8479         }
8480         else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
8481         {
8482                 /*
8483                  * XLOG_FPI records contain nothing else but one or more block
8484                  * references. Every block reference must include a full-page image
8485                  * even if full_page_writes was disabled when the record was generated
8486                  * - otherwise there would be no point in this record.
8487                  *
8488                  * XLOG_FPI_FOR_HINT records are generated when a page needs to be
8489                  * WAL-logged because of a hint bit update. They are only generated
8490                  * when checksums and/or wal_log_hints are enabled. They may include
8491                  * no full-page images if full_page_writes was disabled when they were
8492                  * generated. In this case there is nothing to do here.
8493                  *
8494                  * No recovery conflicts are generated by these generic records - if a
8495                  * resource manager needs to generate conflicts, it has to define a
8496                  * separate WAL record type and redo routine.
8497                  */
8498                 for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
8499                 {
8500                         Buffer          buffer;
8501
8502                         if (!XLogRecHasBlockImage(record, block_id))
8503                         {
8504                                 if (info == XLOG_FPI)
8505                                         elog(ERROR, "XLOG_FPI record did not contain a full-page image");
8506                                 continue;
8507                         }
8508
8509                         if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
8510                                 elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
8511                         UnlockReleaseBuffer(buffer);
8512                 }
8513         }
8514         else if (info == XLOG_BACKUP_END)
8515         {
8516                 /* nothing to do here, handled in xlogrecovery_redo() */
8517         }
8518         else if (info == XLOG_PARAMETER_CHANGE)
8519         {
8520                 xl_parameter_change xlrec;
8521
8522                 /* Update our copy of the parameters in pg_control */
8523                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
8524
8525                 /*
8526                  * Invalidate logical slots if we are in hot standby and the primary
8527                  * does not have a WAL level sufficient for logical decoding. No need
8528                  * to search for potentially conflicting logically slots if standby is
8529                  * running with wal_level lower than logical, because in that case, we
8530                  * would have either disallowed creation of logical slots or
8531                  * invalidated existing ones.
8532                  */
8533                 if (InRecovery && InHotStandby &&
8534                         xlrec.wal_level < WAL_LEVEL_LOGICAL &&
8535                         wal_level >= WAL_LEVEL_LOGICAL)
8536                         InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL,
8537                                                                                            0, InvalidOid,
8538                                                                                            InvalidTransactionId);
8539
8540                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8541                 ControlFile->MaxConnections = xlrec.MaxConnections;
8542                 ControlFile->max_worker_processes = xlrec.max_worker_processes;
8543                 ControlFile->max_wal_senders = xlrec.max_wal_senders;
8544                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8545                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
8546                 ControlFile->wal_level = xlrec.wal_level;
8547                 ControlFile->wal_log_hints = xlrec.wal_log_hints;
8548
8549                 /*
8550                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
8551                  * recover back up to this point before allowing hot standby again.
8552                  * This is important if the max_* settings are decreased, to ensure
8553                  * you don't run queries against the WAL preceding the change. The
8554                  * local copies cannot be updated as long as crash recovery is
8555                  * happening and we expect all the WAL to be replayed.
8556                  */
8557                 if (InArchiveRecovery)
8558                 {
8559                         LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
8560                         LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
8561                 }
8562                 if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
8563                 {
8564                         TimeLineID      replayTLI;
8565
8566                         (void) GetCurrentReplayRecPtr(&replayTLI);
8567                         ControlFile->minRecoveryPoint = lsn;
8568                         ControlFile->minRecoveryPointTLI = replayTLI;
8569                 }
8570
8571                 CommitTsParameterChange(xlrec.track_commit_timestamp,
8572                                                                 ControlFile->track_commit_timestamp);
8573                 ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
8574
8575                 UpdateControlFile();
8576                 LWLockRelease(ControlFileLock);
8577
8578                 /* Check to see if any parameter change gives a problem on recovery */
8579                 CheckRequiredParameterValues();
8580         }
8581         else if (info == XLOG_FPW_CHANGE)
8582         {
8583                 bool            fpw;
8584
8585                 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
8586
8587                 /*
8588                  * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
8589                  * do_pg_backup_start() and do_pg_backup_stop() can check whether
8590                  * full_page_writes has been disabled during online backup.
8591                  */
8592                 if (!fpw)
8593                 {
8594                         SpinLockAcquire(&XLogCtl->info_lck);
8595                         if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
8596                                 XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
8597                         SpinLockRelease(&XLogCtl->info_lck);
8598                 }
8599
8600                 /* Keep track of full_page_writes */
8601                 lastFullPageWrites = fpw;
8602         }
8603         else if (info == XLOG_CHECKPOINT_REDO)
8604         {
8605                 /* nothing to do here, just for informational purposes */
8606         }
8607 }
8608
8609 /*
8610  * Return the extra open flags used for opening a file, depending on the
8611  * value of the GUCs wal_sync_method, fsync and debug_io_direct.
8612  */
8613 static int
8614 get_sync_bit(int method)
8615 {
8616         int                     o_direct_flag = 0;
8617
8618         /*
8619          * Use O_DIRECT if requested, except in walreceiver process.  The WAL
8620          * written by walreceiver is normally read by the startup process soon
8621          * after it's written.  Also, walreceiver performs unaligned writes, which
8622          * don't work with O_DIRECT, so it is required for correctness too.
8623          */
8624         if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
8625                 o_direct_flag = PG_O_DIRECT;
8626
8627         /* If fsync is disabled, never open in sync mode */
8628         if (!enableFsync)
8629                 return o_direct_flag;
8630
8631         switch (method)
8632         {
8633                         /*
8634                          * enum values for all sync options are defined even if they are
8635                          * not supported on the current platform.  But if not, they are
8636                          * not included in the enum option array, and therefore will never
8637                          * be seen here.
8638                          */
8639                 case WAL_SYNC_METHOD_FSYNC:
8640                 case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
8641                 case WAL_SYNC_METHOD_FDATASYNC:
8642                         return o_direct_flag;
8643 #ifdef O_SYNC
8644                 case WAL_SYNC_METHOD_OPEN:
8645                         return O_SYNC | o_direct_flag;
8646 #endif
8647 #ifdef O_DSYNC
8648                 case WAL_SYNC_METHOD_OPEN_DSYNC:
8649                         return O_DSYNC | o_direct_flag;
8650 #endif
8651                 default:
8652                         /* can't happen (unless we are out of sync with option array) */
8653                         elog(ERROR, "unrecognized \"wal_sync_method\": %d", method);
8654                         return 0;                       /* silence warning */
8655         }
8656 }
8657
8658 /*
8659  * GUC support
8660  */
8661 void
8662 assign_wal_sync_method(int new_wal_sync_method, void *extra)
8663 {
8664         if (wal_sync_method != new_wal_sync_method)
8665         {
8666                 /*
8667                  * To ensure that no blocks escape unsynced, force an fsync on the
8668                  * currently open log segment (if any).  Also, if the open flag is
8669                  * changing, close the log file so it will be reopened (with new flag
8670                  * bit) at next use.
8671                  */
8672                 if (openLogFile >= 0)
8673                 {
8674                         pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
8675                         if (pg_fsync(openLogFile) != 0)
8676                         {
8677                                 char            xlogfname[MAXFNAMELEN];
8678                                 int                     save_errno;
8679
8680                                 save_errno = errno;
8681                                 XLogFileName(xlogfname, openLogTLI, openLogSegNo,
8682                                                          wal_segment_size);
8683                                 errno = save_errno;
8684                                 ereport(PANIC,
8685                                                 (errcode_for_file_access(),
8686                                                  errmsg("could not fsync file \"%s\": %m", xlogfname)));
8687                         }
8688
8689                         pgstat_report_wait_end();
8690                         if (get_sync_bit(wal_sync_method) != get_sync_bit(new_wal_sync_method))
8691                                 XLogFileClose();
8692                 }
8693         }
8694 }
8695
8696
8697 /*
8698  * Issue appropriate kind of fsync (if any) for an XLOG output file.
8699  *
8700  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
8701  * 'segno' is for error reporting purposes.
8702  */
8703 void
8704 issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
8705 {
8706         char       *msg = NULL;
8707         instr_time      start;
8708
8709         Assert(tli != 0);
8710
8711         /*
8712          * Quick exit if fsync is disabled or write() has already synced the WAL
8713          * file.
8714          */
8715         if (!enableFsync ||
8716                 wal_sync_method == WAL_SYNC_METHOD_OPEN ||
8717                 wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC)
8718                 return;
8719
8720         /*
8721          * Measure I/O timing to sync the WAL file for pg_stat_io and/or
8722          * pg_stat_wal.
8723          */
8724         start = pgstat_prepare_io_time(track_io_timing || track_wal_io_timing);
8725
8726         pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
8727         switch (wal_sync_method)
8728         {
8729                 case WAL_SYNC_METHOD_FSYNC:
8730                         if (pg_fsync_no_writethrough(fd) != 0)
8731                                 msg = _("could not fsync file \"%s\": %m");
8732                         break;
8733 #ifdef HAVE_FSYNC_WRITETHROUGH
8734                 case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
8735                         if (pg_fsync_writethrough(fd) != 0)
8736                                 msg = _("could not fsync write-through file \"%s\": %m");
8737                         break;
8738 #endif
8739                 case WAL_SYNC_METHOD_FDATASYNC:
8740                         if (pg_fdatasync(fd) != 0)
8741                                 msg = _("could not fdatasync file \"%s\": %m");
8742                         break;
8743                 case WAL_SYNC_METHOD_OPEN:
8744                 case WAL_SYNC_METHOD_OPEN_DSYNC:
8745                         /* not reachable */
8746                         Assert(false);
8747                         break;
8748                 default:
8749                         ereport(PANIC,
8750                                         errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8751                                         errmsg_internal("unrecognized \"wal_sync_method\": %d", wal_sync_method));
8752                         break;
8753         }
8754
8755         /* PANIC if failed to fsync */
8756         if (msg)
8757         {
8758                 char            xlogfname[MAXFNAMELEN];
8759                 int                     save_errno = errno;
8760
8761                 XLogFileName(xlogfname, tli, segno, wal_segment_size);
8762                 errno = save_errno;
8763                 ereport(PANIC,
8764                                 (errcode_for_file_access(),
8765                                  errmsg(msg, xlogfname)));
8766         }
8767
8768         pgstat_report_wait_end();
8769
8770         /*
8771          * Increment the I/O timing and the number of times WAL files were synced.
8772          */
8773         if (track_wal_io_timing)
8774         {
8775                 instr_time      end;
8776
8777                 INSTR_TIME_SET_CURRENT(end);
8778                 INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, end, start);
8779         }
8780
8781         pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC,
8782                                                         start, 1, 0);
8783
8784         PendingWalStats.wal_sync++;
8785 }
8786
8787 /*
8788  * do_pg_backup_start is the workhorse of the user-visible pg_backup_start()
8789  * function. It creates the necessary starting checkpoint and constructs the
8790  * backup state and tablespace map.
8791  *
8792  * Input parameters are "state" (the backup state), "fast" (if true, we do
8793  * the checkpoint in immediate mode to make it faster), and "tablespaces"
8794  * (if non-NULL, indicates a list of tablespaceinfo structs describing the
8795  * cluster's tablespaces.).
8796  *
8797  * The tablespace map contents are appended to passed-in parameter
8798  * tablespace_map and the caller is responsible for including it in the backup
8799  * archive as 'tablespace_map'. The tablespace_map file is required mainly for
8800  * tar format in windows as native windows utilities are not able to create
8801  * symlinks while extracting files from tar. However for consistency and
8802  * platform-independence, we do it the same way everywhere.
8803  *
8804  * It fills in "state" with the information required for the backup, such
8805  * as the minimum WAL location that must be present to restore from this
8806  * backup (startpoint) and the corresponding timeline ID (starttli).
8807  *
8808  * Every successfully started backup must be stopped by calling
8809  * do_pg_backup_stop() or do_pg_abort_backup(). There can be many
8810  * backups active at the same time.
8811  *
8812  * It is the responsibility of the caller of this function to verify the
8813  * permissions of the calling user!
8814  */
8815 void
8816 do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
8817                                    BackupState *state, StringInfo tblspcmapfile)
8818 {
8819         bool            backup_started_in_recovery;
8820
8821         Assert(state != NULL);
8822         backup_started_in_recovery = RecoveryInProgress();
8823
8824         /*
8825          * During recovery, we don't need to check WAL level. Because, if WAL
8826          * level is not sufficient, it's impossible to get here during recovery.
8827          */
8828         if (!backup_started_in_recovery && !XLogIsNeeded())
8829                 ereport(ERROR,
8830                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8831                                  errmsg("WAL level not sufficient for making an online backup"),
8832                                  errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
8833
8834         if (strlen(backupidstr) > MAXPGPATH)
8835                 ereport(ERROR,
8836                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8837                                  errmsg("backup label too long (max %d bytes)",
8838                                                 MAXPGPATH)));
8839
8840         strlcpy(state->name, backupidstr, sizeof(state->name));
8841
8842         /*
8843          * Mark backup active in shared memory.  We must do full-page WAL writes
8844          * during an on-line backup even if not doing so at other times, because
8845          * it's quite possible for the backup dump to obtain a "torn" (partially
8846          * written) copy of a database page if it reads the page concurrently with
8847          * our write to the same page.  This can be fixed as long as the first
8848          * write to the page in the WAL sequence is a full-page write. Hence, we
8849          * increment runningBackups then force a CHECKPOINT, to ensure there are
8850          * no dirty pages in shared memory that might get dumped while the backup
8851          * is in progress without having a corresponding WAL record.  (Once the
8852          * backup is complete, we need not force full-page writes anymore, since
8853          * we expect that any pages not modified during the backup interval must
8854          * have been correctly captured by the backup.)
8855          *
8856          * Note that forcing full-page writes has no effect during an online
8857          * backup from the standby.
8858          *
8859          * We must hold all the insertion locks to change the value of
8860          * runningBackups, to ensure adequate interlocking against
8861          * XLogInsertRecord().
8862          */
8863         WALInsertLockAcquireExclusive();
8864         XLogCtl->Insert.runningBackups++;
8865         WALInsertLockRelease();
8866
8867         /*
8868          * Ensure we decrement runningBackups if we fail below. NB -- for this to
8869          * work correctly, it is critical that sessionBackupState is only updated
8870          * after this block is over.
8871          */
8872         PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true));
8873         {
8874                 bool            gotUniqueStartpoint = false;
8875                 DIR                *tblspcdir;
8876                 struct dirent *de;
8877                 tablespaceinfo *ti;
8878                 int                     datadirpathlen;
8879
8880                 /*
8881                  * Force an XLOG file switch before the checkpoint, to ensure that the
8882                  * WAL segment the checkpoint is written to doesn't contain pages with
8883                  * old timeline IDs.  That would otherwise happen if you called
8884                  * pg_backup_start() right after restoring from a PITR archive: the
8885                  * first WAL segment containing the startup checkpoint has pages in
8886                  * the beginning with the old timeline ID.  That can cause trouble at
8887                  * recovery: we won't have a history file covering the old timeline if
8888                  * pg_wal directory was not included in the base backup and the WAL
8889                  * archive was cleared too before starting the backup.
8890                  *
8891                  * This also ensures that we have emitted a WAL page header that has
8892                  * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
8893                  * Therefore, if a WAL archiver (such as pglesslog) is trying to
8894                  * compress out removable backup blocks, it won't remove any that
8895                  * occur after this point.
8896                  *
8897                  * During recovery, we skip forcing XLOG file switch, which means that
8898                  * the backup taken during recovery is not available for the special
8899                  * recovery case described above.
8900                  */
8901                 if (!backup_started_in_recovery)
8902                         RequestXLogSwitch(false);
8903
8904                 do
8905                 {
8906                         bool            checkpointfpw;
8907
8908                         /*
8909                          * Force a CHECKPOINT.  Aside from being necessary to prevent torn
8910                          * page problems, this guarantees that two successive backup runs
8911                          * will have different checkpoint positions and hence different
8912                          * history file names, even if nothing happened in between.
8913                          *
8914                          * During recovery, establish a restartpoint if possible. We use
8915                          * the last restartpoint as the backup starting checkpoint. This
8916                          * means that two successive backup runs can have same checkpoint
8917                          * positions.
8918                          *
8919                          * Since the fact that we are executing do_pg_backup_start()
8920                          * during recovery means that checkpointer is running, we can use
8921                          * RequestCheckpoint() to establish a restartpoint.
8922                          *
8923                          * We use CHECKPOINT_IMMEDIATE only if requested by user (via
8924                          * passing fast = true).  Otherwise this can take awhile.
8925                          */
8926                         RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
8927                                                           (fast ? CHECKPOINT_IMMEDIATE : 0));
8928
8929                         /*
8930                          * Now we need to fetch the checkpoint record location, and also
8931                          * its REDO pointer.  The oldest point in WAL that would be needed
8932                          * to restore starting from the checkpoint is precisely the REDO
8933                          * pointer.
8934                          */
8935                         LWLockAcquire(ControlFileLock, LW_SHARED);
8936                         state->checkpointloc = ControlFile->checkPoint;
8937                         state->startpoint = ControlFile->checkPointCopy.redo;
8938                         state->starttli = ControlFile->checkPointCopy.ThisTimeLineID;
8939                         checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
8940                         LWLockRelease(ControlFileLock);
8941
8942                         if (backup_started_in_recovery)
8943                         {
8944                                 XLogRecPtr      recptr;
8945
8946                                 /*
8947                                  * Check to see if all WAL replayed during online backup
8948                                  * (i.e., since last restartpoint used as backup starting
8949                                  * checkpoint) contain full-page writes.
8950                                  */
8951                                 SpinLockAcquire(&XLogCtl->info_lck);
8952                                 recptr = XLogCtl->lastFpwDisableRecPtr;
8953                                 SpinLockRelease(&XLogCtl->info_lck);
8954
8955                                 if (!checkpointfpw || state->startpoint <= recptr)
8956                                         ereport(ERROR,
8957                                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8958                                                          errmsg("WAL generated with \"full_page_writes=off\" was replayed "
8959                                                                         "since last restartpoint"),
8960                                                          errhint("This means that the backup being taken on the standby "
8961                                                                          "is corrupt and should not be used. "
8962                                                                          "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
8963                                                                          "and then try an online backup again.")));
8964
8965                                 /*
8966                                  * During recovery, since we don't use the end-of-backup WAL
8967                                  * record and don't write the backup history file, the
8968                                  * starting WAL location doesn't need to be unique. This means
8969                                  * that two base backups started at the same time might use
8970                                  * the same checkpoint as starting locations.
8971                                  */
8972                                 gotUniqueStartpoint = true;
8973                         }
8974
8975                         /*
8976                          * If two base backups are started at the same time (in WAL sender
8977                          * processes), we need to make sure that they use different
8978                          * checkpoints as starting locations, because we use the starting
8979                          * WAL location as a unique identifier for the base backup in the
8980                          * end-of-backup WAL record and when we write the backup history
8981                          * file. Perhaps it would be better generate a separate unique ID
8982                          * for each backup instead of forcing another checkpoint, but
8983                          * taking a checkpoint right after another is not that expensive
8984                          * either because only few buffers have been dirtied yet.
8985                          */
8986                         WALInsertLockAcquireExclusive();
8987                         if (XLogCtl->Insert.lastBackupStart < state->startpoint)
8988                         {
8989                                 XLogCtl->Insert.lastBackupStart = state->startpoint;
8990                                 gotUniqueStartpoint = true;
8991                         }
8992                         WALInsertLockRelease();
8993                 } while (!gotUniqueStartpoint);
8994
8995                 /*
8996                  * Construct tablespace_map file.
8997                  */
8998                 datadirpathlen = strlen(DataDir);
8999
9000                 /* Collect information about all tablespaces */
9001                 tblspcdir = AllocateDir(PG_TBLSPC_DIR);
9002                 while ((de = ReadDir(tblspcdir, PG_TBLSPC_DIR)) != NULL)
9003                 {
9004                         char            fullpath[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
9005                         char            linkpath[MAXPGPATH];
9006                         char       *relpath = NULL;
9007                         char       *s;
9008                         PGFileType      de_type;
9009                         char       *badp;
9010                         Oid                     tsoid;
9011
9012                         /*
9013                          * Try to parse the directory name as an unsigned integer.
9014                          *
9015                          * Tablespace directories should be positive integers that can be
9016                          * represented in 32 bits, with no leading zeroes or trailing
9017                          * garbage. If we come across a name that doesn't meet those
9018                          * criteria, skip it.
9019                          */
9020                         if (de->d_name[0] < '1' || de->d_name[1] > '9')
9021                                 continue;
9022                         errno = 0;
9023                         tsoid = strtoul(de->d_name, &badp, 10);
9024                         if (*badp != '\0' || errno == EINVAL || errno == ERANGE)
9025                                 continue;
9026
9027                         snprintf(fullpath, sizeof(fullpath), "%s/%s", PG_TBLSPC_DIR, de->d_name);
9028
9029                         de_type = get_dirent_type(fullpath, de, false, ERROR);
9030
9031                         if (de_type == PGFILETYPE_LNK)
9032                         {
9033                                 StringInfoData escapedpath;
9034                                 int                     rllen;
9035
9036                                 rllen = readlink(fullpath, linkpath, sizeof(linkpath));
9037                                 if (rllen < 0)
9038                                 {
9039                                         ereport(WARNING,
9040                                                         (errmsg("could not read symbolic link \"%s\": %m",
9041                                                                         fullpath)));
9042                                         continue;
9043                                 }
9044                                 else if (rllen >= sizeof(linkpath))
9045                                 {
9046                                         ereport(WARNING,
9047                                                         (errmsg("symbolic link \"%s\" target is too long",
9048                                                                         fullpath)));
9049                                         continue;
9050                                 }
9051                                 linkpath[rllen] = '\0';
9052
9053                                 /*
9054                                  * Relpath holds the relative path of the tablespace directory
9055                                  * when it's located within PGDATA, or NULL if it's located
9056                                  * elsewhere.
9057                                  */
9058                                 if (rllen > datadirpathlen &&
9059                                         strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
9060                                         IS_DIR_SEP(linkpath[datadirpathlen]))
9061                                         relpath = pstrdup(linkpath + datadirpathlen + 1);
9062
9063                                 /*
9064                                  * Add a backslash-escaped version of the link path to the
9065                                  * tablespace map file.
9066                                  */
9067                                 initStringInfo(&escapedpath);
9068                                 for (s = linkpath; *s; s++)
9069                                 {
9070                                         if (*s == '\n' || *s == '\r' || *s == '\\')
9071                                                 appendStringInfoChar(&escapedpath, '\\');
9072                                         appendStringInfoChar(&escapedpath, *s);
9073                                 }
9074                                 appendStringInfo(tblspcmapfile, "%s %s\n",
9075                                                                  de->d_name, escapedpath.data);
9076                                 pfree(escapedpath.data);
9077                         }
9078                         else if (de_type == PGFILETYPE_DIR)
9079                         {
9080                                 /*
9081                                  * It's possible to use allow_in_place_tablespaces to create
9082                                  * directories directly under pg_tblspc, for testing purposes
9083                                  * only.
9084                                  *
9085                                  * In this case, we store a relative path rather than an
9086                                  * absolute path into the tablespaceinfo.
9087                                  */
9088                                 snprintf(linkpath, sizeof(linkpath), "%s/%s",
9089                                                  PG_TBLSPC_DIR, de->d_name);
9090                                 relpath = pstrdup(linkpath);
9091                         }
9092                         else
9093                         {
9094                                 /* Skip any other file type that appears here. */
9095                                 continue;
9096                         }
9097
9098                         ti = palloc(sizeof(tablespaceinfo));
9099                         ti->oid = tsoid;
9100                         ti->path = pstrdup(linkpath);
9101                         ti->rpath = relpath;
9102                         ti->size = -1;
9103
9104                         if (tablespaces)
9105                                 *tablespaces = lappend(*tablespaces, ti);
9106                 }
9107                 FreeDir(tblspcdir);
9108
9109                 state->starttime = (pg_time_t) time(NULL);
9110         }
9111         PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true));
9112
9113         state->started_in_recovery = backup_started_in_recovery;
9114
9115         /*
9116          * Mark that the start phase has correctly finished for the backup.
9117          */
9118         sessionBackupState = SESSION_BACKUP_RUNNING;
9119 }
9120
9121 /*
9122  * Utility routine to fetch the session-level status of a backup running.
9123  */
9124 SessionBackupState
9125 get_backup_status(void)
9126 {
9127         return sessionBackupState;
9128 }
9129
9130 /*
9131  * do_pg_backup_stop
9132  *
9133  * Utility function called at the end of an online backup.  It creates history
9134  * file (if required), resets sessionBackupState and so on.  It can optionally
9135  * wait for WAL segments to be archived.
9136  *
9137  * "state" is filled with the information necessary to restore from this
9138  * backup with its stop LSN (stoppoint), its timeline ID (stoptli), etc.
9139  *
9140  * It is the responsibility of the caller of this function to verify the
9141  * permissions of the calling user!
9142  */
9143 void
9144 do_pg_backup_stop(BackupState *state, bool waitforarchive)
9145 {
9146         bool            backup_stopped_in_recovery = false;
9147         char            histfilepath[MAXPGPATH];
9148         char            lastxlogfilename[MAXFNAMELEN];
9149         char            histfilename[MAXFNAMELEN];
9150         XLogSegNo       _logSegNo;
9151         FILE       *fp;
9152         int                     seconds_before_warning;
9153         int                     waits = 0;
9154         bool            reported_waiting = false;
9155
9156         Assert(state != NULL);
9157
9158         backup_stopped_in_recovery = RecoveryInProgress();
9159
9160         /*
9161          * During recovery, we don't need to check WAL level. Because, if WAL
9162          * level is not sufficient, it's impossible to get here during recovery.
9163          */
9164         if (!backup_stopped_in_recovery && !XLogIsNeeded())
9165                 ereport(ERROR,
9166                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9167                                  errmsg("WAL level not sufficient for making an online backup"),
9168                                  errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
9169
9170         /*
9171          * OK to update backup counter and session-level lock.
9172          *
9173          * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them,
9174          * otherwise they can be updated inconsistently, which might cause
9175          * do_pg_abort_backup() to fail.
9176          */
9177         WALInsertLockAcquireExclusive();
9178
9179         /*
9180          * It is expected that each do_pg_backup_start() call is matched by
9181          * exactly one do_pg_backup_stop() call.
9182          */
9183         Assert(XLogCtl->Insert.runningBackups > 0);
9184         XLogCtl->Insert.runningBackups--;
9185
9186         /*
9187          * Clean up session-level lock.
9188          *
9189          * You might think that WALInsertLockRelease() can be called before
9190          * cleaning up session-level lock because session-level lock doesn't need
9191          * to be protected with WAL insertion lock. But since
9192          * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
9193          * cleaned up before it.
9194          */
9195         sessionBackupState = SESSION_BACKUP_NONE;
9196
9197         WALInsertLockRelease();
9198
9199         /*
9200          * If we are taking an online backup from the standby, we confirm that the
9201          * standby has not been promoted during the backup.
9202          */
9203         if (state->started_in_recovery && !backup_stopped_in_recovery)
9204                 ereport(ERROR,
9205                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9206                                  errmsg("the standby was promoted during online backup"),
9207                                  errhint("This means that the backup being taken is corrupt "
9208                                                  "and should not be used. "
9209                                                  "Try taking another online backup.")));
9210
9211         /*
9212          * During recovery, we don't write an end-of-backup record. We assume that
9213          * pg_control was backed up last and its minimum recovery point can be
9214          * available as the backup end location. Since we don't have an
9215          * end-of-backup record, we use the pg_control value to check whether
9216          * we've reached the end of backup when starting recovery from this
9217          * backup. We have no way of checking if pg_control wasn't backed up last
9218          * however.
9219          *
9220          * We don't force a switch to new WAL file but it is still possible to
9221          * wait for all the required files to be archived if waitforarchive is
9222          * true. This is okay if we use the backup to start a standby and fetch
9223          * the missing WAL using streaming replication. But in the case of an
9224          * archive recovery, a user should set waitforarchive to true and wait for
9225          * them to be archived to ensure that all the required files are
9226          * available.
9227          *
9228          * We return the current minimum recovery point as the backup end
9229          * location. Note that it can be greater than the exact backup end
9230          * location if the minimum recovery point is updated after the backup of
9231          * pg_control. This is harmless for current uses.
9232          *
9233          * XXX currently a backup history file is for informational and debug
9234          * purposes only. It's not essential for an online backup. Furthermore,
9235          * even if it's created, it will not be archived during recovery because
9236          * an archiver is not invoked. So it doesn't seem worthwhile to write a
9237          * backup history file during recovery.
9238          */
9239         if (backup_stopped_in_recovery)
9240         {
9241                 XLogRecPtr      recptr;
9242
9243                 /*
9244                  * Check to see if all WAL replayed during online backup contain
9245                  * full-page writes.
9246                  */
9247                 SpinLockAcquire(&XLogCtl->info_lck);
9248                 recptr = XLogCtl->lastFpwDisableRecPtr;
9249                 SpinLockRelease(&XLogCtl->info_lck);
9250
9251                 if (state->startpoint <= recptr)
9252                         ereport(ERROR,
9253                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9254                                          errmsg("WAL generated with \"full_page_writes=off\" was replayed "
9255                                                         "during online backup"),
9256                                          errhint("This means that the backup being taken on the standby "
9257                                                          "is corrupt and should not be used. "
9258                                                          "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
9259                                                          "and then try an online backup again.")));
9260
9261
9262                 LWLockAcquire(ControlFileLock, LW_SHARED);
9263                 state->stoppoint = ControlFile->minRecoveryPoint;
9264                 state->stoptli = ControlFile->minRecoveryPointTLI;
9265                 LWLockRelease(ControlFileLock);
9266         }
9267         else
9268         {
9269                 char       *history_file;
9270
9271                 /*
9272                  * Write the backup-end xlog record
9273                  */
9274                 XLogBeginInsert();
9275                 XLogRegisterData((char *) (&state->startpoint),
9276                                                  sizeof(state->startpoint));
9277                 state->stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
9278
9279                 /*
9280                  * Given that we're not in recovery, InsertTimeLineID is set and can't
9281                  * change, so we can read it without a lock.
9282                  */
9283                 state->stoptli = XLogCtl->InsertTimeLineID;
9284
9285                 /*
9286                  * Force a switch to a new xlog segment file, so that the backup is
9287                  * valid as soon as archiver moves out the current segment file.
9288                  */
9289                 RequestXLogSwitch(false);
9290
9291                 state->stoptime = (pg_time_t) time(NULL);
9292
9293                 /*
9294                  * Write the backup history file
9295                  */
9296                 XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
9297                 BackupHistoryFilePath(histfilepath, state->stoptli, _logSegNo,
9298                                                           state->startpoint, wal_segment_size);
9299                 fp = AllocateFile(histfilepath, "w");
9300                 if (!fp)
9301                         ereport(ERROR,
9302                                         (errcode_for_file_access(),
9303                                          errmsg("could not create file \"%s\": %m",
9304                                                         histfilepath)));
9305
9306                 /* Build and save the contents of the backup history file */
9307                 history_file = build_backup_content(state, true);
9308                 fprintf(fp, "%s", history_file);
9309                 pfree(history_file);
9310
9311                 if (fflush(fp) || ferror(fp) || FreeFile(fp))
9312                         ereport(ERROR,
9313                                         (errcode_for_file_access(),
9314                                          errmsg("could not write file \"%s\": %m",
9315                                                         histfilepath)));
9316
9317                 /*
9318                  * Clean out any no-longer-needed history files.  As a side effect,
9319                  * this will post a .ready file for the newly created history file,
9320                  * notifying the archiver that history file may be archived
9321                  * immediately.
9322                  */
9323                 CleanupBackupHistory();
9324         }
9325
9326         /*
9327          * If archiving is enabled, wait for all the required WAL files to be
9328          * archived before returning. If archiving isn't enabled, the required WAL
9329          * needs to be transported via streaming replication (hopefully with
9330          * wal_keep_size set high enough), or some more exotic mechanism like
9331          * polling and copying files from pg_wal with script. We have no knowledge
9332          * of those mechanisms, so it's up to the user to ensure that he gets all
9333          * the required WAL.
9334          *
9335          * We wait until both the last WAL file filled during backup and the
9336          * history file have been archived, and assume that the alphabetic sorting
9337          * property of the WAL files ensures any earlier WAL files are safely
9338          * archived as well.
9339          *
9340          * We wait forever, since archive_command is supposed to work and we
9341          * assume the admin wanted his backup to work completely. If you don't
9342          * wish to wait, then either waitforarchive should be passed in as false,
9343          * or you can set statement_timeout.  Also, some notices are issued to
9344          * clue in anyone who might be doing this interactively.
9345          */
9346
9347         if (waitforarchive &&
9348                 ((!backup_stopped_in_recovery && XLogArchivingActive()) ||
9349                  (backup_stopped_in_recovery && XLogArchivingAlways())))
9350         {
9351                 XLByteToPrevSeg(state->stoppoint, _logSegNo, wal_segment_size);
9352                 XLogFileName(lastxlogfilename, state->stoptli, _logSegNo,
9353                                          wal_segment_size);
9354
9355                 XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
9356                 BackupHistoryFileName(histfilename, state->stoptli, _logSegNo,
9357                                                           state->startpoint, wal_segment_size);
9358
9359                 seconds_before_warning = 60;
9360                 waits = 0;
9361
9362                 while (XLogArchiveIsBusy(lastxlogfilename) ||
9363                            XLogArchiveIsBusy(histfilename))
9364                 {
9365                         CHECK_FOR_INTERRUPTS();
9366
9367                         if (!reported_waiting && waits > 5)
9368                         {
9369                                 ereport(NOTICE,
9370                                                 (errmsg("base backup done, waiting for required WAL segments to be archived")));
9371                                 reported_waiting = true;
9372                         }
9373
9374                         (void) WaitLatch(MyLatch,
9375                                                          WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
9376                                                          1000L,
9377                                                          WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
9378                         ResetLatch(MyLatch);
9379
9380                         if (++waits >= seconds_before_warning)
9381                         {
9382                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
9383                                 ereport(WARNING,
9384                                                 (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
9385                                                                 waits),
9386                                                  errhint("Check that your \"archive_command\" is executing properly.  "
9387                                                                  "You can safely cancel this backup, "
9388                                                                  "but the database backup will not be usable without all the WAL segments.")));
9389                         }
9390                 }
9391
9392                 ereport(NOTICE,
9393                                 (errmsg("all required WAL segments have been archived")));
9394         }
9395         else if (waitforarchive)
9396                 ereport(NOTICE,
9397                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
9398 }
9399
9400
9401 /*
9402  * do_pg_abort_backup: abort a running backup
9403  *
9404  * This does just the most basic steps of do_pg_backup_stop(), by taking the
9405  * system out of backup mode, thus making it a lot more safe to call from
9406  * an error handler.
9407  *
9408  * 'arg' indicates that it's being called during backup setup; so
9409  * sessionBackupState has not been modified yet, but runningBackups has
9410  * already been incremented.  When it's false, then it's invoked as a
9411  * before_shmem_exit handler, and therefore we must not change state
9412  * unless sessionBackupState indicates that a backup is actually running.
9413  *
9414  * NB: This gets used as a PG_ENSURE_ERROR_CLEANUP callback and
9415  * before_shmem_exit handler, hence the odd-looking signature.
9416  */
9417 void
9418 do_pg_abort_backup(int code, Datum arg)
9419 {
9420         bool            during_backup_start = DatumGetBool(arg);
9421
9422         /* If called during backup start, there shouldn't be one already running */
9423         Assert(!during_backup_start || sessionBackupState == SESSION_BACKUP_NONE);
9424
9425         if (during_backup_start || sessionBackupState != SESSION_BACKUP_NONE)
9426         {
9427                 WALInsertLockAcquireExclusive();
9428                 Assert(XLogCtl->Insert.runningBackups > 0);
9429                 XLogCtl->Insert.runningBackups--;
9430
9431                 sessionBackupState = SESSION_BACKUP_NONE;
9432                 WALInsertLockRelease();
9433
9434                 if (!during_backup_start)
9435                         ereport(WARNING,
9436                                         errmsg("aborting backup due to backend exiting before pg_backup_stop was called"));
9437         }
9438 }
9439
9440 /*
9441  * Register a handler that will warn about unterminated backups at end of
9442  * session, unless this has already been done.
9443  */
9444 void
9445 register_persistent_abort_backup_handler(void)
9446 {
9447         static bool already_done = false;
9448
9449         if (already_done)
9450                 return;
9451         before_shmem_exit(do_pg_abort_backup, DatumGetBool(false));
9452         already_done = true;
9453 }
9454
9455 /*
9456  * Get latest WAL insert pointer
9457  */
9458 XLogRecPtr
9459 GetXLogInsertRecPtr(void)
9460 {
9461         XLogCtlInsert *Insert = &XLogCtl->Insert;
9462         uint64          current_bytepos;
9463
9464         SpinLockAcquire(&Insert->insertpos_lck);
9465         current_bytepos = Insert->CurrBytePos;
9466         SpinLockRelease(&Insert->insertpos_lck);
9467
9468         return XLogBytePosToRecPtr(current_bytepos);
9469 }
9470
9471 /*
9472  * Get latest WAL write pointer
9473  */
9474 XLogRecPtr
9475 GetXLogWriteRecPtr(void)
9476 {
9477         RefreshXLogWriteResult(LogwrtResult);
9478
9479         return LogwrtResult.Write;
9480 }
9481
9482 /*
9483  * Returns the redo pointer of the last checkpoint or restartpoint. This is
9484  * the oldest point in WAL that we still need, if we have to restart recovery.
9485  */
9486 void
9487 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
9488 {
9489         LWLockAcquire(ControlFileLock, LW_SHARED);
9490         *oldrecptr = ControlFile->checkPointCopy.redo;
9491         *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
9492         LWLockRelease(ControlFileLock);
9493 }
9494
9495 /* Thin wrapper around ShutdownWalRcv(). */
9496 void
9497 XLogShutdownWalRcv(void)
9498 {
9499         ShutdownWalRcv();
9500
9501         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9502         XLogCtl->InstallXLogFileSegmentActive = false;
9503         LWLockRelease(ControlFileLock);
9504 }
9505
9506 /* Enable WAL file recycling and preallocation. */
9507 void
9508 SetInstallXLogFileSegmentActive(void)
9509 {
9510         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9511         XLogCtl->InstallXLogFileSegmentActive = true;
9512         LWLockRelease(ControlFileLock);
9513 }
9514
9515 bool
9516 IsInstallXLogFileSegmentActive(void)
9517 {
9518         bool            result;
9519
9520         LWLockAcquire(ControlFileLock, LW_SHARED);
9521         result = XLogCtl->InstallXLogFileSegmentActive;
9522         LWLockRelease(ControlFileLock);
9523
9524         return result;
9525 }
9526
9527 /*
9528  * Update the WalWriterSleeping flag.
9529  */
9530 void
9531 SetWalWriterSleeping(bool sleeping)
9532 {
9533         SpinLockAcquire(&XLogCtl->info_lck);
9534         XLogCtl->WalWriterSleeping = sleeping;
9535         SpinLockRelease(&XLogCtl->info_lck);
9536 }