src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * $PostgreSQL$
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <signal.h>
  19 #include <time.h>
  20 #include <fcntl.h>
  21 #include <sys/stat.h>
  22 #include <sys/time.h>
  23 #include <sys/wait.h>
  24 #include <unistd.h>
  25
  26 #include "access/clog.h"
  27 #include "access/multixact.h"
  28 #include "access/subtrans.h"
  29 #include "access/transam.h"
  30 #include "access/tuptoaster.h"
  31 #include "access/twophase.h"
  32 #include "access/xact.h"
  33 #include "access/xlog_internal.h"
  34 #include "access/xlogutils.h"
  35 #include "catalog/catversion.h"
  36 #include "catalog/pg_control.h"
  37 #include "catalog/pg_type.h"
  38 #include "funcapi.h"
  39 #include "libpq/pqsignal.h"
  40 #include "miscadmin.h"
  41 #include "pgstat.h"
  42 #include "postmaster/bgwriter.h"
  43 #include "storage/bufmgr.h"
  44 #include "storage/fd.h"
  45 #include "storage/ipc.h"
  46 #include "storage/pmsignal.h"
  47 #include "storage/procarray.h"
  48 #include "storage/smgr.h"
  49 #include "storage/spin.h"
  50 #include "utils/builtins.h"
  51 #include "utils/flatfiles.h"
  52 #include "utils/guc.h"
  53 #include "utils/ps_status.h"
  54 #include "pg_trace.h"
  55
  56
  57 /* File path names (all relative to $PGDATA) */
  58 #define BACKUP_LABEL_FILE               "backup_label"
  59 #define BACKUP_LABEL_OLD                "backup_label.old"
  60 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  61 #define RECOVERY_COMMAND_DONE   "recovery.done"
  62
  63
  64 /* User-settable parameters */
  65 int                     CheckPointSegments = 3;
  66 int                     XLOGbuffers = 8;
  67 int                     XLogArchiveTimeout = 0;
  68 bool            XLogArchiveMode = false;
  69 char       *XLogArchiveCommand = NULL;
  70 bool            fullPageWrites = true;
  71 bool            log_checkpoints = false;
  72 int                     sync_method = DEFAULT_SYNC_METHOD;
  73
  74 #ifdef WAL_DEBUG
  75 bool            XLOG_DEBUG = false;
  76 #endif
  77
  78 /*
  79  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
  80  * When we are done with an old XLOG segment file, we will recycle it as a
  81  * future XLOG segment as long as there aren't already XLOGfileslop future
  82  * segments; else we'll delete it.  This could be made a separate GUC
  83  * variable, but at present I think it's sufficient to hardwire it as
  84  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
  85  * no more than 2*CheckPointSegments log segments, and we want to recycle all
  86  * of them; the +1 allows boundary cases to happen without wasting a
  87  * delete/create-segment cycle.
  88  */
  89 #define XLOGfileslop    (2*CheckPointSegments + 1)
  90
  91 /*
  92  * GUC support: sync_method names -> SYNC_METHOD_* values (some entries are #ifdef-conditional)
  93  */
  94 const struct config_enum_entry sync_method_options[] = {
  95         {"fsync", SYNC_METHOD_FSYNC, false},
  96 #ifdef HAVE_FSYNC_WRITETHROUGH
  97         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
  98 #endif
  99 #ifdef HAVE_FDATASYNC
 100         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
 101 #endif
 102 #ifdef OPEN_SYNC_FLAG
 103         {"open_sync", SYNC_METHOD_OPEN, false},
 104 #endif
 105 #ifdef OPEN_DATASYNC_FLAG
 106         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 107 #endif
 108         {NULL, 0, false}        /* NULL-named last entry; presumably the list terminator -- confirm against guc.c */
 109 };
 110
 111 /*
 112  * Statistics for current checkpoint are collected in this global struct.
 113  * Because only the background writer or a stand-alone backend can perform
 114  * checkpoints, this will be unused in normal backends.
 115  */
 116 CheckpointStatsData CheckpointStats;
 117
 118 /*
 119  * ThisTimeLineID will be same in all backends --- it identifies current
 120  * WAL timeline for the database system.
 121  */
 122 TimeLineID      ThisTimeLineID = 0;
 123
 124 /*
 125  * Are we doing recovery from XLOG?
 126  *
 127  * This is only ever true in the startup process; it should be read as meaning
 128  * "this process is replaying WAL records", rather than "the system is in
 129  * recovery mode".  It should be examined primarily by functions that need
 130  * to act differently when called from a WAL redo function (e.g., to skip WAL
 131  * logging).  To check whether the system is in recovery regardless of which
 132  * process you're running in, use RecoveryInProgress().
 133  */
 134 bool            InRecovery = false;
 135
 136 /*
 137  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 138  * known, need to check the shared state".
 139  */
 140 static bool LocalRecoveryInProgress = true;
 141
 142 /*
 143  * Local state for XLogInsertAllowed():
 144  *              1: unconditionally allowed to insert XLOG
 145  *              0: unconditionally not allowed to insert XLOG
 146  *              -1: must check RecoveryInProgress(); disallow until it is false
 147  * Most processes start with -1 and transition to 1 after seeing that recovery
 148  * is not in progress.  But we can also force the value for special cases.
 149  * The coding in XLogInsertAllowed() depends on the first two of these states
 150  * being numerically the same as bool true and false.
 151  */
 152 static int      LocalXLogInsertAllowed = -1;
 153
 154 /* Are we recovering using offline XLOG archives? */
 155 static bool InArchiveRecovery = false;
 156
 157 /* Was the last xlog file restored from archive, or local? */
 158 static bool restoredFromArchive = false;
 159
 160 /* options taken from recovery.conf */
 161 static char *recoveryRestoreCommand = NULL;
 162 static char *recoveryEndCommand = NULL;
 163 static bool recoveryTarget = false;
 164 static bool recoveryTargetExact = false;
 165 static bool recoveryTargetInclusive = true;
 166 static TransactionId recoveryTargetXid;
 167 static TimestampTz recoveryTargetTime;
 168 static TimestampTz recoveryLastXTime = 0;
 169
 170 /* if recoveryStopsHere returns true, it saves actual stop xid/time here */
 171 static TransactionId recoveryStopXid;
 172 static TimestampTz recoveryStopTime;
 173 static bool recoveryStopAfter;
 174
 175 /*
 176  * During normal operation, the only timeline we care about is ThisTimeLineID.
 177  * During recovery, however, things are more complicated.  To simplify life
 178  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 179  * scan through the WAL history (that is, it is the line that was active when
 180  * the currently-scanned WAL record was generated).  We also need these
 181  * timeline values:
 182  *
 183  * recoveryTargetTLI: the desired timeline that we want to end in.
 184  *
 185  * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 186  * its known parents, newest first (so recoveryTargetTLI is always the
 187  * first list member).  Only these TLIs are expected to be seen in the WAL
 188  * segments we read, and indeed only these TLIs will be considered as
 189  * candidate WAL files to open at all.
 190  *
 191  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 192  * (This is not necessarily the same as ThisTimeLineID, because we could
 193  * be scanning data that was copied from an ancestor timeline when the current
 194  * file was created.)  During a sequential scan we do not allow this value
 195  * to decrease.
 196  */
 197 static TimeLineID recoveryTargetTLI;
 198 static List *expectedTLIs;
 199 static TimeLineID curFileTLI;
 200
 201 /*
 202  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 203  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 204  * end+1 of the last record, and is reset when we end a top-level transaction,
 205  * or start a new one; so it can be used to tell if the current transaction has
 206  * created any XLOG records.
 207  */
 208 static XLogRecPtr ProcLastRecPtr = {0, 0};
 209
 210 XLogRecPtr      XactLastRecEnd = {0, 0};
 211
 212 /*
 213  * RedoRecPtr is this backend's local copy of the REDO record pointer
 214  * (which is almost but not quite the same as a pointer to the most recent
 215  * CHECKPOINT record).  We update this from the shared-memory copy,
 216  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 217  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 218  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 219  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 220  * InitXLOGAccess.
 221  */
 222 static XLogRecPtr RedoRecPtr;
 223
 224 /*----------
 225  * Shared-memory data structures for XLOG control
 226  *
 227  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 228  * the log up to (all records before that point must be written or fsynced).
 229  * LogwrtResult indicates the byte positions we have already written/fsynced.
 230  * These structs are identical but are declared separately to indicate their
 231  * slightly different functions.
 232  *
 233  * We do a lot of pushups to minimize the amount of access to lockable
 234  * shared memory values.  There are actually three shared-memory copies of
 235  * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 236  *              XLogCtl->LogwrtResult is protected by info_lck
 237  *              XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 238  *              XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 239  * One must hold the associated lock to read or write any of these, but
 240  * of course no lock is needed to read/write the unshared LogwrtResult.
 241  *
 242  * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 243  * right", since both are updated by a write or flush operation before
 244  * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 245  * is that it can be examined/modified by code that already holds WALWriteLock
 246  * without needing to grab info_lck as well.
 247  *
 248  * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
 249  * but is updated when convenient.  Again, it exists for the convenience of
 250  * code that is already holding WALInsertLock but not the other locks.
 251  *
 252  * The unshared LogwrtResult may lag behind any or all of these, and again
 253  * is updated when convenient.
 254  *
 255  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 256  * (protected by info_lck), but we don't need to cache any copies of it.
 257  *
 258  * Note that this all works because the request and result positions can only
 259  * advance forward, never back up, and so we can easily determine which of two
 260  * values is "more up to date".
 261  *
 262  * info_lck is only held long enough to read/update the protected variables,
 263  * so it's a plain spinlock.  The other locks are held longer (potentially
 264  * over I/O operations), so we use LWLocks for them.  These locks are:
 265  *
 266  * WALInsertLock: must be held to insert a record into the WAL buffers.
 267  *
 268  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 269  * XLogFlush).
 270  *
 271  * ControlFileLock: must be held to read/update control file or create
 272  * new log file.
 273  *
 274  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 275  * only one checkpointer at a time; currently, with all checkpoints done by
 276  * the bgwriter, this is just pro forma).
 277  *
 278  *----------
 279  */
 280
 281 typedef struct XLogwrtRqst         /* positions the log still needs to be written/fsynced up to */
 282 {
 283         XLogRecPtr      Write;                  /* request: last byte + 1 to write out */
 284         XLogRecPtr      Flush;                  /* request: last byte + 1 to flush (fsync) */
 285 } XLogwrtRqst;
 286
 287 typedef struct XLogwrtResult       /* positions already written/fsynced; same layout as XLogwrtRqst */
 288 {
 289         XLogRecPtr      Write;                  /* result: last byte + 1 already written out */
 290         XLogRecPtr      Flush;                  /* result: last byte + 1 already flushed (fsynced) */
 291 } XLogwrtResult;
 292
 293 /*
 294  * Shared state data for XLogInsert.
 295  */
 296 typedef struct XLogCtlInsert
 297 {
 298         XLogwrtResult LogwrtResult; /* a recent (possibly lagging) copy of LogwrtResult */
 299         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
 300         int                     curridx;                /* index of the current block (page buffer) in cache */
 301         XLogPageHeader currpage;        /* points to header of the current block in cache */
 302         char       *currpos;            /* current insertion point, inside the currpage buffer */
 303         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions; backends keep a local copy */
 304         bool            forcePageWrites;        /* forcing full-page writes for PITR, even if full_page_writes is off? */
 305 } XLogCtlInsert;
 306
 307 /*
 308  * Shared state data for XLogWrite/XLogFlush.
 309  */
 310 typedef struct XLogCtlWrite
 311 {
 312         XLogwrtResult LogwrtResult; /* current ("always right") value of LogwrtResult */
 313         int                     curridx;                /* cache index of next block to write */
 314         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
 315 } XLogCtlWrite;
 316
 317 /*
 318  * Total shared-memory state for XLOG.
 319  */
 320 typedef struct XLogCtlData
 321 {
 322         /* Protected by WALInsertLock: */
 323         XLogCtlInsert Insert;
 324
 325         /* Protected by info_lck: */
 326         XLogwrtRqst LogwrtRqst;
 327         XLogwrtResult LogwrtResult;
 328         uint32          ckptXidEpoch;   /* epoch that goes with ckptXid (latest checkpoint) */
 329         TransactionId ckptXid;          /* nextXID of latest checkpoint */
 330         XLogRecPtr      asyncCommitLSN; /* LSN of newest async commit */
 331
 332         /* Protected by WALWriteLock: */
 333         XLogCtlWrite Write;
 334
 335         /*
 336          * These values do not change after startup, although the pointed-to pages
 337          * and xlblocks values certainly do.  Permission to read/write the pages
 338          * and xlblocks values depends on WALInsertLock and WALWriteLock.
 339          */
 340         char       *pages;                      /* buffers for unwritten XLOG pages */
 341         XLogRecPtr *xlblocks;           /* per buffer: WAL position of page's 1st byte + XLOG_BLCKSZ (i.e. page end) */
 342         int                     XLogCacheBlck;  /* highest allocated xlog buffer index (buffer indexes wrap at this value) */
 343         TimeLineID      ThisTimeLineID;         /* NOTE(review): presumably the shared copy of the current timeline -- confirm */
 344
 345         /*
 346          * SharedRecoveryInProgress indicates if we're still in crash or archive
 347          * recovery.  Protected by info_lck.
 348          */
 349         bool            SharedRecoveryInProgress;
 350
 351         /*
 352          * During recovery, we keep a copy of the latest checkpoint record here.
 353          * Used by the background writer when it wants to create a restartpoint.
 354          *
 355          * Protected by info_lck.
 356          */
 357         XLogRecPtr      lastCheckPointRecPtr;
 358         CheckPoint      lastCheckPoint;
 359
 360         /* end+1 of the last record replayed (or being replayed) */
 361         XLogRecPtr      replayEndRecPtr;
 362
 363         slock_t         info_lck;               /* spinlock; locks shared variables shown above */
 364 } XLogCtlData;
 365
 366 static XLogCtlData *XLogCtl = NULL;
 367
 368 /*
 369  * We maintain an image of pg_control in shared memory.
 370  */
 371 static ControlFileData *ControlFile = NULL;
 372
 373 /*
 374  * Macros for managing XLogInsert state.  In most cases, the calling routine
 375  * has local copies of XLogCtl->Insert and/or XLogCtl->Insert.curridx,
 376  * so these are passed as parameters instead of being fetched via XLogCtl.
 377  */
 378
 379 /* Free space (bytes) remaining in the current xlog page buffer; Insert is an XLogCtlInsert pointer */
 380 #define INSERT_FREESPACE(Insert)  \
 381         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
 382
 383 /* Construct XLogRecPtr value for current insertion point: xlblocks[curridx] minus the page's free space */
 384 #define INSERT_RECPTR(recptr,Insert,curridx)  \
 385         ( \
 386           (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
 387           (recptr).xrecoff = \
 388                 XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
 389         )
 390
 391 #define PrevBufIdx(idx)         \
 392                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))   /* buffer index before idx, wrapping 0 -> XLogCacheBlck; idx is evaluated twice */
 393
 394 #define NextBufIdx(idx)         \
 395                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))   /* buffer index after idx, wrapping XLogCacheBlck -> 0; idx is evaluated twice */
 396
 397 /*
 398  * Private, possibly out-of-date copy of shared LogwrtResult.
 399  * See discussion above.
 400  */
 401 static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
 402
 403 /*
 404  * openLogFile is -1 or a kernel FD for an open log file segment.
 405  * When it's open, openLogOff is the current seek offset in the file.
 406  * openLogId/openLogSeg identify the segment.  These variables are only
 407  * used to write the XLOG, and so will normally refer to the active segment.
 408  */
 409 static int      openLogFile = -1;
 410 static uint32 openLogId = 0;
 411 static uint32 openLogSeg = 0;
 412 static uint32 openLogOff = 0;
 413
 414 /*
 415  * These variables are used similarly to the ones above, but for reading
 416  * the XLOG.  Note, however, that readOff generally represents the offset
 417  * of the page just read, not the seek position of the FD itself, which
 418  * will be just past that page.
 419  */
 420 static int      readFile = -1;
 421 static uint32 readId = 0;
 422 static uint32 readSeg = 0;
 423 static uint32 readOff = 0;
 424
 425 /* Buffer for currently read page (XLOG_BLCKSZ bytes) */
 426 static char *readBuf = NULL;
 427
 428 /* Buffer for current ReadRecord result (expandable) */
 429 static char *readRecordBuf = NULL;
 430 static uint32 readRecordBufSize = 0;
 431
 432 /* State information for XLOG reading */
 433 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 434 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 435 static XLogRecord *nextRecord = NULL;
 436 static TimeLineID lastPageTLI = 0;
 437
 438 static XLogRecPtr minRecoveryPoint;             /* local copy of
 439                                                                                  * ControlFile->minRecoveryPoint */
 440 static bool updateMinRecoveryPoint = true;
 441
 442 static bool InRedo = false;
 443
 444 /*
 445  * Flags set by interrupt handlers for later service in the redo loop.
 446  */
 447 static volatile sig_atomic_t got_SIGHUP = false;
 448 static volatile sig_atomic_t shutdown_requested = false;
 449
 450 /*
 451  * Flag set when executing a restore command, to tell SIGTERM signal handler
 452  * that it's safe to just proc_exit.
 453  */
 454 static volatile sig_atomic_t in_restore_command = false;
 455
 456
 457 static void XLogArchiveNotify(const char *xlog);
 458 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
 459 static bool XLogArchiveCheckDone(const char *xlog);
 460 static bool XLogArchiveIsBusy(const char *xlog);
 461 static void XLogArchiveCleanup(const char *xlog);
 462 static void readRecoveryCommandFile(void);
 463 static void exitArchiveRecovery(TimeLineID endTLI,
 464                                         uint32 endLogId, uint32 endLogSeg);
 465 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 466 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 467
 468 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 469                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 470 static bool AdvanceXLInsertBuffer(bool new_segment);
 471 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
 472 static int XLogFileInit(uint32 log, uint32 seg,
 473                          bool *use_existent, bool use_lock);
 474 static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
 475                                            bool find_free, int *max_advance,
 476                                            bool use_lock);
 477 static int      XLogFileOpen(uint32 log, uint32 seg);
 478 static int      XLogFileRead(uint32 log, uint32 seg, int emode);
 479 static void XLogFileClose(void);
 480 static bool RestoreArchivedFile(char *path, const char *xlogfname,
 481                                         const char *recovername, off_t expectedSize);
 482 static void ExecuteRecoveryEndCommand(void);
 483 static void PreallocXlogFiles(XLogRecPtr endptr);
 484 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
 485 static void ValidateXLOGDirectoryStructure(void);
 486 static void CleanupBackupHistory(void);
 487 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 488 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
 489 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 490 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 491 static List *readTimeLineHistory(TimeLineID targetTLI);
 492 static bool existsTimeLineHistory(TimeLineID probeTLI);
 493 static TimeLineID findNewestTimeLine(TimeLineID startTLI);
 494 static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 495                                          TimeLineID endTLI,
 496                                          uint32 endLogId, uint32 endLogSeg);
 497 static void WriteControlFile(void);
 498 static void ReadControlFile(void);
 499 static char *str_time(pg_time_t tnow);
 500
 501 #ifdef WAL_DEBUG
 502 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 503 #endif
 504 static void issue_xlog_fsync(void);
 505 static void pg_start_backup_callback(int code, Datum arg);
 506 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 507                                   XLogRecPtr *minRecoveryLoc);
 508 static void rm_redo_error_callback(void *arg);
 509 static int      get_sync_bit(int method);
 510
 511
 512 /*
 513  * Insert an XLOG record having the specified RMID and info bytes,
 514  * with the body of the record being the data chunk(s) described by
 515  * the rdata chain (see xlog.h for notes about rdata).
 516  *
 517  * Returns XLOG pointer to end of record (beginning of next record).
 518  * This can be used as LSN for data pages affected by the logged action.
 519  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 520  * before the data page can be written out.  This implements the basic
 521  * WAL rule "write the log before the data".)
 522  *
 523  * NB: this routine feels free to scribble on the XLogRecData structs,
 524  * though not on the data they reference.  This is OK since the XLogRecData
 525  * structs are always just temporaries in the calling code.
 526  */
 527 XLogRecPtr
 528 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 529 {
 530         XLogCtlInsert *Insert = &XLogCtl->Insert;
 531         XLogRecord *record;
 532         XLogContRecord *contrecord;
 533         XLogRecPtr      RecPtr;
 534         XLogRecPtr      WriteRqst;
 535         uint32          freespace;
 536         int                     curridx;
 537         XLogRecData *rdt;
 538         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 539         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 540         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 541         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 542         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 543         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 544         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 545         pg_crc32        rdata_crc;
 546         uint32          len,
 547                                 write_len;
 548         unsigned        i;
 549         bool            updrqst;
 550         bool            doPageWrites;
 551         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 552
 553         /* cross-check on whether we should be here or not */
 554         if (!XLogInsertAllowed())
 555                 elog(ERROR, "cannot make new WAL entries during recovery");
 556
 557         /* info's high bits are reserved for use by me */
 558         if (info & XLR_INFO_MASK)
 559                 elog(PANIC, "invalid xlog info mask %02X", info);
 560
 561         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 562
 563         /*
 564          * In bootstrap mode, we don't actually log anything but XLOG resources;
 565          * return a phony record pointer.
 566          */
 567         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 568         {
 569                 RecPtr.xlogid = 0;
 570                 RecPtr.xrecoff = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 571                 return RecPtr;
 572         }
 573
 574         /*
 575          * Here we scan the rdata chain, determine which buffers must be backed
 576          * up, and compute the CRC values for the data.  Note that the record
 577          * header isn't added into the CRC initially since we don't know the final
 578          * length or info bits quite yet.  Thus, the CRC will represent the CRC of
 579          * the whole record in the order "rdata, then backup blocks, then record
 580          * header".
 581          *
 582          * We may have to loop back to here if a race condition is detected below.
 583          * We could prevent the race by doing all this work while holding the
 584          * insert lock, but it seems better to avoid doing CRC calculations while
 585          * holding the lock.  This means we have to be careful about modifying the
 586          * rdata chain until we know we aren't going to loop back again.  The only
 587          * change we allow ourselves to make earlier is to set rdt->data = NULL in
 588          * chain items we have decided we will have to back up the whole buffer
 589          * for.  This is OK because we will certainly decide the same thing again
 590          * for those items if we do it over; doing it here saves an extra pass
 591          * over the chain later.
 592          */
 593 begin:;
 594         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 595         {
 596                 dtbuf[i] = InvalidBuffer;
 597                 dtbuf_bkp[i] = false;
 598         }
 599
 600         /*
 601          * Decide if we need to do full-page writes in this XLOG record: true if
 602          * full_page_writes is on or we have a PITR request for it.  Since we
 603          * don't yet have the insert lock, forcePageWrites could change under us,
 604          * but we'll recheck it once we have the lock.
 605          */
 606         doPageWrites = fullPageWrites || Insert->forcePageWrites;
 607
 608         INIT_CRC32(rdata_crc);
 609         len = 0;
 610         for (rdt = rdata;;)
 611         {
 612                 if (rdt->buffer == InvalidBuffer)
 613                 {
 614                         /* Simple data, just include it */
 615                         len += rdt->len;
 616                         COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 617                 }
 618                 else
 619                 {
 620                         /* Find info for buffer */
 621                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 622                         {
 623                                 if (rdt->buffer == dtbuf[i])
 624                                 {
 625                                         /* Buffer already referenced by earlier chain item */
 626                                         if (dtbuf_bkp[i])
 627                                                 rdt->data = NULL;
 628                                         else if (rdt->data)
 629                                         {
 630                                                 len += rdt->len;
 631                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 632                                         }
 633                                         break;
 634                                 }
 635                                 if (dtbuf[i] == InvalidBuffer)
 636                                 {
 637                                         /* OK, put it in this slot */
 638                                         dtbuf[i] = rdt->buffer;
 639                                         if (XLogCheckBuffer(rdt, doPageWrites,
 640                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 641                                         {
 642                                                 dtbuf_bkp[i] = true;
 643                                                 rdt->data = NULL;
 644                                         }
 645                                         else if (rdt->data)
 646                                         {
 647                                                 len += rdt->len;
 648                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 649                                         }
 650                                         break;
 651                                 }
 652                         }
 653                         if (i >= XLR_MAX_BKP_BLOCKS)
 654                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 655                                          XLR_MAX_BKP_BLOCKS);
 656                 }
 657                 /* Break out of loop when rdt points to last chain item */
 658                 if (rdt->next == NULL)
 659                         break;
 660                 rdt = rdt->next;
 661         }
 662
 663         /*
 664          * Now add the backup block headers and data into the CRC
 665          */
 666         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 667         {
 668                 if (dtbuf_bkp[i])
 669                 {
 670                         BkpBlock   *bkpb = &(dtbuf_xlg[i]);
 671                         char       *page;
 672
 673                         COMP_CRC32(rdata_crc,
 674                                            (char *) bkpb,
 675                                            sizeof(BkpBlock));
 676                         page = (char *) BufferGetBlock(dtbuf[i]);
 677                         if (bkpb->hole_length == 0)
 678                         {
 679                                 COMP_CRC32(rdata_crc,
 680                                                    page,
 681                                                    BLCKSZ);
 682                         }
 683                         else
 684                         {
 685                                 /* must skip the hole */
 686                                 COMP_CRC32(rdata_crc,
 687                                                    page,
 688                                                    bkpb->hole_offset);
 689                                 COMP_CRC32(rdata_crc,
 690                                                    page + (bkpb->hole_offset + bkpb->hole_length),
 691                                                    BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
 692                         }
 693                 }
 694         }
 695
 696         /*
 697          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 698          * error checking in ReadRecord.  This means that all callers of
 699          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 700          * make an exception for XLOG SWITCH records because we don't want them to
 701          * ever cross a segment boundary.
 702          */
 703         if (len == 0 && !isLogSwitch)
 704                 elog(PANIC, "invalid xlog record length %u", len);
 705
 706         START_CRIT_SECTION();
 707
 708         /* Now wait to get insert lock */
 709         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 710
 711         /*
 712          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
 713          * back and recompute everything.  This can only happen just after a
 714          * checkpoint, so it's better to be slow in this case and fast otherwise.
 715          *
 716          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 717          * affect the contents of the XLOG record, so we'll update our local copy
 718          * but not force a recomputation.
 719          */
 720         if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
 721         {
 722                 Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
 723                 RedoRecPtr = Insert->RedoRecPtr;
 724
 725                 if (doPageWrites)
 726                 {
 727                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 728                         {
 729                                 if (dtbuf[i] == InvalidBuffer)
 730                                         continue;
 731                                 if (dtbuf_bkp[i] == false &&
 732                                         XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 733                                 {
 734                                         /*
 735                                          * Oops, this buffer now needs to be backed up, but we
 736                                          * didn't think so above.  Start over.
 737                                          */
 738                                         LWLockRelease(WALInsertLock);
 739                                         END_CRIT_SECTION();
 740                                         goto begin;
 741                                 }
 742                         }
 743                 }
 744         }
 745
 746         /*
 747          * Also check to see if forcePageWrites was just turned on; if we weren't
 748          * already doing full-page writes then go back and recompute. (If it was
 749          * just turned off, we could recompute the record without full pages, but
 750          * we choose not to bother.)
 751          */
 752         if (Insert->forcePageWrites && !doPageWrites)
 753         {
 754                 /* Oops, must redo it with full-page data */
 755                 LWLockRelease(WALInsertLock);
 756                 END_CRIT_SECTION();
 757                 goto begin;
 758         }
 759
 760         /*
 761          * Make additional rdata chain entries for the backup blocks, so that we
 762          * don't need to special-case them in the write loop.  Note that we have
 763          * now irrevocably changed the input rdata chain.  At the exit of this
 764          * loop, write_len includes the backup block data.
 765          *
 766          * Also set the appropriate info bits to show which buffers were backed
 767          * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
 768          * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
 769          */
 770         write_len = len;
 771         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 772         {
 773                 BkpBlock   *bkpb;
 774                 char       *page;
 775
 776                 if (!dtbuf_bkp[i])
 777                         continue;
 778
 779                 info |= XLR_SET_BKP_BLOCK(i);
 780
 781                 bkpb = &(dtbuf_xlg[i]);
 782                 page = (char *) BufferGetBlock(dtbuf[i]);
 783
 784                 rdt->next = &(dtbuf_rdt1[i]);
 785                 rdt = rdt->next;
 786
 787                 rdt->data = (char *) bkpb;
 788                 rdt->len = sizeof(BkpBlock);
 789                 write_len += sizeof(BkpBlock);
 790
 791                 rdt->next = &(dtbuf_rdt2[i]);
 792                 rdt = rdt->next;
 793
 794                 if (bkpb->hole_length == 0)
 795                 {
 796                         rdt->data = page;
 797                         rdt->len = BLCKSZ;
 798                         write_len += BLCKSZ;
 799                         rdt->next = NULL;
 800                 }
 801                 else
 802                 {
 803                         /* must skip the hole */
 804                         rdt->data = page;
 805                         rdt->len = bkpb->hole_offset;
 806                         write_len += bkpb->hole_offset;
 807
 808                         rdt->next = &(dtbuf_rdt3[i]);
 809                         rdt = rdt->next;
 810
 811                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
 812                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
 813                         write_len += rdt->len;
 814                         rdt->next = NULL;
 815                 }
 816         }
 817
 818         /*
 819          * If we backed up any full blocks and online backup is not in progress,
 820          * mark the backup blocks as removable.  This allows the WAL archiver to
 821          * know whether it is safe to compress archived WAL data by transforming
 822          * full-block records into the non-full-block format.
 823          *
 824          * Note: we could just set the flag whenever !forcePageWrites, but
 825          * defining it like this leaves the info bit free for some potential other
 826          * use in records without any backup blocks.
 827          */
 828         if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
 829                 info |= XLR_BKP_REMOVABLE;
 830
 831         /*
 832          * If there isn't enough space on the current XLOG page for a record
 833          * header, advance to the next page (leaving the unused space as zeroes).
 834          */
 835         updrqst = false;
 836         freespace = INSERT_FREESPACE(Insert);
 837         if (freespace < SizeOfXLogRecord)
 838         {
 839                 updrqst = AdvanceXLInsertBuffer(false);
 840                 freespace = INSERT_FREESPACE(Insert);
 841         }
 842
 843         /* Compute record's XLOG location */
 844         curridx = Insert->curridx;
 845         INSERT_RECPTR(RecPtr, Insert, curridx);
 846
 847         /*
 848          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
 849          * segment, we need not insert it (and don't want to because we'd like
 850          * consecutive switch requests to be no-ops).  Instead, make sure
 851          * everything is written and flushed through the end of the prior segment,
 852          * and return the prior segment's end address.
 853          */
 854         if (isLogSwitch &&
 855                 (RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
 856         {
 857                 /* We can release insert lock immediately */
 858                 LWLockRelease(WALInsertLock);
 859
 860                 RecPtr.xrecoff -= SizeOfXLogLongPHD;
 861                 if (RecPtr.xrecoff == 0)
 862                 {
 863                         /* crossing a logid boundary */
 864                         RecPtr.xlogid -= 1;
 865                         RecPtr.xrecoff = XLogFileSize;
 866                 }
 867
 868                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 869                 LogwrtResult = XLogCtl->Write.LogwrtResult;
 870                 if (!XLByteLE(RecPtr, LogwrtResult.Flush))
 871                 {
 872                         XLogwrtRqst FlushRqst;
 873
 874                         FlushRqst.Write = RecPtr;
 875                         FlushRqst.Flush = RecPtr;
 876                         XLogWrite(FlushRqst, false, false);
 877                 }
 878                 LWLockRelease(WALWriteLock);
 879
 880                 END_CRIT_SECTION();
 881
 882                 return RecPtr;
 883         }
 884
 885         /* Insert record header */
 886
 887         record = (XLogRecord *) Insert->currpos;
 888         record->xl_prev = Insert->PrevRecord;
 889         record->xl_xid = GetCurrentTransactionIdIfAny();
 890         record->xl_tot_len = SizeOfXLogRecord + write_len;
 891         record->xl_len = len;           /* doesn't include backup blocks */
 892         record->xl_info = info;
 893         record->xl_rmid = rmid;
 894
 895         /* Now we can finish computing the record's CRC */
 896         COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
 897                            SizeOfXLogRecord - sizeof(pg_crc32));
 898         FIN_CRC32(rdata_crc);
 899         record->xl_crc = rdata_crc;
 900
 901 #ifdef WAL_DEBUG
 902         if (XLOG_DEBUG)
 903         {
 904                 StringInfoData buf;
 905
 906                 initStringInfo(&buf);
 907                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
 908                                                  RecPtr.xlogid, RecPtr.xrecoff);
 909                 xlog_outrec(&buf, record);
 910                 if (rdata->data != NULL)
 911                 {
 912                         appendStringInfo(&buf, " - ");
 913                         RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
 914                 }
 915                 elog(LOG, "%s", buf.data);
 916                 pfree(buf.data);
 917         }
 918 #endif
 919
 920         /* Record begin of record in appropriate places */
 921         ProcLastRecPtr = RecPtr;
 922         Insert->PrevRecord = RecPtr;
 923
 924         Insert->currpos += SizeOfXLogRecord;
 925         freespace -= SizeOfXLogRecord;
 926
 927         /*
 928          * Append the data, including backup blocks if any
 929          */
 930         while (write_len)
 931         {
 932                 while (rdata->data == NULL)
 933                         rdata = rdata->next;
 934
 935                 if (freespace > 0)
 936                 {
 937                         if (rdata->len > freespace)
 938                         {
 939                                 memcpy(Insert->currpos, rdata->data, freespace);
 940                                 rdata->data += freespace;
 941                                 rdata->len -= freespace;
 942                                 write_len -= freespace;
 943                         }
 944                         else
 945                         {
 946                                 memcpy(Insert->currpos, rdata->data, rdata->len);
 947                                 freespace -= rdata->len;
 948                                 write_len -= rdata->len;
 949                                 Insert->currpos += rdata->len;
 950                                 rdata = rdata->next;
 951                                 continue;
 952                         }
 953                 }
 954
 955                 /* Use next buffer */
 956                 updrqst = AdvanceXLInsertBuffer(false);
 957                 curridx = Insert->curridx;
 958                 /* Insert cont-record header */
 959                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
 960                 contrecord = (XLogContRecord *) Insert->currpos;
 961                 contrecord->xl_rem_len = write_len;
 962                 Insert->currpos += SizeOfXLogContRecord;
 963                 freespace = INSERT_FREESPACE(Insert);
 964         }
 965
 966         /* Ensure next record will be properly aligned */
 967         Insert->currpos = (char *) Insert->currpage +
 968                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
 969         freespace = INSERT_FREESPACE(Insert);
 970
 971         /*
 972          * The recptr I return is the beginning of the *next* record. This will be
 973          * stored as LSN for changed data pages...
 974          */
 975         INSERT_RECPTR(RecPtr, Insert, curridx);
 976
 977         /*
 978          * If the record is an XLOG_SWITCH, we must now write and flush all the
 979          * existing data, and then forcibly advance to the start of the next
 980          * segment.  It's not good to do this I/O while holding the insert lock,
 981          * but there seems too much risk of confusion if we try to release the
 982          * lock sooner.  Fortunately xlog switch needn't be a high-performance
 983          * operation anyway...
 984          */
 985         if (isLogSwitch)
 986         {
 987                 XLogCtlWrite *Write = &XLogCtl->Write;
 988                 XLogwrtRqst FlushRqst;
 989                 XLogRecPtr      OldSegEnd;
 990
 991                 TRACE_POSTGRESQL_XLOG_SWITCH();
 992
 993                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 994
 995                 /*
 996                  * Flush through the end of the page containing XLOG_SWITCH, and
 997                  * perform end-of-segment actions (eg, notifying archiver).
 998                  */
 999                 WriteRqst = XLogCtl->xlblocks[curridx];
1000                 FlushRqst.Write = WriteRqst;
1001                 FlushRqst.Flush = WriteRqst;
1002                 XLogWrite(FlushRqst, false, true);
1003
1004                 /* Set up the next buffer as first page of next segment */
1005                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
1006                 (void) AdvanceXLInsertBuffer(true);
1007
1008                 /* There should be no unwritten data */
1009                 curridx = Insert->curridx;
1010                 Assert(curridx == Write->curridx);
1011
1012                 /* Compute end address of old segment */
1013                 OldSegEnd = XLogCtl->xlblocks[curridx];
1014                 OldSegEnd.xrecoff -= XLOG_BLCKSZ;
1015                 if (OldSegEnd.xrecoff == 0)
1016                 {
1017                         /* crossing a logid boundary */
1018                         OldSegEnd.xlogid -= 1;
1019                         OldSegEnd.xrecoff = XLogFileSize;
1020                 }
1021
1022                 /* Make it look like we've written and synced all of old segment */
1023                 LogwrtResult.Write = OldSegEnd;
1024                 LogwrtResult.Flush = OldSegEnd;
1025
1026                 /*
1027                  * Update shared-memory status --- this code should match XLogWrite
1028                  */
1029                 {
1030                         /* use volatile pointer to prevent code rearrangement */
1031                         volatile XLogCtlData *xlogctl = XLogCtl;
1032
1033                         SpinLockAcquire(&xlogctl->info_lck);
1034                         xlogctl->LogwrtResult = LogwrtResult;
1035                         if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1036                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1037                         if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1038                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1039                         SpinLockRelease(&xlogctl->info_lck);
1040                 }
1041
1042                 Write->LogwrtResult = LogwrtResult;
1043
1044                 LWLockRelease(WALWriteLock);
1045
1046                 updrqst = false;                /* done already */
1047         }
1048         else
1049         {
1050                 /* normal case, ie not xlog switch */
1051
1052                 /* Need to update shared LogwrtRqst if some block was filled up */
1053                 if (freespace < SizeOfXLogRecord)
1054                 {
1055                         /* curridx is filled and available for writing out */
1056                         updrqst = true;
1057                 }
1058                 else
1059                 {
1060                         /* if updrqst already set, write through end of previous buf */
1061                         curridx = PrevBufIdx(curridx);
1062                 }
1063                 WriteRqst = XLogCtl->xlblocks[curridx];
1064         }
1065
1066         LWLockRelease(WALInsertLock);
1067
1068         if (updrqst)
1069         {
1070                 /* use volatile pointer to prevent code rearrangement */
1071                 volatile XLogCtlData *xlogctl = XLogCtl;
1072
1073                 SpinLockAcquire(&xlogctl->info_lck);
1074                 /* advance global request to include new block(s) */
1075                 if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
1076                         xlogctl->LogwrtRqst.Write = WriteRqst;
1077                 /* update local result copy while I have the chance */
1078                 LogwrtResult = xlogctl->LogwrtResult;
1079                 SpinLockRelease(&xlogctl->info_lck);
1080         }
1081
1082         XactLastRecEnd = RecPtr;
1083
1084         END_CRIT_SECTION();
1085
1086         return RecPtr;
1087 }
1088
1089 /*
1090  * Determine whether the buffer referenced by an XLogRecData item has to
1091  * be backed up, and if so fill a BkpBlock struct for it.  In any case
1092  * save the buffer's LSN at *lsn.
1093  */
1094 static bool
1095 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1096                                 XLogRecPtr *lsn, BkpBlock *bkpb)
1097 {
1098         Page            page;
1099
1100         page = BufferGetPage(rdata->buffer);
1101
1102         /*
1103          * XXX We assume page LSN is first data on *every* page that can be passed
1104          * to XLogInsert, whether it otherwise has the standard page layout or
1105          * not.
1106          */
1107         *lsn = PageGetLSN(page);
1108
1109         if (doPageWrites &&
1110                 XLByteLE(PageGetLSN(page), RedoRecPtr))
1111         {
1112                 /*
1113                  * The page needs to be backed up, so set up *bkpb
1114                  */
1115                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1116
1117                 if (rdata->buffer_std)
1118                 {
1119                         /* Assume we can omit data between pd_lower and pd_upper */
1120                         uint16          lower = ((PageHeader) page)->pd_lower;
1121                         uint16          upper = ((PageHeader) page)->pd_upper;
1122
1123                         if (lower >= SizeOfPageHeaderData &&
1124                                 upper > lower &&
1125                                 upper <= BLCKSZ)
1126                         {
1127                                 bkpb->hole_offset = lower;
1128                                 bkpb->hole_length = upper - lower;
1129                         }
1130                         else
1131                         {
1132                                 /* No "hole" to compress out */
1133                                 bkpb->hole_offset = 0;
1134                                 bkpb->hole_length = 0;
1135                         }
1136                 }
1137                 else
1138                 {
1139                         /* Not a standard page header, don't try to eliminate "hole" */
1140                         bkpb->hole_offset = 0;
1141                         bkpb->hole_length = 0;
1142                 }
1143
1144                 return true;                    /* buffer requires backup */
1145         }
1146
1147         return false;                           /* buffer does not need to be backed up */
1148 }
1149
1150 /*
1151  * XLogArchiveNotify
1152  *
1153  * Create an archive notification file
1154  *
1155  * The name of the notification file is the message that will be picked up
1156  * by the archiver, e.g. we write 0000000100000001000000C6.ready
1157  * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1158  * then when complete, rename it to 0000000100000001000000C6.done
1159  */
1160 static void
1161 XLogArchiveNotify(const char *xlog)
1162 {
1163         char            archiveStatusPath[MAXPGPATH];
1164         FILE       *fd;
1165
1166         /* insert an otherwise empty file called <XLOG>.ready */
1167         StatusFilePath(archiveStatusPath, xlog, ".ready");
1168         fd = AllocateFile(archiveStatusPath, "w");
1169         if (fd == NULL)
1170         {
1171                 ereport(LOG,
1172                                 (errcode_for_file_access(),
1173                                  errmsg("could not create archive status file \"%s\": %m",
1174                                                 archiveStatusPath)));
1175                 return;
1176         }
1177         if (FreeFile(fd))
1178         {
1179                 ereport(LOG,
1180                                 (errcode_for_file_access(),
1181                                  errmsg("could not write archive status file \"%s\": %m",
1182                                                 archiveStatusPath)));
1183                 return;
1184         }
1185
1186         /* Notify archiver that it's got something to do */
1187         if (IsUnderPostmaster)
1188                 SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
1189 }
1190
1191 /*
1192  * Convenience routine to notify using log/seg representation of filename
1193  */
1194 static void
1195 XLogArchiveNotifySeg(uint32 log, uint32 seg)
1196 {
1197         char            xlog[MAXFNAMELEN];
1198
1199         XLogFileName(xlog, ThisTimeLineID, log, seg);
1200         XLogArchiveNotify(xlog);
1201 }
1202
1203 /*
1204  * XLogArchiveCheckDone
1205  *
1206  * This is called when we are ready to delete or recycle an old XLOG segment
1207  * file or backup history file.  If it is okay to delete it then return true.
1208  * If it is not time to delete it, make sure a .ready file exists, and return
1209  * false.
1210  *
1211  * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1212  * then return false; else create <XLOG>.ready and return false.
1213  *
1214  * The reason we do things this way is so that if the original attempt to
1215  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1216  */
1217 static bool
1218 XLogArchiveCheckDone(const char *xlog)
1219 {
1220         char            archiveStatusPath[MAXPGPATH];
1221         struct stat stat_buf;
1222
1223         /* Always deletable if archiving is off */
1224         if (!XLogArchivingActive())
1225                 return true;
1226
1227         /* First check for .done --- this means archiver is done with it */
1228         StatusFilePath(archiveStatusPath, xlog, ".done");
1229         if (stat(archiveStatusPath, &stat_buf) == 0)
1230                 return true;
1231
1232         /* check for .ready --- this means archiver is still busy with it */
1233         StatusFilePath(archiveStatusPath, xlog, ".ready");
1234         if (stat(archiveStatusPath, &stat_buf) == 0)
1235                 return false;
1236
1237         /* Race condition --- maybe archiver just finished, so recheck */
1238         StatusFilePath(archiveStatusPath, xlog, ".done");
1239         if (stat(archiveStatusPath, &stat_buf) == 0)
1240                 return true;
1241
1242         /* Retry creation of the .ready file */
1243         XLogArchiveNotify(xlog);
1244         return false;
1245 }
1246
1247 /*
1248  * XLogArchiveIsBusy
1249  *
1250  * Check to see if an XLOG segment file is still unarchived.
1251  * This is almost but not quite the inverse of XLogArchiveCheckDone: in
1252  * the first place we aren't chartered to recreate the .ready file, and
1253  * in the second place we should consider that if the file is already gone
1254  * then it's not busy.  (This check is needed to handle the race condition
1255  * that a checkpoint already deleted the no-longer-needed file.)
1256  */
1257 static bool
1258 XLogArchiveIsBusy(const char *xlog)
1259 {
1260         char            archiveStatusPath[MAXPGPATH];
1261         struct stat stat_buf;
1262
1263         /* First check for .done --- this means archiver is done with it */
1264         StatusFilePath(archiveStatusPath, xlog, ".done");
1265         if (stat(archiveStatusPath, &stat_buf) == 0)
1266                 return false;
1267
1268         /* check for .ready --- this means archiver is still busy with it */
1269         StatusFilePath(archiveStatusPath, xlog, ".ready");
1270         if (stat(archiveStatusPath, &stat_buf) == 0)
1271                 return true;
1272
1273         /* Race condition --- maybe archiver just finished, so recheck */
1274         StatusFilePath(archiveStatusPath, xlog, ".done");
1275         if (stat(archiveStatusPath, &stat_buf) == 0)
1276                 return false;
1277
1278         /*
1279          * Check to see if the WAL file has been removed by checkpoint, which
1280          * implies it has already been archived, and explains why we can't see a
1281          * status file for it.
1282          */
1283         snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
1284         if (stat(archiveStatusPath, &stat_buf) != 0 &&
1285                 errno == ENOENT)
1286                 return false;
1287
1288         return true;
1289 }
1290
1291 /*
1292  * XLogArchiveCleanup
1293  *
1294  * Cleanup archive notification file(s) for a particular xlog segment
1295  */
1296 static void
1297 XLogArchiveCleanup(const char *xlog)
1298 {
1299         char            archiveStatusPath[MAXPGPATH];
1300
1301         /* Remove the .done file */
1302         StatusFilePath(archiveStatusPath, xlog, ".done");
1303         unlink(archiveStatusPath);
1304         /* should we complain about failure? */
1305
1306         /* Remove the .ready file if present --- normally it shouldn't be */
1307         StatusFilePath(archiveStatusPath, xlog, ".ready");
1308         unlink(archiveStatusPath);
1309         /* should we complain about failure? */
1310 }
1311
1312 /*
1313  * Advance the Insert state to the next buffer page, writing out the next
1314  * buffer if it still contains unwritten data.
1315  *
1316  * If new_segment is TRUE then we set up the next buffer page as the first
1317  * page of the next xlog segment file, possibly but not usually the next
1318  * consecutive file page.
1319  *
1320  * The global LogwrtRqst.Write pointer needs to be advanced to include the
1321  * just-filled page.  If we can do this for free (without an extra lock),
1322  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
1323  * request update still needs to be done, FALSE if we did it internally.
1324  *
1325  * Must be called with WALInsertLock held.
1326  */
1327 static bool
1328 AdvanceXLInsertBuffer(bool new_segment)
1329 {
1330         XLogCtlInsert *Insert = &XLogCtl->Insert;
1331         XLogCtlWrite *Write = &XLogCtl->Write;
1332         int                     nextidx = NextBufIdx(Insert->curridx);
1333         bool            update_needed = true;
1334         XLogRecPtr      OldPageRqstPtr;
1335         XLogwrtRqst WriteRqst;
1336         XLogRecPtr      NewPageEndPtr;
1337         XLogPageHeader NewPage;
1338
1339         /* Use Insert->LogwrtResult copy if it's more fresh */
1340         if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
1341                 LogwrtResult = Insert->LogwrtResult;
1342
1343         /*
1344          * Get ending-offset of the buffer page we need to replace (this may be
1345          * zero if the buffer hasn't been used yet).  Fall through if it's already
1346          * written out.
1347          */
1348         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1349         if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1350         {
1351                 /* nope, got work to do... */
1352                 XLogRecPtr      FinishedPageRqstPtr;
1353
1354                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1355
1356                 /* Before waiting, get info_lck and update LogwrtResult */
1357                 {
1358                         /* use volatile pointer to prevent code rearrangement */
1359                         volatile XLogCtlData *xlogctl = XLogCtl;
1360
1361                         SpinLockAcquire(&xlogctl->info_lck);
1362                         if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
1363                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
1364                         LogwrtResult = xlogctl->LogwrtResult;
1365                         SpinLockRelease(&xlogctl->info_lck);
1366                 }
1367
1368                 update_needed = false;  /* Did the shared-request update */
1369
1370                 if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1371                 {
1372                         /* OK, someone wrote it already */
1373                         Insert->LogwrtResult = LogwrtResult;
1374                 }
1375                 else
1376                 {
1377                         /* Must acquire write lock */
1378                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1379                         LogwrtResult = Write->LogwrtResult;
1380                         if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1381                         {
1382                                 /* OK, someone wrote it already */
1383                                 LWLockRelease(WALWriteLock);
1384                                 Insert->LogwrtResult = LogwrtResult;
1385                         }
1386                         else
1387                         {
1388                                 /*
1389                                  * Have to write buffers while holding insert lock. This is
1390                                  * not good, so only write as much as we absolutely must.
1391                                  */
1392                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
1393                                 WriteRqst.Write = OldPageRqstPtr;
1394                                 WriteRqst.Flush.xlogid = 0;
1395                                 WriteRqst.Flush.xrecoff = 0;
1396                                 XLogWrite(WriteRqst, false, false);
1397                                 LWLockRelease(WALWriteLock);
1398                                 Insert->LogwrtResult = LogwrtResult;
1399                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1400                         }
1401                 }
1402         }
1403
1404         /*
1405          * Now the next buffer slot is free and we can set it up to be the next
1406          * output page.
1407          */
1408         NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1409
1410         if (new_segment)
1411         {
1412                 /* force it to a segment start point */
1413                 NewPageEndPtr.xrecoff += XLogSegSize - 1;
1414                 NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
1415         }
1416
1417         if (NewPageEndPtr.xrecoff >= XLogFileSize)
1418         {
1419                 /* crossing a logid boundary */
1420                 NewPageEndPtr.xlogid += 1;
1421                 NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1422         }
1423         else
1424                 NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1425         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1426         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1427
1428         Insert->curridx = nextidx;
1429         Insert->currpage = NewPage;
1430
1431         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
1432
1433         /*
1434          * Be sure to re-zero the buffer so that bytes beyond what we've written
1435          * will look like zeroes and not valid XLOG records...
1436          */
1437         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1438
1439         /*
1440          * Fill the new page's header
1441          */
1442         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1443
1444         /* NewPage->xlp_info = 0; */    /* done by memset */
1445         NewPage   ->xlp_tli = ThisTimeLineID;
1446         NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1447         NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
1448
1449         /*
1450          * If first page of an XLOG segment file, make it a long header.
1451          */
1452         if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
1453         {
1454                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1455
1456                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
1457                 NewLongPage->xlp_seg_size = XLogSegSize;
1458                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1459                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
1460
1461                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1462         }
1463
1464         return update_needed;
1465 }
1466
1467 /*
1468  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1469  *
1470  * Caller must have just finished filling the open log file (so that
1471  * openLogId/openLogSeg are valid).  We measure the distance from RedoRecPtr
1472  * to the open log file and see if that exceeds CheckPointSegments.
1473  *
1474  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1475  */
1476 static bool
1477 XLogCheckpointNeeded(void)
1478 {
1479         /*
1480          * A straight computation of segment number could overflow 32 bits. Rather
1481          * than assuming we have working 64-bit arithmetic, we compare the
1482          * highest-order bits separately, and force a checkpoint immediately when
1483          * they change.
1484          */
1485         uint32          old_segno,
1486                                 new_segno;
1487         uint32          old_highbits,
1488                                 new_highbits;
1489
1490         old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
1491                 (RedoRecPtr.xrecoff / XLogSegSize);
1492         old_highbits = RedoRecPtr.xlogid / XLogSegSize;
1493         new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + openLogSeg;
1494         new_highbits = openLogId / XLogSegSize;
1495         if (new_highbits != old_highbits ||
1496                 new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
1497                 return true;
1498         return false;
1499 }
1500
1501 /*
1502  * Write and/or fsync the log at least as far as WriteRqst indicates.
1503  *
1504  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1505  * may stop at any convenient boundary (such as a cache or logfile boundary).
1506  * This option allows us to avoid uselessly issuing multiple writes when a
1507  * single one would do.
1508  *
1509  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1510  * perform end-of-segment actions after writing the last page, even if
1511  * it's not physically the end of its segment.  (NB: this will work properly
1512  * only if caller specifies WriteRqst == page-end and flexible == false,
1513  * and there is some data to write.)
1514  *
1515  * Must be called with WALWriteLock held.
1516  */
1517 static void
1518 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1519 {
1520         XLogCtlWrite *Write = &XLogCtl->Write;
1521         bool            ispartialpage;
1522         bool            last_iteration;
1523         bool            finishing_seg;
1524         bool            use_existent;
1525         int                     curridx;
1526         int                     npages;
1527         int                     startidx;
1528         uint32          startoffset;
1529
1530         /* We should always be inside a critical section here */
1531         Assert(CritSectionCount > 0);
1532
1533         /*
1534          * Update local LogwrtResult (caller probably did this already, but...)
1535          */
1536         LogwrtResult = Write->LogwrtResult;
1537
1538         /*
1539          * Since successive pages in the xlog cache are consecutively allocated,
1540          * we can usually gather multiple pages together and issue just one
1541          * write() call.  npages is the number of pages we have determined can be
1542          * written together; startidx is the cache block index of the first one,
1543          * and startoffset is the file offset at which it should go. The latter
1544          * two variables are only valid when npages > 0, but we must initialize
1545          * all of them to keep the compiler quiet.
1546          */
1547         npages = 0;
1548         startidx = 0;
1549         startoffset = 0;
1550
1551         /*
1552          * Within the loop, curridx is the cache block index of the page to
1553          * consider writing.  We advance Write->curridx only after successfully
1554          * writing pages.  (Right now, this refinement is useless since we are
1555          * going to PANIC if any error occurs anyway; but someday it may come in
1556          * useful.)
1557          */
1558         curridx = Write->curridx;
1559
1560         while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1561         {
1562                 /*
1563                  * Make sure we're not ahead of the insert process.  This could happen
1564                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1565                  * last page that's been initialized by AdvanceXLInsertBuffer.
1566                  */
1567                 if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1568                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1569                                  LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1570                                  XLogCtl->xlblocks[curridx].xlogid,
1571                                  XLogCtl->xlblocks[curridx].xrecoff);
1572
1573                 /* Advance LogwrtResult.Write to end of current buffer page */
1574                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1575                 ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
1576
1577                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1578                 {
1579                         /*
1580                          * Switch to new logfile segment.  We cannot have any pending
1581                          * pages here (since we dump what we have at segment end).
1582                          */
1583                         Assert(npages == 0);
1584                         if (openLogFile >= 0)
1585                                 XLogFileClose();
1586                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1587
1588                         /* create/use new log file */
1589                         use_existent = true;
1590                         openLogFile = XLogFileInit(openLogId, openLogSeg,
1591                                                                            &use_existent, true);
1592                         openLogOff = 0;
1593                 }
1594
1595                 /* Make sure we have the current logfile open */
1596                 if (openLogFile < 0)
1597                 {
1598                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1599                         openLogFile = XLogFileOpen(openLogId, openLogSeg);
1600                         openLogOff = 0;
1601                 }
1602
1603                 /* Add current page to the set of pending pages-to-dump */
1604                 if (npages == 0)
1605                 {
1606                         /* first of group */
1607                         startidx = curridx;
1608                         startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1609                 }
1610                 npages++;
1611
1612                 /*
1613                  * Dump the set if this will be the last loop iteration, or if we are
1614                  * at the last page of the cache area (since the next page won't be
1615                  * contiguous in memory), or if we are at the end of the logfile
1616                  * segment.
1617                  */
1618                 last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);
1619
1620                 finishing_seg = !ispartialpage &&
1621                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1622
1623                 if (last_iteration ||
1624                         curridx == XLogCtl->XLogCacheBlck ||
1625                         finishing_seg)
1626                 {
1627                         char       *from;
1628                         Size            nbytes;
1629
1630                         /* Need to seek in the file? */
1631                         if (openLogOff != startoffset)
1632                         {
1633                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1634                                         ereport(PANIC,
1635                                                         (errcode_for_file_access(),
1636                                                          errmsg("could not seek in log file %u, "
1637                                                                         "segment %u to offset %u: %m",
1638                                                                         openLogId, openLogSeg, startoffset)));
1639                                 openLogOff = startoffset;
1640                         }
1641
1642                         /* OK to write the page(s) */
1643                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1644                         nbytes = npages * (Size) XLOG_BLCKSZ;
1645                         errno = 0;
1646                         if (write(openLogFile, from, nbytes) != nbytes)
1647                         {
1648                                 /* if write didn't set errno, assume no disk space */
1649                                 if (errno == 0)
1650                                         errno = ENOSPC;
1651                                 ereport(PANIC,
1652                                                 (errcode_for_file_access(),
1653                                                  errmsg("could not write to log file %u, segment %u "
1654                                                                 "at offset %u, length %lu: %m",
1655                                                                 openLogId, openLogSeg,
1656                                                                 openLogOff, (unsigned long) nbytes)));
1657                         }
1658
1659                         /* Update state for write */
1660                         openLogOff += nbytes;
1661                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1662                         npages = 0;
1663
1664                         /*
1665                          * If we just wrote the whole last page of a logfile segment,
1666                          * fsync the segment immediately.  This avoids having to go back
1667                          * and re-open prior segments when an fsync request comes along
1668                          * later. Doing it here ensures that one and only one backend will
1669                          * perform this fsync.
1670                          *
1671                          * We also do this if this is the last page written for an xlog
1672                          * switch.
1673                          *
1674                          * This is also the right place to notify the Archiver that the
1675                          * segment is ready to copy to archival storage, and to update the
1676                          * timer for archive_timeout, and to signal for a checkpoint if
1677                          * too many logfile segments have been used since the last
1678                          * checkpoint.
1679                          */
1680                         if (finishing_seg || (xlog_switch && last_iteration))
1681                         {
1682                                 issue_xlog_fsync();
1683                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1684
1685                                 if (XLogArchivingActive())
1686                                         XLogArchiveNotifySeg(openLogId, openLogSeg);
1687
1688                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1689
1690                                 /*
1691                                  * Signal bgwriter to start a checkpoint if we've consumed too
1692                                  * much xlog since the last one.  For speed, we first check
1693                                  * using the local copy of RedoRecPtr, which might be out of
1694                                  * date; if it looks like a checkpoint is needed, forcibly
1695                                  * update RedoRecPtr and recheck.
1696                                  */
1697                                 if (IsUnderPostmaster &&
1698                                         XLogCheckpointNeeded())
1699                                 {
1700                                         (void) GetRedoRecPtr();
1701                                         if (XLogCheckpointNeeded())
1702                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1703                                 }
1704                         }
1705                 }
1706
1707                 if (ispartialpage)
1708                 {
1709                         /* Only asked to write a partial page */
1710                         LogwrtResult.Write = WriteRqst.Write;
1711                         break;
1712                 }
1713                 curridx = NextBufIdx(curridx);
1714
1715                 /* If flexible, break out of loop as soon as we wrote something */
1716                 if (flexible && npages == 0)
1717                         break;
1718         }
1719
1720         Assert(npages == 0);
1721         Assert(curridx == Write->curridx);
1722
1723         /*
1724          * If asked to flush, do so
1725          */
1726         if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
1727                 XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1728         {
1729                 /*
1730                  * Could get here without iterating above loop, in which case we might
1731                  * have no open file or the wrong one.  However, we do not need to
1732                  * fsync more than one file.
1733                  */
1734                 if (sync_method != SYNC_METHOD_OPEN &&
1735                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1736                 {
1737                         if (openLogFile >= 0 &&
1738                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1739                                 XLogFileClose();
1740                         if (openLogFile < 0)
1741                         {
1742                                 XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1743                                 openLogFile = XLogFileOpen(openLogId, openLogSeg);
1744                                 openLogOff = 0;
1745                         }
1746                         issue_xlog_fsync();
1747                 }
1748                 LogwrtResult.Flush = LogwrtResult.Write;
1749         }
1750
1751         /*
1752          * Update shared-memory status
1753          *
1754          * We make sure that the shared 'request' values do not fall behind the
1755          * 'result' values.  This is not absolutely essential, but it saves some
1756          * code in a couple of places.
1757          */
1758         {
1759                 /* use volatile pointer to prevent code rearrangement */
1760                 volatile XLogCtlData *xlogctl = XLogCtl;
1761
1762                 SpinLockAcquire(&xlogctl->info_lck);
1763                 xlogctl->LogwrtResult = LogwrtResult;
1764                 if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1765                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1766                 if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1767                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1768                 SpinLockRelease(&xlogctl->info_lck);
1769         }
1770
1771         Write->LogwrtResult = LogwrtResult;
1772 }
1773
1774 /*
1775  * Record the LSN for an asynchronous transaction commit.
1776  * (This should not be called for aborts, nor for synchronous commits.)
1777  */
1778 void
1779 XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
1780 {
1781         /* use volatile pointer to prevent code rearrangement */
1782         volatile XLogCtlData *xlogctl = XLogCtl;
1783
1784         SpinLockAcquire(&xlogctl->info_lck);
1785         if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))
1786                 xlogctl->asyncCommitLSN = asyncCommitLSN;
1787         SpinLockRelease(&xlogctl->info_lck);
1788 }
1789
1790 /*
1791  * Advance minRecoveryPoint in control file.
1792  *
1793  * If we crash during recovery, we must reach this point again before the
1794  * database is consistent.
1795  *
1796  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
1797  * is only updated if it's not already greater than or equal to 'lsn'.
1798  */
1799 static void
1800 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
1801 {
1802         /* Quick check using our local copy of the variable */
1803         if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
1804                 return;
1805
1806         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1807
1808         /* update local copy */
1809         minRecoveryPoint = ControlFile->minRecoveryPoint;
1810
1811         /*
1812          * An invalid minRecoveryPoint means that we need to recover all the WAL,
1813          * i.e., we're doing crash recovery.  We never modify the control file's
1814          * value in that case, so we can short-circuit future checks here too.
1815          */
1816         if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
1817                 updateMinRecoveryPoint = false;
1818         else if (force || XLByteLT(minRecoveryPoint, lsn))
1819         {
1820                 /* use volatile pointer to prevent code rearrangement */
1821                 volatile XLogCtlData *xlogctl = XLogCtl;
1822                 XLogRecPtr      newMinRecoveryPoint;
1823
1824                 /*
1825                  * To avoid having to update the control file too often, we update it
1826                  * all the way to the last record being replayed, even though 'lsn'
1827                  * would suffice for correctness.  This also allows the 'force' case
1828                  * to not need a valid 'lsn' value.
1829                  *
1830                  * Another important reason for doing it this way is that the passed
1831                  * 'lsn' value could be bogus, i.e., past the end of available WAL,
1832                  * if the caller got it from a corrupted heap page.  Accepting such
1833                  * a value as the min recovery point would prevent us from coming up
1834                  * at all.  Instead, we just log a warning and continue with recovery.
1835                  * (See also the comments about corrupt LSNs in XLogFlush.)
1836                  */
1837                 SpinLockAcquire(&xlogctl->info_lck);
1838                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
1839                 SpinLockRelease(&xlogctl->info_lck);
1840
1841                 if (!force && XLByteLT(newMinRecoveryPoint, lsn))
1842                         elog(WARNING,
1843                                  "xlog min recovery request %X/%X is past current point %X/%X",
1844                                  lsn.xlogid, lsn.xrecoff,
1845                                  newMinRecoveryPoint.xlogid, newMinRecoveryPoint.xrecoff);
1846
1847                 /* update control file */
1848                 if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
1849                 {
1850                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
1851                         UpdateControlFile();
1852                         minRecoveryPoint = newMinRecoveryPoint;
1853
1854                         ereport(DEBUG2,
1855                                         (errmsg("updated min recovery point to %X/%X",
1856                                                 minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
1857                 }
1858         }
1859         LWLockRelease(ControlFileLock);
1860 }
1861
1862 /*
1863  * Ensure that all XLOG data through the given position is flushed to disk.
1864  *
1865  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
1866  * already held, and we try to avoid acquiring it if possible.
1867  */
1868 void
1869 XLogFlush(XLogRecPtr record)
1870 {
1871         XLogRecPtr      WriteRqstPtr;
1872         XLogwrtRqst WriteRqst;
1873
1874         /*
1875          * During REDO, we are reading not writing WAL.  Therefore, instead of
1876          * trying to flush the WAL, we should update minRecoveryPoint instead.
1877          * We test XLogInsertAllowed(), not InRecovery, because we need the
1878          * bgwriter to act this way too, and because when the bgwriter tries
1879          * to write the end-of-recovery checkpoint, it should indeed flush.
1880          */
1881         if (!XLogInsertAllowed())
1882         {
1883                 UpdateMinRecoveryPoint(record, false);
1884                 return;
1885         }
1886
1887         /* Quick exit if already known flushed */
1888         if (XLByteLE(record, LogwrtResult.Flush))
1889                 return;
1890
1891 #ifdef WAL_DEBUG
1892         if (XLOG_DEBUG)
1893                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
1894                          record.xlogid, record.xrecoff,
1895                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1896                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1897 #endif
1898
1899         START_CRIT_SECTION();
1900
1901         /*
1902          * Since fsync is usually a horribly expensive operation, we try to
1903          * piggyback as much data as we can on each fsync: if we see any more data
1904          * entered into the xlog buffer, we'll write and fsync that too, so that
1905          * the final value of LogwrtResult.Flush is as large as possible. This
1906          * gives us some chance of avoiding another fsync immediately after.
1907          */
1908
1909         /* initialize to given target; may increase below */
1910         WriteRqstPtr = record;
1911
1912         /* read LogwrtResult and update local state */
1913         {
1914                 /* use volatile pointer to prevent code rearrangement */
1915                 volatile XLogCtlData *xlogctl = XLogCtl;
1916
1917                 SpinLockAcquire(&xlogctl->info_lck);
1918                 if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
1919                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1920                 LogwrtResult = xlogctl->LogwrtResult;
1921                 SpinLockRelease(&xlogctl->info_lck);
1922         }
1923
1924         /* done already? */
1925         if (!XLByteLE(record, LogwrtResult.Flush))
1926         {
1927                 /* now wait for the write lock */
1928                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1929                 LogwrtResult = XLogCtl->Write.LogwrtResult;
1930                 if (!XLByteLE(record, LogwrtResult.Flush))
1931                 {
1932                         /* try to write/flush later additions to XLOG as well */
1933                         if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
1934                         {
1935                                 XLogCtlInsert *Insert = &XLogCtl->Insert;
1936                                 uint32          freespace = INSERT_FREESPACE(Insert);
1937
1938                                 if (freespace < SizeOfXLogRecord)               /* buffer is full */
1939                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1940                                 else
1941                                 {
1942                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1943                                         WriteRqstPtr.xrecoff -= freespace;
1944                                 }
1945                                 LWLockRelease(WALInsertLock);
1946                                 WriteRqst.Write = WriteRqstPtr;
1947                                 WriteRqst.Flush = WriteRqstPtr;
1948                         }
1949                         else
1950                         {
1951                                 WriteRqst.Write = WriteRqstPtr;
1952                                 WriteRqst.Flush = record;
1953                         }
1954                         XLogWrite(WriteRqst, false, false);
1955                 }
1956                 LWLockRelease(WALWriteLock);
1957         }
1958
1959         END_CRIT_SECTION();
1960
1961         /*
1962          * If we still haven't flushed to the request point then we have a
1963          * problem; most likely, the requested flush point is past end of XLOG.
1964          * This has been seen to occur when a disk page has a corrupted LSN.
1965          *
1966          * Formerly we treated this as a PANIC condition, but that hurts the
1967          * system's robustness rather than helping it: we do not want to take down
1968          * the whole system due to corruption on one data page.  In particular, if
1969          * the bad page is encountered again during recovery then we would be
1970          * unable to restart the database at all!  (This scenario actually
1971          * happened in the field several times with 7.1 releases.)  As of 8.4,
1972          * bad LSNs encountered during recovery are UpdateMinRecoveryPoint's
1973          * problem; the only time we can reach here during recovery is while
1974          * flushing the end-of-recovery checkpoint record, and we don't expect
1975          * that to have a bad LSN.
1976          *
1977          * Note that for calls from xact.c, the ERROR will
1978          * be promoted to PANIC since xact.c calls this routine inside a critical
1979          * section.  However, calls from bufmgr.c are not within critical sections
1980          * and so we will not force a restart for a bad LSN on a data page.
1981          */
1982         if (XLByteLT(LogwrtResult.Flush, record))
1983                 elog(ERROR,
1984                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
1985                          record.xlogid, record.xrecoff,
1986                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1987 }
1988
1989 /*
1990  * Flush xlog, but without specifying exactly where to flush to.
1991  *
1992  * We normally flush only completed blocks; but if there is nothing to do on
1993  * that basis, we check for unflushed async commits in the current incomplete
1994  * block, and flush through the latest one of those.  Thus, if async commits
1995  * are not being used, we will flush complete blocks only.      We can guarantee
1996  * that async commits reach disk after at most three cycles; normally only
1997  * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
1998  * at the end of the buffer ring; this makes a difference only with very high
1999  * load or long wal_writer_delay, but imposes one extra cycle for the worst
2000  * case for async commits.)
2001  *
2002  * This routine is invoked periodically by the background walwriter process.
2003  */
2004 void
2005 XLogBackgroundFlush(void)
2006 {
2007         XLogRecPtr      WriteRqstPtr;
2008         bool            flexible = true;
2009
2010         /* XLOG doesn't need flushing during recovery */
2011         if (RecoveryInProgress())
2012                 return;
2013
2014         /* read LogwrtResult and update local state */
2015         {
2016                 /* use volatile pointer to prevent code rearrangement */
2017                 volatile XLogCtlData *xlogctl = XLogCtl;
2018
2019                 SpinLockAcquire(&xlogctl->info_lck);
2020                 LogwrtResult = xlogctl->LogwrtResult;
2021                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2022                 SpinLockRelease(&xlogctl->info_lck);
2023         }
2024
2025         /* back off to last completed page boundary */
2026         WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
2027
2028         /* if we have already flushed that far, consider async commit records */
2029         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2030         {
2031                 /* use volatile pointer to prevent code rearrangement */
2032                 volatile XLogCtlData *xlogctl = XLogCtl;
2033
2034                 SpinLockAcquire(&xlogctl->info_lck);
2035                 WriteRqstPtr = xlogctl->asyncCommitLSN;
2036                 SpinLockRelease(&xlogctl->info_lck);
2037                 flexible = false;               /* ensure it all gets written */
2038         }
2039
2040         /* Done if already known flushed */
2041         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2042                 return;
2043
2044 #ifdef WAL_DEBUG
2045         if (XLOG_DEBUG)
2046                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2047                          WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
2048                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
2049                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2050 #endif
2051
2052         START_CRIT_SECTION();
2053
2054         /* now wait for the write lock */
2055         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2056         LogwrtResult = XLogCtl->Write.LogwrtResult;
2057         if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2058         {
2059                 XLogwrtRqst WriteRqst;
2060
2061                 WriteRqst.Write = WriteRqstPtr;
2062                 WriteRqst.Flush = WriteRqstPtr;
2063                 XLogWrite(WriteRqst, flexible, false);
2064         }
2065         LWLockRelease(WALWriteLock);
2066
2067         END_CRIT_SECTION();
2068 }
2069
2070 /*
2071  * Flush any previous asynchronously-committed transactions' commit records.
2072  *
2073  * NOTE: it is unwise to assume that this provides any strong guarantees.
2074  * In particular, because of the inexact LSN bookkeeping used by clog.c,
2075  * we cannot assume that hint bits will be settable for these transactions.
2076  */
2077 void
2078 XLogAsyncCommitFlush(void)
2079 {
2080         XLogRecPtr      WriteRqstPtr;
2081
2082         /* use volatile pointer to prevent code rearrangement */
2083         volatile XLogCtlData *xlogctl = XLogCtl;
2084
2085         /* There's no asynchronously committed transactions during recovery */
2086         if (RecoveryInProgress())
2087                 return;
2088
2089         SpinLockAcquire(&xlogctl->info_lck);
2090         WriteRqstPtr = xlogctl->asyncCommitLSN;
2091         SpinLockRelease(&xlogctl->info_lck);
2092
2093         XLogFlush(WriteRqstPtr);
2094 }
2095
2096 /*
2097  * Test whether XLOG data has been flushed up to (at least) the given position.
2098  *
2099  * Returns true if a flush is still needed.  (It may be that someone else
2100  * is already in process of flushing that far, however.)
2101  */
2102 bool
2103 XLogNeedsFlush(XLogRecPtr record)
2104 {
2105         /* XLOG doesn't need flushing during recovery */
2106         if (RecoveryInProgress())
2107                 return false;
2108
2109         /* Quick exit if already known flushed */
2110         if (XLByteLE(record, LogwrtResult.Flush))
2111                 return false;
2112
2113         /* read LogwrtResult and update local state */
2114         {
2115                 /* use volatile pointer to prevent code rearrangement */
2116                 volatile XLogCtlData *xlogctl = XLogCtl;
2117
2118                 SpinLockAcquire(&xlogctl->info_lck);
2119                 LogwrtResult = xlogctl->LogwrtResult;
2120                 SpinLockRelease(&xlogctl->info_lck);
2121         }
2122
2123         /* check again */
2124         if (XLByteLE(record, LogwrtResult.Flush))
2125                 return false;
2126
2127         return true;
2128 }
2129
2130 /*
2131  * Create a new XLOG file segment, or open a pre-existing one.
2132  *
2133  * log, seg: identify segment to be created/opened.
2134  *
2135  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
2136  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
2137  * file was used.
2138  *
2139  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2140  * place.  This should be TRUE except during bootstrap log creation.  The
2141  * caller must *not* hold the lock at call.
2142  *
2143  * Returns FD of opened file.
2144  *
2145  * Note: errors here are ERROR not PANIC because we might or might not be
2146  * inside a critical section (eg, during checkpoint there is no reason to
2147  * take down the system on failure).  They will promote to PANIC if we are
2148  * in a critical section.
2149  */
2150 static int
2151 XLogFileInit(uint32 log, uint32 seg,
2152                          bool *use_existent, bool use_lock)
2153 {
2154         char            path[MAXPGPATH];
2155         char            tmppath[MAXPGPATH];
2156         char       *zbuffer;
2157         uint32          installed_log;
2158         uint32          installed_seg;
2159         int                     max_advance;
2160         int                     fd;
2161         int                     nbytes;
2162
2163         XLogFilePath(path, ThisTimeLineID, log, seg);
2164
2165         /*
2166          * Try to use existent file (checkpoint maker may have created it already)
2167          */
2168         if (*use_existent)
2169         {
2170                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2171                                                    S_IRUSR | S_IWUSR);
2172                 if (fd < 0)
2173                 {
2174                         if (errno != ENOENT)
2175                                 ereport(ERROR,
2176                                                 (errcode_for_file_access(),
2177                                                  errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2178                                                                 path, log, seg)));
2179                 }
2180                 else
2181                         return fd;
2182         }
2183
2184         /*
2185          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2186          * another process is doing the same thing.  If so, we will end up
2187          * pre-creating an extra log segment.  That seems OK, and better than
2188          * holding the lock throughout this lengthy process.
2189          */
2190         elog(DEBUG2, "creating and filling new WAL file");
2191
2192         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2193
2194         unlink(tmppath);
2195
2196         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2197         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2198                                            S_IRUSR | S_IWUSR);
2199         if (fd < 0)
2200                 ereport(ERROR,
2201                                 (errcode_for_file_access(),
2202                                  errmsg("could not create file \"%s\": %m", tmppath)));
2203
2204         /*
2205          * Zero-fill the file.  We have to do this the hard way to ensure that all
2206          * the file space has really been allocated --- on platforms that allow
2207          * "holes" in files, just seeking to the end doesn't allocate intermediate
2208          * space.  This way, we know that we have all the space and (after the
2209          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2210          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2211          * log file.
2212          *
2213          * Note: palloc zbuffer, instead of just using a local char array, to
2214          * ensure it is reasonably well-aligned; this may save a few cycles
2215          * transferring data to the kernel.
2216          */
2217         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2218         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2219         {
2220                 errno = 0;
2221                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2222                 {
2223                         int                     save_errno = errno;
2224
2225                         /*
2226                          * If we fail to make the file, delete it to release disk space
2227                          */
2228                         unlink(tmppath);
2229                         /* if write didn't set errno, assume problem is no disk space */
2230                         errno = save_errno ? save_errno : ENOSPC;
2231
2232                         ereport(ERROR,
2233                                         (errcode_for_file_access(),
2234                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2235                 }
2236         }
2237         pfree(zbuffer);
2238
2239         if (pg_fsync(fd) != 0)
2240                 ereport(ERROR,
2241                                 (errcode_for_file_access(),
2242                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2243
2244         if (close(fd))
2245                 ereport(ERROR,
2246                                 (errcode_for_file_access(),
2247                                  errmsg("could not close file \"%s\": %m", tmppath)));
2248
2249         /*
2250          * Now move the segment into place with its final name.
2251          *
2252          * If caller didn't want to use a pre-existing file, get rid of any
2253          * pre-existing file.  Otherwise, cope with possibility that someone else
2254          * has created the file while we were filling ours: if so, use ours to
2255          * pre-create a future log segment.
2256          */
2257         installed_log = log;
2258         installed_seg = seg;
2259         max_advance = XLOGfileslop;
2260         if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
2261                                                                 *use_existent, &max_advance,
2262                                                                 use_lock))
2263         {
2264                 /* No need for any more future segments... */
2265                 unlink(tmppath);
2266         }
2267
2268         elog(DEBUG2, "done creating and filling new WAL file");
2269
2270         /* Set flag to tell caller there was no existent file */
2271         *use_existent = false;
2272
2273         /* Now open original target segment (might not be file I just made) */
2274         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2275                                            S_IRUSR | S_IWUSR);
2276         if (fd < 0)
2277                 ereport(ERROR,
2278                                 (errcode_for_file_access(),
2279                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2280                                   path, log, seg)));
2281
2282         return fd;
2283 }
2284
2285 /*
2286  * Create a new XLOG file segment by copying a pre-existing one.
2287  *
2288  * log, seg: identify segment to be created.
2289  *
2290  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2291  *              a different timeline)
2292  *
2293  * Currently this is only used during recovery, and so there are no locking
2294  * considerations.      But we should be just as tense as XLogFileInit to avoid
2295  * emplacing a bogus file.
2296  */
2297 static void
2298 XLogFileCopy(uint32 log, uint32 seg,
2299                          TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
2300 {
2301         char            path[MAXPGPATH];
2302         char            tmppath[MAXPGPATH];
2303         char            buffer[XLOG_BLCKSZ];
2304         int                     srcfd;
2305         int                     fd;
2306         int                     nbytes;
2307
2308         /*
2309          * Open the source file
2310          */
2311         XLogFilePath(path, srcTLI, srclog, srcseg);
2312         srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2313         if (srcfd < 0)
2314                 ereport(ERROR,
2315                                 (errcode_for_file_access(),
2316                                  errmsg("could not open file \"%s\": %m", path)));
2317
2318         /*
2319          * Copy into a temp file name.
2320          */
2321         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2322
2323         unlink(tmppath);
2324
2325         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2326         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2327                                            S_IRUSR | S_IWUSR);
2328         if (fd < 0)
2329                 ereport(ERROR,
2330                                 (errcode_for_file_access(),
2331                                  errmsg("could not create file \"%s\": %m", tmppath)));
2332
2333         /*
2334          * Do the data copying.
2335          */
2336         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2337         {
2338                 errno = 0;
2339                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2340                 {
2341                         if (errno != 0)
2342                                 ereport(ERROR,
2343                                                 (errcode_for_file_access(),
2344                                                  errmsg("could not read file \"%s\": %m", path)));
2345                         else
2346                                 ereport(ERROR,
2347                                                 (errmsg("not enough data in file \"%s\"", path)));
2348                 }
2349                 errno = 0;
2350                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2351                 {
2352                         int                     save_errno = errno;
2353
2354                         /*
2355                          * If we fail to make the file, delete it to release disk space
2356                          */
2357                         unlink(tmppath);
2358                         /* if write didn't set errno, assume problem is no disk space */
2359                         errno = save_errno ? save_errno : ENOSPC;
2360
2361                         ereport(ERROR,
2362                                         (errcode_for_file_access(),
2363                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2364                 }
2365         }
2366
2367         if (pg_fsync(fd) != 0)
2368                 ereport(ERROR,
2369                                 (errcode_for_file_access(),
2370                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2371
2372         if (close(fd))
2373                 ereport(ERROR,
2374                                 (errcode_for_file_access(),
2375                                  errmsg("could not close file \"%s\": %m", tmppath)));
2376
2377         close(srcfd);
2378
2379         /*
2380          * Now move the segment into place with its final name.
2381          */
2382         if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
2383                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2384 }
2385
2386 /*
2387  * Install a new XLOG segment file as a current or future log segment.
2388  *
2389  * This is used both to install a newly-created segment (which has a temp
2390  * filename while it's being created) and to recycle an old segment.
2391  *
2392  * *log, *seg: identify segment to install as (or first possible target).
2393  * When find_free is TRUE, these are modified on return to indicate the
2394  * actual installation location or last segment searched.
2395  *
2396  * tmppath: initial name of file to install.  It will be renamed into place.
2397  *
2398  * find_free: if TRUE, install the new segment at the first empty log/seg
2399  * number at or after the passed numbers.  If FALSE, install the new segment
2400  * exactly where specified, deleting any existing segment file there.
2401  *
2402  * *max_advance: maximum number of log/seg slots to advance past the starting
2403  * point.  Fail if no free slot is found in this range.  On return, reduced
2404  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2405  * when find_free is FALSE.)
2406  *
2407  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2408  * place.  This should be TRUE except during bootstrap log creation.  The
2409  * caller must *not* hold the lock at call.
2410  *
2411  * Returns TRUE if file installed, FALSE if not installed because of
2412  * exceeding max_advance limit.  On Windows, we also return FALSE if we
2413  * can't rename the file into place because someone's got it open.
2414  * (Any other kind of failure causes ereport().)
2415  */
2416 static bool
2417 InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
2418                                            bool find_free, int *max_advance,
2419                                            bool use_lock)
2420 {
2421         char            path[MAXPGPATH];
2422         struct stat stat_buf;
2423
2424         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2425
2426         /*
2427          * We want to be sure that only one process does this at a time.
2428          */
2429         if (use_lock)
2430                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2431
2432         if (!find_free)
2433         {
2434                 /* Force installation: get rid of any pre-existing segment file */
2435                 unlink(path);
2436         }
2437         else
2438         {
2439                 /* Find a free slot to put it in */
2440                 while (stat(path, &stat_buf) == 0)
2441                 {
2442                         if (*max_advance <= 0)
2443                         {
2444                                 /* Failed to find a free slot within specified range */
2445                                 if (use_lock)
2446                                         LWLockRelease(ControlFileLock);
2447                                 return false;
2448                         }
2449                         NextLogSeg(*log, *seg);
2450                         (*max_advance)--;
2451                         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2452                 }
2453         }
2454
2455         /*
2456          * Prefer link() to rename() here just to be really sure that we don't
2457          * overwrite an existing logfile.  However, there shouldn't be one, so
2458          * rename() is an acceptable substitute except for the truly paranoid.
2459          */
2460 #if HAVE_WORKING_LINK
2461         if (link(tmppath, path) < 0)
2462                 ereport(ERROR,
2463                                 (errcode_for_file_access(),
2464                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2465                                                 tmppath, path, *log, *seg)));
2466         unlink(tmppath);
2467 #else
2468         if (rename(tmppath, path) < 0)
2469         {
2470 #ifdef WIN32
2471 #if !defined(__CYGWIN__)
2472                 if (GetLastError() == ERROR_ACCESS_DENIED)
2473 #else
2474                 if (errno == EACCES)
2475 #endif
2476                 {
2477                         if (use_lock)
2478                                 LWLockRelease(ControlFileLock);
2479                         return false;
2480                 }
2481 #endif   /* WIN32 */
2482
2483                 ereport(ERROR,
2484                                 (errcode_for_file_access(),
2485                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2486                                                 tmppath, path, *log, *seg)));
2487         }
2488 #endif
2489
2490         if (use_lock)
2491                 LWLockRelease(ControlFileLock);
2492
2493         return true;
2494 }
2495
2496 /*
2497  * Open a pre-existing logfile segment for writing.
2498  */
2499 static int
2500 XLogFileOpen(uint32 log, uint32 seg)
2501 {
2502         char            path[MAXPGPATH];
2503         int                     fd;
2504
2505         XLogFilePath(path, ThisTimeLineID, log, seg);
2506
2507         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2508                                            S_IRUSR | S_IWUSR);
2509         if (fd < 0)
2510                 ereport(PANIC,
2511                                 (errcode_for_file_access(),
2512                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2513                                   path, log, seg)));
2514
2515         return fd;
2516 }
2517
2518 /*
2519  * Open a logfile segment for reading (during recovery).
2520  */
2521 static int
2522 XLogFileRead(uint32 log, uint32 seg, int emode)
2523 {
2524         char            path[MAXPGPATH];
2525         char            xlogfname[MAXFNAMELEN];
2526         char            activitymsg[MAXFNAMELEN + 16];
2527         ListCell   *cell;
2528         int                     fd;
2529
2530         /*
2531          * Loop looking for a suitable timeline ID: we might need to read any of
2532          * the timelines listed in expectedTLIs.
2533          *
2534          * We expect curFileTLI on entry to be the TLI of the preceding file in
2535          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2536          * to go backwards; this prevents us from picking up the wrong file when a
2537          * parent timeline extends to higher segment numbers than the child we
2538          * want to read.
2539          */
2540         foreach(cell, expectedTLIs)
2541         {
2542                 TimeLineID      tli = (TimeLineID) lfirst_int(cell);
2543
2544                 if (tli < curFileTLI)
2545                         break;                          /* don't bother looking at too-old TLIs */
2546
2547                 XLogFileName(xlogfname, tli, log, seg);
2548
2549                 if (InArchiveRecovery)
2550                 {
2551                         /* Report recovery progress in PS display */
2552                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2553                                          xlogfname);
2554                         set_ps_display(activitymsg, false);
2555
2556                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2557                                                                                                           "RECOVERYXLOG",
2558                                                                                                           XLogSegSize);
2559                 }
2560                 else
2561                         XLogFilePath(path, tli, log, seg);
2562
2563                 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2564                 if (fd >= 0)
2565                 {
2566                         /* Success! */
2567                         curFileTLI = tli;
2568
2569                         /* Report recovery progress in PS display */
2570                         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2571                                          xlogfname);
2572                         set_ps_display(activitymsg, false);
2573
2574                         return fd;
2575                 }
2576                 if (errno != ENOENT)    /* unexpected failure? */
2577                         ereport(PANIC,
2578                                         (errcode_for_file_access(),
2579                         errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2580                                    path, log, seg)));
2581         }
2582
2583         /* Couldn't find it.  For simplicity, complain about front timeline */
2584         XLogFilePath(path, recoveryTargetTLI, log, seg);
2585         errno = ENOENT;
2586         ereport(emode,
2587                         (errcode_for_file_access(),
2588                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2589                                   path, log, seg)));
2590         return -1;
2591 }
2592
2593 /*
2594  * Close the current logfile segment for writing.
2595  */
2596 static void
2597 XLogFileClose(void)
2598 {
2599         Assert(openLogFile >= 0);
2600
2601         /*
2602          * WAL segment files will not be re-read in normal operation, so we advise
2603          * the OS to release any cached pages.  But do not do so if WAL archiving
2604          * is active, because archiver process could use the cache to read the WAL
2605          * segment.  Also, don't bother with it if we are using O_DIRECT, since
2606          * the kernel is presumably not caching in that case.
2607          */
2608 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2609         if (!XLogArchivingActive() &&
2610                 (get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
2611                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2612 #endif
2613
2614         if (close(openLogFile))
2615                 ereport(PANIC,
2616                                 (errcode_for_file_access(),
2617                                  errmsg("could not close log file %u, segment %u: %m",
2618                                                 openLogId, openLogSeg)));
2619         openLogFile = -1;
2620 }
2621
2622 /*
2623  * Attempt to retrieve the specified file from off-line archival storage.
2624  * If successful, fill "path" with its complete path (note that this will be
2625  * a temp file name that doesn't follow the normal naming convention), and
2626  * return TRUE.
2627  *
2628  * If not successful, fill "path" with the name of the normal on-line file
2629  * (which may or may not actually exist, but we'll try to use it), and return
2630  * FALSE.
2631  *
2632  * For fixed-size files, the caller may pass the expected size as an
2633  * additional crosscheck on successful recovery.  If the file size is not
2634  * known, set expectedSize = 0.
2635  */
2636 static bool
2637 RestoreArchivedFile(char *path, const char *xlogfname,
2638                                         const char *recovername, off_t expectedSize)
2639 {
2640         char            xlogpath[MAXPGPATH];
2641         char            xlogRestoreCmd[MAXPGPATH];
2642         char            lastRestartPointFname[MAXPGPATH];
2643         char       *dp;
2644         char       *endp;
2645         const char *sp;
2646         int                     rc;
2647         bool            signaled;
2648         struct stat stat_buf;
2649         uint32          restartLog;
2650         uint32          restartSeg;
2651
2652         /*
2653          * When doing archive recovery, we always prefer an archived log file even
2654          * if a file of the same name exists in XLOGDIR.  The reason is that the
2655          * file in XLOGDIR could be an old, un-filled or partly-filled version
2656          * that was copied and restored as part of backing up $PGDATA.
2657          *
2658          * We could try to optimize this slightly by checking the local copy
2659          * lastchange timestamp against the archived copy, but we have no API to
2660          * do this, nor can we guarantee that the lastchange timestamp was
2661          * preserved correctly when we copied to archive. Our aim is robustness,
2662          * so we elect not to do this.
2663          *
2664          * If we cannot obtain the log file from the archive, however, we will try
2665          * to use the XLOGDIR file if it exists.  This is so that we can make use
2666          * of log segments that weren't yet transferred to the archive.
2667          *
2668          * Notice that we don't actually overwrite any files when we copy back
2669          * from archive because the recoveryRestoreCommand may inadvertently
2670          * restore inappropriate xlogs, or they may be corrupt, so we may wish to
2671          * fallback to the segments remaining in current XLOGDIR later. The
2672          * copy-from-archive filename is always the same, ensuring that we don't
2673          * run out of disk space on long recoveries.
2674          */
2675         snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
2676
2677         /*
2678          * Make sure there is no existing file named recovername.
2679          */
2680         if (stat(xlogpath, &stat_buf) != 0)
2681         {
2682                 if (errno != ENOENT)
2683                         ereport(FATAL,
2684                                         (errcode_for_file_access(),
2685                                          errmsg("could not stat file \"%s\": %m",
2686                                                         xlogpath)));
2687         }
2688         else
2689         {
2690                 if (unlink(xlogpath) != 0)
2691                         ereport(FATAL,
2692                                         (errcode_for_file_access(),
2693                                          errmsg("could not remove file \"%s\": %m",
2694                                                         xlogpath)));
2695         }
2696
2697         /*
2698          * Calculate the archive file cutoff point for use during log shipping
2699          * replication. All files earlier than this point can be deleted from the
2700          * archive, though there is no requirement to do so.
2701          *
2702          * We initialise this with the filename of an InvalidXLogRecPtr, which
2703          * will prevent the deletion of any WAL files from the archive because of
2704          * the alphabetic sorting property of WAL filenames.
2705          *
2706          * Once we have successfully located the redo pointer of the checkpoint
2707          * from which we start recovery we never request a file prior to the redo
2708          * pointer of the last restartpoint. When redo begins we know that we have
2709          * successfully located it, so there is no need for additional status
2710          * flags to signify the point when we can begin deleting WAL files from
2711          * the archive.
2712          */
2713         if (InRedo)
2714         {
2715                 XLByteToSeg(ControlFile->checkPointCopy.redo,
2716                                         restartLog, restartSeg);
2717                 XLogFileName(lastRestartPointFname,
2718                                          ControlFile->checkPointCopy.ThisTimeLineID,
2719                                          restartLog, restartSeg);
2720                 /* we shouldn't need anything earlier than last restart point */
2721                 Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
2722         }
2723         else
2724                 XLogFileName(lastRestartPointFname, 0, 0, 0);
2725
2726         /*
2727          * construct the command to be executed
2728          */
2729         dp = xlogRestoreCmd;
2730         endp = xlogRestoreCmd + MAXPGPATH - 1;
2731         *endp = '\0';
2732
2733         for (sp = recoveryRestoreCommand; *sp; sp++)
2734         {
2735                 if (*sp == '%')
2736                 {
2737                         switch (sp[1])
2738                         {
2739                                 case 'p':
2740                                         /* %p: relative path of target file */
2741                                         sp++;
2742                                         StrNCpy(dp, xlogpath, endp - dp);
2743                                         make_native_path(dp);
2744                                         dp += strlen(dp);
2745                                         break;
2746                                 case 'f':
2747                                         /* %f: filename of desired file */
2748                                         sp++;
2749                                         StrNCpy(dp, xlogfname, endp - dp);
2750                                         dp += strlen(dp);
2751                                         break;
2752                                 case 'r':
2753                                         /* %r: filename of last restartpoint */
2754                                         sp++;
2755                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
2756                                         dp += strlen(dp);
2757                                         break;
2758                                 case '%':
2759                                         /* convert %% to a single % */
2760                                         sp++;
2761                                         if (dp < endp)
2762                                                 *dp++ = *sp;
2763                                         break;
2764                                 default:
2765                                         /* otherwise treat the % as not special */
2766                                         if (dp < endp)
2767                                                 *dp++ = *sp;
2768                                         break;
2769                         }
2770                 }
2771                 else
2772                 {
2773                         if (dp < endp)
2774                                 *dp++ = *sp;
2775                 }
2776         }
2777         *dp = '\0';
2778
2779         ereport(DEBUG3,
2780                         (errmsg_internal("executing restore command \"%s\"",
2781                                                          xlogRestoreCmd)));
2782
2783         /*
2784          * Set in_restore_command to tell the signal handler that we should exit
2785          * right away on SIGTERM. We know that we're at a safe point to do that.
2786          * Check if we had already received the signal, so that we don't miss a
2787          * shutdown request received just before this.
2788          */
2789         in_restore_command = true;
2790         if (shutdown_requested)
2791                 proc_exit(1);
2792
2793         /*
2794          * Copy xlog from archival storage to XLOGDIR
2795          */
2796         rc = system(xlogRestoreCmd);
2797
2798         in_restore_command = false;
2799
2800         if (rc == 0)
2801         {
2802                 /*
2803                  * command apparently succeeded, but let's make sure the file is
2804                  * really there now and has the correct size.
2805                  *
2806                  * XXX I made wrong-size a fatal error to ensure the DBA would notice
2807                  * it, but is that too strong?  We could try to plow ahead with a
2808                  * local copy of the file ... but the problem is that there probably
2809                  * isn't one, and we'd incorrectly conclude we've reached the end of
2810                  * WAL and we're done recovering ...
2811                  */
2812                 if (stat(xlogpath, &stat_buf) == 0)
2813                 {
2814                         if (expectedSize > 0 && stat_buf.st_size != expectedSize)
2815                                 ereport(FATAL,
2816                                                 (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
2817                                                                 xlogfname,
2818                                                                 (unsigned long) stat_buf.st_size,
2819                                                                 (unsigned long) expectedSize)));
2820                         else
2821                         {
2822                                 ereport(LOG,
2823                                                 (errmsg("restored log file \"%s\" from archive",
2824                                                                 xlogfname)));
2825                                 strcpy(path, xlogpath);
2826                                 return true;
2827                         }
2828                 }
2829                 else
2830                 {
2831                         /* stat failed */
2832                         if (errno != ENOENT)
2833                                 ereport(FATAL,
2834                                                 (errcode_for_file_access(),
2835                                                  errmsg("could not stat file \"%s\": %m",
2836                                                                 xlogpath)));
2837                 }
2838         }
2839
2840         /*
2841          * Remember, we rollforward UNTIL the restore fails so failure here is
2842          * just part of the process... that makes it difficult to determine
2843          * whether the restore failed because there isn't an archive to restore,
2844          * or because the administrator has specified the restore program
2845          * incorrectly.  We have to assume the former.
2846          *
2847          * However, if the failure was due to any sort of signal, it's best to
2848          * punt and abort recovery.  (If we "return false" here, upper levels will
2849          * assume that recovery is complete and start up the database!) It's
2850          * essential to abort on child SIGINT and SIGQUIT, because per spec
2851          * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
2852          * those it's a good bet we should have gotten it too.
2853          *
2854          * On SIGTERM, assume we have received a fast shutdown request, and exit
2855          * cleanly. It's pure chance whether we receive the SIGTERM first, or the
2856          * child process. If we receive it first, the signal handler will call
2857          * proc_exit, otherwise we do it here. If we or the child process received
2858          * SIGTERM for any other reason than a fast shutdown request, postmaster
2859          * will perform an immediate shutdown when it sees us exiting
2860          * unexpectedly.
2861          *
2862          * Per the Single Unix Spec, shells report exit status > 128 when a called
2863          * command died on a signal.  Also, 126 and 127 are used to report
2864          * problems such as an unfindable command; treat those as fatal errors
2865          * too.
2866          */
2867         if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
2868                 proc_exit(1);
2869
2870         signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
2871
2872         ereport(signaled ? FATAL : DEBUG2,
2873                 (errmsg("could not restore file \"%s\" from archive: return code %d",
2874                                 xlogfname, rc)));
2875
2876         /*
2877          * if an archived file is not available, there might still be a version of
2878          * this file in XLOGDIR, so return that as the filename to open.
2879          *
2880          * In many recovery scenarios we expect this to fail also, but if so that
2881          * just means we've reached the end of WAL.
2882          */
2883         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
2884         return false;
2885 }
2886
2887 /*
2888  * Attempt to execute the recovery_end_command.
2889  */
2890 static void
2891 ExecuteRecoveryEndCommand(void)
2892 {
2893         char            xlogRecoveryEndCmd[MAXPGPATH];
2894         char            lastRestartPointFname[MAXPGPATH];
2895         char       *dp;
2896         char       *endp;
2897         const char *sp;
2898         int                     rc;
2899         bool            signaled;
2900         uint32          restartLog;
2901         uint32          restartSeg;
2902
2903         Assert(recoveryEndCommand);
2904
2905         /*
2906          * Calculate the archive file cutoff point for use during log shipping
2907          * replication. All files earlier than this point can be deleted from the
2908          * archive, though there is no requirement to do so.
2909          *
2910          * We initialise this with the filename of an InvalidXLogRecPtr, which
2911          * will prevent the deletion of any WAL files from the archive because of
2912          * the alphabetic sorting property of WAL filenames.
2913          *
2914          * Once we have successfully located the redo pointer of the checkpoint
2915          * from which we start recovery we never request a file prior to the redo
2916          * pointer of the last restartpoint. When redo begins we know that we have
2917          * successfully located it, so there is no need for additional status
2918          * flags to signify the point when we can begin deleting WAL files from
2919          * the archive.
2920          */
2921         if (InRedo)
2922         {
2923                 XLByteToSeg(ControlFile->checkPointCopy.redo,
2924                                         restartLog, restartSeg);
2925                 XLogFileName(lastRestartPointFname,
2926                                          ControlFile->checkPointCopy.ThisTimeLineID,
2927                                          restartLog, restartSeg);
2928         }
2929         else
2930                 XLogFileName(lastRestartPointFname, 0, 0, 0);
2931
2932         /*
2933          * construct the command to be executed
2934          */
2935         dp = xlogRecoveryEndCmd;
2936         endp = xlogRecoveryEndCmd + MAXPGPATH - 1;
2937         *endp = '\0';
2938
2939         for (sp = recoveryEndCommand; *sp; sp++)
2940         {
2941                 if (*sp == '%')
2942                 {
2943                         switch (sp[1])
2944                         {
2945                                 case 'r':
2946                                         /* %r: filename of last restartpoint */
2947                                         sp++;
2948                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
2949                                         dp += strlen(dp);
2950                                         break;
2951                                 case '%':
2952                                         /* convert %% to a single % */
2953                                         sp++;
2954                                         if (dp < endp)
2955                                                 *dp++ = *sp;
2956                                         break;
2957                                 default:
2958                                         /* otherwise treat the % as not special */
2959                                         if (dp < endp)
2960                                                 *dp++ = *sp;
2961                                         break;
2962                         }
2963                 }
2964                 else
2965                 {
2966                         if (dp < endp)
2967                                 *dp++ = *sp;
2968                 }
2969         }
2970         *dp = '\0';
2971
2972         ereport(DEBUG3,
2973                         (errmsg_internal("executing recovery end command \"%s\"",
2974                                                          xlogRecoveryEndCmd)));
2975
2976         /*
2977          * execute the constructed command
2978          */
2979         rc = system(xlogRecoveryEndCmd);
2980         if (rc != 0)
2981         {
2982                 /*
2983                  * If the failure was due to any sort of signal, it's best to punt and
2984                  * abort recovery. See also detailed comments on signals in
2985                  * RestoreArchivedFile().
2986                  */
2987                 signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
2988
2989                 ereport(signaled ? FATAL : WARNING,
2990                                 (errmsg("recovery_end_command \"%s\": return code %d",
2991                                                 xlogRecoveryEndCmd, rc)));
2992         }
2993 }
2994
2995 /*
2996  * Preallocate log files beyond the specified log endpoint.
2997  *
2998  * XXX this is currently extremely conservative, since it forces only one
2999  * future log segment to exist, and even that only if we are 75% done with
3000  * the current one.  This is only appropriate for very low-WAL-volume systems.
3001  * High-volume systems will be OK once they've built up a sufficient set of
3002  * recycled log segments, but the startup transient is likely to include
3003  * a lot of segment creations by foreground processes, which is not so good.
3004  */
3005 static void
3006 PreallocXlogFiles(XLogRecPtr endptr)
3007 {
3008         uint32          _logId;
3009         uint32          _logSeg;
3010         int                     lf;
3011         bool            use_existent;
3012
3013         XLByteToPrevSeg(endptr, _logId, _logSeg);
3014         if ((endptr.xrecoff - 1) % XLogSegSize >=
3015                 (uint32) (0.75 * XLogSegSize))
3016         {
3017                 NextLogSeg(_logId, _logSeg);
3018                 use_existent = true;
3019                 lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
3020                 close(lf);
3021                 if (!use_existent)
3022                         CheckpointStats.ckpt_segs_added++;
3023         }
3024 }
3025
3026 /*
3027  * Recycle or remove all log files older or equal to passed log/seg#
3028  *
3029  * endptr is current (or recent) end of xlog; this is used to determine
3030  * whether we want to recycle rather than delete no-longer-wanted log files.
3031  */
3032 static void
3033 RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
3034 {
3035         uint32          endlogId;
3036         uint32          endlogSeg;
3037         int                     max_advance;
3038         DIR                *xldir;
3039         struct dirent *xlde;
3040         char            lastoff[MAXFNAMELEN];
3041         char            path[MAXPGPATH];
3042         struct stat statbuf;
3043
3044         /*
3045          * Initialize info about where to try to recycle to.  We allow recycling
3046          * segments up to XLOGfileslop segments beyond the current XLOG location.
3047          */
3048         XLByteToPrevSeg(endptr, endlogId, endlogSeg);
3049         max_advance = XLOGfileslop;
3050
3051         xldir = AllocateDir(XLOGDIR);
3052         if (xldir == NULL)
3053                 ereport(ERROR,
3054                                 (errcode_for_file_access(),
3055                                  errmsg("could not open transaction log directory \"%s\": %m",
3056                                                 XLOGDIR)));
3057
3058         XLogFileName(lastoff, ThisTimeLineID, log, seg);
3059
3060         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3061         {
3062                 /*
3063                  * We ignore the timeline part of the XLOG segment identifiers in
3064                  * deciding whether a segment is still needed.  This ensures that we
3065                  * won't prematurely remove a segment from a parent timeline. We could
3066                  * probably be a little more proactive about removing segments of
3067                  * non-parent timelines, but that would be a whole lot more
3068                  * complicated.
3069                  *
3070                  * We use the alphanumeric sorting property of the filenames to decide
3071                  * which ones are earlier than the lastoff segment.
3072                  */
3073                 if (strlen(xlde->d_name) == 24 &&
3074                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3075                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3076                 {
3077                         if (XLogArchiveCheckDone(xlde->d_name))
3078                         {
3079                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3080
3081                                 /*
3082                                  * Before deleting the file, see if it can be recycled as a
3083                                  * future log segment. Only recycle normal files, pg_standby
3084                                  * for example can create symbolic links pointing to a
3085                                  * separate archive directory.
3086                                  */
3087                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
3088                                         InstallXLogFileSegment(&endlogId, &endlogSeg, path,
3089                                                                                    true, &max_advance, true))
3090                                 {
3091                                         ereport(DEBUG2,
3092                                                         (errmsg("recycled transaction log file \"%s\"",
3093                                                                         xlde->d_name)));
3094                                         CheckpointStats.ckpt_segs_recycled++;
3095                                         /* Needn't recheck that slot on future iterations */
3096                                         if (max_advance > 0)
3097                                         {
3098                                                 NextLogSeg(endlogId, endlogSeg);
3099                                                 max_advance--;
3100                                         }
3101                                 }
3102                                 else
3103                                 {
3104                                         /* No need for any more future segments... */
3105                                         ereport(DEBUG2,
3106                                                         (errmsg("removing transaction log file \"%s\"",
3107                                                                         xlde->d_name)));
3108                                         unlink(path);
3109                                         CheckpointStats.ckpt_segs_removed++;
3110                                 }
3111
3112                                 XLogArchiveCleanup(xlde->d_name);
3113                         }
3114                 }
3115         }
3116
3117         FreeDir(xldir);
3118 }
3119
3120 /*
3121  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3122  * If the latter does not exist, recreate it.
3123  *
3124  * It is not the goal of this function to verify the contents of these
3125  * directories, but to help in cases where someone has performed a cluster
3126  * copy for PITR purposes but omitted pg_xlog from the copy.
3127  *
3128  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3129  * policy decision was made not to.  It is fairly common for pg_xlog to be
3130  * a symlink, and if that was the DBA's intent then automatically making a
3131  * plain directory would result in degraded performance with no notice.
3132  */
3133 static void
3134 ValidateXLOGDirectoryStructure(void)
3135 {
3136         char            path[MAXPGPATH];
3137         struct stat stat_buf;
3138
3139         /* Check for pg_xlog; if it doesn't exist, error out */
3140         if (stat(XLOGDIR, &stat_buf) != 0 ||
3141                 !S_ISDIR(stat_buf.st_mode))
3142                 ereport(FATAL,
3143                                 (errmsg("required WAL directory \"%s\" does not exist",
3144                                                 XLOGDIR)));
3145
3146         /* Check for archive_status */
3147         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3148         if (stat(path, &stat_buf) == 0)
3149         {
3150                 /* Check for weird cases where it exists but isn't a directory */
3151                 if (!S_ISDIR(stat_buf.st_mode))
3152                         ereport(FATAL,
3153                                         (errmsg("required WAL directory \"%s\" does not exist",
3154                                                         path)));
3155         }
3156         else
3157         {
3158                 ereport(LOG,
3159                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3160                 if (mkdir(path, 0700) < 0)
3161                         ereport(FATAL,
3162                                         (errmsg("could not create missing directory \"%s\": %m",
3163                                                         path)));
3164         }
3165 }
3166
3167 /*
3168  * Remove previous backup history files.  This also retries creation of
3169  * .ready files for any backup history files for which XLogArchiveNotify
3170  * failed earlier.
3171  */
3172 static void
3173 CleanupBackupHistory(void)
3174 {
3175         DIR                *xldir;
3176         struct dirent *xlde;
3177         char            path[MAXPGPATH];
3178
3179         xldir = AllocateDir(XLOGDIR);
3180         if (xldir == NULL)
3181                 ereport(ERROR,
3182                                 (errcode_for_file_access(),
3183                                  errmsg("could not open transaction log directory \"%s\": %m",
3184                                                 XLOGDIR)));
3185
3186         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3187         {
3188                 if (strlen(xlde->d_name) > 24 &&
3189                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3190                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3191                                    ".backup") == 0)
3192                 {
3193                         if (XLogArchiveCheckDone(xlde->d_name))
3194                         {
3195                                 ereport(DEBUG2,
3196                                 (errmsg("removing transaction log backup history file \"%s\"",
3197                                                 xlde->d_name)));
3198                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3199                                 unlink(path);
3200                                 XLogArchiveCleanup(xlde->d_name);
3201                         }
3202                 }
3203         }
3204
3205         FreeDir(xldir);
3206 }
3207
3208 /*
3209  * Restore the backup blocks present in an XLOG record, if any.
3210  *
3211  * We assume all of the record has been read into memory at *record.
3212  *
3213  * Note: when a backup block is available in XLOG, we restore it
3214  * unconditionally, even if the page in the database appears newer.
3215  * This is to protect ourselves against database pages that were partially
3216  * or incorrectly written during a crash.  We assume that the XLOG data
3217  * must be good because it has passed a CRC check, while the database
3218  * page might not be.  This will force us to replay all subsequent
3219  * modifications of the page that appear in XLOG, rather than possibly
3220  * ignoring them as already applied, but that's not a huge drawback.
3221  *
3222  * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
3223  * Otherwise, a normal exclusive lock is used.  At the moment, that's just
3224  * pro forma, because there can't be any regular backends in the system
3225  * during recovery.  The 'cleanup' argument applies to all backup blocks
3226  * in the WAL record, that suffices for now.
3227  */
3228 void
3229 RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
3230 {
3231         Buffer          buffer;
3232         Page            page;
3233         BkpBlock        bkpb;
3234         char       *blk;
3235         int                     i;
3236
3237         if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
3238                 return;
3239
3240         blk = (char *) XLogRecGetData(record) + record->xl_len;
3241         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3242         {
3243                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3244                         continue;
3245
3246                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3247                 blk += sizeof(BkpBlock);
3248
3249                 buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
3250                                                                                 RBM_ZERO);
3251                 Assert(BufferIsValid(buffer));
3252                 if (cleanup)
3253                         LockBufferForCleanup(buffer);
3254                 else
3255                         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3256
3257                 page = (Page) BufferGetPage(buffer);
3258
3259                 if (bkpb.hole_length == 0)
3260                 {
3261                         memcpy((char *) page, blk, BLCKSZ);
3262                 }
3263                 else
3264                 {
3265                         /* must zero-fill the hole */
3266                         MemSet((char *) page, 0, BLCKSZ);
3267                         memcpy((char *) page, blk, bkpb.hole_offset);
3268                         memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
3269                                    blk + bkpb.hole_offset,
3270                                    BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3271                 }
3272
3273                 PageSetLSN(page, lsn);
3274                 PageSetTLI(page, ThisTimeLineID);
3275                 MarkBufferDirty(buffer);
3276                 UnlockReleaseBuffer(buffer);
3277
3278                 blk += BLCKSZ - bkpb.hole_length;
3279         }
3280 }
3281
3282 /*
3283  * CRC-check an XLOG record.  We do not believe the contents of an XLOG
3284  * record (other than to the minimal extent of computing the amount of
3285  * data to read in) until we've checked the CRCs.
3286  *
3287  * We assume all of the record has been read into memory at *record.
3288  */
3289 static bool
3290 RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
3291 {
3292         pg_crc32        crc;
3293         int                     i;
3294         uint32          len = record->xl_len;
3295         BkpBlock        bkpb;
3296         char       *blk;
3297
3298         /* First the rmgr data */
3299         INIT_CRC32(crc);
3300         COMP_CRC32(crc, XLogRecGetData(record), len);
3301
3302         /* Add in the backup blocks, if any */
3303         blk = (char *) XLogRecGetData(record) + len;
3304         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3305         {
3306                 uint32          blen;
3307
3308                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3309                         continue;
3310
3311                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3312                 if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3313                 {
3314                         ereport(emode,
3315                                         (errmsg("incorrect hole size in record at %X/%X",
3316                                                         recptr.xlogid, recptr.xrecoff)));
3317                         return false;
3318                 }
3319                 blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
3320                 COMP_CRC32(crc, blk, blen);
3321                 blk += blen;
3322         }
3323
3324         /* Check that xl_tot_len agrees with our calculation */
3325         if (blk != (char *) record + record->xl_tot_len)
3326         {
3327                 ereport(emode,
3328                                 (errmsg("incorrect total length in record at %X/%X",
3329                                                 recptr.xlogid, recptr.xrecoff)));
3330                 return false;
3331         }
3332
3333         /* Finally include the record header */
3334         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
3335                            SizeOfXLogRecord - sizeof(pg_crc32));
3336         FIN_CRC32(crc);
3337
3338         if (!EQ_CRC32(record->xl_crc, crc))
3339         {
3340                 ereport(emode,
3341                 (errmsg("incorrect resource manager data checksum in record at %X/%X",
3342                                 recptr.xlogid, recptr.xrecoff)));
3343                 return false;
3344         }
3345
3346         return true;
3347 }
3348
3349 /*
3350  * Attempt to read an XLOG record.
3351  *
3352  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3353  * try to read a record just after the last one previously read.
3354  *
3355  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3356  * (emode must be either PANIC or LOG.)
3357  *
3358  * The record is copied into readRecordBuf, so that on successful return,
3359  * the returned record pointer always points there.
3360  */
3361 static XLogRecord *
3362 ReadRecord(XLogRecPtr *RecPtr, int emode)
3363 {
3364         XLogRecord *record;
3365         char       *buffer;
3366         XLogRecPtr      tmpRecPtr = EndRecPtr;
3367         bool            randAccess = false;
3368         uint32          len,
3369                                 total_len;
3370         uint32          targetPageOff;
3371         uint32          targetRecOff;
3372         uint32          pageHeaderSize;
3373
3374         if (readBuf == NULL)
3375         {
3376                 /*
3377                  * First time through, permanently allocate readBuf.  We do it this
3378                  * way, rather than just making a static array, for two reasons: (1)
3379                  * no need to waste the storage in most instantiations of the backend;
3380                  * (2) a static char array isn't guaranteed to have any particular
3381                  * alignment, whereas malloc() will provide MAXALIGN'd storage.
3382                  */
3383                 readBuf = (char *) malloc(XLOG_BLCKSZ);
3384                 Assert(readBuf != NULL);
3385         }
3386
3387         if (RecPtr == NULL)
3388         {
3389                 RecPtr = &tmpRecPtr;
3390                 /* fast case if next record is on same page */
3391                 if (nextRecord != NULL)
3392                 {
3393                         record = nextRecord;
3394                         goto got_record;
3395                 }
3396                 /* align old recptr to next page */
3397                 if (tmpRecPtr.xrecoff % XLOG_BLCKSZ != 0)
3398                         tmpRecPtr.xrecoff += (XLOG_BLCKSZ - tmpRecPtr.xrecoff % XLOG_BLCKSZ);
3399                 if (tmpRecPtr.xrecoff >= XLogFileSize)
3400                 {
3401                         (tmpRecPtr.xlogid)++;
3402                         tmpRecPtr.xrecoff = 0;
3403                 }
3404                 /* We will account for page header size below */
3405         }
3406         else
3407         {
3408                 if (!XRecOffIsValid(RecPtr->xrecoff))
3409                         ereport(PANIC,
3410                                         (errmsg("invalid record offset at %X/%X",
3411                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3412
3413                 /*
3414                  * Since we are going to a random position in WAL, forget any prior
3415                  * state about what timeline we were in, and allow it to be any
3416                  * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
3417                  * to go backwards (but we can't reset that variable right here, since
3418                  * we might not change files at all).
3419                  */
3420                 lastPageTLI = 0;                /* see comment in ValidXLOGHeader */
3421                 randAccess = true;              /* allow curFileTLI to go backwards too */
3422         }
3423
3424         if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
3425         {
3426                 close(readFile);
3427                 readFile = -1;
3428         }
3429         XLByteToSeg(*RecPtr, readId, readSeg);
3430         if (readFile < 0)
3431         {
3432                 /* Now it's okay to reset curFileTLI if random fetch */
3433                 if (randAccess)
3434                         curFileTLI = 0;
3435
3436                 readFile = XLogFileRead(readId, readSeg, emode);
3437                 if (readFile < 0)
3438                         goto next_record_is_invalid;
3439
3440                 /*
3441                  * Whenever switching to a new WAL segment, we read the first page of
3442                  * the file and validate its header, even if that's not where the
3443                  * target record is.  This is so that we can check the additional
3444                  * identification info that is present in the first page's "long"
3445                  * header.
3446                  */
3447                 readOff = 0;
3448                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3449                 {
3450                         ereport(emode,
3451                                         (errcode_for_file_access(),
3452                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
3453                                                         readId, readSeg, readOff)));
3454                         goto next_record_is_invalid;
3455                 }
3456                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3457                         goto next_record_is_invalid;
3458         }
3459
3460         targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
3461         if (readOff != targetPageOff)
3462         {
3463                 readOff = targetPageOff;
3464                 if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
3465                 {
3466                         ereport(emode,
3467                                         (errcode_for_file_access(),
3468                                          errmsg("could not seek in log file %u, segment %u to offset %u: %m",
3469                                                         readId, readSeg, readOff)));
3470                         goto next_record_is_invalid;
3471                 }
3472                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3473                 {
3474                         ereport(emode,
3475                                         (errcode_for_file_access(),
3476                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
3477                                                         readId, readSeg, readOff)));
3478                         goto next_record_is_invalid;
3479                 }
3480                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3481                         goto next_record_is_invalid;
3482         }
3483         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3484         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3485         if (targetRecOff == 0)
3486         {
3487                 /*
3488                  * Can only get here in the continuing-from-prev-page case, because
3489                  * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
3490                  * to skip over the new page's header.
3491                  */
3492                 tmpRecPtr.xrecoff += pageHeaderSize;
3493                 targetRecOff = pageHeaderSize;
3494         }
3495         else if (targetRecOff < pageHeaderSize)
3496         {
3497                 ereport(emode,
3498                                 (errmsg("invalid record offset at %X/%X",
3499                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3500                 goto next_record_is_invalid;
3501         }
3502         if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3503                 targetRecOff == pageHeaderSize)
3504         {
3505                 ereport(emode,
3506                                 (errmsg("contrecord is requested by %X/%X",
3507                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3508                 goto next_record_is_invalid;
3509         }
3510         record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3511
3512 got_record:;
3513
3514         /*
3515          * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
3516          * required.
3517          */
3518         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3519         {
3520                 if (record->xl_len != 0)
3521                 {
3522                         ereport(emode,
3523                                         (errmsg("invalid xlog switch record at %X/%X",
3524                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3525                         goto next_record_is_invalid;
3526                 }
3527         }
3528         else if (record->xl_len == 0)
3529         {
3530                 ereport(emode,
3531                                 (errmsg("record with zero length at %X/%X",
3532                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3533                 goto next_record_is_invalid;
3534         }
3535         if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
3536                 record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
3537                 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
3538         {
3539                 ereport(emode,
3540                                 (errmsg("invalid record length at %X/%X",
3541                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3542                 goto next_record_is_invalid;
3543         }
3544         if (record->xl_rmid > RM_MAX_ID)
3545         {
3546                 ereport(emode,
3547                                 (errmsg("invalid resource manager ID %u at %X/%X",
3548                                                 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3549                 goto next_record_is_invalid;
3550         }
3551         if (randAccess)
3552         {
3553                 /*
3554                  * We can't exactly verify the prev-link, but surely it should be less
3555                  * than the record's own address.
3556                  */
3557                 if (!XLByteLT(record->xl_prev, *RecPtr))
3558                 {
3559                         ereport(emode,
3560                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3561                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3562                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3563                         goto next_record_is_invalid;
3564                 }
3565         }
3566         else
3567         {
3568                 /*
3569                  * Record's prev-link should exactly match our previous location. This
3570                  * check guards against torn WAL pages where a stale but valid-looking
3571                  * WAL record starts on a sector boundary.
3572                  */
3573                 if (!XLByteEQ(record->xl_prev, ReadRecPtr))
3574                 {
3575                         ereport(emode,
3576                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3577                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3578                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3579                         goto next_record_is_invalid;
3580                 }
3581         }
3582
3583         /*
3584          * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3585          * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
3586          * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
3587          * enough for all "normal" records, but very large commit or abort records
3588          * might need more space.)
3589          */
3590         total_len = record->xl_tot_len;
3591         if (total_len > readRecordBufSize)
3592         {
3593                 uint32          newSize = total_len;
3594
3595                 newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
3596                 newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3597                 if (readRecordBuf)
3598                         free(readRecordBuf);
3599                 readRecordBuf = (char *) malloc(newSize);
3600                 if (!readRecordBuf)
3601                 {
3602                         readRecordBufSize = 0;
3603                         /* We treat this as a "bogus data" condition */
3604                         ereport(emode,
3605                                         (errmsg("record length %u at %X/%X too long",
3606                                                         total_len, RecPtr->xlogid, RecPtr->xrecoff)));
3607                         goto next_record_is_invalid;
3608                 }
3609                 readRecordBufSize = newSize;
3610         }
3611
3612         buffer = readRecordBuf;
3613         nextRecord = NULL;
3614         len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
3615         if (total_len > len)
3616         {
3617                 /* Need to reassemble record */
3618                 XLogContRecord *contrecord;
3619                 uint32          gotlen = len;
3620
3621                 memcpy(buffer, record, len);
3622                 record = (XLogRecord *) buffer;
3623                 buffer += len;
3624                 for (;;)
3625                 {
3626                         readOff += XLOG_BLCKSZ;
3627                         if (readOff >= XLogSegSize)
3628                         {
3629                                 close(readFile);
3630                                 readFile = -1;
3631                                 NextLogSeg(readId, readSeg);
3632                                 readFile = XLogFileRead(readId, readSeg, emode);
3633                                 if (readFile < 0)
3634                                         goto next_record_is_invalid;
3635                                 readOff = 0;
3636                         }
3637                         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3638                         {
3639                                 ereport(emode,
3640                                                 (errcode_for_file_access(),
3641                                                  errmsg("could not read from log file %u, segment %u, offset %u: %m",
3642                                                                 readId, readSeg, readOff)));
3643                                 goto next_record_is_invalid;
3644                         }
3645                         if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3646                                 goto next_record_is_invalid;
3647                         if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
3648                         {
3649                                 ereport(emode,
3650                                                 (errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
3651                                                                 readId, readSeg, readOff)));
3652                                 goto next_record_is_invalid;
3653                         }
3654                         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3655                         contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
3656                         if (contrecord->xl_rem_len == 0 ||
3657                                 total_len != (contrecord->xl_rem_len + gotlen))
3658                         {
3659                                 ereport(emode,
3660                                                 (errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
3661                                                                 contrecord->xl_rem_len,
3662                                                                 readId, readSeg, readOff)));
3663                                 goto next_record_is_invalid;
3664                         }
3665                         len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
3666                         if (contrecord->xl_rem_len > len)
3667                         {
3668                                 memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
3669                                 gotlen += len;
3670                                 buffer += len;
3671                                 continue;
3672                         }
3673                         memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
3674                                    contrecord->xl_rem_len);
3675                         break;
3676                 }
3677                 if (!RecordIsValid(record, *RecPtr, emode))
3678                         goto next_record_is_invalid;
3679                 pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3680                 if (XLOG_BLCKSZ - SizeOfXLogRecord >= pageHeaderSize +
3681                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len))
3682                 {
3683                         nextRecord = (XLogRecord *) ((char *) contrecord +
3684                                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len));
3685                 }
3686                 EndRecPtr.xlogid = readId;
3687                 EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
3688                         pageHeaderSize +
3689                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
3690                 ReadRecPtr = *RecPtr;
3691                 /* needn't worry about XLOG SWITCH, it can't cross page boundaries */
3692                 return record;
3693         }
3694
3695         /* Record does not cross a page boundary */
3696         if (!RecordIsValid(record, *RecPtr, emode))
3697                 goto next_record_is_invalid;
3698         if (XLOG_BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % XLOG_BLCKSZ +
3699                 MAXALIGN(total_len))
3700                 nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
3701         EndRecPtr.xlogid = RecPtr->xlogid;
3702         EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
3703         ReadRecPtr = *RecPtr;
3704         memcpy(buffer, record, total_len);
3705
3706         /*
3707          * Special processing if it's an XLOG SWITCH record
3708          */
3709         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3710         {
3711                 /* Pretend it extends to end of segment */
3712                 EndRecPtr.xrecoff += XLogSegSize - 1;
3713                 EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
3714                 nextRecord = NULL;              /* definitely not on same page */
3715
3716                 /*
3717                  * Pretend that readBuf contains the last page of the segment. This is
3718                  * just to avoid Assert failure in StartupXLOG if XLOG ends with this
3719                  * segment.
3720                  */
3721                 readOff = XLogSegSize - XLOG_BLCKSZ;
3722         }
3723         return (XLogRecord *) buffer;
3724
3725 next_record_is_invalid:;
3726         if (readFile >= 0)
3727         {
3728                 close(readFile);
3729                 readFile = -1;
3730         }
3731         nextRecord = NULL;
3732         return NULL;
3733 }
3734
3735 /*
3736  * Check whether the xlog header of a page just read in looks valid.
3737  *
3738  * This is just a convenience subroutine to avoid duplicated code in
3739  * ReadRecord.  It's not intended for use from anywhere else.
3740  */
3741 static bool
3742 ValidXLOGHeader(XLogPageHeader hdr, int emode)
3743 {
3744         XLogRecPtr      recaddr;
3745
3746         if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
3747         {
3748                 ereport(emode,
3749                                 (errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
3750                                                 hdr->xlp_magic, readId, readSeg, readOff)));
3751                 return false;
3752         }
3753         if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
3754         {
3755                 ereport(emode,
3756                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3757                                                 hdr->xlp_info, readId, readSeg, readOff)));
3758                 return false;
3759         }
3760         if (hdr->xlp_info & XLP_LONG_HEADER)
3761         {
3762                 XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
3763
3764                 if (longhdr->xlp_sysid != ControlFile->system_identifier)
3765                 {
3766                         char            fhdrident_str[32];
3767                         char            sysident_str[32];
3768
3769                         /*
3770                          * Format sysids separately to keep platform-dependent format code
3771                          * out of the translatable message string.
3772                          */
3773                         snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
3774                                          longhdr->xlp_sysid);
3775                         snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
3776                                          ControlFile->system_identifier);
3777                         ereport(emode,
3778                                         (errmsg("WAL file is from different system"),
3779                                          errdetail("WAL file SYSID is %s, pg_control SYSID is %s",
3780                                                            fhdrident_str, sysident_str)));
3781                         return false;
3782                 }
3783                 if (longhdr->xlp_seg_size != XLogSegSize)
3784                 {
3785                         ereport(emode,
3786                                         (errmsg("WAL file is from different system"),
3787                                          errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
3788                         return false;
3789                 }
3790                 if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
3791                 {
3792                         ereport(emode,
3793                                         (errmsg("WAL file is from different system"),
3794                                          errdetail("Incorrect XLOG_BLCKSZ in page header.")));
3795                         return false;
3796                 }
3797         }
3798         else if (readOff == 0)
3799         {
3800                 /* hmm, first page of file doesn't have a long header? */
3801                 ereport(emode,
3802                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3803                                                 hdr->xlp_info, readId, readSeg, readOff)));
3804                 return false;
3805         }
3806
3807         recaddr.xlogid = readId;
3808         recaddr.xrecoff = readSeg * XLogSegSize + readOff;
3809         if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
3810         {
3811                 ereport(emode,
3812                                 (errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
3813                                                 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
3814                                                 readId, readSeg, readOff)));
3815                 return false;
3816         }
3817
3818         /*
3819          * Check page TLI is one of the expected values.
3820          */
3821         if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
3822         {
3823                 ereport(emode,
3824                                 (errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
3825                                                 hdr->xlp_tli,
3826                                                 readId, readSeg, readOff)));
3827                 return false;
3828         }
3829
3830         /*
3831          * Since child timelines are always assigned a TLI greater than their
3832          * immediate parent's TLI, we should never see TLI go backwards across
3833          * successive pages of a consistent WAL sequence.
3834          *
3835          * Of course this check should only be applied when advancing sequentially
3836          * across pages; therefore ReadRecord resets lastPageTLI to zero when
3837          * going to a random page.
3838          */
3839         if (hdr->xlp_tli < lastPageTLI)
3840         {
3841                 ereport(emode,
3842                                 (errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
3843                                                 hdr->xlp_tli, lastPageTLI,
3844                                                 readId, readSeg, readOff)));
3845                 return false;
3846         }
3847         lastPageTLI = hdr->xlp_tli;
3848         return true;
3849 }
3850
3851 /*
3852  * Try to read a timeline's history file.
3853  *
3854  * If successful, return the list of component TLIs (the given TLI followed by
3855  * its ancestor TLIs).  If we can't find the history file, assume that the
3856  * timeline has no parents, and return a list of just the specified timeline
3857  * ID.
3858  */
3859 static List *
3860 readTimeLineHistory(TimeLineID targetTLI)
3861 {
3862         List       *result;
3863         char            path[MAXPGPATH];
3864         char            histfname[MAXFNAMELEN];
3865         char            fline[MAXPGPATH];
3866         FILE       *fd;
3867
3868         if (InArchiveRecovery)
3869         {
3870                 TLHistoryFileName(histfname, targetTLI);
3871                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3872         }
3873         else
3874                 TLHistoryFilePath(path, targetTLI);
3875
3876         fd = AllocateFile(path, "r");
3877         if (fd == NULL)
3878         {
3879                 if (errno != ENOENT)
3880                         ereport(FATAL,
3881                                         (errcode_for_file_access(),
3882                                          errmsg("could not open file \"%s\": %m", path)));
3883                 /* Not there, so assume no parents */
3884                 return list_make1_int((int) targetTLI);
3885         }
3886
3887         result = NIL;
3888
3889         /*
3890          * Parse the file...
3891          */
3892         while (fgets(fline, sizeof(fline), fd) != NULL)
3893         {
3894                 /* skip leading whitespace and check for # comment */
3895                 char       *ptr;
3896                 char       *endptr;
3897                 TimeLineID      tli;
3898
3899                 for (ptr = fline; *ptr; ptr++)
3900                 {
3901                         if (!isspace((unsigned char) *ptr))
3902                                 break;
3903                 }
3904                 if (*ptr == '\0' || *ptr == '#')
3905                         continue;
3906
3907                 /* expect a numeric timeline ID as first field of line */
3908                 tli = (TimeLineID) strtoul(ptr, &endptr, 0);
3909                 if (endptr == ptr)
3910                         ereport(FATAL,
3911                                         (errmsg("syntax error in history file: %s", fline),
3912                                          errhint("Expected a numeric timeline ID.")));
3913
3914                 if (result &&
3915                         tli <= (TimeLineID) linitial_int(result))
3916                         ereport(FATAL,
3917                                         (errmsg("invalid data in history file: %s", fline),
3918                                    errhint("Timeline IDs must be in increasing sequence.")));
3919
3920                 /* Build list with newest item first */
3921                 result = lcons_int((int) tli, result);
3922
3923                 /* we ignore the remainder of each line */
3924         }
3925
3926         FreeFile(fd);
3927
3928         if (result &&
3929                 targetTLI <= (TimeLineID) linitial_int(result))
3930                 ereport(FATAL,
3931                                 (errmsg("invalid data in history file \"%s\"", path),
3932                         errhint("Timeline IDs must be less than child timeline's ID.")));
3933
3934         result = lcons_int((int) targetTLI, result);
3935
3936         ereport(DEBUG3,
3937                         (errmsg_internal("history of timeline %u is %s",
3938                                                          targetTLI, nodeToString(result))));
3939
3940         return result;
3941 }
3942
3943 /*
3944  * Probe whether a timeline history file exists for the given timeline ID
3945  */
3946 static bool
3947 existsTimeLineHistory(TimeLineID probeTLI)
3948 {
3949         char            path[MAXPGPATH];
3950         char            histfname[MAXFNAMELEN];
3951         FILE       *fd;
3952
3953         if (InArchiveRecovery)
3954         {
3955                 TLHistoryFileName(histfname, probeTLI);
3956                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3957         }
3958         else
3959                 TLHistoryFilePath(path, probeTLI);
3960
3961         fd = AllocateFile(path, "r");
3962         if (fd != NULL)
3963         {
3964                 FreeFile(fd);
3965                 return true;
3966         }
3967         else
3968         {
3969                 if (errno != ENOENT)
3970                         ereport(FATAL,
3971                                         (errcode_for_file_access(),
3972                                          errmsg("could not open file \"%s\": %m", path)));
3973                 return false;
3974         }
3975 }
3976
3977 /*
3978  * Find the newest existing timeline, assuming that startTLI exists.
3979  *
3980  * Note: while this is somewhat heuristic, it does positively guarantee
3981  * that (result + 1) is not a known timeline, and therefore it should
3982  * be safe to assign that ID to a new timeline.
3983  */
3984 static TimeLineID
3985 findNewestTimeLine(TimeLineID startTLI)
3986 {
3987         TimeLineID      newestTLI;
3988         TimeLineID      probeTLI;
3989
3990         /*
3991          * The algorithm is just to probe for the existence of timeline history
3992          * files.  XXX is it useful to allow gaps in the sequence?
3993          */
3994         newestTLI = startTLI;
3995
3996         for (probeTLI = startTLI + 1;; probeTLI++)
3997         {
3998                 if (existsTimeLineHistory(probeTLI))
3999                 {
4000                         newestTLI = probeTLI;           /* probeTLI exists */
4001                 }
4002                 else
4003                 {
4004                         /* doesn't exist, assume we're done */
4005                         break;
4006                 }
4007         }
4008
4009         return newestTLI;
4010 }
4011
4012 /*
4013  * Create a new timeline history file.
4014  *
4015  *      newTLI: ID of the new timeline
4016  *      parentTLI: ID of its immediate parent
4017  *      endTLI et al: ID of the last used WAL file, for annotation purposes
4018  *
4019  * Currently this is only used during recovery, and so there are no locking
4020  * considerations.      But we should be just as tense as XLogFileInit to avoid
4021  * emplacing a bogus file.
4022  */
4023 static void
4024 writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
4025                                          TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
4026 {
4027         char            path[MAXPGPATH];
4028         char            tmppath[MAXPGPATH];
4029         char            histfname[MAXFNAMELEN];
4030         char            xlogfname[MAXFNAMELEN];
4031         char            buffer[BLCKSZ];
4032         int                     srcfd;
4033         int                     fd;
4034         int                     nbytes;
4035
4036         Assert(newTLI > parentTLI); /* else bad selection of newTLI */
4037
4038         /*
4039          * Write into a temp file name.
4040          */
4041         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
4042
4043         unlink(tmppath);
4044
4045         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
4046         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
4047                                            S_IRUSR | S_IWUSR);
4048         if (fd < 0)
4049                 ereport(ERROR,
4050                                 (errcode_for_file_access(),
4051                                  errmsg("could not create file \"%s\": %m", tmppath)));
4052
4053         /*
4054          * If a history file exists for the parent, copy it verbatim
4055          */
4056         if (InArchiveRecovery)
4057         {
4058                 TLHistoryFileName(histfname, parentTLI);
4059                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4060         }
4061         else
4062                 TLHistoryFilePath(path, parentTLI);
4063
4064         srcfd = BasicOpenFile(path, O_RDONLY, 0);
4065         if (srcfd < 0)
4066         {
4067                 if (errno != ENOENT)
4068                         ereport(ERROR,
4069                                         (errcode_for_file_access(),
4070                                          errmsg("could not open file \"%s\": %m", path)));
4071                 /* Not there, so assume parent has no parents */
4072         }
4073         else
4074         {
4075                 for (;;)
4076                 {
4077                         errno = 0;
4078                         nbytes = (int) read(srcfd, buffer, sizeof(buffer));
4079                         if (nbytes < 0 || errno != 0)
4080                                 ereport(ERROR,
4081                                                 (errcode_for_file_access(),
4082                                                  errmsg("could not read file \"%s\": %m", path)));
4083                         if (nbytes == 0)
4084                                 break;
4085                         errno = 0;
4086                         if ((int) write(fd, buffer, nbytes) != nbytes)
4087                         {
4088                                 int                     save_errno = errno;
4089
4090                                 /*
4091                                  * If we fail to make the file, delete it to release disk
4092                                  * space
4093                                  */
4094                                 unlink(tmppath);
4095
4096                                 /*
4097                                  * if write didn't set errno, assume problem is no disk space
4098                                  */
4099                                 errno = save_errno ? save_errno : ENOSPC;
4100
4101                                 ereport(ERROR,
4102                                                 (errcode_for_file_access(),
4103                                          errmsg("could not write to file \"%s\": %m", tmppath)));
4104                         }
4105                 }
4106                 close(srcfd);
4107         }
4108
4109         /*
4110          * Append one line with the details of this timeline split.
4111          *
4112          * If we did have a parent file, insert an extra newline just in case the
4113          * parent file failed to end with one.
4114          */
4115         XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);
4116
4117         snprintf(buffer, sizeof(buffer),
4118                          "%s%u\t%s\t%s transaction %u at %s\n",
4119                          (srcfd < 0) ? "" : "\n",
4120                          parentTLI,
4121                          xlogfname,
4122                          recoveryStopAfter ? "after" : "before",
4123                          recoveryStopXid,
4124                          timestamptz_to_str(recoveryStopTime));
4125
4126         nbytes = strlen(buffer);
4127         errno = 0;
4128         if ((int) write(fd, buffer, nbytes) != nbytes)
4129         {
4130                 int                     save_errno = errno;
4131
4132                 /*
4133                  * If we fail to make the file, delete it to release disk space
4134                  */
4135                 unlink(tmppath);
4136                 /* if write didn't set errno, assume problem is no disk space */
4137                 errno = save_errno ? save_errno : ENOSPC;
4138
4139                 ereport(ERROR,
4140                                 (errcode_for_file_access(),
4141                                  errmsg("could not write to file \"%s\": %m", tmppath)));
4142         }
4143
4144         if (pg_fsync(fd) != 0)
4145                 ereport(ERROR,
4146                                 (errcode_for_file_access(),
4147                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
4148
4149         if (close(fd))
4150                 ereport(ERROR,
4151                                 (errcode_for_file_access(),
4152                                  errmsg("could not close file \"%s\": %m", tmppath)));
4153
4154
4155         /*
4156          * Now move the completed history file into place with its final name.
4157          */
4158         TLHistoryFilePath(path, newTLI);
4159
4160         /*
4161          * Prefer link() to rename() here just to be really sure that we don't
4162          * overwrite an existing logfile.  However, there shouldn't be one, so
4163          * rename() is an acceptable substitute except for the truly paranoid.
4164          */
4165 #if HAVE_WORKING_LINK
4166         if (link(tmppath, path) < 0)
4167                 ereport(ERROR,
4168                                 (errcode_for_file_access(),
4169                                  errmsg("could not link file \"%s\" to \"%s\": %m",
4170                                                 tmppath, path)));
4171         unlink(tmppath);
4172 #else
4173         if (rename(tmppath, path) < 0)
4174                 ereport(ERROR,
4175                                 (errcode_for_file_access(),
4176                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4177                                                 tmppath, path)));
4178 #endif
4179
4180         /* The history file can be archived immediately. */
4181         TLHistoryFileName(histfname, newTLI);
4182         XLogArchiveNotify(histfname);
4183 }
4184
4185 /*
4186  * I/O routines for pg_control
4187  *
4188  * *ControlFile is a buffer in shared memory that holds an image of the
4189  * contents of pg_control.      WriteControlFile() initializes pg_control
4190  * given a preloaded buffer, ReadControlFile() loads the buffer from
4191  * the pg_control file (during postmaster or standalone-backend startup),
4192  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4193  *
4194  * For simplicity, WriteControlFile() initializes the fields of pg_control
4195  * that are related to checking backend/database compatibility, and
4196  * ReadControlFile() verifies they are correct.  We could split out the
4197  * I/O and compatibility-check functions, but there seems no need currently.
4198  */
4199 static void
4200 WriteControlFile(void)
4201 {
4202         int                     fd;
4203         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4204
4205         /*
4206          * Initialize version and compatibility-check fields
4207          */
4208         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4209         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4210
4211         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4212         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4213
4214         ControlFile->blcksz = BLCKSZ;
4215         ControlFile->relseg_size = RELSEG_SIZE;
4216         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4217         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4218
4219         ControlFile->nameDataLen = NAMEDATALEN;
4220         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4221
4222         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4223
4224 #ifdef HAVE_INT64_TIMESTAMP
4225         ControlFile->enableIntTimes = true;
4226 #else
4227         ControlFile->enableIntTimes = false;
4228 #endif
4229         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4230         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4231
4232         /* Contents are protected with a CRC */
4233         INIT_CRC32(ControlFile->crc);
4234         COMP_CRC32(ControlFile->crc,
4235                            (char *) ControlFile,
4236                            offsetof(ControlFileData, crc));
4237         FIN_CRC32(ControlFile->crc);
4238
4239         /*
4240          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4241          * excess over sizeof(ControlFileData).  This reduces the odds of
4242          * premature-EOF errors when reading pg_control.  We'll still fail when we
4243          * check the contents of the file, but hopefully with a more specific
4244          * error than "couldn't read pg_control".
4245          */
4246         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4247                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4248
4249         memset(buffer, 0, PG_CONTROL_SIZE);
4250         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4251
4252         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4253                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4254                                            S_IRUSR | S_IWUSR);
4255         if (fd < 0)
4256                 ereport(PANIC,
4257                                 (errcode_for_file_access(),
4258                                  errmsg("could not create control file \"%s\": %m",
4259                                                 XLOG_CONTROL_FILE)));
4260
4261         errno = 0;
4262         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4263         {
4264                 /* if write didn't set errno, assume problem is no disk space */
4265                 if (errno == 0)
4266                         errno = ENOSPC;
4267                 ereport(PANIC,
4268                                 (errcode_for_file_access(),
4269                                  errmsg("could not write to control file: %m")));
4270         }
4271
4272         if (pg_fsync(fd) != 0)
4273                 ereport(PANIC,
4274                                 (errcode_for_file_access(),
4275                                  errmsg("could not fsync control file: %m")));
4276
4277         if (close(fd))
4278                 ereport(PANIC,
4279                                 (errcode_for_file_access(),
4280                                  errmsg("could not close control file: %m")));
4281 }
4282
4283 static void
4284 ReadControlFile(void)
4285 {
4286         pg_crc32        crc;
4287         int                     fd;
4288
4289         /*
4290          * Read data...
4291          */
4292         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4293                                            O_RDWR | PG_BINARY,
4294                                            S_IRUSR | S_IWUSR);
4295         if (fd < 0)
4296                 ereport(PANIC,
4297                                 (errcode_for_file_access(),
4298                                  errmsg("could not open control file \"%s\": %m",
4299                                                 XLOG_CONTROL_FILE)));
4300
4301         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4302                 ereport(PANIC,
4303                                 (errcode_for_file_access(),
4304                                  errmsg("could not read from control file: %m")));
4305
4306         close(fd);
4307
4308         /*
4309          * Check for expected pg_control format version.  If this is wrong, the
4310          * CRC check will likely fail because we'll be checking the wrong number
4311          * of bytes.  Complaining about wrong version will probably be more
4312          * enlightening than complaining about wrong CRC.
4313          */
4314
4315         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4316                 ereport(FATAL,
4317                                 (errmsg("database files are incompatible with server"),
4318                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4319                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4320                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4321                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4322                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4323
4324         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4325                 ereport(FATAL,
4326                                 (errmsg("database files are incompatible with server"),
4327                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4328                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4329                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4330                                  errhint("It looks like you need to initdb.")));
4331
4332         /* Now check the CRC. */
4333         INIT_CRC32(crc);
4334         COMP_CRC32(crc,
4335                            (char *) ControlFile,
4336                            offsetof(ControlFileData, crc));
4337         FIN_CRC32(crc);
4338
4339         if (!EQ_CRC32(crc, ControlFile->crc))
4340                 ereport(FATAL,
4341                                 (errmsg("incorrect checksum in control file")));
4342
4343         /*
4344          * Do compatibility checking immediately.  If the database isn't
4345          * compatible with the backend executable, we want to abort before we can
4346          * possibly do any damage.
4347          */
4348         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4349                 ereport(FATAL,
4350                                 (errmsg("database files are incompatible with server"),
4351                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4352                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4353                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4354                                  errhint("It looks like you need to initdb.")));
4355         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4356                 ereport(FATAL,
4357                                 (errmsg("database files are incompatible with server"),
4358                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4359                                          " but the server was compiled with MAXALIGN %d.",
4360                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4361                                  errhint("It looks like you need to initdb.")));
4362         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4363                 ereport(FATAL,
4364                                 (errmsg("database files are incompatible with server"),
4365                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4366                                  errhint("It looks like you need to initdb.")));
4367         if (ControlFile->blcksz != BLCKSZ)
4368                 ereport(FATAL,
4369                                 (errmsg("database files are incompatible with server"),
4370                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4371                                            " but the server was compiled with BLCKSZ %d.",
4372                                            ControlFile->blcksz, BLCKSZ),
4373                                  errhint("It looks like you need to recompile or initdb.")));
4374         if (ControlFile->relseg_size != RELSEG_SIZE)
4375                 ereport(FATAL,
4376                                 (errmsg("database files are incompatible with server"),
4377                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4378                                   " but the server was compiled with RELSEG_SIZE %d.",
4379                                   ControlFile->relseg_size, RELSEG_SIZE),
4380                                  errhint("It looks like you need to recompile or initdb.")));
4381         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4382                 ereport(FATAL,
4383                                 (errmsg("database files are incompatible with server"),
4384                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4385                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4386                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4387                                  errhint("It looks like you need to recompile or initdb.")));
4388         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4389                 ereport(FATAL,
4390                                 (errmsg("database files are incompatible with server"),
4391                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4392                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4393                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4394                                  errhint("It looks like you need to recompile or initdb.")));
4395         if (ControlFile->nameDataLen != NAMEDATALEN)
4396                 ereport(FATAL,
4397                                 (errmsg("database files are incompatible with server"),
4398                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4399                                   " but the server was compiled with NAMEDATALEN %d.",
4400                                   ControlFile->nameDataLen, NAMEDATALEN),
4401                                  errhint("It looks like you need to recompile or initdb.")));
4402         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4403                 ereport(FATAL,
4404                                 (errmsg("database files are incompatible with server"),
4405                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4406                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4407                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4408                                  errhint("It looks like you need to recompile or initdb.")));
4409         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4410                 ereport(FATAL,
4411                                 (errmsg("database files are incompatible with server"),
4412                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4413                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4414                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4415                                  errhint("It looks like you need to recompile or initdb.")));
4416
4417 #ifdef HAVE_INT64_TIMESTAMP
4418         if (ControlFile->enableIntTimes != true)
4419                 ereport(FATAL,
4420                                 (errmsg("database files are incompatible with server"),
4421                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4422                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4423                                  errhint("It looks like you need to recompile or initdb.")));
4424 #else
4425         if (ControlFile->enableIntTimes != false)
4426                 ereport(FATAL,
4427                                 (errmsg("database files are incompatible with server"),
4428                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4429                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4430                                  errhint("It looks like you need to recompile or initdb.")));
4431 #endif
4432
4433 #ifdef USE_FLOAT4_BYVAL
4434         if (ControlFile->float4ByVal != true)
4435                 ereport(FATAL,
4436                                 (errmsg("database files are incompatible with server"),
4437                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4438                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4439                                  errhint("It looks like you need to recompile or initdb.")));
4440 #else
4441         if (ControlFile->float4ByVal != false)
4442                 ereport(FATAL,
4443                                 (errmsg("database files are incompatible with server"),
4444                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4445                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4446                                  errhint("It looks like you need to recompile or initdb.")));
4447 #endif
4448
4449 #ifdef USE_FLOAT8_BYVAL
4450         if (ControlFile->float8ByVal != true)
4451                 ereport(FATAL,
4452                                 (errmsg("database files are incompatible with server"),
4453                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4454                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4455                                  errhint("It looks like you need to recompile or initdb.")));
4456 #else
4457         if (ControlFile->float8ByVal != false)
4458                 ereport(FATAL,
4459                                 (errmsg("database files are incompatible with server"),
4460                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4461                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4462                                  errhint("It looks like you need to recompile or initdb.")));
4463 #endif
4464 }
4465
4466 void
4467 UpdateControlFile(void)
4468 {
4469         int                     fd;
4470
4471         INIT_CRC32(ControlFile->crc);
4472         COMP_CRC32(ControlFile->crc,
4473                            (char *) ControlFile,
4474                            offsetof(ControlFileData, crc));
4475         FIN_CRC32(ControlFile->crc);
4476
4477         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4478                                            O_RDWR | PG_BINARY,
4479                                            S_IRUSR | S_IWUSR);
4480         if (fd < 0)
4481                 ereport(PANIC,
4482                                 (errcode_for_file_access(),
4483                                  errmsg("could not open control file \"%s\": %m",
4484                                                 XLOG_CONTROL_FILE)));
4485
4486         errno = 0;
4487         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4488         {
4489                 /* if write didn't set errno, assume problem is no disk space */
4490                 if (errno == 0)
4491                         errno = ENOSPC;
4492                 ereport(PANIC,
4493                                 (errcode_for_file_access(),
4494                                  errmsg("could not write to control file: %m")));
4495         }
4496
4497         if (pg_fsync(fd) != 0)
4498                 ereport(PANIC,
4499                                 (errcode_for_file_access(),
4500                                  errmsg("could not fsync control file: %m")));
4501
4502         if (close(fd))
4503                 ereport(PANIC,
4504                                 (errcode_for_file_access(),
4505                                  errmsg("could not close control file: %m")));
4506 }
4507
4508 /*
4509  * Initialization of shared memory for XLOG
4510  */
4511 Size
4512 XLOGShmemSize(void)
4513 {
4514         Size            size;
4515
4516         /* XLogCtl */
4517         size = sizeof(XLogCtlData);
4518         /* xlblocks array */
4519         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4520         /* extra alignment padding for XLOG I/O buffers */
4521         size = add_size(size, ALIGNOF_XLOG_BUFFER);
4522         /* and the buffers themselves */
4523         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4524
4525         /*
4526          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4527          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4528          * routine again below to compute the actual allocation size.
4529          */
4530
4531         return size;
4532 }
4533
4534 void
4535 XLOGShmemInit(void)
4536 {
4537         bool            foundCFile,
4538                                 foundXLog;
4539         char       *allocptr;
4540
4541         ControlFile = (ControlFileData *)
4542                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4543         XLogCtl = (XLogCtlData *)
4544                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4545
4546         if (foundCFile || foundXLog)
4547         {
4548                 /* both should be present or neither */
4549                 Assert(foundCFile && foundXLog);
4550                 return;
4551         }
4552
4553         memset(XLogCtl, 0, sizeof(XLogCtlData));
4554
4555         /*
4556          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4557          * multiple of the alignment for same, so no extra alignment padding is
4558          * needed here.
4559          */
4560         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4561         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4562         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4563         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4564
4565         /*
4566          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
4567          */
4568         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
4569         XLogCtl->pages = allocptr;
4570         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4571
4572         /*
4573          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4574          * in additional info.)
4575          */
4576         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4577         XLogCtl->SharedRecoveryInProgress = true;
4578         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
4579         SpinLockInit(&XLogCtl->info_lck);
4580
4581         /*
4582          * If we are not in bootstrap mode, pg_control should already exist. Read
4583          * and validate it immediately (see comments in ReadControlFile() for the
4584          * reasons why).
4585          */
4586         if (!IsBootstrapProcessingMode())
4587                 ReadControlFile();
4588 }
4589
4590 /*
4591  * This func must be called ONCE on system install.  It creates pg_control
4592  * and the initial XLOG segment.
4593  */
4594 void
4595 BootStrapXLOG(void)
4596 {
4597         CheckPoint      checkPoint;
4598         char       *buffer;
4599         XLogPageHeader page;
4600         XLogLongPageHeader longpage;
4601         XLogRecord *record;
4602         bool            use_existent;
4603         uint64          sysidentifier;
4604         struct timeval tv;
4605         pg_crc32        crc;
4606
4607         /*
4608          * Select a hopefully-unique system identifier code for this installation.
4609          * We use the result of gettimeofday(), including the fractional seconds
4610          * field, as being about as unique as we can easily get.  (Think not to
4611          * use random(), since it hasn't been seeded and there's no portable way
4612          * to seed it other than the system clock value...)  The upper half of the
4613          * uint64 value is just the tv_sec part, while the lower half is the XOR
4614          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
4615          * unnecessarily if "uint64" is really only 32 bits wide.  A person
4616          * knowing this encoding can determine the initialization time of the
4617          * installation, which could perhaps be useful sometimes.
4618          */
4619         gettimeofday(&tv, NULL);
4620         sysidentifier = ((uint64) tv.tv_sec) << 32;
4621         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
4622
4623         /* First timeline ID is always 1 */
4624         ThisTimeLineID = 1;
4625
4626         /* page buffer must be aligned suitably for O_DIRECT */
4627         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4628         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4629         memset(page, 0, XLOG_BLCKSZ);
4630
4631         /* Set up information for the initial checkpoint record */
4632         checkPoint.redo.xlogid = 0;
4633         checkPoint.redo.xrecoff = SizeOfXLogLongPHD;
4634         checkPoint.ThisTimeLineID = ThisTimeLineID;
4635         checkPoint.nextXidEpoch = 0;
4636         checkPoint.nextXid = FirstNormalTransactionId;
4637         checkPoint.nextOid = FirstBootstrapObjectId;
4638         checkPoint.nextMulti = FirstMultiXactId;
4639         checkPoint.nextMultiOffset = 0;
4640         checkPoint.time = (pg_time_t) time(NULL);
4641
4642         ShmemVariableCache->nextXid = checkPoint.nextXid;
4643         ShmemVariableCache->nextOid = checkPoint.nextOid;
4644         ShmemVariableCache->oidCount = 0;
4645         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4646
4647         /* Set up the XLOG page header */
4648         page->xlp_magic = XLOG_PAGE_MAGIC;
4649         page->xlp_info = XLP_LONG_HEADER;
4650         page->xlp_tli = ThisTimeLineID;
4651         page->xlp_pageaddr.xlogid = 0;
4652         page->xlp_pageaddr.xrecoff = 0;
4653         longpage = (XLogLongPageHeader) page;
4654         longpage->xlp_sysid = sysidentifier;
4655         longpage->xlp_seg_size = XLogSegSize;
4656         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4657
4658         /* Insert the initial checkpoint record */
4659         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4660         record->xl_prev.xlogid = 0;
4661         record->xl_prev.xrecoff = 0;
4662         record->xl_xid = InvalidTransactionId;
4663         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4664         record->xl_len = sizeof(checkPoint);
4665         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4666         record->xl_rmid = RM_XLOG_ID;
4667         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4668
4669         INIT_CRC32(crc);
4670         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
4671         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
4672                            SizeOfXLogRecord - sizeof(pg_crc32));
4673         FIN_CRC32(crc);
4674         record->xl_crc = crc;
4675
4676         /* Create first XLOG segment file */
4677         use_existent = false;
4678         openLogFile = XLogFileInit(0, 0, &use_existent, false);
4679
4680         /* Write the first page with the initial record */
4681         errno = 0;
4682         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4683         {
4684                 /* if write didn't set errno, assume problem is no disk space */
4685                 if (errno == 0)
4686                         errno = ENOSPC;
4687                 ereport(PANIC,
4688                                 (errcode_for_file_access(),
4689                           errmsg("could not write bootstrap transaction log file: %m")));
4690         }
4691
4692         if (pg_fsync(openLogFile) != 0)
4693                 ereport(PANIC,
4694                                 (errcode_for_file_access(),
4695                           errmsg("could not fsync bootstrap transaction log file: %m")));
4696
4697         if (close(openLogFile))
4698                 ereport(PANIC,
4699                                 (errcode_for_file_access(),
4700                           errmsg("could not close bootstrap transaction log file: %m")));
4701
4702         openLogFile = -1;
4703
4704         /* Now create pg_control */
4705
4706         memset(ControlFile, 0, sizeof(ControlFileData));
4707         /* Initialize pg_control status fields */
4708         ControlFile->system_identifier = sysidentifier;
4709         ControlFile->state = DB_SHUTDOWNED;
4710         ControlFile->time = checkPoint.time;
4711         ControlFile->checkPoint = checkPoint.redo;
4712         ControlFile->checkPointCopy = checkPoint;
4713         /* some additional ControlFile fields are set in WriteControlFile() */
4714
4715         WriteControlFile();
4716
4717         /* Bootstrap the commit log, too */
4718         BootStrapCLOG();
4719         BootStrapSUBTRANS();
4720         BootStrapMultiXact();
4721
4722         pfree(buffer);
4723 }
4724
4725 static char *
4726 str_time(pg_time_t tnow)
4727 {
4728         static char buf[128];
4729
4730         pg_strftime(buf, sizeof(buf),
4731                                 "%Y-%m-%d %H:%M:%S %Z",
4732                                 pg_localtime(&tnow, log_timezone));
4733
4734         return buf;
4735 }
4736
4737 /*
4738  * See if there is a recovery command file (recovery.conf), and if so
4739  * read in parameters for archive recovery.
4740  *
4741  * XXX longer term intention is to expand this to
4742  * cater for additional parameters and controls
4743  * possibly use a flex lexer similar to the GUC one
4744  */
4745 static void
4746 readRecoveryCommandFile(void)
4747 {
4748         FILE       *fd;
4749         char            cmdline[MAXPGPATH];
4750         TimeLineID      rtli = 0;
4751         bool            rtliGiven = false;
4752         bool            syntaxError = false;
4753
4754         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4755         if (fd == NULL)
4756         {
4757                 if (errno == ENOENT)
4758                         return;                         /* not there, so no archive recovery */
4759                 ereport(FATAL,
4760                                 (errcode_for_file_access(),
4761                                  errmsg("could not open recovery command file \"%s\": %m",
4762                                                 RECOVERY_COMMAND_FILE)));
4763         }
4764
4765         ereport(LOG,
4766                         (errmsg("starting archive recovery")));
4767
4768         /*
4769          * Parse the file...
4770          */
4771         while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
4772         {
4773                 /* skip leading whitespace and check for # comment */
4774                 char       *ptr;
4775                 char       *tok1;
4776                 char       *tok2;
4777
4778                 for (ptr = cmdline; *ptr; ptr++)
4779                 {
4780                         if (!isspace((unsigned char) *ptr))
4781                                 break;
4782                 }
4783                 if (*ptr == '\0' || *ptr == '#')
4784                         continue;
4785
4786                 /* identify the quoted parameter value */
4787                 tok1 = strtok(ptr, "'");
4788                 if (!tok1)
4789                 {
4790                         syntaxError = true;
4791                         break;
4792                 }
4793                 tok2 = strtok(NULL, "'");
4794                 if (!tok2)
4795                 {
4796                         syntaxError = true;
4797                         break;
4798                 }
4799                 /* reparse to get just the parameter name */
4800                 tok1 = strtok(ptr, " \t=");
4801                 if (!tok1)
4802                 {
4803                         syntaxError = true;
4804                         break;
4805                 }
4806
4807                 if (strcmp(tok1, "restore_command") == 0)
4808                 {
4809                         recoveryRestoreCommand = pstrdup(tok2);
4810                         ereport(LOG,
4811                                         (errmsg("restore_command = '%s'",
4812                                                         recoveryRestoreCommand)));
4813                 }
4814                 else if (strcmp(tok1, "recovery_end_command") == 0)
4815                 {
4816                         recoveryEndCommand = pstrdup(tok2);
4817                         ereport(LOG,
4818                                         (errmsg("recovery_end_command = '%s'",
4819                                                         recoveryEndCommand)));
4820                 }
4821                 else if (strcmp(tok1, "recovery_target_timeline") == 0)
4822                 {
4823                         rtliGiven = true;
4824                         if (strcmp(tok2, "latest") == 0)
4825                                 rtli = 0;
4826                         else
4827                         {
4828                                 errno = 0;
4829                                 rtli = (TimeLineID) strtoul(tok2, NULL, 0);
4830                                 if (errno == EINVAL || errno == ERANGE)
4831                                         ereport(FATAL,
4832                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
4833                                                                         tok2)));
4834                         }
4835                         if (rtli)
4836                                 ereport(LOG,
4837                                                 (errmsg("recovery_target_timeline = %u", rtli)));
4838                         else
4839                                 ereport(LOG,
4840                                                 (errmsg("recovery_target_timeline = latest")));
4841                 }
4842                 else if (strcmp(tok1, "recovery_target_xid") == 0)
4843                 {
4844                         errno = 0;
4845                         recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
4846                         if (errno == EINVAL || errno == ERANGE)
4847                                 ereport(FATAL,
4848                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
4849                                                  tok2)));
4850                         ereport(LOG,
4851                                         (errmsg("recovery_target_xid = %u",
4852                                                         recoveryTargetXid)));
4853                         recoveryTarget = true;
4854                         recoveryTargetExact = true;
4855                 }
4856                 else if (strcmp(tok1, "recovery_target_time") == 0)
4857                 {
4858                         /*
4859                          * if recovery_target_xid specified, then this overrides
4860                          * recovery_target_time
4861                          */
4862                         if (recoveryTargetExact)
4863                                 continue;
4864                         recoveryTarget = true;
4865                         recoveryTargetExact = false;
4866
4867                         /*
4868                          * Convert the time string given by the user to TimestampTz form.
4869                          */
4870                         recoveryTargetTime =
4871                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
4872                                                                                                                 CStringGetDatum(tok2),
4873                                                                                                 ObjectIdGetDatum(InvalidOid),
4874                                                                                                                 Int32GetDatum(-1)));
4875                         ereport(LOG,
4876                                         (errmsg("recovery_target_time = '%s'",
4877                                                         timestamptz_to_str(recoveryTargetTime))));
4878                 }
4879                 else if (strcmp(tok1, "recovery_target_inclusive") == 0)
4880                 {
4881                         /*
4882                          * does nothing if a recovery_target is not also set
4883                          */
4884                         if (!parse_bool(tok2, &recoveryTargetInclusive))
4885                                 ereport(ERROR,
4886                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4887                                                  errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value")));
4888                         ereport(LOG,
4889                                         (errmsg("recovery_target_inclusive = %s", tok2)));
4890                 }
4891                 else
4892                         ereport(FATAL,
4893                                         (errmsg("unrecognized recovery parameter \"%s\"",
4894                                                         tok1)));
4895         }
4896
4897         FreeFile(fd);
4898
4899         if (syntaxError)
4900                 ereport(FATAL,
4901                                 (errmsg("syntax error in recovery command file: %s",
4902                                                 cmdline),
4903                           errhint("Lines should have the format parameter = 'value'.")));
4904
4905         /* Check that required parameters were supplied */
4906         if (recoveryRestoreCommand == NULL)
4907                 ereport(FATAL,
4908                                 (errmsg("recovery command file \"%s\" did not specify restore_command",
4909                                                 RECOVERY_COMMAND_FILE)));
4910
4911         /* Enable fetching from archive recovery area */
4912         InArchiveRecovery = true;
4913
4914         /*
4915          * If user specified recovery_target_timeline, validate it or compute the
4916          * "latest" value.      We can't do this until after we've gotten the restore
4917          * command and set InArchiveRecovery, because we need to fetch timeline
4918          * history files from the archive.
4919          */
4920         if (rtliGiven)
4921         {
4922                 if (rtli)
4923                 {
4924                         /* Timeline 1 does not have a history file, all else should */
4925                         if (rtli != 1 && !existsTimeLineHistory(rtli))
4926                                 ereport(FATAL,
4927                                                 (errmsg("recovery target timeline %u does not exist",
4928                                                                 rtli)));
4929                         recoveryTargetTLI = rtli;
4930                 }
4931                 else
4932                 {
4933                         /* We start the "latest" search from pg_control's timeline */
4934                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
4935                 }
4936         }
4937 }
4938
4939 /*
4940  * Exit archive-recovery state
      *
      * Steps, in order: clear InArchiveRecovery; update the minimum recovery
      * point a final time; close the WAL segment currently open for reading;
      * if that segment came from the archive, move it into place under its real
      * name, otherwise (when a new timeline is being started) copy the old
      * timeline's last segment; remove any .ready/.done flags for the resulting
      * segment; remove leftover RECOVERYXLOG and RECOVERYHISTORY files; and
      * rename recovery.conf to recovery.done.
      *
      * endTLI is the timeline of the last WAL segment read (the "old" timeline
      * when a new one is being established).  endLogId and endLogSeg identify
      * that segment; the Asserts below expect it to be the one open in readFile.
      *
      * A failed rename() is reported at FATAL.  unlink() failures are ignored,
      * and the result of close() is not checked.
4941  */
4942 static void
4943 exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
4944 {
4945         char            recoveryPath[MAXPGPATH];
4946         char            xlogpath[MAXPGPATH];
4947         XLogRecPtr      InvalidXLogRecPtr = {0, 0};
4948
4949         /*
4950          * We are no longer in archive recovery state.
4951          */
4952         InArchiveRecovery = false;
4953
4954         /*
4955          * Update min recovery point one last time.
              *
              * NOTE(review): presumably the all-zero pointer combined with "true"
              * forces the update regardless of the position passed in --- confirm
              * against UpdateMinRecoveryPoint().
4956          */
4957         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
4958
4959         /*
4960          * We should have the ending log segment currently open.  Verify, and then
4961          * close it (to avoid problems on Windows with trying to rename or delete
4962          * an open file).
4963          */
4964         Assert(readFile >= 0);
4965         Assert(readId == endLogId);
4966         Assert(readSeg == endLogSeg);
4967
4968         close(readFile);
4969         readFile = -1;
4970
4971         /*
4972          * If the segment was fetched from archival storage, we want to replace
4973          * the existing xlog segment (if any) with the archival version.  This is
4974          * because whatever is in XLOGDIR is very possibly older than what we have
4975          * from the archives, since it could have come from restoring a PGDATA
4976          * backup.  In any case, the archival version certainly is more
4977          * descriptive of what our current database state is, because that is what
4978          * we replayed from.
4979          *
4980          * Note that if we are establishing a new timeline, ThisTimeLineID is
4981          * already set to the new value, and so we will create a new file instead
4982          * of overwriting any existing file.  (This is, in fact, always the case
4983          * at present.)
4984          */
4985         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
4986         XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
4987
4988         if (restoredFromArchive)
4989         {
4990                 ereport(DEBUG3,
4991                                 (errmsg_internal("moving last restored xlog to \"%s\"",
4992                                                                  xlogpath)));
4993                 unlink(xlogpath);               /* might or might not exist */
4994                 if (rename(recoveryPath, xlogpath) != 0)
4995                         ereport(FATAL,
4996                                         (errcode_for_file_access(),
4997                                          errmsg("could not rename file \"%s\" to \"%s\": %m",
4998                                                         recoveryPath, xlogpath)));
4999                 /* XXX might we need to fix permissions on the file? */
5000         }
5001         else
5002         {
5003                 /*
5004                  * If the latest segment is not archival, but there's still a
5005                  * RECOVERYXLOG lying about, get rid of it.
5006                  */
5007                 unlink(recoveryPath);   /* ignore any error */
5008
5009                 /*
5010                  * If we are establishing a new timeline, we have to copy data from
5011                  * the last WAL segment of the old timeline to create a starting WAL
5012                  * segment for the new timeline.
5013                  *
5014                  * Notify the archiver that the last WAL segment of the old timeline
5015                  * is ready to copy to archival storage. Otherwise, it is not archived
5016                  * for a while.
                      *
                      * xlogpath is reused below as scratch space for the old timeline's
                      * segment name; it is rebuilt for the new timeline right after this
                      * if/else.
5017                  */
5018                 if (endTLI != ThisTimeLineID)
5019                 {
5020                         XLogFileCopy(endLogId, endLogSeg,
5021                                                  endTLI, endLogId, endLogSeg);
5022
5023                         if (XLogArchivingActive())
5024                         {
5025                                 XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
5026                                 XLogArchiveNotify(xlogpath);
5027                         }
5028                 }
5029         }
5030
5031         /*
5032          * Let's just make real sure there are not .ready or .done flags posted
5033          * for the new segment.
5034          */
5035         XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
5036         XLogArchiveCleanup(xlogpath);
5037
5038         /* Get rid of any remaining recovered timeline-history file, too */
5039         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5040         unlink(recoveryPath);           /* ignore any error */
5041
5042         /*
5043          * Rename the config file out of the way, so that we don't accidentally
5044          * re-enter archive recovery mode in a subsequent crash.
              *
              * Any stale recovery.done is unlinked first; the result of that unlink
              * is not checked, whereas a failed rename is FATAL.
5045          */
5046         unlink(RECOVERY_COMMAND_DONE);
5047         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5048                 ereport(FATAL,
5049                                 (errcode_for_file_access(),
5050                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5051                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5052
5053         ereport(LOG,
5054                         (errmsg("archive recovery complete")));
5055 }
5056
5057 /*
5058  * For point-in-time recovery, this function decides whether we want to
5059  * stop applying the XLOG at or after the current record.
5060  *
5061  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
5062  * *includeThis is set TRUE if we should apply this record before stopping.
      * On FALSE return, *includeThis is left untouched.
      *
      * Only COMMIT and ABORT records of resource manager RM_XACT_ID can trigger
      * a stop; FALSE is returned immediately for any other record.
      *
      * With an xid target (recoveryTargetExact), we stop at the record whose
      * xl_xid equals recoveryTargetXid, and recoveryTargetInclusive is copied
      * into *includeThis.  With a time target, *includeThis is always FALSE and
      * recoveryTargetInclusive instead selects whether the stopping record is
      * the first one stamped later than recoveryTargetTime, or the first one
      * stamped at or later than it.
5063  *
5064  * We also track the timestamp of the latest applied COMMIT/ABORT record
5065  * in recoveryLastXTime, for logging purposes.
5066  * Also, some information is saved in recoveryStopXid et al for use in
5067  * annotating the new timeline's history file.
5068  */
5069 static bool
5070 recoveryStopsHere(XLogRecord *record, bool *includeThis)
5071 {
5072         bool            stopsHere;
5073         uint8           record_info;
5074         TimestampTz recordXtime;
5075
5076         /* We only consider stopping at COMMIT or ABORT records */
5077         if (record->xl_rmid != RM_XACT_ID)
5078                 return false;
             /* Clear the XLR_INFO_MASK bits before comparing with XLOG_XACT_* codes */
5079         record_info = record->xl_info & ~XLR_INFO_MASK;
5080         if (record_info == XLOG_XACT_COMMIT)
5081         {
5082                 xl_xact_commit *recordXactCommitData;
5083
5084                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
5085                 recordXtime = recordXactCommitData->xact_time;
5086         }
5087         else if (record_info == XLOG_XACT_ABORT)
5088         {
5089                 xl_xact_abort *recordXactAbortData;
5090
5091                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
5092                 recordXtime = recordXactAbortData->xact_time;
5093         }
5094         else
5095                 return false;
5096
5097         /* Do we have a PITR target at all? */
5098         if (!recoveryTarget)
5099         {
                     /* No target: just remember this record's timestamp and go on */
5100                 recoveryLastXTime = recordXtime;
5101                 return false;
5102         }
5103
5104         if (recoveryTargetExact)
5105         {
5106                 /*
5107                  * there can be only one transaction end record with this exact
5108                  * transactionid
5109                  *
5110                  * when testing for an xid, we MUST test for equality only, since
5111                  * transactions are numbered in the order they start, not the order
5112                  * they complete. A higher numbered xid will complete before the
5113                  * target xid about 50% of the time...
5114                  */
5115                 stopsHere = (record->xl_xid == recoveryTargetXid);
5116                 if (stopsHere)
5117                         *includeThis = recoveryTargetInclusive;
5118         }
5119         else
5120         {
5121                 /*
5122                  * there can be many transactions that share the same commit time, so
5123                  * we stop after the last one, if we are inclusive, or stop at the
5124                  * first one if we are exclusive
                      *
                      * Either way, the record that trips the test is itself not applied
                      * (*includeThis is set to false).
5125                  */
5126                 if (recoveryTargetInclusive)
5127                         stopsHere = (recordXtime > recoveryTargetTime);
5128                 else
5129                         stopsHere = (recordXtime >= recoveryTargetTime);
5130                 if (stopsHere)
5131                         *includeThis = false;
5132         }
5133
             /*
              * When stopping, save the stopping point in recoveryStopXid,
              * recoveryStopTime and recoveryStopAfter, and log which of the four
              * cases (after/before, commit/abort) applies.
              */
5134         if (stopsHere)
5135         {
5136                 recoveryStopXid = record->xl_xid;
5137                 recoveryStopTime = recordXtime;
5138                 recoveryStopAfter = *includeThis;
5139
5140                 if (record_info == XLOG_XACT_COMMIT)
5141                 {
5142                         if (recoveryStopAfter)
5143                                 ereport(LOG,
5144                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5145                                                                 recoveryStopXid,
5146                                                                 timestamptz_to_str(recoveryStopTime))));
5147                         else
5148                                 ereport(LOG,
5149                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
5150                                                                 recoveryStopXid,
5151                                                                 timestamptz_to_str(recoveryStopTime))));
5152                 }
5153                 else
5154                 {
5155                         if (recoveryStopAfter)
5156                                 ereport(LOG,
5157                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5158                                                                 recoveryStopXid,
5159                                                                 timestamptz_to_str(recoveryStopTime))));
5160                         else
5161                                 ereport(LOG,
5162                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
5163                                                                 recoveryStopXid,
5164                                                                 timestamptz_to_str(recoveryStopTime))));
5165                 }
5166
                     /* recoveryLastXTime advances only if this record will be applied */
5167                 if (recoveryStopAfter)
5168                         recoveryLastXTime = recordXtime;
5169         }
5170         else
5171                 recoveryLastXTime = recordXtime;
5172
5173         return stopsHere;
5174 }
5175
5176 /*
5177  * This must be called ONCE during postmaster or standalone-backend startup
5178  */
5179 void
5180 StartupXLOG(void)
5181 {
5182         XLogCtlInsert *Insert;
5183         CheckPoint      checkPoint;
5184         bool            wasShutdown;
5185         bool            reachedStopPoint = false;
5186         bool            haveBackupLabel = false;
5187         XLogRecPtr      RecPtr,
5188                                 LastRec,
5189                                 checkPointLoc,
5190                                 backupStopLoc,
5191                                 EndOfLog;
5192         uint32          endLogId;
5193         uint32          endLogSeg;
5194         XLogRecord *record;
5195         uint32          freespace;
5196         TransactionId oldestActiveXID;
5197         bool            bgwriterLaunched = false;
5198
5199         /*
5200          * Read control file and check XLOG status looks valid.
5201          *
5202          * Note: in most control paths, *ControlFile is already valid and we need
5203          * not do ReadControlFile() here, but might as well do it to be sure.
5204          */
5205         ReadControlFile();
5206
5207         if (ControlFile->state < DB_SHUTDOWNED ||
5208                 ControlFile->state > DB_IN_PRODUCTION ||
5209                 !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
5210                 ereport(FATAL,
5211                                 (errmsg("control file contains invalid data")));
5212
5213         if (ControlFile->state == DB_SHUTDOWNED)
5214                 ereport(LOG,
5215                                 (errmsg("database system was shut down at %s",
5216                                                 str_time(ControlFile->time))));
5217         else if (ControlFile->state == DB_SHUTDOWNING)
5218                 ereport(LOG,
5219                                 (errmsg("database system shutdown was interrupted; last known up at %s",
5220                                                 str_time(ControlFile->time))));
5221         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
5222                 ereport(LOG,
5223                    (errmsg("database system was interrupted while in recovery at %s",
5224                                    str_time(ControlFile->time)),
5225                         errhint("This probably means that some data is corrupted and"
5226                                         " you will have to use the last backup for recovery.")));
5227         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
5228                 ereport(LOG,
5229                                 (errmsg("database system was interrupted while in recovery at log time %s",
5230                                                 str_time(ControlFile->checkPointCopy.time)),
5231                                  errhint("If this has occurred more than once some data might be corrupted"
5232                           " and you might need to choose an earlier recovery target.")));
5233         else if (ControlFile->state == DB_IN_PRODUCTION)
5234                 ereport(LOG,
5235                           (errmsg("database system was interrupted; last known up at %s",
5236                                           str_time(ControlFile->time))));
5237
5238         /* This is just to allow attaching to startup process with a debugger */
5239 #ifdef XLOG_REPLAY_DELAY
5240         if (ControlFile->state != DB_SHUTDOWNED)
5241                 pg_usleep(60000000L);
5242 #endif
5243
5244         /*
5245          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
5246          * someone has performed a copy for PITR, these directories may have been
5247          * excluded and need to be re-created.
5248          */
5249         ValidateXLOGDirectoryStructure();
5250
5251         /*
5252          * Initialize on the assumption we want to recover to the same timeline
5253          * that's active according to pg_control.
5254          */
5255         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
5256
5257         /*
5258          * Check for recovery control file, and if so set up state for offline
5259          * recovery
5260          */
5261         readRecoveryCommandFile();
5262
5263         /* Now we can determine the list of expected TLIs */
5264         expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
5265
5266         /*
5267          * If pg_control's timeline is not in expectedTLIs, then we cannot
5268          * proceed: the backup is not part of the history of the requested
5269          * timeline.
5270          */
5271         if (!list_member_int(expectedTLIs,
5272                                                  (int) ControlFile->checkPointCopy.ThisTimeLineID))
5273                 ereport(FATAL,
5274                                 (errmsg("requested timeline %u is not a child of database system timeline %u",
5275                                                 recoveryTargetTLI,
5276                                                 ControlFile->checkPointCopy.ThisTimeLineID)));
5277
5278         if (read_backup_label(&checkPointLoc, &backupStopLoc))
5279         {
5280                 /*
5281                  * When a backup_label file is present, we want to roll forward from
5282                  * the checkpoint it identifies, rather than using pg_control.
5283                  */
5284                 record = ReadCheckpointRecord(checkPointLoc, 0);
5285                 if (record != NULL)
5286                 {
5287                         ereport(DEBUG1,
5288                                         (errmsg("checkpoint record is at %X/%X",
5289                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5290                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
5291                 }
5292                 else
5293                 {
5294                         ereport(PANIC,
5295                                         (errmsg("could not locate required checkpoint record"),
5296                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5297                 }
5298                 /* set flag to delete it later */
5299                 haveBackupLabel = true;
5300         }
5301         else
5302         {
5303                 /*
5304                  * Get the last valid checkpoint record.  If the latest one according
5305                  * to pg_control is broken, try the next-to-last one.
5306                  */
5307                 checkPointLoc = ControlFile->checkPoint;
5308                 record = ReadCheckpointRecord(checkPointLoc, 1);
5309                 if (record != NULL)
5310                 {
5311                         ereport(DEBUG1,
5312                                         (errmsg("checkpoint record is at %X/%X",
5313                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5314                 }
5315                 else
5316                 {
5317                         checkPointLoc = ControlFile->prevCheckPoint;
5318                         record = ReadCheckpointRecord(checkPointLoc, 2);
5319                         if (record != NULL)
5320                         {
5321                                 ereport(LOG,
5322                                                 (errmsg("using previous checkpoint record at %X/%X",
5323                                                           checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5324                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
5325                         }
5326                         else
5327                                 ereport(PANIC,
5328                                          (errmsg("could not locate a valid checkpoint record")));
5329                 }
5330         }
5331
5332         LastRec = RecPtr = checkPointLoc;
5333         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5334         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5335
5336         ereport(DEBUG1,
5337                         (errmsg("redo record is at %X/%X; shutdown %s",
5338                                         checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
5339                                         wasShutdown ? "TRUE" : "FALSE")));
5340         ereport(DEBUG1,
5341                         (errmsg("next transaction ID: %u/%u; next OID: %u",
5342                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
5343                                         checkPoint.nextOid)));
5344         ereport(DEBUG1,
5345                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
5346                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5347         if (!TransactionIdIsNormal(checkPoint.nextXid))
5348                 ereport(PANIC,
5349                                 (errmsg("invalid next transaction ID")));
5350
5351         ShmemVariableCache->nextXid = checkPoint.nextXid;
5352         ShmemVariableCache->nextOid = checkPoint.nextOid;
5353         ShmemVariableCache->oidCount = 0;
5354         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5355
5356         /*
5357          * We must replay WAL entries using the same TimeLineID they were created
5358          * under, so temporarily adopt the TLI indicated by the checkpoint (see
5359          * also xlog_redo()).
5360          */
5361         ThisTimeLineID = checkPoint.ThisTimeLineID;
5362
5363         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5364
5365         if (XLByteLT(RecPtr, checkPoint.redo))
5366                 ereport(PANIC,
5367                                 (errmsg("invalid redo in checkpoint record")));
5368
5369         /*
5370          * Check whether we need to force recovery from WAL.  If it appears to
5371          * have been a clean shutdown and we did not have a recovery.conf file,
5372          * then assume no recovery needed.
5373          */
5374         if (XLByteLT(checkPoint.redo, RecPtr))
5375         {
5376                 if (wasShutdown)
5377                         ereport(PANIC,
5378                                         (errmsg("invalid redo record in shutdown checkpoint")));
5379                 InRecovery = true;
5380         }
5381         else if (ControlFile->state != DB_SHUTDOWNED)
5382                 InRecovery = true;
5383         else if (InArchiveRecovery)
5384         {
5385                 /* force recovery due to presence of recovery.conf */
5386                 InRecovery = true;
5387         }
5388
5389         /* REDO */
5390         if (InRecovery)
5391         {
5392                 int                     rmid;
5393
5394                 /*
5395                  * Update pg_control to show that we are recovering and to show the
5396                  * selected checkpoint as the place we are starting from. We also mark
5397                  * pg_control with any minimum recovery stop point obtained from a
5398                  * backup history file.
5399                  */
5400                 if (InArchiveRecovery)
5401                 {
5402                         ereport(LOG,
5403                                         (errmsg("automatic recovery in progress")));
5404                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
5405                 }
5406                 else
5407                 {
5408                         ereport(LOG,
5409                                         (errmsg("database system was not properly shut down; "
5410                                                         "automatic recovery in progress")));
5411                         ControlFile->state = DB_IN_CRASH_RECOVERY;
5412                 }
5413                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
5414                 ControlFile->checkPoint = checkPointLoc;
5415                 ControlFile->checkPointCopy = checkPoint;
5416                 if (backupStopLoc.xlogid != 0 || backupStopLoc.xrecoff != 0)
5417                 {
5418                         if (XLByteLT(ControlFile->minRecoveryPoint, backupStopLoc))
5419                                 ControlFile->minRecoveryPoint = backupStopLoc;
5420                 }
5421                 ControlFile->time = (pg_time_t) time(NULL);
5422                 /* No need to hold ControlFileLock yet, we aren't up far enough */
5423                 UpdateControlFile();
5424
5425                 /* initialize our local copy of minRecoveryPoint */
5426                 minRecoveryPoint = ControlFile->minRecoveryPoint;
5427
5428                 /*
5429                  * Reset pgstat data, because it may be invalid after recovery.
5430                  */
5431                 pgstat_reset_all();
5432
5433                 /*
5434                  * If there was a backup label file, it's done its job and the info
5435                  * has now been propagated into pg_control.  We must get rid of the
5436                  * label file so that if we crash during recovery, we'll pick up at
5437                  * the latest recovery restartpoint instead of going all the way back
5438                  * to the backup start point.  It seems prudent though to just rename
5439                  * the file out of the way rather than delete it completely.
5440                  */
5441                 if (haveBackupLabel)
5442                 {
5443                         unlink(BACKUP_LABEL_OLD);
5444                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
5445                                 ereport(FATAL,
5446                                                 (errcode_for_file_access(),
5447                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5448                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
5449                 }
5450
5451                 /* Initialize resource managers */
5452                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5453                 {
5454                         if (RmgrTable[rmid].rm_startup != NULL)
5455                                 RmgrTable[rmid].rm_startup();
5456                 }
5457
5458                 /*
5459                  * Find the first record that logically follows the checkpoint --- it
5460                  * might physically precede it, though.
5461                  */
5462                 if (XLByteLT(checkPoint.redo, RecPtr))
5463                 {
5464                         /* back up to find the record */
5465                         record = ReadRecord(&(checkPoint.redo), PANIC);
5466                 }
5467                 else
5468                 {
5469                         /* just have to read next record after CheckPoint */
5470                         record = ReadRecord(NULL, LOG);
5471                 }
5472
5473                 if (record != NULL)
5474                 {
5475                         bool            recoveryContinue = true;
5476                         bool            recoveryApply = true;
5477                         bool            reachedMinRecoveryPoint = false;
5478                         ErrorContextCallback errcontext;
5479
5480                         /* use volatile pointer to prevent code rearrangement */
5481                         volatile XLogCtlData *xlogctl = XLogCtl;
5482
5483                         /* initialize shared replayEndRecPtr */
5484                         SpinLockAcquire(&xlogctl->info_lck);
5485                         xlogctl->replayEndRecPtr = ReadRecPtr;
5486                         SpinLockRelease(&xlogctl->info_lck);
5487
5488                         InRedo = true;
5489
5490                         if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
5491                                 ereport(LOG,
5492                                                 (errmsg("redo starts at %X/%X",
5493                                                                 ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5494                         else
5495                                 ereport(LOG,
5496                                                 (errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
5497                                                                 ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
5498                                                 minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
5499
5500                         /*
5501                          * Let postmaster know we've started redo now, so that it can
5502                          * launch bgwriter to perform restartpoints.  We don't bother
5503                          * during crash recovery as restartpoints can only be performed
5504                          * during archive recovery.  And we'd like to keep crash recovery
5505                          * simple, to avoid introducing bugs that could prevent you from
5506                          * recovering after crash.
5507                          *
5508                          * After this point, we can no longer assume that we're the only
5509                          * process in addition to postmaster!  Also, fsync requests are
5510                          * subsequently to be handled by the bgwriter, not locally.
5511                          */
5512                         if (InArchiveRecovery && IsUnderPostmaster)
5513                         {
5514                                 SetForwardFsyncRequests();
5515                                 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
5516                                 bgwriterLaunched = true;
5517                         }
5518
5519                         /*
5520                          * main redo apply loop
5521                          */
5522                         do
5523                         {
5524 #ifdef WAL_DEBUG
5525                                 if (XLOG_DEBUG)
5526                                 {
5527                                         StringInfoData buf;
5528
5529                                         initStringInfo(&buf);
5530                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
5531                                                                          ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
5532                                                                          EndRecPtr.xlogid, EndRecPtr.xrecoff);
5533                                         xlog_outrec(&buf, record);
5534                                         appendStringInfo(&buf, " - ");
5535                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
5536                                                                                                            record->xl_info,
5537                                                                                                          XLogRecGetData(record));
5538                                         elog(LOG, "%s", buf.data);
5539                                         pfree(buf.data);
5540                                 }
5541 #endif
5542
5543                                 /*
5544                                  * Check if we were requested to re-read config file.
5545                                  */
5546                                 if (got_SIGHUP)
5547                                 {
5548                                         got_SIGHUP = false;
5549                                         ProcessConfigFile(PGC_SIGHUP);
5550                                 }
5551
5552                                 /*
5553                                  * Check if we were requested to exit without finishing
5554                                  * recovery.
5555                                  */
5556                                 if (shutdown_requested)
5557                                         proc_exit(1);
5558
5559                                 /*
5560                                  * Have we passed our safe starting point? If so, we can tell
5561                                  * postmaster that the database is consistent now.
5562                                  */
5563                                 if (!reachedMinRecoveryPoint &&
5564                                         XLByteLT(minRecoveryPoint, EndRecPtr))
5565                                 {
5566                                         reachedMinRecoveryPoint = true;
5567                                         if (InArchiveRecovery)
5568                                         {
5569                                                 ereport(LOG,
5570                                                           (errmsg("consistent recovery state reached")));
5571                                                 if (IsUnderPostmaster)
5572                                                         SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
5573                                         }
5574                                 }
5575
5576                                 /*
5577                                  * Have we reached our recovery target?
5578                                  */
5579                                 if (recoveryStopsHere(record, &recoveryApply))
5580                                 {
5581                                         reachedStopPoint = true;        /* see below */
5582                                         recoveryContinue = false;
5583                                         if (!recoveryApply)
5584                                                 break;
5585                                 }
5586
5587                                 /* Setup error traceback support for ereport() */
5588                                 errcontext.callback = rm_redo_error_callback;
5589                                 errcontext.arg = (void *) record;
5590                                 errcontext.previous = error_context_stack;
5591                                 error_context_stack = &errcontext;
5592
5593                                 /* nextXid must be beyond record's xid */
5594                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
5595                                                                                                  ShmemVariableCache->nextXid))
5596                                 {
5597                                         ShmemVariableCache->nextXid = record->xl_xid;
5598                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
5599                                 }
5600
5601                                 /*
5602                                  * Update shared replayEndRecPtr before replaying this record,
5603                                  * so that XLogFlush will update minRecoveryPoint correctly.
5604                                  */
5605                                 SpinLockAcquire(&xlogctl->info_lck);
5606                                 xlogctl->replayEndRecPtr = EndRecPtr;
5607                                 SpinLockRelease(&xlogctl->info_lck);
5608
5609                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
5610
5611                                 /* Pop the error context stack */
5612                                 error_context_stack = errcontext.previous;
5613
5614                                 LastRec = ReadRecPtr;
5615
5616                                 record = ReadRecord(NULL, LOG);
5617                         } while (record != NULL && recoveryContinue);
5618
5619                         /*
5620                          * end of main redo apply loop
5621                          */
5622
5623                         ereport(LOG,
5624                                         (errmsg("redo done at %X/%X",
5625                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5626                         if (recoveryLastXTime)
5627                                 ereport(LOG,
5628                                          (errmsg("last completed transaction was at log time %s",
5629                                                          timestamptz_to_str(recoveryLastXTime))));
5630                         InRedo = false;
5631                 }
5632                 else
5633                 {
5634                         /* there are no WAL records following the checkpoint */
5635                         ereport(LOG,
5636                                         (errmsg("redo is not required")));
5637                 }
5638         }
5639
5640         /*
5641          * Re-fetch the last valid or last applied record, so we can identify the
5642          * exact endpoint of what we consider the valid portion of WAL.
5643          */
5644         record = ReadRecord(&LastRec, PANIC);
5645         EndOfLog = EndRecPtr;
5646         XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);
5647
5648         /*
5649          * Complain if we did not roll forward far enough to render the backup
5650          * dump consistent.  Note: it is indeed okay to look at the local variable
5651          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
5652          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
5653          * advanced beyond the WAL we processed.
5654          */
5655         if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
5656         {
5657                 if (reachedStopPoint)   /* stopped because of stop request */
5658                         ereport(FATAL,
5659                                         (errmsg("requested recovery stop point is before consistent recovery point")));
5660                 else    /* ran off end of WAL */
5661                         ereport(FATAL,
5662                                         (errmsg("WAL ends before consistent recovery point")));
5663         }
5664
5665         /*
5666          * Consider whether we need to assign a new timeline ID.
5667          *
5668          * If we are doing an archive recovery, we always assign a new ID.      This
5669          * handles a couple of issues.  If we stopped short of the end of WAL
5670          * during recovery, then we are clearly generating a new timeline and must
5671          * assign it a unique new ID.  Even if we ran to the end, modifying the
5672          * current last segment is problematic because it may result in trying to
5673          * overwrite an already-archived copy of that segment, and we encourage
5674          * DBAs to make their archive_commands reject that.  We can dodge the
5675          * problem by making the new active segment have a new timeline ID.
5676          *
5677          * In a normal crash recovery, we can just extend the timeline we were in.
5678          */
5679         if (InArchiveRecovery)
5680         {
5681                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
5682                 ereport(LOG,
5683                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
5684                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
5685                                                          curFileTLI, endLogId, endLogSeg);
5686         }
5687
5688         /* Save the selected TimeLineID in shared memory, too */
5689         XLogCtl->ThisTimeLineID = ThisTimeLineID;
5690
5691         /*
5692          * We are now done reading the old WAL.  Turn off archive fetching if it
5693          * was active, and make a writable copy of the last WAL segment. (Note
5694          * that we also have a copy of the last block of the old WAL in readBuf;
5695          * we will use that below.)
5696          */
5697         if (InArchiveRecovery)
5698                 exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
5699
5700         /*
5701          * Prepare to write WAL starting at EndOfLog position, and init xlog
5702          * buffer cache using the block containing the last record from the
5703          * previous incarnation.
5704          */
5705         openLogId = endLogId;
5706         openLogSeg = endLogSeg;
5707         openLogFile = XLogFileOpen(openLogId, openLogSeg);
5708         openLogOff = 0;
5709         Insert = &XLogCtl->Insert;
5710         Insert->PrevRecord = LastRec;
5711         XLogCtl->xlblocks[0].xlogid = openLogId;
5712         XLogCtl->xlblocks[0].xrecoff =
5713                 ((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
5714
5715         /*
5716          * Tricky point here: readBuf contains the *last* block that the LastRec
5717          * record spans, not the one it starts in.      The last block is indeed the
5718          * one we want to use.
5719          */
5720         Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
5721         memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
5722         Insert->currpos = (char *) Insert->currpage +
5723                 (EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
5724
5725         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
5726
5727         XLogCtl->Write.LogwrtResult = LogwrtResult;
5728         Insert->LogwrtResult = LogwrtResult;
5729         XLogCtl->LogwrtResult = LogwrtResult;
5730
5731         XLogCtl->LogwrtRqst.Write = EndOfLog;
5732         XLogCtl->LogwrtRqst.Flush = EndOfLog;
5733
5734         freespace = INSERT_FREESPACE(Insert);
5735         if (freespace > 0)
5736         {
5737                 /* Make sure rest of page is zero */
5738                 MemSet(Insert->currpos, 0, freespace);
5739                 XLogCtl->Write.curridx = 0;
5740         }
5741         else
5742         {
5743                 /*
5744                  * Whenever Write.LogwrtResult points to exactly the end of a page,
5745                  * Write.curridx must point to the *next* page (see XLogWrite()).
5746                  *
5747                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
5748                  * this is sufficient.  The first actual attempt to insert a log
5749                  * record will advance the insert state.
5750                  */
5751                 XLogCtl->Write.curridx = NextBufIdx(0);
5752         }
5753
5754         /* Pre-scan prepared transactions to find out the range of XIDs present */
5755         oldestActiveXID = PrescanPreparedTransactions();
5756
5757         if (InRecovery)
5758         {
5759                 int                     rmid;
5760
5761                 /*
5762                  * Allow resource managers to do any required cleanup.
5763                  */
5764                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5765                 {
5766                         if (RmgrTable[rmid].rm_cleanup != NULL)
5767                                 RmgrTable[rmid].rm_cleanup();
5768                 }
5769
5770                 /*
5771                  * Check to see if the XLOG sequence contained any unresolved
5772                  * references to uninitialized pages.
5773                  */
5774                 XLogCheckInvalidPages();
5775
5776                 /*
5777                  * Perform a checkpoint to update all our recovery activity to disk.
5778                  *
5779                  * Note that we write a shutdown checkpoint rather than an on-line
5780                  * one. This is not particularly critical, but since we may be
5781                  * assigning a new TLI, using a shutdown checkpoint allows us to have
5782                  * the rule that TLI only changes in shutdown checkpoints, which
5783                  * allows some extra error checking in xlog_redo.
5784                  */
5785                 if (bgwriterLaunched)
5786                         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
5787                                                           CHECKPOINT_IMMEDIATE |
5788                                                           CHECKPOINT_WAIT);
5789                 else
5790                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
5791
5792                 /*
5793                  * And finally, execute the recovery_end_command, if any.
5794                  */
5795                 if (recoveryEndCommand)
5796                         ExecuteRecoveryEndCommand();
5797         }
5798
5799         /*
5800          * Preallocate additional log files, if wanted.
5801          */
5802         PreallocXlogFiles(EndOfLog);
5803
5804         /*
5805          * Okay, we're officially UP.
5806          */
5807         InRecovery = false;
5808
5809         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
5810         ControlFile->state = DB_IN_PRODUCTION;
5811         ControlFile->time = (pg_time_t) time(NULL);
5812         UpdateControlFile();
5813         LWLockRelease(ControlFileLock);
5814
5815         /* start the archive_timeout timer running */
5816         XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
5817
5818         /* initialize shared-memory copy of latest checkpoint XID/epoch */
5819         XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
5820         XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
5821
5822         /* also initialize latestCompletedXid, to nextXid - 1 */
5823         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
5824         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
5825
5826         /* Start up the commit log and related stuff, too */
5827         StartupCLOG();
5828         StartupSUBTRANS(oldestActiveXID);
5829         StartupMultiXact();
5830
5831         /* Reload shared-memory state for prepared transactions */
5832         RecoverPreparedTransactions();
5833
5834         /* Shut down readFile facility, free space */
5835         if (readFile >= 0)
5836         {
5837                 close(readFile);
5838                 readFile = -1;
5839         }
5840         if (readBuf)
5841         {
5842                 free(readBuf);
5843                 readBuf = NULL;
5844         }
5845         if (readRecordBuf)
5846         {
5847                 free(readRecordBuf);
5848                 readRecordBuf = NULL;
5849                 readRecordBufSize = 0;
5850         }
5851
5852         /*
5853          * All done.  Allow backends to write WAL.  (Although the bool flag is
5854          * probably atomic in itself, we use the info_lck here to ensure that
5855          * there are no race conditions concerning visibility of other recent
5856          * updates to shared memory.)
5857          */
5858         {
5859                 /* use volatile pointer to prevent code rearrangement */
5860                 volatile XLogCtlData *xlogctl = XLogCtl;
5861
5862                 SpinLockAcquire(&xlogctl->info_lck);
5863                 xlogctl->SharedRecoveryInProgress = false;
5864                 SpinLockRelease(&xlogctl->info_lck);
5865         }
5866 }
5867
5868 /*
5869  * Is the system still in recovery?
5870  *
5871  * Unlike testing InRecovery, this works in any process that's connected to
5872  * shared memory.
5873  *
5874  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
5875  * variables the first time we see that recovery is finished.
5876  */
5877 bool
5878 RecoveryInProgress(void)
5879 {
5880         /*
5881          * We check shared state each time only until we leave recovery mode.
5882          * We can't re-enter recovery, so there's no need to keep checking after
5883          * the shared variable has once been seen false.
5884          */
5885         if (!LocalRecoveryInProgress)
5886                 return false;
5887         else
5888         {
5889                 /* use volatile pointer to prevent code rearrangement */
5890                 volatile XLogCtlData *xlogctl = XLogCtl;
5891
5892                 /* spinlock is essential on machines with weak memory ordering! */
5893                 SpinLockAcquire(&xlogctl->info_lck);
5894                 LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
5895                 SpinLockRelease(&xlogctl->info_lck);
5896
5897                 /*
5898                  * Initialize TimeLineID and RedoRecPtr when we discover that recovery
5899                  * is finished.  (If you change this, see also
5900                  * LocalSetXLogInsertAllowed.)
5901                  */
5902                 if (!LocalRecoveryInProgress)
5903                         InitXLOGAccess();
5904
5905                 return LocalRecoveryInProgress;
5906         }
5907 }
5908
5909 /*
5910  * Is this process allowed to insert new WAL records?
5911  *
5912  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
5913  * But we also have provisions for forcing the result "true" or "false"
5914  * within specific processes regardless of the global state.
5915  */
5916 bool
5917 XLogInsertAllowed(void)
5918 {
5919         /*
5920          * If value is "unconditionally true" or "unconditionally false",
5921          * just return it.  This provides the normal fast path once recovery
5922          * is known done.
5923          */
5924         if (LocalXLogInsertAllowed >= 0)
5925                 return (bool) LocalXLogInsertAllowed;
5926
5927         /*
5928          * Else, must check to see if we're still in recovery.
5929          */
5930         if (RecoveryInProgress())
5931                 return false;
5932
5933         /*
5934          * On exit from recovery, reset to "unconditionally true", since there
5935          * is no need to keep checking.
5936          */
5937         LocalXLogInsertAllowed = 1;
5938         return true;
5939 }
5940
5941 /*
5942  * Make XLogInsertAllowed() return true in the current process only.
5943  */
5944 static void
5945 LocalSetXLogInsertAllowed(void)
5946 {
5947         Assert(LocalXLogInsertAllowed == -1);
5948         LocalXLogInsertAllowed = 1;
5949
5950         /* Initialize as RecoveryInProgress() would do when switching state */
5951         InitXLOGAccess();
5952 }
5953
5954 /*
5955  * Subroutine to try to fetch and validate a prior checkpoint record.
5956  *
5957  * whichChkpt identifies the checkpoint (merely for reporting purposes).
5958  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
5959  */
5960 static XLogRecord *
5961 ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
5962 {
5963         XLogRecord *record;
5964
5965         if (!XRecOffIsValid(RecPtr.xrecoff))
5966         {
5967                 switch (whichChkpt)
5968                 {
5969                         case 1:
5970                                 ereport(LOG,
5971                                 (errmsg("invalid primary checkpoint link in control file")));
5972                                 break;
5973                         case 2:
5974                                 ereport(LOG,
5975                                                 (errmsg("invalid secondary checkpoint link in control file")));
5976                                 break;
5977                         default:
5978                                 ereport(LOG,
5979                                    (errmsg("invalid checkpoint link in backup_label file")));
5980                                 break;
5981                 }
5982                 return NULL;
5983         }
5984
5985         record = ReadRecord(&RecPtr, LOG);
5986
5987         if (record == NULL)
5988         {
5989                 switch (whichChkpt)
5990                 {
5991                         case 1:
5992                                 ereport(LOG,
5993                                                 (errmsg("invalid primary checkpoint record")));
5994                                 break;
5995                         case 2:
5996                                 ereport(LOG,
5997                                                 (errmsg("invalid secondary checkpoint record")));
5998                                 break;
5999                         default:
6000                                 ereport(LOG,
6001                                                 (errmsg("invalid checkpoint record")));
6002                                 break;
6003                 }
6004                 return NULL;
6005         }
6006         if (record->xl_rmid != RM_XLOG_ID)
6007         {
6008                 switch (whichChkpt)
6009                 {
6010                         case 1:
6011                                 ereport(LOG,
6012                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
6013                                 break;
6014                         case 2:
6015                                 ereport(LOG,
6016                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
6017                                 break;
6018                         default:
6019                                 ereport(LOG,
6020                                 (errmsg("invalid resource manager ID in checkpoint record")));
6021                                 break;
6022                 }
6023                 return NULL;
6024         }
6025         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
6026                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
6027         {
6028                 switch (whichChkpt)
6029                 {
6030                         case 1:
6031                                 ereport(LOG,
6032                                    (errmsg("invalid xl_info in primary checkpoint record")));
6033                                 break;
6034                         case 2:
6035                                 ereport(LOG,
6036                                  (errmsg("invalid xl_info in secondary checkpoint record")));
6037                                 break;
6038                         default:
6039                                 ereport(LOG,
6040                                                 (errmsg("invalid xl_info in checkpoint record")));
6041                                 break;
6042                 }
6043                 return NULL;
6044         }
6045         if (record->xl_len != sizeof(CheckPoint) ||
6046                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
6047         {
6048                 switch (whichChkpt)
6049                 {
6050                         case 1:
6051                                 ereport(LOG,
6052                                         (errmsg("invalid length of primary checkpoint record")));
6053                                 break;
6054                         case 2:
6055                                 ereport(LOG,
6056                                   (errmsg("invalid length of secondary checkpoint record")));
6057                                 break;
6058                         default:
6059                                 ereport(LOG,
6060                                                 (errmsg("invalid length of checkpoint record")));
6061                                 break;
6062                 }
6063                 return NULL;
6064         }
6065         return record;
6066 }
6067
6068 /*
6069  * This must be called during startup of a backend process, except that
6070  * it need not be called in a standalone backend (which does StartupXLOG
6071  * instead).  We need to initialize the local copies of ThisTimeLineID and
6072  * RedoRecPtr.
6073  *
6074  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
6075  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
6076  * unnecessary however, since the postmaster itself never touches XLOG anyway.
6077  */
6078 void
6079 InitXLOGAccess(void)
6080 {
6081         /* ThisTimeLineID doesn't change so we need no lock to copy it */
6082         ThisTimeLineID = XLogCtl->ThisTimeLineID;
6083         Assert(ThisTimeLineID != 0);
6084
6085         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
6086         (void) GetRedoRecPtr();
6087 }
6088
6089 /*
6090  * Once spawned, a backend may update its local RedoRecPtr from
6091  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
6092  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
6093  */
6094 XLogRecPtr
6095 GetRedoRecPtr(void)
6096 {
6097         /* use volatile pointer to prevent code rearrangement */
6098         volatile XLogCtlData *xlogctl = XLogCtl;
6099
6100         SpinLockAcquire(&xlogctl->info_lck);
6101         Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
6102         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
6103         SpinLockRelease(&xlogctl->info_lck);
6104
6105         return RedoRecPtr;
6106 }
6107
6108 /*
6109  * GetInsertRecPtr -- Returns the current insert position.
6110  *
6111  * NOTE: The value *actually* returned is the position of the last full
6112  * xlog page. It lags behind the real insert position by at most 1 page.
6113  * For that, we don't need to acquire WALInsertLock which can be quite
6114  * heavily contended, and an approximation is enough for the current
6115  * usage of this function.
6116  */
6117 XLogRecPtr
6118 GetInsertRecPtr(void)
6119 {
6120         /* use volatile pointer to prevent code rearrangement */
6121         volatile XLogCtlData *xlogctl = XLogCtl;
6122         XLogRecPtr      recptr;
6123
6124         SpinLockAcquire(&xlogctl->info_lck);
6125         recptr = xlogctl->LogwrtRqst.Write;
6126         SpinLockRelease(&xlogctl->info_lck);
6127
6128         return recptr;
6129 }
6130
6131 /*
6132  * Get the time of the last xlog segment switch
6133  */
6134 pg_time_t
6135 GetLastSegSwitchTime(void)
6136 {
6137         pg_time_t       result;
6138
6139         /* Need WALWriteLock, but shared lock is sufficient */
6140         LWLockAcquire(WALWriteLock, LW_SHARED);
6141         result = XLogCtl->Write.lastSegSwitchTime;
6142         LWLockRelease(WALWriteLock);
6143
6144         return result;
6145 }
6146
6147 /*
6148  * GetNextXidAndEpoch - store the current nextXid in *xid, its epoch in *epoch
6149  *
6150  * This is exported for code that would like to have 64-bit XIDs.  We don't
6151  * really support such things, but every XID within the system can be
6152  * presumed "close to" the value reported here, and that is enough to
6153  * determine the epoch belonging to it.
6154  */
6155 void
6156 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
6157 {
6158         /* use volatile pointer to prevent code rearrangement */
6159         volatile XLogCtlData *shared = XLogCtl;
6160         TransactionId baseXid;
6161         TransactionId curXid;
6162         uint32          baseEpoch;
6163
6164         /*
6165          * Checkpoint info has to be read first, else we have a race condition.
6166          */
6167         SpinLockAcquire(&shared->info_lck);
6168         baseXid = shared->ckptXid;
6169         baseEpoch = shared->ckptXidEpoch;
6170         SpinLockRelease(&shared->info_lck);
6171
6172         /* Only after that, fetch the current nextXid */
6173         curXid = ReadNewTransactionId();
6174
6175         /*
6176          * curXid is certainly logically later than baseXid.  So if it is
6177          * numerically smaller, the XID counter must have wrapped around into
6178          * the next epoch, and the epoch we report is one higher.
6179          */
6180         if (curXid < baseXid)
6181                 baseEpoch++;
6182
6183         *xid = curXid;
6184         *epoch = baseEpoch;
6185 }
6186
6187 /*
6188  * Must be called exactly ONCE, during postmaster or standalone-backend shutdown
6189  */
6190 void
6191 ShutdownXLOG(int code, Datum arg)
6192 {
6193         const int       ckptFlags = CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE;
6194
6195         ereport(LOG, (errmsg("shutting down")));
6196
6197         if (!RecoveryInProgress())
6198         {
6199                 /*
6200                  * With archiving enabled, switch away from the last XLOG file so that
6201                  * all remaining records get archived (the postmaster wakes the
6202                  * archiver one more time at the end of shutdown).  The checkpoint
6203                  * record then goes to the next XLOG file and isn't archived (yet).
6204                  */
6205                 if (XLogArchivingActive() && XLogArchiveCommandSet())
6206                         RequestXLogSwitch();
6207
6208                 CreateCheckPoint(ckptFlags);
6209         }
6210         else
6211                 CreateRestartPoint(ckptFlags); /* still in recovery */
6212         ShutdownCLOG();
6213         ShutdownSUBTRANS();
6214         ShutdownMultiXact();
6215
6216         ereport(LOG, (errmsg("database system is shut down")));
6217 }
6218
6219 /*
6220  * Log start of a checkpoint (or, when "restartpoint" is true, of a
6221  * restartpoint), spelling out which of seven known bits are set in "flags".
6222  */
6223 static void
6224 LogCheckpointStart(int flags, bool restartpoint)
6225 {
6226         /*
6227          * XXX: This is hopelessly untranslatable.  gettext_noop could take care
6228          * of the main message, but that still leaves all the flag words with no
6229          * translation.
6230          */
6231         const char *fmt = restartpoint ?
6232                 "restartpoint starting:%s%s%s%s%s%s%s" :
6233                 "checkpoint starting:%s%s%s%s%s%s%s";
6234
6235         /* one %s per flag; a flag that is not set contributes an empty string */
6236         elog(LOG, fmt,
6237                  (flags & CHECKPOINT_IS_SHUTDOWN) != 0 ? " shutdown" : "",
6238                  (flags & CHECKPOINT_END_OF_RECOVERY) != 0 ? " end-of-recovery" : "",
6239                  (flags & CHECKPOINT_IMMEDIATE) != 0 ? " immediate" : "",
6240                  (flags & CHECKPOINT_FORCE) != 0 ? " force" : "",
6241                  (flags & CHECKPOINT_WAIT) != 0 ? " wait" : "",
6242                  (flags & CHECKPOINT_CAUSE_XLOG) != 0 ? " xlog" : "",
6243                  (flags & CHECKPOINT_CAUSE_TIME) != 0 ? " time" : "");
6244 }
6245
6246 /*
6247  * Log end of a checkpoint (or restartpoint), with buffer and timing totals
6248  * taken from CheckpointStats.
6249  */
6250 static void
6251 LogCheckpointEnd(bool restartpoint)
6252 {
6253         long            writeSecs, syncSecs, totalSecs;
6254         int                     writeUsecs, syncUsecs, totalUsecs;
6255         double          pctBuffers;
6256
6257         /* The checkpoint ends now, as far as the statistics are concerned */
6258         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
6259
6260         /* Split each phase's elapsed time into seconds and microseconds */
6261         TimestampDifference(CheckpointStats.ckpt_write_t,
6262                                                 CheckpointStats.ckpt_sync_t,
6263                                                 &writeSecs, &writeUsecs);
6264         TimestampDifference(CheckpointStats.ckpt_sync_t,
6265                                                 CheckpointStats.ckpt_sync_end_t,
6266                                                 &syncSecs, &syncUsecs);
6267         TimestampDifference(CheckpointStats.ckpt_start_t,
6268                                                 CheckpointStats.ckpt_end_t,
6269                                                 &totalSecs, &totalUsecs);
6270
6271         /* Buffers written, expressed as a percentage of NBuffers */
6272         pctBuffers = (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers;
6273
6274         /* usecs / 1000 gives the milliseconds shown after the decimal point */
6275         if (!restartpoint)
6276                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
6277                          "%d transaction log file(s) added, %d removed, %d recycled; "
6278                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
6279                          CheckpointStats.ckpt_bufs_written, pctBuffers,
6280                          CheckpointStats.ckpt_segs_added,
6281                          CheckpointStats.ckpt_segs_removed,
6282                          CheckpointStats.ckpt_segs_recycled,
6283                          writeSecs, writeUsecs / 1000,
6284                          syncSecs, syncUsecs / 1000,
6285                          totalSecs, totalUsecs / 1000);
6286         else
6287                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
6288                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
6289                          CheckpointStats.ckpt_bufs_written, pctBuffers,
6290                          writeSecs, writeUsecs / 1000,
6291                          syncSecs, syncUsecs / 1000,
6292                          totalSecs, totalUsecs / 1000);
6293 }
6294
6295 /*
6296  * CreateCheckPoint -- perform a checkpoint, either during shutdown or on-the-fly
6297  *
6298  * flags is a bitwise OR of the following:
6299  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
6300  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
6301  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
6302  *              ignoring checkpoint_completion_target parameter.
6303  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
6304  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
6305  *              CHECKPOINT_END_OF_RECOVERY).
6306  *
6307  * With none of IS_SHUTDOWN, END_OF_RECOVERY or FORCE set, an idle system is
6308  * not checkpointed at all.  Any other bits in flags matter only for logging;
6309  * in particular this routine is synchronous and ignores CHECKPOINT_WAIT.
6310  */
6311 void
6312 CreateCheckPoint(int flags)
6313 {
6314         bool            shutdown;
6315         CheckPoint      checkPoint;
6316         XLogRecPtr      recptr;
6317         XLogCtlInsert *Insert = &XLogCtl->Insert;
6318         XLogRecData rdata;
6319         uint32          freespace;
6320         uint32          _logId;         /* with _logSeg: segment of the prior redo ptr */
6321         uint32          _logSeg;
6322         TransactionId *inCommitXids;
6323         int                     nInCommit;
6324
6325         /*
6326          * An end-of-recovery checkpoint is really a shutdown checkpoint, just
6327          * issued at a different time.
6328          */
6329         if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
6330                 shutdown = true;
6331         else
6332                 shutdown = false;
6333
6334         /* sanity check */
6335         if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
6336                 elog(ERROR, "can't create a checkpoint during recovery");
6337
6338         /*
6339          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
6340          * (This is just pro forma, since in the present system structure there is
6341          * only one process that is allowed to issue checkpoints at any given
6342          * time.)
6343          */
6344         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
6345
6346         /*
6347          * Prepare to accumulate statistics.
6348          *
6349          * Note: because it is possible for log_checkpoints to change while a
6350          * checkpoint proceeds, we always accumulate stats, even if
6351          * log_checkpoints is currently off.
6352          */
6353         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
6354         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
6355
6356         /*
6357          * Use a critical section to force system panic if we have trouble.
6358          */
6359         START_CRIT_SECTION();
6360
6361         if (shutdown)
6362         {
6363                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6364                 ControlFile->state = DB_SHUTDOWNING;
6365                 ControlFile->time = (pg_time_t) time(NULL);
6366                 UpdateControlFile();
6367                 LWLockRelease(ControlFileLock);
6368         }
6369
6370         /*
6371          * Let smgr prepare for checkpoint; this has to happen before we determine
6372          * the REDO pointer.  Note that smgr must not do anything that'd have to
6373          * be undone if we decide no checkpoint is needed.
6374          */
6375         smgrpreckpt();
6376
6377         /* Begin filling in the checkpoint WAL record */
6378         MemSet(&checkPoint, 0, sizeof(checkPoint));
6379         checkPoint.time = (pg_time_t) time(NULL);
6380
6381         /*
6382          * We must hold WALInsertLock while examining insert state to determine
6383          * the checkpoint REDO pointer.
6384          */
6385         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6386
6387         /*
6388          * If this isn't a shutdown or forced checkpoint, and we have not inserted
6389          * any XLOG records since the start of the last checkpoint, skip the
6390          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
6391          * when the system is idle. That wastes log space, and more importantly it
6392          * exposes us to possible loss of both current and previous checkpoint
6393          * records if the machine crashes just as we're writing the update.
6394          * (Perhaps it'd make even more sense to checkpoint only when the previous
6395          * checkpoint record is in a different xlog page?)
6396          *
6397          * We have to make two tests to determine that nothing has happened since
6398          * the start of the last checkpoint: current insertion point must match
6399          * the end of the last checkpoint record, and its redo pointer must point
6400          * to itself.
6401          */
6402         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
6403                                   CHECKPOINT_FORCE)) == 0)
6404         {
6405                 XLogRecPtr      curInsert;
6406
6407                 INSERT_RECPTR(curInsert, Insert, Insert->curridx);
6408                 if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
6409                         curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
6410                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
6411                         ControlFile->checkPoint.xlogid ==
6412                         ControlFile->checkPointCopy.redo.xlogid &&
6413                         ControlFile->checkPoint.xrecoff ==
6414                         ControlFile->checkPointCopy.redo.xrecoff)
6415                 {
6416                         LWLockRelease(WALInsertLock);
6417                         LWLockRelease(CheckpointLock);
6418                         END_CRIT_SECTION();
6419                         return;                 /* idle system: nothing to checkpoint */
6420                 }
6421         }
6422
6423         /*
6424          * Compute new REDO record ptr = location of next XLOG record.
6425          *
6426          * NB: this is NOT necessarily where the checkpoint record itself will be,
6427          * since other backends may insert more XLOG records while we're off doing
6428          * the buffer flush work.  Those XLOG records are logically after the
6429          * checkpoint, even though physically before it.  Got that?
6430          */
6431         freespace = INSERT_FREESPACE(Insert);
6432         if (freespace < SizeOfXLogRecord)
6433         {
6434                 (void) AdvanceXLInsertBuffer(false);
6435                 /* OK to ignore update return flag, since we will do flush anyway */
6436                 freespace = INSERT_FREESPACE(Insert);
6437         }
6438         INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
6439
6440         /*
6441          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
6442          * must be done while holding the insert lock AND the info_lck.
6443          *
6444          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
6445          * pointing past where it really needs to point.  This is okay; the only
6446          * consequence is that XLogInsert might back up whole buffers that it
6447          * didn't really need to.  We can't postpone advancing RedoRecPtr because
6448          * XLogInserts that happen while we are dumping buffers must assume that
6449          * their buffer changes are not included in the checkpoint.
6450          */
6451         {
6452                 /* use volatile pointer to prevent code rearrangement */
6453                 volatile XLogCtlData *xlogctl = XLogCtl;
6454
6455                 SpinLockAcquire(&xlogctl->info_lck);
6456                 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
6457                 SpinLockRelease(&xlogctl->info_lck);
6458         }
6459
6460         /*
6461          * Now we can release WAL insert lock, allowing other xacts to proceed
6462          * while we are flushing disk buffers.
6463          */
6464         LWLockRelease(WALInsertLock);
6465
6466         /*
6467          * If enabled, log checkpoint start.  We postpone this until now so as not
6468          * to log anything if we decided to skip the checkpoint.
6469          */
6470         if (log_checkpoints)
6471                 LogCheckpointStart(flags, false);
6472
6473         TRACE_POSTGRESQL_CHECKPOINT_START(flags);
6474
6475         /*
6476          * Before flushing data, we must wait for any transactions that are
6477          * currently in their commit critical sections.  If an xact inserted its
6478          * commit record into XLOG just before the REDO point, then a crash
6479          * restart from the REDO point would not replay that record, which means
6480          * that our flushing had better include the xact's update of pg_clog.  So
6481          * we wait till he's out of his commit critical section before proceeding.
6482          * See notes in RecordTransactionCommit().
6483          *
6484          * Because we've already released WALInsertLock, this test is a bit fuzzy:
6485          * it is possible that we will wait for xacts we didn't really need to
6486          * wait for.  But the delay should be short and it seems better to make
6487          * checkpoint take a bit longer than to hold locks longer than necessary.
6488          * (In fact, the whole reason we have this issue is that xact.c does
6489          * commit record XLOG insertion and clog update as two separate steps
6490          * protected by different locks, but again that seems best on grounds of
6491          * minimizing lock contention.)
6492          *
6493          * A transaction that has not yet set inCommit when we look cannot be at
6494          * risk, since he's not inserted his commit record yet; and one that's
6495          * already cleared it is not at risk either, since he's done fixing clog
6496          * and we will correctly flush the update below.  So we cannot miss any
6497          * xacts we need to wait for.
6498          */
6499         nInCommit = GetTransactionsInCommit(&inCommitXids);
6500         if (nInCommit > 0)
6501         {
6502                 do
6503                 {
6504                         pg_usleep(10000L);      /* wait for 10 msec */
6505                 } while (HaveTransactionsInCommit(inCommitXids, nInCommit));
6506         }
6507         pfree(inCommitXids);            /* released whether or not we had to wait */
6508
6509         /*
6510          * Get the other info we need for the checkpoint record.
6511          */
6512         LWLockAcquire(XidGenLock, LW_SHARED);
6513         checkPoint.nextXid = ShmemVariableCache->nextXid;
6514         LWLockRelease(XidGenLock);
6515
6516         /* Increase XID epoch if we've wrapped around since last checkpoint */
6517         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
6518         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
6519                 checkPoint.nextXidEpoch++;
6520
6521         LWLockAcquire(OidGenLock, LW_SHARED);
6522         checkPoint.nextOid = ShmemVariableCache->nextOid;
6523         if (!shutdown)
6524                 checkPoint.nextOid += ShmemVariableCache->oidCount;
6525         LWLockRelease(OidGenLock);
6526
6527         MultiXactGetCheckptMulti(shutdown,
6528                                                          &checkPoint.nextMulti,
6529                                                          &checkPoint.nextMultiOffset);
6530
6531         /*
6532          * Having constructed the checkpoint record, ensure all shmem disk buffers
6533          * and commit-log buffers are flushed to disk.
6534          *
6535          * This I/O could fail for various reasons.  If so, we will fail to
6536          * complete the checkpoint, but there is no reason to force a system
6537          * panic. Accordingly, exit critical section while doing it.
6538          */
6539         END_CRIT_SECTION();
6540
6541         CheckPointGuts(checkPoint.redo, flags);
6542
6543         START_CRIT_SECTION();
6544
6545         /*
6546          * An end-of-recovery checkpoint is created before anyone is allowed to
6547          * write WAL. To allow us to write the checkpoint record, temporarily
6548          * enable XLogInsertAllowed.
6549          */
6550         if (flags & CHECKPOINT_END_OF_RECOVERY)
6551                 LocalSetXLogInsertAllowed();
6552
6553         /*
6554          * This needs to be done after LocalSetXLogInsertAllowed(), else
6555          * ThisTimeLineID might still be uninitialized.
6556          */
6557         checkPoint.ThisTimeLineID = ThisTimeLineID;
6558
6559         /*
6560          * Now insert the checkpoint record into XLOG.
6561          */
6562         rdata.data = (char *) (&checkPoint);
6563         rdata.len = sizeof(checkPoint);
6564         rdata.buffer = InvalidBuffer;
6565         rdata.next = NULL;
6566
6567         recptr = XLogInsert(RM_XLOG_ID,
6568                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
6569                                                 XLOG_CHECKPOINT_ONLINE,
6570                                                 &rdata);
6571
6572         XLogFlush(recptr);
6573
6574         /*
6575          * We mustn't write any new WAL after a shutdown checkpoint, or it will
6576          * be overwritten at next startup.  No-one should even try, this just
6577          * allows sanity-checking.  In the case of an end-of-recovery checkpoint,
6578          * we want to just temporarily disable writing until the system has exited
6579          * recovery.
6580          */
6581         if (shutdown)
6582         {
6583                 if (flags & CHECKPOINT_END_OF_RECOVERY)
6584                         LocalXLogInsertAllowed = -1;    /* return to "check" state */
6585                 else
6586                         LocalXLogInsertAllowed = 0;             /* never again write WAL */
6587         }
6588
6589         /*
6590          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
6591          * = end of actual checkpoint record.
6592          */
6593         if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
6594                 ereport(PANIC,
6595                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
6596
6597         /*
6598          * Select point at which we can truncate the log, which we base on the
6599          * prior checkpoint's earliest info.
6600          */
6601         XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
6602
6603         /*
6604          * Update the control file.
6605          */
6606         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6607         if (shutdown)
6608                 ControlFile->state = DB_SHUTDOWNED;
6609         ControlFile->prevCheckPoint = ControlFile->checkPoint;
6610         ControlFile->checkPoint = ProcLastRecPtr;
6611         ControlFile->checkPointCopy = checkPoint;
6612         ControlFile->time = (pg_time_t) time(NULL);
6613         UpdateControlFile();
6614         LWLockRelease(ControlFileLock);
6615
6616         /* Update shared-memory copy of checkpoint XID/epoch */
6617         {
6618                 /* use volatile pointer to prevent code rearrangement */
6619                 volatile XLogCtlData *xlogctl = XLogCtl;
6620
6621                 SpinLockAcquire(&xlogctl->info_lck);
6622                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
6623                 xlogctl->ckptXid = checkPoint.nextXid;
6624                 SpinLockRelease(&xlogctl->info_lck);
6625         }
6626
6627         /*
6628          * We are now done with critical updates; no need for system panic if we
6629          * have trouble while fooling with old log segments.
6630          */
6631         END_CRIT_SECTION();
6632
6633         /*
6634          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
6635          */
6636         smgrpostckpt();
6637
6638         /*
6639          * Delete old log files (those no longer needed even for previous
6640          * checkpoint).
6641          */
6642         if (_logId || _logSeg)          /* nothing precedes segment 0/0 */
6643         {
6644                 PrevLogSeg(_logId, _logSeg);
6645                 RemoveOldXlogFiles(_logId, _logSeg, recptr);
6646         }
6647
6648         /*
6649          * Make more log segments if needed.  (Do this after recycling old log
6650          * segments, since that may supply some of the needed files.)
6651          */
6652         if (!shutdown)
6653                 PreallocXlogFiles(recptr);
6654
6655         /*
6656          * Truncate pg_subtrans if possible.  We can throw away all data before
6657          * the oldest XMIN of any running transaction.  No future transaction will
6658          * attempt to reference any pg_subtrans entry older than that (see Asserts
6659          * in subtrans.c).      During recovery, though, we mustn't do this because
6660          * StartupSUBTRANS hasn't been called yet.
6661          */
6662         if (!RecoveryInProgress())
6663                 TruncateSUBTRANS(GetOldestXmin(true, false));
6664
6665         /* All real work is done, but log before releasing lock. */
6666         if (log_checkpoints)
6667                 LogCheckpointEnd(false);
6668
6669         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
6670                                                                          NBuffers,
6671                                                                          CheckpointStats.ckpt_segs_added,
6672                                                                          CheckpointStats.ckpt_segs_removed,
6673                                                                          CheckpointStats.ckpt_segs_recycled);
6674
6675         LWLockRelease(CheckpointLock);
6676 }
6677
6678 /*
6679  * CheckPointGuts -- flush all data in shared memory to disk, and fsync.
6680  * Common to regular checkpoints (CreateCheckPoint) and recovery
6681  * restartpoints (CreateRestartPoint).  "flags" is passed on to
6682  * CheckPointBuffers only, "checkPointRedo" to CheckPointTwoPhase only.
6683  */
6684 static void
6685 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
6686 {
6687         CheckPointCLOG();
6688         CheckPointSUBTRANS();
6689         CheckPointMultiXact();
6690         CheckPointBuffers(flags);       /* performs all required fsyncs */
6691         /* 2PC state goes last: we deliberately delay it as long as possible */
6692         CheckPointTwoPhase(checkPointRedo);
6693 }
6694
6695 /*
6696  * Save a checkpoint for recovery restart if appropriate
6697  *
6698  * This function is called each time a checkpoint record is read from XLOG.
6699  * It must determine whether the checkpoint represents a safe restartpoint or
6700  * not.  If so, the checkpoint record is stashed in shared memory so that
6701  * CreateRestartPoint can consult it.  (Note that the latter function is
6702  * executed by the bgwriter, while this one will be executed by the startup
6703  * process.)
6704  */
6705 static void
6706 RecoveryRestartPoint(const CheckPoint *checkPoint)
6707 {
6708         int                     rmid;
6709
6710         /* use volatile pointer to prevent code rearrangement */
6711         volatile XLogCtlData *xlogctl = XLogCtl;
6712
6713         /*
6714          * Is it safe to checkpoint?  We must ask each of the resource managers
6715          * whether they have any partial state information that might prevent a
6716          * correct restart from this point.  If so, we skip this opportunity, but
6717          * return at the next checkpoint record for another try.
6718          */
6719         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6720         {
6721                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
6722                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
6723                         {
6724                                 elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
6725                                          rmid,
6726                                          checkPoint->redo.xlogid,
6727                                          checkPoint->redo.xrecoff);
6728                                 return;
6729                         }
6730         }
6731
6732         /*
6733          * Stash the record where bgwriter will find it for its next restartpoint.
6734          * Every store goes through the volatile pointer so it stays in the lock.
6735          */
6736         SpinLockAcquire(&xlogctl->info_lck);
6737         xlogctl->lastCheckPointRecPtr = ReadRecPtr;
6738         xlogctl->lastCheckPoint = *checkPoint;
6739         SpinLockRelease(&xlogctl->info_lck);
6740 }
6741
6742 /*
6743  * CreateRestartPoint -- establish a restartpoint if possible.
6744  *
6745  * This is similar to CreateCheckPoint, but is used during WAL recovery
6746  * to establish a point from which recovery can roll forward without
6747  * replaying the entire recovery log.  flags is handed to CheckPointGuts
6748  * and is otherwise used only for logging.
6749  *
6750  * Returns true if a new restartpoint was established, which requires that a
6751  * safe checkpoint record has been replayed since the last restartpoint.
6752  */
6753 bool
6754 CreateRestartPoint(int flags)
6755 {
6756         XLogRecPtr      lastCheckPointRecPtr;
6757         CheckPoint      lastCheckPoint;
6758
6759         /* use volatile pointer to prevent code rearrangement */
6760         volatile XLogCtlData *xlogctl = XLogCtl;
6761
6762         /*
6763          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
6764          * happens at a time.
6765          */
6766         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
6767
6768         /* Get a local copy of the last safe checkpoint record. */
6769         SpinLockAcquire(&xlogctl->info_lck);
6770         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
6771         lastCheckPoint = xlogctl->lastCheckPoint;
6772         SpinLockRelease(&xlogctl->info_lck);
6773
6774         /*
6775          * Check that we're still in recovery mode. It's ok if we exit recovery
6776          * mode after this check, the restart point is valid anyway.
6777          */
6778         if (!RecoveryInProgress())
6779         {
6780                 ereport(DEBUG2,
6781                           (errmsg("skipping restartpoint, recovery has already ended")));
6782                 LWLockRelease(CheckpointLock);
6783                 return false;
6784         }
6785
6786         /*
6787          * If the last checkpoint record we've replayed is already our last
6788          * restartpoint, we can't perform a new restart point. We still update
6789          * minRecoveryPoint in that case, so that if this is a shutdown restart
6790          * point, we won't start up earlier than before. That's not strictly
6791          * necessary, but when we get hot standby capability, it would be rather
6792          * weird if the database opened up for read-only connections at a
6793          * point-in-time before the last shutdown. Such time travel is still
6794          * possible in case of immediate shutdown, though.
6795          *
6796          * We don't explicitly advance minRecoveryPoint when we do create a
6797          * restartpoint. It's assumed that flushing the buffers will do that as a
6798          * side-effect.
6799          */
6800         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
6801                 XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
6802         {
6803                 XLogRecPtr      InvalidXLogRecPtr = {0, 0};
6804
6805                 ereport(DEBUG2,
6806                                 (errmsg("skipping restartpoint, already performed at %X/%X",
6807                                   lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
6808
6809                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
6810                 LWLockRelease(CheckpointLock);
6811                 return false;
6812         }
6813
6814         /*
6815          * Prepare to accumulate statistics.  As in CreateCheckPoint, do this
6816          * even when log_checkpoints is off: the setting can change while we
6817          * run, and LogCheckpointEnd below must not report stale counters.
6818          */
6819         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
6820         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
6821
6822         if (log_checkpoints)
6823                 LogCheckpointStart(flags, true);
6824
6825         CheckPointGuts(lastCheckPoint.redo, flags);
6826
6827         /*
6828          * Update pg_control, using current time.  Check that it still shows
6829          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
6830          * this is a quick hack to make sure nothing really bad happens if
6831          * somehow we get here after the end-of-recovery checkpoint.
6832          */
6833         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6834         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
6835                 XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
6836         {
6837                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6838                 ControlFile->checkPoint = lastCheckPointRecPtr;
6839                 ControlFile->checkPointCopy = lastCheckPoint;
6840                 ControlFile->time = (pg_time_t) time(NULL);
6841                 UpdateControlFile();
6842         }
6843         LWLockRelease(ControlFileLock);
6844
6845         /*
6846          * Currently, there is no need to truncate pg_subtrans during recovery. If
6847          * we did do that, we will need to have called StartupSUBTRANS() already
6848          * and then TruncateSUBTRANS() would go here.
6849          */
6850
6851         /* All real work is done, but log before releasing lock. */
6852         if (log_checkpoints)
6853                 LogCheckpointEnd(true);
6854
6855         ereport((log_checkpoints ? LOG : DEBUG2),
6856                         (errmsg("recovery restart point at %X/%X",
6857                                   lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
6858
6859         /* XXX this is currently BROKEN because we are in the wrong process */
6860         if (recoveryLastXTime)
6861                 ereport((log_checkpoints ? LOG : DEBUG2),
6862                                 (errmsg("last completed transaction was at log time %s",
6863                                                 timestamptz_to_str(recoveryLastXTime))));
6864
6865         LWLockRelease(CheckpointLock);
6866         return true;
6867 }
6868
6869 /*
6870  * Write a NEXTOID log record
6871  */
6872 void
6873 XLogPutNextOid(Oid nextOid)
6874 {
6875         XLogRecData rdata;
6876
6877         rdata.data = (char *) (&nextOid);
6878         rdata.len = sizeof(Oid);
6879         rdata.buffer = InvalidBuffer;
6880         rdata.next = NULL;
6881         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
6882
6883         /*
6884          * We need not flush the NEXTOID record immediately, because any of the
6885          * just-allocated OIDs could only reach disk as part of a tuple insert or
6886          * update that would have its own XLOG record that must follow the NEXTOID
6887          * record.      Therefore, the standard buffer LSN interlock applied to those
6888          * records will ensure no such OID reaches disk before the NEXTOID record
6889          * does.
6890          *
6891          * Note, however, that the above statement only covers state "within" the
6892          * database.  When we use a generated OID as a file or directory name, we
6893          * are in a sense violating the basic WAL rule, because that filesystem
6894          * change may reach disk before the NEXTOID WAL record does.  The impact
6895          * of this is that if a database crash occurs immediately afterward, we
6896          * might after restart re-generate the same OID and find that it conflicts
6897          * with the leftover file or directory.  But since for safety's sake we
6898          * always loop until finding a nonconflicting filename, this poses no real
6899          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
6900          */
6901 }
6902
6903 /*
6904  * Write an XLOG SWITCH record.
6905  *
6906  * Here we just blindly issue an XLogInsert request for the record.
6907  * All the magic happens inside XLogInsert.
6908  *
6909  * The return value is either the end+1 address of the switch record,
6910  * or the end+1 address of the prior segment if we did not need to
6911  * write a switch record because we are already at segment start.
6912  */
6913 XLogRecPtr
6914 RequestXLogSwitch(void)
6915 {
6916         XLogRecPtr      RecPtr;
6917         XLogRecData rdata;
6918
6919         /* XLOG SWITCH, alone among xlog record types, has no data */
6920         rdata.buffer = InvalidBuffer;
6921         rdata.data = NULL;
6922         rdata.len = 0;
6923         rdata.next = NULL;
6924
6925         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
6926
6927         return RecPtr;
6928 }
6929
6930 /*
6931  * XLOG resource manager's routines
6932  *
6933  * Definitions of info values are in include/catalog/pg_control.h, though
6934  * not all record types are related to control file updates.
6935  */
6936 void
6937 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
6938 {
6939         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6940
6941         /* Backup blocks are not used in xlog records */
6942         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
6943
6944         if (info == XLOG_NEXTOID)
6945         {
6946                 Oid                     nextOid;
6947
6948                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
6949                 if (ShmemVariableCache->nextOid < nextOid)
6950                 {
6951                         ShmemVariableCache->nextOid = nextOid;
6952                         ShmemVariableCache->oidCount = 0;
6953                 }
6954         }
6955         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
6956         {
6957                 CheckPoint      checkPoint;
6958
6959                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6960                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
6961                 ShmemVariableCache->nextXid = checkPoint.nextXid;
6962                 ShmemVariableCache->nextOid = checkPoint.nextOid;
6963                 ShmemVariableCache->oidCount = 0;
6964                 MultiXactSetNextMXact(checkPoint.nextMulti,
6965                                                           checkPoint.nextMultiOffset);
6966
6967                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
6968                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
6969                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
6970
6971                 /*
6972                  * TLI may change in a shutdown checkpoint, but it shouldn't decrease
6973                  */
6974                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
6975                 {
6976                         if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
6977                                 !list_member_int(expectedTLIs,
6978                                                                  (int) checkPoint.ThisTimeLineID))
6979                                 ereport(PANIC,
6980                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
6981                                                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
6982                         /* Following WAL records should be run with new TLI */
6983                         ThisTimeLineID = checkPoint.ThisTimeLineID;
6984                 }
6985
6986                 RecoveryRestartPoint(&checkPoint);
6987         }
6988         else if (info == XLOG_CHECKPOINT_ONLINE)
6989         {
6990                 CheckPoint      checkPoint;
6991
6992                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6993                 /* In an ONLINE checkpoint, treat the counters like NEXTOID */
6994                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
6995                                                                   checkPoint.nextXid))
6996                         ShmemVariableCache->nextXid = checkPoint.nextXid;
6997                 if (ShmemVariableCache->nextOid < checkPoint.nextOid)
6998                 {
6999                         ShmemVariableCache->nextOid = checkPoint.nextOid;
7000                         ShmemVariableCache->oidCount = 0;
7001                 }
7002                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
7003                                                                   checkPoint.nextMultiOffset);
7004
7005                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
7006                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
7007                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
7008
7009                 /* TLI should not change in an on-line checkpoint */
7010                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
7011                         ereport(PANIC,
7012                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
7013                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
7014
7015                 RecoveryRestartPoint(&checkPoint);
7016         }
7017         else if (info == XLOG_NOOP)
7018         {
7019                 /* nothing to do here */
7020         }
7021         else if (info == XLOG_SWITCH)
7022         {
7023                 /* nothing to do here */
7024         }
7025 }
7026
7027 void
7028 xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
7029 {
7030         uint8           info = xl_info & ~XLR_INFO_MASK;
7031
7032         if (info == XLOG_CHECKPOINT_SHUTDOWN ||
7033                 info == XLOG_CHECKPOINT_ONLINE)
7034         {
7035                 CheckPoint *checkpoint = (CheckPoint *) rec;
7036
7037                 appendStringInfo(buf, "checkpoint: redo %X/%X; "
7038                                                  "tli %u; xid %u/%u; oid %u; multi %u; offset %u; %s",
7039                                                  checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
7040                                                  checkpoint->ThisTimeLineID,
7041                                                  checkpoint->nextXidEpoch, checkpoint->nextXid,
7042                                                  checkpoint->nextOid,
7043                                                  checkpoint->nextMulti,
7044                                                  checkpoint->nextMultiOffset,
7045                                  (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
7046         }
7047         else if (info == XLOG_NOOP)
7048         {
7049                 appendStringInfo(buf, "xlog no-op");
7050         }
7051         else if (info == XLOG_NEXTOID)
7052         {
7053                 Oid                     nextOid;
7054
7055                 memcpy(&nextOid, rec, sizeof(Oid));
7056                 appendStringInfo(buf, "nextOid: %u", nextOid);
7057         }
7058         else if (info == XLOG_SWITCH)
7059         {
7060                 appendStringInfo(buf, "xlog switch");
7061         }
7062         else
7063                 appendStringInfo(buf, "UNKNOWN");
7064 }
7065
7066 #ifdef WAL_DEBUG
7067
7068 static void
7069 xlog_outrec(StringInfo buf, XLogRecord *record)
7070 {
7071         int                     i;
7072
7073         appendStringInfo(buf, "prev %X/%X; xid %u",
7074                                          record->xl_prev.xlogid, record->xl_prev.xrecoff,
7075                                          record->xl_xid);
7076
7077         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
7078         {
7079                 if (record->xl_info & XLR_SET_BKP_BLOCK(i))
7080                         appendStringInfo(buf, "; bkpb%d", i + 1);
7081         }
7082
7083         appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
7084 }
7085 #endif   /* WAL_DEBUG */
7086
7087
7088 /*
7089  * Return the (possible) sync flag used for opening a file, depending on the
7090  * value of the GUC wal_sync_method.
7091  */
7092 static int
7093 get_sync_bit(int method)
7094 {
7095         /* If fsync is disabled, never open in sync mode */
7096         if (!enableFsync)
7097                 return 0;
7098
7099         switch (method)
7100         {
7101                         /*
7102                          * enum values for all sync options are defined even if they are
7103                          * not supported on the current platform.  But if not, they are
7104                          * not included in the enum option array, and therefore will never
7105                          * be seen here.
7106                          */
7107                 case SYNC_METHOD_FSYNC:
7108                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
7109                 case SYNC_METHOD_FDATASYNC:
7110                         return 0;
7111 #ifdef OPEN_SYNC_FLAG
7112                 case SYNC_METHOD_OPEN:
7113                         return OPEN_SYNC_FLAG;
7114 #endif
7115 #ifdef OPEN_DATASYNC_FLAG
7116                 case SYNC_METHOD_OPEN_DSYNC:
7117                         return OPEN_DATASYNC_FLAG;
7118 #endif
7119                 default:
7120                         /* can't happen (unless we are out of sync with option array) */
7121                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
7122                         return 0;                       /* silence warning */
7123         }
7124 }
7125
7126 /*
7127  * GUC support
7128  */
7129 bool
7130 assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
7131 {
7132         if (!doit)
7133                 return true;
7134
7135         if (sync_method != new_sync_method)
7136         {
7137                 /*
7138                  * To ensure that no blocks escape unsynced, force an fsync on the
7139                  * currently open log segment (if any).  Also, if the open flag is
7140                  * changing, close the log file so it will be reopened (with new flag
7141                  * bit) at next use.
7142                  */
7143                 if (openLogFile >= 0)
7144                 {
7145                         if (pg_fsync(openLogFile) != 0)
7146                                 ereport(PANIC,
7147                                                 (errcode_for_file_access(),
7148                                                  errmsg("could not fsync log file %u, segment %u: %m",
7149                                                                 openLogId, openLogSeg)));
7150                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
7151                                 XLogFileClose();
7152                 }
7153         }
7154
7155         return true;
7156 }
7157
7158
7159 /*
7160  * Issue appropriate kind of fsync (if any) on the current XLOG output file
7161  */
7162 static void
7163 issue_xlog_fsync(void)
7164 {
7165         switch (sync_method)
7166         {
7167                 case SYNC_METHOD_FSYNC:
7168                         if (pg_fsync_no_writethrough(openLogFile) != 0)
7169                                 ereport(PANIC,
7170                                                 (errcode_for_file_access(),
7171                                                  errmsg("could not fsync log file %u, segment %u: %m",
7172                                                                 openLogId, openLogSeg)));
7173                         break;
7174 #ifdef HAVE_FSYNC_WRITETHROUGH
7175                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
7176                         if (pg_fsync_writethrough(openLogFile) != 0)
7177                                 ereport(PANIC,
7178                                                 (errcode_for_file_access(),
7179                                                  errmsg("could not fsync write-through log file %u, segment %u: %m",
7180                                                                 openLogId, openLogSeg)));
7181                         break;
7182 #endif
7183 #ifdef HAVE_FDATASYNC
7184                 case SYNC_METHOD_FDATASYNC:
7185                         if (pg_fdatasync(openLogFile) != 0)
7186                                 ereport(PANIC,
7187                                                 (errcode_for_file_access(),
7188                                         errmsg("could not fdatasync log file %u, segment %u: %m",
7189                                                    openLogId, openLogSeg)));
7190                         break;
7191 #endif
7192                 case SYNC_METHOD_OPEN:
7193                 case SYNC_METHOD_OPEN_DSYNC:
7194                         /* write synced it already */
7195                         break;
7196                 default:
7197                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
7198                         break;
7199         }
7200 }
7201
7202
7203 /*
7204  * pg_start_backup: set up for taking an on-line backup dump
7205  *
7206  * Essentially what this does is to create a backup label file in $PGDATA,
7207  * where it will be archived as part of the backup dump.  The label file
7208  * contains the user-supplied label string (typically this would be used
7209  * to tell where the backup dump will be stored) and the starting time and
7210  * starting WAL location for the dump.
7211  */
7212 Datum
7213 pg_start_backup(PG_FUNCTION_ARGS)
7214 {
7215         text       *backupid = PG_GETARG_TEXT_P(0);
7216         bool            fast = PG_GETARG_BOOL(1);
7217         char       *backupidstr;
7218         XLogRecPtr      checkpointloc;
7219         XLogRecPtr      startpoint;
7220         pg_time_t       stamp_time;
7221         char            strfbuf[128];
7222         char            xlogfilename[MAXFNAMELEN];
7223         uint32          _logId;
7224         uint32          _logSeg;
7225         struct stat stat_buf;
7226         FILE       *fp;
7227
7228         if (!superuser())
7229                 ereport(ERROR,
7230                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
7231                                  errmsg("must be superuser to run a backup")));
7232
7233         if (!XLogArchivingActive())
7234                 ereport(ERROR,
7235                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7236                                  errmsg("WAL archiving is not active"),
7237                                  errhint("archive_mode must be enabled at server start.")));
7238
7239         if (!XLogArchiveCommandSet())
7240                 ereport(ERROR,
7241                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7242                                  errmsg("WAL archiving is not active"),
7243                                  errhint("archive_command must be defined before "
7244                                                  "online backups can be made safely.")));
7245
7246         backupidstr = text_to_cstring(backupid);
7247
7248         /*
7249          * Mark backup active in shared memory.  We must do full-page WAL writes
7250          * during an on-line backup even if not doing so at other times, because
7251          * it's quite possible for the backup dump to obtain a "torn" (partially
7252          * written) copy of a database page if it reads the page concurrently with
7253          * our write to the same page.  This can be fixed as long as the first
7254          * write to the page in the WAL sequence is a full-page write. Hence, we
7255          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
7256          * are no dirty pages in shared memory that might get dumped while the
7257          * backup is in progress without having a corresponding WAL record.  (Once
7258          * the backup is complete, we need not force full-page writes anymore,
7259          * since we expect that any pages not modified during the backup interval
7260          * must have been correctly captured by the backup.)
7261          *
7262          * We must hold WALInsertLock to change the value of forcePageWrites, to
7263          * ensure adequate interlocking against XLogInsert().
7264          */
7265         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7266         if (XLogCtl->Insert.forcePageWrites)
7267         {
7268                 LWLockRelease(WALInsertLock);
7269                 ereport(ERROR,
7270                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7271                                  errmsg("a backup is already in progress"),
7272                                  errhint("Run pg_stop_backup() and try again.")));
7273         }
7274         XLogCtl->Insert.forcePageWrites = true;
7275         LWLockRelease(WALInsertLock);
7276
7277         /*
7278          * Force an XLOG file switch before the checkpoint, to ensure that the WAL
7279          * segment the checkpoint is written to doesn't contain pages with old
7280          * timeline IDs. That would otherwise happen if you called
7281          * pg_start_backup() right after restoring from a PITR archive: the first
7282          * WAL segment containing the startup checkpoint has pages in the
7283          * beginning with the old timeline ID. That can cause trouble at recovery:
7284          * we won't have a history file covering the old timeline if pg_xlog
7285          * directory was not included in the base backup and the WAL archive was
7286          * cleared too before starting the backup.
7287          */
7288         RequestXLogSwitch();
7289
7290         /* Ensure we release forcePageWrites if fail below */
7291         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
7292         {
7293                 /*
7294                  * Force a CHECKPOINT.  Aside from being necessary to prevent torn
7295                  * page problems, this guarantees that two successive backup runs will
7296                  * have different checkpoint positions and hence different history
7297                  * file names, even if nothing happened in between.
7298                  *
7299                  * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
7300                  * fast = true).  Otherwise this can take awhile.
7301                  */
7302                 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
7303                                                   (fast ? CHECKPOINT_IMMEDIATE : 0));
7304
7305                 /*
7306                  * Now we need to fetch the checkpoint record location, and also its
7307                  * REDO pointer.  The oldest point in WAL that would be needed to
7308                  * restore starting from the checkpoint is precisely the REDO pointer.
7309                  */
7310                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7311                 checkpointloc = ControlFile->checkPoint;
7312                 startpoint = ControlFile->checkPointCopy.redo;
7313                 LWLockRelease(ControlFileLock);
7314
7315                 XLByteToSeg(startpoint, _logId, _logSeg);
7316                 XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
7317
7318                 /* Use the log timezone here, not the session timezone */
7319                 stamp_time = (pg_time_t) time(NULL);
7320                 pg_strftime(strfbuf, sizeof(strfbuf),
7321                                         "%Y-%m-%d %H:%M:%S %Z",
7322                                         pg_localtime(&stamp_time, log_timezone));
7323
7324                 /*
7325                  * Check for existing backup label --- implies a backup is already
7326                  * running.  (XXX given that we checked forcePageWrites above, maybe
7327                  * it would be OK to just unlink any such label file?)
7328                  */
7329                 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
7330                 {
7331                         if (errno != ENOENT)
7332                                 ereport(ERROR,
7333                                                 (errcode_for_file_access(),
7334                                                  errmsg("could not stat file \"%s\": %m",
7335                                                                 BACKUP_LABEL_FILE)));
7336                 }
7337                 else
7338                         ereport(ERROR,
7339                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7340                                          errmsg("a backup is already in progress"),
7341                                          errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
7342                                                          BACKUP_LABEL_FILE)));
7343
7344                 /*
7345                  * Okay, write the file
7346                  */
7347                 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
7348                 if (!fp)
7349                         ereport(ERROR,
7350                                         (errcode_for_file_access(),
7351                                          errmsg("could not create file \"%s\": %m",
7352                                                         BACKUP_LABEL_FILE)));
7353                 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
7354                                 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
7355                 fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
7356                                 checkpointloc.xlogid, checkpointloc.xrecoff);
7357                 fprintf(fp, "START TIME: %s\n", strfbuf);
7358                 fprintf(fp, "LABEL: %s\n", backupidstr);
7359                 if (fflush(fp) || ferror(fp) || FreeFile(fp))
7360                         ereport(ERROR,
7361                                         (errcode_for_file_access(),
7362                                          errmsg("could not write file \"%s\": %m",
7363                                                         BACKUP_LABEL_FILE)));
7364         }
7365         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
7366
7367         /*
7368          * We're done.  As a convenience, return the starting WAL location.
7369          */
7370         snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
7371                          startpoint.xlogid, startpoint.xrecoff);
7372         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
7373 }
7374
7375 /* Error cleanup callback for pg_start_backup */
7376 static void
7377 pg_start_backup_callback(int code, Datum arg)
7378 {
7379         /* Turn off forcePageWrites on failure */
7380         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7381         XLogCtl->Insert.forcePageWrites = false;
7382         LWLockRelease(WALInsertLock);
7383 }
7384
7385 /*
7386  * pg_stop_backup: finish taking an on-line backup dump
7387  *
7388  * We remove the backup label file created by pg_start_backup, and instead
7389  * create a backup history file in pg_xlog (whence it will immediately be
7390  * archived).  The backup history file contains the same info found in
7391  * the label file, plus the backup-end time and WAL location.
7392  * Note: different from CancelBackup which just cancels online backup mode.
7393  */
7394 Datum
7395 pg_stop_backup(PG_FUNCTION_ARGS)
7396 {
7397         XLogRecPtr      startpoint;
7398         XLogRecPtr      stoppoint;
7399         pg_time_t       stamp_time;
7400         char            strfbuf[128];
7401         char            histfilepath[MAXPGPATH];
7402         char            startxlogfilename[MAXFNAMELEN];
7403         char            stopxlogfilename[MAXFNAMELEN];
7404         char            lastxlogfilename[MAXFNAMELEN];
7405         char            histfilename[MAXFNAMELEN];
7406         uint32          _logId;
7407         uint32          _logSeg;
7408         FILE       *lfp;
7409         FILE       *fp;
7410         char            ch;
7411         int                     ich;
7412         int                     seconds_before_warning;
7413         int                     waits = 0;
7414
7415         if (!superuser())
7416                 ereport(ERROR,
7417                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
7418                                  (errmsg("must be superuser to run a backup"))));
7419
7420         if (!XLogArchivingActive())
7421                 ereport(ERROR,
7422                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7423                                  errmsg("WAL archiving is not active"),
7424                                  errhint("archive_mode must be enabled at server start.")));
7425
7426         /*
7427          * OK to clear forcePageWrites
7428          */
7429         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7430         XLogCtl->Insert.forcePageWrites = false;
7431         LWLockRelease(WALInsertLock);
7432
7433         /*
7434          * Force a switch to a new xlog segment file, so that the backup is valid
7435          * as soon as archiver moves out the current segment file. We'll report
7436          * the end address of the XLOG SWITCH record as the backup stopping point.
7437          */
7438         stoppoint = RequestXLogSwitch();
7439
7440         XLByteToSeg(stoppoint, _logId, _logSeg);
7441         XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);
7442
7443         /* Use the log timezone here, not the session timezone */
7444         stamp_time = (pg_time_t) time(NULL);
7445         pg_strftime(strfbuf, sizeof(strfbuf),
7446                                 "%Y-%m-%d %H:%M:%S %Z",
7447                                 pg_localtime(&stamp_time, log_timezone));
7448
7449         /*
7450          * Open the existing label file
7451          */
7452         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
7453         if (!lfp)
7454         {
7455                 if (errno != ENOENT)
7456                         ereport(ERROR,
7457                                         (errcode_for_file_access(),
7458                                          errmsg("could not read file \"%s\": %m",
7459                                                         BACKUP_LABEL_FILE)));
7460                 ereport(ERROR,
7461                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7462                                  errmsg("a backup is not in progress")));
7463         }
7464
7465         /*
7466          * Read and parse the START WAL LOCATION line (this code is pretty crude,
7467          * but we are not expecting any variability in the file format).
7468          */
7469         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
7470                            &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
7471                            &ch) != 4 || ch != '\n')
7472                 ereport(ERROR,
7473                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7474                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7475
7476         /*
7477          * Write the backup history file
7478          */
7479         XLByteToSeg(startpoint, _logId, _logSeg);
7480         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
7481                                                   startpoint.xrecoff % XLogSegSize);
7482         fp = AllocateFile(histfilepath, "w");
7483         if (!fp)
7484                 ereport(ERROR,
7485                                 (errcode_for_file_access(),
7486                                  errmsg("could not create file \"%s\": %m",
7487                                                 histfilepath)));
7488         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
7489                         startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
7490         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
7491                         stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
7492         /* transfer remaining lines from label to history file */
7493         while ((ich = fgetc(lfp)) != EOF)
7494                 fputc(ich, fp);
7495         fprintf(fp, "STOP TIME: %s\n", strfbuf);
7496         if (fflush(fp) || ferror(fp) || FreeFile(fp))
7497                 ereport(ERROR,
7498                                 (errcode_for_file_access(),
7499                                  errmsg("could not write file \"%s\": %m",
7500                                                 histfilepath)));
7501
7502         /*
7503          * Close and remove the backup label file
7504          */
7505         if (ferror(lfp) || FreeFile(lfp))
7506                 ereport(ERROR,
7507                                 (errcode_for_file_access(),
7508                                  errmsg("could not read file \"%s\": %m",
7509                                                 BACKUP_LABEL_FILE)));
7510         if (unlink(BACKUP_LABEL_FILE) != 0)
7511                 ereport(ERROR,
7512                                 (errcode_for_file_access(),
7513                                  errmsg("could not remove file \"%s\": %m",
7514                                                 BACKUP_LABEL_FILE)));
7515
7516         /*
7517          * Clean out any no-longer-needed history files.  As a side effect, this
7518          * will post a .ready file for the newly created history file, notifying
7519          * the archiver that history file may be archived immediately.
7520          */
7521         CleanupBackupHistory();
7522
7523         /*
7524          * Wait until both the last WAL file filled during backup and the history
7525          * file have been archived.  We assume that the alphabetic sorting
7526          * property of the WAL files ensures any earlier WAL files are safely
7527          * archived as well.
7528          *
7529          * We wait forever, since archive_command is supposed to work and we
7530          * assume the admin wanted his backup to work completely. If you don't
7531          * wish to wait, you can set statement_timeout.
7532          */
7533         XLByteToPrevSeg(stoppoint, _logId, _logSeg);
7534         XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
7535
7536         XLByteToSeg(startpoint, _logId, _logSeg);
7537         BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
7538                                                   startpoint.xrecoff % XLogSegSize);
7539
7540         seconds_before_warning = 60;
7541         waits = 0;
7542
7543         while (XLogArchiveIsBusy(lastxlogfilename) ||
7544                    XLogArchiveIsBusy(histfilename))
7545         {
7546                 CHECK_FOR_INTERRUPTS();
7547
7548                 pg_usleep(1000000L);
7549
7550                 if (++waits >= seconds_before_warning)
7551                 {
7552                         seconds_before_warning *= 2;            /* This wraps in >10 years... */
7553                         ereport(WARNING,
7554                                         (errmsg("pg_stop_backup still waiting for archive to complete (%d seconds elapsed)",
7555                                                         waits)));
7556                 }
7557         }
7558
7559         /*
7560          * We're done.  As a convenience, return the ending WAL location.
7561          */
7562         snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
7563                          stoppoint.xlogid, stoppoint.xrecoff);
7564         PG_RETURN_TEXT_P(cstring_to_text(stopxlogfilename));
7565 }
7566
7567 /*
7568  * pg_switch_xlog: switch to next xlog file
7569  */
7570 Datum
7571 pg_switch_xlog(PG_FUNCTION_ARGS)
7572 {
7573         XLogRecPtr      switchpoint;
7574         char            location[MAXFNAMELEN];
7575
7576         if (!superuser())
7577                 ereport(ERROR,
7578                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
7579                          (errmsg("must be superuser to switch transaction log files"))));
7580
7581         switchpoint = RequestXLogSwitch();
7582
7583         /*
7584          * As a convenience, return the WAL location of the switch record
7585          */
7586         snprintf(location, sizeof(location), "%X/%X",
7587                          switchpoint.xlogid, switchpoint.xrecoff);
7588         PG_RETURN_TEXT_P(cstring_to_text(location));
7589 }
7590
7591 /*
7592  * Report the current WAL write location (same format as pg_start_backup etc)
7593  *
7594  * This is useful for determining how much of WAL is visible to an external
7595  * archiving process.  Note that the data before this point is written out
7596  * to the kernel, but is not necessarily synced to disk.
7597  */
7598 Datum
7599 pg_current_xlog_location(PG_FUNCTION_ARGS)
7600 {
7601         char            location[MAXFNAMELEN];
7602
7603         /* Make sure we have an up-to-date local LogwrtResult */
7604         {
7605                 /* use volatile pointer to prevent code rearrangement */
7606                 volatile XLogCtlData *xlogctl = XLogCtl;
7607
7608                 SpinLockAcquire(&xlogctl->info_lck);
7609                 LogwrtResult = xlogctl->LogwrtResult;
7610                 SpinLockRelease(&xlogctl->info_lck);
7611         }
7612
7613         snprintf(location, sizeof(location), "%X/%X",
7614                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
7615         PG_RETURN_TEXT_P(cstring_to_text(location));
7616 }
7617
7618 /*
7619  * Report the current WAL insert location (same format as pg_start_backup etc)
7620  *
7621  * This function is mostly for debugging purposes.
7622  */
7623 Datum
7624 pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
7625 {
7626         XLogCtlInsert *Insert = &XLogCtl->Insert;
7627         XLogRecPtr      current_recptr;
7628         char            location[MAXFNAMELEN];
7629
7630         /*
7631          * Get the current end-of-WAL position ... shared lock is sufficient
7632          */
7633         LWLockAcquire(WALInsertLock, LW_SHARED);
7634         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
7635         LWLockRelease(WALInsertLock);
7636
7637         snprintf(location, sizeof(location), "%X/%X",
7638                          current_recptr.xlogid, current_recptr.xrecoff);
7639         PG_RETURN_TEXT_P(cstring_to_text(location));
7640 }
7641
7642 /*
7643  * Compute an xlog file name and decimal byte offset given a WAL location,
7644  * such as is returned by pg_stop_backup() or pg_xlog_switch().
7645  *
7646  * Note that a location exactly at a segment boundary is taken to be in
7647  * the previous segment.  This is usually the right thing, since the
7648  * expected usage is to determine which xlog file(s) are ready to archive.
7649  */
7650 Datum
7651 pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
7652 {
7653         text       *location = PG_GETARG_TEXT_P(0);
7654         char       *locationstr;
7655         unsigned int uxlogid;
7656         unsigned int uxrecoff;
7657         uint32          xlogid;
7658         uint32          xlogseg;
7659         uint32          xrecoff;
7660         XLogRecPtr      locationpoint;
7661         char            xlogfilename[MAXFNAMELEN];
7662         Datum           values[2];
7663         bool            isnull[2];
7664         TupleDesc       resultTupleDesc;
7665         HeapTuple       resultHeapTuple;
7666         Datum           result;
7667
7668         /*
7669          * Read input and parse
7670          */
7671         locationstr = text_to_cstring(location);
7672
7673         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
7674                 ereport(ERROR,
7675                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
7676                                  errmsg("could not parse transaction log location \"%s\"",
7677                                                 locationstr)));
7678
7679         locationpoint.xlogid = uxlogid;
7680         locationpoint.xrecoff = uxrecoff;
7681
7682         /*
7683          * Construct a tuple descriptor for the result row.  This must match this
7684          * function's pg_proc entry!
7685          */
7686         resultTupleDesc = CreateTemplateTupleDesc(2, false);
7687         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
7688                                            TEXTOID, -1, 0);
7689         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
7690                                            INT4OID, -1, 0);
7691
7692         resultTupleDesc = BlessTupleDesc(resultTupleDesc);
7693
7694         /*
7695          * xlogfilename
7696          */
7697         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
7698         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
7699
7700         values[0] = CStringGetTextDatum(xlogfilename);
7701         isnull[0] = false;
7702
7703         /*
7704          * offset
7705          */
7706         xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;
7707
7708         values[1] = UInt32GetDatum(xrecoff);
7709         isnull[1] = false;
7710
7711         /*
7712          * Tuple jam: Having first prepared your Datums, then squash together
7713          */
7714         resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
7715
7716         result = HeapTupleGetDatum(resultHeapTuple);
7717
7718         PG_RETURN_DATUM(result);
7719 }
7720
7721 /*
7722  * Compute an xlog file name given a WAL location,
7723  * such as is returned by pg_stop_backup() or pg_xlog_switch().
7724  */
7725 Datum
7726 pg_xlogfile_name(PG_FUNCTION_ARGS)
7727 {
7728         text       *location = PG_GETARG_TEXT_P(0);
7729         char       *locationstr;
7730         unsigned int uxlogid;
7731         unsigned int uxrecoff;
7732         uint32          xlogid;
7733         uint32          xlogseg;
7734         XLogRecPtr      locationpoint;
7735         char            xlogfilename[MAXFNAMELEN];
7736
7737         locationstr = text_to_cstring(location);
7738
7739         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
7740                 ereport(ERROR,
7741                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
7742                                  errmsg("could not parse transaction log location \"%s\"",
7743                                                 locationstr)));
7744
7745         locationpoint.xlogid = uxlogid;
7746         locationpoint.xrecoff = uxrecoff;
7747
7748         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
7749         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
7750
7751         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
7752 }
7753
7754 /*
7755  * read_backup_label: check to see if a backup_label file is present
7756  *
7757  * If we see a backup_label during recovery, we assume that we are recovering
7758  * from a backup dump file, and we therefore roll forward from the checkpoint
7759  * identified by the label file, NOT what pg_control says.      This avoids the
7760  * problem that pg_control might have been archived one or more checkpoints
7761  * later than the start of the dump, and so if we rely on it as the start
7762  * point, we will fail to restore a consistent database state.
7763  *
7764  * We also attempt to retrieve the corresponding backup history file.
7765  * If successful, set *minRecoveryLoc to constrain valid PITR stopping
7766  * points.
7767  *
7768  * Returns TRUE if a backup_label was found (and fills the checkpoint
7769  * location into *checkPointLoc); returns FALSE if not.
7770  */
7771 static bool
7772 read_backup_label(XLogRecPtr *checkPointLoc, XLogRecPtr *minRecoveryLoc)
7773 {
7774         XLogRecPtr      startpoint;
7775         XLogRecPtr      stoppoint;
7776         char            histfilename[MAXFNAMELEN];
7777         char            histfilepath[MAXPGPATH];
7778         char            startxlogfilename[MAXFNAMELEN];
7779         char            stopxlogfilename[MAXFNAMELEN];
7780         TimeLineID      tli;
7781         uint32          _logId;
7782         uint32          _logSeg;
7783         FILE       *lfp;
7784         FILE       *fp;
7785         char            ch;
7786
7787         /* Default is to not constrain recovery stop point */
7788         minRecoveryLoc->xlogid = 0;
7789         minRecoveryLoc->xrecoff = 0;
7790
7791         /*
7792          * See if label file is present
7793          */
7794         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
7795         if (!lfp)
7796         {
7797                 if (errno != ENOENT)
7798                         ereport(FATAL,
7799                                         (errcode_for_file_access(),
7800                                          errmsg("could not read file \"%s\": %m",
7801                                                         BACKUP_LABEL_FILE)));
7802                 return false;                   /* it's not there, all is fine */
7803         }
7804
7805         /*
7806          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
7807          * is pretty crude, but we are not expecting any variability in the file
7808          * format).
7809          */
7810         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
7811                            &startpoint.xlogid, &startpoint.xrecoff, &tli,
7812                            startxlogfilename, &ch) != 5 || ch != '\n')
7813                 ereport(FATAL,
7814                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7815                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7816         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
7817                            &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
7818                            &ch) != 3 || ch != '\n')
7819                 ereport(FATAL,
7820                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7821                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7822         if (ferror(lfp) || FreeFile(lfp))
7823                 ereport(FATAL,
7824                                 (errcode_for_file_access(),
7825                                  errmsg("could not read file \"%s\": %m",
7826                                                 BACKUP_LABEL_FILE)));
7827
7828         /*
7829          * Try to retrieve the backup history file (no error if we can't)
7830          */
7831         XLByteToSeg(startpoint, _logId, _logSeg);
7832         BackupHistoryFileName(histfilename, tli, _logId, _logSeg,
7833                                                   startpoint.xrecoff % XLogSegSize);
7834
7835         if (InArchiveRecovery)
7836                 RestoreArchivedFile(histfilepath, histfilename, "RECOVERYHISTORY", 0);
7837         else
7838                 BackupHistoryFilePath(histfilepath, tli, _logId, _logSeg,
7839                                                           startpoint.xrecoff % XLogSegSize);
7840
7841         fp = AllocateFile(histfilepath, "r");
7842         if (fp)
7843         {
7844                 /*
7845                  * Parse history file to identify stop point.
7846                  */
7847                 if (fscanf(fp, "START WAL LOCATION: %X/%X (file %24s)%c",
7848                                    &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
7849                                    &ch) != 4 || ch != '\n')
7850                         ereport(FATAL,
7851                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7852                                          errmsg("invalid data in file \"%s\"", histfilename)));
7853                 if (fscanf(fp, "STOP WAL LOCATION: %X/%X (file %24s)%c",
7854                                    &stoppoint.xlogid, &stoppoint.xrecoff, stopxlogfilename,
7855                                    &ch) != 4 || ch != '\n')
7856                         ereport(FATAL,
7857                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7858                                          errmsg("invalid data in file \"%s\"", histfilename)));
7859                 *minRecoveryLoc = stoppoint;
7860                 if (ferror(fp) || FreeFile(fp))
7861                         ereport(FATAL,
7862                                         (errcode_for_file_access(),
7863                                          errmsg("could not read file \"%s\": %m",
7864                                                         histfilepath)));
7865         }
7866
7867         return true;
7868 }
7869
7870 /*
7871  * Error context callback for errors occurring during rm_redo().
7872  */
7873 static void
7874 rm_redo_error_callback(void *arg)
7875 {
7876         XLogRecord *record = (XLogRecord *) arg;
7877         StringInfoData buf;
7878
7879         initStringInfo(&buf);
7880         RmgrTable[record->xl_rmid].rm_desc(&buf,
7881                                                                            record->xl_info,
7882                                                                            XLogRecGetData(record));
7883
7884         /* don't bother emitting empty description */
7885         if (buf.len > 0)
7886                 errcontext("xlog redo %s", buf.data);
7887
7888         pfree(buf.data);
7889 }
7890
7891 /*
7892  * BackupInProgress: check if online backup mode is active
7893  *
7894  * This is done by checking for existence of the "backup_label" file.
7895  */
7896 bool
7897 BackupInProgress(void)
7898 {
7899         struct stat stat_buf;
7900
7901         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
7902 }
7903
7904 /*
7905  * CancelBackup: rename the "backup_label" file to cancel backup mode
7906  *
7907  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
7908  * Note that this will render an online backup in progress useless.
7909  * To correctly finish an online backup, pg_stop_backup must be called.
7910  */
7911 void
7912 CancelBackup(void)
7913 {
7914         struct stat stat_buf;
7915
7916         /* if the file is not there, return */
7917         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
7918                 return;
7919
7920         /* remove leftover file from previously cancelled backup if it exists */
7921         unlink(BACKUP_LABEL_OLD);
7922
7923         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
7924         {
7925                 ereport(LOG,
7926                                 (errmsg("online backup mode cancelled"),
7927                                  errdetail("\"%s\" was renamed to \"%s\".",
7928                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
7929         }
7930         else
7931         {
7932                 ereport(WARNING,
7933                                 (errcode_for_file_access(),
7934                                  errmsg("online backup mode was not cancelled"),
7935                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
7936                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
7937         }
7938 }
7939
7940 /* ------------------------------------------------------
7941  *      Startup Process main entry point and signal handlers
7942  * ------------------------------------------------------
7943  */
7944
/*
 * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
 *
 * Some backend has bought the farm, so we need to stop what we're doing
 * and exit.  The handler blocks further signals, discards all registered
 * exit callbacks, and terminates the process with exit status 2.
 */
static void
startupproc_quickdie(SIGNAL_ARGS)
{
	/* Block signals so nothing interrupts us while we bail out */
	PG_SETMASK(&BlockSig);

	/*
	 * We DO NOT want to run proc_exit() callbacks -- we're here because
	 * shared memory may be corrupted, so we don't want to try to clean up our
	 * transaction.  Just nail the windows shut and get out of town.  Now that
	 * there's an atexit callback to prevent third-party code from breaking
	 * things by calling exit() directly, we have to reset the callbacks
	 * explicitly to make this work as intended.
	 */
	on_exit_reset();

	/*
	 * Note we do exit(2) not exit(0).  This is to force the postmaster into a
	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
	 * backend.  This is necessary precisely because we don't clean up our
	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
	 * should ensure the postmaster sees this as a crash, too, but no harm in
	 * being doubly sure.)
	 */
	exit(2);
}
7976
7977
/*
 * SIGHUP: set flag to re-read config file at next convenient time
 *
 * The handler itself only raises got_SIGHUP; the actual reload is left to
 * whatever code later checks that flag.
 */
static void
StartupProcSigHupHandler(SIGNAL_ARGS)
{
	got_SIGHUP = true;
}
7984
7985 /* SIGTERM: set flag to abort redo and exit */
7986 static void
7987 StartupProcShutdownHandler(SIGNAL_ARGS)
7988 {
7989         if (in_restore_command)
7990                 proc_exit(1);
7991         else
7992                 shutdown_requested = true;
7993 }
7994
/*
 * Main entry point for startup process
 *
 * Sets up signal handling for this process, unblocks signals, runs
 * StartupXLOG() followed by BuildFlatFiles(false), and then exits with
 * status 0 via proc_exit().
 */
void
StartupProcessMain(void)
{
	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 */
	pqsignal(SIGHUP, StartupProcSigHupHandler); /* reload config file */
	pqsignal(SIGINT, SIG_IGN);	/* ignore query cancel */
	pqsignal(SIGTERM, StartupProcShutdownHandler);		/* request shutdown */
	pqsignal(SIGQUIT, startupproc_quickdie);	/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN);
	pqsignal(SIGUSR2, SIG_IGN);

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/* Do the startup work proper */
	StartupXLOG();

	/* Then build the flat files */
	BuildFlatFiles(false);

	/*
	 * Exit normally. Exit code 0 tells postmaster that we completed recovery
	 * successfully.
	 */
	proc_exit(0);
}