src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * $PostgreSQL$
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <signal.h>
  19 #include <time.h>
  20 #include <fcntl.h>
  21 #include <sys/stat.h>
  22 #include <sys/time.h>
  23 #include <sys/wait.h>
  24 #include <unistd.h>
  25
  26 #include "access/clog.h"
  27 #include "access/multixact.h"
  28 #include "access/subtrans.h"
  29 #include "access/transam.h"
  30 #include "access/tuptoaster.h"
  31 #include "access/twophase.h"
  32 #include "access/xact.h"
  33 #include "access/xlog_internal.h"
  34 #include "access/xlogutils.h"
  35 #include "catalog/catversion.h"
  36 #include "catalog/pg_control.h"
  37 #include "catalog/pg_type.h"
  38 #include "funcapi.h"
  39 #include "miscadmin.h"
  40 #include "pgstat.h"
  41 #include "postmaster/bgwriter.h"
  42 #include "storage/bufmgr.h"
  43 #include "storage/fd.h"
  44 #include "storage/ipc.h"
  45 #include "storage/pmsignal.h"
  46 #include "storage/procarray.h"
  47 #include "storage/smgr.h"
  48 #include "storage/spin.h"
  49 #include "utils/builtins.h"
  50 #include "utils/guc.h"
  51 #include "utils/ps_status.h"
  52 #include "pg_trace.h"
  53
  54
  55 /* File path names (all relative to $PGDATA, so these are bare file names) */
  56 #define BACKUP_LABEL_FILE               "backup_label"          /* presumably the file read_backup_label() parses -- confirm */
  57 #define BACKUP_LABEL_OLD                "backup_label.old"      /* NOTE(review): looks like the name for a superseded label -- confirm */
  58 #define RECOVERY_COMMAND_FILE   "recovery.conf"         /* source of the recovery options declared further down */
  59 #define RECOVERY_COMMAND_DONE   "recovery.done"         /* NOTE(review): presumably what recovery.conf becomes afterwards -- confirm */
  60
  61
  62 /* User-settable parameters; the initializers are the compiled-in starting values */
  63 int                     CheckPointSegments = 3; /* also determines XLOGfileslop */
  64 int                     XLOGbuffers = 8;
  65 int                     XLogArchiveTimeout = 0;
  66 bool            XLogArchiveMode = false;
  67 char       *XLogArchiveCommand = NULL;
  68 bool            fullPageWrites = true;  /* XLogInsert ORs this with Insert->forcePageWrites */
  69 bool            log_checkpoints = false;
  70 int             sync_method = DEFAULT_SYNC_METHOD;      /* presumably a SYNC_METHOD_* code, as listed in sync_method_options[] */
  71
  72 #ifdef WAL_DEBUG
  73 bool            XLOG_DEBUG = false;     /* exists only in WAL_DEBUG builds */
  74 #endif
  75
  76 /*
  77  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
  78  * When we are done with an old XLOG segment file, we will recycle it as a
  79  * future XLOG segment as long as there aren't already XLOGfileslop future
  80  * segments; else we'll delete it.  This could be made a separate GUC
  81  * variable, but at present I think it's sufficient to hardwire it as
  82  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
  83  * no more than 2*CheckPointSegments log segments, and we want to recycle all
  84  * of them; the +1 allows boundary cases to happen without wasting a
  85  * delete/create-segment cycle.  Being a macro, it tracks CheckPointSegments.
  86  */
  87 #define XLOGfileslop    (2*CheckPointSegments + 1)
  88
  89 /* GUC support: maps each accepted sync_method name to its SYNC_METHOD_* code.
  90  * "fsync" is always present; the other entries are compiled in only when the
  91  * guarding platform macro is defined.  A {NULL, 0, false} entry ends the list. */
  92 const struct config_enum_entry sync_method_options[] = {
  93         {"fsync", SYNC_METHOD_FSYNC, false},
  94 #ifdef HAVE_FSYNC_WRITETHROUGH
  95         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
  96 #endif
  97 #ifdef HAVE_FDATASYNC
  98         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
  99 #endif
 100 #ifdef OPEN_SYNC_FLAG
 101         {"open_sync", SYNC_METHOD_OPEN, false},
 102 #endif
 103 #ifdef OPEN_DATASYNC_FLAG
 104         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 105 #endif
 106         {NULL, 0, false}        /* end-of-list marker */
 107 };
 108
 109 /*
 110  * Statistics for current checkpoint are collected in this global struct.
 111  * Because only the background writer or a stand-alone backend can perform
 112  * checkpoints, this will be unused in normal backends.
 113  */
 114 CheckpointStatsData CheckpointStats;
 115
 116 /*
 117  * ThisTimeLineID will be the same in all backends --- it identifies the
 118  * current WAL timeline for the database system.  (XLogCtlData has a field of the same name.)
 119  */
 120 TimeLineID      ThisTimeLineID = 0;
 121
 122 /* Are we doing recovery from XLOG?  (Not static: visible outside this file.) */
 123 bool            InRecovery = false;
 124
 125 /* Are we recovering using offline XLOG archives?  (File-local.) */
 126 static bool InArchiveRecovery = false;
 127
 128 /* Was the last xlog file restored from archive (true), or local (false)? */
 129 static bool restoredFromArchive = false;
 130
 131 /* options taken from recovery.conf (RECOVERY_COMMAND_FILE) */
 132 static char *recoveryRestoreCommand = NULL;
 133 static bool recoveryTarget = false;
 134 static bool recoveryTargetExact = false;
 135 static bool recoveryTargetInclusive = true;     /* the only flag in this group that starts out true */
 136 static bool recoveryLogRestartpoints = false;
 137 static TransactionId recoveryTargetXid;         /* no initializer: zero until assigned (static storage) */
 138 static TimestampTz recoveryTargetTime;          /* no initializer: zero until assigned (static storage) */
 139 static TimestampTz recoveryLastXTime = 0;       /* NOTE(review): looks like runtime state rather than an option -- confirm */
 140
 141 /* if recoveryStopsHere returns true, it saves actual stop xid/time here */
 142 static TransactionId recoveryStopXid;
 143 static TimestampTz recoveryStopTime;
 144 static bool recoveryStopAfter;  /* NOTE(review): presumably whether the stop is after the stopping record -- confirm in recoveryStopsHere() */
 145
 146 /*
 147  * During normal operation, the only timeline we care about is ThisTimeLineID.
 148  * During recovery, however, things are more complicated.  To simplify life
 149  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 150  * scan through the WAL history (that is, it is the line that was active when
 151  * the currently-scanned WAL record was generated).  We also need these
 152  * timeline values:
 153  *
 154  * recoveryTargetTLI: the desired timeline that we want to end in.
 155  *
 156  * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 157  * its known parents, newest first (so recoveryTargetTLI is always the
 158  * first list member).  Only these TLIs are expected to be seen in the WAL
 159  * segments we read, and indeed only these TLIs will be considered as
 160  * candidate WAL files to open at all.
 161  *
 162  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 163  * (This is not necessarily the same as ThisTimeLineID, because we could
 164  * be scanning data that was copied from an ancestor timeline when the current
 165  * file was created.)  During a sequential scan we do not allow this value
 166  * to decrease.
 167  */
 168 static TimeLineID recoveryTargetTLI;    /* timeline we want to end in */
 169 static List *expectedTLIs;              /* recoveryTargetTLI plus known parents, newest first */
 170 static TimeLineID curFileTLI;           /* TLI in the name of the current input WAL file */
 171
 172 /*
 173  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 174  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 175  * end+1 of the last record, and is reset when we end a top-level transaction,
 176  * or start a new one; so it can be used to tell if the current transaction has
 177  * created any XLOG records.
 178  */
 179 static XLogRecPtr ProcLastRecPtr = {0, 0};      /* file-local; both fields start at zero */
 180
 181 XLogRecPtr      XactLastRecEnd = {0, 0};        /* not static: visible outside this file */
 182
 183 /*
 184  * RedoRecPtr is this backend's local copy of the REDO record pointer
 185  * (which is almost but not quite the same as a pointer to the most recent
 186  * CHECKPOINT record).  We update this from the shared-memory copy,
 187  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 188  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 189  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 190  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 191  * InitXLOGAccess.
 192  */
 193 static XLogRecPtr RedoRecPtr;   /* XLogInsert refreshes it from Insert->RedoRecPtr while holding WALInsertLock */
 194
 195 /*----------
 196  * Shared-memory data structures for XLOG control
 197  *
 198  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 199  * the log up to (all records before that point must be written or fsynced).
 200  * LogwrtResult indicates the byte positions we have already written/fsynced.
 201  * These structs are identical but are declared separately to indicate their
 202  * slightly different functions.
 203  *
 204  * We do a lot of pushups to minimize the amount of access to lockable
 205  * shared memory values.  There are actually three shared-memory copies of
 206  * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 207  *              XLogCtl->LogwrtResult is protected by info_lck
 208  *              XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 209  *              XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 210  * One must hold the associated lock to read or write any of these, but
 211  * of course no lock is needed to read/write the unshared LogwrtResult.
 212  *
 213  * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 214  * right", since both are updated by a write or flush operation before
 215  * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 216  * is that it can be examined/modified by code that already holds WALWriteLock
 217  * without needing to grab info_lck as well.
 218  *
 219  * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
 220  * but is updated when convenient.  Again, it exists for the convenience of
 221  * code that is already holding WALInsertLock but not the other locks.
 222  *
 223  * The unshared LogwrtResult may lag behind any or all of these, and again
 224  * is updated when convenient.
 225  *
 226  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 227  * (protected by info_lck), but we don't need to cache any copies of it.
 228  *
 229  * Note that this all works because the request and result positions can only
 230  * advance forward, never back up, and so we can easily determine which of two
 231  * values is "more up to date".
 232  *
 233  * info_lck is only held long enough to read/update the protected variables,
 234  * so it's a plain spinlock.  The other locks are held longer (potentially
 235  * over I/O operations), so we use LWLocks for them.  These locks are:
 236  *
 237  * WALInsertLock: must be held to insert a record into the WAL buffers.
 238  *
 239  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 240  * XLogFlush).
 241  *
 242  * ControlFileLock: must be held to read/update control file or create
 243  * new log file.
 244  *
 245  * CheckpointLock: must be held to do a checkpoint (ensures only one
 246  * checkpointer at a time; currently, with all checkpoints done by the
 247  * bgwriter, this is just pro forma).
 248  *
 249  *----------
 250  */
 251
 252 typedef struct XLogwrtRqst          /* how far the log is requested to be written / fsynced */
 253 {
 254         XLogRecPtr      Write;                  /* write request: last byte + 1 to write out */
 255         XLogRecPtr      Flush;                  /* flush request: last byte + 1 to flush */
 256 } XLogwrtRqst;                      /* same layout as XLogwrtResult; kept separate to mark its different role */
 257
 258 typedef struct XLogwrtResult        /* how far the log has already been written / fsynced */
 259 {
 260         XLogRecPtr      Write;                  /* write result: last byte + 1 written out */
 261         XLogRecPtr      Flush;                  /* flush result: last byte + 1 flushed */
 262 } XLogwrtResult;                    /* same layout as XLogwrtRqst; kept separate to mark its different role */
 263
 264 /*
 265  * Shared state data for XLogInsert.  Embedded in XLogCtlData as "Insert", where it is listed as protected by WALInsertLock.
 266  */
 267 typedef struct XLogCtlInsert
 268 {
 269         XLogwrtResult LogwrtResult; /* a recent (possibly lagging) value of LogwrtResult */
 270         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
 271         int                     curridx;                /* current block index in cache (indexes XLogCtl->xlblocks) */
 272         XLogPageHeader currpage;        /* points to header of block in cache; INSERT_FREESPACE treats it as the page start */
 273         char       *currpos;            /* current insertion point in cache, inside the currpage block */
 274         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions; source for each backend's local RedoRecPtr */
 275         bool            forcePageWrites;        /* forcing full-page writes for PITR? (XLogInsert ORs it with fullPageWrites) */
 276 } XLogCtlInsert;
 277
 278 /*
 279  * Shared state data for XLogWrite/XLogFlush.  Embedded in XLogCtlData as "Write", where it is listed as protected by WALWriteLock.
 280  */
 281 typedef struct XLogCtlWrite
 282 {
 283         XLogwrtResult LogwrtResult; /* current value of LogwrtResult ("always right" per the locking notes above) */
 284         int                     curridx;                /* cache index of next block to write */
 285         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
 286 } XLogCtlWrite;
 287
 288 /*
 289  * Total shared-memory state for XLOG.  Fields are grouped by the lock that protects them.
 290  */
 291 typedef struct XLogCtlData
 292 {
 293         /* Protected by WALInsertLock: */
 294         XLogCtlInsert Insert;
 295
 296         /* Protected by info_lck: */
 297         XLogwrtRqst LogwrtRqst;         /* the one shared write/flush request */
 298         XLogwrtResult LogwrtResult;     /* the shared result copy that info_lck guards */
 299         uint32          ckptXidEpoch;   /* epoch half of "nextXID & epoch of latest checkpoint" */
 300         TransactionId ckptXid;          /* nextXID half of the pair above */
 301         XLogRecPtr      asyncCommitLSN; /* LSN of newest async commit */
 302
 303         /* Protected by WALWriteLock: */
 304         XLogCtlWrite Write;
 305
 306         /*
 307          * These values do not change after startup, although the pointed-to pages
 308          * and xlblocks values certainly do.  Permission to read/write the pages
 309          * and xlblocks values depends on WALInsertLock and WALWriteLock.
 310          */
 311         char       *pages;                      /* buffers for unwritten XLOG pages */
 312         XLogRecPtr *xlblocks;           /* per buffer: position of its 1st byte + XLOG_BLCKSZ, i.e. just past its end */
 313         int                     XLogCacheBlck;  /* highest allocated xlog buffer index; PrevBufIdx/NextBufIdx wrap at it */
 314         TimeLineID      ThisTimeLineID; /* NOTE(review): presumably the shared copy of the global ThisTimeLineID -- confirm */
 315
 316         slock_t         info_lck;               /* spinlock for the "Protected by info_lck" fields above */
 317 } XLogCtlData;
 318
 319 static XLogCtlData *XLogCtl = NULL;     /* pointer to the shared-memory XLOG state; starts out NULL */
 320
 321 /*
 322  * We maintain an image of pg_control in shared memory.  (Starts out NULL.)
 323  */
 324 static ControlFileData *ControlFile = NULL;
 325
 326 /*
 327  * Macros for managing XLogInsert state.  In most cases, the calling routine
 328  * has a local pointer to XLogCtl->Insert and/or a local copy of
 329  * XLogCtl->Insert.curridx, so these are passed as parameters instead of being fetched via XLogCtl.
 330  */
 331
 332 /* Free space remaining in the current xlog page buffer: XLOG_BLCKSZ minus the bytes between currpage and currpos.  Evaluates Insert twice. */
 333 #define INSERT_FREESPACE(Insert)  \
 334         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
 335
 336 /* Construct XLogRecPtr value for current insertion point: xlblocks[curridx] minus the free space left; assigns both fields of recptr */
 337 #define INSERT_RECPTR(recptr,Insert,curridx)  \
 338         ( \
 339           (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
 340           (recptr).xrecoff = \
 341                 XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
 342         )
 343 /* Buffer index before idx, wrapping from 0 back to XLogCacheBlck.  Evaluates idx twice. */
 344 #define PrevBufIdx(idx)         \
 345                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
 346 /* Buffer index after idx, wrapping from XLogCacheBlck back to 0.  Evaluates idx twice. */
 347 #define NextBufIdx(idx)         \
 348                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 349
 350 /*
 351  * Private, possibly out-of-date copy of shared LogwrtResult.
 352  * See discussion above.
 353  */
 354 static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};   /* Write and Flush both start at zero */
 355
 356 /*
 357  * openLogFile is -1 or a kernel FD for an open log file segment.
 358  * When it's open, openLogOff is the current seek offset in the file.
 359  * openLogId/openLogSeg identify the segment.  These variables are only
 360  * used to write the XLOG, and so will normally refer to the active segment.
 361  */
 362 static int      openLogFile = -1;       /* -1: no segment open for writing */
 363 static uint32 openLogId = 0;
 364 static uint32 openLogSeg = 0;
 365 static uint32 openLogOff = 0;
 366
 367 /*
 368  * These variables are used similarly to the ones above, but for reading
 369  * the XLOG.  Note, however, that readOff generally represents the offset
 370  * of the page just read, not the seek position of the FD itself, which
 371  * will be just past that page.
 372  */
 373 static int      readFile = -1;  /* -1 initially, like openLogFile */
 374 static uint32 readId = 0;
 375 static uint32 readSeg = 0;
 376 static uint32 readOff = 0;      /* offset of the page just read, not the FD's seek position */
 377
 378 /* Buffer for currently read page (XLOG_BLCKSZ bytes) */
 379 static char *readBuf = NULL;
 380
 381 /* Buffer for current ReadRecord result (expandable) */
 382 static char *readRecordBuf = NULL;
 383 static uint32 readRecordBufSize = 0;    /* presumably the current allocated size of readRecordBuf -- confirm */
 384
 385 /* State information for XLOG reading */
 386 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 387 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 388 static XLogRecord *nextRecord = NULL;   /* NOTE(review): presumably a look-ahead record for ReadRecord -- confirm */
 389 static TimeLineID lastPageTLI = 0;      /* NOTE(review): presumably the TLI seen on the last page read -- confirm */
 390
 391 static bool InRedo = false;     /* NOTE(review): presumably true while WAL records are being replayed -- confirm */
 392
 393 /* Prototypes for the static (file-local) helper routines of this file */
 394 static void XLogArchiveNotify(const char *xlog);
 395 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
 396 static bool XLogArchiveCheckDone(const char *xlog);
 397 static bool XLogArchiveIsBusy(const char *xlog);
 398 static void XLogArchiveCleanup(const char *xlog);
 399 static void readRecoveryCommandFile(void);
 400 static void exitArchiveRecovery(TimeLineID endTLI,
 401                                         uint32 endLogId, uint32 endLogSeg);
 402 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);   /* on true, records the stop xid/time in recoveryStop* */
 403 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 404
 405 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 406                                 XLogRecPtr *lsn, BkpBlock *bkpb);       /* XLogInsert treats a true result as "back up the whole buffer" */
 407 static bool AdvanceXLInsertBuffer(bool new_segment);
 408 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
 409 static int XLogFileInit(uint32 log, uint32 seg,
 410                          bool *use_existent, bool use_lock);
 411 static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
 412                                            bool find_free, int *max_advance,
 413                                            bool use_lock);
 414 static int      XLogFileOpen(uint32 log, uint32 seg);
 415 static int      XLogFileRead(uint32 log, uint32 seg, int emode);
 416 static void XLogFileClose(void);
 417 static bool RestoreArchivedFile(char *path, const char *xlogfname,
 418                                         const char *recovername, off_t expectedSize);
 419 static void PreallocXlogFiles(XLogRecPtr endptr);
 420 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
 421 static void ValidateXLOGDirectoryStructure(void);
 422 static void CleanupBackupHistory(void);
 423 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
 424 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 425 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 426 static List *readTimeLineHistory(TimeLineID targetTLI);
 427 static bool existsTimeLineHistory(TimeLineID probeTLI);
 428 static TimeLineID findNewestTimeLine(TimeLineID startTLI);
 429 static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 430                                          TimeLineID endTLI,
 431                                          uint32 endLogId, uint32 endLogSeg);
 432 static void WriteControlFile(void);
 433 static void ReadControlFile(void);
 434 static char *str_time(pg_time_t tnow);
 435 #ifdef WAL_DEBUG
 436 static void xlog_outrec(StringInfo buf, XLogRecord *record);    /* declared only in WAL_DEBUG builds */
 437 #endif
 438 static void issue_xlog_fsync(void);
 439 static void pg_start_backup_callback(int code, Datum arg);
 440 static bool read_backup_label(XLogRecPtr *checkPointLoc,
 441                                   XLogRecPtr *minRecoveryLoc);
 442 static void rm_redo_error_callback(void *arg);
 443 static int get_sync_bit(int method);
 444
 445
 446 /*
 447  * Insert an XLOG record having the specified RMID and info bytes,
 448  * with the body of the record being the data chunk(s) described by
 449  * the rdata chain (see xlog.h for notes about rdata).
 450  *
 451  * Returns XLOG pointer to end of record (beginning of next record).
 452  * This can be used as LSN for data pages affected by the logged action.
 453  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 454  * before the data page can be written out.  This implements the basic
 455  * WAL rule "write the log before the data".)
 456  *
 457  * NB: this routine feels free to scribble on the XLogRecData structs,
 458  * though not on the data they reference.  This is OK since the XLogRecData
 459  * structs are always just temporaries in the calling code.
 460  */
 461 XLogRecPtr
 462 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 463 {
 464         XLogCtlInsert *Insert = &XLogCtl->Insert;
 465         XLogRecord *record;
 466         XLogContRecord *contrecord;
 467         XLogRecPtr      RecPtr;
 468         XLogRecPtr      WriteRqst;
 469         uint32          freespace;
 470         int                     curridx;
 471         XLogRecData *rdt;
 472         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 473         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 474         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 475         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 476         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 477         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 478         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 479         pg_crc32        rdata_crc;
 480         uint32          len,
 481                                 write_len;
 482         unsigned        i;
 483         bool            updrqst;
 484         bool            doPageWrites;
 485         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 486
 487         /* info's high bits are reserved for use by me */
 488         if (info & XLR_INFO_MASK)
 489                 elog(PANIC, "invalid xlog info mask %02X", info);
 490
 491         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 492
 493         /*
 494          * In bootstrap mode, we don't actually log anything but XLOG resources;
 495          * return a phony record pointer.
 496          */
 497         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 498         {
 499                 RecPtr.xlogid = 0;
 500                 RecPtr.xrecoff = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 501                 return RecPtr;
 502         }
 503
 504         /*
 505          * Here we scan the rdata chain, determine which buffers must be backed
 506          * up, and compute the CRC values for the data.  Note that the record
 507          * header isn't added into the CRC initially since we don't know the final
 508          * length or info bits quite yet.  Thus, the CRC will represent the CRC of
 509          * the whole record in the order "rdata, then backup blocks, then record
 510          * header".
 511          *
 512          * We may have to loop back to here if a race condition is detected below.
 513          * We could prevent the race by doing all this work while holding the
 514          * insert lock, but it seems better to avoid doing CRC calculations while
 515          * holding the lock.  This means we have to be careful about modifying the
 516          * rdata chain until we know we aren't going to loop back again.  The only
 517          * change we allow ourselves to make earlier is to set rdt->data = NULL in
 518          * chain items we have decided we will have to back up the whole buffer
 519          * for.  This is OK because we will certainly decide the same thing again
 520          * for those items if we do it over; doing it here saves an extra pass
 521          * over the chain later.
 522          */
 523 begin:;
 524         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 525         {
 526                 dtbuf[i] = InvalidBuffer;
 527                 dtbuf_bkp[i] = false;
 528         }
 529
 530         /*
 531          * Decide if we need to do full-page writes in this XLOG record: true if
 532          * full_page_writes is on or we have a PITR request for it.  Since we
 533          * don't yet have the insert lock, forcePageWrites could change under us,
 534          * but we'll recheck it once we have the lock.
 535          */
 536         doPageWrites = fullPageWrites || Insert->forcePageWrites;
 537
 538         INIT_CRC32(rdata_crc);
 539         len = 0;
 540         for (rdt = rdata;;)
 541         {
 542                 if (rdt->buffer == InvalidBuffer)
 543                 {
 544                         /* Simple data, just include it */
 545                         len += rdt->len;
 546                         COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 547                 }
 548                 else
 549                 {
 550                         /* Find info for buffer */
 551                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 552                         {
 553                                 if (rdt->buffer == dtbuf[i])
 554                                 {
 555                                         /* Buffer already referenced by earlier chain item */
 556                                         if (dtbuf_bkp[i])
 557                                                 rdt->data = NULL;
 558                                         else if (rdt->data)
 559                                         {
 560                                                 len += rdt->len;
 561                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 562                                         }
 563                                         break;
 564                                 }
 565                                 if (dtbuf[i] == InvalidBuffer)
 566                                 {
 567                                         /* OK, put it in this slot */
 568                                         dtbuf[i] = rdt->buffer;
 569                                         if (XLogCheckBuffer(rdt, doPageWrites,
 570                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 571                                         {
 572                                                 dtbuf_bkp[i] = true;
 573                                                 rdt->data = NULL;
 574                                         }
 575                                         else if (rdt->data)
 576                                         {
 577                                                 len += rdt->len;
 578                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 579                                         }
 580                                         break;
 581                                 }
 582                         }
 583                         if (i >= XLR_MAX_BKP_BLOCKS)
 584                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 585                                          XLR_MAX_BKP_BLOCKS);
 586                 }
 587                 /* Break out of loop when rdt points to last chain item */
 588                 if (rdt->next == NULL)
 589                         break;
 590                 rdt = rdt->next;
 591         }
 592
 593         /*
 594          * Now add the backup block headers and data into the CRC
 595          */
 596         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 597         {
 598                 if (dtbuf_bkp[i])
 599                 {
 600                         BkpBlock   *bkpb = &(dtbuf_xlg[i]);
 601                         char       *page;
 602
 603                         COMP_CRC32(rdata_crc,
 604                                            (char *) bkpb,
 605                                            sizeof(BkpBlock));
 606                         page = (char *) BufferGetBlock(dtbuf[i]);
 607                         if (bkpb->hole_length == 0)
 608                         {
 609                                 COMP_CRC32(rdata_crc,
 610                                                    page,
 611                                                    BLCKSZ);
 612                         }
 613                         else
 614                         {
 615                                 /* must skip the hole */
 616                                 COMP_CRC32(rdata_crc,
 617                                                    page,
 618                                                    bkpb->hole_offset);
 619                                 COMP_CRC32(rdata_crc,
 620                                                    page + (bkpb->hole_offset + bkpb->hole_length),
 621                                                    BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
 622                         }
 623                 }
 624         }
 625
 626         /*
 627          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 628          * error checking in ReadRecord.  This means that all callers of
 629          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 630          * make an exception for XLOG SWITCH records because we don't want them to
 631          * ever cross a segment boundary.
 632          */
 633         if (len == 0 && !isLogSwitch)
 634                 elog(PANIC, "invalid xlog record length %u", len);
 635
 636         START_CRIT_SECTION();
 637
 638         /* Now wait to get insert lock */
 639         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 640
 641         /*
 642          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
 643          * back and recompute everything.  This can only happen just after a
 644          * checkpoint, so it's better to be slow in this case and fast otherwise.
 645          *
 646          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 647          * affect the contents of the XLOG record, so we'll update our local copy
 648          * but not force a recomputation.
 649          */
 650         if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
 651         {
 652                 Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
 653                 RedoRecPtr = Insert->RedoRecPtr;
 654
 655                 if (doPageWrites)
 656                 {
 657                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 658                         {
 659                                 if (dtbuf[i] == InvalidBuffer)
 660                                         continue;
 661                                 if (dtbuf_bkp[i] == false &&
 662                                         XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 663                                 {
 664                                         /*
 665                                          * Oops, this buffer now needs to be backed up, but we
 666                                          * didn't think so above.  Start over.
 667                                          */
 668                                         LWLockRelease(WALInsertLock);
 669                                         END_CRIT_SECTION();
 670                                         goto begin;
 671                                 }
 672                         }
 673                 }
 674         }
 675
 676         /*
 677          * Also check to see if forcePageWrites was just turned on; if we weren't
 678          * already doing full-page writes then go back and recompute. (If it was
 679          * just turned off, we could recompute the record without full pages, but
 680          * we choose not to bother.)
 681          */
 682         if (Insert->forcePageWrites && !doPageWrites)
 683         {
 684                 /* Oops, must redo it with full-page data */
 685                 LWLockRelease(WALInsertLock);
 686                 END_CRIT_SECTION();
 687                 goto begin;
 688         }
 689
 690         /*
 691          * Make additional rdata chain entries for the backup blocks, so that we
 692          * don't need to special-case them in the write loop.  Note that we have
 693          * now irrevocably changed the input rdata chain.  At the exit of this
 694          * loop, write_len includes the backup block data.
 695          *
 696          * Also set the appropriate info bits to show which buffers were backed
 697          * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
 698          * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
 699          */
 700         write_len = len;
 701         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 702         {
 703                 BkpBlock   *bkpb;
 704                 char       *page;
 705
 706                 if (!dtbuf_bkp[i])
 707                         continue;
 708
 709                 info |= XLR_SET_BKP_BLOCK(i);
 710
 711                 bkpb = &(dtbuf_xlg[i]);
 712                 page = (char *) BufferGetBlock(dtbuf[i]);
 713
 714                 rdt->next = &(dtbuf_rdt1[i]);
 715                 rdt = rdt->next;
 716
 717                 rdt->data = (char *) bkpb;
 718                 rdt->len = sizeof(BkpBlock);
 719                 write_len += sizeof(BkpBlock);
 720
 721                 rdt->next = &(dtbuf_rdt2[i]);
 722                 rdt = rdt->next;
 723
 724                 if (bkpb->hole_length == 0)
 725                 {
 726                         rdt->data = page;
 727                         rdt->len = BLCKSZ;
 728                         write_len += BLCKSZ;
 729                         rdt->next = NULL;
 730                 }
 731                 else
 732                 {
 733                         /* must skip the hole */
 734                         rdt->data = page;
 735                         rdt->len = bkpb->hole_offset;
 736                         write_len += bkpb->hole_offset;
 737
 738                         rdt->next = &(dtbuf_rdt3[i]);
 739                         rdt = rdt->next;
 740
 741                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
 742                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
 743                         write_len += rdt->len;
 744                         rdt->next = NULL;
 745                 }
 746         }
 747
 748         /*
 749          * If we backed up any full blocks and online backup is not in progress,
 750          * mark the backup blocks as removable.  This allows the WAL archiver to
 751          * know whether it is safe to compress archived WAL data by transforming
 752          * full-block records into the non-full-block format.
 753          *
 754          * Note: we could just set the flag whenever !forcePageWrites, but
 755          * defining it like this leaves the info bit free for some potential other
 756          * use in records without any backup blocks.
 757          */
 758         if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
 759                 info |= XLR_BKP_REMOVABLE;
 760
 761         /*
 762          * If there isn't enough space on the current XLOG page for a record
 763          * header, advance to the next page (leaving the unused space as zeroes).
 764          */
 765         updrqst = false;
 766         freespace = INSERT_FREESPACE(Insert);
 767         if (freespace < SizeOfXLogRecord)
 768         {
 769                 updrqst = AdvanceXLInsertBuffer(false);
 770                 freespace = INSERT_FREESPACE(Insert);
 771         }
 772
 773         /* Compute record's XLOG location */
 774         curridx = Insert->curridx;
 775         INSERT_RECPTR(RecPtr, Insert, curridx);
 776
 777         /*
 778          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
 779          * segment, we need not insert it (and don't want to because we'd like
 780          * consecutive switch requests to be no-ops).  Instead, make sure
 781          * everything is written and flushed through the end of the prior segment,
 782          * and return the prior segment's end address.
 783          */
 784         if (isLogSwitch &&
 785                 (RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
 786         {
 787                 /* We can release insert lock immediately */
 788                 LWLockRelease(WALInsertLock);
 789
 790                 RecPtr.xrecoff -= SizeOfXLogLongPHD;
 791                 if (RecPtr.xrecoff == 0)
 792                 {
 793                         /* crossing a logid boundary */
 794                         RecPtr.xlogid -= 1;
 795                         RecPtr.xrecoff = XLogFileSize;
 796                 }
 797
 798                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 799                 LogwrtResult = XLogCtl->Write.LogwrtResult;
 800                 if (!XLByteLE(RecPtr, LogwrtResult.Flush))
 801                 {
 802                         XLogwrtRqst FlushRqst;
 803
 804                         FlushRqst.Write = RecPtr;
 805                         FlushRqst.Flush = RecPtr;
 806                         XLogWrite(FlushRqst, false, false);
 807                 }
 808                 LWLockRelease(WALWriteLock);
 809
 810                 END_CRIT_SECTION();
 811
 812                 return RecPtr;
 813         }
 814
 815         /* Insert record header */
 816
 817         record = (XLogRecord *) Insert->currpos;
 818         record->xl_prev = Insert->PrevRecord;
 819         record->xl_xid = GetCurrentTransactionIdIfAny();
 820         record->xl_tot_len = SizeOfXLogRecord + write_len;
 821         record->xl_len = len;           /* doesn't include backup blocks */
 822         record->xl_info = info;
 823         record->xl_rmid = rmid;
 824
 825         /* Now we can finish computing the record's CRC */
 826         COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
 827                            SizeOfXLogRecord - sizeof(pg_crc32));
 828         FIN_CRC32(rdata_crc);
 829         record->xl_crc = rdata_crc;
 830
 831 #ifdef WAL_DEBUG
 832         if (XLOG_DEBUG)
 833         {
 834                 StringInfoData buf;
 835
 836                 initStringInfo(&buf);
 837                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
 838                                                  RecPtr.xlogid, RecPtr.xrecoff);
 839                 xlog_outrec(&buf, record);
 840                 if (rdata->data != NULL)
 841                 {
 842                         appendStringInfo(&buf, " - ");
 843                         RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
 844                 }
 845                 elog(LOG, "%s", buf.data);
 846                 pfree(buf.data);
 847         }
 848 #endif
 849
 850         /* Record begin of record in appropriate places */
 851         ProcLastRecPtr = RecPtr;
 852         Insert->PrevRecord = RecPtr;
 853
 854         Insert->currpos += SizeOfXLogRecord;
 855         freespace -= SizeOfXLogRecord;
 856
 857         /*
 858          * Append the data, including backup blocks if any
 859          */
 860         while (write_len)
 861         {
 862                 while (rdata->data == NULL)
 863                         rdata = rdata->next;
 864
 865                 if (freespace > 0)
 866                 {
 867                         if (rdata->len > freespace)
 868                         {
 869                                 memcpy(Insert->currpos, rdata->data, freespace);
 870                                 rdata->data += freespace;
 871                                 rdata->len -= freespace;
 872                                 write_len -= freespace;
 873                         }
 874                         else
 875                         {
 876                                 memcpy(Insert->currpos, rdata->data, rdata->len);
 877                                 freespace -= rdata->len;
 878                                 write_len -= rdata->len;
 879                                 Insert->currpos += rdata->len;
 880                                 rdata = rdata->next;
 881                                 continue;
 882                         }
 883                 }
 884
 885                 /* Use next buffer */
 886                 updrqst = AdvanceXLInsertBuffer(false);
 887                 curridx = Insert->curridx;
 888                 /* Insert cont-record header */
 889                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
 890                 contrecord = (XLogContRecord *) Insert->currpos;
 891                 contrecord->xl_rem_len = write_len;
 892                 Insert->currpos += SizeOfXLogContRecord;
 893                 freespace = INSERT_FREESPACE(Insert);
 894         }
 895
 896         /* Ensure next record will be properly aligned */
 897         Insert->currpos = (char *) Insert->currpage +
 898                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
 899         freespace = INSERT_FREESPACE(Insert);
 900
 901         /*
 902          * The recptr I return is the beginning of the *next* record. This will be
 903          * stored as LSN for changed data pages...
 904          */
 905         INSERT_RECPTR(RecPtr, Insert, curridx);
 906
 907         /*
 908          * If the record is an XLOG_SWITCH, we must now write and flush all the
 909          * existing data, and then forcibly advance to the start of the next
 910          * segment.  It's not good to do this I/O while holding the insert lock,
 911          * but there seems too much risk of confusion if we try to release the
 912          * lock sooner.  Fortunately xlog switch needn't be a high-performance
 913          * operation anyway...
 914          */
 915         if (isLogSwitch)
 916         {
 917                 XLogCtlWrite *Write = &XLogCtl->Write;
 918                 XLogwrtRqst FlushRqst;
 919                 XLogRecPtr      OldSegEnd;
 920
 921                 TRACE_POSTGRESQL_XLOG_SWITCH();
 922
 923                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 924
 925                 /*
 926                  * Flush through the end of the page containing XLOG_SWITCH, and
 927                  * perform end-of-segment actions (eg, notifying archiver).
 928                  */
 929                 WriteRqst = XLogCtl->xlblocks[curridx];
 930                 FlushRqst.Write = WriteRqst;
 931                 FlushRqst.Flush = WriteRqst;
 932                 XLogWrite(FlushRqst, false, true);
 933
 934                 /* Set up the next buffer as first page of next segment */
 935                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
 936                 (void) AdvanceXLInsertBuffer(true);
 937
 938                 /* There should be no unwritten data */
 939                 curridx = Insert->curridx;
 940                 Assert(curridx == Write->curridx);
 941
 942                 /* Compute end address of old segment */
 943                 OldSegEnd = XLogCtl->xlblocks[curridx];
 944                 OldSegEnd.xrecoff -= XLOG_BLCKSZ;
 945                 if (OldSegEnd.xrecoff == 0)
 946                 {
 947                         /* crossing a logid boundary */
 948                         OldSegEnd.xlogid -= 1;
 949                         OldSegEnd.xrecoff = XLogFileSize;
 950                 }
 951
 952                 /* Make it look like we've written and synced all of old segment */
 953                 LogwrtResult.Write = OldSegEnd;
 954                 LogwrtResult.Flush = OldSegEnd;
 955
 956                 /*
 957                  * Update shared-memory status --- this code should match XLogWrite
 958                  */
 959                 {
 960                         /* use volatile pointer to prevent code rearrangement */
 961                         volatile XLogCtlData *xlogctl = XLogCtl;
 962
 963                         SpinLockAcquire(&xlogctl->info_lck);
 964                         xlogctl->LogwrtResult = LogwrtResult;
 965                         if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
 966                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
 967                         if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
 968                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
 969                         SpinLockRelease(&xlogctl->info_lck);
 970                 }
 971
 972                 Write->LogwrtResult = LogwrtResult;
 973
 974                 LWLockRelease(WALWriteLock);
 975
 976                 updrqst = false;                /* done already */
 977         }
 978         else
 979         {
 980                 /* normal case, ie not xlog switch */
 981
 982                 /* Need to update shared LogwrtRqst if some block was filled up */
 983                 if (freespace < SizeOfXLogRecord)
 984                 {
 985                         /* curridx is filled and available for writing out */
 986                         updrqst = true;
 987                 }
 988                 else
 989                 {
 990                         /* if updrqst already set, write through end of previous buf */
 991                         curridx = PrevBufIdx(curridx);
 992                 }
 993                 WriteRqst = XLogCtl->xlblocks[curridx];
 994         }
 995
 996         LWLockRelease(WALInsertLock);
 997
 998         if (updrqst)
 999         {
1000                 /* use volatile pointer to prevent code rearrangement */
1001                 volatile XLogCtlData *xlogctl = XLogCtl;
1002
1003                 SpinLockAcquire(&xlogctl->info_lck);
1004                 /* advance global request to include new block(s) */
1005                 if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
1006                         xlogctl->LogwrtRqst.Write = WriteRqst;
1007                 /* update local result copy while I have the chance */
1008                 LogwrtResult = xlogctl->LogwrtResult;
1009                 SpinLockRelease(&xlogctl->info_lck);
1010         }
1011
1012         XactLastRecEnd = RecPtr;
1013
1014         END_CRIT_SECTION();
1015
1016         return RecPtr;
1017 }
1018
 1019 /*
 1020  * Determine whether the buffer referenced by an XLogRecData item has to
 1021  * be backed up, and if so fill a BkpBlock struct for it.  In any case
 1022  * save the buffer's LSN at *lsn.
       *
       * rdata: chain item whose buffer and buffer_std fields are examined.
       * doPageWrites: when false this routine never asks for a backup.
       * lsn: output, always set to the LSN read from the page.
       * bkpb: output, written (buffer tag, hole_offset, hole_length) only on
       * the path that returns true.
       *
       * Returns true if doPageWrites is set and XLByteLE(page LSN, RedoRecPtr)
       * holds, meaning the page must be backed up; returns false otherwise.
 1023  */
 1024 static bool
 1025 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 1026                                 XLogRecPtr *lsn, BkpBlock *bkpb)
 1027 {
 1028         Page            page;
 1029
 1030         page = BufferGetPage(rdata->buffer);
 1031
 1032         /*
 1033          * XXX We assume page LSN is first data on *every* page that can be passed
 1034          * to XLogInsert, whether it otherwise has the standard page layout or
 1035          * not.
 1036          */
 1037         *lsn = PageGetLSN(page);
 1038         /* Backup is wanted only with page writes enabled and a page LSN that is XLByteLE RedoRecPtr */
 1039         if (doPageWrites &&
 1040                 XLByteLE(PageGetLSN(page), RedoRecPtr))
 1041         {
 1042                 /*
 1043                  * The page needs to be backed up, so set up *bkpb
 1044                  */
 1045                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
 1046                 /* Next decide how much of the page image can be left out of the backup */
 1047                 if (rdata->buffer_std)
 1048                 {
 1049                         /* Assume we can omit data between pd_lower and pd_upper */
 1050                         uint16          lower = ((PageHeader) page)->pd_lower;
 1051                         uint16          upper = ((PageHeader) page)->pd_upper;
 1052                         /* Use the hole only if pd_lower/pd_upper describe a non-empty range within the block */
 1053                         if (lower >= SizeOfPageHeaderData &&
 1054                                 upper > lower &&
 1055                                 upper <= BLCKSZ)
 1056                         {
 1057                                 bkpb->hole_offset = lower;
 1058                                 bkpb->hole_length = upper - lower;
 1059                         }
 1060                         else
 1061                         {
 1062                                 /* No "hole" to compress out */
 1063                                 bkpb->hole_offset = 0;
 1064                                 bkpb->hole_length = 0;
 1065                         }
 1066                 }
 1067                 else
 1068                 {
 1069                         /* Not a standard page header, don't try to eliminate "hole" */
 1070                         bkpb->hole_offset = 0;
 1071                         bkpb->hole_length = 0;
 1072                 }
 1073
 1074                 return true;                    /* buffer requires backup */
 1075         }
 1076
 1077         return false;                           /* buffer does not need to be backed up */
 1078 }
1079
 1080 /*
 1081  * XLogArchiveNotify
 1082  *
 1083  * Create an archive notification file
 1084  *
 1085  * The name of the notification file is the message that will be picked up
 1086  * by the archiver, e.g. we write 0000000100000001000000C6.ready
 1087  * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
 1088  * then when complete, rename it to 0000000100000001000000C6.done
       *
       * xlog: name of the file to be archived; the status file path is derived
       * from it with StatusFilePath.
       *
       * A failure to create or to close the status file is reported at LOG
       * level and the routine returns without signalling the postmaster.
 1089  */
 1090 static void
 1091 XLogArchiveNotify(const char *xlog)
 1092 {
 1093         char            archiveStatusPath[MAXPGPATH];
 1094         FILE       *fd;
 1095
 1096         /* insert an otherwise empty file called <XLOG>.ready */
 1097         StatusFilePath(archiveStatusPath, xlog, ".ready");
 1098         fd = AllocateFile(archiveStatusPath, "w");
 1099         if (fd == NULL)
 1100         {
 1101                 ereport(LOG,
 1102                                 (errcode_for_file_access(),
 1103                                  errmsg("could not create archive status file \"%s\": %m",
 1104                                                 archiveStatusPath)));
 1105                 return;
 1106         }
 1107         if (FreeFile(fd))               /* nonzero result is reported as a write failure */
 1108         {
 1109                 ereport(LOG,
 1110                                 (errcode_for_file_access(),
 1111                                  errmsg("could not write archive status file \"%s\": %m",
 1112                                                 archiveStatusPath)));
 1113                 return;
 1114         }
 1115
 1116         /* Notify archiver that it's got something to do */
 1117         if (IsUnderPostmaster)
 1118                 SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
 1119 }
1120
 1121 /*
 1122  * Convenience routine to notify using log/seg representation of filename
       *
       * The file name is built by XLogFileName from ThisTimeLineID, log and seg,
       * and is then handed to XLogArchiveNotify.
 1123  */
 1124 static void
 1125 XLogArchiveNotifySeg(uint32 log, uint32 seg)
 1126 {
 1127         char            xlog[MAXFNAMELEN];
 1128
 1129         XLogFileName(xlog, ThisTimeLineID, log, seg);
 1130         XLogArchiveNotify(xlog);
 1131 }
1132
 1133 /*
 1134  * XLogArchiveCheckDone
 1135  *
 1136  * This is called when we are ready to delete or recycle an old XLOG segment
 1137  * file or backup history file.  If it is okay to delete it then return true.
 1138  * If it is not time to delete it, make sure a .ready file exists, and return
 1139  * false.
 1140  *
 1141  * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
 1142  * then return false; else create <XLOG>.ready and return false.
 1143  *
 1144  * The reason we do things this way is so that if the original attempt to
 1145  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
       *
       * xlog: name of the file whose .done and .ready status files are probed.
       *
       * Existence is tested with stat(); any stat() failure, whatever the errno,
       * is handled the same way as a missing status file.
 1146  */
 1147 static bool
 1148 XLogArchiveCheckDone(const char *xlog)
 1149 {
 1150         char            archiveStatusPath[MAXPGPATH];
 1151         struct stat stat_buf;
 1152
 1153         /* Always deletable if archiving is off */
 1154         if (!XLogArchivingActive())
 1155                 return true;
 1156
 1157         /* First check for .done --- this means archiver is done with it */
 1158         StatusFilePath(archiveStatusPath, xlog, ".done");
 1159         if (stat(archiveStatusPath, &stat_buf) == 0)
 1160                 return true;
 1161
 1162         /* check for .ready --- this means archiver is still busy with it */
 1163         StatusFilePath(archiveStatusPath, xlog, ".ready");
 1164         if (stat(archiveStatusPath, &stat_buf) == 0)
 1165                 return false;
 1166
 1167         /* Race condition --- maybe archiver just finished, so recheck */
 1168         StatusFilePath(archiveStatusPath, xlog, ".done");
 1169         if (stat(archiveStatusPath, &stat_buf) == 0)
 1170                 return true;
 1171
 1172         /* Retry creation of the .ready file */
 1173         XLogArchiveNotify(xlog);
 1174         return false;
 1175 }
1176
 1177 /*
 1178  * XLogArchiveIsBusy
 1179  *
 1180  * Check to see if an XLOG segment file is still unarchived.
 1181  * This is almost but not quite the inverse of XLogArchiveCheckDone: in
 1182  * the first place we aren't chartered to recreate the .ready file, and
 1183  * in the second place we should consider that if the file is already gone
 1184  * then it's not busy.  (This check is needed to handle the race condition
 1185  * that a checkpoint already deleted the no-longer-needed file.)
       *
       * xlog: name of the segment file to examine.
       *
       * Returns false when a .done file is found, or when neither status file is
       * found and stat() on the segment itself fails with ENOENT.  Returns true
       * in every other case, including a stat() failure on the segment with an
       * errno other than ENOENT.
 1186  */
 1187 static bool
 1188 XLogArchiveIsBusy(const char *xlog)
 1189 {
 1190         char            archiveStatusPath[MAXPGPATH];
 1191         struct stat stat_buf;
 1192
 1193         /* First check for .done --- this means archiver is done with it */
 1194         StatusFilePath(archiveStatusPath, xlog, ".done");
 1195         if (stat(archiveStatusPath, &stat_buf) == 0)
 1196                 return false;
 1197
 1198         /* check for .ready --- this means archiver is still busy with it */
 1199         StatusFilePath(archiveStatusPath, xlog, ".ready");
 1200         if (stat(archiveStatusPath, &stat_buf) == 0)
 1201                 return true;
 1202
 1203         /* Race condition --- maybe archiver just finished, so recheck */
 1204         StatusFilePath(archiveStatusPath, xlog, ".done");
 1205         if (stat(archiveStatusPath, &stat_buf) == 0)
 1206                 return false;
 1207
 1208         /*
 1209          * Check to see if the WAL file has been removed by checkpoint,
 1210          * which implies it has already been archived, and explains why we
 1211          * can't see a status file for it.
                *
                * Note that archiveStatusPath is reused here to hold the path of the
                * segment file itself (under XLOGDIR), not of a status file.
 1212          */
 1213         snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
 1214         if (stat(archiveStatusPath, &stat_buf) != 0 &&
 1215                 errno == ENOENT)
 1216                 return false;
 1217
 1218         return true;
 1219 }
1220
 1221 /*
 1222  * XLogArchiveCleanup
 1223  *
 1224  * Cleanup archive notification file(s) for a particular xlog segment
       *
       * xlog: name of the segment whose .done and .ready status files are
       * unlinked.  The results of both unlink() calls are ignored, so a status
       * file that is absent or cannot be removed is passed over silently.
 1225  */
 1226 static void
 1227 XLogArchiveCleanup(const char *xlog)
 1228 {
 1229         char            archiveStatusPath[MAXPGPATH];
 1230
 1231         /* Remove the .done file */
 1232         StatusFilePath(archiveStatusPath, xlog, ".done");
 1233         unlink(archiveStatusPath);
 1234         /* should we complain about failure? */
 1235
 1236         /* Remove the .ready file if present --- normally it shouldn't be */
 1237         StatusFilePath(archiveStatusPath, xlog, ".ready");
 1238         unlink(archiveStatusPath);
 1239         /* should we complain about failure? */
 1240 }
1241
 1242 /*
 1243  * Advance the Insert state to the next buffer page, writing out the next
 1244  * buffer if it still contains unwritten data.
 1245  *
 1246  * If new_segment is TRUE then we set up the next buffer page as the first
 1247  * page of the next xlog segment file, possibly but not usually the next
 1248  * consecutive file page.
 1249  *
 1250  * The global LogwrtRqst.Write pointer needs to be advanced to include the
 1251  * just-filled page.  If we can do this for free (without an extra lock),
 1252  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 1253  * request update still needs to be done, FALSE if we did it internally.
 1254  *
 1255  * Must be called with WALInsertLock held.
       *
       * On return Insert->curridx, Insert->currpage and Insert->currpos describe
       * the newly zeroed page, with currpos placed just past its short or long
       * page header, and XLogCtl->xlblocks[] for that slot holds the end
       * address of the new page.
 1256  */
 1257 static bool
 1258 AdvanceXLInsertBuffer(bool new_segment)
 1259 {
 1260         XLogCtlInsert *Insert = &XLogCtl->Insert;
 1261         XLogCtlWrite *Write = &XLogCtl->Write;
 1262         int                     nextidx = NextBufIdx(Insert->curridx);
 1263         bool            update_needed = true;
 1264         XLogRecPtr      OldPageRqstPtr;
 1265         XLogwrtRqst WriteRqst;
 1266         XLogRecPtr      NewPageEndPtr;
 1267         XLogPageHeader NewPage;
 1268
 1269         /* Use Insert->LogwrtResult copy if it's more fresh */
 1270         if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
 1271                 LogwrtResult = Insert->LogwrtResult;
 1272
 1273         /*
 1274          * Get ending-offset of the buffer page we need to replace (this may be
 1275          * zero if the buffer hasn't been used yet).  Fall through if it's already
 1276          * written out.
 1277          */
 1278         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
 1279         if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
 1280         {
 1281                 /* nope, got work to do... */
 1282                 XLogRecPtr      FinishedPageRqstPtr;
 1283
 1284                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
 1285
 1286                 /* Before waiting, get info_lck and update LogwrtResult */
 1287                 {
 1288                         /* use volatile pointer to prevent code rearrangement */
 1289                         volatile XLogCtlData *xlogctl = XLogCtl;
 1290
 1291                         SpinLockAcquire(&xlogctl->info_lck);
 1292                         if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
 1293                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
 1294                         LogwrtResult = xlogctl->LogwrtResult;
 1295                         SpinLockRelease(&xlogctl->info_lck);
 1296                 }
 1297
 1298                 update_needed = false;  /* Did the shared-request update */
 1299
 1300                 if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
 1301                 {
 1302                         /* OK, someone wrote it already */
 1303                         Insert->LogwrtResult = LogwrtResult;
 1304                 }
 1305                 else
 1306                 {
 1307                         /* Must acquire write lock */
 1308                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 1309                         LogwrtResult = Write->LogwrtResult;
 1310                         if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
 1311                         {
 1312                                 /* OK, someone wrote it already */
 1313                                 LWLockRelease(WALWriteLock);
 1314                                 Insert->LogwrtResult = LogwrtResult;
 1315                         }
 1316                         else
 1317                         {
 1318                                 /*
 1319                                  * Have to write buffers while holding insert lock. This is
 1320                                  * not good, so only write as much as we absolutely must.
 1321                                  */
 1322                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
 1323                                 WriteRqst.Write = OldPageRqstPtr;
 1324                                 WriteRqst.Flush.xlogid = 0;
 1325                                 WriteRqst.Flush.xrecoff = 0;      /* NOTE(review): a zero Flush pointer presumably asks for no flush -- confirm in XLogWrite */
 1326                                 XLogWrite(WriteRqst, false, false);
 1327                                 LWLockRelease(WALWriteLock);
 1328                                 Insert->LogwrtResult = LogwrtResult;
 1329                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
 1330                         }
 1331                 }
 1332         }
 1333
 1334         /*
 1335          * Now the next buffer slot is free and we can set it up to be the next
 1336          * output page.
                *
                * The computation starts from the end address of the current page,
                * since the new page begins where the current one ends unless
                * new_segment forces a jump.
 1337          */
 1338         NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
 1339
 1340         if (new_segment)
 1341         {
 1342                 /* force it to a segment start point: round xrecoff up to a multiple of XLogSegSize */
 1343                 NewPageEndPtr.xrecoff += XLogSegSize - 1;
 1344                 NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
 1345         }
 1346         /* xlblocks[] entries are page END addresses, hence XLOG_BLCKSZ is added in both branches below */
 1347         if (NewPageEndPtr.xrecoff >= XLogFileSize)
 1348         {
 1349                 /* crossing a logid boundary */
 1350                 NewPageEndPtr.xlogid += 1;
 1351                 NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
 1352         }
 1353         else
 1354                 NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
 1355         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
 1356         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
 1357
 1358         Insert->curridx = nextidx;
 1359         Insert->currpage = NewPage;
 1360         /* Assume a short page header for now; corrected below for the first page of a segment */
 1361         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
 1362
 1363         /*
 1364          * Be sure to re-zero the buffer so that bytes beyond what we've written
 1365          * will look like zeroes and not valid XLOG records...
 1366          */
 1367         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
 1368
 1369         /*
 1370          * Fill the new page's header
 1371          */
 1372         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
 1373
 1374         /* NewPage->xlp_info = 0; */    /* done by memset */
 1375         NewPage   ->xlp_tli = ThisTimeLineID;
 1376         NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
 1377         NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
 1378
 1379         /*
 1380          * If first page of an XLOG segment file, make it a long header.
 1381          */
 1382         if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
 1383         {
 1384                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
 1385
 1386                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
 1387                 NewLongPage->xlp_seg_size = XLogSegSize;
 1388                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
 1389                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
 1390
 1391                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
 1392         }
 1393
 1394         return update_needed;
 1395 }
1396
 1397 /*
 1398  * Check whether we've consumed enough xlog space that a checkpoint is needed.
 1399  *
 1400  * Caller must have just finished filling the open log file (so that
 1401  * openLogId/openLogSeg are valid).  We measure the distance from RedoRecPtr
 1402  * to the open log file and see if that exceeds CheckPointSegments.
 1403  *
 1404  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
       *
       * Returns true if the high-order parts of the two positions differ, or if
       * the open segment number has reached the RedoRecPtr segment number plus
       * (CheckPointSegments - 1); returns false otherwise.
 1405  */
 1406 static bool
 1407 XLogCheckpointNeeded(void)
 1408 {
 1409         /*
 1410          * A straight computation of segment number could overflow 32 bits. Rather
 1411          * than assuming we have working 64-bit arithmetic, we compare the
 1412          * highest-order bits separately, and force a checkpoint immediately when
 1413          * they change.
                *
                * NOTE(review): xlogid is split using XLogSegSize as the divisor;
                * presumably this was chosen so that the product with XLogSegsPerFile
                * stays within 32 bits -- confirm before changing.
 1414          */
 1415         uint32          old_segno,
 1416                                 new_segno;
 1417         uint32          old_highbits,
 1418                                 new_highbits;
 1419         /* old_* are derived from RedoRecPtr, new_* from the open log file (openLogId/openLogSeg) */
 1420         old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
 1421                 (RedoRecPtr.xrecoff / XLogSegSize);
 1422         old_highbits = RedoRecPtr.xlogid / XLogSegSize;
 1423         new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + openLogSeg;
 1424         new_highbits = openLogId / XLogSegSize;
 1425         if (new_highbits != old_highbits ||
 1426                 new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
 1427                 return true;
 1428         return false;
 1429 }
1430
1431 /*
1432  * Write and/or fsync the log at least as far as WriteRqst indicates.
1433  *
1434  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1435  * may stop at any convenient boundary (such as a cache or logfile boundary).
1436  * This option allows us to avoid uselessly issuing multiple writes when a
1437  * single one would do.
1438  *
1439  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1440  * perform end-of-segment actions after writing the last page, even if
1441  * it's not physically the end of its segment.  (NB: this will work properly
1442  * only if caller specifies WriteRqst == page-end and flexible == false,
1443  * and there is some data to write.)
1444  *
1445  * Must be called with WALWriteLock held.
      *
      * On return, the local LogwrtResult, Write->LogwrtResult and the shared
      * XLogCtl->LogwrtResult all reflect how far the log has been written and
      * flushed, and Write->curridx is the cache page at which the next write
      * will start.  Seek and write failures are reported at PANIC level.
1446  */
1447 static void
1448 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1449 {
1450         XLogCtlWrite *Write = &XLogCtl->Write;
1451         bool            ispartialpage;  /* request ends before end of current page */
1452         bool            last_iteration; /* current page reaches the request point */
1453         bool            finishing_seg;  /* pending pages reach end of segment */
1454         bool            use_existent;   /* in/out flag passed to XLogFileInit */
1455         int                     curridx;        /* cache index of page being considered */
1456         int                     npages;         /* number of pages pending write */
1457         int                     startidx;       /* cache index of first pending page */
1458         uint32          startoffset;    /* file offset of first pending page */
1459
1460         /* We should always be inside a critical section here */
1461         Assert(CritSectionCount > 0);
1462
1463         /*
1464          * Update local LogwrtResult (caller probably did this already, but...)
1465          */
1466         LogwrtResult = Write->LogwrtResult;
1467
1468         /*
1469          * Since successive pages in the xlog cache are consecutively allocated,
1470          * we can usually gather multiple pages together and issue just one
1471          * write() call.  npages is the number of pages we have determined can be
1472          * written together; startidx is the cache block index of the first one,
1473          * and startoffset is the file offset at which it should go. The latter
1474          * two variables are only valid when npages > 0, but we must initialize
1475          * all of them to keep the compiler quiet.
1476          */
1477         npages = 0;
1478         startidx = 0;
1479         startoffset = 0;
1480
1481         /*
1482          * Within the loop, curridx is the cache block index of the page to
1483          * consider writing.  We advance Write->curridx only after successfully
1484          * writing pages.  (Right now, this refinement is useless since we are
1485          * going to PANIC if any error occurs anyway; but someday it may come in
1486          * useful.)
1487          */
1488         curridx = Write->curridx;
1489
1490         while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1491         {
1492                 /*
1493                  * Make sure we're not ahead of the insert process.  This could happen
1494                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1495                  * last page that's been initialized by AdvanceXLInsertBuffer.
1496                  */
1497                 if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1498                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1499                                  LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1500                                  XLogCtl->xlblocks[curridx].xlogid,
1501                                  XLogCtl->xlblocks[curridx].xrecoff);
1502
1503                 /* Advance LogwrtResult.Write to end of current buffer page */
1504                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1505                 ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
1506
1507                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1508                 {
1509                         /*
1510                          * Switch to new logfile segment.  We cannot have any pending
1511                          * pages here (since we dump what we have at segment end).
1512                          */
1513                         Assert(npages == 0);
1514                         if (openLogFile >= 0)
1515                                 XLogFileClose();
1516                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1517
1518                         /* create/use new log file */
1519                         use_existent = true;
1520                         openLogFile = XLogFileInit(openLogId, openLogSeg,
1521                                                                            &use_existent, true);
1522                         openLogOff = 0;
1523                 }
1524
1525                 /* Make sure we have the current logfile open */
1526                 if (openLogFile < 0)
1527                 {
1528                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1529                         openLogFile = XLogFileOpen(openLogId, openLogSeg);
1530                         openLogOff = 0;
1531                 }
1532
1533                 /* Add current page to the set of pending pages-to-dump */
1534                 if (npages == 0)
1535                 {
1536                         /* first of group */
1537                         startidx = curridx;
                             /*
                              * LogwrtResult.Write is now the end of this page; step back one
                              * page to get its start, then reduce modulo the segment size to
                              * obtain the offset within the file.
                              */
1538                         startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1539                 }
1540                 npages++;
1541
1542                 /*
1543                  * Dump the set if this will be the last loop iteration, or if we are
1544                  * at the last page of the cache area (since the next page won't be
1545                  * contiguous in memory), or if we are at the end of the logfile
1546                  * segment.
1547                  */
1548                 last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);
1549
                     /*
                      * The pending group finishes the segment if it consists of whole
                      * pages and its end offset reaches the segment size.
                      */
1550                 finishing_seg = !ispartialpage &&
1551                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1552
1553                 if (last_iteration ||
1554                         curridx == XLogCtl->XLogCacheBlck ||
1555                         finishing_seg)
1556                 {
1557                         char       *from;
1558                         Size            nbytes;
1559
1560                         /* Need to seek in the file? */
1561                         if (openLogOff != startoffset)
1562                         {
1563                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1564                                         ereport(PANIC,
1565                                                         (errcode_for_file_access(),
1566                                                          errmsg("could not seek in log file %u, "
1567                                                                         "segment %u to offset %u: %m",
1568                                                                         openLogId, openLogSeg, startoffset)));
1569                                 openLogOff = startoffset;
1570                         }
1571
1572                         /* OK to write the page(s) */
1573                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1574                         nbytes = npages * (Size) XLOG_BLCKSZ;
1575                         errno = 0;
1576                         if (write(openLogFile, from, nbytes) != nbytes)
1577                         {
1578                                 /* if write didn't set errno, assume no disk space */
1579                                 if (errno == 0)
1580                                         errno = ENOSPC;
1581                                 ereport(PANIC,
1582                                                 (errcode_for_file_access(),
1583                                                  errmsg("could not write to log file %u, segment %u "
1584                                                                 "at offset %u, length %lu: %m",
1585                                                                 openLogId, openLogSeg,
1586                                                                 openLogOff, (unsigned long) nbytes)));
1587                         }
1588
1589                         /* Update state for write */
1590                         openLogOff += nbytes;
                             /*
                              * After a partial-page write the same cache page remains
                              * current; otherwise the next write starts at the following
                              * cache page.
                              */
1591                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1592                         npages = 0;
1593
1594                         /*
1595                          * If we just wrote the whole last page of a logfile segment,
1596                          * fsync the segment immediately.  This avoids having to go back
1597                          * and re-open prior segments when an fsync request comes along
1598                          * later. Doing it here ensures that one and only one backend will
1599                          * perform this fsync.
1600                          *
1601                          * We also do this if this is the last page written for an xlog
1602                          * switch.
1603                          *
1604                          * This is also the right place to notify the Archiver that the
1605                          * segment is ready to copy to archival storage, and to update the
1606                          * timer for archive_timeout, and to signal for a checkpoint if
1607                          * too many logfile segments have been used since the last
1608                          * checkpoint.
1609                          */
1610                         if (finishing_seg || (xlog_switch && last_iteration))
1611                         {
1612                                 issue_xlog_fsync();
1613                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1614
1615                                 if (XLogArchivingActive())
1616                                         XLogArchiveNotifySeg(openLogId, openLogSeg);
1617
1618                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1619
1620                                 /*
1621                                  * Signal bgwriter to start a checkpoint if we've consumed too
1622                                  * much xlog since the last one.  For speed, we first check
1623                                  * using the local copy of RedoRecPtr, which might be out of
1624                                  * date; if it looks like a checkpoint is needed, forcibly
1625                                  * update RedoRecPtr and recheck.
1626                                  */
1627                                 if (IsUnderPostmaster &&
1628                                         XLogCheckpointNeeded())
1629                                 {
1630                                         (void) GetRedoRecPtr();
1631                                         if (XLogCheckpointNeeded())
1632                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1633                                 }
1634                         }
1635                 }
1636
1637                 if (ispartialpage)
1638                 {
1639                         /* Only asked to write a partial page */
1640                         LogwrtResult.Write = WriteRqst.Write;
1641                         break;
1642                 }
1643                 curridx = NextBufIdx(curridx);
1644
1645                 /* If flexible, break out of loop as soon as we wrote something */
1646                 if (flexible && npages == 0)
1647                         break;
1648         }
1649
1650         Assert(npages == 0);
1651         Assert(curridx == Write->curridx);
1652
1653         /*
1654          * If asked to flush, do so
1655          */
1656         if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
1657                 XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1658         {
1659                 /*
1660                  * Could get here without iterating above loop, in which case we might
1661                  * have no open file or the wrong one.  However, we do not need to
1662                  * fsync more than one file.
1663                  */
                     /*
                      * NOTE(review): the explicit fsync is skipped for the two open-sync
                      * methods, presumably because files opened that way are already
                      * written synchronously --- confirm against issue_xlog_fsync and
                      * get_sync_bit.
                      */
1664                 if (sync_method != SYNC_METHOD_OPEN &&
1665                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1666                 {
1667                         if (openLogFile >= 0 &&
1668                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1669                                 XLogFileClose();
1670                         if (openLogFile < 0)
1671                         {
1672                                 XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1673                                 openLogFile = XLogFileOpen(openLogId, openLogSeg);
1674                                 openLogOff = 0;
1675                         }
1676                         issue_xlog_fsync();
1677                 }
1678                 LogwrtResult.Flush = LogwrtResult.Write;
1679         }
1680
1681         /*
1682          * Update shared-memory status
1683          *
1684          * We make sure that the shared 'request' values do not fall behind the
1685          * 'result' values.  This is not absolutely essential, but it saves some
1686          * code in a couple of places.
1687          */
1688         {
1689                 /* use volatile pointer to prevent code rearrangement */
1690                 volatile XLogCtlData *xlogctl = XLogCtl;
1691
1692                 SpinLockAcquire(&xlogctl->info_lck);
1693                 xlogctl->LogwrtResult = LogwrtResult;
1694                 if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1695                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1696                 if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1697                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1698                 SpinLockRelease(&xlogctl->info_lck);
1699         }
1700
             /*
              * Keep the copy in XLogCtl->Write (the one read back at the top of this
              * function) in step with the local result.
              */
1701         Write->LogwrtResult = LogwrtResult;
1702 }
1703
1704 /*
1705  * Record the LSN for an asynchronous transaction commit.
1706  * (This should not be called for aborts, nor for synchronous commits.)
1707  */
1708 void
1709 XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
1710 {
1711         /* use volatile pointer to prevent code rearrangement */
1712         volatile XLogCtlData *xlogctl = XLogCtl;
1713
1714         SpinLockAcquire(&xlogctl->info_lck);
1715         if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))
1716                 xlogctl->asyncCommitLSN = asyncCommitLSN;
1717         SpinLockRelease(&xlogctl->info_lck);
1718 }
1719
1720 /*
1721  * Ensure that all XLOG data through the given position is flushed to disk.
1722  *
1723  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
1724  * already held, and we try to avoid acquiring it if possible.
1725  */
1726 void
1727 XLogFlush(XLogRecPtr record)
1728 {
1729         XLogRecPtr      WriteRqstPtr;
1730         XLogwrtRqst WriteRqst;
1731
1732         /* Disabled during REDO */
1733         if (InRedo)
1734                 return;
1735
1736         /* Quick exit if already known flushed */
1737         if (XLByteLE(record, LogwrtResult.Flush))
1738                 return;
1739
1740 #ifdef WAL_DEBUG
1741         if (XLOG_DEBUG)
1742                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
1743                          record.xlogid, record.xrecoff,
1744                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1745                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1746 #endif
1747
1748         START_CRIT_SECTION();
1749
1750         /*
1751          * Since fsync is usually a horribly expensive operation, we try to
1752          * piggyback as much data as we can on each fsync: if we see any more data
1753          * entered into the xlog buffer, we'll write and fsync that too, so that
1754          * the final value of LogwrtResult.Flush is as large as possible. This
1755          * gives us some chance of avoiding another fsync immediately after.
1756          */
1757
1758         /* initialize to given target; may increase below */
1759         WriteRqstPtr = record;
1760
1761         /* read LogwrtResult and update local state */
1762         {
1763                 /* use volatile pointer to prevent code rearrangement */
1764                 volatile XLogCtlData *xlogctl = XLogCtl;
1765
1766                 SpinLockAcquire(&xlogctl->info_lck);
1767                 if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
1768                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1769                 LogwrtResult = xlogctl->LogwrtResult;
1770                 SpinLockRelease(&xlogctl->info_lck);
1771         }
1772
1773         /* done already? */
1774         if (!XLByteLE(record, LogwrtResult.Flush))
1775         {
1776                 /* now wait for the write lock */
1777                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1778                 LogwrtResult = XLogCtl->Write.LogwrtResult;
1779                 if (!XLByteLE(record, LogwrtResult.Flush))
1780                 {
1781                         /* try to write/flush later additions to XLOG as well */
1782                         if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
1783                         {
1784                                 XLogCtlInsert *Insert = &XLogCtl->Insert;
1785                                 uint32          freespace = INSERT_FREESPACE(Insert);
1786
1787                                 if (freespace < SizeOfXLogRecord)               /* buffer is full */
1788                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1789                                 else
1790                                 {
1791                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1792                                         WriteRqstPtr.xrecoff -= freespace;
1793                                 }
1794                                 LWLockRelease(WALInsertLock);
1795                                 WriteRqst.Write = WriteRqstPtr;
1796                                 WriteRqst.Flush = WriteRqstPtr;
1797                         }
1798                         else
1799                         {
1800                                 WriteRqst.Write = WriteRqstPtr;
1801                                 WriteRqst.Flush = record;
1802                         }
1803                         XLogWrite(WriteRqst, false, false);
1804                 }
1805                 LWLockRelease(WALWriteLock);
1806         }
1807
1808         END_CRIT_SECTION();
1809
1810         /*
1811          * If we still haven't flushed to the request point then we have a
1812          * problem; most likely, the requested flush point is past end of XLOG.
1813          * This has been seen to occur when a disk page has a corrupted LSN.
1814          *
1815          * Formerly we treated this as a PANIC condition, but that hurts the
1816          * system's robustness rather than helping it: we do not want to take down
1817          * the whole system due to corruption on one data page.  In particular, if
1818          * the bad page is encountered again during recovery then we would be
1819          * unable to restart the database at all!  (This scenario has actually
1820          * happened in the field several times with 7.1 releases. Note that we
1821          * cannot get here while InRedo is true, but if the bad page is brought in
1822          * and marked dirty during recovery then CreateCheckPoint will try to
1823          * flush it at the end of recovery.)
1824          *
1825          * The current approach is to ERROR under normal conditions, but only
1826          * WARNING during recovery, so that the system can be brought up even if
1827          * there's a corrupt LSN.  Note that for calls from xact.c, the ERROR will
1828          * be promoted to PANIC since xact.c calls this routine inside a critical
1829          * section.  However, calls from bufmgr.c are not within critical sections
1830          * and so we will not force a restart for a bad LSN on a data page.
1831          */
1832         if (XLByteLT(LogwrtResult.Flush, record))
1833                 elog(InRecovery ? WARNING : ERROR,
1834                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
1835                          record.xlogid, record.xrecoff,
1836                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1837 }
1838
1839 /*
1840  * Flush xlog, but without specifying exactly where to flush to.
1841  *
1842  * We normally flush only completed blocks; but if there is nothing to do on
1843  * that basis, we check for unflushed async commits in the current incomplete
1844  * block, and flush through the latest one of those.  Thus, if async commits
1845  * are not being used, we will flush complete blocks only.      We can guarantee
1846  * that async commits reach disk after at most three cycles; normally only
1847  * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
1848  * at the end of the buffer ring; this makes a difference only with very high
1849  * load or long wal_writer_delay, but imposes one extra cycle for the worst
1850  * case for async commits.)
1851  *
1852  * This routine is invoked periodically by the background walwriter process.
1853  */
1854 void
1855 XLogBackgroundFlush(void)
1856 {
1857         XLogRecPtr      WriteRqstPtr;
1858         bool            flexible = true;
1859
1860         /* read LogwrtResult and update local state */
1861         {
1862                 /* use volatile pointer to prevent code rearrangement */
1863                 volatile XLogCtlData *xlogctl = XLogCtl;
1864
1865                 SpinLockAcquire(&xlogctl->info_lck);
1866                 LogwrtResult = xlogctl->LogwrtResult;
1867                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1868                 SpinLockRelease(&xlogctl->info_lck);
1869         }
1870
1871         /* back off to last completed page boundary */
1872         WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
1873
1874         /* if we have already flushed that far, consider async commit records */
1875         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1876         {
1877                 /* use volatile pointer to prevent code rearrangement */
1878                 volatile XLogCtlData *xlogctl = XLogCtl;
1879
1880                 SpinLockAcquire(&xlogctl->info_lck);
1881                 WriteRqstPtr = xlogctl->asyncCommitLSN;
1882                 SpinLockRelease(&xlogctl->info_lck);
1883                 flexible = false;               /* ensure it all gets written */
1884         }
1885
1886         /* Done if already known flushed */
1887         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1888                 return;
1889
1890 #ifdef WAL_DEBUG
1891         if (XLOG_DEBUG)
1892                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
1893                          WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
1894                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1895                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1896 #endif
1897
1898         START_CRIT_SECTION();
1899
1900         /* now wait for the write lock */
1901         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1902         LogwrtResult = XLogCtl->Write.LogwrtResult;
1903         if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1904         {
1905                 XLogwrtRqst WriteRqst;
1906
1907                 WriteRqst.Write = WriteRqstPtr;
1908                 WriteRqst.Flush = WriteRqstPtr;
1909                 XLogWrite(WriteRqst, flexible, false);
1910         }
1911         LWLockRelease(WALWriteLock);
1912
1913         END_CRIT_SECTION();
1914 }
1915
1916 /*
1917  * Flush any previous asynchronously-committed transactions' commit records.
1918  *
1919  * NOTE: it is unwise to assume that this provides any strong guarantees.
1920  * In particular, because of the inexact LSN bookkeeping used by clog.c,
1921  * we cannot assume that hint bits will be settable for these transactions.
1922  */
1923 void
1924 XLogAsyncCommitFlush(void)
1925 {
1926         XLogRecPtr      WriteRqstPtr;
1927
1928         /* use volatile pointer to prevent code rearrangement */
1929         volatile XLogCtlData *xlogctl = XLogCtl;
1930
1931         SpinLockAcquire(&xlogctl->info_lck);
1932         WriteRqstPtr = xlogctl->asyncCommitLSN;
1933         SpinLockRelease(&xlogctl->info_lck);
1934
1935         XLogFlush(WriteRqstPtr);
1936 }
1937
1938 /*
1939  * Test whether XLOG data has been flushed up to (at least) the given position.
1940  *
1941  * Returns true if a flush is still needed.  (It may be that someone else
1942  * is already in process of flushing that far, however.)
1943  */
1944 bool
1945 XLogNeedsFlush(XLogRecPtr record)
1946 {
1947         /* Quick exit if already known flushed */
1948         if (XLByteLE(record, LogwrtResult.Flush))
1949                 return false;
1950
1951         /* read LogwrtResult and update local state */
1952         {
1953                 /* use volatile pointer to prevent code rearrangement */
1954                 volatile XLogCtlData *xlogctl = XLogCtl;
1955
1956                 SpinLockAcquire(&xlogctl->info_lck);
1957                 LogwrtResult = xlogctl->LogwrtResult;
1958                 SpinLockRelease(&xlogctl->info_lck);
1959         }
1960
1961         /* check again */
1962         if (XLByteLE(record, LogwrtResult.Flush))
1963                 return false;
1964
1965         return true;
1966 }
1967
1968 /*
1969  * Create a new XLOG file segment, or open a pre-existing one.
1970  *
1971  * log, seg: identify segment to be created/opened.
1972  *
1973  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
1974  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
1975  * file was used.
1976  *
1977  * use_lock: if TRUE, acquire ControlFileLock while moving file into
1978  * place.  This should be TRUE except during bootstrap log creation.  The
1979  * caller must *not* hold the lock at call.
1980  *
1981  * Returns FD of opened file.
1982  *
1983  * Note: errors here are ERROR not PANIC because we might or might not be
1984  * inside a critical section (eg, during checkpoint there is no reason to
1985  * take down the system on failure).  They will promote to PANIC if we are
1986  * in a critical section.
1987  */
1988 static int
1989 XLogFileInit(uint32 log, uint32 seg,
1990                          bool *use_existent, bool use_lock)
1991 {
1992         char            path[MAXPGPATH];
1993         char            tmppath[MAXPGPATH];
1994         char       *zbuffer;
1995         uint32          installed_log;
1996         uint32          installed_seg;
1997         int                     max_advance;
1998         int                     fd;
1999         int                     nbytes;
2000
2001         XLogFilePath(path, ThisTimeLineID, log, seg);
2002
2003         /*
2004          * Try to use existent file (checkpoint maker may have created it already)
2005          */
2006         if (*use_existent)
2007         {
2008                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2009                                                    S_IRUSR | S_IWUSR);
2010                 if (fd < 0)
2011                 {
2012                         if (errno != ENOENT)
2013                                 ereport(ERROR,
2014                                                 (errcode_for_file_access(),
2015                                                  errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2016                                                                 path, log, seg)));
2017                 }
2018                 else
2019                         return fd;
2020         }
2021
2022         /*
2023          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2024          * another process is doing the same thing.  If so, we will end up
2025          * pre-creating an extra log segment.  That seems OK, and better than
2026          * holding the lock throughout this lengthy process.
2027          */
2028         elog(DEBUG2, "creating and filling new WAL file");
2029
2030         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2031
2032         unlink(tmppath);
2033
2034         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2035         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2036                                            S_IRUSR | S_IWUSR);
2037         if (fd < 0)
2038                 ereport(ERROR,
2039                                 (errcode_for_file_access(),
2040                                  errmsg("could not create file \"%s\": %m", tmppath)));
2041
2042         /*
2043          * Zero-fill the file.  We have to do this the hard way to ensure that all
2044          * the file space has really been allocated --- on platforms that allow
2045          * "holes" in files, just seeking to the end doesn't allocate intermediate
2046          * space.  This way, we know that we have all the space and (after the
2047          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2048          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2049          * log file.
2050          *
2051          * Note: palloc zbuffer, instead of just using a local char array, to
2052          * ensure it is reasonably well-aligned; this may save a few cycles
2053          * transferring data to the kernel.
2054          */
2055         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2056         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2057         {
2058                 errno = 0;
2059                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2060                 {
2061                         int                     save_errno = errno;
2062
2063                         /*
2064                          * If we fail to make the file, delete it to release disk space
2065                          */
2066                         unlink(tmppath);
2067                         /* if write didn't set errno, assume problem is no disk space */
2068                         errno = save_errno ? save_errno : ENOSPC;
2069
2070                         ereport(ERROR,
2071                                         (errcode_for_file_access(),
2072                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2073                 }
2074         }
2075         pfree(zbuffer);
2076
2077         if (pg_fsync(fd) != 0)
2078                 ereport(ERROR,
2079                                 (errcode_for_file_access(),
2080                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2081
2082         if (close(fd))
2083                 ereport(ERROR,
2084                                 (errcode_for_file_access(),
2085                                  errmsg("could not close file \"%s\": %m", tmppath)));
2086
2087         /*
2088          * Now move the segment into place with its final name.
2089          *
2090          * If caller didn't want to use a pre-existing file, get rid of any
2091          * pre-existing file.  Otherwise, cope with possibility that someone else
2092          * has created the file while we were filling ours: if so, use ours to
2093          * pre-create a future log segment.
2094          */
2095         installed_log = log;
2096         installed_seg = seg;
2097         max_advance = XLOGfileslop;
2098         if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
2099                                                                 *use_existent, &max_advance,
2100                                                                 use_lock))
2101         {
2102                 /* No need for any more future segments... */
2103                 unlink(tmppath);
2104         }
2105
2106         elog(DEBUG2, "done creating and filling new WAL file");
2107
2108         /* Set flag to tell caller there was no existent file */
2109         *use_existent = false;
2110
2111         /* Now open original target segment (might not be file I just made) */
2112         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2113                                            S_IRUSR | S_IWUSR);
2114         if (fd < 0)
2115                 ereport(ERROR,
2116                                 (errcode_for_file_access(),
2117                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2118                                   path, log, seg)));
2119
2120         return fd;
2121 }
2122
2123 /*
2124  * Create a new XLOG file segment by copying a pre-existing one.
2125  *
2126  * log, seg: identify segment to be created.
2127  *
2128  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2129  *              a different timeline)
2130  *
2131  * Currently this is only used during recovery, and so there are no locking
2132  * considerations.      But we should be just as tense as XLogFileInit to avoid
2133  * emplacing a bogus file.
2134  */
2135 static void
2136 XLogFileCopy(uint32 log, uint32 seg,
2137                          TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
2138 {
2139         char            path[MAXPGPATH];
2140         char            tmppath[MAXPGPATH];
2141         char            buffer[XLOG_BLCKSZ];
2142         int                     srcfd;
2143         int                     fd;
2144         int                     nbytes;
2145
2146         /*
2147          * Open the source file
2148          */
2149         XLogFilePath(path, srcTLI, srclog, srcseg);
2150         srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2151         if (srcfd < 0)
2152                 ereport(ERROR,
2153                                 (errcode_for_file_access(),
2154                                  errmsg("could not open file \"%s\": %m", path)));
2155
2156         /*
2157          * Copy into a temp file name.
2158          */
2159         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2160
2161         unlink(tmppath);
2162
2163         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2164         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2165                                            S_IRUSR | S_IWUSR);
2166         if (fd < 0)
2167                 ereport(ERROR,
2168                                 (errcode_for_file_access(),
2169                                  errmsg("could not create file \"%s\": %m", tmppath)));
2170
2171         /*
2172          * Do the data copying.
2173          */
2174         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2175         {
2176                 errno = 0;
2177                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2178                 {
2179                         if (errno != 0)
2180                                 ereport(ERROR,
2181                                                 (errcode_for_file_access(),
2182                                                  errmsg("could not read file \"%s\": %m", path)));
2183                         else
2184                                 ereport(ERROR,
2185                                                 (errmsg("not enough data in file \"%s\"", path)));
2186                 }
2187                 errno = 0;
2188                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2189                 {
2190                         int                     save_errno = errno;
2191
2192                         /*
2193                          * If we fail to make the file, delete it to release disk space
2194                          */
2195                         unlink(tmppath);
2196                         /* if write didn't set errno, assume problem is no disk space */
2197                         errno = save_errno ? save_errno : ENOSPC;
2198
2199                         ereport(ERROR,
2200                                         (errcode_for_file_access(),
2201                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2202                 }
2203         }
2204
2205         if (pg_fsync(fd) != 0)
2206                 ereport(ERROR,
2207                                 (errcode_for_file_access(),
2208                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2209
2210         if (close(fd))
2211                 ereport(ERROR,
2212                                 (errcode_for_file_access(),
2213                                  errmsg("could not close file \"%s\": %m", tmppath)));
2214
2215         close(srcfd);
2216
2217         /*
2218          * Now move the segment into place with its final name.
2219          */
2220         if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
2221                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2222 }
2223
2224 /*
2225  * Install a new XLOG segment file as a current or future log segment.
2226  *
2227  * This is used both to install a newly-created segment (which has a temp
2228  * filename while it's being created) and to recycle an old segment.
2229  *
2230  * *log, *seg: identify segment to install as (or first possible target).
2231  * When find_free is TRUE, these are modified on return to indicate the
2232  * actual installation location or last segment searched.
2233  *
2234  * tmppath: initial name of file to install.  It will be renamed into place.
2235  *
2236  * find_free: if TRUE, install the new segment at the first empty log/seg
2237  * number at or after the passed numbers.  If FALSE, install the new segment
2238  * exactly where specified, deleting any existing segment file there.
2239  *
2240  * *max_advance: maximum number of log/seg slots to advance past the starting
2241  * point.  Fail if no free slot is found in this range.  On return, reduced
2242  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2243  * when find_free is FALSE.)
2244  *
2245  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2246  * place.  This should be TRUE except during bootstrap log creation.  The
2247  * caller must *not* hold the lock at call.
2248  *
2249  * Returns TRUE if file installed, FALSE if not installed because of
2250  * exceeding max_advance limit.  On Windows, we also return FALSE if we
2251  * can't rename the file into place because someone's got it open.
2252  * (Any other kind of failure causes ereport().)
2253  */
2254 static bool
2255 InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
2256                                            bool find_free, int *max_advance,
2257                                            bool use_lock)
2258 {
2259         char            path[MAXPGPATH];
2260         struct stat stat_buf;
2261
2262         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2263
2264         /*
2265          * We want to be sure that only one process does this at a time.
2266          */
2267         if (use_lock)
2268                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2269
2270         if (!find_free)
2271         {
2272                 /* Force installation: get rid of any pre-existing segment file */
2273                 unlink(path);
2274         }
2275         else
2276         {
2277                 /* Find a free slot to put it in */
2278                 while (stat(path, &stat_buf) == 0)
2279                 {
2280                         if (*max_advance <= 0)
2281                         {
2282                                 /* Failed to find a free slot within specified range */
2283                                 if (use_lock)
2284                                         LWLockRelease(ControlFileLock);
2285                                 return false;
2286                         }
2287                         NextLogSeg(*log, *seg);
2288                         (*max_advance)--;
2289                         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2290                 }
2291         }
2292
2293         /*
2294          * Prefer link() to rename() here just to be really sure that we don't
2295          * overwrite an existing logfile.  However, there shouldn't be one, so
2296          * rename() is an acceptable substitute except for the truly paranoid.
2297          */
2298 #if HAVE_WORKING_LINK
2299         if (link(tmppath, path) < 0)
2300                 ereport(ERROR,
2301                                 (errcode_for_file_access(),
2302                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2303                                                 tmppath, path, *log, *seg)));
2304         unlink(tmppath);
2305 #else
2306         if (rename(tmppath, path) < 0)
2307         {
2308 #ifdef WIN32
2309 #if !defined(__CYGWIN__)
2310                 if (GetLastError() == ERROR_ACCESS_DENIED)
2311 #else
2312                 if (errno == EACCES)
2313 #endif
2314                 {
2315                         if (use_lock)
2316                                 LWLockRelease(ControlFileLock);
2317                         return false;
2318                 }
2319 #endif   /* WIN32 */
2320
2321                 ereport(ERROR,
2322                                 (errcode_for_file_access(),
2323                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2324                                                 tmppath, path, *log, *seg)));
2325         }
2326 #endif
2327
2328         if (use_lock)
2329                 LWLockRelease(ControlFileLock);
2330
2331         return true;
2332 }
2333
2334 /*
2335  * Open a pre-existing logfile segment for writing.
2336  */
2337 static int
2338 XLogFileOpen(uint32 log, uint32 seg)
2339 {
2340         char            path[MAXPGPATH];
2341         int                     fd;
2342
2343         XLogFilePath(path, ThisTimeLineID, log, seg);
2344
2345         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2346                                            S_IRUSR | S_IWUSR);
2347         if (fd < 0)
2348                 ereport(PANIC,
2349                                 (errcode_for_file_access(),
2350                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2351                                   path, log, seg)));
2352
2353         return fd;
2354 }
2355
2356 /*
2357  * Open a logfile segment for reading (during recovery).
2358  */
2359 static int
2360 XLogFileRead(uint32 log, uint32 seg, int emode)
2361 {
2362         char            path[MAXPGPATH];
2363         char            xlogfname[MAXFNAMELEN];
2364         char            activitymsg[MAXFNAMELEN + 16];
2365         ListCell   *cell;
2366         int                     fd;
2367
2368         /*
2369          * Loop looking for a suitable timeline ID: we might need to read any of
2370          * the timelines listed in expectedTLIs.
2371          *
2372          * We expect curFileTLI on entry to be the TLI of the preceding file in
2373          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2374          * to go backwards; this prevents us from picking up the wrong file when a
2375          * parent timeline extends to higher segment numbers than the child we
2376          * want to read.
2377          */
2378         foreach(cell, expectedTLIs)
2379         {
2380                 TimeLineID      tli = (TimeLineID) lfirst_int(cell);
2381
2382                 if (tli < curFileTLI)
2383                         break;                          /* don't bother looking at too-old TLIs */
2384
2385                 XLogFileName(xlogfname, tli, log, seg);
2386
2387                 if (InArchiveRecovery)
2388                 {
2389                         /* Report recovery progress in PS display */
2390                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2391                                          xlogfname);
2392                         set_ps_display(activitymsg, false);
2393
2394                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2395                                                                                                           "RECOVERYXLOG",
2396                                                                                                           XLogSegSize);
2397                 }
2398                 else
2399                         XLogFilePath(path, tli, log, seg);
2400
2401                 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2402                 if (fd >= 0)
2403                 {
2404                         /* Success! */
2405                         curFileTLI = tli;
2406
2407                         /* Report recovery progress in PS display */
2408                         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2409                                          xlogfname);
2410                         set_ps_display(activitymsg, false);
2411
2412                         return fd;
2413                 }
2414                 if (errno != ENOENT)    /* unexpected failure? */
2415                         ereport(PANIC,
2416                                         (errcode_for_file_access(),
2417                         errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2418                                    path, log, seg)));
2419         }
2420
2421         /* Couldn't find it.  For simplicity, complain about front timeline */
2422         XLogFilePath(path, recoveryTargetTLI, log, seg);
2423         errno = ENOENT;
2424         ereport(emode,
2425                         (errcode_for_file_access(),
2426                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2427                                   path, log, seg)));
2428         return -1;
2429 }
2430
2431 /*
2432  * Close the current logfile segment for writing.
2433  */
2434 static void
2435 XLogFileClose(void)
2436 {
2437         Assert(openLogFile >= 0);
2438
2439         /*
2440          * WAL segment files will not be re-read in normal operation, so we advise
2441          * the OS to release any cached pages.  But do not do so if WAL archiving
2442          * is active, because archiver process could use the cache to read the WAL
2443          * segment.  Also, don't bother with it if we are using O_DIRECT, since
2444          * the kernel is presumably not caching in that case.
2445          */
2446 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2447         if (!XLogArchivingActive() &&
2448                 (get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
2449                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2450 #endif
2451
2452         if (close(openLogFile))
2453                 ereport(PANIC,
2454                                 (errcode_for_file_access(),
2455                                  errmsg("could not close log file %u, segment %u: %m",
2456                                                 openLogId, openLogSeg)));
2457         openLogFile = -1;
2458 }
2459
2460 /*
2461  * Attempt to retrieve the specified file from off-line archival storage.
2462  * If successful, fill "path" with its complete path (note that this will be
2463  * a temp file name that doesn't follow the normal naming convention), and
2464  * return TRUE.
2465  *
2466  * If not successful, fill "path" with the name of the normal on-line file
2467  * (which may or may not actually exist, but we'll try to use it), and return
2468  * FALSE.
2469  *
2470  * For fixed-size files, the caller may pass the expected size as an
2471  * additional crosscheck on successful recovery.  If the file size is not
2472  * known, set expectedSize = 0.
2473  */
2474 static bool
2475 RestoreArchivedFile(char *path, const char *xlogfname,
2476                                         const char *recovername, off_t expectedSize)
2477 {
2478         char            xlogpath[MAXPGPATH];
2479         char            xlogRestoreCmd[MAXPGPATH];
2480         char            lastRestartPointFname[MAXPGPATH];
2481         char       *dp;
2482         char       *endp;
2483         const char *sp;
2484         int                     rc;
2485         bool            signaled;
2486         struct stat stat_buf;
2487         uint32          restartLog;
2488         uint32          restartSeg;
2489
2490         /*
2491          * When doing archive recovery, we always prefer an archived log file even
2492          * if a file of the same name exists in XLOGDIR.  The reason is that the
2493          * file in XLOGDIR could be an old, un-filled or partly-filled version
2494          * that was copied and restored as part of backing up $PGDATA.
2495          *
2496          * We could try to optimize this slightly by checking the local copy
2497          * lastchange timestamp against the archived copy, but we have no API to
2498          * do this, nor can we guarantee that the lastchange timestamp was
2499          * preserved correctly when we copied to archive. Our aim is robustness,
2500          * so we elect not to do this.
2501          *
2502          * If we cannot obtain the log file from the archive, however, we will try
2503          * to use the XLOGDIR file if it exists.  This is so that we can make use
2504          * of log segments that weren't yet transferred to the archive.
2505          *
2506          * Notice that we don't actually overwrite any files when we copy back
2507          * from archive because the recoveryRestoreCommand may inadvertently
2508          * restore inappropriate xlogs, or they may be corrupt, so we may wish to
2509          * fallback to the segments remaining in current XLOGDIR later. The
2510          * copy-from-archive filename is always the same, ensuring that we don't
2511          * run out of disk space on long recoveries.
2512          */
2513         snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
2514
2515         /*
2516          * Make sure there is no existing file named recovername.
2517          */
2518         if (stat(xlogpath, &stat_buf) != 0)
2519         {
2520                 if (errno != ENOENT)
2521                         ereport(FATAL,
2522                                         (errcode_for_file_access(),
2523                                          errmsg("could not stat file \"%s\": %m",
2524                                                         xlogpath)));
2525         }
2526         else
2527         {
2528                 if (unlink(xlogpath) != 0)
2529                         ereport(FATAL,
2530                                         (errcode_for_file_access(),
2531                                          errmsg("could not remove file \"%s\": %m",
2532                                                         xlogpath)));
2533         }
2534
2535         /*
2536          * Calculate the archive file cutoff point for use during log shipping
2537          * replication. All files earlier than this point can be deleted
2538          * from the archive, though there is no requirement to do so.
2539          *
2540          * We initialise this with the filename of an InvalidXLogRecPtr, which
2541          * will prevent the deletion of any WAL files from the archive
2542          * because of the alphabetic sorting property of WAL filenames.
2543          *
2544          * Once we have successfully located the redo pointer of the checkpoint
2545          * from which we start recovery we never request a file prior to the redo
2546          * pointer of the last restartpoint. When redo begins we know that we
2547          * have successfully located it, so there is no need for additional
2548          * status flags to signify the point when we can begin deleting WAL files
2549          * from the archive.
2550          */
2551         if (InRedo)
2552         {
2553                 XLByteToSeg(ControlFile->checkPointCopy.redo,
2554                                         restartLog, restartSeg);
2555                 XLogFileName(lastRestartPointFname,
2556                                          ControlFile->checkPointCopy.ThisTimeLineID,
2557                                          restartLog, restartSeg);
2558                 /* we shouldn't need anything earlier than last restart point */
2559                 Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
2560         }
2561         else
2562                 XLogFileName(lastRestartPointFname, 0, 0, 0);
2563
2564         /*
2565          * construct the command to be executed
2566          */
2567         dp = xlogRestoreCmd;
2568         endp = xlogRestoreCmd + MAXPGPATH - 1;
2569         *endp = '\0';
2570
2571         for (sp = recoveryRestoreCommand; *sp; sp++)
2572         {
2573                 if (*sp == '%')
2574                 {
2575                         switch (sp[1])
2576                         {
2577                                 case 'p':
2578                                         /* %p: relative path of target file */
2579                                         sp++;
2580                                         StrNCpy(dp, xlogpath, endp - dp);
2581                                         make_native_path(dp);
2582                                         dp += strlen(dp);
2583                                         break;
2584                                 case 'f':
2585                                         /* %f: filename of desired file */
2586                                         sp++;
2587                                         StrNCpy(dp, xlogfname, endp - dp);
2588                                         dp += strlen(dp);
2589                                         break;
2590                                 case 'r':
2591                                         /* %r: filename of last restartpoint */
2592                                         sp++;
2593                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
2594                                         dp += strlen(dp);
2595                                         break;
2596                                 case '%':
2597                                         /* convert %% to a single % */
2598                                         sp++;
2599                                         if (dp < endp)
2600                                                 *dp++ = *sp;
2601                                         break;
2602                                 default:
2603                                         /* otherwise treat the % as not special */
2604                                         if (dp < endp)
2605                                                 *dp++ = *sp;
2606                                         break;
2607                         }
2608                 }
2609                 else
2610                 {
2611                         if (dp < endp)
2612                                 *dp++ = *sp;
2613                 }
2614         }
2615         *dp = '\0';
2616
2617         ereport(DEBUG3,
2618                         (errmsg_internal("executing restore command \"%s\"",
2619                                                          xlogRestoreCmd)));
2620
2621         /*
2622          * Copy xlog from archival storage to XLOGDIR
2623          */
2624         rc = system(xlogRestoreCmd);
2625         if (rc == 0)
2626         {
2627                 /*
2628                  * command apparently succeeded, but let's make sure the file is
2629                  * really there now and has the correct size.
2630                  *
2631                  * XXX I made wrong-size a fatal error to ensure the DBA would notice
2632                  * it, but is that too strong?  We could try to plow ahead with a
2633                  * local copy of the file ... but the problem is that there probably
2634                  * isn't one, and we'd incorrectly conclude we've reached the end of
2635                  * WAL and we're done recovering ...
2636                  */
2637                 if (stat(xlogpath, &stat_buf) == 0)
2638                 {
2639                         if (expectedSize > 0 && stat_buf.st_size != expectedSize)
2640                                 ereport(FATAL,
2641                                                 (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
2642                                                                 xlogfname,
2643                                                                 (unsigned long) stat_buf.st_size,
2644                                                                 (unsigned long) expectedSize)));
2645                         else
2646                         {
2647                                 ereport(LOG,
2648                                                 (errmsg("restored log file \"%s\" from archive",
2649                                                                 xlogfname)));
2650                                 strcpy(path, xlogpath);
2651                                 return true;
2652                         }
2653                 }
2654                 else
2655                 {
2656                         /* stat failed */
2657                         if (errno != ENOENT)
2658                                 ereport(FATAL,
2659                                                 (errcode_for_file_access(),
2660                                                  errmsg("could not stat file \"%s\": %m",
2661                                                                 xlogpath)));
2662                 }
2663         }
2664
2665         /*
2666          * Remember, we rollforward UNTIL the restore fails so failure here is
2667          * just part of the process... that makes it difficult to determine
2668          * whether the restore failed because there isn't an archive to restore,
2669          * or because the administrator has specified the restore program
2670          * incorrectly.  We have to assume the former.
2671          *
2672          * However, if the failure was due to any sort of signal, it's best to
2673          * punt and abort recovery.  (If we "return false" here, upper levels will
2674          * assume that recovery is complete and start up the database!) It's
2675          * essential to abort on child SIGINT and SIGQUIT, because per spec
2676          * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
2677          * those it's a good bet we should have gotten it too.  Aborting on other
2678          * signals such as SIGTERM seems a good idea as well.
2679          *
2680          * Per the Single Unix Spec, shells report exit status > 128 when a called
2681          * command died on a signal.  Also, 126 and 127 are used to report
2682          * problems such as an unfindable command; treat those as fatal errors
2683          * too.
2684          */
2685         signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
2686
2687         ereport(signaled ? FATAL : DEBUG2,
2688                 (errmsg("could not restore file \"%s\" from archive: return code %d",
2689                                 xlogfname, rc)));
2690
2691         /*
2692          * if an archived file is not available, there might still be a version of
2693          * this file in XLOGDIR, so return that as the filename to open.
2694          *
2695          * In many recovery scenarios we expect this to fail also, but if so that
2696          * just means we've reached the end of WAL.
2697          */
2698         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
2699         return false;
2700 }
2701
2702 /*
2703  * Preallocate log files beyond the specified log endpoint.
2704  *
2705  * XXX this is currently extremely conservative, since it forces only one
2706  * future log segment to exist, and even that only if we are 75% done with
2707  * the current one.  This is only appropriate for very low-WAL-volume systems.
2708  * High-volume systems will be OK once they've built up a sufficient set of
2709  * recycled log segments, but the startup transient is likely to include
2710  * a lot of segment creations by foreground processes, which is not so good.
2711  */
2712 static void
2713 PreallocXlogFiles(XLogRecPtr endptr)
2714 {
2715         uint32          _logId;
2716         uint32          _logSeg;
2717         int                     lf;
2718         bool            use_existent;
2719
2720         XLByteToPrevSeg(endptr, _logId, _logSeg);
2721         if ((endptr.xrecoff - 1) % XLogSegSize >=
2722                 (uint32) (0.75 * XLogSegSize))
2723         {
2724                 NextLogSeg(_logId, _logSeg);
2725                 use_existent = true;
2726                 lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
2727                 close(lf);
2728                 if (!use_existent)
2729                         CheckpointStats.ckpt_segs_added++;
2730         }
2731 }
2732
2733 /*
2734  * Recycle or remove all log files older or equal to passed log/seg#
2735  *
2736  * endptr is current (or recent) end of xlog; this is used to determine
2737  * whether we want to recycle rather than delete no-longer-wanted log files.
2738  */
2739 static void
2740 RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
2741 {
2742         uint32          endlogId;
2743         uint32          endlogSeg;
2744         int                     max_advance;
2745         DIR                *xldir;
2746         struct dirent *xlde;
2747         char            lastoff[MAXFNAMELEN];
2748         char            path[MAXPGPATH];
2749
2750         /*
2751          * Initialize info about where to try to recycle to.  We allow recycling
2752          * segments up to XLOGfileslop segments beyond the current XLOG location.
2753          */
2754         XLByteToPrevSeg(endptr, endlogId, endlogSeg);
2755         max_advance = XLOGfileslop;
2756
2757         xldir = AllocateDir(XLOGDIR);
2758         if (xldir == NULL)
2759                 ereport(ERROR,
2760                                 (errcode_for_file_access(),
2761                                  errmsg("could not open transaction log directory \"%s\": %m",
2762                                                 XLOGDIR)));
2763
2764         XLogFileName(lastoff, ThisTimeLineID, log, seg);
2765
2766         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
2767         {
2768                 /*
2769                  * We ignore the timeline part of the XLOG segment identifiers in
2770                  * deciding whether a segment is still needed.  This ensures that we
2771                  * won't prematurely remove a segment from a parent timeline. We could
2772                  * probably be a little more proactive about removing segments of
2773                  * non-parent timelines, but that would be a whole lot more
2774                  * complicated.
2775                  *
2776                  * We use the alphanumeric sorting property of the filenames to decide
2777                  * which ones are earlier than the lastoff segment.
2778                  */
2779                 if (strlen(xlde->d_name) == 24 &&
2780                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
2781                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
2782                 {
2783                         if (XLogArchiveCheckDone(xlde->d_name))
2784                         {
2785                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
2786
2787                                 /*
2788                                  * Before deleting the file, see if it can be recycled as a
2789                                  * future log segment.
2790                                  */
2791                                 if (InstallXLogFileSegment(&endlogId, &endlogSeg, path,
2792                                                                                    true, &max_advance,
2793                                                                                    true))
2794                                 {
2795                                         ereport(DEBUG2,
2796                                                         (errmsg("recycled transaction log file \"%s\"",
2797                                                                         xlde->d_name)));
2798                                         CheckpointStats.ckpt_segs_recycled++;
2799                                         /* Needn't recheck that slot on future iterations */
2800                                         if (max_advance > 0)
2801                                         {
2802                                                 NextLogSeg(endlogId, endlogSeg);
2803                                                 max_advance--;
2804                                         }
2805                                 }
2806                                 else
2807                                 {
2808                                         /* No need for any more future segments... */
2809                                         ereport(DEBUG2,
2810                                                         (errmsg("removing transaction log file \"%s\"",
2811                                                                         xlde->d_name)));
2812                                         unlink(path);
2813                                         CheckpointStats.ckpt_segs_removed++;
2814                                 }
2815
2816                                 XLogArchiveCleanup(xlde->d_name);
2817                         }
2818                 }
2819         }
2820
2821         FreeDir(xldir);
2822 }
2823
2824 /*
2825  * Verify whether pg_xlog and pg_xlog/archive_status exist.
2826  * If the latter does not exist, recreate it.
2827  *
2828  * It is not the goal of this function to verify the contents of these
2829  * directories, but to help in cases where someone has performed a cluster
2830  * copy for PITR purposes but omitted pg_xlog from the copy.
2831  *
2832  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
2833  * policy decision was made not to.  It is fairly common for pg_xlog to be
2834  * a symlink, and if that was the DBA's intent then automatically making a
2835  * plain directory would result in degraded performance with no notice.
2836  */
2837 static void
2838 ValidateXLOGDirectoryStructure(void)
2839 {
2840         char            path[MAXPGPATH];
2841         struct stat     stat_buf;
2842
2843         /* Check for pg_xlog; if it doesn't exist, error out */
2844         if (stat(XLOGDIR, &stat_buf) != 0 ||
2845                 !S_ISDIR(stat_buf.st_mode))
2846                 ereport(FATAL,
2847                                 (errmsg("required WAL directory \"%s\" does not exist",
2848                                                 XLOGDIR)));
2849
2850         /* Check for archive_status */
2851         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
2852         if (stat(path, &stat_buf) == 0)
2853         {
2854                 /* Check for weird cases where it exists but isn't a directory */
2855                 if (!S_ISDIR(stat_buf.st_mode))
2856                         ereport(FATAL,
2857                                         (errmsg("required WAL directory \"%s\" does not exist",
2858                                                         path)));
2859         }
2860         else
2861         {
2862                 ereport(LOG,
2863                                 (errmsg("creating missing WAL directory \"%s\"", path)));
2864                 if (mkdir(path, 0700) < 0)
2865                         ereport(FATAL,
2866                                         (errmsg("could not create missing directory \"%s\": %m",
2867                                                         path)));
2868         }
2869 }
2870
2871 /*
2872  * Remove previous backup history files.  This also retries creation of
2873  * .ready files for any backup history files for which XLogArchiveNotify
2874  * failed earlier.
2875  */
2876 static void
2877 CleanupBackupHistory(void)
2878 {
2879         DIR                *xldir;
2880         struct dirent *xlde;
2881         char            path[MAXPGPATH];
2882
2883         xldir = AllocateDir(XLOGDIR);
2884         if (xldir == NULL)
2885                 ereport(ERROR,
2886                                 (errcode_for_file_access(),
2887                                  errmsg("could not open transaction log directory \"%s\": %m",
2888                                                 XLOGDIR)));
2889
2890         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
2891         {
2892                 if (strlen(xlde->d_name) > 24 &&
2893                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
2894                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
2895                                    ".backup") == 0)
2896                 {
2897                         if (XLogArchiveCheckDone(xlde->d_name))
2898                         {
2899                                 ereport(DEBUG2,
2900                                 (errmsg("removing transaction log backup history file \"%s\"",
2901                                                 xlde->d_name)));
2902                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
2903                                 unlink(path);
2904                                 XLogArchiveCleanup(xlde->d_name);
2905                         }
2906                 }
2907         }
2908
2909         FreeDir(xldir);
2910 }
2911
2912 /*
2913  * Restore the backup blocks present in an XLOG record, if any.
2914  *
2915  * We assume all of the record has been read into memory at *record.
2916  *
2917  * Note: when a backup block is available in XLOG, we restore it
2918  * unconditionally, even if the page in the database appears newer.
2919  * This is to protect ourselves against database pages that were partially
2920  * or incorrectly written during a crash.  We assume that the XLOG data
2921  * must be good because it has passed a CRC check, while the database
2922  * page might not be.  This will force us to replay all subsequent
2923  * modifications of the page that appear in XLOG, rather than possibly
2924  * ignoring them as already applied, but that's not a huge drawback.
2925  *
2926  * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
2927  * Otherwise, a normal exclusive lock is used.  At the moment, that's just
2928  * pro forma, because there can't be any regular backends in the system
2929  * during recovery.  The 'cleanup' argument applies to all backup blocks
2930  * in the WAL record, that suffices for now.
2931  */
2932 void
2933 RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
2934 {
2935         Buffer          buffer;
2936         Page            page;
2937         BkpBlock        bkpb;
2938         char       *blk;
2939         int                     i;
2940
2941         if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
2942                 return;
2943
2944         blk = (char *) XLogRecGetData(record) + record->xl_len;
2945         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
2946         {
2947                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
2948                         continue;
2949
2950                 memcpy(&bkpb, blk, sizeof(BkpBlock));
2951                 blk += sizeof(BkpBlock);
2952
2953                 buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
2954                                                                                 RBM_ZERO);
2955                 Assert(BufferIsValid(buffer));
2956                 if (cleanup)
2957                         LockBufferForCleanup(buffer);
2958                 else
2959                         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2960
2961                 page = (Page) BufferGetPage(buffer);
2962
2963                 if (bkpb.hole_length == 0)
2964                 {
2965                         memcpy((char *) page, blk, BLCKSZ);
2966                 }
2967                 else
2968                 {
2969                         /* must zero-fill the hole */
2970                         MemSet((char *) page, 0, BLCKSZ);
2971                         memcpy((char *) page, blk, bkpb.hole_offset);
2972                         memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
2973                                    blk + bkpb.hole_offset,
2974                                    BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
2975                 }
2976
2977                 PageSetLSN(page, lsn);
2978                 PageSetTLI(page, ThisTimeLineID);
2979                 MarkBufferDirty(buffer);
2980                 UnlockReleaseBuffer(buffer);
2981
2982                 blk += BLCKSZ - bkpb.hole_length;
2983         }
2984 }
2985
2986 /*
2987  * CRC-check an XLOG record.  We do not believe the contents of an XLOG
2988  * record (other than to the minimal extent of computing the amount of
2989  * data to read in) until we've checked the CRCs.
2990  *
2991  * We assume all of the record has been read into memory at *record.
2992  */
2993 static bool
2994 RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
2995 {
2996         pg_crc32        crc;
2997         int                     i;
2998         uint32          len = record->xl_len;
2999         BkpBlock        bkpb;
3000         char       *blk;
3001
3002         /* First the rmgr data */
3003         INIT_CRC32(crc);
3004         COMP_CRC32(crc, XLogRecGetData(record), len);
3005
3006         /* Add in the backup blocks, if any */
3007         blk = (char *) XLogRecGetData(record) + len;
3008         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3009         {
3010                 uint32          blen;
3011
3012                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3013                         continue;
3014
3015                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3016                 if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3017                 {
3018                         ereport(emode,
3019                                         (errmsg("incorrect hole size in record at %X/%X",
3020                                                         recptr.xlogid, recptr.xrecoff)));
3021                         return false;
3022                 }
3023                 blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
3024                 COMP_CRC32(crc, blk, blen);
3025                 blk += blen;
3026         }
3027
3028         /* Check that xl_tot_len agrees with our calculation */
3029         if (blk != (char *) record + record->xl_tot_len)
3030         {
3031                 ereport(emode,
3032                                 (errmsg("incorrect total length in record at %X/%X",
3033                                                 recptr.xlogid, recptr.xrecoff)));
3034                 return false;
3035         }
3036
3037         /* Finally include the record header */
3038         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
3039                            SizeOfXLogRecord - sizeof(pg_crc32));
3040         FIN_CRC32(crc);
3041
3042         if (!EQ_CRC32(record->xl_crc, crc))
3043         {
3044                 ereport(emode,
3045                 (errmsg("incorrect resource manager data checksum in record at %X/%X",
3046                                 recptr.xlogid, recptr.xrecoff)));
3047                 return false;
3048         }
3049
3050         return true;
3051 }
3052
3053 /*
3054  * Attempt to read an XLOG record.
3055  *
3056  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3057  * try to read a record just after the last one previously read.
3058  *
3059  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3060  * (emode must be either PANIC or LOG.)
3061  *
3062  * The record is copied into readRecordBuf, so that on successful return,
3063  * the returned record pointer always points there.
3064  */
3065 static XLogRecord *
3066 ReadRecord(XLogRecPtr *RecPtr, int emode)
3067 {
3068         XLogRecord *record;
3069         char       *buffer;
3070         XLogRecPtr      tmpRecPtr = EndRecPtr;
3071         bool            randAccess = false;
3072         uint32          len,
3073                                 total_len;
3074         uint32          targetPageOff;
3075         uint32          targetRecOff;
3076         uint32          pageHeaderSize;
3077
3078         if (readBuf == NULL)
3079         {
3080                 /*
3081                  * First time through, permanently allocate readBuf.  We do it this
3082                  * way, rather than just making a static array, for two reasons: (1)
3083                  * no need to waste the storage in most instantiations of the backend;
3084                  * (2) a static char array isn't guaranteed to have any particular
3085                  * alignment, whereas malloc() will provide MAXALIGN'd storage.
3086                  */
3087                 readBuf = (char *) malloc(XLOG_BLCKSZ);
3088                 Assert(readBuf != NULL);
3089         }
3090
3091         if (RecPtr == NULL)
3092         {
3093                 RecPtr = &tmpRecPtr;
3094                 /* fast case if next record is on same page */
3095                 if (nextRecord != NULL)
3096                 {
3097                         record = nextRecord;
3098                         goto got_record;
3099                 }
3100                 /* align old recptr to next page */
3101                 if (tmpRecPtr.xrecoff % XLOG_BLCKSZ != 0)
3102                         tmpRecPtr.xrecoff += (XLOG_BLCKSZ - tmpRecPtr.xrecoff % XLOG_BLCKSZ);
3103                 if (tmpRecPtr.xrecoff >= XLogFileSize)
3104                 {
3105                         (tmpRecPtr.xlogid)++;
3106                         tmpRecPtr.xrecoff = 0;
3107                 }
3108                 /* We will account for page header size below */
3109         }
3110         else
3111         {
3112                 if (!XRecOffIsValid(RecPtr->xrecoff))
3113                         ereport(PANIC,
3114                                         (errmsg("invalid record offset at %X/%X",
3115                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3116
3117                 /*
3118                  * Since we are going to a random position in WAL, forget any prior
3119                  * state about what timeline we were in, and allow it to be any
3120                  * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
3121                  * to go backwards (but we can't reset that variable right here, since
3122                  * we might not change files at all).
3123                  */
3124                 lastPageTLI = 0;                /* see comment in ValidXLOGHeader */
3125                 randAccess = true;              /* allow curFileTLI to go backwards too */
3126         }
3127
3128         if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
3129         {
3130                 close(readFile);
3131                 readFile = -1;
3132         }
3133         XLByteToSeg(*RecPtr, readId, readSeg);
3134         if (readFile < 0)
3135         {
3136                 /* Now it's okay to reset curFileTLI if random fetch */
3137                 if (randAccess)
3138                         curFileTLI = 0;
3139
3140                 readFile = XLogFileRead(readId, readSeg, emode);
3141                 if (readFile < 0)
3142                         goto next_record_is_invalid;
3143
3144                 /*
3145                  * Whenever switching to a new WAL segment, we read the first page of
3146                  * the file and validate its header, even if that's not where the
3147                  * target record is.  This is so that we can check the additional
3148                  * identification info that is present in the first page's "long"
3149                  * header.
3150                  */
3151                 readOff = 0;
3152                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3153                 {
3154                         ereport(emode,
3155                                         (errcode_for_file_access(),
3156                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
3157                                                         readId, readSeg, readOff)));
3158                         goto next_record_is_invalid;
3159                 }
3160                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3161                         goto next_record_is_invalid;
3162         }
3163
3164         targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
3165         if (readOff != targetPageOff)
3166         {
3167                 readOff = targetPageOff;
3168                 if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
3169                 {
3170                         ereport(emode,
3171                                         (errcode_for_file_access(),
3172                                          errmsg("could not seek in log file %u, segment %u to offset %u: %m",
3173                                                         readId, readSeg, readOff)));
3174                         goto next_record_is_invalid;
3175                 }
3176                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3177                 {
3178                         ereport(emode,
3179                                         (errcode_for_file_access(),
3180                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
3181                                                         readId, readSeg, readOff)));
3182                         goto next_record_is_invalid;
3183                 }
3184                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3185                         goto next_record_is_invalid;
3186         }
3187         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3188         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3189         if (targetRecOff == 0)
3190         {
3191                 /*
3192                  * Can only get here in the continuing-from-prev-page case, because
3193                  * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
3194                  * to skip over the new page's header.
3195                  */
3196                 tmpRecPtr.xrecoff += pageHeaderSize;
3197                 targetRecOff = pageHeaderSize;
3198         }
3199         else if (targetRecOff < pageHeaderSize)
3200         {
3201                 ereport(emode,
3202                                 (errmsg("invalid record offset at %X/%X",
3203                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3204                 goto next_record_is_invalid;
3205         }
3206         if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3207                 targetRecOff == pageHeaderSize)
3208         {
3209                 ereport(emode,
3210                                 (errmsg("contrecord is requested by %X/%X",
3211                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3212                 goto next_record_is_invalid;
3213         }
3214         record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3215
3216 got_record:;
3217
3218         /*
3219          * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
3220          * required.
3221          */
3222         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3223         {
3224                 if (record->xl_len != 0)
3225                 {
3226                         ereport(emode,
3227                                         (errmsg("invalid xlog switch record at %X/%X",
3228                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3229                         goto next_record_is_invalid;
3230                 }
3231         }
3232         else if (record->xl_len == 0)
3233         {
3234                 ereport(emode,
3235                                 (errmsg("record with zero length at %X/%X",
3236                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3237                 goto next_record_is_invalid;
3238         }
3239         if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
3240                 record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
3241                 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
3242         {
3243                 ereport(emode,
3244                                 (errmsg("invalid record length at %X/%X",
3245                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3246                 goto next_record_is_invalid;
3247         }
3248         if (record->xl_rmid > RM_MAX_ID)
3249         {
3250                 ereport(emode,
3251                                 (errmsg("invalid resource manager ID %u at %X/%X",
3252                                                 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3253                 goto next_record_is_invalid;
3254         }
3255         if (randAccess)
3256         {
3257                 /*
3258                  * We can't exactly verify the prev-link, but surely it should be less
3259                  * than the record's own address.
3260                  */
3261                 if (!XLByteLT(record->xl_prev, *RecPtr))
3262                 {
3263                         ereport(emode,
3264                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3265                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3266                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3267                         goto next_record_is_invalid;
3268                 }
3269         }
3270         else
3271         {
3272                 /*
3273                  * Record's prev-link should exactly match our previous location. This
3274                  * check guards against torn WAL pages where a stale but valid-looking
3275                  * WAL record starts on a sector boundary.
3276                  */
3277                 if (!XLByteEQ(record->xl_prev, ReadRecPtr))
3278                 {
3279                         ereport(emode,
3280                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3281                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3282                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3283                         goto next_record_is_invalid;
3284                 }
3285         }
3286
3287         /*
3288          * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3289          * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
3290          * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
3291          * enough for all "normal" records, but very large commit or abort records
3292          * might need more space.)
3293          */
3294         total_len = record->xl_tot_len;
3295         if (total_len > readRecordBufSize)
3296         {
3297                 uint32          newSize = total_len;
3298
3299                 newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
3300                 newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3301                 if (readRecordBuf)
3302                         free(readRecordBuf);
3303                 readRecordBuf = (char *) malloc(newSize);
3304                 if (!readRecordBuf)
3305                 {
3306                         readRecordBufSize = 0;
3307                         /* We treat this as a "bogus data" condition */
3308                         ereport(emode,
3309                                         (errmsg("record length %u at %X/%X too long",
3310                                                         total_len, RecPtr->xlogid, RecPtr->xrecoff)));
3311                         goto next_record_is_invalid;
3312                 }
3313                 readRecordBufSize = newSize;
3314         }
3315
3316         buffer = readRecordBuf;
3317         nextRecord = NULL;
3318         len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
3319         if (total_len > len)
3320         {
3321                 /* Need to reassemble record */
3322                 XLogContRecord *contrecord;
3323                 uint32          gotlen = len;
3324
3325                 memcpy(buffer, record, len);
3326                 record = (XLogRecord *) buffer;
3327                 buffer += len;
3328                 for (;;)
3329                 {
3330                         readOff += XLOG_BLCKSZ;
3331                         if (readOff >= XLogSegSize)
3332                         {
3333                                 close(readFile);
3334                                 readFile = -1;
3335                                 NextLogSeg(readId, readSeg);
3336                                 readFile = XLogFileRead(readId, readSeg, emode);
3337                                 if (readFile < 0)
3338                                         goto next_record_is_invalid;
3339                                 readOff = 0;
3340                         }
3341                         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3342                         {
3343                                 ereport(emode,
3344                                                 (errcode_for_file_access(),
3345                                                  errmsg("could not read from log file %u, segment %u, offset %u: %m",
3346                                                                 readId, readSeg, readOff)));
3347                                 goto next_record_is_invalid;
3348                         }
3349                         if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3350                                 goto next_record_is_invalid;
3351                         if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
3352                         {
3353                                 ereport(emode,
3354                                                 (errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
3355                                                                 readId, readSeg, readOff)));
3356                                 goto next_record_is_invalid;
3357                         }
3358                         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3359                         contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
3360                         if (contrecord->xl_rem_len == 0 ||
3361                                 total_len != (contrecord->xl_rem_len + gotlen))
3362                         {
3363                                 ereport(emode,
3364                                                 (errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
3365                                                                 contrecord->xl_rem_len,
3366                                                                 readId, readSeg, readOff)));
3367                                 goto next_record_is_invalid;
3368                         }
3369                         len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
3370                         if (contrecord->xl_rem_len > len)
3371                         {
3372                                 memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
3373                                 gotlen += len;
3374                                 buffer += len;
3375                                 continue;
3376                         }
3377                         memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
3378                                    contrecord->xl_rem_len);
3379                         break;
3380                 }
3381                 if (!RecordIsValid(record, *RecPtr, emode))
3382                         goto next_record_is_invalid;
3383                 pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3384                 if (XLOG_BLCKSZ - SizeOfXLogRecord >= pageHeaderSize +
3385                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len))
3386                 {
3387                         nextRecord = (XLogRecord *) ((char *) contrecord +
3388                                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len));
3389                 }
3390                 EndRecPtr.xlogid = readId;
3391                 EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
3392                         pageHeaderSize +
3393                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
3394                 ReadRecPtr = *RecPtr;
3395                 /* needn't worry about XLOG SWITCH, it can't cross page boundaries */
3396                 return record;
3397         }
3398
3399         /* Record does not cross a page boundary */
3400         if (!RecordIsValid(record, *RecPtr, emode))
3401                 goto next_record_is_invalid;
3402         if (XLOG_BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % XLOG_BLCKSZ +
3403                 MAXALIGN(total_len))
3404                 nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
3405         EndRecPtr.xlogid = RecPtr->xlogid;
3406         EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
3407         ReadRecPtr = *RecPtr;
3408         memcpy(buffer, record, total_len);
3409
3410         /*
3411          * Special processing if it's an XLOG SWITCH record
3412          */
3413         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3414         {
3415                 /* Pretend it extends to end of segment */
3416                 EndRecPtr.xrecoff += XLogSegSize - 1;
3417                 EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
3418                 nextRecord = NULL;              /* definitely not on same page */
3419
3420                 /*
3421                  * Pretend that readBuf contains the last page of the segment. This is
3422                  * just to avoid Assert failure in StartupXLOG if XLOG ends with this
3423                  * segment.
3424                  */
3425                 readOff = XLogSegSize - XLOG_BLCKSZ;
3426         }
3427         return (XLogRecord *) buffer;
3428
3429 next_record_is_invalid:;
3430         if (readFile >= 0)
3431         {
3432                 close(readFile);
3433                 readFile = -1;
3434         }
3435         nextRecord = NULL;
3436         return NULL;
3437 }
3438
3439 /*
3440  * Check whether the xlog header of a page just read in looks valid.
3441  *
3442  * This is just a convenience subroutine to avoid duplicated code in
3443  * ReadRecord.  It's not intended for use from anywhere else.
      *
      * hdr is the page header to examine; emode is the elevel passed to
      * ereport() when a check fails.  The checks are made in the order listed
      * below; the first one that fails is reported and the function returns
      * false (assuming emode is a level at which ereport() returns):
      *      - xlp_magic must equal XLOG_PAGE_MAGIC;
      *      - xlp_info must have no bits set outside XLP_ALL_FLAGS;
      *      - if XLP_LONG_HEADER is set, xlp_sysid must match
      *        ControlFile->system_identifier, xlp_seg_size must equal
      *        XLogSegSize and xlp_xlog_blcksz must equal XLOG_BLCKSZ;
      *      - a page read at readOff == 0 must carry a long header;
      *      - xlp_pageaddr must equal the address computed from readId,
      *        readSeg and readOff;
      *      - xlp_tli must be a member of expectedTLIs;
      *      - xlp_tli must not be less than lastPageTLI.
      *
      * Returns true only if every check passes; in that case lastPageTLI is
      * updated to this page's xlp_tli as a side effect.
3444  */
3445 static bool
3446 ValidXLOGHeader(XLogPageHeader hdr, int emode)
3447 {
3448         XLogRecPtr      recaddr;
3449
3450         if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
3451         {
3452                 ereport(emode,
3453                                 (errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
3454                                                 hdr->xlp_magic, readId, readSeg, readOff)));
3455                 return false;
3456         }
             /* Any flag bit outside XLP_ALL_FLAGS is treated as corruption */
3457         if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
3458         {
3459                 ereport(emode,
3460                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3461                                                 hdr->xlp_info, readId, readSeg, readOff)));
3462                 return false;
3463         }
3464         if (hdr->xlp_info & XLP_LONG_HEADER)
3465         {
                     /* The same page start is reinterpreted as the long header layout */
3466                 XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
3467
3468                 if (longhdr->xlp_sysid != ControlFile->system_identifier)
3469                 {
3470                         char            fhdrident_str[32];
3471                         char            sysident_str[32];
3472
3473                         /*
3474                          * Format sysids separately to keep platform-dependent format code
3475                          * out of the translatable message string.
3476                          */
3477                         snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
3478                                          longhdr->xlp_sysid);
3479                         snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
3480                                          ControlFile->system_identifier);
3481                         ereport(emode,
3482                                         (errmsg("WAL file is from different system"),
3483                                          errdetail("WAL file SYSID is %s, pg_control SYSID is %s",
3484                                                            fhdrident_str, sysident_str)));
3485                         return false;
3486                 }
3487                 if (longhdr->xlp_seg_size != XLogSegSize)
3488                 {
3489                         ereport(emode,
3490                                         (errmsg("WAL file is from different system"),
3491                                          errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
3492                         return false;
3493                 }
3494                 if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
3495                 {
3496                         ereport(emode,
3497                                         (errmsg("WAL file is from different system"),
3498                                          errdetail("Incorrect XLOG_BLCKSZ in page header.")));
3499                         return false;
3500                 }
3501         }
3502         else if (readOff == 0)
3503         {
3504                 /* hmm, first page of file doesn't have a long header? */
                     /* (reported with the same message text as the info-bits check above) */
3505                 ereport(emode,
3506                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3507                                                 hdr->xlp_info, readId, readSeg, readOff)));
3508                 return false;
3509         }
3510
             /*
              * Rebuild the address this page is expected to carry from the current
              * read position (log file id, segment number, offset within segment)
              * and compare it with the address stored in the header.
              */
3511         recaddr.xlogid = readId;
3512         recaddr.xrecoff = readSeg * XLogSegSize + readOff;
3513         if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
3514         {
3515                 ereport(emode,
3516                                 (errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
3517                                                 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
3518                                                 readId, readSeg, readOff)));
3519                 return false;
3520         }
3521
3522         /*
3523          * Check page TLI is one of the expected values.
3524          */
3525         if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
3526         {
3527                 ereport(emode,
3528                                 (errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
3529                                                 hdr->xlp_tli,
3530                                                 readId, readSeg, readOff)));
3531                 return false;
3532         }
3533
3534         /*
3535          * Since child timelines are always assigned a TLI greater than their
3536          * immediate parent's TLI, we should never see TLI go backwards across
3537          * successive pages of a consistent WAL sequence.
3538          *
3539          * Of course this check should only be applied when advancing sequentially
3540          * across pages; therefore ReadRecord resets lastPageTLI to zero when
3541          * going to a random page.
3542          */
3543         if (hdr->xlp_tli < lastPageTLI)
3544         {
3545                 ereport(emode,
3546                                 (errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
3547                                                 hdr->xlp_tli, lastPageTLI,
3548                                                 readId, readSeg, readOff)));
3549                 return false;
3550         }
             /* All checks passed: remember this page's TLI for the next call */
3551         lastPageTLI = hdr->xlp_tli;
3552         return true;
3553 }
3554
3555 /*
3556  * Try to read a timeline's history file.
3557  *
3558  * If successful, return the list of component TLIs (the given TLI followed by
3559  * its ancestor TLIs).  If we can't find the history file, assume that the
3560  * timeline has no parents, and return a list of just the specified timeline
3561  * ID.
      *
      * When InArchiveRecovery is set, the file is requested through
      * RestoreArchivedFile(), which fills in "path"; otherwise the path comes
      * from TLHistoryFilePath().  Nothing returned by RestoreArchivedFile() is
      * examined here: whether the file is available is decided solely by
      * whether "path" can be opened.  An open failure other than ENOENT is
      * reported as FATAL.
      *
      * File format accepted by the parser below: lines that are empty, contain
      * only whitespace, or whose first non-whitespace character is '#' are
      * skipped.  Every other line must start with a number readable by
      * strtoul() with base 0; the rest of the line is ignored.  The numbers
      * must be strictly increasing from line to line, and targetTLI must be
      * greater than the last one; violations are reported as FATAL.
      *
      * The result is an integer List ordered newest first: targetTLI, then the
      * file's entries in reverse file order.
3562  */
3563 static List *
3564 readTimeLineHistory(TimeLineID targetTLI)
3565 {
3566         List       *result;
3567         char            path[MAXPGPATH];
3568         char            histfname[MAXFNAMELEN];
3569         char            fline[MAXPGPATH];
3570         FILE       *fd;
3571
3572         if (InArchiveRecovery)
3573         {
3574                 TLHistoryFileName(histfname, targetTLI);
3575                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3576         }
3577         else
3578                 TLHistoryFilePath(path, targetTLI);
3579
3580         fd = AllocateFile(path, "r");
3581         if (fd == NULL)
3582         {
3583                 if (errno != ENOENT)
3584                         ereport(FATAL,
3585                                         (errcode_for_file_access(),
3586                                          errmsg("could not open file \"%s\": %m", path)));
3587                 /* Not there, so assume no parents */
3588                 return list_make1_int((int) targetTLI);
3589         }
3590
3591         result = NIL;
3592
3593         /*
3594          * Parse the file...
3595          */
3596         while (fgets(fline, sizeof(fline), fd) != NULL)
3597         {
3598                 /* skip leading whitespace and check for # comment */
3599                 char       *ptr;
3600                 char       *endptr;
3601                 TimeLineID      tli;
3602
3603                 for (ptr = fline; *ptr; ptr++)
3604                 {
3605                         if (!isspace((unsigned char) *ptr))
3606                                 break;
3607                 }
3608                 if (*ptr == '\0' || *ptr == '#')
3609                         continue;
3610
3611                 /* expect a numeric timeline ID as first field of line */
3612                 tli = (TimeLineID) strtoul(ptr, &endptr, 0);
                     /* endptr == ptr means strtoul consumed no characters at all */
3613                 if (endptr == ptr)
3614                         ereport(FATAL,
3615                                         (errmsg("syntax error in history file: %s", fline),
3616                                          errhint("Expected a numeric timeline ID.")));
3617
                     /*
                      * Entries are prepended below, so the list head is the TLI taken
                      * from the previously accepted line.
                      */
3618                 if (result &&
3619                         tli <= (TimeLineID) linitial_int(result))
3620                         ereport(FATAL,
3621                                         (errmsg("invalid data in history file: %s", fline),
3622                                    errhint("Timeline IDs must be in increasing sequence.")));
3623
3624                 /* Build list with newest item first */
3625                 result = lcons_int((int) tli, result);
3626
3627                 /* we ignore the remainder of each line */
3628         }
3629
3630         FreeFile(fd);
3631
             /* The head is now the last TLI listed in the file; the target must exceed it */
3632         if (result &&
3633                 targetTLI <= (TimeLineID) linitial_int(result))
3634                 ereport(FATAL,
3635                                 (errmsg("invalid data in history file \"%s\"", path),
3636                         errhint("Timeline IDs must be less than child timeline's ID.")));
3637
             /* The requested timeline itself goes at the front of the list */
3638         result = lcons_int((int) targetTLI, result);
3639
3640         ereport(DEBUG3,
3641                         (errmsg_internal("history of timeline %u is %s",
3642                                                          targetTLI, nodeToString(result))));
3643
3644         return result;
3645 }
3646
3647 /*
3648  * Probe whether a timeline history file exists for the given timeline ID
      *
      * The path is obtained the same way on both branches below: through
      * RestoreArchivedFile() when InArchiveRecovery is set, otherwise through
      * TLHistoryFilePath().  "Exists" means the resulting path can be opened
      * for reading.
      *
      * Returns true if the file could be opened (it is closed again at once),
      * false if the open failed with ENOENT.  Any other open failure is
      * reported as FATAL.
3649  */
3650 static bool
3651 existsTimeLineHistory(TimeLineID probeTLI)
3652 {
3653         char            path[MAXPGPATH];
3654         char            histfname[MAXFNAMELEN];
3655         FILE       *fd;
3656
3657         if (InArchiveRecovery)
3658         {
3659                 TLHistoryFileName(histfname, probeTLI);
3660                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3661         }
3662         else
3663                 TLHistoryFilePath(path, probeTLI);
3664
3665         fd = AllocateFile(path, "r");
3666         if (fd != NULL)
3667         {
                     /* Only existence matters; the contents are not read here */
3668                 FreeFile(fd);
3669                 return true;
3670         }
3671         else
3672         {
                     /* ENOENT is the expected "no such timeline" answer; anything else is fatal */
3673                 if (errno != ENOENT)
3674                         ereport(FATAL,
3675                                         (errcode_for_file_access(),
3676                                          errmsg("could not open file \"%s\": %m", path)));
3677                 return false;
3678         }
3679 }
3680
3681 /*
3682  * Find the newest existing timeline, assuming that startTLI exists.
3683  *
3684  * Note: while this is somewhat heuristic, it does positively guarantee
3685  * that (result + 1) is not a known timeline, and therefore it should
3686  * be safe to assign that ID to a new timeline.
      *
      * startTLI itself is never probed.  Probing begins at startTLI + 1 and
      * advances by one until existsTimeLineHistory() reports a missing file.
      * The return value is the last ID for which a history file was found, or
      * startTLI if the very first probe fails.  Since the scan stops at the
      * first gap, timelines beyond a gap are not discovered.
3687  */
3688 static TimeLineID
3689 findNewestTimeLine(TimeLineID startTLI)
3690 {
3691         TimeLineID      newestTLI;
3692         TimeLineID      probeTLI;
3693
3694         /*
3695          * The algorithm is just to probe for the existence of timeline history
3696          * files.  XXX is it useful to allow gaps in the sequence?
3697          */
3698         newestTLI = startTLI;
3699
             /* No loop condition: the only exit is the break on the first missing file */
3700         for (probeTLI = startTLI + 1;; probeTLI++)
3701         {
3702                 if (existsTimeLineHistory(probeTLI))
3703                 {
3704                         newestTLI = probeTLI;           /* probeTLI exists */
3705                 }
3706                 else
3707                 {
3708                         /* doesn't exist, assume we're done */
3709                         break;
3710                 }
3711         }
3712
3713         return newestTLI;
3714 }
3715
3716 /*
3717  * Create a new timeline history file.
3718  *
3719  *      newTLI: ID of the new timeline
3720  *      parentTLI: ID of its immediate parent
3721  *      endTLI et al: ID of the last used WAL file, for annotation purposes
3722  *
3723  * Currently this is only used during recovery, and so there are no locking
3724  * considerations.      But we should be just as tense as XLogFileInit to avoid
3725  * emplacing a bogus file.
      *
      * Steps, in order:
      *      1. Create a temp file XLOGDIR "/xlogtemp.<pid>", after unlinking any
      *         leftover file of that name.
      *      2. If the parent's history file can be opened, copy it verbatim into
      *         the temp file; ENOENT on the parent is tolerated.
      *      3. Append one line: parentTLI, the WAL file name built from endTLI,
      *         endLogId and endLogSeg, and a description of the stop point taken
      *         from recoveryStopAfter, recoveryStopXid and recoveryStopTime.
      *      4. fsync and close the temp file.
      *      5. link() (or rename(), depending on HAVE_WORKING_LINK) it to the final
      *         history file path for newTLI.
      *      6. Call XLogArchiveNotify() with the new history file's name.
      *
      * Every failure is reported with ereport(ERROR).
      * NOTE(review): only the two write-failure paths unlink the temp file
      * before reporting; the read, fsync, close and link/rename failure paths
      * leave it in place.
3726  */
3727 static void
3728 writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
3729                                          TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
3730 {
3731         char            path[MAXPGPATH];
3732         char            tmppath[MAXPGPATH];
3733         char            histfname[MAXFNAMELEN];
3734         char            xlogfname[MAXFNAMELEN];
3735         char            buffer[BLCKSZ];
3736         int                     srcfd;
3737         int                     fd;
3738         int                     nbytes;
3739
3740         Assert(newTLI > parentTLI); /* else bad selection of newTLI */
3741
3742         /*
3743          * Write into a temp file name.
3744          */
3745         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3746
             /* Remove any leftover temp file so the O_EXCL create below can succeed */
3747         unlink(tmppath);
3748
3749         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3750         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
3751                                            S_IRUSR | S_IWUSR);
3752         if (fd < 0)
3753                 ereport(ERROR,
3754                                 (errcode_for_file_access(),
3755                                  errmsg("could not create file \"%s\": %m", tmppath)));
3756
3757         /*
3758          * If a history file exists for the parent, copy it verbatim
3759          */
3760         if (InArchiveRecovery)
3761         {
3762                 TLHistoryFileName(histfname, parentTLI);
3763                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3764         }
3765         else
3766                 TLHistoryFilePath(path, parentTLI);
3767
3768         srcfd = BasicOpenFile(path, O_RDONLY, 0);
3769         if (srcfd < 0)
3770         {
3771                 if (errno != ENOENT)
3772                         ereport(ERROR,
3773                                         (errcode_for_file_access(),
3774                                          errmsg("could not open file \"%s\": %m", path)));
3775                 /* Not there, so assume parent has no parents */
3776         }
3777         else
3778         {
                     /* Copy in chunks of sizeof(buffer) until read() returns 0 */
3779                 for (;;)
3780                 {
                             /* errno is cleared first so the errno != 0 test below is meaningful */
3781                         errno = 0;
3782                         nbytes = (int) read(srcfd, buffer, sizeof(buffer));
3783                         if (nbytes < 0 || errno != 0)
3784                                 ereport(ERROR,
3785                                                 (errcode_for_file_access(),
3786                                                  errmsg("could not read file \"%s\": %m", path)));
3787                         if (nbytes == 0)
3788                                 break;
                             /* cleared so a short write that leaves errno unset can be detected */
3789                         errno = 0;
3790                         if ((int) write(fd, buffer, nbytes) != nbytes)
3791                         {
                                     /* saved because the unlink() below may change errno */
3792                                 int                     save_errno = errno;
3793
3794                                 /*
3795                                  * If we fail to make the file, delete it to release disk
3796                                  * space
3797                                  */
3798                                 unlink(tmppath);
3799
3800                                 /*
3801                                  * if write didn't set errno, assume problem is no disk space
3802                                  */
3803                                 errno = save_errno ? save_errno : ENOSPC;
3804
3805                                 ereport(ERROR,
3806                                                 (errcode_for_file_access(),
3807                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3808                         }
3809                 }
3810                 close(srcfd);
3811         }
3812
3813         /*
3814          * Append one line with the details of this timeline split.
3815          *
3816          * If we did have a parent file, insert an extra newline just in case the
3817          * parent file failed to end with one.
              *
              * srcfd has already been closed at this point; only its value is tested
              * below, to tell whether a parent file was copied.
3818          */
3819         XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);
3820
3821         snprintf(buffer, sizeof(buffer),
3822                          "%s%u\t%s\t%s transaction %u at %s\n",
3823                          (srcfd < 0) ? "" : "\n",
3824                          parentTLI,
3825                          xlogfname,
3826                          recoveryStopAfter ? "after" : "before",
3827                          recoveryStopXid,
3828                          timestamptz_to_str(recoveryStopTime));
3829
3830         nbytes = strlen(buffer);
3831         errno = 0;
3832         if ((int) write(fd, buffer, nbytes) != nbytes)
3833         {
3834                 int                     save_errno = errno;
3835
3836                 /*
3837                  * If we fail to make the file, delete it to release disk space
3838                  */
3839                 unlink(tmppath);
3840                 /* if write didn't set errno, assume problem is no disk space */
3841                 errno = save_errno ? save_errno : ENOSPC;
3842
3843                 ereport(ERROR,
3844                                 (errcode_for_file_access(),
3845                                  errmsg("could not write to file \"%s\": %m", tmppath)));
3846         }
3847
             /* Flush the completed file before it is given its final name */
3848         if (pg_fsync(fd) != 0)
3849                 ereport(ERROR,
3850                                 (errcode_for_file_access(),
3851                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3852
3853         if (close(fd))
3854                 ereport(ERROR,
3855                                 (errcode_for_file_access(),
3856                                  errmsg("could not close file \"%s\": %m", tmppath)));
3857
3858
3859         /*
3860          * Now move the completed history file into place with its final name.
3861          */
3862         TLHistoryFilePath(path, newTLI);
3863
3864         /*
3865          * Prefer link() to rename() here just to be really sure that we don't
3866          * overwrite an existing logfile.  However, there shouldn't be one, so
3867          * rename() is an acceptable substitute except for the truly paranoid.
3868          */
3869 #if HAVE_WORKING_LINK
3870         if (link(tmppath, path) < 0)
3871                 ereport(ERROR,
3872                                 (errcode_for_file_access(),
3873                                  errmsg("could not link file \"%s\" to \"%s\": %m",
3874                                                 tmppath, path)));
             /* the final name now refers to the file, so the temp name can go */
3875         unlink(tmppath);
3876 #else
3877         if (rename(tmppath, path) < 0)
3878                 ereport(ERROR,
3879                                 (errcode_for_file_access(),
3880                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
3881                                                 tmppath, path)));
3882 #endif
3883
3884         /* The history file can be archived immediately. */
3885         TLHistoryFileName(histfname, newTLI);
3886         XLogArchiveNotify(histfname);
3887 }
3888
3889 /*
3890  * I/O routines for pg_control
3891  *
3892  * *ControlFile is a buffer in shared memory that holds an image of the
3893  * contents of pg_control.      WriteControlFile() initializes pg_control
3894  * given a preloaded buffer, ReadControlFile() loads the buffer from
3895  * the pg_control file (during postmaster or standalone-backend startup),
3896  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
3897  *
3898  * For simplicity, WriteControlFile() initializes the fields of pg_control
3899  * that are related to checking backend/database compatibility, and
3900  * ReadControlFile() verifies they are correct.  We could split out the
3901  * I/O and compatibility-check functions, but there seems no need currently.
      *
      * WriteControlFile() in detail: it stamps the version and compatibility
      * fields into *ControlFile, computes the CRC over the bytes that precede
      * the crc field, and writes a zero-padded image of PG_CONTROL_SIZE bytes
      * to XLOG_CONTROL_FILE.  The file is opened with O_CREAT | O_EXCL, so it
      * must not already exist.  Failure to create, write, fsync or close the
      * file is reported at PANIC level.  It takes no arguments and returns
      * nothing; its input is whatever the caller already stored in
      * *ControlFile.
3902  */
3903 static void
3904 WriteControlFile(void)
3905 {
3906         int                     fd;
3907         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
3908
3909         /*
3910          * Initialize version and compatibility-check fields
3911          */
3912         ControlFile->pg_control_version = PG_CONTROL_VERSION;
3913         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
3914
3915         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
3916         ControlFile->floatFormat = FLOATFORMAT_VALUE;
3917
3918         ControlFile->blcksz = BLCKSZ;
3919         ControlFile->relseg_size = RELSEG_SIZE;
3920         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
3921         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
3922
3923         ControlFile->nameDataLen = NAMEDATALEN;
3924         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
3925
3926         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
3927
3928 #ifdef HAVE_INT64_TIMESTAMP
3929         ControlFile->enableIntTimes = true;
3930 #else
3931         ControlFile->enableIntTimes = false;
3932 #endif
3933         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
3934         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
3935
3936         /* Contents are protected with a CRC */
             /* Computed last, and only over the bytes before the crc field itself */
3937         INIT_CRC32(ControlFile->crc);
3938         COMP_CRC32(ControlFile->crc,
3939                            (char *) ControlFile,
3940                            offsetof(ControlFileData, crc));
3941         FIN_CRC32(ControlFile->crc);
3942
3943         /*
3944          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
3945          * excess over sizeof(ControlFileData).  This reduces the odds of
3946          * premature-EOF errors when reading pg_control.  We'll still fail when we
3947          * check the contents of the file, but hopefully with a more specific
3948          * error than "couldn't read pg_control".
3949          */
             /* Guard the memcpy below against overrunning the local buffer */
3950         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
3951                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
3952
3953         memset(buffer, 0, PG_CONTROL_SIZE);
3954         memcpy(buffer, ControlFile, sizeof(ControlFileData));
3955
3956         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3957                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3958                                            S_IRUSR | S_IWUSR);
3959         if (fd < 0)
3960                 ereport(PANIC,
3961                                 (errcode_for_file_access(),
3962                                  errmsg("could not create control file \"%s\": %m",
3963                                                 XLOG_CONTROL_FILE)));
3964
             /* cleared so a short write that leaves errno unset can be detected */
3965         errno = 0;
3966         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
3967         {
3968                 /* if write didn't set errno, assume problem is no disk space */
3969                 if (errno == 0)
3970                         errno = ENOSPC;
3971                 ereport(PANIC,
3972                                 (errcode_for_file_access(),
3973                                  errmsg("could not write to control file: %m")));
3974         }
3975
3976         if (pg_fsync(fd) != 0)
3977                 ereport(PANIC,
3978                                 (errcode_for_file_access(),
3979                                  errmsg("could not fsync control file: %m")));
3980
3981         if (close(fd))
3982                 ereport(PANIC,
3983                                 (errcode_for_file_access(),
3984                                  errmsg("could not close control file: %m")));
3985 }
3986
3987 static void
3988 ReadControlFile(void)
3989 {
3990         pg_crc32        crc;
3991         int                     fd;
3992
3993         /*
3994          * Read data...
3995          */
3996         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3997                                            O_RDWR | PG_BINARY,
3998                                            S_IRUSR | S_IWUSR);
3999         if (fd < 0)
4000                 ereport(PANIC,
4001                                 (errcode_for_file_access(),
4002                                  errmsg("could not open control file \"%s\": %m",
4003                                                 XLOG_CONTROL_FILE)));
4004
4005         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4006                 ereport(PANIC,
4007                                 (errcode_for_file_access(),
4008                                  errmsg("could not read from control file: %m")));
4009
4010         close(fd);
4011
4012         /*
4013          * Check for expected pg_control format version.  If this is wrong, the
4014          * CRC check will likely fail because we'll be checking the wrong number
4015          * of bytes.  Complaining about wrong version will probably be more
4016          * enlightening than complaining about wrong CRC.
4017          */
4018
4019         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4020                 ereport(FATAL,
4021                                 (errmsg("database files are incompatible with server"),
4022                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4023                                                    " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4024                                                    ControlFile->pg_control_version, ControlFile->pg_control_version,
4025                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4026                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4027
4028         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4029                 ereport(FATAL,
4030                                 (errmsg("database files are incompatible with server"),
4031                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4032                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4033                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4034                                  errhint("It looks like you need to initdb.")));
4035
4036         /* Now check the CRC. */
4037         INIT_CRC32(crc);
4038         COMP_CRC32(crc,
4039                            (char *) ControlFile,
4040                            offsetof(ControlFileData, crc));
4041         FIN_CRC32(crc);
4042
4043         if (!EQ_CRC32(crc, ControlFile->crc))
4044                 ereport(FATAL,
4045                                 (errmsg("incorrect checksum in control file")));
4046
4047         /*
4048          * Do compatibility checking immediately.  If the database isn't
4049          * compatible with the backend executable, we want to abort before we
4050          * can possibly do any damage.
4051          */
4052         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4053                 ereport(FATAL,
4054                                 (errmsg("database files are incompatible with server"),
4055                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4056                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4057                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4058                                  errhint("It looks like you need to initdb.")));
4059         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4060                 ereport(FATAL,
4061                                 (errmsg("database files are incompatible with server"),
4062                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4063                                          " but the server was compiled with MAXALIGN %d.",
4064                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4065                                  errhint("It looks like you need to initdb.")));
4066         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4067                 ereport(FATAL,
4068                                 (errmsg("database files are incompatible with server"),
4069                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4070                                  errhint("It looks like you need to initdb.")));
4071         if (ControlFile->blcksz != BLCKSZ)
4072                 ereport(FATAL,
4073                                 (errmsg("database files are incompatible with server"),
4074                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4075                                            " but the server was compiled with BLCKSZ %d.",
4076                                            ControlFile->blcksz, BLCKSZ),
4077                                  errhint("It looks like you need to recompile or initdb.")));
4078         if (ControlFile->relseg_size != RELSEG_SIZE)
4079                 ereport(FATAL,
4080                                 (errmsg("database files are incompatible with server"),
4081                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4082                                   " but the server was compiled with RELSEG_SIZE %d.",
4083                                   ControlFile->relseg_size, RELSEG_SIZE),
4084                                  errhint("It looks like you need to recompile or initdb.")));
4085         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4086                 ereport(FATAL,
4087                                 (errmsg("database files are incompatible with server"),
4088                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4089                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4090                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4091                                  errhint("It looks like you need to recompile or initdb.")));
4092         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4093                 ereport(FATAL,
4094                                 (errmsg("database files are incompatible with server"),
4095                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4096                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4097                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4098                                  errhint("It looks like you need to recompile or initdb.")));
4099         if (ControlFile->nameDataLen != NAMEDATALEN)
4100                 ereport(FATAL,
4101                                 (errmsg("database files are incompatible with server"),
4102                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4103                                   " but the server was compiled with NAMEDATALEN %d.",
4104                                   ControlFile->nameDataLen, NAMEDATALEN),
4105                                  errhint("It looks like you need to recompile or initdb.")));
4106         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4107                 ereport(FATAL,
4108                                 (errmsg("database files are incompatible with server"),
4109                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4110                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4111                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4112                                  errhint("It looks like you need to recompile or initdb.")));
4113         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4114                 ereport(FATAL,
4115                                 (errmsg("database files are incompatible with server"),
4116                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4117                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4118                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4119                                  errhint("It looks like you need to recompile or initdb.")));
4120
4121 #ifdef HAVE_INT64_TIMESTAMP
4122         if (ControlFile->enableIntTimes != true)
4123                 ereport(FATAL,
4124                                 (errmsg("database files are incompatible with server"),
4125                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4126                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4127                                  errhint("It looks like you need to recompile or initdb.")));
4128 #else
4129         if (ControlFile->enableIntTimes != false)
4130                 ereport(FATAL,
4131                                 (errmsg("database files are incompatible with server"),
4132                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4133                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4134                                  errhint("It looks like you need to recompile or initdb.")));
4135 #endif
4136
4137 #ifdef USE_FLOAT4_BYVAL
4138         if (ControlFile->float4ByVal != true)
4139                 ereport(FATAL,
4140                                 (errmsg("database files are incompatible with server"),
4141                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4142                                                    " but the server was compiled with USE_FLOAT4_BYVAL."),
4143                                  errhint("It looks like you need to recompile or initdb.")));
4144 #else
4145         if (ControlFile->float4ByVal != false)
4146                 ereport(FATAL,
4147                                 (errmsg("database files are incompatible with server"),
4148                                  errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4149                                                    " but the server was compiled without USE_FLOAT4_BYVAL."),
4150                                  errhint("It looks like you need to recompile or initdb.")));
4151 #endif
4152
4153 #ifdef USE_FLOAT8_BYVAL
4154         if (ControlFile->float8ByVal != true)
4155                 ereport(FATAL,
4156                                 (errmsg("database files are incompatible with server"),
4157                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4158                                                    " but the server was compiled with USE_FLOAT8_BYVAL."),
4159                                  errhint("It looks like you need to recompile or initdb.")));
4160 #else
4161         if (ControlFile->float8ByVal != false)
4162                 ereport(FATAL,
4163                                 (errmsg("database files are incompatible with server"),
4164                                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4165                                                    " but the server was compiled without USE_FLOAT8_BYVAL."),
4166                                  errhint("It looks like you need to recompile or initdb.")));
4167 #endif
4168 }
4169
4170 void
4171 UpdateControlFile(void)
4172 {
4173         int                     fd;
4174
4175         INIT_CRC32(ControlFile->crc);
4176         COMP_CRC32(ControlFile->crc,
4177                            (char *) ControlFile,
4178                            offsetof(ControlFileData, crc));
4179         FIN_CRC32(ControlFile->crc);
4180
4181         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4182                                            O_RDWR | PG_BINARY,
4183                                            S_IRUSR | S_IWUSR);
4184         if (fd < 0)
4185                 ereport(PANIC,
4186                                 (errcode_for_file_access(),
4187                                  errmsg("could not open control file \"%s\": %m",
4188                                                 XLOG_CONTROL_FILE)));
4189
4190         errno = 0;
4191         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4192         {
4193                 /* if write didn't set errno, assume problem is no disk space */
4194                 if (errno == 0)
4195                         errno = ENOSPC;
4196                 ereport(PANIC,
4197                                 (errcode_for_file_access(),
4198                                  errmsg("could not write to control file: %m")));
4199         }
4200
4201         if (pg_fsync(fd) != 0)
4202                 ereport(PANIC,
4203                                 (errcode_for_file_access(),
4204                                  errmsg("could not fsync control file: %m")));
4205
4206         if (close(fd))
4207                 ereport(PANIC,
4208                                 (errcode_for_file_access(),
4209                                  errmsg("could not close control file: %m")));
4210 }
4211
4212 /*
4213  * Initialization of shared memory for XLOG
4214  */
4215 Size
4216 XLOGShmemSize(void)
4217 {
4218         Size            size;
4219
4220         /* XLogCtl */
4221         size = sizeof(XLogCtlData);
4222         /* xlblocks array */
4223         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4224         /* extra alignment padding for XLOG I/O buffers */
4225         size = add_size(size, ALIGNOF_XLOG_BUFFER);
4226         /* and the buffers themselves */
4227         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4228
4229         /*
4230          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4231          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4232          * routine again below to compute the actual allocation size.
4233          */
4234
4235         return size;
4236 }
4237
4238 void
4239 XLOGShmemInit(void)
4240 {
4241         bool            foundCFile,
4242                                 foundXLog;
4243         char       *allocptr;
4244
4245         ControlFile = (ControlFileData *)
4246                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4247         XLogCtl = (XLogCtlData *)
4248                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4249
4250         if (foundCFile || foundXLog)
4251         {
4252                 /* both should be present or neither */
4253                 Assert(foundCFile && foundXLog);
4254                 return;
4255         }
4256
4257         memset(XLogCtl, 0, sizeof(XLogCtlData));
4258
4259         /*
4260          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4261          * multiple of the alignment for same, so no extra alignment padding is
4262          * needed here.
4263          */
4264         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4265         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4266         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4267         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4268
4269         /*
4270          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
4271          */
4272         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
4273         XLogCtl->pages = allocptr;
4274         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4275
4276         /*
4277          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4278          * in additional info.)
4279          */
4280         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4281         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
4282         SpinLockInit(&XLogCtl->info_lck);
4283
4284         /*
4285          * If we are not in bootstrap mode, pg_control should already exist. Read
4286          * and validate it immediately (see comments in ReadControlFile() for the
4287          * reasons why).
4288          */
4289         if (!IsBootstrapProcessingMode())
4290                 ReadControlFile();
4291 }
4292
4293 /*
4294  * This func must be called ONCE on system install.  It creates pg_control
4295  * and the initial XLOG segment.
4296  */
4297 void
4298 BootStrapXLOG(void)
4299 {
4300         CheckPoint      checkPoint;
4301         char       *buffer;
4302         XLogPageHeader page;
4303         XLogLongPageHeader longpage;
4304         XLogRecord *record;
4305         bool            use_existent;
4306         uint64          sysidentifier;
4307         struct timeval tv;
4308         pg_crc32        crc;
4309
4310         /*
4311          * Select a hopefully-unique system identifier code for this installation.
4312          * We use the result of gettimeofday(), including the fractional seconds
4313          * field, as being about as unique as we can easily get.  (Think not to
4314          * use random(), since it hasn't been seeded and there's no portable way
4315          * to seed it other than the system clock value...)  The upper half of the
4316          * uint64 value is just the tv_sec part, while the lower half is the XOR
4317          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
4318          * unnecessarily if "uint64" is really only 32 bits wide.  A person
4319          * knowing this encoding can determine the initialization time of the
4320          * installation, which could perhaps be useful sometimes.
4321          */
4322         gettimeofday(&tv, NULL);
4323         sysidentifier = ((uint64) tv.tv_sec) << 32;
4324         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
4325
4326         /* First timeline ID is always 1 */
4327         ThisTimeLineID = 1;
4328
4329         /* page buffer must be aligned suitably for O_DIRECT */
4330         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4331         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4332         memset(page, 0, XLOG_BLCKSZ);
4333
4334         /* Set up information for the initial checkpoint record */
4335         checkPoint.redo.xlogid = 0;
4336         checkPoint.redo.xrecoff = SizeOfXLogLongPHD;
4337         checkPoint.ThisTimeLineID = ThisTimeLineID;
4338         checkPoint.nextXidEpoch = 0;
4339         checkPoint.nextXid = FirstNormalTransactionId;
4340         checkPoint.nextOid = FirstBootstrapObjectId;
4341         checkPoint.nextMulti = FirstMultiXactId;
4342         checkPoint.nextMultiOffset = 0;
4343         checkPoint.time = (pg_time_t) time(NULL);
4344
4345         ShmemVariableCache->nextXid = checkPoint.nextXid;
4346         ShmemVariableCache->nextOid = checkPoint.nextOid;
4347         ShmemVariableCache->oidCount = 0;
4348         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4349
4350         /* Set up the XLOG page header */
4351         page->xlp_magic = XLOG_PAGE_MAGIC;
4352         page->xlp_info = XLP_LONG_HEADER;
4353         page->xlp_tli = ThisTimeLineID;
4354         page->xlp_pageaddr.xlogid = 0;
4355         page->xlp_pageaddr.xrecoff = 0;
4356         longpage = (XLogLongPageHeader) page;
4357         longpage->xlp_sysid = sysidentifier;
4358         longpage->xlp_seg_size = XLogSegSize;
4359         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4360
4361         /* Insert the initial checkpoint record */
4362         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4363         record->xl_prev.xlogid = 0;
4364         record->xl_prev.xrecoff = 0;
4365         record->xl_xid = InvalidTransactionId;
4366         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4367         record->xl_len = sizeof(checkPoint);
4368         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4369         record->xl_rmid = RM_XLOG_ID;
4370         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4371
4372         INIT_CRC32(crc);
4373         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
4374         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
4375                            SizeOfXLogRecord - sizeof(pg_crc32));
4376         FIN_CRC32(crc);
4377         record->xl_crc = crc;
4378
4379         /* Create first XLOG segment file */
4380         use_existent = false;
4381         openLogFile = XLogFileInit(0, 0, &use_existent, false);
4382
4383         /* Write the first page with the initial record */
4384         errno = 0;
4385         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4386         {
4387                 /* if write didn't set errno, assume problem is no disk space */
4388                 if (errno == 0)
4389                         errno = ENOSPC;
4390                 ereport(PANIC,
4391                                 (errcode_for_file_access(),
4392                           errmsg("could not write bootstrap transaction log file: %m")));
4393         }
4394
4395         if (pg_fsync(openLogFile) != 0)
4396                 ereport(PANIC,
4397                                 (errcode_for_file_access(),
4398                           errmsg("could not fsync bootstrap transaction log file: %m")));
4399
4400         if (close(openLogFile))
4401                 ereport(PANIC,
4402                                 (errcode_for_file_access(),
4403                           errmsg("could not close bootstrap transaction log file: %m")));
4404
4405         openLogFile = -1;
4406
4407         /* Now create pg_control */
4408
4409         memset(ControlFile, 0, sizeof(ControlFileData));
4410         /* Initialize pg_control status fields */
4411         ControlFile->system_identifier = sysidentifier;
4412         ControlFile->state = DB_SHUTDOWNED;
4413         ControlFile->time = checkPoint.time;
4414         ControlFile->checkPoint = checkPoint.redo;
4415         ControlFile->checkPointCopy = checkPoint;
4416         /* some additional ControlFile fields are set in WriteControlFile() */
4417
4418         WriteControlFile();
4419
4420         /* Bootstrap the commit log, too */
4421         BootStrapCLOG();
4422         BootStrapSUBTRANS();
4423         BootStrapMultiXact();
4424
4425         pfree(buffer);
4426 }
4427
4428 static char *
4429 str_time(pg_time_t tnow)
4430 {
4431         static char buf[128];
4432
4433         pg_strftime(buf, sizeof(buf),
4434                                 "%Y-%m-%d %H:%M:%S %Z",
4435                                 pg_localtime(&tnow, log_timezone));
4436
4437         return buf;
4438 }
4439
4440 /*
4441  * See if there is a recovery command file (recovery.conf), and if so
4442  * read in parameters for archive recovery.
4443  *
4444  * XXX longer term intention is to expand this to
4445  * cater for additional parameters and controls
4446  * possibly use a flex lexer similar to the GUC one
4447  */
4448 static void
4449 readRecoveryCommandFile(void)
4450 {
4451         FILE       *fd;
4452         char            cmdline[MAXPGPATH];
4453         TimeLineID      rtli = 0;
4454         bool            rtliGiven = false;
4455         bool            syntaxError = false;
4456
4457         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4458         if (fd == NULL)
4459         {
4460                 if (errno == ENOENT)
4461                         return;                         /* not there, so no archive recovery */
4462                 ereport(FATAL,
4463                                 (errcode_for_file_access(),
4464                                  errmsg("could not open recovery command file \"%s\": %m",
4465                                                 RECOVERY_COMMAND_FILE)));
4466         }
4467
4468         ereport(LOG,
4469                         (errmsg("starting archive recovery")));
4470
4471         /*
4472          * Parse the file...
4473          */
4474         while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
4475         {
4476                 /* skip leading whitespace and check for # comment */
4477                 char       *ptr;
4478                 char       *tok1;
4479                 char       *tok2;
4480
4481                 for (ptr = cmdline; *ptr; ptr++)
4482                 {
4483                         if (!isspace((unsigned char) *ptr))
4484                                 break;
4485                 }
4486                 if (*ptr == '\0' || *ptr == '#')
4487                         continue;
4488
4489                 /* identify the quoted parameter value */
4490                 tok1 = strtok(ptr, "'");
4491                 if (!tok1)
4492                 {
4493                         syntaxError = true;
4494                         break;
4495                 }
4496                 tok2 = strtok(NULL, "'");
4497                 if (!tok2)
4498                 {
4499                         syntaxError = true;
4500                         break;
4501                 }
4502                 /* reparse to get just the parameter name */
4503                 tok1 = strtok(ptr, " \t=");
4504                 if (!tok1)
4505                 {
4506                         syntaxError = true;
4507                         break;
4508                 }
4509
4510                 if (strcmp(tok1, "restore_command") == 0)
4511                 {
4512                         recoveryRestoreCommand = pstrdup(tok2);
4513                         ereport(LOG,
4514                                         (errmsg("restore_command = '%s'",
4515                                                         recoveryRestoreCommand)));
4516                 }
4517                 else if (strcmp(tok1, "recovery_target_timeline") == 0)
4518                 {
4519                         rtliGiven = true;
4520                         if (strcmp(tok2, "latest") == 0)
4521                                 rtli = 0;
4522                         else
4523                         {
4524                                 errno = 0;
4525                                 rtli = (TimeLineID) strtoul(tok2, NULL, 0);
4526                                 if (errno == EINVAL || errno == ERANGE)
4527                                         ereport(FATAL,
4528                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
4529                                                                         tok2)));
4530                         }
4531                         if (rtli)
4532                                 ereport(LOG,
4533                                                 (errmsg("recovery_target_timeline = %u", rtli)));
4534                         else
4535                                 ereport(LOG,
4536                                                 (errmsg("recovery_target_timeline = latest")));
4537                 }
4538                 else if (strcmp(tok1, "recovery_target_xid") == 0)
4539                 {
4540                         errno = 0;
4541                         recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
4542                         if (errno == EINVAL || errno == ERANGE)
4543                                 ereport(FATAL,
4544                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
4545                                                  tok2)));
4546                         ereport(LOG,
4547                                         (errmsg("recovery_target_xid = %u",
4548                                                         recoveryTargetXid)));
4549                         recoveryTarget = true;
4550                         recoveryTargetExact = true;
4551                 }
4552                 else if (strcmp(tok1, "recovery_target_time") == 0)
4553                 {
4554                         /*
4555                          * if recovery_target_xid specified, then this overrides
4556                          * recovery_target_time
4557                          */
4558                         if (recoveryTargetExact)
4559                                 continue;
4560                         recoveryTarget = true;
4561                         recoveryTargetExact = false;
4562
4563                         /*
4564                          * Convert the time string given by the user to TimestampTz form.
4565                          */
4566                         recoveryTargetTime =
4567                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
4568                                                                                                                 CStringGetDatum(tok2),
4569                                                                                                 ObjectIdGetDatum(InvalidOid),
4570                                                                                                                 Int32GetDatum(-1)));
4571                         ereport(LOG,
4572                                         (errmsg("recovery_target_time = '%s'",
4573                                                         timestamptz_to_str(recoveryTargetTime))));
4574                 }
4575                 else if (strcmp(tok1, "recovery_target_inclusive") == 0)
4576                 {
4577                         /*
4578                          * does nothing if a recovery_target is not also set
4579                          */
4580                         if (!parse_bool(tok2, &recoveryTargetInclusive))
4581                                   ereport(ERROR,
4582                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4583                                           errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value")));
4584                         ereport(LOG,
4585                                         (errmsg("recovery_target_inclusive = %s", tok2)));
4586                 }
4587                 else if (strcmp(tok1, "log_restartpoints") == 0)
4588                 {
4589                         /*
4590                          * does nothing if a recovery_target is not also set
4591                          */
4592                         if (!parse_bool(tok2, &recoveryLogRestartpoints))
4593                                   ereport(ERROR,
4594                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4595                                           errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
4596                         ereport(LOG,
4597                                         (errmsg("log_restartpoints = %s", tok2)));
4598                 }
4599                 else
4600                         ereport(FATAL,
4601                                         (errmsg("unrecognized recovery parameter \"%s\"",
4602                                                         tok1)));
4603         }
4604
4605         FreeFile(fd);
4606
4607         if (syntaxError)
4608                 ereport(FATAL,
4609                                 (errmsg("syntax error in recovery command file: %s",
4610                                                 cmdline),
4611                           errhint("Lines should have the format parameter = 'value'.")));
4612
4613         /* Check that required parameters were supplied */
4614         if (recoveryRestoreCommand == NULL)
4615                 ereport(FATAL,
4616                                 (errmsg("recovery command file \"%s\" did not specify restore_command",
4617                                                 RECOVERY_COMMAND_FILE)));
4618
4619         /* Enable fetching from archive recovery area */
4620         InArchiveRecovery = true;
4621
4622         /*
4623          * If user specified recovery_target_timeline, validate it or compute the
4624          * "latest" value.      We can't do this until after we've gotten the restore
4625          * command and set InArchiveRecovery, because we need to fetch timeline
4626          * history files from the archive.
4627          */
4628         if (rtliGiven)
4629         {
4630                 if (rtli)
4631                 {
4632                         /* Timeline 1 does not have a history file, all else should */
4633                         if (rtli != 1 && !existsTimeLineHistory(rtli))
4634                                 ereport(FATAL,
4635                                                 (errmsg("recovery target timeline %u does not exist",
4636                                                                 rtli)));
4637                         recoveryTargetTLI = rtli;
4638                 }
4639                 else
4640                 {
4641                         /* We start the "latest" search from pg_control's timeline */
4642                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
4643                 }
4644         }
4645 }
4646
4647 /*
4648  * Exit archive-recovery state
4649  */
4650 static void
4651 exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
4652 {
4653         char            recoveryPath[MAXPGPATH];
4654         char            xlogpath[MAXPGPATH];
4655
4656         /*
4657          * We are no longer in archive recovery state.
4658          */
4659         InArchiveRecovery = false;
4660
4661         /*
4662          * We should have the ending log segment currently open.  Verify, and then
4663          * close it (to avoid problems on Windows with trying to rename or delete
4664          * an open file).
4665          */
4666         Assert(readFile >= 0);
4667         Assert(readId == endLogId);
4668         Assert(readSeg == endLogSeg);
4669
4670         close(readFile);
4671         readFile = -1;
4672
4673         /*
4674          * If the segment was fetched from archival storage, we want to replace
4675          * the existing xlog segment (if any) with the archival version.  This is
4676          * because whatever is in XLOGDIR is very possibly older than what we have
4677          * from the archives, since it could have come from restoring a PGDATA
4678          * backup.      In any case, the archival version certainly is more
4679          * descriptive of what our current database state is, because that is what
4680          * we replayed from.
4681          *
4682          * Note that if we are establishing a new timeline, ThisTimeLineID is
4683          * already set to the new value, and so we will create a new file instead
4684          * of overwriting any existing file.  (This is, in fact, always the case
4685          * at present.)
4686          */
4687         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
4688         XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
4689
4690         if (restoredFromArchive)
4691         {
4692                 ereport(DEBUG3,
4693                                 (errmsg_internal("moving last restored xlog to \"%s\"",
4694                                                                  xlogpath)));
4695                 unlink(xlogpath);               /* might or might not exist */
4696                 if (rename(recoveryPath, xlogpath) != 0)
4697                         ereport(FATAL,
4698                                         (errcode_for_file_access(),
4699                                          errmsg("could not rename file \"%s\" to \"%s\": %m",
4700                                                         recoveryPath, xlogpath)));
4701                 /* XXX might we need to fix permissions on the file? */
4702         }
4703         else
4704         {
4705                 /*
4706                  * If the latest segment is not archival, but there's still a
4707                  * RECOVERYXLOG laying about, get rid of it.
4708                  */
4709                 unlink(recoveryPath);   /* ignore any error */
4710
4711                 /*
4712                  * If we are establishing a new timeline, we have to copy data from
4713                  * the last WAL segment of the old timeline to create a starting WAL
4714                  * segment for the new timeline.
4715                  */
4716                 if (endTLI != ThisTimeLineID)
4717                         XLogFileCopy(endLogId, endLogSeg,
4718                                                  endTLI, endLogId, endLogSeg);
4719         }
4720
4721         /*
4722          * Let's just make real sure there are not .ready or .done flags posted
4723          * for the new segment.
4724          */
4725         XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
4726         XLogArchiveCleanup(xlogpath);
4727
4728         /* Get rid of any remaining recovered timeline-history file, too */
4729         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
4730         unlink(recoveryPath);           /* ignore any error */
4731
4732         /*
4733          * Rename the config file out of the way, so that we don't accidentally
4734          * re-enter archive recovery mode in a subsequent crash.
4735          */
4736         unlink(RECOVERY_COMMAND_DONE);
4737         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
4738                 ereport(FATAL,
4739                                 (errcode_for_file_access(),
4740                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4741                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
4742
4743         ereport(LOG,
4744                         (errmsg("archive recovery complete")));
4745 }
4746
4747 /*
4748  * For point-in-time recovery, this function decides whether we want to
4749  * stop applying the XLOG at or after the current record.
4750  *
4751  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
4752  * *includeThis is set TRUE if we should apply this record before stopping.
4753  *
4754  * We also track the timestamp of the latest applied COMMIT/ABORT record
4755  * in recoveryLastXTime, for logging purposes.
4756  * Also, some information is saved in recoveryStopXid et al for use in
4757  * annotating the new timeline's history file.
4758  */
4759 static bool
4760 recoveryStopsHere(XLogRecord *record, bool *includeThis)
4761 {
4762         bool            stopsHere;
4763         uint8           record_info;
4764         TimestampTz recordXtime;
4765
4766         /* We only consider stopping at COMMIT or ABORT records */
4767         if (record->xl_rmid != RM_XACT_ID)
4768                 return false;
4769         record_info = record->xl_info & ~XLR_INFO_MASK;
4770         if (record_info == XLOG_XACT_COMMIT)
4771         {
4772                 xl_xact_commit *recordXactCommitData;
4773
4774                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
4775                 recordXtime = recordXactCommitData->xact_time;
4776         }
4777         else if (record_info == XLOG_XACT_ABORT)
4778         {
4779                 xl_xact_abort *recordXactAbortData;
4780
4781                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
4782                 recordXtime = recordXactAbortData->xact_time;
4783         }
4784         else
4785                 return false;
4786
4787         /* Do we have a PITR target at all? */
4788         if (!recoveryTarget)
4789         {
4790                 recoveryLastXTime = recordXtime;
4791                 return false;
4792         }
4793
4794         if (recoveryTargetExact)
4795         {
4796                 /*
4797                  * there can be only one transaction end record with this exact
4798                  * transactionid
4799                  *
4800                  * when testing for an xid, we MUST test for equality only, since
4801                  * transactions are numbered in the order they start, not the order
4802                  * they complete. A higher numbered xid will complete before you about
4803                  * 50% of the time...
4804                  */
4805                 stopsHere = (record->xl_xid == recoveryTargetXid);
4806                 if (stopsHere)
4807                         *includeThis = recoveryTargetInclusive;
4808         }
4809         else
4810         {
4811                 /*
4812                  * there can be many transactions that share the same commit time, so
4813                  * we stop after the last one, if we are inclusive, or stop at the
4814                  * first one if we are exclusive
4815                  */
4816                 if (recoveryTargetInclusive)
4817                         stopsHere = (recordXtime > recoveryTargetTime);
4818                 else
4819                         stopsHere = (recordXtime >= recoveryTargetTime);
4820                 if (stopsHere)
4821                         *includeThis = false;
4822         }
4823
4824         if (stopsHere)
4825         {
4826                 recoveryStopXid = record->xl_xid;
4827                 recoveryStopTime = recordXtime;
4828                 recoveryStopAfter = *includeThis;
4829
4830                 if (record_info == XLOG_XACT_COMMIT)
4831                 {
4832                         if (recoveryStopAfter)
4833                                 ereport(LOG,
4834                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
4835                                                                 recoveryStopXid,
4836                                                                 timestamptz_to_str(recoveryStopTime))));
4837                         else
4838                                 ereport(LOG,
4839                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
4840                                                                 recoveryStopXid,
4841                                                                 timestamptz_to_str(recoveryStopTime))));
4842                 }
4843                 else
4844                 {
4845                         if (recoveryStopAfter)
4846                                 ereport(LOG,
4847                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
4848                                                                 recoveryStopXid,
4849                                                                 timestamptz_to_str(recoveryStopTime))));
4850                         else
4851                                 ereport(LOG,
4852                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
4853                                                                 recoveryStopXid,
4854                                                                 timestamptz_to_str(recoveryStopTime))));
4855                 }
4856
4857                 if (recoveryStopAfter)
4858                         recoveryLastXTime = recordXtime;
4859         }
4860         else
4861                 recoveryLastXTime = recordXtime;
4862
4863         return stopsHere;
4864 }
4865
4866 /*
4867  * This must be called ONCE during postmaster or standalone-backend startup
      *
      * Outline of the work done here, in order:
      *   1. Read pg_control, reject an invalid state, and log how the previous
      *      run ended.
      *   2. Read the recovery command file (if any) and the timeline history,
      *      and check that pg_control's timeline is among the expected ones.
      *   3. Pick the starting checkpoint: the one named by backup_label when
      *      that file is present, otherwise pg_control's latest checkpoint,
      *      falling back to the previous one.
      *   4. Seed the next-XID, next-OID and MultiXact counters from that
      *      checkpoint.
      *   5. If recovery is needed, replay WAL starting at the checkpoint's
      *      redo pointer (or just after the checkpoint record when the two
      *      coincide) until WAL runs out or recoveryStopsHere() says to stop.
      *   6. Re-read the last valid record to fix the end of WAL, assign a new
      *      timeline after archive recovery, and prime the WAL insert state.
      *   7. After a recovery, run resource-manager cleanup and write a
      *      shutdown checkpoint.
      *   8. Mark the system DB_IN_PRODUCTION, start up CLOG, SUBTRANS and
      *      MultiXact, reload prepared transactions, and release the WAL
      *      read buffers.
      *
      * Unrecoverable conditions are reported with ereport(FATAL) or
      * ereport(PANIC); nothing is returned to the caller.
4868  */
4869 void
4870 StartupXLOG(void)
4871 {
4872         XLogCtlInsert *Insert;
4873         CheckPoint      checkPoint;
4874         bool            wasShutdown;
4875         bool            reachedStopPoint = false;
4876         bool            haveBackupLabel = false;
4877         XLogRecPtr      RecPtr,
4878                                 LastRec,
4879                                 checkPointLoc,
4880                                 minRecoveryLoc,
4881                                 EndOfLog;
4882         uint32          endLogId;
4883         uint32          endLogSeg;
4884         XLogRecord *record;
4885         uint32          freespace;
4886         TransactionId oldestActiveXID;
4887
4888         /*
4889          * Read control file and check XLOG status looks valid.
4890          *
4891          * Note: in most control paths, *ControlFile is already valid and we need
4892          * not do ReadControlFile() here, but might as well do it to be sure.
4893          */
4894         ReadControlFile();
4895
             /*
              * Refuse to start if the recorded state lies outside the range
              * DB_SHUTDOWNED .. DB_IN_PRODUCTION, or if the checkpoint record
              * offset stored in pg_control is not valid.
              */
4896         if (ControlFile->state < DB_SHUTDOWNED ||
4897                 ControlFile->state > DB_IN_PRODUCTION ||
4898                 !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
4899                 ereport(FATAL,
4900                                 (errmsg("control file contains invalid data")));
4901
             /* Log how the previous run ended, per the state saved in pg_control */
4902         if (ControlFile->state == DB_SHUTDOWNED)
4903                 ereport(LOG,
4904                                 (errmsg("database system was shut down at %s",
4905                                                 str_time(ControlFile->time))));
4906         else if (ControlFile->state == DB_SHUTDOWNING)
4907                 ereport(LOG,
4908                                 (errmsg("database system shutdown was interrupted; last known up at %s",
4909                                                 str_time(ControlFile->time))));
4910         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
4911                 ereport(LOG,
4912                    (errmsg("database system was interrupted while in recovery at %s",
4913                                    str_time(ControlFile->time)),
4914                         errhint("This probably means that some data is corrupted and"
4915                                         " you will have to use the last backup for recovery.")));
4916         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
4917                 ereport(LOG,
4918                                 (errmsg("database system was interrupted while in recovery at log time %s",
4919                                                 str_time(ControlFile->checkPointCopy.time)),
4920                                  errhint("If this has occurred more than once some data might be corrupted"
4921                           " and you might need to choose an earlier recovery target.")));
4922         else if (ControlFile->state == DB_IN_PRODUCTION)
4923                 ereport(LOG,
4924                           (errmsg("database system was interrupted; last known up at %s",
4925                                           str_time(ControlFile->time))));
4926
4927         /* This is just to allow attaching to startup process with a debugger */
4928 #ifdef XLOG_REPLAY_DELAY
4929         if (ControlFile->state != DB_SHUTDOWNED)
4930                 pg_usleep(60000000L);
4931 #endif
4932
4933         /*
4934          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
4935          * someone has performed a copy for PITR, these directories may have
4936          * been excluded and need to be re-created.
4937          */
4938         ValidateXLOGDirectoryStructure();
4939
4940         /*
4941          * Initialize on the assumption we want to recover to the same timeline
4942          * that's active according to pg_control.
4943          */
4944         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
4945
4946         /*
4947          * Check for recovery control file, and if so set up state for offline
4948          * recovery
4949          */
4950         readRecoveryCommandFile();
4951
4952         /* Now we can determine the list of expected TLIs */
4953         expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
4954
4955         /*
4956          * If pg_control's timeline is not in expectedTLIs, then we cannot
4957          * proceed: the backup is not part of the history of the requested
4958          * timeline.
4959          */
4960         if (!list_member_int(expectedTLIs,
4961                                                  (int) ControlFile->checkPointCopy.ThisTimeLineID))
4962                 ereport(FATAL,
4963                                 (errmsg("requested timeline %u is not a child of database system timeline %u",
4964                                                 recoveryTargetTLI,
4965                                                 ControlFile->checkPointCopy.ThisTimeLineID)));
4966
             /*
              * NOTE(review): minRecoveryLoc is examined further down even when
              * this call returns false, so read_backup_label() presumably
              * initializes it on every path --- confirm against its definition.
              */
4967         if (read_backup_label(&checkPointLoc, &minRecoveryLoc))
4968         {
4969                 /*
4970                  * When a backup_label file is present, we want to roll forward from
4971                  * the checkpoint it identifies, rather than using pg_control.
                      *
                      * The second argument to ReadCheckpointRecord() (0 here) only
                      * identifies the checkpoint for reporting purposes.
4972                  */
4973                 record = ReadCheckpointRecord(checkPointLoc, 0);
4974                 if (record != NULL)
4975                 {
4976                         ereport(DEBUG1,
4977                                         (errmsg("checkpoint record is at %X/%X",
4978                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
4979                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
4980                 }
4981                 else
4982                 {
4983                         ereport(PANIC,
4984                                         (errmsg("could not locate required checkpoint record"),
4985                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
4986                 }
4987                 /* set flag to delete it later */
4988                 haveBackupLabel = true;
4989         }
4990         else
4991         {
4992                 /*
4993                  * Get the last valid checkpoint record.  If the latest one according
4994                  * to pg_control is broken, try the next-to-last one.
                      *
                      * ReadCheckpointRecord() is told which one it is reading (1 =
                      * primary, 2 = secondary) purely so it can word its messages.
                      * Falling back to the secondary checkpoint forces recovery.
4995                  */
4996                 checkPointLoc = ControlFile->checkPoint;
4997                 record = ReadCheckpointRecord(checkPointLoc, 1);
4998                 if (record != NULL)
4999                 {
5000                         ereport(DEBUG1,
5001                                         (errmsg("checkpoint record is at %X/%X",
5002                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5003                 }
5004                 else
5005                 {
5006                         checkPointLoc = ControlFile->prevCheckPoint;
5007                         record = ReadCheckpointRecord(checkPointLoc, 2);
5008                         if (record != NULL)
5009                         {
5010                                 ereport(LOG,
5011                                                 (errmsg("using previous checkpoint record at %X/%X",
5012                                                           checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5013                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
5014                         }
5015                         else
5016                                 ereport(PANIC,
5017                                          (errmsg("could not locate a valid checkpoint record")));
5018                 }
5019         }
5020
             /*
              * Start both position trackers at the chosen checkpoint record, take
              * a local copy of the checkpoint data carried by the record, and note
              * whether it was a shutdown checkpoint.
              */
5021         LastRec = RecPtr = checkPointLoc;
5022         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5023         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5024
5025         ereport(DEBUG1,
5026                         (errmsg("redo record is at %X/%X; shutdown %s",
5027                                         checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
5028                                         wasShutdown ? "TRUE" : "FALSE")));
5029         ereport(DEBUG1,
5030                         (errmsg("next transaction ID: %u/%u; next OID: %u",
5031                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
5032                                         checkPoint.nextOid)));
5033         ereport(DEBUG1,
5034                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
5035                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5036         if (!TransactionIdIsNormal(checkPoint.nextXid))
5037                 ereport(PANIC,
5038                                 (errmsg("invalid next transaction ID")));
5039
             /* Seed the shared XID/OID counters and MultiXact state from the checkpoint */
5040         ShmemVariableCache->nextXid = checkPoint.nextXid;
5041         ShmemVariableCache->nextOid = checkPoint.nextOid;
5042         ShmemVariableCache->oidCount = 0;
5043         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5044
5045         /*
5046          * We must replay WAL entries using the same TimeLineID they were created
5047          * under, so temporarily adopt the TLI indicated by the checkpoint (see
5048          * also xlog_redo()).
5049          */
5050         ThisTimeLineID = checkPoint.ThisTimeLineID;
5051
5052         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5053
             /*
              * A redo pointer that sorts after the checkpoint record's own
              * location is treated as corruption.
              */
5054         if (XLByteLT(RecPtr, checkPoint.redo))
5055                 ereport(PANIC,
5056                                 (errmsg("invalid redo in checkpoint record")));
5057
5058         /*
5059          * Check whether we need to force recovery from WAL.  If it appears to
5060          * have been a clean shutdown and we did not have a recovery.conf file,
5061          * then assume no recovery needed.
              *
              * Note that InRecovery may already have been set above (backup_label
              * present, or fallback to the previous checkpoint); the tests here
              * only ever turn it on.
5062          */
5063         if (XLByteLT(checkPoint.redo, RecPtr))
5064         {
5065                 if (wasShutdown)
5066                         ereport(PANIC,
5067                                         (errmsg("invalid redo record in shutdown checkpoint")));
5068                 InRecovery = true;
5069         }
5070         else if (ControlFile->state != DB_SHUTDOWNED)
5071                 InRecovery = true;
5072         else if (InArchiveRecovery)
5073         {
5074                 /* force recovery due to presence of recovery.conf */
5075                 InRecovery = true;
5076         }
5077
5078         /* REDO */
5079         if (InRecovery)
5080         {
5081                 int                     rmid;
5082
5083                 /*
5084                  * Update pg_control to show that we are recovering and to show the
5085                  * selected checkpoint as the place we are starting from. We also mark
5086                  * pg_control with any minimum recovery stop point obtained from a
5087                  * backup history file.
5088                  */
5089                 if (InArchiveRecovery)
5090                 {
5091                         ereport(LOG,
5092                                         (errmsg("automatic recovery in progress")));
5093                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
5094                 }
5095                 else
5096                 {
5097                         ereport(LOG,
5098                                         (errmsg("database system was not properly shut down; "
5099                                                         "automatic recovery in progress")));
5100                         ControlFile->state = DB_IN_CRASH_RECOVERY;
5101                 }
                     /*
                      * The checkpoint pg_control used to call latest becomes the
                      * previous one; the checkpoint selected above becomes the latest.
                      * minRecoveryPoint is overwritten only when a nonzero location
                      * came back through read_backup_label().
                      */
5102                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
5103                 ControlFile->checkPoint = checkPointLoc;
5104                 ControlFile->checkPointCopy = checkPoint;
5105                 if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
5106                         ControlFile->minRecoveryPoint = minRecoveryLoc;
5107                 ControlFile->time = (pg_time_t) time(NULL);
5108                 UpdateControlFile();
5109
5110                 /*
5111                  * If there was a backup label file, it's done its job and the info
5112                  * has now been propagated into pg_control.  We must get rid of the
5113                  * label file so that if we crash during recovery, we'll pick up at
5114                  * the latest recovery restartpoint instead of going all the way back
5115                  * to the backup start point.  It seems prudent though to just rename
5116                  * the file out of the way rather than delete it completely.
                      *
                      * The result of the unlink() of any older renamed copy is not
                      * checked; a failure of the rename() itself is FATAL.
5117                  */
5118                 if (haveBackupLabel)
5119                 {
5120                         unlink(BACKUP_LABEL_OLD);
5121                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
5122                                 ereport(FATAL,
5123                                                 (errcode_for_file_access(),
5124                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5125                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
5126                 }
5127
5128                 /* Initialize resource managers */
5129                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5130                 {
5131                         if (RmgrTable[rmid].rm_startup != NULL)
5132                                 RmgrTable[rmid].rm_startup();
5133                 }
5134
5135                 /*
5136                  * Find the first record that logically follows the checkpoint --- it
5137                  * might physically precede it, though.
                      *
                      * A failure to read the record at the redo pointer is a PANIC,
                      * whereas finding nothing after the checkpoint record is only
                      * logged and leads to the "redo is not required" branch below.
5138                  */
5139                 if (XLByteLT(checkPoint.redo, RecPtr))
5140                 {
5141                         /* back up to find the record */
5142                         record = ReadRecord(&(checkPoint.redo), PANIC);
5143                 }
5144                 else
5145                 {
5146                         /* just have to read next record after CheckPoint */
5147                         record = ReadRecord(NULL, LOG);
5148                 }
5149
5150                 if (record != NULL)
5151                 {
5152                         bool            recoveryContinue = true;
5153                         bool            recoveryApply = true;
5154                         ErrorContextCallback errcontext;
5155
5156                         InRedo = true;
5157                         ereport(LOG,
5158                                         (errmsg("redo starts at %X/%X",
5159                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5160
5161                         /*
5162                          * main redo apply loop
5163                          */
5164                         do
5165                         {
5166 #ifdef WAL_DEBUG
5167                                 if (XLOG_DEBUG)
5168                                 {
5169                                         StringInfoData buf;
5170
5171                                         initStringInfo(&buf);
5172                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
5173                                                                          ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
5174                                                                          EndRecPtr.xlogid, EndRecPtr.xrecoff);
5175                                         xlog_outrec(&buf, record);
5176                                         appendStringInfo(&buf, " - ");
5177                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
5178                                                                                                            record->xl_info,
5179                                                                                                          XLogRecGetData(record));
5180                                         elog(LOG, "%s", buf.data);
5181                                         pfree(buf.data);
5182                                 }
5183 #endif
5184
5185                                 /*
5186                                  * Have we reached our recovery target?
                                      *
                                      * If so, this is the last iteration.  When the
                                      * target excludes this record we leave the loop
                                      * right away without replaying it; otherwise we
                                      * replay it and the loop condition ends the loop.
5187                                  */
5188                                 if (recoveryStopsHere(record, &recoveryApply))
5189                                 {
5190                                         reachedStopPoint = true;        /* see below */
5191                                         recoveryContinue = false;
5192                                         if (!recoveryApply)
5193                                                 break;
5194                                 }
5195
5196                                 /* Setup error traceback support for ereport() */
5197                                 errcontext.callback = rm_redo_error_callback;
5198                                 errcontext.arg = (void *) record;
5199                                 errcontext.previous = error_context_stack;
5200                                 error_context_stack = &errcontext;
5201
5202                                 /* nextXid must be beyond record's xid */
5203                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
5204                                                                                                  ShmemVariableCache->nextXid))
5205                                 {
5206                                         ShmemVariableCache->nextXid = record->xl_xid;
5207                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
5208                                 }
5209
                                     /* Hand the record to the redo routine of its resource manager */
5210                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
5211
5212                                 /* Pop the error context stack */
5213                                 error_context_stack = errcontext.previous;
5214
                                     /*
                                      * Remember the position of the record just replayed;
                                      * it is re-read via LastRec once the loop is over.
                                      */
5215                                 LastRec = ReadRecPtr;
5216
5217                                 record = ReadRecord(NULL, LOG);
5218                         } while (record != NULL && recoveryContinue);
5219
5220                         /*
5221                          * end of main redo apply loop
5222                          */
5223
5224                         ereport(LOG,
5225                                         (errmsg("redo done at %X/%X",
5226                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5227                         if (recoveryLastXTime)
5228                                 ereport(LOG,
5229                                          (errmsg("last completed transaction was at log time %s",
5230                                                          timestamptz_to_str(recoveryLastXTime))));
5231                         InRedo = false;
5232                 }
5233                 else
5234                 {
5235                         /* there are no WAL records following the checkpoint */
5236                         ereport(LOG,
5237                                         (errmsg("redo is not required")));
5238                 }
5239         }
5240
5241         /*
5242          * Re-fetch the last valid or last applied record, so we can identify the
5243          * exact endpoint of what we consider the valid portion of WAL.
              *
              * When no record was replayed, LastRec still points at the checkpoint
              * record chosen above.
5244          */
5245         record = ReadRecord(&LastRec, PANIC);
5246         EndOfLog = EndRecPtr;
5247         XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);
5248
5249         /*
5250          * Complain if we did not roll forward far enough to render the backup
5251          * dump consistent.
5252          */
5253         if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
5254         {
5255                 if (reachedStopPoint)   /* stopped because of stop request */
5256                         ereport(FATAL,
5257                                         (errmsg("requested recovery stop point is before end time of backup dump")));
5258                 else    /* ran off end of WAL */
5259                         ereport(FATAL,
5260                                         (errmsg("WAL ends before end time of backup dump")));
5261         }
5262
5263         /*
5264          * Consider whether we need to assign a new timeline ID.
5265          *
5266          * If we are doing an archive recovery, we always assign a new ID.      This
5267          * handles a couple of issues.  If we stopped short of the end of WAL
5268          * during recovery, then we are clearly generating a new timeline and must
5269          * assign it a unique new ID.  Even if we ran to the end, modifying the
5270          * current last segment is problematic because it may result in trying to
5271          * overwrite an already-archived copy of that segment, and we encourage
5272          * DBAs to make their archive_commands reject that.  We can dodge the
5273          * problem by making the new active segment have a new timeline ID.
5274          *
5275          * In a normal crash recovery, we can just extend the timeline we were in.
5276          */
5277         if (InArchiveRecovery)
5278         {
5279                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
5280                 ereport(LOG,
5281                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
5282                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
5283                                                          curFileTLI, endLogId, endLogSeg);
5284         }
5285
5286         /* Save the selected TimeLineID in shared memory, too */
5287         XLogCtl->ThisTimeLineID = ThisTimeLineID;
5288
5289         /*
5290          * We are now done reading the old WAL.  Turn off archive fetching if it
5291          * was active, and make a writable copy of the last WAL segment. (Note
5292          * that we also have a copy of the last block of the old WAL in readBuf;
5293          * we will use that below.)
5294          */
5295         if (InArchiveRecovery)
5296                 exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
5297
5298         /*
5299          * Prepare to write WAL starting at EndOfLog position, and init xlog
5300          * buffer cache using the block containing the last record from the
5301          * previous incarnation.
              *
              * xlblocks[0].xrecoff is set to EndOfLog.xrecoff rounded up to a
              * multiple of XLOG_BLCKSZ (left as is when it already is a nonzero
              * multiple).
5302          */
5303         openLogId = endLogId;
5304         openLogSeg = endLogSeg;
5305         openLogFile = XLogFileOpen(openLogId, openLogSeg);
5306         openLogOff = 0;
5307         Insert = &XLogCtl->Insert;
5308         Insert->PrevRecord = LastRec;
5309         XLogCtl->xlblocks[0].xlogid = openLogId;
5310         XLogCtl->xlblocks[0].xrecoff =
5311                 ((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
5312
5313         /*
5314          * Tricky point here: readBuf contains the *last* block that the LastRec
5315          * record spans, not the one it starts in.      The last block is indeed the
5316          * one we want to use.
              *
              * After copying that block into the current insert page, currpos is
              * placed at EndOfLog's offset within the block; that offset equals
              * XLOG_BLCKSZ when EndOfLog falls exactly on the block boundary.
5317          */
5318         Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
5319         memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
5320         Insert->currpos = (char *) Insert->currpage +
5321                 (EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
5322
             /* Everything up to EndOfLog counts as already written and flushed */
5323         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
5324
5325         XLogCtl->Write.LogwrtResult = LogwrtResult;
5326         Insert->LogwrtResult = LogwrtResult;
5327         XLogCtl->LogwrtResult = LogwrtResult;
5328
5329         XLogCtl->LogwrtRqst.Write = EndOfLog;
5330         XLogCtl->LogwrtRqst.Flush = EndOfLog;
5331
5332         freespace = INSERT_FREESPACE(Insert);
5333         if (freespace > 0)
5334         {
5335                 /* Make sure rest of page is zero */
5336                 MemSet(Insert->currpos, 0, freespace);
5337                 XLogCtl->Write.curridx = 0;
5338         }
5339         else
5340         {
5341                 /*
5342                  * Whenever Write.LogwrtResult points to exactly the end of a page,
5343                  * Write.curridx must point to the *next* page (see XLogWrite()).
5344                  *
5345                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
5346                  * this is sufficient.  The first actual attempt to insert a log
5347                  * record will advance the insert state.
5348                  */
5349                 XLogCtl->Write.curridx = NextBufIdx(0);
5350         }
5351
5352         /* Pre-scan prepared transactions to find out the range of XIDs present */
5353         oldestActiveXID = PrescanPreparedTransactions();
5354
5355         if (InRecovery)
5356         {
5357                 int                     rmid;
5358
5359                 /*
5360                  * Allow resource managers to do any required cleanup.
5361                  */
5362                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5363                 {
5364                         if (RmgrTable[rmid].rm_cleanup != NULL)
5365                                 RmgrTable[rmid].rm_cleanup();
5366                 }
5367
5368                 /*
5369                  * Check to see if the XLOG sequence contained any unresolved
5370                  * references to uninitialized pages.
5371                  */
5372                 XLogCheckInvalidPages();
5373
5374                 /*
5375                  * Reset pgstat data, because it may be invalid after recovery.
5376                  */
5377                 pgstat_reset_all();
5378
5379                 /*
5380                  * Perform a checkpoint to update all our recovery activity to disk.
5381                  *
5382                  * Note that we write a shutdown checkpoint rather than an on-line
5383                  * one. This is not particularly critical, but since we may be
5384                  * assigning a new TLI, using a shutdown checkpoint allows us to have
5385                  * the rule that TLI only changes in shutdown checkpoints, which
5386                  * allows some extra error checking in xlog_redo.
5387                  */
5388                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
5389         }
5390
5391         /*
5392          * Preallocate additional log files, if wanted.
5393          */
5394         PreallocXlogFiles(EndOfLog);
5395
5396         /*
5397          * Okay, we're officially UP.
5398          */
5399         InRecovery = false;
5400
5401         ControlFile->state = DB_IN_PRODUCTION;
5402         ControlFile->time = (pg_time_t) time(NULL);
5403         UpdateControlFile();
5404
5405         /* start the archive_timeout timer running */
5406         XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
5407
5408         /* initialize shared-memory copy of latest checkpoint XID/epoch */
5409         XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
5410         XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
5411
5412         /* also initialize latestCompletedXid, to nextXid - 1 */
5413         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
5414         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
5415
5416         /* Start up the commit log and related stuff, too */
5417         StartupCLOG();
5418         StartupSUBTRANS(oldestActiveXID);
5419         StartupMultiXact();
5420
5421         /* Reload shared-memory state for prepared transactions */
5422         RecoverPreparedTransactions();
5423
5424         /* Shut down readFile facility, free space */
5425         if (readFile >= 0)
5426         {
5427                 close(readFile);
5428                 readFile = -1;
5429         }
5430         if (readBuf)
5431         {
5432                 free(readBuf);
5433                 readBuf = NULL;
5434         }
5435         if (readRecordBuf)
5436         {
5437                 free(readRecordBuf);
5438                 readRecordBuf = NULL;
5439                 readRecordBufSize = 0;
5440         }
5441 }
5442
5443 /*
5444  * Subroutine to try to fetch and validate a prior checkpoint record.
5445  *
5446  * whichChkpt identifies the checkpoint (merely for reporting purposes).
5447  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
5448  */
5449 static XLogRecord *
5450 ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
5451 {
5452         XLogRecord *record;
5453
5454         if (!XRecOffIsValid(RecPtr.xrecoff))
5455         {
5456                 switch (whichChkpt)
5457                 {
5458                         case 1:
5459                                 ereport(LOG,
5460                                 (errmsg("invalid primary checkpoint link in control file")));
5461                                 break;
5462                         case 2:
5463                                 ereport(LOG,
5464                                                 (errmsg("invalid secondary checkpoint link in control file")));
5465                                 break;
5466                         default:
5467                                 ereport(LOG,
5468                                    (errmsg("invalid checkpoint link in backup_label file")));
5469                                 break;
5470                 }
5471                 return NULL;
5472         }
5473
5474         record = ReadRecord(&RecPtr, LOG);
5475
5476         if (record == NULL)
5477         {
5478                 switch (whichChkpt)
5479                 {
5480                         case 1:
5481                                 ereport(LOG,
5482                                                 (errmsg("invalid primary checkpoint record")));
5483                                 break;
5484                         case 2:
5485                                 ereport(LOG,
5486                                                 (errmsg("invalid secondary checkpoint record")));
5487                                 break;
5488                         default:
5489                                 ereport(LOG,
5490                                                 (errmsg("invalid checkpoint record")));
5491                                 break;
5492                 }
5493                 return NULL;
5494         }
5495         if (record->xl_rmid != RM_XLOG_ID)
5496         {
5497                 switch (whichChkpt)
5498                 {
5499                         case 1:
5500                                 ereport(LOG,
5501                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
5502                                 break;
5503                         case 2:
5504                                 ereport(LOG,
5505                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
5506                                 break;
5507                         default:
5508                                 ereport(LOG,
5509                                 (errmsg("invalid resource manager ID in checkpoint record")));
5510                                 break;
5511                 }
5512                 return NULL;
5513         }
5514         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
5515                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
5516         {
5517                 switch (whichChkpt)
5518                 {
5519                         case 1:
5520                                 ereport(LOG,
5521                                    (errmsg("invalid xl_info in primary checkpoint record")));
5522                                 break;
5523                         case 2:
5524                                 ereport(LOG,
5525                                  (errmsg("invalid xl_info in secondary checkpoint record")));
5526                                 break;
5527                         default:
5528                                 ereport(LOG,
5529                                                 (errmsg("invalid xl_info in checkpoint record")));
5530                                 break;
5531                 }
5532                 return NULL;
5533         }
5534         if (record->xl_len != sizeof(CheckPoint) ||
5535                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
5536         {
5537                 switch (whichChkpt)
5538                 {
5539                         case 1:
5540                                 ereport(LOG,
5541                                         (errmsg("invalid length of primary checkpoint record")));
5542                                 break;
5543                         case 2:
5544                                 ereport(LOG,
5545                                   (errmsg("invalid length of secondary checkpoint record")));
5546                                 break;
5547                         default:
5548                                 ereport(LOG,
5549                                                 (errmsg("invalid length of checkpoint record")));
5550                                 break;
5551                 }
5552                 return NULL;
5553         }
5554         return record;
5555 }
5556
5557 /*
5558  * This must be called during startup of a backend process, except that
5559  * it need not be called in a standalone backend (which does StartupXLOG
5560  * instead).  We need to initialize the local copies of ThisTimeLineID and
5561  * RedoRecPtr.
5562  *
5563  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
5564  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
5565  * unnecessary however, since the postmaster itself never touches XLOG anyway.
5566  */
5567 void
5568 InitXLOGAccess(void)
5569 {
5570         /* ThisTimeLineID doesn't change so we need no lock to copy it */
5571         ThisTimeLineID = XLogCtl->ThisTimeLineID;
5572         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
5573         (void) GetRedoRecPtr();
5574 }
5575
5576 /*
5577  * Once spawned, a backend may update its local RedoRecPtr from
5578  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
5579  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
5580  */
5581 XLogRecPtr
5582 GetRedoRecPtr(void)
5583 {
5584         /* use volatile pointer to prevent code rearrangement */
5585         volatile XLogCtlData *xlogctl = XLogCtl;
5586
5587         SpinLockAcquire(&xlogctl->info_lck);
5588         Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
5589         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
5590         SpinLockRelease(&xlogctl->info_lck);
5591
5592         return RedoRecPtr;
5593 }
5594
5595 /*
5596  * GetInsertRecPtr -- Returns the current insert position.
5597  *
5598  * NOTE: The value *actually* returned is the position of the last full
5599  * xlog page. It lags behind the real insert position by at most 1 page.
5600  * For that, we don't need to acquire WALInsertLock which can be quite
5601  * heavily contended, and an approximation is enough for the current
5602  * usage of this function.
5603  */
5604 XLogRecPtr
5605 GetInsertRecPtr(void)
5606 {
5607         /* use volatile pointer to prevent code rearrangement */
5608         volatile XLogCtlData *xlogctl = XLogCtl;
5609         XLogRecPtr      recptr;
5610
5611         SpinLockAcquire(&xlogctl->info_lck);
5612         recptr = xlogctl->LogwrtRqst.Write;
5613         SpinLockRelease(&xlogctl->info_lck);
5614
5615         return recptr;
5616 }
5617
5618 /*
5619  * Get the time of the last xlog segment switch
5620  */
5621 pg_time_t
5622 GetLastSegSwitchTime(void)
5623 {
5624         pg_time_t       result;
5625
5626         /* Need WALWriteLock, but shared lock is sufficient */
5627         LWLockAcquire(WALWriteLock, LW_SHARED);
5628         result = XLogCtl->Write.lastSegSwitchTime;
5629         LWLockRelease(WALWriteLock);
5630
5631         return result;
5632 }
5633
5634 /*
5635  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
5636  *
5637  * This is exported for use by code that would like to have 64-bit XIDs.
5638  * We don't really support such things, but all XIDs within the system
5639  * can be presumed "close to" the result, and thus the epoch associated
5640  * with them can be determined.
5641  */
5642 void
5643 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
5644 {
5645         uint32          ckptXidEpoch;
5646         TransactionId ckptXid;
5647         TransactionId nextXid;
5648
5649         /* Must read checkpoint info first, else have race condition */
5650         {
5651                 /* use volatile pointer to prevent code rearrangement */
5652                 volatile XLogCtlData *xlogctl = XLogCtl;
5653
5654                 SpinLockAcquire(&xlogctl->info_lck);
5655                 ckptXidEpoch = xlogctl->ckptXidEpoch;
5656                 ckptXid = xlogctl->ckptXid;
5657                 SpinLockRelease(&xlogctl->info_lck);
5658         }
5659
5660         /* Now fetch current nextXid */
5661         nextXid = ReadNewTransactionId();
5662
5663         /*
5664          * nextXid is certainly logically later than ckptXid.  So if it's
5665          * numerically less, it must have wrapped into the next epoch.
5666          */
5667         if (nextXid < ckptXid)
5668                 ckptXidEpoch++;
5669
5670         *xid = nextXid;
5671         *epoch = ckptXidEpoch;
5672 }
5673
5674 /*
5675  * This must be called ONCE during postmaster or standalone-backend shutdown
5676  */
5677 void
5678 ShutdownXLOG(int code, Datum arg)
5679 {
5680         ereport(LOG,
5681                         (errmsg("shutting down")));
5682
5683         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
5684         ShutdownCLOG();
5685         ShutdownSUBTRANS();
5686         ShutdownMultiXact();
5687
5688         ereport(LOG,
5689                         (errmsg("database system is shut down")));
5690 }
5691
5692 /*
5693  * Log start of a checkpoint.
5694  */
5695 static void
5696 LogCheckpointStart(int flags)
5697 {
5698         elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
5699                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
5700                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
5701                  (flags & CHECKPOINT_FORCE) ? " force" : "",
5702                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
5703                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
5704                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
5705 }
5706
5707 /*
5708  * Log end of a checkpoint.
5709  */
5710 static void
5711 LogCheckpointEnd(void)
5712 {
5713         long            write_secs,
5714                                 sync_secs,
5715                                 total_secs;
5716         int                     write_usecs,
5717                                 sync_usecs,
5718                                 total_usecs;
5719
5720         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
5721
5722         TimestampDifference(CheckpointStats.ckpt_start_t,
5723                                                 CheckpointStats.ckpt_end_t,
5724                                                 &total_secs, &total_usecs);
5725
5726         TimestampDifference(CheckpointStats.ckpt_write_t,
5727                                                 CheckpointStats.ckpt_sync_t,
5728                                                 &write_secs, &write_usecs);
5729
5730         TimestampDifference(CheckpointStats.ckpt_sync_t,
5731                                                 CheckpointStats.ckpt_sync_end_t,
5732                                                 &sync_secs, &sync_usecs);
5733
5734         elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
5735                  "%d transaction log file(s) added, %d removed, %d recycled; "
5736                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
5737                  CheckpointStats.ckpt_bufs_written,
5738                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
5739                  CheckpointStats.ckpt_segs_added,
5740                  CheckpointStats.ckpt_segs_removed,
5741                  CheckpointStats.ckpt_segs_recycled,
5742                  write_secs, write_usecs / 1000,
5743                  sync_secs, sync_usecs / 1000,
5744                  total_secs, total_usecs / 1000);
5745 }
5746
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 *
 * flags is a bitwise OR of the following:
 *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
 *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
 *              ignoring checkpoint_completion_target parameter.
 *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
 *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
 *
 * Note: flags contains other bits, of interest here only for logging purposes.
 * In particular note that this routine is synchronous and does not pay
 * attention to CHECKPOINT_WAIT.
 *
 * Outline of the steps below:
 *      1. Take CheckpointLock and reset CheckpointStats.
 *      2. Under WALInsertLock, decide whether the checkpoint can be skipped;
 *         if not, compute the REDO pointer and publish it as the shared
 *         RedoRecPtr.
 *      3. Wait for transactions that are inside their commit critical section.
 *      4. Collect nextXid/epoch, nextOid and multixact state for the record.
 *      5. Flush everything via CheckPointGuts (outside the critical section).
 *      6. Insert and flush the checkpoint record, then update pg_control and
 *         the shared copy of the checkpoint XID/epoch.
 *      7. Post-checkpoint cleanup: smgr, old log segments, preallocation,
 *         pg_subtrans truncation.
 *
 * When the skip test in step 2 succeeds the function returns early without
 * logging anything and without writing a checkpoint record.
 */
void
CreateCheckPoint(int flags)
{
        bool            shutdown = (flags & CHECKPOINT_IS_SHUTDOWN) != 0;
        CheckPoint      checkPoint;
        XLogRecPtr      recptr;
        XLogCtlInsert *Insert = &XLogCtl->Insert;
        XLogRecData rdata;
        uint32          freespace;
        uint32          _logId;
        uint32          _logSeg;
        TransactionId *inCommitXids;
        int                     nInCommit;

        /*
         * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
         * (This is just pro forma, since in the present system structure there is
         * only one process that is allowed to issue checkpoints at any given
         * time.)
         */
        LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

        /*
         * Prepare to accumulate statistics.
         *
         * Note: because it is possible for log_checkpoints to change while a
         * checkpoint proceeds, we always accumulate stats, even if
         * log_checkpoints is currently off.
         */
        MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
        CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

        /*
         * Use a critical section to force system panic if we have trouble.
         */
        START_CRIT_SECTION();

        /* For a shutdown checkpoint, record in pg_control that shutdown began */
        if (shutdown)
        {
                ControlFile->state = DB_SHUTDOWNING;
                ControlFile->time = (pg_time_t) time(NULL);
                UpdateControlFile();
        }

        /*
         * Let smgr prepare for checkpoint; this has to happen before we determine
         * the REDO pointer.  Note that smgr must not do anything that'd have to
         * be undone if we decide no checkpoint is needed.
         */
        smgrpreckpt();

        /* Begin filling in the checkpoint WAL record */
        MemSet(&checkPoint, 0, sizeof(checkPoint));
        checkPoint.ThisTimeLineID = ThisTimeLineID;
        checkPoint.time = (pg_time_t) time(NULL);

        /*
         * We must hold WALInsertLock while examining insert state to determine
         * the checkpoint REDO pointer.
         */
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

        /*
         * If this isn't a shutdown or forced checkpoint, and we have not inserted
         * any XLOG records since the start of the last checkpoint, skip the
         * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
         * when the system is idle. That wastes log space, and more importantly it
         * exposes us to possible loss of both current and previous checkpoint
         * records if the machine crashes just as we're writing the update.
         * (Perhaps it'd make even more sense to checkpoint only when the previous
         * checkpoint record is in a different xlog page?)
         *
         * We have to make two tests to determine that nothing has happened since
         * the start of the last checkpoint: current insertion point must match
         * the end of the last checkpoint record, and its redo pointer must point
         * to itself.
         */
        if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
        {
                XLogRecPtr      curInsert;

                INSERT_RECPTR(curInsert, Insert, Insert->curridx);
                if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
                        curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
                        MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
                        ControlFile->checkPoint.xlogid ==
                        ControlFile->checkPointCopy.redo.xlogid &&
                        ControlFile->checkPoint.xrecoff ==
                        ControlFile->checkPointCopy.redo.xrecoff)
                {
                        /* Nothing to do: drop both locks, leave the critical section */
                        LWLockRelease(WALInsertLock);
                        LWLockRelease(CheckpointLock);
                        END_CRIT_SECTION();
                        return;
                }
        }

        /*
         * Compute new REDO record ptr = location of next XLOG record.
         *
         * NB: this is NOT necessarily where the checkpoint record itself will be,
         * since other backends may insert more XLOG records while we're off doing
         * the buffer flush work.  Those XLOG records are logically after the
         * checkpoint, even though physically before it.  Got that?
         */
        freespace = INSERT_FREESPACE(Insert);
        if (freespace < SizeOfXLogRecord)
        {
                (void) AdvanceXLInsertBuffer(false);
                /* OK to ignore update return flag, since we will do flush anyway */
                freespace = INSERT_FREESPACE(Insert);
        }
        INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);

        /*
         * Here we update the shared RedoRecPtr for future XLogInsert calls; this
         * must be done while holding the insert lock AND the info_lck.
         *
         * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
         * pointing past where it really needs to point.  This is okay; the only
         * consequence is that XLogInsert might back up whole buffers that it
         * didn't really need to.  We can't postpone advancing RedoRecPtr because
         * XLogInserts that happen while we are dumping buffers must assume that
         * their buffer changes are not included in the checkpoint.
         */
        {
                /* use volatile pointer to prevent code rearrangement */
                volatile XLogCtlData *xlogctl = XLogCtl;

                SpinLockAcquire(&xlogctl->info_lck);
                RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
                SpinLockRelease(&xlogctl->info_lck);
        }

        /*
         * Now we can release WAL insert lock, allowing other xacts to proceed
         * while we are flushing disk buffers.
         */
        LWLockRelease(WALInsertLock);

        /*
         * If enabled, log checkpoint start.  We postpone this until now so as not
         * to log anything if we decided to skip the checkpoint.
         */
        if (log_checkpoints)
                LogCheckpointStart(flags);

        TRACE_POSTGRESQL_CHECKPOINT_START(flags);

        /*
         * Before flushing data, we must wait for any transactions that are
         * currently in their commit critical sections.  If an xact inserted its
         * commit record into XLOG just before the REDO point, then a crash
         * restart from the REDO point would not replay that record, which means
         * that our flushing had better include the xact's update of pg_clog.  So
         * we wait till he's out of his commit critical section before proceeding.
         * See notes in RecordTransactionCommit().
         *
         * Because we've already released WALInsertLock, this test is a bit fuzzy:
         * it is possible that we will wait for xacts we didn't really need to
         * wait for.  But the delay should be short and it seems better to make
         * checkpoint take a bit longer than to hold locks longer than necessary.
         * (In fact, the whole reason we have this issue is that xact.c does
         * commit record XLOG insertion and clog update as two separate steps
         * protected by different locks, but again that seems best on grounds of
         * minimizing lock contention.)
         *
         * A transaction that has not yet set inCommit when we look cannot be at
         * risk, since he's not inserted his commit record yet; and one that's
         * already cleared it is not at risk either, since he's done fixing clog
         * and we will correctly flush the update below.  So we cannot miss any
         * xacts we need to wait for.
         */
        nInCommit = GetTransactionsInCommit(&inCommitXids);
        if (nInCommit > 0)
        {
                /* Sleep at least once, then poll until none of them is in commit */
                do
                {
                        pg_usleep(10000L);      /* wait for 10 msec */
                } while (HaveTransactionsInCommit(inCommitXids, nInCommit));
        }
        pfree(inCommitXids);

        /*
         * Get the other info we need for the checkpoint record.
         */
        LWLockAcquire(XidGenLock, LW_SHARED);
        checkPoint.nextXid = ShmemVariableCache->nextXid;
        LWLockRelease(XidGenLock);

        /* Increase XID epoch if we've wrapped around since last checkpoint */
        checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
        if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
                checkPoint.nextXidEpoch++;

        /*
         * NOTE(review): for an online checkpoint the remaining oidCount is added
         * to nextOid; the reason is not visible in this function --- confirm
         * against the OID allocation code.
         */
        LWLockAcquire(OidGenLock, LW_SHARED);
        checkPoint.nextOid = ShmemVariableCache->nextOid;
        if (!shutdown)
                checkPoint.nextOid += ShmemVariableCache->oidCount;
        LWLockRelease(OidGenLock);

        MultiXactGetCheckptMulti(shutdown,
                                                         &checkPoint.nextMulti,
                                                         &checkPoint.nextMultiOffset);

        /*
         * Having constructed the checkpoint record, ensure all shmem disk buffers
         * and commit-log buffers are flushed to disk.
         *
         * This I/O could fail for various reasons.  If so, we will fail to
         * complete the checkpoint, but there is no reason to force a system
         * panic. Accordingly, exit critical section while doing it.
         */
        END_CRIT_SECTION();

        CheckPointGuts(checkPoint.redo, flags);

        START_CRIT_SECTION();

        /*
         * Now insert the checkpoint record into XLOG.
         */
        rdata.data = (char *) (&checkPoint);
        rdata.len = sizeof(checkPoint);
        rdata.buffer = InvalidBuffer;
        rdata.next = NULL;

        recptr = XLogInsert(RM_XLOG_ID,
                                                shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
                                                XLOG_CHECKPOINT_ONLINE,
                                                &rdata);

        XLogFlush(recptr);

        /*
         * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
         * = end of actual checkpoint record.
         *
         * In a shutdown checkpoint the record must sit exactly at the REDO
         * pointer; anything else means someone inserted XLOG in between.
         */
        if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
                ereport(PANIC,
                                (errmsg("concurrent transaction log activity while database system is shutting down")));

        /*
         * Select point at which we can truncate the log, which we base on the
         * prior checkpoint's earliest info.  (This must be computed before
         * checkPointCopy is overwritten just below.)
         */
        XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);

        /*
         * Update the control file.
         */
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        if (shutdown)
                ControlFile->state = DB_SHUTDOWNED;
        ControlFile->prevCheckPoint = ControlFile->checkPoint;
        ControlFile->checkPoint = ProcLastRecPtr;
        ControlFile->checkPointCopy = checkPoint;
        ControlFile->time = (pg_time_t) time(NULL);
        UpdateControlFile();
        LWLockRelease(ControlFileLock);

        /* Update shared-memory copy of checkpoint XID/epoch */
        {
                /* use volatile pointer to prevent code rearrangement */
                volatile XLogCtlData *xlogctl = XLogCtl;

                SpinLockAcquire(&xlogctl->info_lck);
                xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
                xlogctl->ckptXid = checkPoint.nextXid;
                SpinLockRelease(&xlogctl->info_lck);
        }

        /*
         * We are now done with critical updates; no need for system panic if we
         * have trouble while fooling with old log segments.
         */
        END_CRIT_SECTION();

        /*
         * Let smgr do post-checkpoint cleanup (eg, deleting old files).
         */
        smgrpostckpt();

        /*
         * Delete old log files (those no longer needed even for previous
         * checkpoint).  This is skipped when the prior checkpoint's redo pointer
         * lies in segment 0/0.
         */
        if (_logId || _logSeg)
        {
                PrevLogSeg(_logId, _logSeg);
                RemoveOldXlogFiles(_logId, _logSeg, recptr);
        }

        /*
         * Make more log segments if needed.  (Do this after recycling old log
         * segments, since that may supply some of the needed files.)
         */
        if (!shutdown)
                PreallocXlogFiles(recptr);

        /*
         * Truncate pg_subtrans if possible.  We can throw away all data before
         * the oldest XMIN of any running transaction.  No future transaction will
         * attempt to reference any pg_subtrans entry older than that (see Asserts
         * in subtrans.c).  During recovery, though, we mustn't do this because
         * StartupSUBTRANS hasn't been called yet.
         */
        if (!InRecovery)
                TruncateSUBTRANS(GetOldestXmin(true, false));

        /* All real work is done, but log before releasing lock. */
        if (log_checkpoints)
                LogCheckpointEnd();

        TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                NBuffers, CheckpointStats.ckpt_segs_added,
                                CheckpointStats.ckpt_segs_removed,
                                CheckpointStats.ckpt_segs_recycled);

        LWLockRelease(CheckpointLock);
}
6082
6083 /*
6084  * Flush all data in shared memory to disk, and fsync
6085  *
6086  * This is the common code shared between regular checkpoints and
6087  * recovery restartpoints.
6088  */
6089 static void
6090 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
6091 {
6092         CheckPointCLOG();
6093         CheckPointSUBTRANS();
6094         CheckPointMultiXact();
6095         CheckPointBuffers(flags);       /* performs all required fsyncs */
6096         /* We deliberately delay 2PC checkpointing as long as possible */
6097         CheckPointTwoPhase(checkPointRedo);
6098 }
6099
6100 /*
6101  * Set a recovery restart point if appropriate
6102  *
6103  * This is similar to CreateCheckPoint, but is used during WAL recovery
6104  * to establish a point from which recovery can roll forward without
6105  * replaying the entire recovery log.  This function is called each time
6106  * a checkpoint record is read from XLOG; it must determine whether a
6107  * restartpoint is needed or not.
6108  */
6109 static void
6110 RecoveryRestartPoint(const CheckPoint *checkPoint)
6111 {
6112         int                     elapsed_secs;
6113         int                     rmid;
6114
6115         /*
6116          * Do nothing if the elapsed time since the last restartpoint is less than
6117          * half of checkpoint_timeout.  (We use a value less than
6118          * checkpoint_timeout so that variations in the timing of checkpoints on
6119          * the master, or speed of transmission of WAL segments to a slave, won't
6120          * make the slave skip a restartpoint once it's synced with the master.)
6121          * Checking true elapsed time keeps us from doing restartpoints too often
6122          * while rapidly scanning large amounts of WAL.
6123          */
6124         elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
6125         if (elapsed_secs < CheckPointTimeout / 2)
6126                 return;
6127
6128         /*
6129          * Is it safe to checkpoint?  We must ask each of the resource managers
6130          * whether they have any partial state information that might prevent a
6131          * correct restart from this point.  If so, we skip this opportunity, but
6132          * return at the next checkpoint record for another try.
6133          */
6134         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6135         {
6136                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
6137                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
6138                         {
6139                                 elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
6140                                          rmid,
6141                                          checkPoint->redo.xlogid,
6142                                          checkPoint->redo.xrecoff);
6143                                 return;
6144                         }
6145         }
6146
6147         /*
6148          * OK, force data out to disk
6149          */
6150         CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
6151
6152         /*
6153          * Update pg_control so that any subsequent crash will restart from this
6154          * checkpoint.  Note: ReadRecPtr gives the XLOG address of the checkpoint
6155          * record itself.
6156          */
6157         ControlFile->prevCheckPoint = ControlFile->checkPoint;
6158         ControlFile->checkPoint = ReadRecPtr;
6159         ControlFile->checkPointCopy = *checkPoint;
6160         ControlFile->time = (pg_time_t) time(NULL);
6161         UpdateControlFile();
6162
6163         ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
6164                         (errmsg("recovery restart point at %X/%X",
6165                                         checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
6166         if (recoveryLastXTime)
6167                 ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
6168                                 (errmsg("last completed transaction was at log time %s",
6169                                                 timestamptz_to_str(recoveryLastXTime))));
6170 }
6171
6172 /*
6173  * Write a NEXTOID log record
6174  */
6175 void
6176 XLogPutNextOid(Oid nextOid)
6177 {
6178         XLogRecData rdata;
6179
6180         rdata.data = (char *) (&nextOid);
6181         rdata.len = sizeof(Oid);
6182         rdata.buffer = InvalidBuffer;
6183         rdata.next = NULL;
6184         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
6185
6186         /*
6187          * We need not flush the NEXTOID record immediately, because any of the
6188          * just-allocated OIDs could only reach disk as part of a tuple insert or
6189          * update that would have its own XLOG record that must follow the NEXTOID
6190          * record.      Therefore, the standard buffer LSN interlock applied to those
6191          * records will ensure no such OID reaches disk before the NEXTOID record
6192          * does.
6193          *
6194          * Note, however, that the above statement only covers state "within" the
6195          * database.  When we use a generated OID as a file or directory name, we
6196          * are in a sense violating the basic WAL rule, because that filesystem
6197          * change may reach disk before the NEXTOID WAL record does.  The impact
6198          * of this is that if a database crash occurs immediately afterward, we
6199          * might after restart re-generate the same OID and find that it conflicts
6200          * with the leftover file or directory.  But since for safety's sake we
6201          * always loop until finding a nonconflicting filename, this poses no real
6202          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
6203          */
6204 }
6205
6206 /*
6207  * Write an XLOG SWITCH record.
6208  *
6209  * Here we just blindly issue an XLogInsert request for the record.
6210  * All the magic happens inside XLogInsert.
6211  *
6212  * The return value is either the end+1 address of the switch record,
6213  * or the end+1 address of the prior segment if we did not need to
6214  * write a switch record because we are already at segment start.
6215  */
6216 XLogRecPtr
6217 RequestXLogSwitch(void)
6218 {
6219         XLogRecPtr      RecPtr;
6220         XLogRecData rdata;
6221
6222         /* XLOG SWITCH, alone among xlog record types, has no data */
6223         rdata.buffer = InvalidBuffer;
6224         rdata.data = NULL;
6225         rdata.len = 0;
6226         rdata.next = NULL;
6227
6228         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
6229
6230         return RecPtr;
6231 }
6232
6233 /*
6234  * XLOG resource manager's routines
6235  */
6236 void
6237 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
6238 {
6239         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6240
6241         /* Backup blocks are not used in xlog records */
6242         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
6243
6244         if (info == XLOG_NEXTOID)
6245         {
6246                 Oid                     nextOid;
6247
6248                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
6249                 if (ShmemVariableCache->nextOid < nextOid)
6250                 {
6251                         ShmemVariableCache->nextOid = nextOid;
6252                         ShmemVariableCache->oidCount = 0;
6253                 }
6254         }
6255         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
6256         {
6257                 CheckPoint      checkPoint;
6258
6259                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6260                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
6261                 ShmemVariableCache->nextXid = checkPoint.nextXid;
6262                 ShmemVariableCache->nextOid = checkPoint.nextOid;
6263                 ShmemVariableCache->oidCount = 0;
6264                 MultiXactSetNextMXact(checkPoint.nextMulti,
6265                                                           checkPoint.nextMultiOffset);
6266
6267                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
6268                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
6269                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
6270
6271                 /*
6272                  * TLI may change in a shutdown checkpoint, but it shouldn't decrease
6273                  */
6274                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
6275                 {
6276                         if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
6277                                 !list_member_int(expectedTLIs,
6278                                                                  (int) checkPoint.ThisTimeLineID))
6279                                 ereport(PANIC,
6280                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
6281                                                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
6282                         /* Following WAL records should be run with new TLI */
6283                         ThisTimeLineID = checkPoint.ThisTimeLineID;
6284                 }
6285
6286                 RecoveryRestartPoint(&checkPoint);
6287         }
6288         else if (info == XLOG_CHECKPOINT_ONLINE)
6289         {
6290                 CheckPoint      checkPoint;
6291
6292                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6293                 /* In an ONLINE checkpoint, treat the counters like NEXTOID */
6294                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
6295                                                                   checkPoint.nextXid))
6296                         ShmemVariableCache->nextXid = checkPoint.nextXid;
6297                 if (ShmemVariableCache->nextOid < checkPoint.nextOid)
6298                 {
6299                         ShmemVariableCache->nextOid = checkPoint.nextOid;
6300                         ShmemVariableCache->oidCount = 0;
6301                 }
6302                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
6303                                                                   checkPoint.nextMultiOffset);
6304
6305                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
6306                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
6307                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
6308
6309                 /* TLI should not change in an on-line checkpoint */
6310                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
6311                         ereport(PANIC,
6312                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
6313                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
6314
6315                 RecoveryRestartPoint(&checkPoint);
6316         }
6317         else if (info == XLOG_NOOP)
6318         {
6319                 /* nothing to do here */
6320         }
6321         else if (info == XLOG_SWITCH)
6322         {
6323                 /* nothing to do here */
6324         }
6325 }
6326
6327 void
6328 xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
6329 {
6330         uint8           info = xl_info & ~XLR_INFO_MASK;
6331
6332         if (info == XLOG_CHECKPOINT_SHUTDOWN ||
6333                 info == XLOG_CHECKPOINT_ONLINE)
6334         {
6335                 CheckPoint *checkpoint = (CheckPoint *) rec;
6336
6337                 appendStringInfo(buf, "checkpoint: redo %X/%X; "
6338                                                  "tli %u; xid %u/%u; oid %u; multi %u; offset %u; %s",
6339                                                  checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
6340                                                  checkpoint->ThisTimeLineID,
6341                                                  checkpoint->nextXidEpoch, checkpoint->nextXid,
6342                                                  checkpoint->nextOid,
6343                                                  checkpoint->nextMulti,
6344                                                  checkpoint->nextMultiOffset,
6345                                  (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
6346         }
6347         else if (info == XLOG_NOOP)
6348         {
6349                 appendStringInfo(buf, "xlog no-op");
6350         }
6351         else if (info == XLOG_NEXTOID)
6352         {
6353                 Oid                     nextOid;
6354
6355                 memcpy(&nextOid, rec, sizeof(Oid));
6356                 appendStringInfo(buf, "nextOid: %u", nextOid);
6357         }
6358         else if (info == XLOG_SWITCH)
6359         {
6360                 appendStringInfo(buf, "xlog switch");
6361         }
6362         else
6363                 appendStringInfo(buf, "UNKNOWN");
6364 }
6365
#ifdef WAL_DEBUG

/*
 * xlog_outrec: append a one-line summary of a record header to buf.
 *
 * Prints the previous-record pointer and xid, then a "bkpbN" tag (N counted
 * from 1) for every backup-block bit set in xl_info, and finally the name of
 * the owning resource manager as found in RmgrTable.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	int			blk = 0;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 record->xl_prev.xlogid, record->xl_prev.xrecoff,
					 record->xl_xid);

	while (blk < XLR_MAX_BKP_BLOCKS)
	{
		if ((record->xl_info & XLR_SET_BKP_BLOCK(blk)) != 0)
			appendStringInfo(buf, "; bkpb%d", blk + 1);
		blk++;
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}
#endif   /* WAL_DEBUG */
6386
6387
6388 /*
6389  * Return the (possible) sync flag used for opening a file, depending on the
6390  * value of the GUC wal_sync_method.
6391  */
6392 static int
6393 get_sync_bit(int method)
6394 {
6395         /* If fsync is disabled, never open in sync mode */
6396         if (!enableFsync)
6397                 return 0;
6398
6399         switch (method)
6400         {
6401                 /*
6402                  * enum values for all sync options are defined even if they are not
6403                  * supported on the current platform.  But if not, they are not
6404                  * included in the enum option array, and therefore will never be seen
6405                  * here.
6406                  */
6407                 case SYNC_METHOD_FSYNC:
6408                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
6409                 case SYNC_METHOD_FDATASYNC:
6410                         return 0;
6411 #ifdef OPEN_SYNC_FLAG
6412                 case SYNC_METHOD_OPEN:
6413                         return OPEN_SYNC_FLAG;
6414 #endif
6415 #ifdef OPEN_DATASYNC_FLAG
6416                 case SYNC_METHOD_OPEN_DSYNC:
6417                         return OPEN_DATASYNC_FLAG;
6418 #endif
6419                 default:
6420                         /* can't happen (unless we are out of sync with option array) */
6421                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
6422                         return 0; /* silence warning */
6423         }
6424 }
6425
6426 /*
6427  * GUC support
6428  */
6429 bool
6430 assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
6431 {
6432         if (!doit)
6433                 return true;
6434
6435         if (sync_method != new_sync_method)
6436         {
6437                 /*
6438                  * To ensure that no blocks escape unsynced, force an fsync on the
6439                  * currently open log segment (if any).  Also, if the open flag is
6440                  * changing, close the log file so it will be reopened (with new flag
6441                  * bit) at next use.
6442                  */
6443                 if (openLogFile >= 0)
6444                 {
6445                         if (pg_fsync(openLogFile) != 0)
6446                                 ereport(PANIC,
6447                                                 (errcode_for_file_access(),
6448                                                  errmsg("could not fsync log file %u, segment %u: %m",
6449                                                                 openLogId, openLogSeg)));
6450                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
6451                                 XLogFileClose();
6452                 }
6453         }
6454
6455         return true;
6456 }
6457
6458
6459 /*
6460  * Issue appropriate kind of fsync (if any) on the current XLOG output file
6461  */
6462 static void
6463 issue_xlog_fsync(void)
6464 {
6465         switch (sync_method)
6466         {
6467                 case SYNC_METHOD_FSYNC:
6468                         if (pg_fsync_no_writethrough(openLogFile) != 0)
6469                                 ereport(PANIC,
6470                                                 (errcode_for_file_access(),
6471                                                  errmsg("could not fsync log file %u, segment %u: %m",
6472                                                                 openLogId, openLogSeg)));
6473                         break;
6474 #ifdef HAVE_FSYNC_WRITETHROUGH
6475                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
6476                         if (pg_fsync_writethrough(openLogFile) != 0)
6477                                 ereport(PANIC,
6478                                                 (errcode_for_file_access(),
6479                                                  errmsg("could not fsync write-through log file %u, segment %u: %m",
6480                                                                 openLogId, openLogSeg)));
6481                         break;
6482 #endif
6483 #ifdef HAVE_FDATASYNC
6484                 case SYNC_METHOD_FDATASYNC:
6485                         if (pg_fdatasync(openLogFile) != 0)
6486                                 ereport(PANIC,
6487                                                 (errcode_for_file_access(),
6488                                         errmsg("could not fdatasync log file %u, segment %u: %m",
6489                                                    openLogId, openLogSeg)));
6490                         break;
6491 #endif
6492                 case SYNC_METHOD_OPEN:
6493                 case SYNC_METHOD_OPEN_DSYNC:
6494                         /* write synced it already */
6495                         break;
6496                 default:
6497                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
6498                         break;
6499         }
6500 }
6501
6502
6503 /*
6504  * pg_start_backup: set up for taking an on-line backup dump
6505  *
6506  * Essentially what this does is to create a backup label file in $PGDATA,
6507  * where it will be archived as part of the backup dump.  The label file
6508  * contains the user-supplied label string (typically this would be used
6509  * to tell where the backup dump will be stored) and the starting time and
6510  * starting WAL location for the dump.
6511  */
6512 Datum
6513 pg_start_backup(PG_FUNCTION_ARGS)
6514 {
6515         text       *backupid = PG_GETARG_TEXT_P(0);
6516         char       *backupidstr;
6517         XLogRecPtr      checkpointloc;
6518         XLogRecPtr      startpoint;
6519         pg_time_t       stamp_time;
6520         char            strfbuf[128];
6521         char            xlogfilename[MAXFNAMELEN];
6522         uint32          _logId;
6523         uint32          _logSeg;
6524         struct stat stat_buf;
6525         FILE       *fp;
6526
6527         if (!superuser())
6528                 ereport(ERROR,
6529                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
6530                                  errmsg("must be superuser to run a backup")));
6531
6532         if (!XLogArchivingActive())
6533                 ereport(ERROR,
6534                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6535                                  errmsg("WAL archiving is not active"),
6536                                  errhint("archive_mode must be enabled at server start.")));
6537
6538         if (!XLogArchiveCommandSet())
6539                 ereport(ERROR,
6540                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6541                                  errmsg("WAL archiving is not active"),
6542                                  errhint("archive_command must be defined before "
6543                                                  "online backups can be made safely.")));
6544
6545         backupidstr = text_to_cstring(backupid);
6546
6547         /*
6548          * Mark backup active in shared memory.  We must do full-page WAL writes
6549          * during an on-line backup even if not doing so at other times, because
6550          * it's quite possible for the backup dump to obtain a "torn" (partially
6551          * written) copy of a database page if it reads the page concurrently with
6552          * our write to the same page.  This can be fixed as long as the first
6553          * write to the page in the WAL sequence is a full-page write. Hence, we
6554          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
6555          * are no dirty pages in shared memory that might get dumped while the
6556          * backup is in progress without having a corresponding WAL record.  (Once
6557          * the backup is complete, we need not force full-page writes anymore,
6558          * since we expect that any pages not modified during the backup interval
6559          * must have been correctly captured by the backup.)
6560          *
6561          * We must hold WALInsertLock to change the value of forcePageWrites, to
6562          * ensure adequate interlocking against XLogInsert().
6563          */
6564         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6565         if (XLogCtl->Insert.forcePageWrites)
6566         {
6567                 LWLockRelease(WALInsertLock);
6568                 ereport(ERROR,
6569                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6570                                  errmsg("a backup is already in progress"),
6571                                  errhint("Run pg_stop_backup() and try again.")));
6572         }
6573         XLogCtl->Insert.forcePageWrites = true;
6574         LWLockRelease(WALInsertLock);
6575
6576         /* Ensure we release forcePageWrites if fail below */
6577         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
6578         {
6579                 /*
6580                  * Force a CHECKPOINT.  Aside from being necessary to prevent torn
6581                  * page problems, this guarantees that two successive backup runs will
6582                  * have different checkpoint positions and hence different history
6583                  * file names, even if nothing happened in between.
6584                  *
6585                  * We don't use CHECKPOINT_IMMEDIATE, hence this can take awhile.
6586                  */
6587                 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT);
6588
6589                 /*
6590                  * Now we need to fetch the checkpoint record location, and also its
6591                  * REDO pointer.  The oldest point in WAL that would be needed to
6592                  * restore starting from the checkpoint is precisely the REDO pointer.
6593                  */
6594                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6595                 checkpointloc = ControlFile->checkPoint;
6596                 startpoint = ControlFile->checkPointCopy.redo;
6597                 LWLockRelease(ControlFileLock);
6598
6599                 XLByteToSeg(startpoint, _logId, _logSeg);
6600                 XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
6601
6602                 /* Use the log timezone here, not the session timezone */
6603                 stamp_time = (pg_time_t) time(NULL);
6604                 pg_strftime(strfbuf, sizeof(strfbuf),
6605                                         "%Y-%m-%d %H:%M:%S %Z",
6606                                         pg_localtime(&stamp_time, log_timezone));
6607
6608                 /*
6609                  * Check for existing backup label --- implies a backup is already
6610                  * running.  (XXX given that we checked forcePageWrites above, maybe
6611                  * it would be OK to just unlink any such label file?)
6612                  */
6613                 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
6614                 {
6615                         if (errno != ENOENT)
6616                                 ereport(ERROR,
6617                                                 (errcode_for_file_access(),
6618                                                  errmsg("could not stat file \"%s\": %m",
6619                                                                 BACKUP_LABEL_FILE)));
6620                 }
6621                 else
6622                         ereport(ERROR,
6623                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6624                                          errmsg("a backup is already in progress"),
6625                                          errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
6626                                                          BACKUP_LABEL_FILE)));
6627
6628                 /*
6629                  * Okay, write the file
6630                  */
6631                 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
6632                 if (!fp)
6633                         ereport(ERROR,
6634                                         (errcode_for_file_access(),
6635                                          errmsg("could not create file \"%s\": %m",
6636                                                         BACKUP_LABEL_FILE)));
6637                 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
6638                                 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
6639                 fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
6640                                 checkpointloc.xlogid, checkpointloc.xrecoff);
6641                 fprintf(fp, "START TIME: %s\n", strfbuf);
6642                 fprintf(fp, "LABEL: %s\n", backupidstr);
6643                 if (fflush(fp) || ferror(fp) || FreeFile(fp))
6644                         ereport(ERROR,
6645                                         (errcode_for_file_access(),
6646                                          errmsg("could not write file \"%s\": %m",
6647                                                         BACKUP_LABEL_FILE)));
6648         }
6649         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
6650
6651         /*
6652          * We're done.  As a convenience, return the starting WAL location.
6653          */
6654         snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
6655                          startpoint.xlogid, startpoint.xrecoff);
6656         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
6657 }
6658
/*
 * Error cleanup callback for pg_start_backup
 *
 * pg_start_backup registers this through PG_ENSURE_ERROR_CLEANUP after it
 * has set forcePageWrites, so that the flag does not stay set when the rest
 * of that function fails.  Neither argument is examined; pg_start_backup
 * passes (Datum) 0 for arg.
 */
static void
pg_start_backup_callback(int code, Datum arg)
{
	/*
	 * Turn off forcePageWrites on failure.  The flag is changed while
	 * holding WALInsertLock exclusively, just as when it is set.
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	XLogCtl->Insert.forcePageWrites = false;
	LWLockRelease(WALInsertLock);
}
6668
6669 /*
6670  * pg_stop_backup: finish taking an on-line backup dump
6671  *
6672  * We remove the backup label file created by pg_start_backup, and instead
6673  * create a backup history file in pg_xlog (whence it will immediately be
6674  * archived).  The backup history file contains the same info found in
6675  * the label file, plus the backup-end time and WAL location.
6676  * Note: different from CancelBackup which just cancels online backup mode.
6677  */
6678 Datum
6679 pg_stop_backup(PG_FUNCTION_ARGS)
6680 {
6681         XLogRecPtr      startpoint;
6682         XLogRecPtr      stoppoint;
6683         pg_time_t       stamp_time;
6684         char            strfbuf[128];
6685         char            histfilepath[MAXPGPATH];
6686         char            startxlogfilename[MAXFNAMELEN];
6687         char            stopxlogfilename[MAXFNAMELEN];
6688         char            lastxlogfilename[MAXFNAMELEN];
6689         char            histfilename[MAXFNAMELEN];
6690         uint32          _logId;
6691         uint32          _logSeg;
6692         FILE       *lfp;
6693         FILE       *fp;
6694         char            ch;
6695         int                     ich;
6696         int                     seconds_before_warning;
6697         int                     waits = 0;
6698
6699         if (!superuser())
6700                 ereport(ERROR,
6701                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
6702                                  (errmsg("must be superuser to run a backup"))));
6703
6704         if (!XLogArchivingActive())
6705                 ereport(ERROR,
6706                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6707                                  errmsg("WAL archiving is not active"),
6708                                  errhint("archive_mode must be enabled at server start.")));
6709
6710         /*
6711          * OK to clear forcePageWrites
6712          */
6713         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6714         XLogCtl->Insert.forcePageWrites = false;
6715         LWLockRelease(WALInsertLock);
6716
6717         /*
6718          * Force a switch to a new xlog segment file, so that the backup is valid
6719          * as soon as archiver moves out the current segment file. We'll report
6720          * the end address of the XLOG SWITCH record as the backup stopping point.
6721          */
6722         stoppoint = RequestXLogSwitch();
6723
6724         XLByteToSeg(stoppoint, _logId, _logSeg);
6725         XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);
6726
6727         /* Use the log timezone here, not the session timezone */
6728         stamp_time = (pg_time_t) time(NULL);
6729         pg_strftime(strfbuf, sizeof(strfbuf),
6730                                 "%Y-%m-%d %H:%M:%S %Z",
6731                                 pg_localtime(&stamp_time, log_timezone));
6732
6733         /*
6734          * Open the existing label file
6735          */
6736         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
6737         if (!lfp)
6738         {
6739                 if (errno != ENOENT)
6740                         ereport(ERROR,
6741                                         (errcode_for_file_access(),
6742                                          errmsg("could not read file \"%s\": %m",
6743                                                         BACKUP_LABEL_FILE)));
6744                 ereport(ERROR,
6745                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6746                                  errmsg("a backup is not in progress")));
6747         }
6748
6749         /*
6750          * Read and parse the START WAL LOCATION line (this code is pretty crude,
6751          * but we are not expecting any variability in the file format).
6752          */
6753         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
6754                            &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
6755                            &ch) != 4 || ch != '\n')
6756                 ereport(ERROR,
6757                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6758                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
6759
6760         /*
6761          * Write the backup history file
6762          */
6763         XLByteToSeg(startpoint, _logId, _logSeg);
6764         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
6765                                                   startpoint.xrecoff % XLogSegSize);
6766         fp = AllocateFile(histfilepath, "w");
6767         if (!fp)
6768                 ereport(ERROR,
6769                                 (errcode_for_file_access(),
6770                                  errmsg("could not create file \"%s\": %m",
6771                                                 histfilepath)));
6772         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
6773                         startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
6774         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
6775                         stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
6776         /* transfer remaining lines from label to history file */
6777         while ((ich = fgetc(lfp)) != EOF)
6778                 fputc(ich, fp);
6779         fprintf(fp, "STOP TIME: %s\n", strfbuf);
6780         if (fflush(fp) || ferror(fp) || FreeFile(fp))
6781                 ereport(ERROR,
6782                                 (errcode_for_file_access(),
6783                                  errmsg("could not write file \"%s\": %m",
6784                                                 histfilepath)));
6785
6786         /*
6787          * Close and remove the backup label file
6788          */
6789         if (ferror(lfp) || FreeFile(lfp))
6790                 ereport(ERROR,
6791                                 (errcode_for_file_access(),
6792                                  errmsg("could not read file \"%s\": %m",
6793                                                 BACKUP_LABEL_FILE)));
6794         if (unlink(BACKUP_LABEL_FILE) != 0)
6795                 ereport(ERROR,
6796                                 (errcode_for_file_access(),
6797                                  errmsg("could not remove file \"%s\": %m",
6798                                                 BACKUP_LABEL_FILE)));
6799
6800         /*
6801          * Clean out any no-longer-needed history files.  As a side effect, this
6802          * will post a .ready file for the newly created history file, notifying
6803          * the archiver that history file may be archived immediately.
6804          */
6805         CleanupBackupHistory();
6806
6807         /*
6808          * Wait until both the last WAL file filled during backup and the history
6809          * file have been archived.  We assume that the alphabetic sorting
6810          * property of the WAL files ensures any earlier WAL files are safely
6811          * archived as well.
6812          *
6813          * We wait forever, since archive_command is supposed to work and
6814          * we assume the admin wanted his backup to work completely. If you
6815          * don't wish to wait, you can set statement_timeout.
6816          */
6817         XLByteToPrevSeg(stoppoint, _logId, _logSeg);
6818         XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
6819
6820         XLByteToSeg(startpoint, _logId, _logSeg);
6821         BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
6822                                                   startpoint.xrecoff % XLogSegSize);
6823
6824         seconds_before_warning = 60;
6825         waits = 0;
6826
6827         while (XLogArchiveIsBusy(lastxlogfilename) ||
6828                    XLogArchiveIsBusy(histfilename))
6829         {
6830                 CHECK_FOR_INTERRUPTS();
6831
6832                 pg_usleep(1000000L);
6833
6834                 if (++waits >= seconds_before_warning)
6835                 {
6836                         seconds_before_warning *= 2;     /* This wraps in >10 years... */
6837                         ereport(WARNING,
6838                                         (errmsg("pg_stop_backup still waiting for archive to complete (%d seconds elapsed)",
6839                                                         waits)));
6840                 }
6841         }
6842
6843         /*
6844          * We're done.  As a convenience, return the ending WAL location.
6845          */
6846         snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
6847                          stoppoint.xlogid, stoppoint.xrecoff);
6848         PG_RETURN_TEXT_P(cstring_to_text(stopxlogfilename));
6849 }
6850
6851 /*
6852  * pg_switch_xlog: switch to next xlog file
6853  */
6854 Datum
6855 pg_switch_xlog(PG_FUNCTION_ARGS)
6856 {
6857         XLogRecPtr      switchpoint;
6858         char            location[MAXFNAMELEN];
6859
6860         if (!superuser())
6861                 ereport(ERROR,
6862                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
6863                          (errmsg("must be superuser to switch transaction log files"))));
6864
6865         switchpoint = RequestXLogSwitch();
6866
6867         /*
6868          * As a convenience, return the WAL location of the switch record
6869          */
6870         snprintf(location, sizeof(location), "%X/%X",
6871                          switchpoint.xlogid, switchpoint.xrecoff);
6872         PG_RETURN_TEXT_P(cstring_to_text(location));
6873 }
6874
6875 /*
6876  * Report the current WAL write location (same format as pg_start_backup etc)
6877  *
6878  * This is useful for determining how much of WAL is visible to an external
6879  * archiving process.  Note that the data before this point is written out
6880  * to the kernel, but is not necessarily synced to disk.
6881  */
6882 Datum
6883 pg_current_xlog_location(PG_FUNCTION_ARGS)
6884 {
6885         char            location[MAXFNAMELEN];
6886
6887         /* Make sure we have an up-to-date local LogwrtResult */
6888         {
6889                 /* use volatile pointer to prevent code rearrangement */
6890                 volatile XLogCtlData *xlogctl = XLogCtl;
6891
6892                 SpinLockAcquire(&xlogctl->info_lck);
6893                 LogwrtResult = xlogctl->LogwrtResult;
6894                 SpinLockRelease(&xlogctl->info_lck);
6895         }
6896
6897         snprintf(location, sizeof(location), "%X/%X",
6898                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
6899         PG_RETURN_TEXT_P(cstring_to_text(location));
6900 }
6901
6902 /*
6903  * Report the current WAL insert location (same format as pg_start_backup etc)
6904  *
6905  * This function is mostly for debugging purposes.
6906  */
6907 Datum
6908 pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
6909 {
6910         XLogCtlInsert *Insert = &XLogCtl->Insert;
6911         XLogRecPtr      current_recptr;
6912         char            location[MAXFNAMELEN];
6913
6914         /*
6915          * Get the current end-of-WAL position ... shared lock is sufficient
6916          */
6917         LWLockAcquire(WALInsertLock, LW_SHARED);
6918         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
6919         LWLockRelease(WALInsertLock);
6920
6921         snprintf(location, sizeof(location), "%X/%X",
6922                          current_recptr.xlogid, current_recptr.xrecoff);
6923         PG_RETURN_TEXT_P(cstring_to_text(location));
6924 }
6925
6926 /*
6927  * Compute an xlog file name and decimal byte offset given a WAL location,
6928  * such as is returned by pg_stop_backup() or pg_xlog_switch().
6929  *
6930  * Note that a location exactly at a segment boundary is taken to be in
6931  * the previous segment.  This is usually the right thing, since the
6932  * expected usage is to determine which xlog file(s) are ready to archive.
6933  */
6934 Datum
6935 pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
6936 {
6937         text       *location = PG_GETARG_TEXT_P(0);
6938         char       *locationstr;
6939         unsigned int uxlogid;
6940         unsigned int uxrecoff;
6941         uint32          xlogid;
6942         uint32          xlogseg;
6943         uint32          xrecoff;
6944         XLogRecPtr      locationpoint;
6945         char            xlogfilename[MAXFNAMELEN];
6946         Datum           values[2];
6947         bool            isnull[2];
6948         TupleDesc       resultTupleDesc;
6949         HeapTuple       resultHeapTuple;
6950         Datum           result;
6951
6952         /*
6953          * Read input and parse
6954          */
6955         locationstr = text_to_cstring(location);
6956
6957         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
6958                 ereport(ERROR,
6959                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6960                                  errmsg("could not parse transaction log location \"%s\"",
6961                                                 locationstr)));
6962
6963         locationpoint.xlogid = uxlogid;
6964         locationpoint.xrecoff = uxrecoff;
6965
6966         /*
6967          * Construct a tuple descriptor for the result row.  This must match this
6968          * function's pg_proc entry!
6969          */
6970         resultTupleDesc = CreateTemplateTupleDesc(2, false);
6971         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
6972                                            TEXTOID, -1, 0);
6973         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
6974                                            INT4OID, -1, 0);
6975
6976         resultTupleDesc = BlessTupleDesc(resultTupleDesc);
6977
6978         /*
6979          * xlogfilename
6980          */
6981         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
6982         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
6983
6984         values[0] = CStringGetTextDatum(xlogfilename);
6985         isnull[0] = false;
6986
6987         /*
6988          * offset
6989          */
6990         xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;
6991
6992         values[1] = UInt32GetDatum(xrecoff);
6993         isnull[1] = false;
6994
6995         /*
6996          * Tuple jam: Having first prepared your Datums, then squash together
6997          */
6998         resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
6999
7000         result = HeapTupleGetDatum(resultHeapTuple);
7001
7002         PG_RETURN_DATUM(result);
7003 }
7004
7005 /*
7006  * Compute an xlog file name given a WAL location,
7007  * such as is returned by pg_stop_backup() or pg_xlog_switch().
7008  */
7009 Datum
7010 pg_xlogfile_name(PG_FUNCTION_ARGS)
7011 {
7012         text       *location = PG_GETARG_TEXT_P(0);
7013         char       *locationstr;
7014         unsigned int uxlogid;
7015         unsigned int uxrecoff;
7016         uint32          xlogid;
7017         uint32          xlogseg;
7018         XLogRecPtr      locationpoint;
7019         char            xlogfilename[MAXFNAMELEN];
7020
7021         locationstr = text_to_cstring(location);
7022
7023         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
7024                 ereport(ERROR,
7025                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
7026                                  errmsg("could not parse transaction log location \"%s\"",
7027                                                 locationstr)));
7028
7029         locationpoint.xlogid = uxlogid;
7030         locationpoint.xrecoff = uxrecoff;
7031
7032         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
7033         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
7034
7035         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
7036 }
7037
7038 /*
7039  * read_backup_label: check to see if a backup_label file is present
7040  *
7041  * If we see a backup_label during recovery, we assume that we are recovering
7042  * from a backup dump file, and we therefore roll forward from the checkpoint
7043  * identified by the label file, NOT what pg_control says.      This avoids the
7044  * problem that pg_control might have been archived one or more checkpoints
7045  * later than the start of the dump, and so if we rely on it as the start
7046  * point, we will fail to restore a consistent database state.
7047  *
7048  * We also attempt to retrieve the corresponding backup history file.
7049  * If successful, set *minRecoveryLoc to constrain valid PITR stopping
7050  * points.
7051  *
7052  * Returns TRUE if a backup_label was found (and fills the checkpoint
7053  * location into *checkPointLoc); returns FALSE if not.
7054  */
7055 static bool
7056 read_backup_label(XLogRecPtr *checkPointLoc, XLogRecPtr *minRecoveryLoc)
7057 {
7058         XLogRecPtr      startpoint;
7059         XLogRecPtr      stoppoint;
7060         char            histfilename[MAXFNAMELEN];
7061         char            histfilepath[MAXPGPATH];
7062         char            startxlogfilename[MAXFNAMELEN];
7063         char            stopxlogfilename[MAXFNAMELEN];
7064         TimeLineID      tli;
7065         uint32          _logId;
7066         uint32          _logSeg;
7067         FILE       *lfp;
7068         FILE       *fp;
7069         char            ch;
7070
7071         /* Default is to not constrain recovery stop point */
7072         minRecoveryLoc->xlogid = 0;
7073         minRecoveryLoc->xrecoff = 0;
7074
7075         /*
7076          * See if label file is present
7077          */
7078         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
7079         if (!lfp)
7080         {
7081                 if (errno != ENOENT)
7082                         ereport(FATAL,
7083                                         (errcode_for_file_access(),
7084                                          errmsg("could not read file \"%s\": %m",
7085                                                         BACKUP_LABEL_FILE)));
7086                 return false;                   /* it's not there, all is fine */
7087         }
7088
7089         /*
7090          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
7091          * is pretty crude, but we are not expecting any variability in the file
7092          * format).
7093          */
7094         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
7095                            &startpoint.xlogid, &startpoint.xrecoff, &tli,
7096                            startxlogfilename, &ch) != 5 || ch != '\n')
7097                 ereport(FATAL,
7098                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7099                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7100         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
7101                            &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
7102                            &ch) != 3 || ch != '\n')
7103                 ereport(FATAL,
7104                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7105                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7106         if (ferror(lfp) || FreeFile(lfp))
7107                 ereport(FATAL,
7108                                 (errcode_for_file_access(),
7109                                  errmsg("could not read file \"%s\": %m",
7110                                                 BACKUP_LABEL_FILE)));
7111
7112         /*
7113          * Try to retrieve the backup history file (no error if we can't)
7114          */
7115         XLByteToSeg(startpoint, _logId, _logSeg);
7116         BackupHistoryFileName(histfilename, tli, _logId, _logSeg,
7117                                                   startpoint.xrecoff % XLogSegSize);
7118
7119         if (InArchiveRecovery)
7120                 RestoreArchivedFile(histfilepath, histfilename, "RECOVERYHISTORY", 0);
7121         else
7122                 BackupHistoryFilePath(histfilepath, tli, _logId, _logSeg,
7123                                                           startpoint.xrecoff % XLogSegSize);
7124
7125         fp = AllocateFile(histfilepath, "r");
7126         if (fp)
7127         {
7128                 /*
7129                  * Parse history file to identify stop point.
7130                  */
7131                 if (fscanf(fp, "START WAL LOCATION: %X/%X (file %24s)%c",
7132                                    &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
7133                                    &ch) != 4 || ch != '\n')
7134                         ereport(FATAL,
7135                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7136                                          errmsg("invalid data in file \"%s\"", histfilename)));
7137                 if (fscanf(fp, "STOP WAL LOCATION: %X/%X (file %24s)%c",
7138                                    &stoppoint.xlogid, &stoppoint.xrecoff, stopxlogfilename,
7139                                    &ch) != 4 || ch != '\n')
7140                         ereport(FATAL,
7141                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7142                                          errmsg("invalid data in file \"%s\"", histfilename)));
7143                 *minRecoveryLoc = stoppoint;
7144                 if (ferror(fp) || FreeFile(fp))
7145                         ereport(FATAL,
7146                                         (errcode_for_file_access(),
7147                                          errmsg("could not read file \"%s\": %m",
7148                                                         histfilepath)));
7149         }
7150
7151         return true;
7152 }
7153
7154 /*
7155  * Error context callback for errors occurring during rm_redo().
7156  */
7157 static void
7158 rm_redo_error_callback(void *arg)
7159 {
7160         XLogRecord *record = (XLogRecord *) arg;
7161         StringInfoData buf;
7162
7163         initStringInfo(&buf);
7164         RmgrTable[record->xl_rmid].rm_desc(&buf,
7165                                                                            record->xl_info,
7166                                                                            XLogRecGetData(record));
7167
7168         /* don't bother emitting empty description */
7169         if (buf.len > 0)
7170                 errcontext("xlog redo %s", buf.data);
7171
7172         pfree(buf.data);
7173 }
7174
7175 /*
7176  * BackupInProgress: check if online backup mode is active
7177  *
7178  * This is done by checking for existence of the "backup_label" file.
7179  */
7180 bool
7181 BackupInProgress(void)
7182 {
7183         struct stat stat_buf;
7184
7185         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
7186 }
7187
7188 /*
7189  * CancelBackup: rename the "backup_label" file to cancel backup mode
7190  *
7191  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
7192  * Note that this will render an online backup in progress useless.
7193  * To correctly finish an online backup, pg_stop_backup must be called.
7194  */
7195 void
7196 CancelBackup(void)
7197 {
7198         struct stat stat_buf;
7199
7200         /* if the file is not there, return */
7201         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
7202                 return;
7203
7204         /* remove leftover file from previously cancelled backup if it exists */
7205         unlink(BACKUP_LABEL_OLD);
7206
7207         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
7208         {
7209                 ereport(LOG,
7210                                 (errmsg("online backup mode cancelled"),
7211                                  errdetail("\"%s\" was renamed to \"%s\".",
7212                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
7213         }
7214         else
7215         {
7216                 ereport(WARNING,
7217                                 (errcode_for_file_access(),
7218                                  errmsg("online backup mode was not cancelled"),
7219                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
7220                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
7221         }
7222 }
7223