Standardize rmgrdesc recovery conflict XID output.
[pgsql.git] / src / backend / storage / ipc / standby.c
blob f43229dfda610b0f3e0001944567bcc9d09d15b4
1 /*-------------------------------------------------------------------------
3 * standby.c
4 * Misc functions used in Hot Standby mode.
6 * All functions for handling RM_STANDBY_ID, which relate to
7 * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
8 * Plus conflict recovery processing.
10 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
11 * Portions Copyright (c) 1994, Regents of the University of California
13 * IDENTIFICATION
14 * src/backend/storage/ipc/standby.c
16 *-------------------------------------------------------------------------
18 #include "postgres.h"
19 #include "access/transam.h"
20 #include "access/twophase.h"
21 #include "access/xact.h"
22 #include "access/xloginsert.h"
23 #include "access/xlogrecovery.h"
24 #include "access/xlogutils.h"
25 #include "miscadmin.h"
26 #include "pgstat.h"
27 #include "storage/bufmgr.h"
28 #include "storage/lmgr.h"
29 #include "storage/proc.h"
30 #include "storage/procarray.h"
31 #include "storage/sinvaladt.h"
32 #include "storage/standby.h"
33 #include "utils/hsearch.h"
34 #include "utils/memutils.h"
35 #include "utils/ps_status.h"
36 #include "utils/timeout.h"
37 #include "utils/timestamp.h"
39 /* User-settable GUC parameters */
40 int vacuum_defer_cleanup_age;
41 int max_standby_archive_delay = 30 * 1000;
42 int max_standby_streaming_delay = 30 * 1000;
43 bool log_recovery_conflict_waits = false;
46 * Keep track of all the exclusive locks owned by original transactions.
47 * For each known exclusive lock, there is a RecoveryLockEntry in the
48 * RecoveryLockHash hash table. All RecoveryLockEntrys belonging to a
49 * given XID are chained together so that we can find them easily.
50 * For each original transaction that is known to have any such locks,
51 * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
52 * which stores the head of the chain of its locks.
54 typedef struct RecoveryLockEntry
56 xl_standby_lock key; /* hash key: xid, dbOid, relOid */
57 struct RecoveryLockEntry *next; /* chain link */
58 } RecoveryLockEntry;
60 typedef struct RecoveryLockXidEntry
62 TransactionId xid; /* hash key -- must be first */
63 struct RecoveryLockEntry *head; /* chain head */
64 } RecoveryLockXidEntry;
66 static HTAB *RecoveryLockHash = NULL;
67 static HTAB *RecoveryLockXidHash = NULL;
69 /* Flags set by timeout handlers */
70 static volatile sig_atomic_t got_standby_deadlock_timeout = false;
71 static volatile sig_atomic_t got_standby_delay_timeout = false;
72 static volatile sig_atomic_t got_standby_lock_timeout = false;
74 static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
75 ProcSignalReason reason,
76 uint32 wait_event_info,
77 bool report_waiting);
78 static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
79 static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
80 static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
81 static const char *get_recovery_conflict_desc(ProcSignalReason reason);
84 * InitRecoveryTransactionEnvironment
85 * Initialize tracking of our primary's in-progress transactions.
87 * We need to issue shared invalidations and hold locks. Holding locks
88 * means others may want to wait on us, so we need to make a lock table
89 * vxact entry like a real transaction. We could create and delete
90 * lock table entries for each transaction but its simpler just to create
91 * one permanent entry and leave it there all the time. Locks are then
92 * acquired and released as needed. Yes, this means you can see the
93 * Startup process in pg_locks once we have run this.
95 void
96 InitRecoveryTransactionEnvironment(void)
98 VirtualTransactionId vxid;
99 HASHCTL hash_ctl;
101 Assert(RecoveryLockHash == NULL); /* don't run this twice */
104 * Initialize the hash tables for tracking the locks held by each
105 * transaction.
107 hash_ctl.keysize = sizeof(xl_standby_lock);
108 hash_ctl.entrysize = sizeof(RecoveryLockEntry);
109 RecoveryLockHash = hash_create("RecoveryLockHash",
111 &hash_ctl,
112 HASH_ELEM | HASH_BLOBS);
113 hash_ctl.keysize = sizeof(TransactionId);
114 hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
115 RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
117 &hash_ctl,
118 HASH_ELEM | HASH_BLOBS);
121 * Initialize shared invalidation management for Startup process, being
122 * careful to register ourselves as a sendOnly process so we don't need to
123 * read messages, nor will we get signaled when the queue starts filling
124 * up.
126 SharedInvalBackendInit(true);
129 * Lock a virtual transaction id for Startup process.
131 * We need to do GetNextLocalTransactionId() because
132 * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
133 * manager doesn't like that at all.
135 * Note that we don't need to run XactLockTableInsert() because nobody
136 * needs to wait on xids. That sounds a little strange, but table locks
137 * are held by vxids and row level locks are held by xids. All queries
138 * hold AccessShareLocks so never block while we write or lock new rows.
140 vxid.backendId = MyBackendId;
141 vxid.localTransactionId = GetNextLocalTransactionId();
142 VirtualXactLockTableInsert(vxid);
144 standbyState = STANDBY_INITIALIZED;
148 * ShutdownRecoveryTransactionEnvironment
149 * Shut down transaction tracking
151 * Prepare to switch from hot standby mode to normal operation. Shut down
152 * recovery-time transaction tracking.
154 * This must be called even in shutdown of startup process if transaction
155 * tracking has been initialized. Otherwise some locks the tracked
156 * transactions were holding will not be released and may interfere with
157 * the processes still running (but will exit soon later) at the exit of
158 * startup process.
160 void
161 ShutdownRecoveryTransactionEnvironment(void)
164 * Do nothing if RecoveryLockHash is NULL because that means that
165 * transaction tracking has not yet been initialized or has already been
166 * shut down. This makes it safe to have possibly-redundant calls of this
167 * function during process exit.
169 if (RecoveryLockHash == NULL)
170 return;
172 /* Mark all tracked in-progress transactions as finished. */
173 ExpireAllKnownAssignedTransactionIds();
175 /* Release all locks the tracked transactions were holding */
176 StandbyReleaseAllLocks();
178 /* Destroy the lock hash tables. */
179 hash_destroy(RecoveryLockHash);
180 hash_destroy(RecoveryLockXidHash);
181 RecoveryLockHash = NULL;
182 RecoveryLockXidHash = NULL;
184 /* Cleanup our VirtualTransaction */
185 VirtualXactLockTableCleanup();
190 * -----------------------------------------------------
191 * Standby wait timers and backend cancel logic
192 * -----------------------------------------------------
196 * Determine the cutoff time at which we want to start canceling conflicting
197 * transactions. Returns zero (a time safely in the past) if we are willing
198 * to wait forever.
200 static TimestampTz
201 GetStandbyLimitTime(void)
203 TimestampTz rtime;
204 bool fromStream;
207 * The cutoff time is the last WAL data receipt time plus the appropriate
208 * delay variable. Delay of -1 means wait forever.
210 GetXLogReceiptTime(&rtime, &fromStream);
211 if (fromStream)
213 if (max_standby_streaming_delay < 0)
214 return 0; /* wait forever */
215 return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
217 else
219 if (max_standby_archive_delay < 0)
220 return 0; /* wait forever */
221 return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
#define STANDBY_INITIAL_WAIT_US  1000
static int	standbyWait_us = STANDBY_INITIAL_WAIT_US;
229 * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
230 * We wait here for a while then return. If we decide we can't wait any
231 * more then we return true, if we can wait some more return false.
233 static bool
234 WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
236 TimestampTz ltime;
238 CHECK_FOR_INTERRUPTS();
240 /* Are we past the limit time? */
241 ltime = GetStandbyLimitTime();
242 if (ltime && GetCurrentTimestamp() >= ltime)
243 return true;
246 * Sleep a bit (this is essential to avoid busy-waiting).
248 pgstat_report_wait_start(wait_event_info);
249 pg_usleep(standbyWait_us);
250 pgstat_report_wait_end();
253 * Progressively increase the sleep times, but not to more than 1s, since
254 * pg_usleep isn't interruptible on some platforms.
256 standbyWait_us *= 2;
257 if (standbyWait_us > 1000000)
258 standbyWait_us = 1000000;
260 return false;
264 * Log the recovery conflict.
266 * wait_start is the timestamp when the caller started to wait.
267 * now is the timestamp when this function has been called.
268 * wait_list is the list of virtual transaction ids assigned to
269 * conflicting processes. still_waiting indicates whether
270 * the startup process is still waiting for the recovery conflict
271 * to be resolved or not.
273 void
274 LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
275 TimestampTz now, VirtualTransactionId *wait_list,
276 bool still_waiting)
278 long secs;
279 int usecs;
280 long msecs;
281 StringInfoData buf;
282 int nprocs = 0;
285 * There must be no conflicting processes when the recovery conflict has
286 * already been resolved.
288 Assert(still_waiting || wait_list == NULL);
290 TimestampDifference(wait_start, now, &secs, &usecs);
291 msecs = secs * 1000 + usecs / 1000;
292 usecs = usecs % 1000;
294 if (wait_list)
296 VirtualTransactionId *vxids;
298 /* Construct a string of list of the conflicting processes */
299 vxids = wait_list;
300 while (VirtualTransactionIdIsValid(*vxids))
302 PGPROC *proc = BackendIdGetProc(vxids->backendId);
304 /* proc can be NULL if the target backend is not active */
305 if (proc)
307 if (nprocs == 0)
309 initStringInfo(&buf);
310 appendStringInfo(&buf, "%d", proc->pid);
312 else
313 appendStringInfo(&buf, ", %d", proc->pid);
315 nprocs++;
318 vxids++;
323 * If wait_list is specified, report the list of PIDs of active
324 * conflicting backends in a detail message. Note that if all the backends
325 * in the list are not active, no detail message is logged.
327 if (still_waiting)
329 ereport(LOG,
330 errmsg("recovery still waiting after %ld.%03d ms: %s",
331 msecs, usecs, get_recovery_conflict_desc(reason)),
332 nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
333 "Conflicting processes: %s.",
334 nprocs, buf.data) : 0);
336 else
338 ereport(LOG,
339 errmsg("recovery finished waiting after %ld.%03d ms: %s",
340 msecs, usecs, get_recovery_conflict_desc(reason)));
343 if (nprocs > 0)
344 pfree(buf.data);
348 * This is the main executioner for any query backend that conflicts with
349 * recovery processing. Judgement has already been passed on it within
350 * a specific rmgr. Here we just issue the orders to the procs. The procs
351 * then throw the required error as instructed.
353 * If report_waiting is true, "waiting" is reported in PS display and the
354 * wait for recovery conflict is reported in the log, if necessary. If
355 * the caller is responsible for reporting them, report_waiting should be
356 * false. Otherwise, both the caller and this function report the same
357 * thing unexpectedly.
359 static void
360 ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
361 ProcSignalReason reason, uint32 wait_event_info,
362 bool report_waiting)
364 TimestampTz waitStart = 0;
365 char *new_status = NULL;
366 bool logged_recovery_conflict = false;
368 /* Fast exit, to avoid a kernel call if there's no work to be done. */
369 if (!VirtualTransactionIdIsValid(*waitlist))
370 return;
372 /* Set the wait start timestamp for reporting */
373 if (report_waiting && (log_recovery_conflict_waits || update_process_title))
374 waitStart = GetCurrentTimestamp();
376 while (VirtualTransactionIdIsValid(*waitlist))
378 /* reset standbyWait_us for each xact we wait for */
379 standbyWait_us = STANDBY_INITIAL_WAIT_US;
381 /* wait until the virtual xid is gone */
382 while (!VirtualXactLock(*waitlist, false))
384 /* Is it time to kill it? */
385 if (WaitExceedsMaxStandbyDelay(wait_event_info))
387 pid_t pid;
390 * Now find out who to throw out of the balloon.
392 Assert(VirtualTransactionIdIsValid(*waitlist));
393 pid = CancelVirtualTransaction(*waitlist, reason);
396 * Wait a little bit for it to die so that we avoid flooding
397 * an unresponsive backend when system is heavily loaded.
399 if (pid != 0)
400 pg_usleep(5000L);
403 if (waitStart != 0 && (!logged_recovery_conflict || new_status == NULL))
405 TimestampTz now = 0;
406 bool maybe_log_conflict;
407 bool maybe_update_title;
409 maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
410 maybe_update_title = (update_process_title && new_status == NULL);
412 /* Get the current timestamp if not report yet */
413 if (maybe_log_conflict || maybe_update_title)
414 now = GetCurrentTimestamp();
417 * Report via ps if we have been waiting for more than 500
418 * msec (should that be configurable?)
420 if (maybe_update_title &&
421 TimestampDifferenceExceeds(waitStart, now, 500))
423 const char *old_status;
424 int len;
426 old_status = get_ps_display(&len);
427 new_status = (char *) palloc(len + 8 + 1);
428 memcpy(new_status, old_status, len);
429 strcpy(new_status + len, " waiting");
430 set_ps_display(new_status);
431 new_status[len] = '\0'; /* truncate off " waiting" */
435 * Emit the log message if the startup process is waiting
436 * longer than deadlock_timeout for recovery conflict.
438 if (maybe_log_conflict &&
439 TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
441 LogRecoveryConflict(reason, waitStart, now, waitlist, true);
442 logged_recovery_conflict = true;
447 /* The virtual transaction is gone now, wait for the next one */
448 waitlist++;
452 * Emit the log message if recovery conflict was resolved but the startup
453 * process waited longer than deadlock_timeout for it.
455 if (logged_recovery_conflict)
456 LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
457 NULL, false);
459 /* Reset ps display if we changed it */
460 if (new_status)
462 set_ps_display(new_status);
463 pfree(new_status);
468 * Generate whatever recovery conflicts are needed to eliminate snapshots that
469 * might see XIDs <= snapshotConflictHorizon as still running.
471 * snapshotConflictHorizon cutoffs are our standard approach to generating
472 * granular recovery conflicts. Note that InvalidTransactionId values are
473 * interpreted as "definitely don't need any conflicts" here, which is a
474 * general convention that WAL records can (and often do) depend on.
476 void
477 ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
478 RelFileLocator locator)
480 VirtualTransactionId *backends;
483 * If we get passed InvalidTransactionId then we do nothing (no conflict).
485 * This can happen when replaying already-applied WAL records after a
486 * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
487 * record that marks as frozen a page which was already all-visible. It's
488 * also quite common with records generated during index deletion
489 * (original execution of the deletion can reason that a recovery conflict
490 * which is sufficient for the deletion operation must take place before
491 * replay of the deletion record itself).
493 if (!TransactionIdIsValid(snapshotConflictHorizon))
494 return;
496 backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
497 locator.dbOid);
498 ResolveRecoveryConflictWithVirtualXIDs(backends,
499 PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
500 WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
501 true);
505 * Variant of ResolveRecoveryConflictWithSnapshot that works with
506 * FullTransactionId values
508 void
509 ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
510 RelFileLocator locator)
513 * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
514 * so truncate the logged FullTransactionId. If the logged value is very
515 * old, so that XID wrap-around already happened on it, there can't be any
516 * snapshots that still see it.
518 FullTransactionId nextXid = ReadNextFullTransactionId();
519 uint64 diff;
521 diff = U64FromFullTransactionId(nextXid) -
522 U64FromFullTransactionId(snapshotConflictHorizon);
523 if (diff < MaxTransactionId / 2)
525 TransactionId truncated;
527 truncated = XidFromFullTransactionId(snapshotConflictHorizon);
528 ResolveRecoveryConflictWithSnapshot(truncated, locator);
532 void
533 ResolveRecoveryConflictWithTablespace(Oid tsid)
535 VirtualTransactionId *temp_file_users;
538 * Standby users may be currently using this tablespace for their
539 * temporary files. We only care about current users because
540 * temp_tablespace parameter will just ignore tablespaces that no longer
541 * exist.
543 * Ask everybody to cancel their queries immediately so we can ensure no
544 * temp files remain and we can remove the tablespace. Nuke the entire
545 * site from orbit, it's the only way to be sure.
547 * XXX: We could work out the pids of active backends using this
548 * tablespace by examining the temp filenames in the directory. We would
549 * then convert the pids into VirtualXIDs before attempting to cancel
550 * them.
552 * We don't wait for commit because drop tablespace is non-transactional.
554 temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
555 InvalidOid);
556 ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
557 PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
558 WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
559 true);
562 void
563 ResolveRecoveryConflictWithDatabase(Oid dbid)
566 * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
567 * only waits for transactions and completely idle sessions would block
568 * us. This is rare enough that we do this as simply as possible: no wait,
569 * just force them off immediately.
571 * No locking is required here because we already acquired
572 * AccessExclusiveLock. Anybody trying to connect while we do this will
573 * block during InitPostgres() and then disconnect when they see the
574 * database has been removed.
576 while (CountDBBackends(dbid) > 0)
578 CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
581 * Wait awhile for them to die so that we avoid flooding an
582 * unresponsive backend when system is heavily loaded.
584 pg_usleep(10000);
589 * ResolveRecoveryConflictWithLock is called from ProcSleep()
590 * to resolve conflicts with other backends holding relation locks.
592 * The WaitLatch sleep normally done in ProcSleep()
593 * (when not InHotStandby) is performed here, for code clarity.
595 * We either resolve conflicts immediately or set a timeout to wake us at
596 * the limit of our patience.
598 * Resolve conflicts by canceling to all backends holding a conflicting
599 * lock. As we are already queued to be granted the lock, no new lock
600 * requests conflicting with ours will be granted in the meantime.
602 * We also must check for deadlocks involving the Startup process and
603 * hot-standby backend processes. If deadlock_timeout is reached in
604 * this function, all the backends holding the conflicting locks are
605 * requested to check themselves for deadlocks.
607 * logging_conflict should be true if the recovery conflict has not been
608 * logged yet even though logging is enabled. After deadlock_timeout is
609 * reached and the request for deadlock check is sent, we wait again to
610 * be signaled by the release of the lock if logging_conflict is false.
611 * Otherwise we return without waiting again so that the caller can report
612 * the recovery conflict. In this case, then, this function is called again
613 * with logging_conflict=false (because the recovery conflict has already
614 * been logged) and we will wait again for the lock to be released.
616 void
617 ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
619 TimestampTz ltime;
620 TimestampTz now;
622 Assert(InHotStandby);
624 ltime = GetStandbyLimitTime();
625 now = GetCurrentTimestamp();
628 * Update waitStart if first time through after the startup process
629 * started waiting for the lock. It should not be updated every time
630 * ResolveRecoveryConflictWithLock() is called during the wait.
632 * Use the current time obtained for comparison with ltime as waitStart
633 * (i.e., the time when this process started waiting for the lock). Since
634 * getting the current time newly can cause overhead, we reuse the
635 * already-obtained time to avoid that overhead.
637 * Note that waitStart is updated without holding the lock table's
638 * partition lock, to avoid the overhead by additional lock acquisition.
639 * This can cause "waitstart" in pg_locks to become NULL for a very short
640 * period of time after the wait started even though "granted" is false.
641 * This is OK in practice because we can assume that users are likely to
642 * look at "waitstart" when waiting for the lock for a long time.
644 if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
645 pg_atomic_write_u64(&MyProc->waitStart, now);
647 if (now >= ltime && ltime != 0)
650 * We're already behind, so clear a path as quickly as possible.
652 VirtualTransactionId *backends;
654 backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
657 * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
658 * "waiting" in PS display by disabling its argument report_waiting
659 * because the caller, WaitOnLock(), has already reported that.
661 ResolveRecoveryConflictWithVirtualXIDs(backends,
662 PROCSIG_RECOVERY_CONFLICT_LOCK,
663 PG_WAIT_LOCK | locktag.locktag_type,
664 false);
666 else
669 * Wait (or wait again) until ltime, and check for deadlocks as well
670 * if we will be waiting longer than deadlock_timeout
672 EnableTimeoutParams timeouts[2];
673 int cnt = 0;
675 if (ltime != 0)
677 got_standby_lock_timeout = false;
678 timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
679 timeouts[cnt].type = TMPARAM_AT;
680 timeouts[cnt].fin_time = ltime;
681 cnt++;
684 got_standby_deadlock_timeout = false;
685 timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
686 timeouts[cnt].type = TMPARAM_AFTER;
687 timeouts[cnt].delay_ms = DeadlockTimeout;
688 cnt++;
690 enable_timeouts(timeouts, cnt);
693 /* Wait to be signaled by the release of the Relation Lock */
694 ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
697 * Exit if ltime is reached. Then all the backends holding conflicting
698 * locks will be canceled in the next ResolveRecoveryConflictWithLock()
699 * call.
701 if (got_standby_lock_timeout)
702 goto cleanup;
704 if (got_standby_deadlock_timeout)
706 VirtualTransactionId *backends;
708 backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
710 /* Quick exit if there's no work to be done */
711 if (!VirtualTransactionIdIsValid(*backends))
712 goto cleanup;
715 * Send signals to all the backends holding the conflicting locks, to
716 * ask them to check themselves for deadlocks.
718 while (VirtualTransactionIdIsValid(*backends))
720 SignalVirtualTransaction(*backends,
721 PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
722 false);
723 backends++;
727 * Exit if the recovery conflict has not been logged yet even though
728 * logging is enabled, so that the caller can log that. Then
729 * RecoveryConflictWithLock() is called again and we will wait again
730 * for the lock to be released.
732 if (logging_conflict)
733 goto cleanup;
736 * Wait again here to be signaled by the release of the Relation Lock,
737 * to prevent the subsequent RecoveryConflictWithLock() from causing
738 * deadlock_timeout and sending a request for deadlocks check again.
739 * Otherwise the request continues to be sent every deadlock_timeout
740 * until the relation locks are released or ltime is reached.
742 got_standby_deadlock_timeout = false;
743 ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
746 cleanup:
749 * Clear any timeout requests established above. We assume here that the
750 * Startup process doesn't have any other outstanding timeouts than those
751 * used by this function. If that stops being true, we could cancel the
752 * timeouts individually, but that'd be slower.
754 disable_all_timeouts(false);
755 got_standby_lock_timeout = false;
756 got_standby_deadlock_timeout = false;
760 * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
761 * to resolve conflicts with other backends holding buffer pins.
763 * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
764 * (when not InHotStandby) is performed here, for code clarity.
766 * We either resolve conflicts immediately or set a timeout to wake us at
767 * the limit of our patience.
769 * Resolve conflicts by sending a PROCSIG signal to all backends to check if
770 * they hold one of the buffer pins that is blocking Startup process. If so,
771 * those backends will take an appropriate error action, ERROR or FATAL.
773 * We also must check for deadlocks. Deadlocks occur because if queries
774 * wait on a lock, that must be behind an AccessExclusiveLock, which can only
775 * be cleared if the Startup process replays a transaction completion record.
776 * If Startup process is also waiting then that is a deadlock. The deadlock
777 * can occur if the query is waiting and then the Startup sleeps, or if
778 * Startup is sleeping and the query waits on a lock. We protect against
779 * only the former sequence here, the latter sequence is checked prior to
780 * the query sleeping, in CheckRecoveryConflictDeadlock().
782 * Deadlocks are extremely rare, and relatively expensive to check for,
783 * so we don't do a deadlock check right away ... only if we have had to wait
784 * at least deadlock_timeout.
786 void
787 ResolveRecoveryConflictWithBufferPin(void)
789 TimestampTz ltime;
791 Assert(InHotStandby);
793 ltime = GetStandbyLimitTime();
795 if (GetCurrentTimestamp() >= ltime && ltime != 0)
798 * We're already behind, so clear a path as quickly as possible.
800 SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
802 else
805 * Wake up at ltime, and check for deadlocks as well if we will be
806 * waiting longer than deadlock_timeout
808 EnableTimeoutParams timeouts[2];
809 int cnt = 0;
811 if (ltime != 0)
813 timeouts[cnt].id = STANDBY_TIMEOUT;
814 timeouts[cnt].type = TMPARAM_AT;
815 timeouts[cnt].fin_time = ltime;
816 cnt++;
819 got_standby_deadlock_timeout = false;
820 timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
821 timeouts[cnt].type = TMPARAM_AFTER;
822 timeouts[cnt].delay_ms = DeadlockTimeout;
823 cnt++;
825 enable_timeouts(timeouts, cnt);
829 * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
830 * by one of the timeouts established above.
832 * We assume that only UnpinBuffer() and the timeout requests established
833 * above can wake us up here. WakeupRecovery() called by walreceiver or
834 * SIGHUP signal handler, etc cannot do that because it uses the different
835 * latch from that ProcWaitForSignal() waits on.
837 ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
839 if (got_standby_delay_timeout)
840 SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
841 else if (got_standby_deadlock_timeout)
844 * Send out a request for hot-standby backends to check themselves for
845 * deadlocks.
847 * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
848 * to be signaled by UnpinBuffer() again and send a request for
849 * deadlocks check if deadlock_timeout happens. This causes the
850 * request to continue to be sent every deadlock_timeout until the
851 * buffer is unpinned or ltime is reached. This would increase the
852 * workload in the startup process and backends. In practice it may
853 * not be so harmful because the period that the buffer is kept pinned
854 * is basically no so long. But we should fix this?
856 SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
860 * Clear any timeout requests established above. We assume here that the
861 * Startup process doesn't have any other timeouts than what this function
862 * uses. If that stops being true, we could cancel the timeouts
863 * individually, but that'd be slower.
865 disable_all_timeouts(false);
866 got_standby_delay_timeout = false;
867 got_standby_deadlock_timeout = false;
870 static void
871 SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
873 Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
874 reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
877 * We send signal to all backends to ask them if they are holding the
878 * buffer pin which is delaying the Startup process. We must not set the
879 * conflict flag yet, since most backends will be innocent. Let the
880 * SIGUSR1 handling in each backend decide their own fate.
882 CancelDBBackends(InvalidOid, reason, false);
886 * In Hot Standby perform early deadlock detection. We abort the lock
887 * wait if we are about to sleep while holding the buffer pin that Startup
888 * process is waiting for.
890 * Note: this code is pessimistic, because there is no way for it to
891 * determine whether an actual deadlock condition is present: the lock we
892 * need to wait for might be unrelated to any held by the Startup process.
893 * Sooner or later, this mechanism should get ripped out in favor of somehow
894 * accounting for buffer locks in DeadLockCheck(). However, errors here
895 * seem to be very low-probability in practice, so for now it's not worth
896 * the trouble.
898 void
899 CheckRecoveryConflictDeadlock(void)
901 Assert(!InRecovery); /* do not call in Startup process */
903 if (!HoldingBufferPinThatDelaysRecovery())
904 return;
907 * Error message should match ProcessInterrupts() but we avoid calling
908 * that because we aren't handling an interrupt at this point. Note that
909 * we only cancel the current transaction here, so if we are in a
910 * subtransaction and the pin is held by a parent, then the Startup
911 * process will continue to wait even though we have avoided deadlock.
913 ereport(ERROR,
914 (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
915 errmsg("canceling statement due to conflict with recovery"),
916 errdetail("User transaction caused buffer deadlock with recovery.")));
920 /* --------------------------------
921 * timeout handler routines
922 * --------------------------------
926 * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
927 * exceeded.
929 void
930 StandbyDeadLockHandler(void)
932 got_standby_deadlock_timeout = true;
936 * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
938 void
939 StandbyTimeoutHandler(void)
941 got_standby_delay_timeout = true;
945 * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
947 void
948 StandbyLockTimeoutHandler(void)
950 got_standby_lock_timeout = true;
954 * -----------------------------------------------------
955 * Locking in Recovery Mode
956 * -----------------------------------------------------
958 * All locks are held by the Startup process using a single virtual
959 * transaction. This implementation is both simpler and in some senses,
960 * more correct. The locks held mean "some original transaction held
961 * this lock, so query access is not allowed at this time". So the Startup
962 * process is the proxy by which the original locks are implemented.
964 * We only keep track of AccessExclusiveLocks, which are only ever held by
965 * one transaction on one relation.
967 * We keep a table of known locks in the RecoveryLockHash hash table.
968 * The point of that table is to let us efficiently de-duplicate locks,
969 * which is important because checkpoints will re-report the same locks
970 * already held. There is also a RecoveryLockXidHash table with one entry
971 * per xid, which allows us to efficiently find all the locks held by a
972 * given original transaction.
974 * We use session locks rather than normal locks so we don't need
975 * ResourceOwners.
979 void
980 StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
982 RecoveryLockXidEntry *xidentry;
983 RecoveryLockEntry *lockentry;
984 xl_standby_lock key;
985 LOCKTAG locktag;
986 bool found;
988 /* Already processed? */
989 if (!TransactionIdIsValid(xid) ||
990 TransactionIdDidCommit(xid) ||
991 TransactionIdDidAbort(xid))
992 return;
994 elog(trace_recovery(DEBUG4),
995 "adding recovery lock: db %u rel %u", dbOid, relOid);
997 /* dbOid is InvalidOid when we are locking a shared relation. */
998 Assert(OidIsValid(relOid));
1000 /* Create a hash entry for this xid, if we don't have one already. */
1001 xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
1002 if (!found)
1004 Assert(xidentry->xid == xid); /* dynahash should have set this */
1005 xidentry->head = NULL;
1008 /* Create a hash entry for this lock, unless we have one already. */
1009 key.xid = xid;
1010 key.dbOid = dbOid;
1011 key.relOid = relOid;
1012 lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
1013 if (!found)
1015 /* It's new, so link it into the XID's list ... */
1016 lockentry->next = xidentry->head;
1017 xidentry->head = lockentry;
1019 /* ... and acquire the lock locally. */
1020 SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
1022 (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
1027 * Release all the locks associated with this RecoveryLockXidEntry.
1029 static void
1030 StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
1032 RecoveryLockEntry *entry;
1033 RecoveryLockEntry *next;
1035 for (entry = xidentry->head; entry != NULL; entry = next)
1037 LOCKTAG locktag;
1039 elog(trace_recovery(DEBUG4),
1040 "releasing recovery lock: xid %u db %u rel %u",
1041 entry->key.xid, entry->key.dbOid, entry->key.relOid);
1042 /* Release the lock ... */
1043 SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
1044 if (!LockRelease(&locktag, AccessExclusiveLock, true))
1046 elog(LOG,
1047 "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
1048 entry->key.xid, entry->key.dbOid, entry->key.relOid);
1049 Assert(false);
1051 /* ... and remove the per-lock hash entry */
1052 next = entry->next;
1053 hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
1056 xidentry->head = NULL; /* just for paranoia */
1060 * Release locks for specific XID, or all locks if it's InvalidXid.
1062 static void
1063 StandbyReleaseLocks(TransactionId xid)
1065 RecoveryLockXidEntry *entry;
1067 if (TransactionIdIsValid(xid))
1069 if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
1071 StandbyReleaseXidEntryLocks(entry);
1072 hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1075 else
1076 StandbyReleaseAllLocks();
1080 * Release locks for a transaction tree, starting at xid down, from
1081 * RecoveryLockXidHash.
1083 * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
1084 * to remove any AccessExclusiveLocks requested by a transaction.
1086 void
1087 StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
1089 int i;
1091 StandbyReleaseLocks(xid);
1093 for (i = 0; i < nsubxids; i++)
1094 StandbyReleaseLocks(subxids[i]);
1098 * Called at end of recovery and when we see a shutdown checkpoint.
1100 void
1101 StandbyReleaseAllLocks(void)
1103 HASH_SEQ_STATUS status;
1104 RecoveryLockXidEntry *entry;
1106 elog(trace_recovery(DEBUG2), "release all standby locks");
1108 hash_seq_init(&status, RecoveryLockXidHash);
1109 while ((entry = hash_seq_search(&status)))
1111 StandbyReleaseXidEntryLocks(entry);
1112 hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1117 * StandbyReleaseOldLocks
1118 * Release standby locks held by top-level XIDs that aren't running,
1119 * as long as they're not prepared transactions.
1121 void
1122 StandbyReleaseOldLocks(TransactionId oldxid)
1124 HASH_SEQ_STATUS status;
1125 RecoveryLockXidEntry *entry;
1127 hash_seq_init(&status, RecoveryLockXidHash);
1128 while ((entry = hash_seq_search(&status)))
1130 Assert(TransactionIdIsValid(entry->xid));
1132 /* Skip if prepared transaction. */
1133 if (StandbyTransactionIdIsPrepared(entry->xid))
1134 continue;
1136 /* Skip if >= oldxid. */
1137 if (!TransactionIdPrecedes(entry->xid, oldxid))
1138 continue;
1140 /* Remove all locks and hash table entry. */
1141 StandbyReleaseXidEntryLocks(entry);
1142 hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1147 * --------------------------------------------------------------------
1148 * Recovery handling for Rmgr RM_STANDBY_ID
1150 * These record types will only be created if XLogStandbyInfoActive()
1151 * --------------------------------------------------------------------
1154 void
1155 standby_redo(XLogReaderState *record)
1157 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1159 /* Backup blocks are not used in standby records */
1160 Assert(!XLogRecHasAnyBlockRefs(record));
1162 /* Do nothing if we're not in hot standby mode */
1163 if (standbyState == STANDBY_DISABLED)
1164 return;
1166 if (info == XLOG_STANDBY_LOCK)
1168 xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
1169 int i;
1171 for (i = 0; i < xlrec->nlocks; i++)
1172 StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
1173 xlrec->locks[i].dbOid,
1174 xlrec->locks[i].relOid);
1176 else if (info == XLOG_RUNNING_XACTS)
1178 xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
1179 RunningTransactionsData running;
1181 running.xcnt = xlrec->xcnt;
1182 running.subxcnt = xlrec->subxcnt;
1183 running.subxid_overflow = xlrec->subxid_overflow;
1184 running.nextXid = xlrec->nextXid;
1185 running.latestCompletedXid = xlrec->latestCompletedXid;
1186 running.oldestRunningXid = xlrec->oldestRunningXid;
1187 running.xids = xlrec->xids;
1189 ProcArrayApplyRecoveryInfo(&running);
1191 else if (info == XLOG_INVALIDATIONS)
1193 xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
1195 ProcessCommittedInvalidationMessages(xlrec->msgs,
1196 xlrec->nmsgs,
1197 xlrec->relcacheInitFileInval,
1198 xlrec->dbId,
1199 xlrec->tsId);
1201 else
1202 elog(PANIC, "standby_redo: unknown op code %u", info);
1206 * Log details of the current snapshot to WAL. This allows the snapshot state
1207 * to be reconstructed on the standby and for logical decoding.
1209 * This is used for Hot Standby as follows:
1211 * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
1212 * start from a shutdown checkpoint because we know nothing was running
1213 * at that time and our recovery snapshot is known empty. In the more
1214 * typical case of an online checkpoint we need to jump through a few
1215 * hoops to get a correct recovery snapshot and this requires a two or
1216 * sometimes a three stage process.
1218 * The initial snapshot must contain all running xids and all current
1219 * AccessExclusiveLocks at a point in time on the standby. Assembling
1220 * that information while the server is running requires many and
1221 * various LWLocks, so we choose to derive that information piece by
1222 * piece and then re-assemble that info on the standby. When that
1223 * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
1225 * Since locking on the primary when we derive the information is not
1226 * strict, we note that there is a time window between the derivation and
1227 * writing to WAL of the derived information. That allows race conditions
1228 * that we must resolve, since xids and locks may enter or leave the
1229 * snapshot during that window. This creates the issue that an xid or
1230 * lock may start *after* the snapshot has been derived yet *before* the
1231 * snapshot is logged in the running xacts WAL record. We resolve this by
1232 * starting to accumulate changes at a point just prior to when we derive
1233 * the snapshot on the primary, then ignore duplicates when we later apply
1234 * the snapshot from the running xacts record. This is implemented during
1235 * CreateCheckPoint() where we use the logical checkpoint location as
1236 * our starting point and then write the running xacts record immediately
1237 * before writing the main checkpoint WAL record. Since we always start
1238 * up from a checkpoint and are immediately at our starting point, we
1239 * unconditionally move to STANDBY_INITIALIZED. After this point we
1240 * must do 4 things:
1241 * * move shared nextXid forwards as we see new xids
1242 * * extend the clog and subtrans with each new xid
1243 * * keep track of uncommitted known assigned xids
1244 * * keep track of uncommitted AccessExclusiveLocks
1246 * When we see a commit/abort we must remove known assigned xids and locks
1247 * from the completing transaction. Attempted removals that cannot locate
1248 * an entry are expected and must not cause an error when we are in state
1249 * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
1250 * KnownAssignedXidsRemove().
1252 * Later, when we apply the running xact data we must be careful to ignore
1253 * transactions already committed, since those commits raced ahead when
1254 * making WAL entries.
1256 * The loose timing also means that locks may be recorded that have a
1257 * zero xid, since xids are removed from procs before locks are removed.
1258 * So we must prune the lock list down to ensure we hold locks only for
1259 * currently running xids, performed by StandbyReleaseOldLocks().
1260 * Zero xids should no longer be possible, but we may be replaying WAL
1261 * from a time when they were possible.
1263 * For logical decoding only the running xacts information is needed;
1264 * there's no need to look at the locking information, but it's logged anyway,
1265 * as there's no independent knob to just enable logical decoding. For
1266 * details of how this is used, check snapbuild.c's introductory comment.
1269 * Returns the RecPtr of the last inserted record.
XLogRecPtr
LogStandbySnapshot(void)
{
	XLogRecPtr	recptr;
	RunningTransactions running;
	xl_standby_lock *locks;
	int			nlocks;

	/* Only called when standby info is being WAL-logged at all. */
	Assert(XLogStandbyInfoActive());

	/*
	 * Get details of any AccessExclusiveLocks being held at the moment.
	 */
	locks = GetRunningTransactionLocks(&nlocks);
	if (nlocks > 0)
		LogAccessExclusiveLocks(nlocks, locks);
	pfree(locks);

	/*
	 * Log details of all in-progress transactions. This should be the last
	 * record we write, because standby will open up when it sees this.
	 */
	running = GetRunningTransactionData();

	/*
	 * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
	 * For Hot Standby this can be done before inserting the WAL record
	 * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
	 * the clog. For logical decoding, though, the lock can't be released
	 * early because the clog might be "in the future" from the POV of the
	 * historic snapshot. This would allow for situations where we're waiting
	 * for the end of a transaction listed in the xl_running_xacts record
	 * which, according to the WAL, has committed before the xl_running_xacts
	 * record. Fortunately this routine isn't executed frequently, and it's
	 * only a shared lock.
	 */
	if (wal_level < WAL_LEVEL_LOGICAL)
		LWLockRelease(ProcArrayLock);

	recptr = LogCurrentRunningXacts(running);

	/* Release lock if we kept it longer ... */
	if (wal_level >= WAL_LEVEL_LOGICAL)
		LWLockRelease(ProcArrayLock);

	/* GetRunningTransactionData() acquired XidGenLock, we must release it */
	LWLockRelease(XidGenLock);

	/* LSN of the xl_running_xacts record, the last one inserted here. */
	return recptr;
}
1323 * Record an enhanced snapshot of running transactions into WAL.
1325 * The definitions of RunningTransactionsData and xl_running_xacts are
1326 * similar. We keep them separate because xl_running_xacts is a contiguous
1327 * chunk of memory and never exists fully until it is assembled in WAL.
1328 * The inserted records are marked as not being important for durability,
1329 * to avoid triggering superfluous checkpoint / archiving activity.
static XLogRecPtr
LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
{
	xl_running_xacts xlrec;
	XLogRecPtr	recptr;

	/* Copy counts and xid horizons into the fixed-size record header. */
	xlrec.xcnt = CurrRunningXacts->xcnt;
	xlrec.subxcnt = CurrRunningXacts->subxcnt;
	xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
	xlrec.nextXid = CurrRunningXacts->nextXid;
	xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
	xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;

	/* Header */
	XLogBeginInsert();
	XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);	/* see function header comment */
	XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts);

	/*
	 * array of TransactionIds: top-level xids and subxids are registered as
	 * one contiguous array of (xcnt + subxcnt) entries.  NOTE(review): the
	 * xcnt > 0 guard assumes subxcnt is zero whenever xcnt is zero --
	 * presumably guaranteed by GetRunningTransactionData(); confirm there.
	 */
	if (xlrec.xcnt > 0)
		XLogRegisterData((char *) CurrRunningXacts->xids,
						 (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));

	recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);

	/* Trace the snapshot; wording differs when the subxid array overflowed. */
	if (CurrRunningXacts->subxid_overflow)
		elog(trace_recovery(DEBUG2),
			 "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
			 CurrRunningXacts->xcnt,
			 LSN_FORMAT_ARGS(recptr),
			 CurrRunningXacts->oldestRunningXid,
			 CurrRunningXacts->latestCompletedXid,
			 CurrRunningXacts->nextXid);
	else
		elog(trace_recovery(DEBUG2),
			 "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
			 CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
			 LSN_FORMAT_ARGS(recptr),
			 CurrRunningXacts->oldestRunningXid,
			 CurrRunningXacts->latestCompletedXid,
			 CurrRunningXacts->nextXid);

	/*
	 * Ensure running_xacts information is synced to disk not too far in the
	 * future. We don't want to stall anything though (i.e. use XLogFlush()),
	 * so we let the wal writer do it during normal operation.
	 * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
	 * and nudge the WALWriter into action if sleeping. Check
	 * XLogBackgroundFlush() for details why a record might not be flushed
	 * without it.
	 */
	XLogSetAsyncXactLSN(recptr);

	return recptr;
}
1388 * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
1389 * logged, as described in backend/storage/lmgr/README.
1391 static void
1392 LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
1394 xl_standby_locks xlrec;
1396 xlrec.nlocks = nlocks;
1398 XLogBeginInsert();
1399 XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks));
1400 XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
1401 XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
1403 (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
1407 * Individual logging of AccessExclusiveLocks for use during LockAcquire()
1409 void
1410 LogAccessExclusiveLock(Oid dbOid, Oid relOid)
1412 xl_standby_lock xlrec;
1414 xlrec.xid = GetCurrentTransactionId();
1416 xlrec.dbOid = dbOid;
1417 xlrec.relOid = relOid;
1419 LogAccessExclusiveLocks(1, &xlrec);
1420 MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
1424 * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
void
LogAccessExclusiveLockPrepare(void)
{
	/*
	 * Ensure that a TransactionId has been assigned to this transaction, for
	 * two reasons, both related to lock release on the standby. First, we
	 * must assign an xid so that RecordTransactionCommit() and
	 * RecordTransactionAbort() do not optimise away the transaction
	 * completion record which recovery relies upon to release locks. It's a
	 * hack, but for a corner case not worth adding code for into the main
	 * commit path. Second, we must assign an xid before the lock is recorded
	 * in shared memory, otherwise a concurrently executing
	 * GetRunningTransactionLocks() might see a lock associated with an
	 * InvalidTransactionId which we later assert cannot happen.
	 *
	 * The return value is deliberately discarded; only the side effect of
	 * xid assignment is wanted here.
	 */
	(void) GetCurrentTransactionId();
}
1445 * Emit WAL for invalidations. This currently is only used for commits without
1446 * an xid but which contain invalidations.
1448 void
1449 LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
1450 bool relcacheInitFileInval)
1452 xl_invalidations xlrec;
1454 /* prepare record */
1455 memset(&xlrec, 0, sizeof(xlrec));
1456 xlrec.dbId = MyDatabaseId;
1457 xlrec.tsId = MyDatabaseTableSpace;
1458 xlrec.relcacheInitFileInval = relcacheInitFileInval;
1459 xlrec.nmsgs = nmsgs;
1461 /* perform insertion */
1462 XLogBeginInsert();
1463 XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations);
1464 XLogRegisterData((char *) msgs,
1465 nmsgs * sizeof(SharedInvalidationMessage));
1466 XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
1469 /* Return the description of recovery conflict */
1470 static const char *
1471 get_recovery_conflict_desc(ProcSignalReason reason)
1473 const char *reasonDesc = _("unknown reason");
1475 switch (reason)
1477 case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
1478 reasonDesc = _("recovery conflict on buffer pin");
1479 break;
1480 case PROCSIG_RECOVERY_CONFLICT_LOCK:
1481 reasonDesc = _("recovery conflict on lock");
1482 break;
1483 case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
1484 reasonDesc = _("recovery conflict on tablespace");
1485 break;
1486 case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
1487 reasonDesc = _("recovery conflict on snapshot");
1488 break;
1489 case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
1490 reasonDesc = _("recovery conflict on buffer deadlock");
1491 break;
1492 case PROCSIG_RECOVERY_CONFLICT_DATABASE:
1493 reasonDesc = _("recovery conflict on database");
1494 break;
1495 default:
1496 break;
1499 return reasonDesc;