src/backend/postmaster/checkpointer.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * checkpointer.c
   4  *
   5  * The checkpointer is new as of Postgres 9.2.  It handles all checkpoints.
   6  * Checkpoints are automatically dispatched after a certain amount of time has
   7  * elapsed since the last one, and it can be signaled to perform requested
   8  * checkpoints as well.  (The GUC parameter that mandates a checkpoint every
   9  * so many WAL segments is implemented by having backends signal when they
  10  * fill WAL segments; the checkpointer itself doesn't watch for the
  11  * condition.)
  12  *
  13  * Normal termination is by SIGUSR2, which instructs the checkpointer to
  14  * execute a shutdown checkpoint and then exit(0).  (All backends must be
  15  * stopped before SIGUSR2 is issued!)  Emergency termination is by SIGQUIT;
  16  * like any backend, the checkpointer will simply abort and exit on SIGQUIT.
  17  *
  18  * If the checkpointer exits unexpectedly, the postmaster treats that the same
  19  * as a backend crash: shared memory may be corrupted, so remaining backends
  20  * should be killed by SIGQUIT and then a recovery cycle started.  (Even if
  21  * shared memory isn't corrupted, we have lost information about which
  22  * files need to be fsync'd for the next checkpoint, and so a system
  23  * restart needs to be forced.)
  24  *
  25  *
  26  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
  27  *
  28  *
  29  * IDENTIFICATION
  30  *        src/backend/postmaster/checkpointer.c
  31  *
  32  *-------------------------------------------------------------------------
  33  */
  34 #include "postgres.h"
  35
  36 #include <sys/time.h>
  37 #include <time.h>
  38
  39 #include "access/xlog.h"
  40 #include "access/xlog_internal.h"
  41 #include "access/xlogrecovery.h"
  42 #include "libpq/pqsignal.h"
  43 #include "miscadmin.h"
  44 #include "pgstat.h"
  45 #include "postmaster/auxprocess.h"
  46 #include "postmaster/bgwriter.h"
  47 #include "postmaster/interrupt.h"
  48 #include "replication/syncrep.h"
  49 #include "storage/bufmgr.h"
  50 #include "storage/condition_variable.h"
  51 #include "storage/fd.h"
  52 #include "storage/ipc.h"
  53 #include "storage/lwlock.h"
  54 #include "storage/proc.h"
  55 #include "storage/procsignal.h"
  56 #include "storage/shmem.h"
  57 #include "storage/smgr.h"
  58 #include "storage/spin.h"
  59 #include "utils/guc.h"
  60 #include "utils/memutils.h"
  61 #include "utils/resowner.h"
  62
  63
  64 /*----------
  65  * Shared memory area for communication between checkpointer and backends
  66  *
  67  * The ckpt counters allow backends to watch for completion of a checkpoint
  68  * request they send.  Here's how it works:
  69  *      * At start of a checkpoint, checkpointer reads (and clears) the request
  70  *        flags and increments ckpt_started, while holding ckpt_lck.
  71  *      * On completion of a checkpoint, checkpointer sets ckpt_done to
  72  *        equal ckpt_started.
  73  *      * On failure of a checkpoint, checkpointer increments ckpt_failed
  74  *        and sets ckpt_done to equal ckpt_started.
  75  *
  76  * The algorithm for backends is:
  77  *      1. Record current values of ckpt_failed and ckpt_started, and
  78  *         set request flags, while holding ckpt_lck.
  79  *      2. Send signal to request checkpoint.
  80  *      3. Sleep until ckpt_started changes.  Now you know a checkpoint has
  81  *         begun since you started this algorithm (although *not* that it was
  82  *         specifically initiated by your signal), and that it is using your flags.
  83  *      4. Record new value of ckpt_started.
  84  *      5. Sleep until ckpt_done >= saved value of ckpt_started.  (Use modulo
  85  *         arithmetic here in case counters wrap around.)  Now you know a
  86  *         checkpoint has started and completed, but not whether it was
  87  *         successful.
  88  *      6. If ckpt_failed is different from the originally saved value,
  89  *         assume request failed; otherwise it was definitely successful.
  90  *
  91  * ckpt_flags holds the OR of the checkpoint request flags sent by all
  92  * requesting backends since the last checkpoint start.  The flags are
  93  * chosen so that OR'ing is the correct way to combine multiple requests.
  94  *
  95  * The requests array holds fsync requests sent by backends and not yet
  96  * absorbed by the checkpointer.
  97  *
  98  * Unlike the checkpoint fields, requests related fields are protected by
  99  * CheckpointerCommLock.
 100  *----------
 101  */
/*
 * One entry of the shared requests array: a sync request that a backend has
 * queued and the checkpointer has not yet absorbed.
 */
typedef struct
{
        SyncRequestType type;           /* request type */
        FileTag         ftag;                   /* file identifier */
} CheckpointerRequest;
 107
/*
 * Shared-memory area used by backends and the checkpointer to communicate.
 *
 * Two independent groups of fields live here:
 *      * the ckpt_* counters and flags, protected by the ckpt_lck spinlock
 *        (the checkpointer main loop additionally peeks at ckpt_flags without
 *        the lock, merely to test whether it is nonzero);
 *      * num_requests, max_requests and the requests array, which are
 *        protected by CheckpointerCommLock instead.
 */
typedef struct
{
        pid_t           checkpointer_pid;       /* PID (0 if not started) */

        slock_t         ckpt_lck;               /* protects all the ckpt_* fields */

        int                     ckpt_started;   /* advances when checkpoint starts */
        int                     ckpt_done;              /* advances when checkpoint done */
        int                     ckpt_failed;    /* advances when checkpoint fails */

        int                     ckpt_flags;             /* checkpoint flags, as defined in xlog.h */

        ConditionVariable start_cv; /* signaled when ckpt_started advances */
        ConditionVariable done_cv;      /* signaled when ckpt_done advances */

        int                     num_requests;   /* current # of requests */
        int                     max_requests;   /* allocated array size */
        /* variable-length tail; max_requests records its allocated length */
        CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER];
} CheckpointerShmemStruct;
 127
/* Pointer to the shared-memory communication area described above */
static CheckpointerShmemStruct *CheckpointerShmem;

/* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */
#define WRITES_PER_ABSORB               1000

/*
 * GUC parameters
 *
 * CheckPointTimeout and CheckPointWarning are both compared against a number
 * of elapsed wall-clock seconds in the checkpointer main loop, so they are
 * expressed in seconds.
 *
 * NOTE(review): CheckPointCompletionTarget presumably backs the
 * checkpoint_completion_target setting that CheckpointWriteDelay() throttles
 * towards -- confirm against the GUC tables.
 */
int                     CheckPointTimeout = 300;
int                     CheckPointWarning = 30;
double          CheckPointCompletionTarget = 0.9;

/*
 * Private state
 */

/*
 * True from just before a checkpoint or restartpoint is started until it has
 * finished; also cleared by the error recovery path in CheckpointerMain.
 */
static bool ckpt_active = false;

/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;       /* time() value at checkpoint start */
static XLogRecPtr ckpt_start_recptr;    /* replay (restartpoint) or insert
                                                                                 * (checkpoint) position at start */
static double ckpt_cached_elapsed;      /* reset to 0 at checkpoint start */

/*
 * last_checkpoint_time is normally the start time of the most recent
 * checkpoint; after a restartpoint that could not be performed it is instead
 * set so that the next attempt happens 15 seconds later.
 * last_xlog_switch_time is the last time an xlog switch was performed or
 * requested.
 */
static pg_time_t last_checkpoint_time;
static pg_time_t last_xlog_switch_time;

/* Prototypes for private functions */

static void HandleCheckpointerInterrupts(void);
static void CheckArchiveTimeout(void);
static bool IsCheckpointOnSchedule(double progress);
static bool ImmediateCheckpointRequested(void);
static bool CompactCheckpointerRequestQueue(void);
static void UpdateSharedMemoryConfig(void);

/* Signal handlers */
static void ReqCheckpointHandler(SIGNAL_ARGS);
 164
 165
/*
 * Main entry point for checkpointer process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 *
 * startup_data / startup_data_len: the checkpointer expects no startup data;
 * the length is asserted to be zero and the buffer is never read.
 *
 * After the one-time setup this function runs an endless loop and never
 * returns to its caller.  The normal way out is the proc_exit(0) performed by
 * HandleCheckpointerInterrupts() once a shutdown has been requested.  An
 * ereport(ERROR) raised inside the loop lands in the sigsetjmp block below,
 * which cleans up and then falls through into the loop again.
 */
void
CheckpointerMain(char *startup_data, size_t startup_data_len)
{
        sigjmp_buf      local_sigjmp_buf;
        MemoryContext checkpointer_context;

        /* No startup data is passed to the checkpointer. */
        Assert(startup_data_len == 0);

        MyBackendType = B_CHECKPOINTER;
        AuxiliaryProcessMainCommon();

        /* Publish our PID in shared memory (the field is 0 if not started). */
        CheckpointerShmem->checkpointer_pid = MyProcPid;

        /*
         * Properly accept or ignore signals the postmaster might send us
         *
         * Note: we deliberately ignore SIGTERM, because during a standard Unix
         * system shutdown cycle, init will SIGTERM all processes at once.  We
         * want to wait for the backends to exit, whereupon the postmaster will
         * tell us it's okay to shut down (via SIGUSR2).
         */
        pqsignal(SIGHUP, SignalHandlerForConfigReload);
        pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */
        pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
        /* SIGQUIT handler was already set up by InitPostmasterChild */
        pqsignal(SIGALRM, SIG_IGN);
        pqsignal(SIGPIPE, SIG_IGN);
        pqsignal(SIGUSR1, procsignal_sigusr1_handler);
        pqsignal(SIGUSR2, SignalHandlerForShutdownRequest);

        /*
         * Reset some signals that are accepted by postmaster but not here
         */
        pqsignal(SIGCHLD, SIG_DFL);

        /*
         * Initialize so that first time-driven event happens at the correct time.
         */
        last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);

        /*
         * Write out stats after shutdown. This needs to be called by exactly one
         * process during a normal shutdown, and since checkpointer is shut down
         * very late...
         *
         * Walsenders are shut down after the checkpointer, but currently don't
         * report stats. If that changes, we need a more complicated solution.
         */
        before_shmem_exit(pgstat_before_server_shutdown, 0);

        /*
         * Create a memory context that we will do all our work in.  We do this so
         * that we can reset the context during error recovery and thereby avoid
         * possible memory leaks.  Formerly this code just ran in
         * TopMemoryContext, but resetting that would be a really bad idea.
         */
        checkpointer_context = AllocSetContextCreate(TopMemoryContext,
                                                                                                 "Checkpointer",
                                                                                                 ALLOCSET_DEFAULT_SIZES);
        MemoryContextSwitchTo(checkpointer_context);

        /*
         * If an exception is encountered, processing resumes here.
         *
         * You might wonder why this isn't coded as an infinite loop around a
         * PG_TRY construct.  The reason is that this is the bottom of the
         * exception stack, and so with PG_TRY there would be no exception handler
         * in force at all during the CATCH part.  By leaving the outermost setjmp
         * always active, we have at least some chance of recovering from an error
         * during error recovery.  (If we get into an infinite loop thereby, it
         * will soon be stopped by overflow of elog.c's internal state stack.)
         *
         * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask
         * (to wit, BlockSig) will be restored when longjmp'ing to here.  Thus,
         * signals other than SIGQUIT will be blocked until we complete error
         * recovery.  It might seem that this policy makes the HOLD_INTERRUPTS()
         * call redundant, but it is not since InterruptPending might be set
         * already.
         */
        if (sigsetjmp(local_sigjmp_buf, 1) != 0)
        {
                /* Since not using PG_TRY, must reset error stack by hand */
                error_context_stack = NULL;

                /* Prevent interrupts while cleaning up */
                HOLD_INTERRUPTS();

                /* Report the error to the server log */
                EmitErrorReport();

                /*
                 * These operations are really just a minimal subset of
                 * AbortTransaction().  We don't have very many resources to worry
                 * about in checkpointer, but we do have LWLocks, buffers, and temp
                 * files.
                 */
                LWLockReleaseAll();
                ConditionVariableCancelSleep();
                pgstat_report_wait_end();
                UnlockBuffers();
                ReleaseAuxProcessResources(false);
                AtEOXact_Buffers(false);
                AtEOXact_SMgr();
                AtEOXact_Files(false);
                AtEOXact_HashTables(false);

                /*
                 * Warn any waiting backends that the checkpoint failed.  Bumping
                 * ckpt_failed and setting ckpt_done = ckpt_started under the
                 * spinlock is the "failure" step of the protocol described at the
                 * top of this file.
                 */
                if (ckpt_active)
                {
                        SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
                        CheckpointerShmem->ckpt_failed++;
                        CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
                        SpinLockRelease(&CheckpointerShmem->ckpt_lck);

                        ConditionVariableBroadcast(&CheckpointerShmem->done_cv);

                        ckpt_active = false;
                }

                /*
                 * Now return to normal top-level context and clear ErrorContext for
                 * next time.
                 */
                MemoryContextSwitchTo(checkpointer_context);
                FlushErrorState();

                /* Flush any leaked data in the top-level context */
                MemoryContextReset(checkpointer_context);

                /* Now we can allow interrupts again */
                RESUME_INTERRUPTS();

                /*
                 * Sleep at least 1 second after any error.  A write error is likely
                 * to be repeated, and we don't want to be filling the error logs as
                 * fast as we can.
                 */
                pg_usleep(1000000L);
        }

        /* We can now handle ereport(ERROR) */
        PG_exception_stack = &local_sigjmp_buf;

        /*
         * Unblock signals (they were blocked when the postmaster forked us)
         */
        sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);

        /*
         * Ensure all shared memory values are set correctly for the config. Doing
         * this here ensures no race conditions from other concurrent updaters.
         */
        UpdateSharedMemoryConfig();

        /*
         * Advertise our latch that backends can use to wake us up while we're
         * sleeping.
         */
        ProcGlobal->checkpointerLatch = &MyProc->procLatch;

        /*
         * Loop forever
         */
        for (;;)
        {
                /*
                 * Per-iteration state.  chkpt_or_rstpt_requested and
                 * chkpt_or_rstpt_timed only record why this iteration decided to
                 * checkpoint, so that the right statistics counter can be bumped.
                 */
                bool            do_checkpoint = false;
                int                     flags = 0;
                pg_time_t       now;
                int                     elapsed_secs;
                int                     cur_timeout;
                bool            chkpt_or_rstpt_requested = false;
                bool            chkpt_or_rstpt_timed = false;

                /* Clear any already-pending wakeups */
                ResetLatch(MyLatch);

                /*
                 * Process any requests or signals received recently.
                 */
                AbsorbSyncRequests();
                HandleCheckpointerInterrupts();

                /*
                 * Detect a pending checkpoint request by checking whether the flags
                 * word in shared memory is nonzero.  We shouldn't need to acquire the
                 * ckpt_lck for this.
                 */
                if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
                {
                        do_checkpoint = true;
                        chkpt_or_rstpt_requested = true;
                }

                /*
                 * Force a checkpoint if too much time has elapsed since the last one.
                 * Note that we count a timed checkpoint in stats only when this
                 * occurs without an external request, but we set the CAUSE_TIME flag
                 * bit even if there is also an external request.
                 */
                now = (pg_time_t) time(NULL);
                elapsed_secs = now - last_checkpoint_time;
                if (elapsed_secs >= CheckPointTimeout)
                {
                        if (!do_checkpoint)
                                chkpt_or_rstpt_timed = true;
                        do_checkpoint = true;
                        flags |= CHECKPOINT_CAUSE_TIME;
                }

                /*
                 * Do a checkpoint if requested.
                 */
                if (do_checkpoint)
                {
                        bool            ckpt_performed = false;
                        bool            do_restartpoint;

                        /* Check if we should perform a checkpoint or a restartpoint. */
                        do_restartpoint = RecoveryInProgress();

                        /*
                         * Atomically fetch the request flags to figure out what kind of a
                         * checkpoint we should perform, and increase the started-counter
                         * to acknowledge that we've started a new checkpoint.
                         */
                        SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
                        flags |= CheckpointerShmem->ckpt_flags;
                        CheckpointerShmem->ckpt_flags = 0;
                        CheckpointerShmem->ckpt_started++;
                        SpinLockRelease(&CheckpointerShmem->ckpt_lck);

                        /* Wake backends that are waiting for ckpt_started to advance. */
                        ConditionVariableBroadcast(&CheckpointerShmem->start_cv);

                        /*
                         * The end-of-recovery checkpoint is a real checkpoint that's
                         * performed while we're still in recovery.
                         */
                        if (flags & CHECKPOINT_END_OF_RECOVERY)
                                do_restartpoint = false;

                        /*
                         * Count this cycle in the statistics.  Exactly one of the two
                         * cause flags is set here: chkpt_or_rstpt_timed is set only when
                         * no request flags had been seen at the top of this iteration.
                         */
                        if (chkpt_or_rstpt_timed)
                        {
                                chkpt_or_rstpt_timed = false;
                                if (do_restartpoint)
                                        PendingCheckpointerStats.restartpoints_timed++;
                                else
                                        PendingCheckpointerStats.num_timed++;
                        }

                        if (chkpt_or_rstpt_requested)
                        {
                                chkpt_or_rstpt_requested = false;
                                if (do_restartpoint)
                                        PendingCheckpointerStats.restartpoints_requested++;
                                else
                                        PendingCheckpointerStats.num_requested++;
                        }

                        /*
                         * We will warn if (a) too soon since last checkpoint (whatever
                         * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
                         * since the last checkpoint start.  Note in particular that this
                         * implementation will not generate warnings caused by
                         * CheckPointTimeout < CheckPointWarning.
                         */
                        if (!do_restartpoint &&
                                (flags & CHECKPOINT_CAUSE_XLOG) &&
                                elapsed_secs < CheckPointWarning)
                                ereport(LOG,
                                                (errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
                                                                           "checkpoints are occurring too frequently (%d seconds apart)",
                                                                           elapsed_secs,
                                                                           elapsed_secs),
                                                 errhint("Consider increasing the configuration parameter \"%s\".", "max_wal_size")));

                        /*
                         * Initialize checkpointer-private variables used during
                         * checkpoint.
                         */
                        ckpt_active = true;
                        if (do_restartpoint)
                                ckpt_start_recptr = GetXLogReplayRecPtr(NULL);
                        else
                                ckpt_start_recptr = GetInsertRecPtr();
                        ckpt_start_time = now;
                        ckpt_cached_elapsed = 0;

                        /*
                         * Do the checkpoint.
                         */
                        if (!do_restartpoint)
                                ckpt_performed = CreateCheckPoint(flags);
                        else
                                ckpt_performed = CreateRestartPoint(flags);

                        /*
                         * After any checkpoint, free all smgr objects.  Otherwise we
                         * would never do so for dropped relations, as the checkpointer
                         * does not process shared invalidation messages or call
                         * AtEOXact_SMgr().
                         */
                        smgrdestroyall();

                        /*
                         * Indicate checkpoint completion to any waiting backends.
                         */
                        SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
                        CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
                        SpinLockRelease(&CheckpointerShmem->ckpt_lck);

                        ConditionVariableBroadcast(&CheckpointerShmem->done_cv);

                        if (!do_restartpoint)
                        {
                                /*
                                 * Note we record the checkpoint start time not end time as
                                 * last_checkpoint_time.  This is so that time-driven
                                 * checkpoints happen at a predictable spacing.
                                 */
                                last_checkpoint_time = now;

                                if (ckpt_performed)
                                        PendingCheckpointerStats.num_performed++;
                        }
                        else
                        {
                                if (ckpt_performed)
                                {
                                        /*
                                         * The same as for checkpoint. Please see the
                                         * corresponding comment.
                                         */
                                        last_checkpoint_time = now;

                                        PendingCheckpointerStats.restartpoints_performed++;
                                }
                                else
                                {
                                        /*
                                         * We were not able to perform the restartpoint
                                         * (checkpoints throw an ERROR in case of error).  Most
                                         * likely because we have not received any new checkpoint
                                         * WAL records since the last restartpoint. Try again in
                                         * 15 s.
                                         */
                                        last_checkpoint_time = now - CheckPointTimeout + 15;
                                }
                        }

                        ckpt_active = false;

                        /* We may have received an interrupt during the checkpoint. */
                        HandleCheckpointerInterrupts();
                }

                /* Check for archive_timeout and switch xlog files if necessary. */
                CheckArchiveTimeout();

                /* Report pending statistics to the cumulative stats system */
                pgstat_report_checkpointer();
                pgstat_report_wal(true);

                /*
                 * If any checkpoint flags have been set, redo the loop to handle the
                 * checkpoint without sleeping.
                 */
                if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
                        continue;

                /*
                 * Sleep until we are signaled or it's time for another checkpoint or
                 * xlog file switch.
                 */
                now = (pg_time_t) time(NULL);
                elapsed_secs = now - last_checkpoint_time;
                if (elapsed_secs >= CheckPointTimeout)
                        continue;                       /* no sleep for us ... */
                /* Seconds left until the next timed checkpoint is due */
                cur_timeout = CheckPointTimeout - elapsed_secs;
                if (XLogArchiveTimeout > 0 && !RecoveryInProgress())
                {
                        elapsed_secs = now - last_xlog_switch_time;
                        if (elapsed_secs >= XLogArchiveTimeout)
                                continue;               /* no sleep for us ... */
                        /* Wake up for the next forced xlog switch if that comes first */
                        cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
                }

                (void) WaitLatch(MyLatch,
                                                 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                                                 cur_timeout * 1000L /* convert to ms */ ,
                                                 WAIT_EVENT_CHECKPOINTER_MAIN);
        }
}
 564
 565 /*
 566  * Process any new interrupts.
 567  */
 568 static void
 569 HandleCheckpointerInterrupts(void)
 570 {
 571         if (ProcSignalBarrierPending)
 572                 ProcessProcSignalBarrier();
 573
 574         if (ConfigReloadPending)
 575         {
 576                 ConfigReloadPending = false;
 577                 ProcessConfigFile(PGC_SIGHUP);
 578
 579                 /*
 580                  * Checkpointer is the last process to shut down, so we ask it to hold
 581                  * the keys for a range of other tasks required most of which have
 582                  * nothing to do with checkpointing at all.
 583                  *
 584                  * For various reasons, some config values can change dynamically so
 585                  * the primary copy of them is held in shared memory to make sure all
 586                  * backends see the same value.  We make Checkpointer responsible for
 587                  * updating the shared memory copy if the parameter setting changes
 588                  * because of SIGHUP.
 589                  */
 590                 UpdateSharedMemoryConfig();
 591         }
 592         if (ShutdownRequestPending)
 593         {
 594                 /*
 595                  * From here on, elog(ERROR) should end with exit(1), not send control
 596                  * back to the sigsetjmp block above
 597                  */
 598                 ExitOnAnyError = true;
 599
 600                 /*
 601                  * Close down the database.
 602                  *
 603                  * Since ShutdownXLOG() creates restartpoint or checkpoint, and
 604                  * updates the statistics, increment the checkpoint request and flush
 605                  * out pending statistic.
 606                  */
 607                 PendingCheckpointerStats.num_requested++;
 608                 ShutdownXLOG(0, 0);
 609                 pgstat_report_checkpointer();
 610                 pgstat_report_wal(true);
 611
 612                 /* Normal exit from the checkpointer is here */
 613                 proc_exit(0);                   /* done */
 614         }
 615
 616         /* Perform logging of memory contexts of this process */
 617         if (LogMemoryContextPending)
 618                 ProcessLogMemoryContextInterrupt();
 619 }
 620
 621 /*
 622  * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
 623  *
 624  * This will switch to a new WAL file and force an archive file write if
 625  * meaningful activity is recorded in the current WAL file. This includes most
 626  * writes, including just a single checkpoint record, but excludes WAL records
 627  * that were inserted with the XLOG_MARK_UNIMPORTANT flag being set (like
 628  * snapshots of running transactions).  Such records, depending on
 629  * configuration, occur on regular intervals and don't contain important
 630  * information.  This avoids generating archives with a few unimportant
 631  * records.
 632  */
 633 static void
 634 CheckArchiveTimeout(void)
 635 {
 636         pg_time_t       now;
 637         pg_time_t       last_time;
 638         XLogRecPtr      last_switch_lsn;
 639
 640         if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
 641                 return;
 642
 643         now = (pg_time_t) time(NULL);
 644
 645         /* First we do a quick check using possibly-stale local state. */
 646         if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
 647                 return;
 648
 649         /*
 650          * Update local state ... note that last_xlog_switch_time is the last time
 651          * a switch was performed *or requested*.
 652          */
 653         last_time = GetLastSegSwitchData(&last_switch_lsn);
 654
 655         last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
 656
 657         /* Now we can do the real checks */
 658         if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
 659         {
 660                 /*
 661                  * Switch segment only when "important" WAL has been logged since the
 662                  * last segment switch (last_switch_lsn points to end of segment
 663                  * switch occurred in).
 664                  */
 665                 if (GetLastImportantRecPtr() > last_switch_lsn)
 666                 {
 667                         XLogRecPtr      switchpoint;
 668
 669                         /* mark switch as unimportant, avoids triggering checkpoints */
 670                         switchpoint = RequestXLogSwitch(true);
 671
 672                         /*
 673                          * If the returned pointer points exactly to a segment boundary,
 674                          * assume nothing happened.
 675                          */
 676                         if (XLogSegmentOffset(switchpoint, wal_segment_size) != 0)
 677                                 elog(DEBUG1, "write-ahead log switch forced (\"archive_timeout\"=%d)",
 678                                          XLogArchiveTimeout);
 679                 }
 680
 681                 /*
 682                  * Update state in any case, so we don't retry constantly when the
 683                  * system is idle.
 684                  */
 685                 last_xlog_switch_time = now;
 686         }
 687 }
 688
 689 /*
 690  * Returns true if an immediate checkpoint request is pending.  (Note that
 691  * this does not check the *current* checkpoint's IMMEDIATE flag, but whether
 692  * there is one pending behind it.)
 693  */
 694 static bool
 695 ImmediateCheckpointRequested(void)
 696 {
 697         volatile CheckpointerShmemStruct *cps = CheckpointerShmem;
 698
 699         /*
 700          * We don't need to acquire the ckpt_lck in this case because we're only
 701          * looking at a single flag bit.
 702          */
 703         if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE)
 704                 return true;
 705         return false;
 706 }
 707
 708 /*
 709  * CheckpointWriteDelay -- control rate of checkpoint
 710  *
 711  * This function is called after each page write performed by BufferSync().
 712  * It is responsible for throttling BufferSync()'s write rate to hit
 713  * checkpoint_completion_target.
 714  *
 715  * The checkpoint request flags should be passed in; currently the only one
 716  * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
 717  *
 718  * 'progress' is an estimate of how much of the work has been done, as a
 719  * fraction between 0.0 meaning none, and 1.0 meaning all done.
 720  */
 721 void
 722 CheckpointWriteDelay(int flags, double progress)
 723 {
 724         static int      absorb_counter = WRITES_PER_ABSORB;
 725
 726         /* Do nothing if checkpoint is being executed by non-checkpointer process */
 727         if (!AmCheckpointerProcess())
 728                 return;
 729
 730         /*
 731          * Perform the usual duties and take a nap, unless we're behind schedule,
 732          * in which case we just try to catch up as quickly as possible.
 733          */
 734         if (!(flags & CHECKPOINT_IMMEDIATE) &&
 735                 !ShutdownRequestPending &&
 736                 !ImmediateCheckpointRequested() &&
 737                 IsCheckpointOnSchedule(progress))
 738         {
 739                 if (ConfigReloadPending)
 740                 {
 741                         ConfigReloadPending = false;
 742                         ProcessConfigFile(PGC_SIGHUP);
 743                         /* update shmem copies of config variables */
 744                         UpdateSharedMemoryConfig();
 745                 }
 746
 747                 AbsorbSyncRequests();
 748                 absorb_counter = WRITES_PER_ABSORB;
 749
 750                 CheckArchiveTimeout();
 751
 752                 /* Report interim statistics to the cumulative stats system */
 753                 pgstat_report_checkpointer();
 754
 755                 /*
 756                  * This sleep used to be connected to bgwriter_delay, typically 200ms.
 757                  * That resulted in more frequent wakeups if not much work to do.
 758                  * Checkpointer and bgwriter are no longer related so take the Big
 759                  * Sleep.
 760                  */
 761                 WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
 762                                   100,
 763                                   WAIT_EVENT_CHECKPOINT_WRITE_DELAY);
 764                 ResetLatch(MyLatch);
 765         }
 766         else if (--absorb_counter <= 0)
 767         {
 768                 /*
 769                  * Absorb pending fsync requests after each WRITES_PER_ABSORB write
 770                  * operations even when we don't sleep, to prevent overflow of the
 771                  * fsync request queue.
 772                  */
 773                 AbsorbSyncRequests();
 774                 absorb_counter = WRITES_PER_ABSORB;
 775         }
 776
 777         /* Check for barrier events. */
 778         if (ProcSignalBarrierPending)
 779                 ProcessProcSignalBarrier();
 780 }
 781
 782 /*
 783  * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint
 784  *               (or restartpoint) in time?
 785  *
 * Compares the current progress against the time/segments elapsed since last
 * checkpoint, and returns true if the progress we've made so far is at least
 * as large as the elapsed time/segments.
 789  */
 790 static bool
 791 IsCheckpointOnSchedule(double progress)
 792 {
 793         XLogRecPtr      recptr;
 794         struct timeval now;
 795         double          elapsed_xlogs,
 796                                 elapsed_time;
 797
 798         Assert(ckpt_active);
 799
 800         /* Scale progress according to checkpoint_completion_target. */
 801         progress *= CheckPointCompletionTarget;
 802
 803         /*
 804          * Check against the cached value first. Only do the more expensive
 805          * calculations once we reach the target previously calculated. Since
 806          * neither time or WAL insert pointer moves backwards, a freshly
 807          * calculated value can only be greater than or equal to the cached value.
 808          */
 809         if (progress < ckpt_cached_elapsed)
 810                 return false;
 811
 812         /*
 813          * Check progress against WAL segments written and CheckPointSegments.
 814          *
 815          * We compare the current WAL insert location against the location
 816          * computed before calling CreateCheckPoint. The code in XLogInsert that
 817          * actually triggers a checkpoint when CheckPointSegments is exceeded
 818          * compares against RedoRecPtr, so this is not completely accurate.
 819          * However, it's good enough for our purposes, we're only calculating an
 820          * estimate anyway.
 821          *
 822          * During recovery, we compare last replayed WAL record's location with
 823          * the location computed before calling CreateRestartPoint. That maintains
 824          * the same pacing as we have during checkpoints in normal operation, but
 825          * we might exceed max_wal_size by a fair amount. That's because there can
 826          * be a large gap between a checkpoint's redo-pointer and the checkpoint
 827          * record itself, and we only start the restartpoint after we've seen the
 828          * checkpoint record. (The gap is typically up to CheckPointSegments *
 829          * checkpoint_completion_target where checkpoint_completion_target is the
 830          * value that was in effect when the WAL was generated).
 831          */
 832         if (RecoveryInProgress())
 833                 recptr = GetXLogReplayRecPtr(NULL);
 834         else
 835                 recptr = GetInsertRecPtr();
 836         elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) /
 837                                          wal_segment_size) / CheckPointSegments;
 838
 839         if (progress < elapsed_xlogs)
 840         {
 841                 ckpt_cached_elapsed = elapsed_xlogs;
 842                 return false;
 843         }
 844
 845         /*
 846          * Check progress against time elapsed and checkpoint_timeout.
 847          */
 848         gettimeofday(&now, NULL);
 849         elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) +
 850                                         now.tv_usec / 1000000.0) / CheckPointTimeout;
 851
 852         if (progress < elapsed_time)
 853         {
 854                 ckpt_cached_elapsed = elapsed_time;
 855                 return false;
 856         }
 857
 858         /* It looks like we're on schedule. */
 859         return true;
 860 }
 861
 862
 863 /* --------------------------------
 864  *              signal handler routines
 865  * --------------------------------
 866  */
 867
 868 /* SIGINT: set flag to run a normal checkpoint right away */
 869 static void
 870 ReqCheckpointHandler(SIGNAL_ARGS)
 871 {
 872         /*
 873          * The signaling process should have set ckpt_flags nonzero, so all we
 874          * need do is ensure that our main loop gets kicked out of any wait.
 875          */
 876         SetLatch(MyLatch);
 877 }
 878
 879
 880 /* --------------------------------
 881  *              communication with backends
 882  * --------------------------------
 883  */
 884
 885 /*
 886  * CheckpointerShmemSize
 887  *              Compute space needed for checkpointer-related shared memory
 888  */
 889 Size
 890 CheckpointerShmemSize(void)
 891 {
 892         Size            size;
 893
 894         /*
 895          * Currently, the size of the requests[] array is arbitrarily set equal to
 896          * NBuffers.  This may prove too large or small ...
 897          */
 898         size = offsetof(CheckpointerShmemStruct, requests);
 899         size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest)));
 900
 901         return size;
 902 }
 903
 904 /*
 905  * CheckpointerShmemInit
 906  *              Allocate and initialize checkpointer-related shared memory
 907  */
 908 void
 909 CheckpointerShmemInit(void)
 910 {
 911         Size            size = CheckpointerShmemSize();
 912         bool            found;
 913
 914         CheckpointerShmem = (CheckpointerShmemStruct *)
 915                 ShmemInitStruct("Checkpointer Data",
 916                                                 size,
 917                                                 &found);
 918
 919         if (!found)
 920         {
 921                 /*
 922                  * First time through, so initialize.  Note that we zero the whole
 923                  * requests array; this is so that CompactCheckpointerRequestQueue can
 924                  * assume that any pad bytes in the request structs are zeroes.
 925                  */
 926                 MemSet(CheckpointerShmem, 0, size);
 927                 SpinLockInit(&CheckpointerShmem->ckpt_lck);
 928                 CheckpointerShmem->max_requests = NBuffers;
 929                 ConditionVariableInit(&CheckpointerShmem->start_cv);
 930                 ConditionVariableInit(&CheckpointerShmem->done_cv);
 931         }
 932 }
 933
 934 /*
 935  * RequestCheckpoint
 936  *              Called in backend processes to request a checkpoint
 937  *
 938  * flags is a bitwise OR of the following:
 939  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
 940  *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
 941  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
 942  *              ignoring checkpoint_completion_target parameter.
 943  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
 944  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
 945  *              CHECKPOINT_END_OF_RECOVERY).
 946  *      CHECKPOINT_WAIT: wait for completion before returning (otherwise,
 947  *              just signal checkpointer to do it, and return).
 948  *      CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
 949  *              (This affects logging, and in particular enables CheckPointWarning.)
 950  */
 951 void
 952 RequestCheckpoint(int flags)
 953 {
 954         int                     ntries;
 955         int                     old_failed,
 956                                 old_started;
 957
 958         /*
 959          * If in a standalone backend, just do it ourselves.
 960          */
 961         if (!IsPostmasterEnvironment)
 962         {
 963                 /*
 964                  * There's no point in doing slow checkpoints in a standalone backend,
 965                  * because there's no other backends the checkpoint could disrupt.
 966                  */
 967                 CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE);
 968
 969                 /* Free all smgr objects, as CheckpointerMain() normally would. */
 970                 smgrdestroyall();
 971
 972                 return;
 973         }
 974
 975         /*
 976          * Atomically set the request flags, and take a snapshot of the counters.
 977          * When we see ckpt_started > old_started, we know the flags we set here
 978          * have been seen by checkpointer.
 979          *
 980          * Note that we OR the flags with any existing flags, to avoid overriding
 981          * a "stronger" request by another backend.  The flag senses must be
 982          * chosen to make this work!
 983          */
 984         SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
 985
 986         old_failed = CheckpointerShmem->ckpt_failed;
 987         old_started = CheckpointerShmem->ckpt_started;
 988         CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED);
 989
 990         SpinLockRelease(&CheckpointerShmem->ckpt_lck);
 991
 992         /*
 993          * Send signal to request checkpoint.  It's possible that the checkpointer
 994          * hasn't started yet, or is in process of restarting, so we will retry a
 995          * few times if needed.  (Actually, more than a few times, since on slow
 996          * or overloaded buildfarm machines, it's been observed that the
 997          * checkpointer can take several seconds to start.)  However, if not told
 998          * to wait for the checkpoint to occur, we consider failure to send the
 999          * signal to be nonfatal and merely LOG it.  The checkpointer should see
1000          * the request when it does start, with or without getting a signal.
1001          */
1002 #define MAX_SIGNAL_TRIES 600    /* max wait 60.0 sec */
1003         for (ntries = 0;; ntries++)
1004         {
1005                 if (CheckpointerShmem->checkpointer_pid == 0)
1006                 {
1007                         if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
1008                         {
1009                                 elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
1010                                          "could not signal for checkpoint: checkpointer is not running");
1011                                 break;
1012                         }
1013                 }
1014                 else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0)
1015                 {
1016                         if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
1017                         {
1018                                 elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
1019                                          "could not signal for checkpoint: %m");
1020                                 break;
1021                         }
1022                 }
1023                 else
1024                         break;                          /* signal sent successfully */
1025
1026                 CHECK_FOR_INTERRUPTS();
1027                 pg_usleep(100000L);             /* wait 0.1 sec, then retry */
1028         }
1029
1030         /*
1031          * If requested, wait for completion.  We detect completion according to
1032          * the algorithm given above.
1033          */
1034         if (flags & CHECKPOINT_WAIT)
1035         {
1036                 int                     new_started,
1037                                         new_failed;
1038
1039                 /* Wait for a new checkpoint to start. */
1040                 ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv);
1041                 for (;;)
1042                 {
1043                         SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
1044                         new_started = CheckpointerShmem->ckpt_started;
1045                         SpinLockRelease(&CheckpointerShmem->ckpt_lck);
1046
1047                         if (new_started != old_started)
1048                                 break;
1049
1050                         ConditionVariableSleep(&CheckpointerShmem->start_cv,
1051                                                                    WAIT_EVENT_CHECKPOINT_START);
1052                 }
1053                 ConditionVariableCancelSleep();
1054
1055                 /*
1056                  * We are waiting for ckpt_done >= new_started, in a modulo sense.
1057                  */
1058                 ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv);
1059                 for (;;)
1060                 {
1061                         int                     new_done;
1062
1063                         SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
1064                         new_done = CheckpointerShmem->ckpt_done;
1065                         new_failed = CheckpointerShmem->ckpt_failed;
1066                         SpinLockRelease(&CheckpointerShmem->ckpt_lck);
1067
1068                         if (new_done - new_started >= 0)
1069                                 break;
1070
1071                         ConditionVariableSleep(&CheckpointerShmem->done_cv,
1072                                                                    WAIT_EVENT_CHECKPOINT_DONE);
1073                 }
1074                 ConditionVariableCancelSleep();
1075
1076                 if (new_failed != old_failed)
1077                         ereport(ERROR,
1078                                         (errmsg("checkpoint request failed"),
1079                                          errhint("Consult recent messages in the server log for details.")));
1080         }
1081 }
1082
1083 /*
1084  * ForwardSyncRequest
1085  *              Forward a file-fsync request from a backend to the checkpointer
1086  *
1087  * Whenever a backend is compelled to write directly to a relation
1088  * (which should be seldom, if the background writer is getting its job done),
1089  * the backend calls this routine to pass over knowledge that the relation
1090  * is dirty and must be fsync'd before next checkpoint.  We also use this
1091  * opportunity to count such writes for statistical purposes.
1092  *
1093  * To avoid holding the lock for longer than necessary, we normally write
1094  * to the requests[] queue without checking for duplicates.  The checkpointer
1095  * will have to eliminate dups internally anyway.  However, if we discover
1096  * that the queue is full, we make a pass over the entire queue to compact
1097  * it.  This is somewhat expensive, but the alternative is for the backend
1098  * to perform its own fsync, which is far more expensive in practice.  It
1099  * is theoretically possible a backend fsync might still be necessary, if
1100  * the queue is full and contains no duplicate entries.  In that case, we
1101  * let the backend know by returning false.
1102  */
1103 bool
1104 ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
1105 {
1106         CheckpointerRequest *request;
1107         bool            too_full;
1108
1109         if (!IsUnderPostmaster)
1110                 return false;                   /* probably shouldn't even get here */
1111
1112         if (AmCheckpointerProcess())
1113                 elog(ERROR, "ForwardSyncRequest must not be called in checkpointer");
1114
1115         LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
1116
1117         /*
1118          * If the checkpointer isn't running or the request queue is full, the
1119          * backend will have to perform its own fsync request.  But before forcing
1120          * that to happen, we can try to compact the request queue.
1121          */
1122         if (CheckpointerShmem->checkpointer_pid == 0 ||
1123                 (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests &&
1124                  !CompactCheckpointerRequestQueue()))
1125         {
1126                 LWLockRelease(CheckpointerCommLock);
1127                 return false;
1128         }
1129
1130         /* OK, insert request */
1131         request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
1132         request->ftag = *ftag;
1133         request->type = type;
1134
1135         /* If queue is more than half full, nudge the checkpointer to empty it */
1136         too_full = (CheckpointerShmem->num_requests >=
1137                                 CheckpointerShmem->max_requests / 2);
1138
1139         LWLockRelease(CheckpointerCommLock);
1140
1141         /* ... but not till after we release the lock */
1142         if (too_full && ProcGlobal->checkpointerLatch)
1143                 SetLatch(ProcGlobal->checkpointerLatch);
1144
1145         return true;
1146 }
1147
1148 /*
1149  * CompactCheckpointerRequestQueue
1150  *              Remove duplicates from the request queue to avoid backend fsyncs.
1151  *              Returns "true" if any entries were removed.
1152  *
1153  * Although a full fsync request queue is not common, it can lead to severe
1154  * performance problems when it does happen.  So far, this situation has
1155  * only been observed to occur when the system is under heavy write load,
1156  * and especially during the "sync" phase of a checkpoint.  Without this
1157  * logic, each backend begins doing an fsync for every block written, which
1158  * gets very expensive and can slow down the whole system.
1159  *
1160  * Trying to do this every time the queue is full could lose if there
1161  * aren't any removable entries.  But that should be vanishingly rare in
1162  * practice: there's one queue entry per shared buffer.
1163  */
1164 static bool
1165 CompactCheckpointerRequestQueue(void)
1166 {
1167         struct CheckpointerSlotMapping
1168         {
1169                 CheckpointerRequest request;
1170                 int                     slot;
1171         };
1172
1173         int                     n,
1174                                 preserve_count;
1175         int                     num_skipped = 0;
1176         HASHCTL         ctl;
1177         HTAB       *htab;
1178         bool       *skip_slot;
1179
1180         /* must hold CheckpointerCommLock in exclusive mode */
1181         Assert(LWLockHeldByMe(CheckpointerCommLock));
1182
1183         /* Avoid memory allocations in a critical section. */
1184         if (CritSectionCount > 0)
1185                 return false;
1186
1187         /* Initialize skip_slot array */
1188         skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);
1189
1190         /* Initialize temporary hash table */
1191         ctl.keysize = sizeof(CheckpointerRequest);
1192         ctl.entrysize = sizeof(struct CheckpointerSlotMapping);
1193         ctl.hcxt = CurrentMemoryContext;
1194
1195         htab = hash_create("CompactCheckpointerRequestQueue",
1196                                            CheckpointerShmem->num_requests,
1197                                            &ctl,
1198                                            HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1199
1200         /*
1201          * The basic idea here is that a request can be skipped if it's followed
1202          * by a later, identical request.  It might seem more sensible to work
1203          * backwards from the end of the queue and check whether a request is
1204          * *preceded* by an earlier, identical request, in the hopes of doing less
1205          * copying.  But that might change the semantics, if there's an
1206          * intervening SYNC_FORGET_REQUEST or SYNC_FILTER_REQUEST, so we do it
1207          * this way.  It would be possible to be even smarter if we made the code
1208          * below understand the specific semantics of such requests (it could blow
1209          * away preceding entries that would end up being canceled anyhow), but
1210          * it's not clear that the extra complexity would buy us anything.
1211          */
1212         for (n = 0; n < CheckpointerShmem->num_requests; n++)
1213         {
1214                 CheckpointerRequest *request;
1215                 struct CheckpointerSlotMapping *slotmap;
1216                 bool            found;
1217
1218                 /*
1219                  * We use the request struct directly as a hashtable key.  This
1220                  * assumes that any padding bytes in the structs are consistently the
1221                  * same, which should be okay because we zeroed them in
1222                  * CheckpointerShmemInit.  Note also that RelFileLocator had better
1223                  * contain no pad bytes.
1224                  */
1225                 request = &CheckpointerShmem->requests[n];
1226                 slotmap = hash_search(htab, request, HASH_ENTER, &found);
1227                 if (found)
1228                 {
1229                         /* Duplicate, so mark the previous occurrence as skippable */
1230                         skip_slot[slotmap->slot] = true;
1231                         num_skipped++;
1232                 }
1233                 /* Remember slot containing latest occurrence of this request value */
1234                 slotmap->slot = n;
1235         }
1236
1237         /* Done with the hash table. */
1238         hash_destroy(htab);
1239
1240         /* If no duplicates, we're out of luck. */
1241         if (!num_skipped)
1242         {
1243                 pfree(skip_slot);
1244                 return false;
1245         }
1246
1247         /* We found some duplicates; remove them. */
1248         preserve_count = 0;
1249         for (n = 0; n < CheckpointerShmem->num_requests; n++)
1250         {
1251                 if (skip_slot[n])
1252                         continue;
1253                 CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n];
1254         }
1255         ereport(DEBUG1,
1256                         (errmsg_internal("compacted fsync request queue from %d entries to %d entries",
1257                                                          CheckpointerShmem->num_requests, preserve_count)));
1258         CheckpointerShmem->num_requests = preserve_count;
1259
1260         /* Cleanup. */
1261         pfree(skip_slot);
1262         return true;
1263 }
1264
1265 /*
1266  * AbsorbSyncRequests
1267  *              Retrieve queued sync requests and pass them to sync mechanism.
1268  *
1269  * This is exported because it must be called during CreateCheckPoint;
1270  * we have to be sure we have accepted all pending requests just before
1271  * we start fsync'ing.  Since CreateCheckPoint sometimes runs in
1272  * non-checkpointer processes, do nothing if not checkpointer.
1273  */
1274 void
1275 AbsorbSyncRequests(void)
1276 {
1277         CheckpointerRequest *requests = NULL;
1278         CheckpointerRequest *request;
1279         int                     n;
1280
1281         if (!AmCheckpointerProcess())
1282                 return;
1283
1284         LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
1285
1286         /*
1287          * We try to avoid holding the lock for a long time by copying the request
1288          * array, and processing the requests after releasing the lock.
1289          *
1290          * Once we have cleared the requests from shared memory, we have to PANIC
1291          * if we then fail to absorb them (eg, because our hashtable runs out of
1292          * memory).  This is because the system cannot run safely if we are unable
1293          * to fsync what we have been told to fsync.  Fortunately, the hashtable
1294          * is so small that the problem is quite unlikely to arise in practice.
1295          */
1296         n = CheckpointerShmem->num_requests;
1297         if (n > 0)
1298         {
1299                 requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest));
1300                 memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest));
1301         }
1302
1303         START_CRIT_SECTION();
1304
1305         CheckpointerShmem->num_requests = 0;
1306
1307         LWLockRelease(CheckpointerCommLock);
1308
1309         for (request = requests; n > 0; request++, n--)
1310                 RememberSyncRequest(&request->ftag, request->type);
1311
1312         END_CRIT_SECTION();
1313
1314         if (requests)
1315                 pfree(requests);
1316 }
1317
1318 /*
1319  * Update any shared memory configurations based on config parameters
1320  */
1321 static void
1322 UpdateSharedMemoryConfig(void)
1323 {
1324         /* update global shmem state for sync rep */
1325         SyncRepUpdateSyncStandbysDefined();
1326
1327         /*
1328          * If full_page_writes has been changed by SIGHUP, we update it in shared
1329          * memory and write an XLOG_FPW_CHANGE record.
1330          */
1331         UpdateFullPageWrites();
1332
1333         elog(DEBUG2, "checkpointer updated shared memory configuration values");
1334 }
1335
1336 /*
1337  * FirstCallSinceLastCheckpoint allows a process to take an action once
1338  * per checkpoint cycle by asynchronously checking for checkpoint completion.
1339  */
1340 bool
1341 FirstCallSinceLastCheckpoint(void)
1342 {
1343         static int      ckpt_done = 0;
1344         int                     new_done;
1345         bool            FirstCall = false;
1346
1347         SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
1348         new_done = CheckpointerShmem->ckpt_done;
1349         SpinLockRelease(&CheckpointerShmem->ckpt_lck);
1350
1351         if (new_done != ckpt_done)
1352                 FirstCall = true;
1353
1354         ckpt_done = new_done;
1355
1356         return FirstCall;
1357 }