/*-------------------------------------------------------------------------
 *
 * latch.c
 *	  Routines for inter-process latches
 *
 * The poll() implementation uses the so-called self-pipe trick to overcome the
 * race condition involved with poll() and setting a global flag in the signal
 * handler. When a latch is set and the current process is waiting for it, the
 * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe.
 * A signal by itself doesn't interrupt poll() on all platforms, and even on
 * platforms where it does, a signal that arrives just before the poll() call
 * does not prevent poll() from entering sleep. An incoming byte on a pipe
 * however reliably interrupts the sleep, and causes poll() to return
 * immediately even if the signal arrives before poll() begins.
 *
 * The epoll() implementation overcomes the race with a different technique: it
 * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We
 * don't need to register a signal handler or create our own self-pipe. We
 * assume that any system that has Linux epoll() also has Linux signalfd().
 *
 * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL.
 *
 * The Windows implementation uses Windows events that are inherited by all
 * postmaster child processes. There's no need for the self-pipe trick there.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/storage/ipc/latch.c
 *
 *-------------------------------------------------------------------------
 */
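/*
 * Editor's sketch (not part of the original file): the self-pipe trick
 * described above, reduced to its core.  The names latch_is_set and
 * wakeup_pipe are hypothetical:
 *
 *     static volatile sig_atomic_t latch_is_set = false;
 *     static int wakeup_pipe[2];          // both ends set O_NONBLOCK
 *
 *     static void handler(int signo)
 *     {
 *         latch_is_set = true;
 *         (void) write(wakeup_pipe[1], "x", 1);  // reliably interrupts poll()
 *     }
 *
 *     // The waiter includes wakeup_pipe[0] among the fds passed to poll().
 *     // A byte written even before poll() begins still makes poll() return
 *     // immediately, which a bare signal does not guarantee.
 */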
#include "postgres.h"

#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <unistd.h>
#ifdef HAVE_SYS_EPOLL_H
#include <sys/epoll.h>
#endif
#ifdef HAVE_SYS_EVENT_H
#include <sys/event.h>
#endif
#ifdef HAVE_SYS_SIGNALFD_H
#include <sys/signalfd.h>
#endif
#ifdef HAVE_POLL_H
#include <poll.h>
#endif

#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "port/atomics.h"
#include "portability/instr_time.h"
#include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "utils/memutils.h"
#include "utils/resowner.h"
/*
 * Select the fd readiness primitive to use. Normally the "most modern"
 * primitive supported by the OS will be used, but for testing it can be
 * useful to manually specify the used primitive.  If desired, just add a
 * define somewhere before this block.
 */
#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
	defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
/* don't overwrite manual choice */
#elif defined(HAVE_SYS_EPOLL_H)
#define WAIT_USE_EPOLL
#elif defined(HAVE_KQUEUE)
#define WAIT_USE_KQUEUE
#elif defined(HAVE_POLL)
#define WAIT_USE_POLL
#elif defined(WIN32)
#define WAIT_USE_WIN32
#else
#error "no wait set implementation available"
#endif
/*
 * By default, we use a self-pipe with poll() and a signalfd with epoll(), if
 * available.  For testing the choice can also be manually specified.
 */
#if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL)
#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
/* don't overwrite manual choice */
#elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H)
#define WAIT_USE_SIGNALFD
#else
#define WAIT_USE_SELF_PIPE
#endif
#endif
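/*
 * Editor's sketch (not in the original): forcing a specific primitive for
 * testing, as the comments above suggest, means defining one of the
 * WAIT_USE_* symbols before these selection blocks, e.g.:
 *
 *     #define WAIT_USE_POLL
 *     #define WAIT_USE_SELF_PIPE
 *
 * With a manual choice in place, the #if/#elif chains above leave it alone
 * ("don't overwrite manual choice").
 */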
/* typedef in latch.h */
struct WaitEventSet
{
	ResourceOwner owner;

	int			nevents;		/* number of registered events */
	int			nevents_space;	/* maximum number of events in this set */

	/*
	 * Array, of nevents_space length, storing the definition of events this
	 * set is waiting for.
	 */
	WaitEvent  *events;

	/*
	 * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
	 * said latch, and latch_pos the offset in the ->events array. This is
	 * useful because we check the state of the latch before performing
	 * syscalls related to waiting.
	 */
	Latch	   *latch;
	int			latch_pos;

	/*
	 * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
	 * is set so that we'll exit immediately if postmaster death is detected,
	 * instead of returning.
	 */
	bool		exit_on_postmaster_death;
#if defined(WAIT_USE_EPOLL)
	int			epoll_fd;
	/* epoll_wait returns events in a caller-provided array; allocate once */
	struct epoll_event *epoll_ret_events;
#elif defined(WAIT_USE_KQUEUE)
	int			kqueue_fd;
	/* kevent returns events in a caller-provided array; allocate once */
	struct kevent *kqueue_ret_events;
	bool		report_postmaster_not_running;
#elif defined(WAIT_USE_POLL)
	/* poll expects events to be waited on every poll() call, prepare once */
	struct pollfd *pollfds;
#elif defined(WAIT_USE_WIN32)

	/*
	 * Array of windows events. The first element always contains
	 * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
	 * event->pos + 1).
	 */
	HANDLE	   *handles;
#endif
};
/* A common WaitEventSet used to implement WaitLatch() */
static WaitEventSet *LatchWaitSet;

/* The position of the latch in LatchWaitSet. */
#define LatchWaitSetLatchPos 0

/* Are we currently in WaitLatch? The signal handler would like to know. */
static volatile sig_atomic_t waiting = false;
#ifdef WAIT_USE_SIGNALFD
/* On Linux, we'll receive SIGURG via a signalfd file descriptor. */
static int	signal_fd = -1;
#endif
#ifdef WAIT_USE_SELF_PIPE
/* Read and write ends of the self-pipe */
static int	selfpipe_readfd = -1;
static int	selfpipe_writefd = -1;

/* Process owning the self-pipe --- needed for checking purposes */
static int	selfpipe_owner_pid = 0;

/* Private function prototypes */
static void latch_sigurg_handler(SIGNAL_ARGS);
static void sendSelfPipeByte(void);
#endif
#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
static void drain(void);
#endif
#if defined(WAIT_USE_EPOLL)
static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
#elif defined(WAIT_USE_KQUEUE)
static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
#elif defined(WAIT_USE_POLL)
static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
#elif defined(WAIT_USE_WIN32)
static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
#endif

static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
										WaitEvent *occurred_events, int nevents);
/* ResourceOwner support to hold WaitEventSets */
static void ResOwnerReleaseWaitEventSet(Datum res);

static const ResourceOwnerDesc wait_event_set_resowner_desc =
{
	.name = "WaitEventSet",
	.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
	.release_priority = RELEASE_PRIO_WAITEVENTSETS,
	.ReleaseResource = ResOwnerReleaseWaitEventSet,
};
/* Convenience wrappers over ResourceOwnerRemember/Forget */
static inline void
ResourceOwnerRememberWaitEventSet(ResourceOwner owner, WaitEventSet *set)
{
	ResourceOwnerRemember(owner, PointerGetDatum(set), &wait_event_set_resowner_desc);
}

static inline void
ResourceOwnerForgetWaitEventSet(ResourceOwner owner, WaitEventSet *set)
{
	ResourceOwnerForget(owner, PointerGetDatum(set), &wait_event_set_resowner_desc);
}
/*
 * Initialize the process-local latch infrastructure.
 *
 * This must be called once during startup of any process that can wait on
 * latches, before it issues any InitLatch() or OwnLatch() calls.
 */
void
InitializeLatchSupport(void)
{
#if defined(WAIT_USE_SELF_PIPE)
	int			pipefd[2];

	if (IsUnderPostmaster)
	{
		/*
		 * We might have inherited connections to a self-pipe created by the
		 * postmaster.  It's critical that child processes create their own
		 * self-pipes, of course, and we really want them to close the
		 * inherited FDs for safety's sake.
		 */
		if (selfpipe_owner_pid != 0)
		{
			/* Assert we go through here but once in a child process */
			Assert(selfpipe_owner_pid != MyProcPid);
			/* Release postmaster's pipe FDs; ignore any error */
			(void) close(selfpipe_readfd);
			(void) close(selfpipe_writefd);
			/* Clean up, just for safety's sake; we'll set these below */
			selfpipe_readfd = selfpipe_writefd = -1;
			selfpipe_owner_pid = 0;
			/* Keep fd.c's accounting straight */
			ReleaseExternalFD();
			ReleaseExternalFD();
		}
		else
		{
			/*
			 * Postmaster didn't create a self-pipe ... or else we're in an
			 * EXEC_BACKEND build, in which case it doesn't matter since the
			 * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
			 * fd.c won't have state to clean up, either.
			 */
			Assert(selfpipe_readfd == -1);
		}
	}
	else
	{
		/* In postmaster or standalone backend, assert we do this but once */
		Assert(selfpipe_readfd == -1);
		Assert(selfpipe_owner_pid == 0);
	}
	/*
	 * Set up the self-pipe that allows a signal handler to wake up the
	 * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
	 * that SetLatch won't block if the event has already been set many times
	 * filling the kernel buffer. Make the read-end non-blocking too, so that
	 * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
	 * Also, make both FDs close-on-exec, since we surely do not want any
	 * child processes messing with them.
	 */
	if (pipe(pipefd) < 0)
		elog(FATAL, "pipe() failed: %m");
	if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
	if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");

	selfpipe_readfd = pipefd[0];
	selfpipe_writefd = pipefd[1];
	selfpipe_owner_pid = MyProcPid;

	/* Tell fd.c about these two long-lived FDs */
	ReserveExternalFD();
	ReserveExternalFD();

	pqsignal(SIGURG, latch_sigurg_handler);
#endif
#ifdef WAIT_USE_SIGNALFD
	sigset_t	signalfd_mask;

	if (IsUnderPostmaster)
	{
		/*
		 * It would probably be safe to re-use the inherited signalfd since
		 * signalfds only see the current process's pending signals, but it
		 * seems less surprising to close it and create our own.
		 */
		if (signal_fd != -1)
		{
			/* Release postmaster's signal FD; ignore any error */
			(void) close(signal_fd);
			signal_fd = -1;
			ReleaseExternalFD();
		}
	}

	/* Block SIGURG, because we'll receive it through a signalfd. */
	sigaddset(&UnBlockSig, SIGURG);

	/* Set up the signalfd to receive SIGURG notifications. */
	sigemptyset(&signalfd_mask);
	sigaddset(&signalfd_mask, SIGURG);
	signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC);
	if (signal_fd < 0)
		elog(FATAL, "signalfd() failed");
	ReserveExternalFD();
#endif
#ifdef WAIT_USE_KQUEUE
	/* Ignore SIGURG, because we'll receive it via kqueue. */
	pqsignal(SIGURG, SIG_IGN);
#endif
}
void
InitializeLatchWaitSet(void)
{
	int			latch_pos PG_USED_FOR_ASSERTS_ONLY;

	Assert(LatchWaitSet == NULL);

	/* Set up the WaitEventSet used by WaitLatch(). */
	LatchWaitSet = CreateWaitEventSet(NULL, 2);
	latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
								  MyLatch, NULL);
	if (IsUnderPostmaster)
		AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH,
						  PGINVALID_SOCKET, NULL, NULL);

	Assert(latch_pos == LatchWaitSetLatchPos);
}
void
ShutdownLatchSupport(void)
{
#if defined(WAIT_USE_POLL)
	pqsignal(SIGURG, SIG_IGN);
#endif

	if (LatchWaitSet)
	{
		FreeWaitEventSet(LatchWaitSet);
		LatchWaitSet = NULL;
	}

#if defined(WAIT_USE_SELF_PIPE)
	close(selfpipe_readfd);
	close(selfpipe_writefd);
	selfpipe_readfd = -1;
	selfpipe_writefd = -1;
	selfpipe_owner_pid = InvalidPid;
#endif

#if defined(WAIT_USE_SIGNALFD)
	close(signal_fd);
	signal_fd = -1;
#endif
}
/*
 * Initialize a process-local latch.
 */
void
InitLatch(Latch *latch)
{
	latch->is_set = false;
	latch->maybe_sleeping = false;
	latch->owner_pid = MyProcPid;
	latch->is_shared = false;

#if defined(WAIT_USE_SELF_PIPE)
	/* Assert InitializeLatchSupport has been called in this process */
	Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
#elif defined(WAIT_USE_SIGNALFD)
	/* Assert InitializeLatchSupport has been called in this process */
	Assert(signal_fd >= 0);
#elif defined(WAIT_USE_WIN32)
	latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
	if (latch->event == NULL)
		elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif
}
/*
 * Initialize a shared latch that can be set from other processes. The latch
 * is initially owned by no-one; use OwnLatch to associate it with the
 * current process.
 *
 * InitSharedLatch needs to be called in postmaster before forking child
 * processes, usually right after allocating the shared memory block
 * containing the latch with ShmemInitStruct. (The Unix implementation
 * doesn't actually require that, but the Windows one does.) Because of
 * this restriction, we have no concurrency issues to worry about here.
 *
 * Note that other handles created in this module are never marked as
 * inheritable. Thus we do not need to worry about cleaning up child
 * process references to postmaster-private latches or WaitEventSets.
 */
void
InitSharedLatch(Latch *latch)
{
#ifdef WIN32
	SECURITY_ATTRIBUTES sa;

	/*
	 * Set up security attributes to specify that the events are inherited.
	 */
	ZeroMemory(&sa, sizeof(sa));
	sa.nLength = sizeof(sa);
	sa.bInheritHandle = TRUE;

	latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
	if (latch->event == NULL)
		elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif

	latch->is_set = false;
	latch->maybe_sleeping = false;
	latch->owner_pid = 0;
	latch->is_shared = true;
}
/*
 * Associate a shared latch with the current process, allowing it to
 * wait on the latch.
 *
 * Although there is a sanity check for latch-already-owned, we don't do
 * any sort of locking here, meaning that we could fail to detect the error
 * if two processes try to own the same latch at about the same time.  If
 * there is any risk of that, caller must provide an interlock to prevent it.
 */
void
OwnLatch(Latch *latch)
{
	int			owner_pid;

	/* Sanity checks */
	Assert(latch->is_shared);

#if defined(WAIT_USE_SELF_PIPE)
	/* Assert InitializeLatchSupport has been called in this process */
	Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
#elif defined(WAIT_USE_SIGNALFD)
	/* Assert InitializeLatchSupport has been called in this process */
	Assert(signal_fd >= 0);
#endif

	owner_pid = latch->owner_pid;
	if (owner_pid != 0)
		elog(PANIC, "latch already owned by PID %d", owner_pid);

	latch->owner_pid = MyProcPid;
}
/*
 * Disown a shared latch currently owned by the current process.
 */
void
DisownLatch(Latch *latch)
{
	Assert(latch->is_shared);
	Assert(latch->owner_pid == MyProcPid);

	latch->owner_pid = 0;
}
/*
 * Wait for a given latch to be set, or for postmaster death, or until timeout
 * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
 * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
 * function returns immediately.
 *
 * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
 * is given.  Although it is declared as "long", we don't actually support
 * timeouts longer than INT_MAX milliseconds.  Note that some extra overhead
 * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
 *
 * The latch must be owned by the current process, i.e. it must be a
 * process-local latch initialized with InitLatch, or a shared latch
 * associated with the current process by calling OwnLatch.
 *
 * Returns bit mask indicating which condition(s) caused the wake-up. Note
 * that if multiple wake-up conditions are true, there is no guarantee that
 * we return all of them in one call, but we will return at least one.
 */
int
WaitLatch(Latch *latch, int wakeEvents, long timeout,
		  uint32 wait_event_info)
{
	WaitEvent	event;

	/* Postmaster-managed callers must handle postmaster death somehow. */
	Assert(!IsUnderPostmaster ||
		   (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
		   (wakeEvents & WL_POSTMASTER_DEATH));

	/*
	 * Some callers may have a latch other than MyLatch, or no latch at all,
	 * or want to handle postmaster death differently.  It's cheap to assign
	 * those, so just do it every time.
	 */
	if (!(wakeEvents & WL_LATCH_SET))
		latch = NULL;
	ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch);
	LatchWaitSet->exit_on_postmaster_death =
		((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);

	if (WaitEventSetWait(LatchWaitSet,
						 (wakeEvents & WL_TIMEOUT) ? timeout : -1,
						 &event, 1,
						 wait_event_info) == 0)
		return WL_TIMEOUT;
	else
		return event.events;
}
/*
 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
 * conditions.
 *
 * When waiting on a socket, EOF and error conditions always cause the socket
 * to be reported as readable/writable/connected, so that the caller can deal
 * with the condition.
 *
 * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
 * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
 * return value if the postmaster dies.  The latter is useful for rare cases
 * where some behavior other than immediate exit is needed.
 *
 * NB: These days this is just a wrapper around the WaitEventSet API. When
 * using a latch very frequently, consider creating a longer living
 * WaitEventSet instead; that's more efficient.
 */
int
WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
				  long timeout, uint32 wait_event_info)
{
	int			ret = 0;
	int			rc;
	WaitEvent	event;
	WaitEventSet *set = CreateWaitEventSet(CurrentResourceOwner, 3);

	if (wakeEvents & WL_TIMEOUT)
		Assert(timeout >= 0);
	else
		timeout = -1;

	if (wakeEvents & WL_LATCH_SET)
		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
						  latch, NULL);

	/* Postmaster-managed callers must handle postmaster death somehow. */
	Assert(!IsUnderPostmaster ||
		   (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
		   (wakeEvents & WL_POSTMASTER_DEATH));

	if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
		AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
						  NULL, NULL);

	if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
		AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
						  NULL, NULL);

	if (wakeEvents & WL_SOCKET_MASK)
	{
		int			ev;

		ev = wakeEvents & WL_SOCKET_MASK;
		AddWaitEventToSet(set, ev, sock, NULL, NULL);
	}

	rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);

	if (rc == 0)
		ret |= WL_TIMEOUT;
	else
	{
		ret |= event.events & (WL_LATCH_SET |
							   WL_POSTMASTER_DEATH |
							   WL_SOCKET_MASK);
	}

	FreeWaitEventSet(set);

	return ret;
}
/*
 * Sets a latch and wakes up anyone waiting on it.
 *
 * This is cheap if the latch is already set, otherwise not so much.
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it.  (That's standard practice in most signal handlers, of
 * course, but we used to omit it in handlers that only set a flag.)
 *
 * NB: this function is called from critical sections and signal handlers so
 * throwing an error is not a good idea.
 */
void
SetLatch(Latch *latch)
{
#ifndef WIN32
	pid_t		owner_pid;
#else
	HANDLE		handle;
#endif

	/*
	 * The memory barrier has to be placed here to ensure that any flag
	 * variables possibly changed by this process have been flushed to main
	 * memory, before we check/set is_set.
	 */
	pg_memory_barrier();

	/* Quick exit if already set */
	if (latch->is_set)
		return;

	latch->is_set = true;

	pg_memory_barrier();
	if (!latch->maybe_sleeping)
		return;

#ifndef WIN32

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler. We use the self-pipe or SIGURG to ourselves
	 * to wake up WaitEventSetWaitBlock() without races in that case. If it's
	 * another process, send a signal.
	 *
	 * Fetch owner_pid only once, in case the latch is concurrently getting
	 * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
	 * guaranteed to be true! In practice, the effective range of pid_t fits
	 * in a 32 bit integer, and so should be atomic. In the worst case, we
	 * might end up signaling the wrong process. Even then, you're very
	 * unlucky if a process with that bogus pid exists and belongs to
	 * Postgres; and PG database processes should handle excess SIGUSR1
	 * interrupts without a problem anyhow.
	 *
	 * Another sort of race condition that's possible here is for a new
	 * process to own the latch immediately after we look, so we don't signal
	 * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
	 * the standard coding convention of waiting at the bottom of their loops,
	 * not the top, so that they'll correctly process latch-setting events
	 * that happen before they enter the loop.
	 */
	owner_pid = latch->owner_pid;
	if (owner_pid == 0)
		return;
	else if (owner_pid == MyProcPid)
	{
#if defined(WAIT_USE_SELF_PIPE)
		if (waiting)
			sendSelfPipeByte();
#else
		if (waiting)
			kill(MyProcPid, SIGURG);
#endif
	}
	else
		kill(owner_pid, SIGURG);

#else

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler.
	 *
	 * Use a local variable here just in case somebody changes the event field
	 * concurrently (which really should not happen).
	 */
	handle = latch->event;
	if (handle)
	{
		SetEvent(handle);

		/*
		 * Note that we silently ignore any errors. We might be in a signal
		 * handler or other critical path where it's not safe to call elog().
		 */
	}
#endif
}
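/*
 * Editor's sketch (not in the original) of the errno discipline the first NB
 * above asks for; handle_sigusr1 is a hypothetical handler:
 *
 *     static void
 *     handle_sigusr1(SIGNAL_ARGS)
 *     {
 *         int save_errno = errno;
 *
 *         // ... set whatever flags this handler maintains ...
 *         SetLatch(MyLatch);      // may write() or kill(), clobbering errno
 *
 *         errno = save_errno;
 *     }
 */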
/*
 * Clear the latch. Calling WaitLatch after this will sleep, unless
 * the latch is set again before the WaitLatch call.
 */
void
ResetLatch(Latch *latch)
{
	/* Only the owner should reset the latch */
	Assert(latch->owner_pid == MyProcPid);
	Assert(latch->maybe_sleeping == false);

	latch->is_set = false;

	/*
	 * Ensure that the write to is_set gets flushed to main memory before we
	 * examine any flag variables.  Otherwise a concurrent SetLatch might
	 * falsely conclude that it needn't signal us, even though we have missed
	 * seeing some flag updates that SetLatch was supposed to inform us of.
	 */
	pg_memory_barrier();
}
/*
 * Create a WaitEventSet with space for nevents different events to wait for.
 *
 * These events can then be efficiently waited upon together, using
 * WaitEventSetWait().
 *
 * The WaitEventSet is tracked by the given 'resowner'.  Use NULL for session
 * lifetime.
 */
WaitEventSet *
CreateWaitEventSet(ResourceOwner resowner, int nevents)
{
	WaitEventSet *set;
	char	   *data;
	Size		sz = 0;

	/*
	 * Use MAXALIGN size/alignment to guarantee that later uses of memory are
	 * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
	 * platforms, but earlier allocations like WaitEventSet and WaitEvent
	 * might not be sized to guarantee that when purely using sizeof().
	 */
	sz += MAXALIGN(sizeof(WaitEventSet));
	sz += MAXALIGN(sizeof(WaitEvent) * nevents);

#if defined(WAIT_USE_EPOLL)
	sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
#elif defined(WAIT_USE_KQUEUE)
	sz += MAXALIGN(sizeof(struct kevent) * nevents);
#elif defined(WAIT_USE_POLL)
	sz += MAXALIGN(sizeof(struct pollfd) * nevents);
#elif defined(WAIT_USE_WIN32)
	/* need space for the pgwin32_signal_event */
	sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
#endif

	if (resowner != NULL)
		ResourceOwnerEnlarge(resowner);
	data = (char *) MemoryContextAllocZero(TopMemoryContext, sz);

	set = (WaitEventSet *) data;
	data += MAXALIGN(sizeof(WaitEventSet));

	set->events = (WaitEvent *) data;
	data += MAXALIGN(sizeof(WaitEvent) * nevents);

#if defined(WAIT_USE_EPOLL)
	set->epoll_ret_events = (struct epoll_event *) data;
	data += MAXALIGN(sizeof(struct epoll_event) * nevents);
#elif defined(WAIT_USE_KQUEUE)
	set->kqueue_ret_events = (struct kevent *) data;
	data += MAXALIGN(sizeof(struct kevent) * nevents);
#elif defined(WAIT_USE_POLL)
	set->pollfds = (struct pollfd *) data;
	data += MAXALIGN(sizeof(struct pollfd) * nevents);
#elif defined(WAIT_USE_WIN32)
	set->handles = (HANDLE) data;
	data += MAXALIGN(sizeof(HANDLE) * nevents);
#endif
	set->latch = NULL;
	set->nevents_space = nevents;
	set->exit_on_postmaster_death = false;

	if (resowner != NULL)
	{
		ResourceOwnerRememberWaitEventSet(resowner, set);
		set->owner = resowner;
	}
#if defined(WAIT_USE_EPOLL)
	if (!AcquireExternalFD())
		elog(ERROR, "AcquireExternalFD, for epoll_create1, failed: %m");
	set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
	if (set->epoll_fd < 0)
	{
		ReleaseExternalFD();
		elog(ERROR, "epoll_create1 failed: %m");
	}
#elif defined(WAIT_USE_KQUEUE)
	if (!AcquireExternalFD())
		elog(ERROR, "AcquireExternalFD, for kqueue, failed: %m");
	set->kqueue_fd = kqueue();
	if (set->kqueue_fd < 0)
	{
		ReleaseExternalFD();
		elog(ERROR, "kqueue failed: %m");
	}
	if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
	{
		int			save_errno = errno;

		close(set->kqueue_fd);
		ReleaseExternalFD();
		errno = save_errno;
		elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
	}
	set->report_postmaster_not_running = false;
#elif defined(WAIT_USE_WIN32)

	/*
	 * To handle signals while waiting, we need to add a win32 specific event.
	 * We accounted for the additional event at the top of this routine. See
	 * port/win32/signal.c for more details.
	 *
	 * Note: pgwin32_signal_event should be first to ensure that it will be
	 * reported when multiple events are set.  We want to guarantee that
	 * pending signals are serviced.
	 */
	set->handles[0] = pgwin32_signal_event;
	StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
#endif

	return set;
}
/*
 * Free a previously created WaitEventSet.
 *
 * Note: preferably, this shouldn't have to free any resources that could be
 * inherited across an exec(). If it did, we'd likely leak those resources in
 * many scenarios. For the epoll case, we ensure that by setting EPOLL_CLOEXEC
 * when the FD is created. For the Windows case, we assume that the handles
 * involved are non-inheritable.
 */
void
FreeWaitEventSet(WaitEventSet *set)
{
	if (set->owner)
	{
		ResourceOwnerForgetWaitEventSet(set->owner, set);
		set->owner = NULL;
	}

#if defined(WAIT_USE_EPOLL)
	close(set->epoll_fd);
	ReleaseExternalFD();
#elif defined(WAIT_USE_KQUEUE)
	close(set->kqueue_fd);
	ReleaseExternalFD();
#elif defined(WAIT_USE_WIN32)
	for (WaitEvent *cur_event = set->events;
		 cur_event < (set->events + set->nevents);
		 cur_event++)
	{
		if (cur_event->events & WL_LATCH_SET)
		{
			/* uses the latch's HANDLE */
		}
		else if (cur_event->events & WL_POSTMASTER_DEATH)
		{
			/* uses PostmasterHandle */
		}
		else
		{
			/* Clean up the event object we created for the socket */
			WSAEventSelect(cur_event->fd, NULL, 0);
			WSACloseEvent(set->handles[cur_event->pos + 1]);
		}
	}
#endif

	pfree(set);
}
/*
 * Free a previously created WaitEventSet in a child process after a fork().
 */
void
FreeWaitEventSetAfterFork(WaitEventSet *set)
{
#if defined(WAIT_USE_EPOLL)
	close(set->epoll_fd);
	ReleaseExternalFD();
#elif defined(WAIT_USE_KQUEUE)
	/* kqueues are not normally inherited by child processes */
	ReleaseExternalFD();
#endif

	pfree(set);
}
/*
 * Add an event to the set. Possible events are:
 * - WL_LATCH_SET: Wait for the latch to be set
 * - WL_POSTMASTER_DEATH: Wait for postmaster to die
 * - WL_SOCKET_READABLE: Wait for socket to become readable,
 *	 can be combined in one event with other WL_SOCKET_* events
 * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
 *	 can be combined with other WL_SOCKET_* events
 * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
 *	 can be combined with other WL_SOCKET_* events (on non-Windows
 *	 platforms, this is the same as WL_SOCKET_WRITEABLE)
 * - WL_SOCKET_ACCEPT: Wait for new connection to a server socket,
 *	 can be combined with other WL_SOCKET_* events (on non-Windows
 *	 platforms, this is the same as WL_SOCKET_READABLE)
 * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
 * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
 *
 * Returns the offset in WaitEventSet->events (starting from 0), which can be
 * used to modify previously added wait events using ModifyWaitEvent().
 *
 * In the WL_LATCH_SET case the latch must be owned by the current process,
 * i.e. it must be a process-local latch initialized with InitLatch, or a
 * shared latch associated with the current process by calling OwnLatch.
 *
 * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED/ACCEPT cases, EOF and error
 * conditions cause the socket to be reported as readable/writable/connected,
 * so that the caller can deal with the condition.
 *
 * The user_data pointer specified here will be set for the events returned
 * by WaitEventSetWait(), allowing additional data to be easily associated
 * with events.
 */
int
AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
				  void *user_data)
{
	WaitEvent  *event;

	/* not enough space */
	Assert(set->nevents < set->nevents_space);

	if (events == WL_EXIT_ON_PM_DEATH)
	{
		events = WL_POSTMASTER_DEATH;
		set->exit_on_postmaster_death = true;
	}

	if (latch)
	{
		if (latch->owner_pid != MyProcPid)
			elog(ERROR, "cannot wait on a latch owned by another process");
		if (set->latch)
			elog(ERROR, "cannot wait on more than one latch");
		if ((events & WL_LATCH_SET) != WL_LATCH_SET)
			elog(ERROR, "latch events only support being set");
	}
	else
	{
		if (events & WL_LATCH_SET)
			elog(ERROR, "cannot wait on latch without a specified latch");
	}

	/* waiting for socket readiness without a socket indicates a bug */
	if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
		elog(ERROR, "cannot wait on socket event without a socket");

	event = &set->events[set->nevents];
	event->pos = set->nevents++;
	event->fd = fd;
	event->events = events;
	event->user_data = user_data;
#ifdef WIN32
	event->reset = false;
#endif

	if (events == WL_LATCH_SET)
	{
		set->latch = latch;
		set->latch_pos = event->pos;
#if defined(WAIT_USE_SELF_PIPE)
		event->fd = selfpipe_readfd;
#elif defined(WAIT_USE_SIGNALFD)
		event->fd = signal_fd;
#else
		event->fd = PGINVALID_SOCKET;
#ifdef WAIT_USE_EPOLL
		return event->pos;
#endif
#endif
	}
	else if (events == WL_POSTMASTER_DEATH)
	{
#ifndef WIN32
		event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
#endif
	}

	/* perform wait primitive specific initialization, if needed */
#if defined(WAIT_USE_EPOLL)
	WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
#elif defined(WAIT_USE_KQUEUE)
	WaitEventAdjustKqueue(set, event, 0);
#elif defined(WAIT_USE_POLL)
	WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_WIN32)
	WaitEventAdjustWin32(set, event);
#endif

	return event->pos;
}
/*
 * Change the event mask and, in the WL_LATCH_SET case, the latch associated
 * with the WaitEvent.  The latch may be changed to NULL to disable the latch
 * temporarily, and then set back to a latch later.
 *
 * 'pos' is the id returned by AddWaitEventToSet.
 */
void
ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
{
	WaitEvent  *event;
#if defined(WAIT_USE_KQUEUE)
	int			old_events;
#endif

	Assert(pos < set->nevents);

	event = &set->events[pos];
#if defined(WAIT_USE_KQUEUE)
	old_events = event->events;
#endif

	/*
	 * If neither the event mask nor the associated latch changes, return
	 * early. That's an important optimization for some sockets, where
	 * ModifyWaitEvent is frequently used to switch from waiting for reads to
	 * waiting on writes.
	 */
	if (events == event->events &&
		(!(event->events & WL_LATCH_SET) || set->latch == latch))
		return;

	if (event->events & WL_LATCH_SET &&
		events != event->events)
	{
		elog(ERROR, "cannot modify latch event");
	}

	if (event->events & WL_POSTMASTER_DEATH)
	{
		elog(ERROR, "cannot modify postmaster death event");
	}

	/* FIXME: validate event mask */
	event->events = events;

	if (events == WL_LATCH_SET)
	{
		if (latch && latch->owner_pid != MyProcPid)
			elog(ERROR, "cannot wait on a latch owned by another process");
		set->latch = latch;

		/*
		 * On Unix, we don't need to modify the kernel object because the
		 * underlying pipe (if there is one) is the same for all latches so we
		 * can return immediately. On Windows, we need to update our array of
		 * handles, but we leave the old one in place and tolerate spurious
		 * wakeups if the latch is disabled.
		 */
#if defined(WAIT_USE_WIN32)
		if (!latch)
			return;
#else
		return;
#endif
	}

#if defined(WAIT_USE_EPOLL)
	WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
#elif defined(WAIT_USE_KQUEUE)
	WaitEventAdjustKqueue(set, event, old_events);
#elif defined(WAIT_USE_POLL)
	WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_WIN32)
	WaitEventAdjustWin32(set, event);
#endif
}
#if defined(WAIT_USE_EPOLL)

/*
 * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
 */
static void
WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
{
	struct epoll_event epoll_ev;
	int			rc;

	/* pointer to our event, returned by epoll_wait */
	epoll_ev.data.ptr = event;
	/* always wait for errors */
	epoll_ev.events = EPOLLERR | EPOLLHUP;

	/* prepare pollfd entry once */
	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		epoll_ev.events |= EPOLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		epoll_ev.events |= EPOLLIN;
	}
	else
	{
		Assert(event->fd != PGINVALID_SOCKET);
		Assert(event->events & (WL_SOCKET_READABLE |
								WL_SOCKET_WRITEABLE |
								WL_SOCKET_CLOSED));

		if (event->events & WL_SOCKET_READABLE)
			epoll_ev.events |= EPOLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			epoll_ev.events |= EPOLLOUT;
		if (event->events & WL_SOCKET_CLOSED)
			epoll_ev.events |= EPOLLRDHUP;
	}

	/*
	 * Even though unused, we also pass epoll_ev as the data argument if
	 * EPOLL_CTL_DEL is passed as action.  There used to be an epoll bug
	 * requiring that, and actually it makes the code simpler...
	 */
	rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);

	if (rc < 0)
		ereport(ERROR,
				(errcode_for_socket_access(),
				 errmsg("%s() failed: %m",
						"epoll_ctl")));
}
#endif
#if defined(WAIT_USE_POLL)
static void
WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
{
	struct pollfd *pollfd = &set->pollfds[event->pos];

	pollfd->revents = 0;
	pollfd->fd = event->fd;

	/* prepare pollfd entry once */
	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		pollfd->events = POLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		pollfd->events = POLLIN;
	}
	else
	{
		Assert(event->events & (WL_SOCKET_READABLE |
								WL_SOCKET_WRITEABLE |
								WL_SOCKET_CLOSED));
		pollfd->events = 0;
		if (event->events & WL_SOCKET_READABLE)
			pollfd->events |= POLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			pollfd->events |= POLLOUT;
#ifdef POLLRDHUP
		if (event->events & WL_SOCKET_CLOSED)
			pollfd->events |= POLLRDHUP;
#endif
	}

	Assert(event->fd != PGINVALID_SOCKET);
}
#endif
#if defined(WAIT_USE_KQUEUE)

/*
 * On most BSD family systems, the udata member of struct kevent is of type
 * void *, so we could directly convert to/from WaitEvent *.  Unfortunately,
 * NetBSD has it as intptr_t, so here we wallpaper over that difference with
 * an lvalue cast.
 */
#define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))

static inline void
WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
						 WaitEvent *event)
{
	k_ev->ident = event->fd;
	k_ev->filter = filter;
	k_ev->flags = action;
	k_ev->fflags = 0;
	k_ev->data = 0;
	AccessWaitEvent(k_ev) = event;
}

static inline void
WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
{
	/* For now postmaster death can only be added, not removed. */
	k_ev->ident = PostmasterPid;
	k_ev->filter = EVFILT_PROC;
	k_ev->flags = EV_ADD;
	k_ev->fflags = NOTE_EXIT;
	k_ev->data = 0;
	AccessWaitEvent(k_ev) = event;
}

static inline void
WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event)
{
	/* For now latch can only be added, not removed. */
	k_ev->ident = SIGURG;
	k_ev->filter = EVFILT_SIGNAL;
	k_ev->flags = EV_ADD;
	k_ev->fflags = 0;
	k_ev->data = 0;
	AccessWaitEvent(k_ev) = event;
}
/*
 * old_events is the previous event mask, used to compute what has changed.
 */
static void
WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
{
	int			rc;
	struct kevent k_ev[2];
	int			count = 0;
	bool		new_filt_read = false;
	bool		old_filt_read = false;
	bool		new_filt_write = false;
	bool		old_filt_write = false;

	if (old_events == event->events)
		return;

	Assert(event->events != WL_LATCH_SET || set->latch != NULL);
	Assert(event->events == WL_LATCH_SET ||
		   event->events == WL_POSTMASTER_DEATH ||
		   (event->events & (WL_SOCKET_READABLE |
							 WL_SOCKET_WRITEABLE |
							 WL_SOCKET_CLOSED)));

	if (event->events == WL_POSTMASTER_DEATH)
	{
		/*
		 * Unlike all the other implementations, we detect postmaster death
		 * using process notification instead of waiting on the postmaster
		 * alive pipe.
		 */
		WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
	}
	else if (event->events == WL_LATCH_SET)
	{
		/* We detect latch wakeup using a signal event. */
		WaitEventAdjustKqueueAddLatch(&k_ev[count++], event);
	}
	else
	{
		/*
		 * We need to compute the adds and deletes required to get from the
		 * old event mask to the new event mask, since kevent treats readable
		 * and writable as separate events.
		 */
		if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
			old_filt_read = true;
		if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
			new_filt_read = true;
		if (old_events & WL_SOCKET_WRITEABLE)
			old_filt_write = true;
		if (event->events & WL_SOCKET_WRITEABLE)
			new_filt_write = true;
		if (old_filt_read && !new_filt_read)
			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
									 event);
		else if (!old_filt_read && new_filt_read)
			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
									 event);
		if (old_filt_write && !new_filt_write)
			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
									 event);
		else if (!old_filt_write && new_filt_write)
			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
									 event);
	}

	/* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */
	if (count == 0)
		return;

	Assert(count <= 2);

	rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);

	/*
	 * When adding the postmaster's pid, we have to consider that it might
	 * already have exited and perhaps even been replaced by another process
	 * with the same pid.  If so, we have to defer reporting this as an event
	 * until the next call to WaitEventSetWaitBlock().
	 */
	if (rc < 0)
	{
		if (event->events == WL_POSTMASTER_DEATH &&
			(errno == ESRCH || errno == EACCES))
			set->report_postmaster_not_running = true;
		else
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("%s() failed: %m",
							"kevent")));
	}
	else if (event->events == WL_POSTMASTER_DEATH &&
			 PostmasterPid != getppid() &&
			 !PostmasterIsAlive())
	{
		/*
		 * The extra PostmasterIsAliveInternal() check prevents false alarms
		 * on systems that give a different value for getppid() while being
		 * traced by a debugger.
		 */
		set->report_postmaster_not_running = true;
	}
}
#endif
#if defined(WAIT_USE_WIN32)
static void
WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
{
	HANDLE	   *handle = &set->handles[event->pos + 1];

	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		*handle = set->latch->event;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		*handle = PostmasterHandle;
	}
	else
	{
		int			flags = FD_CLOSE;	/* always check for errors/EOF */

		if (event->events & WL_SOCKET_READABLE)
			flags |= FD_READ;
		if (event->events & WL_SOCKET_WRITEABLE)
			flags |= FD_WRITE;
		if (event->events & WL_SOCKET_CONNECTED)
			flags |= FD_CONNECT;
		if (event->events & WL_SOCKET_ACCEPT)
			flags |= FD_ACCEPT;

		if (*handle == WSA_INVALID_EVENT)
		{
			*handle = WSACreateEvent();
			if (*handle == WSA_INVALID_EVENT)
				elog(ERROR, "failed to create event for socket: error code %d",
					 WSAGetLastError());
		}
		if (WSAEventSelect(event->fd, *handle, flags) != 0)
			elog(ERROR, "failed to set up event for socket: error code %d",
				 WSAGetLastError());

		Assert(event->fd != PGINVALID_SOCKET);
	}
}
#endif
/*
 * Wait for events added to the set to happen, or until the timeout is
 * reached.  At most nevents occurred events are returned.
 *
 * If timeout = -1, block until an event occurs; if 0, check sockets for
 * readiness, but don't block; if > 0, block for at most timeout milliseconds.
 *
 * Returns the number of events that occurred, or 0 if the timeout was reached.
 *
 * Returned events will have the fd, pos, user_data fields set to the
 * values associated with the registered event.
 */
int
WaitEventSetWait(WaitEventSet *set, long timeout,
				 WaitEvent *occurred_events, int nevents,
				 uint32 wait_event_info)
{
	int			returned_events = 0;
	instr_time	start_time;
	instr_time	cur_time;
	long		cur_timeout = -1;

	Assert(nevents > 0);

	/*
	 * Initialize timeout if requested.  We must record the current time so
	 * that we can determine the remaining timeout if interrupted.
	 */
	if (timeout >= 0)
	{
		INSTR_TIME_SET_CURRENT(start_time);
		Assert(timeout >= 0 && timeout <= INT_MAX);
		cur_timeout = timeout;
	}
	else
		INSTR_TIME_SET_ZERO(start_time);

	pgstat_report_wait_start(wait_event_info);

#ifndef WIN32
	waiting = true;
#else
	/* Ensure that signals are serviced even if latch is already set */
	pgwin32_dispatch_queued_signals();
#endif
	while (returned_events == 0)
	{
		int			rc;

		/*
		 * Check if the latch is set already. If so, leave the loop
		 * immediately, avoid blocking again. We don't attempt to report any
		 * other events that might also be satisfied.
		 *
		 * If someone sets the latch between this and the
		 * WaitEventSetWaitBlock() below, the setter will write a byte to the
		 * pipe (or signal us and the signal handler will do that), and the
		 * readiness routine will return immediately.
		 *
		 * On Unix, if there's a pending byte in the self pipe, we'll notice
		 * whenever blocking. Only clearing the pipe in that case avoids
		 * having to drain it every time WaitLatchOrSocket() is used. Should
		 * the pipe-buffer fill up we're still ok, because the pipe is in
		 * nonblocking mode. It's unlikely for that to happen, because the
		 * self pipe isn't filled unless we're blocking (waiting = true), or
		 * from inside a signal handler in latch_sigurg_handler().
		 *
		 * On Windows, we'll also notice if there's a pending event for the
		 * latch when blocking, but there's no danger of anything filling up,
		 * as "Setting an event that is already set has no effect.".
		 *
		 * Note: we assume that the kernel calls involved in latch management
		 * will provide adequate synchronization on machines with weak memory
		 * ordering, so that we cannot miss seeing is_set if a notification
		 * has already been queued.
		 */
		if (set->latch && !set->latch->is_set)
		{
			/* about to sleep on a latch */
			set->latch->maybe_sleeping = true;
			pg_memory_barrier();
			/* and recheck */
		}

		if (set->latch && set->latch->is_set)
		{
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->pos = set->latch_pos;
			occurred_events->user_data =
				set->events[set->latch_pos].user_data;
			occurred_events->events = WL_LATCH_SET;
			occurred_events++;
			returned_events++;

			/* could have been set above */
			set->latch->maybe_sleeping = false;

			break;
		}

		/*
		 * Wait for events using the readiness primitive chosen at the top of
		 * this file. If -1 is returned, a timeout has occurred, if 0 we have
		 * to retry, everything >= 1 is the number of returned events.
		 */
		rc = WaitEventSetWaitBlock(set, cur_timeout,
								   occurred_events, nevents);

		if (set->latch)
		{
			Assert(set->latch->maybe_sleeping);
			set->latch->maybe_sleeping = false;
		}

		if (rc == -1)
			break;				/* timeout occurred */
		else
			returned_events = rc;

		/* If we're not done, update cur_timeout for next iteration */
		if (returned_events == 0 && timeout >= 0)
		{
			INSTR_TIME_SET_CURRENT(cur_time);
			INSTR_TIME_SUBTRACT(cur_time, start_time);
			cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
			if (cur_timeout <= 0)
				break;
		}
	}
#ifndef WIN32
	waiting = false;
#endif

	pgstat_report_wait_end();

	return returned_events;
}
#if defined(WAIT_USE_EPOLL)

/*
 * Wait using Linux's epoll_wait(2).
 *
 * This is the preferable wait method, as several readiness notifications are
 * delivered, without having to iterate through all of set->events. The
 * returned epoll_event structs contain a pointer to our events, making
 * association easy.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct epoll_event *cur_epoll_event;

	/* Sleep */
	rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
					Min(nevents, set->nevents_space), cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("%s() failed: %m",
							"epoll_wait")));
		}
		return 0;
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * At least one event occurred, iterate over the returned epoll events
	 * until they're either all processed, or we've returned all the events
	 * the caller desired.
	 */
	for (cur_epoll_event = set->epoll_ret_events;
		 cur_epoll_event < (set->epoll_ret_events + rc) &&
		 returned_events < nevents;
		 cur_epoll_event++)
	{
		/* epoll's data pointer is set to the associated WaitEvent */
		cur_event = (WaitEvent *) cur_epoll_event->data.ptr;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/* Drain the signalfd. */
			drain();

			if (set->latch && set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/*
			 * We expect an EPOLLHUP when the remote end is closed, but
			 * because we don't expect the pipe to become readable or to have
			 * any errors either, treat those cases as postmaster death, too.
			 *
			 * Be paranoid about a spurious event signaling the postmaster as
			 * being dead. There have been reports about that happening with
			 * older primitives (select(2) to be specific), and a spurious
			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
			 * cost much.
			 */
			if (!PostmasterIsAliveInternal())
			{
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE |
									  WL_SOCKET_WRITEABLE |
									  WL_SOCKET_CLOSED))
		{
			Assert(cur_event->fd != PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
			{
				/* writable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			if ((cur_event->events & WL_SOCKET_CLOSED) &&
				(cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
			{
				/* remote peer shut down, or error */
				occurred_events->events |= WL_SOCKET_CLOSED;
			}

			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}
#elif defined(WAIT_USE_KQUEUE)

/*
 * Wait using kevent(2) on BSD-family systems and macOS.
 *
 * For now this mirrors the epoll code, but in future it could modify the fd
 * set in the same call to kevent as it uses for waiting instead of doing that
 * with separate system calls.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct kevent *cur_kqueue_event;
	struct timespec timeout;
	struct timespec *timeout_p;

	if (cur_timeout < 0)
		timeout_p = NULL;
	else
	{
		timeout.tv_sec = cur_timeout / 1000;
		timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
		timeout_p = &timeout;
	}

	/*
	 * Report postmaster events discovered by WaitEventAdjustKqueue() or an
	 * earlier call to WaitEventSetWait().
	 */
	if (unlikely(set->report_postmaster_not_running))
	{
		if (set->exit_on_postmaster_death)
			proc_exit(1);
		occurred_events->fd = PGINVALID_SOCKET;
		occurred_events->events = WL_POSTMASTER_DEATH;
		return 1;
	}

	/* Sleep */
	rc = kevent(set->kqueue_fd, NULL, 0,
				set->kqueue_ret_events,
				Min(nevents, set->nevents_space),
				timeout_p);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("%s() failed: %m",
							"kevent")));
		}
		return 0;
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * At least one event occurred, iterate over the returned kqueue events
	 * until they're either all processed, or we've returned all the events
	 * the caller desired.
	 */
	for (cur_kqueue_event = set->kqueue_ret_events;
		 cur_kqueue_event < (set->kqueue_ret_events + rc) &&
		 returned_events < nevents;
		 cur_kqueue_event++)
	{
		/* kevent's udata points to the associated WaitEvent */
		cur_event = AccessWaitEvent(cur_kqueue_event);

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			cur_kqueue_event->filter == EVFILT_SIGNAL)
		{
			if (set->latch && set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 cur_kqueue_event->filter == EVFILT_PROC &&
				 (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
		{
			/*
			 * The kernel will tell this kqueue object only once about the
			 * exit of the postmaster, so let's remember that for next time so
			 * that we provide level-triggered semantics.
			 */
			set->report_postmaster_not_running = true;

			if (set->exit_on_postmaster_death)
				proc_exit(1);
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->events = WL_POSTMASTER_DEATH;
			occurred_events++;
			returned_events++;
		}
		else if (cur_event->events & (WL_SOCKET_READABLE |
									  WL_SOCKET_WRITEABLE |
									  WL_SOCKET_CLOSED))
		{
			Assert(cur_event->fd >= 0);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_kqueue_event->filter == EVFILT_READ))
			{
				/* readable, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_CLOSED) &&
				(cur_kqueue_event->filter == EVFILT_READ) &&
				(cur_kqueue_event->flags & EV_EOF))
			{
				/* the remote peer has shut down */
				occurred_events->events |= WL_SOCKET_CLOSED;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_kqueue_event->filter == EVFILT_WRITE))
			{
				/* writable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}
#elif defined(WAIT_USE_POLL)

/*
 * Wait using poll(2).
 *
 * This allows receiving readiness notifications for several events at once,
 * but requires iterating through all of set->pollfds.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct pollfd *cur_pollfd;

	/* Sleep */
	rc = poll(set->pollfds, set->nevents, (int) cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("%s() failed: %m",
							"poll")));
		}
		return 0;
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	for (cur_event = set->events, cur_pollfd = set->pollfds;
		 cur_event < (set->events + set->nevents) &&
		 returned_events < nevents;
		 cur_event++, cur_pollfd++)
	{
		/* no activity on this FD, skip */
		if (cur_pollfd->revents == 0)
			continue;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			(cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/* There's data in the self-pipe, clear it. */
			drain();

			if (set->latch && set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/*
			 * We expect a POLLHUP when the remote end is closed, but because
			 * we don't expect the pipe to become readable or to have any
			 * errors either, treat those cases as postmaster death, too.
			 *
			 * Be paranoid about a spurious event signaling the postmaster as
			 * being dead. There have been reports about that happening with
			 * older primitives (select(2) to be specific), and a spurious
			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
			 * cost much.
			 */
			if (!PostmasterIsAliveInternal())
			{
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE |
									  WL_SOCKET_WRITEABLE |
									  WL_SOCKET_CLOSED))
		{
			int			errflags = POLLHUP | POLLERR | POLLNVAL;

			Assert(cur_event->fd >= PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_pollfd->revents & (POLLIN | errflags)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_pollfd->revents & (POLLOUT | errflags)))
			{
				/* writeable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

#ifdef POLLRDHUP
			if ((cur_event->events & WL_SOCKET_CLOSED) &&
				(cur_pollfd->revents & (POLLRDHUP | errflags)))
			{
				/* remote peer closed, or error */
				occurred_events->events |= WL_SOCKET_CLOSED;
			}
#endif

			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}
	return returned_events;
}
#elif defined(WAIT_USE_WIN32)

/*
 * Wait using Windows' WaitForMultipleObjects(). Each call only "consumes" one
 * event, so we keep calling until we've filled up our output buffer to match
 * the behavior of the other implementations.
 *
 * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	DWORD		rc;
	WaitEvent  *cur_event;

	/* Reset any wait events that need it */
	for (cur_event = set->events;
		 cur_event < (set->events + set->nevents);
		 cur_event++)
	{
		if (cur_event->reset)
		{
			WaitEventAdjustWin32(set, cur_event);
			cur_event->reset = false;
		}

		/*
		 * We associate the socket with a new event handle for each
		 * WaitEventSet.  FD_CLOSE is only generated once if the other end
		 * closes gracefully.  Therefore we might miss the FD_CLOSE
		 * notification, if it was delivered to another event after we stopped
		 * waiting for it.  Close that race by peeking for EOF after setting
		 * up this handle to receive notifications, and before entering the
		 * sleep.
		 *
		 * XXX If we had one event handle for the lifetime of a socket, we
		 * wouldn't need this.
		 */
		if (cur_event->events & WL_SOCKET_READABLE)
		{
			char		c;
			WSABUF		buf;
			DWORD		received;
			DWORD		flags;

			buf.buf = &c;
			buf.len = 1;
			flags = MSG_PEEK;
			if (WSARecv(cur_event->fd, &buf, 1, &received, &flags, NULL, NULL) == 0)
			{
				occurred_events->pos = cur_event->pos;
				occurred_events->user_data = cur_event->user_data;
				occurred_events->events = WL_SOCKET_READABLE;
				occurred_events->fd = cur_event->fd;
				return 1;
			}
		}

		/*
		 * Windows does not guarantee to log an FD_WRITE network event
		 * indicating that more data can be sent unless the previous send()
		 * failed with WSAEWOULDBLOCK.  While our caller might well have made
		 * such a call, we cannot assume that here.  Therefore, if waiting for
		 * write-ready, force the issue by doing a dummy send().  If the dummy
		 * send() succeeds, assume that the socket is in fact write-ready, and
		 * return immediately.  Also, if it fails with something other than
		 * WSAEWOULDBLOCK, return a write-ready indication to let our caller
		 * deal with the error condition.
		 */
		if (cur_event->events & WL_SOCKET_WRITEABLE)
		{
			char		c;
			WSABUF		buf;
			DWORD		sent;
			int			r;

			buf.buf = &c;
			buf.len = 0;

			r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
			if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
			{
				occurred_events->pos = cur_event->pos;
				occurred_events->user_data = cur_event->user_data;
				occurred_events->events = WL_SOCKET_WRITEABLE;
				occurred_events->fd = cur_event->fd;
				return 1;
			}
		}
	}

	/*
	 * Sleep.
	 *
	 * Need to wait for ->nevents + 1, because signal handle is in [0].
	 */
	rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
								cur_timeout);

	/* Check return code */
	if (rc == WAIT_FAILED)
		elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
			 GetLastError());
	else if (rc == WAIT_TIMEOUT)
	{
		/* timeout exceeded */
		return -1;
	}

	if (rc == WAIT_OBJECT_0)
	{
		/* Service newly-arrived signals */
		pgwin32_dispatch_queued_signals();
		return 0;				/* retry */
	}

	/*
	 * With an offset of one, due to the always present pgwin32_signal_event,
	 * the handle offset directly corresponds to a wait event.
	 */
	cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];

	for (;;)
	{
		int			next_pos;
		int			count;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET)
		{
			/*
			 * We cannot use set->latch->event to reset the fired event if we
			 * aren't waiting on this latch now.
			 */
			if (!ResetEvent(set->handles[cur_event->pos + 1]))
				elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());

			if (set->latch && set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH)
		{
			/*
			 * Postmaster apparently died.  Since the consequences of falsely
			 * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we
			 * take the trouble to positively verify this with
			 * PostmasterIsAlive(), even though there is no known reason to
			 * think that the event could be falsely set on Windows.
			 */
			if (!PostmasterIsAliveInternal())
			{
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & WL_SOCKET_MASK)
		{
			WSANETWORKEVENTS resEvents;
			HANDLE		handle = set->handles[cur_event->pos + 1];

			Assert(cur_event->fd);

			occurred_events->fd = cur_event->fd;

			ZeroMemory(&resEvents, sizeof(resEvents));
			if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
				elog(ERROR, "failed to enumerate network events: error code %d",
					 WSAGetLastError());
			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(resEvents.lNetworkEvents & FD_READ))
			{
				/* data available in socket */
				occurred_events->events |= WL_SOCKET_READABLE;

				/*------
				 * WaitForMultipleObjects doesn't guarantee that a read event
				 * will be returned if the latch is set at the same time.  Even
				 * if it did, the caller might drop that event expecting it to
				 * reoccur on next call.  So, we must force the event to be
				 * reset if this WaitEventSet is used again in order to avoid
				 * an indefinite hang.
				 *
				 * Refer
				 * https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
				 * for the behavior of socket events.
				 *------
				 */
				cur_event->reset = true;
			}
			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(resEvents.lNetworkEvents & FD_WRITE))
			{
				/* writeable */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}
			if ((cur_event->events & WL_SOCKET_CONNECTED) &&
				(resEvents.lNetworkEvents & FD_CONNECT))
			{
				/* connected */
				occurred_events->events |= WL_SOCKET_CONNECTED;
			}
			if ((cur_event->events & WL_SOCKET_ACCEPT) &&
				(resEvents.lNetworkEvents & FD_ACCEPT))
			{
				/* incoming connection could be accepted */
				occurred_events->events |= WL_SOCKET_ACCEPT;
			}
			if (resEvents.lNetworkEvents & FD_CLOSE)
			{
				/* EOF/error, so signal all caller-requested socket flags */
				occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
			}

			if (occurred_events->events != 0)
			{
				occurred_events++;
				returned_events++;
			}
		}

		/* Is the output buffer full? */
		if (returned_events == nevents)
			break;

		/* Have we run out of possible events? */
		next_pos = cur_event->pos + 1;
		if (next_pos == set->nevents)
			break;

		/*
		 * Poll the rest of the event handles in the array starting at
		 * next_pos being careful to skip over the initial signal handle too.
		 * This time we use a zero timeout.
		 */
		count = set->nevents - next_pos;
		rc = WaitForMultipleObjects(count,
									set->handles + 1 + next_pos,
									false, 0);

		/*
		 * We don't distinguish between errors and WAIT_TIMEOUT here because
		 * we already have events to report.
		 */
		if (rc < WAIT_OBJECT_0 || rc >= WAIT_OBJECT_0 + count)
			break;

		/* We have another event to decode. */
		cur_event = &set->events[next_pos + (rc - WAIT_OBJECT_0)];
	}

	return returned_events;
}
#endif
/*
 * Return whether the current build options can report WL_SOCKET_CLOSED.
 */
bool
WaitEventSetCanReportClosed(void)
{
#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
	defined(WAIT_USE_EPOLL) || \
	defined(WAIT_USE_KQUEUE)
	return true;
#else
	return false;
#endif
}
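/*
 * Editor's sketch (not in the original): a caller would typically test this
 * before relying on WL_SOCKET_CLOSED, e.g.
 *
 *     if (WaitEventSetCanReportClosed())
 *         AddWaitEventToSet(set, WL_SOCKET_CLOSED, sock, NULL, NULL);
 *
 * and fall back to some other liveness check otherwise.
 */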
/*
 * Get the number of wait events registered in a given WaitEventSet.
 */
int
GetNumRegisteredWaitEvents(WaitEventSet *set)
{
	return set->nevents;
}
#if defined(WAIT_USE_SELF_PIPE)

/*
 * SetLatch uses SIGURG to wake up the process waiting on the latch.
 *
 * Wake up WaitLatch, if we're waiting.
 */
static void
latch_sigurg_handler(SIGNAL_ARGS)
{
	if (waiting)
		sendSelfPipeByte();
}

/* Send one byte to the self-pipe, to wake up WaitLatch */
static void
sendSelfPipeByte(void)
{
	int			rc;
	char		dummy = 0;

retry:
	rc = write(selfpipe_writefd, &dummy, 1);
	if (rc < 0)
	{
		/* If interrupted by signal, just retry */
		if (errno == EINTR)
			goto retry;

		/*
		 * If the pipe is full, we don't need to retry, the data that's there
		 * already is enough to wake up WaitLatch.
		 */
		if (errno == EAGAIN || errno == EWOULDBLOCK)
			return;

		/*
		 * Oops, the write() failed for some other reason. We might be in a
		 * signal handler, so it's not safe to elog(). We have no choice but
		 * silently ignore the error.
		 */
		return;
	}
}
#endif
#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)

/*
 * Read all available data from self-pipe or signalfd.
 *
 * Note: this is only called when waiting = true.  If it fails and doesn't
 * return, it must reset that flag first (though ideally, this will never
 * happen).
 */
static void
drain(void)
{
	char		buf[1024];
	int			rc;
	int			fd;

#ifdef WAIT_USE_SELF_PIPE
	fd = selfpipe_readfd;
#else
	fd = signal_fd;
#endif

	for (;;)
	{
		rc = read(fd, buf, sizeof(buf));
		if (rc < 0)
		{
			if (errno == EAGAIN || errno == EWOULDBLOCK)
				break;			/* the descriptor is empty */
			else if (errno == EINTR)
				continue;		/* retry */
			else
			{
				waiting = false;
#ifdef WAIT_USE_SELF_PIPE
				elog(ERROR, "read() on self-pipe failed: %m");
#else
				elog(ERROR, "read() on signalfd failed: %m");
#endif
			}
		}
		else if (rc == 0)
		{
			waiting = false;
#ifdef WAIT_USE_SELF_PIPE
			elog(ERROR, "unexpected EOF on self-pipe");
#else
			elog(ERROR, "unexpected EOF on signalfd");
#endif
		}
		else if (rc < sizeof(buf))
		{
			/* we successfully drained the pipe; no need to read() again */
			break;
		}
		/* else buffer wasn't big enough, so read again */
	}
}
#endif
static void
ResOwnerReleaseWaitEventSet(Datum res)
{
	WaitEventSet *set = (WaitEventSet *) DatumGetPointer(res);

	Assert(set->owner != NULL);
	set->owner = NULL;
	FreeWaitEventSet(set);
}