src/backend/port/sysv_sema.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * sysv_sema.c
   4  *        Implement PGSemaphores using SysV semaphore facilities
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL$
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <signal.h>
  18 #include <unistd.h>
  19 #include <sys/file.h>
  20 #ifdef HAVE_SYS_IPC_H
  21 #include <sys/ipc.h>
  22 #endif
  23 #ifdef HAVE_SYS_SEM_H
  24 #include <sys/sem.h>
  25 #endif
  26 #ifdef HAVE_KERNEL_OS_H
  27 #include <kernel/OS.h>
  28 #endif
  29
  30 #include "miscadmin.h"
  31 #include "storage/ipc.h"
  32 #include "storage/pg_sema.h"
  33
  34
  35 #ifndef HAVE_UNION_SEMUN
  36 union semun
  37 {
  38         int                     val;
  39         struct semid_ds *buf;
  40         unsigned short *array;
  41 };
  42 #endif
  43
  44 typedef key_t IpcSemaphoreKey;  /* semaphore key passed to semget(2) */
  45 typedef int IpcSemaphoreId;             /* semaphore ID returned by semget(2) */
  46
  47 /*
  48  * SEMAS_PER_SET is the number of useful semaphores in each semaphore set
  49  * we allocate.  It must be *less than* your kernel's SEMMSL (max semaphores
  50  * per set) parameter, which is often around 25.  (Less than, because we
  51  * allocate one extra sema in each set for identification purposes.)
  52  */
  53 #define SEMAS_PER_SET   16
  54
  55 #define IPCProtection   (0600)  /* access/modify by user only */
  56
  57 #define PGSemaMagic             537             /* must be less than SEMVMX */
  58
  59
  60 static IpcSemaphoreId *mySemaSets;              /* IDs of sema sets acquired so far */
  61 static int      numSemaSets;            /* number of sema sets acquired so far */
  62 static int      maxSemaSets;            /* allocated size of mySemaSets array */
  63 static IpcSemaphoreKey nextSemaKey;             /* next key to try using */
  64 static int      nextSemaNumber;         /* next free sem num in last sema set */
  65
  66
  67 static IpcSemaphoreId InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey,
  68                                                    int numSems);
  69 static void IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum,
  70                                            int value);
  71 static void IpcSemaphoreKill(IpcSemaphoreId semId);
  72 static int      IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum);
  73 static pid_t IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum);
  74 static IpcSemaphoreId IpcSemaphoreCreate(int numSems);
  75 static void ReleaseSemaphores(int status, Datum arg);
  76
  77
  78 /*
  79  * InternalIpcSemaphoreCreate
  80  *
  81  * Attempt to create a new semaphore set with the specified key.
  82  * Will fail (return -1) if such a set already exists.
  83  *
  84  * If we fail with a failure code other than collision-with-existing-set,
  85  * print out an error and abort.  Other types of errors suggest nonrecoverable
  86  * problems.
  87  */
  88 static IpcSemaphoreId
  89 InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems)
  90 {
  91         int                     semId;
  92
  93         semId = semget(semKey, numSems, IPC_CREAT | IPC_EXCL | IPCProtection);
  94
  95         if (semId < 0)
  96         {
  97                 /*
  98                  * Fail quietly if error indicates a collision with existing set. One
  99                  * would expect EEXIST, given that we said IPC_EXCL, but perhaps we
 100                  * could get a permission violation instead?  Also, EIDRM might occur
 101                  * if an old set is slated for destruction but not gone yet.
 102                  */
 103                 if (errno == EEXIST || errno == EACCES
 104 #ifdef EIDRM
 105                         || errno == EIDRM
 106 #endif
 107                         )
 108                         return -1;
 109
 110                 /*
 111                  * Else complain and abort
 112                  */
 113                 ereport(FATAL,
 114                                 (errmsg("could not create semaphores: %m"),
 115                                  errdetail("Failed system call was semget(%lu, %d, 0%o).",
 116                                                    (unsigned long) semKey, numSems,
 117                                                    IPC_CREAT | IPC_EXCL | IPCProtection),
 118                                  (errno == ENOSPC) ?
 119                                  errhint("This error does *not* mean that you have run out of disk space.\n"
 120                   "It occurs when either the system limit for the maximum number of "
 121                          "semaphore sets (SEMMNI), or the system wide maximum number of "
 122                         "semaphores (SEMMNS), would be exceeded.  You need to raise the "
 123                   "respective kernel parameter.  Alternatively, reduce PostgreSQL's "
 124                 "consumption of semaphores by reducing its max_connections parameter "
 125                                                  "(currently %d).\n"
 126                           "The PostgreSQL documentation contains more information about "
 127                                                  "configuring your system for PostgreSQL.",
 128                                                  MaxBackends) : 0));
 129         }
 130
 131         return semId;
 132 }
 133
 134 /*
 135  * Initialize a semaphore to the specified value.
 136  */
 137 static void
 138 IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, int value)
 139 {
 140         union semun semun;
 141
 142         semun.val = value;
 143         if (semctl(semId, semNum, SETVAL, semun) < 0)
 144                 ereport(FATAL,
 145                                 (errmsg_internal("semctl(%d, %d, SETVAL, %d) failed: %m",
 146                                                                  semId, semNum, value),
 147                                  (errno == ERANGE) ?
 148                                  errhint("You possibly need to raise your kernel's SEMVMX value to be at least "
 149                                   "%d.  Look into the PostgreSQL documentation for details.",
 150                                                  value) : 0));
 151 }
 152
 153 /*
 154  * IpcSemaphoreKill(semId)      - removes a semaphore set
 155  */
 156 static void
 157 IpcSemaphoreKill(IpcSemaphoreId semId)
 158 {
 159         union semun semun;
 160
 161         semun.val = 0;                          /* unused, but keep compiler quiet */
 162
 163         if (semctl(semId, 0, IPC_RMID, semun) < 0)
 164                 elog(LOG, "semctl(%d, 0, IPC_RMID, ...) failed: %m", semId);
 165 }
 166
 167 /* Get the current value (semval) of the semaphore */
 168 static int
 169 IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum)
 170 {
 171         union semun dummy;                      /* for Solaris */
 172
 173         dummy.val = 0;                          /* unused */
 174
 175         return semctl(semId, semNum, GETVAL, dummy);
 176 }
 177
 178 /* Get the PID of the last process to do semop() on the semaphore */
 179 static pid_t
 180 IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum)
 181 {
 182         union semun dummy;                      /* for Solaris */
 183
 184         dummy.val = 0;                          /* unused */
 185
 186         return semctl(semId, semNum, GETPID, dummy);
 187 }
 188
 189
 190 /*
 191  * Create a semaphore set with the given number of useful semaphores
 192  * (an additional sema is actually allocated to serve as identifier).
 193  * Dead Postgres sema sets are recycled if found, but we do not fail
 194  * upon collision with non-Postgres sema sets.
 195  *
 196  * The idea here is to detect and re-use keys that may have been assigned
 197  * by a crashed postmaster or backend.
 198  */
 199 static IpcSemaphoreId
 200 IpcSemaphoreCreate(int numSems)
 201 {
 202         IpcSemaphoreId semId;
 203         union semun semun;
 204         PGSemaphoreData mysema;
 205
 206         /* Loop till we find a free IPC key */
 207         for (nextSemaKey++;; nextSemaKey++)
 208         {
 209                 pid_t           creatorPID;
 210
 211                 /* Try to create new semaphore set */
 212                 semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1);
 213                 if (semId >= 0)
 214                         break;                          /* successful create */
 215
 216                 /* See if it looks to be leftover from a dead Postgres process */
 217                 semId = semget(nextSemaKey, numSems + 1, 0);
 218                 if (semId < 0)
 219                         continue;                       /* failed: must be some other app's */
 220                 if (IpcSemaphoreGetValue(semId, numSems) != PGSemaMagic)
 221                         continue;                       /* sema belongs to a non-Postgres app */
 222
 223                 /*
 224                  * If the creator PID is my own PID or does not belong to any extant
 225                  * process, it's safe to zap it.
 226                  */
 227                 creatorPID = IpcSemaphoreGetLastPID(semId, numSems);
 228                 if (creatorPID <= 0)
 229                         continue;                       /* oops, GETPID failed */
 230                 if (creatorPID != getpid())
 231                 {
 232                         if (kill(creatorPID, 0) == 0 || errno != ESRCH)
 233                                 continue;               /* sema belongs to a live process */
 234                 }
 235
 236                 /*
 237                  * The sema set appears to be from a dead Postgres process, or from a
 238                  * previous cycle of life in this same process.  Zap it, if possible.
 239                  * This probably shouldn't fail, but if it does, assume the sema set
 240                  * belongs to someone else after all, and continue quietly.
 241                  */
 242                 semun.val = 0;                  /* unused, but keep compiler quiet */
 243                 if (semctl(semId, 0, IPC_RMID, semun) < 0)
 244                         continue;
 245
 246                 /*
 247                  * Now try again to create the sema set.
 248                  */
 249                 semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1);
 250                 if (semId >= 0)
 251                         break;                          /* successful create */
 252
 253                 /*
 254                  * Can only get here if some other process managed to create the same
 255                  * sema key before we did.      Let him have that one, loop around to try
 256                  * next key.
 257                  */
 258         }
 259
 260         /*
 261          * OK, we created a new sema set.  Mark it as created by this process. We
 262          * do this by setting the spare semaphore to PGSemaMagic-1 and then
 263          * incrementing it with semop().  That leaves it with value PGSemaMagic
 264          * and sempid referencing this process.
 265          */
 266         IpcSemaphoreInitialize(semId, numSems, PGSemaMagic - 1);
 267         mysema.semId = semId;
 268         mysema.semNum = numSems;
 269         PGSemaphoreUnlock(&mysema);
 270
 271         return semId;
 272 }
 273
 274
 275 /*
 276  * PGReserveSemaphores --- initialize semaphore support
 277  *
 278  * This is called during postmaster start or shared memory reinitialization.
 279  * It should do whatever is needed to be able to support up to maxSemas
 280  * subsequent PGSemaphoreCreate calls.  Also, if any system resources
 281  * are acquired here or in PGSemaphoreCreate, register an on_shmem_exit
 282  * callback to release them.
 283  *
 284  * The port number is passed for possible use as a key (for SysV, we use
 285  * it to generate the starting semaphore key).  In a standalone backend,
 286  * zero will be passed.
 287  *
 288  * In the SysV implementation, we acquire semaphore sets on-demand; the
 289  * maxSemas parameter is just used to size the array that keeps track of
 290  * acquired sets for subsequent releasing.
 291  */
 292 void
 293 PGReserveSemaphores(int maxSemas, int port)
 294 {
 295         maxSemaSets = (maxSemas + SEMAS_PER_SET - 1) / SEMAS_PER_SET;
 296         mySemaSets = (IpcSemaphoreId *)
 297                 malloc(maxSemaSets * sizeof(IpcSemaphoreId));
 298         if (mySemaSets == NULL)
 299                 elog(PANIC, "out of memory");
 300         numSemaSets = 0;
 301         nextSemaKey = port * 1000;
 302         nextSemaNumber = SEMAS_PER_SET;         /* force sema set alloc on 1st call */
 303
 304         on_shmem_exit(ReleaseSemaphores, 0);
 305 }
 306
 307 /*
 308  * Release semaphores at shutdown or shmem reinitialization
 309  *
 310  * (called as an on_shmem_exit callback, hence funny argument list)
 311  */
 312 static void
 313 ReleaseSemaphores(int status, Datum arg)
 314 {
 315         int                     i;
 316
 317         for (i = 0; i < numSemaSets; i++)
 318                 IpcSemaphoreKill(mySemaSets[i]);
 319         free(mySemaSets);
 320 }
 321
 322 /*
 323  * PGSemaphoreCreate
 324  *
 325  * Initialize a PGSemaphore structure to represent a sema with count 1
 326  */
 327 void
 328 PGSemaphoreCreate(PGSemaphore sema)
 329 {
 330         /* Can't do this in a backend, because static state is postmaster's */
 331         Assert(!IsUnderPostmaster);
 332
 333         if (nextSemaNumber >= SEMAS_PER_SET)
 334         {
 335                 /* Time to allocate another semaphore set */
 336                 if (numSemaSets >= maxSemaSets)
 337                         elog(PANIC, "too many semaphores created");
 338                 mySemaSets[numSemaSets] = IpcSemaphoreCreate(SEMAS_PER_SET);
 339                 numSemaSets++;
 340                 nextSemaNumber = 0;
 341         }
 342         /* Assign the next free semaphore in the current set */
 343         sema->semId = mySemaSets[numSemaSets - 1];
 344         sema->semNum = nextSemaNumber++;
 345         /* Initialize it to count 1 */
 346         IpcSemaphoreInitialize(sema->semId, sema->semNum, 1);
 347 }
 348
 349 /*
 350  * PGSemaphoreReset
 351  *
 352  * Reset a previously-initialized PGSemaphore to have count 0
 353  */
 354 void
 355 PGSemaphoreReset(PGSemaphore sema)
 356 {
 357         IpcSemaphoreInitialize(sema->semId, sema->semNum, 0);
 358 }
 359
 360 /*
 361  * PGSemaphoreLock
 362  *
 363  * Lock a semaphore (decrement count), blocking if count would be < 0
 364  */
 365 void
 366 PGSemaphoreLock(PGSemaphore sema, bool interruptOK)
 367 {
 368         int                     errStatus;
 369         struct sembuf sops;
 370
 371         sops.sem_op = -1;                       /* decrement */
 372         sops.sem_flg = 0;
 373         sops.sem_num = sema->semNum;
 374
 375         /*
 376          * Note: if errStatus is -1 and errno == EINTR then it means we returned
 377          * from the operation prematurely because we were sent a signal.  So we
 378          * try and lock the semaphore again.
 379          *
 380          * Each time around the loop, we check for a cancel/die interrupt.      On
 381          * some platforms, if such an interrupt comes in while we are waiting, it
 382          * will cause the semop() call to exit with errno == EINTR, allowing us to
 383          * service the interrupt (if not in a critical section already) during the
 384          * next loop iteration.
 385          *
 386          * Once we acquire the lock, we do NOT check for an interrupt before
 387          * returning.  The caller needs to be able to record ownership of the lock
 388          * before any interrupt can be accepted.
 389          *
 390          * There is a window of a few instructions between CHECK_FOR_INTERRUPTS
 391          * and entering the semop() call.  If a cancel/die interrupt occurs in
 392          * that window, we would fail to notice it until after we acquire the lock
 393          * (or get another interrupt to escape the semop()).  We can avoid this
 394          * problem by temporarily setting ImmediateInterruptOK to true before we
 395          * do CHECK_FOR_INTERRUPTS; then, a die() interrupt in this interval will
 396          * execute directly.  However, there is a huge pitfall: there is another
 397          * window of a few instructions after the semop() before we are able to
 398          * reset ImmediateInterruptOK.  If an interrupt occurs then, we'll lose
 399          * control, which means that the lock has been acquired but our caller did
 400          * not get a chance to record the fact. Therefore, we only set
 401          * ImmediateInterruptOK if the caller tells us it's OK to do so, ie, the
 402          * caller does not need to record acquiring the lock.  (This is currently
 403          * true for lockmanager locks, since the process that granted us the lock
 404          * did all the necessary state updates. It's not true for SysV semaphores
 405          * used to implement LW locks or emulate spinlocks --- but the wait time
 406          * for such locks should not be very long, anyway.)
 407          *
 408          * On some platforms, signals marked SA_RESTART (which is most, for us)
 409          * will not interrupt the semop(); it will just keep waiting.  Therefore
 410          * it's necessary for cancel/die interrupts to be serviced directly by the
 411          * signal handler.      On these platforms the behavior is really the same
 412          * whether the signal arrives just before the semop() begins, or while it
 413          * is waiting.  The loop on EINTR is thus important only for other types
 414          * of interrupts.
 415          */
 416         do
 417         {
 418                 ImmediateInterruptOK = interruptOK;
 419                 CHECK_FOR_INTERRUPTS();
 420                 errStatus = semop(sema->semId, &sops, 1);
 421                 ImmediateInterruptOK = false;
 422         } while (errStatus < 0 && errno == EINTR);
 423
 424         if (errStatus < 0)
 425                 elog(FATAL, "semop(id=%d) failed: %m", sema->semId);
 426 }
 427
 428 /*
 429  * PGSemaphoreUnlock
 430  *
 431  * Unlock a semaphore (increment count)
 432  */
 433 void
 434 PGSemaphoreUnlock(PGSemaphore sema)
 435 {
 436         int                     errStatus;
 437         struct sembuf sops;
 438
 439         sops.sem_op = 1;                        /* increment */
 440         sops.sem_flg = 0;
 441         sops.sem_num = sema->semNum;
 442
 443         /*
 444          * Note: if errStatus is -1 and errno == EINTR then it means we returned
 445          * from the operation prematurely because we were sent a signal.  So we
 446          * try and unlock the semaphore again. Not clear this can really happen,
 447          * but might as well cope.
 448          */
 449         do
 450         {
 451                 errStatus = semop(sema->semId, &sops, 1);
 452         } while (errStatus < 0 && errno == EINTR);
 453
 454         if (errStatus < 0)
 455                 elog(FATAL, "semop(id=%d) failed: %m", sema->semId);
 456 }
 457
 458 /*
 459  * PGSemaphoreTryLock
 460  *
 461  * Lock a semaphore only if able to do so without blocking
 462  */
 463 bool
 464 PGSemaphoreTryLock(PGSemaphore sema)
 465 {
 466         int                     errStatus;
 467         struct sembuf sops;
 468
 469         sops.sem_op = -1;                       /* decrement */
 470         sops.sem_flg = IPC_NOWAIT;      /* but don't block */
 471         sops.sem_num = sema->semNum;
 472
 473         /*
 474          * Note: if errStatus is -1 and errno == EINTR then it means we returned
 475          * from the operation prematurely because we were sent a signal.  So we
 476          * try and lock the semaphore again.
 477          */
 478         do
 479         {
 480                 errStatus = semop(sema->semId, &sops, 1);
 481         } while (errStatus < 0 && errno == EINTR);
 482
 483         if (errStatus < 0)
 484         {
 485                 /* Expect EAGAIN or EWOULDBLOCK (platform-dependent) */
 486 #ifdef EAGAIN
 487                 if (errno == EAGAIN)
 488                         return false;           /* failed to lock it */
 489 #endif
 490 #if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
 491                 if (errno == EWOULDBLOCK)
 492                         return false;           /* failed to lock it */
 493 #endif
 494                 /* Otherwise we got trouble */
 495                 elog(FATAL, "semop(id=%d) failed: %m", sema->semId);
 496         }
 497
 498         return true;
 499 }