/*-------------------------------------------------------------------------
 *
 * Implement PGSemaphores using SysV semaphore facilities
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */
#include <errno.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#ifdef HAVE_KERNEL_OS_H
#include <kernel/OS.h>
#endif

#include "miscadmin.h"
#include "storage/ipc.h"
#include "storage/pg_sema.h"
/*
 * Fallback definition of the semctl(2) argument union for platforms whose
 * headers do not declare it (HAVE_UNION_SEMUN undefined).  The member set
 * follows the layout that semctl() callers are required to supply.
 */
#ifndef HAVE_UNION_SEMUN
union semun
{
    int             val;        /* value for SETVAL */
    struct semid_ds *buf;       /* buffer for IPC_STAT / IPC_SET */
    unsigned short *array;      /* array for GETALL / SETALL */
};
#endif
/* Local aliases for the SysV key and identifier types used throughout. */
typedef key_t IpcSemaphoreKey;  /* semaphore key passed to semget(2) */
typedef int IpcSemaphoreId;     /* semaphore ID returned by semget(2) */
/*
 * SEMAS_PER_SET is the number of useful semaphores in each semaphore set
 * we allocate.  It must be *less than* your kernel's SEMMSL (max semaphores
 * per set) parameter, which is often around 25.  (Less than, because we
 * allocate one extra sema in each set for identification purposes.)
 */
#define SEMAS_PER_SET   16

#define IPCProtection   (0600)  /* access/modify by user only */

#define PGSemaMagic     537     /* must be less than SEMVMX */
60 static IpcSemaphoreId
*mySemaSets
; /* IDs of sema sets acquired so far */
61 static int numSemaSets
; /* number of sema sets acquired so far */
62 static int maxSemaSets
; /* allocated size of mySemaSets array */
63 static IpcSemaphoreKey nextSemaKey
; /* next key to try using */
64 static int nextSemaNumber
; /* next free sem num in last sema set */
67 static IpcSemaphoreId
InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey
,
69 static void IpcSemaphoreInitialize(IpcSemaphoreId semId
, int semNum
,
71 static void IpcSemaphoreKill(IpcSemaphoreId semId
);
72 static int IpcSemaphoreGetValue(IpcSemaphoreId semId
, int semNum
);
73 static pid_t
IpcSemaphoreGetLastPID(IpcSemaphoreId semId
, int semNum
);
74 static IpcSemaphoreId
IpcSemaphoreCreate(int numSems
);
75 static void ReleaseSemaphores(int status
, Datum arg
);
79 * InternalIpcSemaphoreCreate
81 * Attempt to create a new semaphore set with the specified key.
82 * Will fail (return -1) if such a set already exists.
84 * If we fail with a failure code other than collision-with-existing-set,
85 * print out an error and abort. Other types of errors suggest nonrecoverable
89 InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey
, int numSems
)
93 semId
= semget(semKey
, numSems
, IPC_CREAT
| IPC_EXCL
| IPCProtection
);
98 * Fail quietly if error indicates a collision with existing set. One
99 * would expect EEXIST, given that we said IPC_EXCL, but perhaps we
100 * could get a permission violation instead? Also, EIDRM might occur
101 * if an old set is slated for destruction but not gone yet.
103 if (errno
== EEXIST
|| errno
== EACCES
111 * Else complain and abort
114 (errmsg("could not create semaphores: %m"),
115 errdetail("Failed system call was semget(%lu, %d, 0%o).",
116 (unsigned long) semKey
, numSems
,
117 IPC_CREAT
| IPC_EXCL
| IPCProtection
),
119 errhint("This error does *not* mean that you have run out of disk space.\n"
120 "It occurs when either the system limit for the maximum number of "
121 "semaphore sets (SEMMNI), or the system wide maximum number of "
122 "semaphores (SEMMNS), would be exceeded. You need to raise the "
123 "respective kernel parameter. Alternatively, reduce PostgreSQL's "
124 "consumption of semaphores by reducing its max_connections parameter "
126 "The PostgreSQL documentation contains more information about "
127 "configuring your system for PostgreSQL.",
135 * Initialize a semaphore to the specified value.
138 IpcSemaphoreInitialize(IpcSemaphoreId semId
, int semNum
, int value
)
143 if (semctl(semId
, semNum
, SETVAL
, semun
) < 0)
145 (errmsg_internal("semctl(%d, %d, SETVAL, %d) failed: %m",
146 semId
, semNum
, value
),
148 errhint("You possibly need to raise your kernel's SEMVMX value to be at least "
149 "%d. Look into the PostgreSQL documentation for details.",
154 * IpcSemaphoreKill(semId) - removes a semaphore set
157 IpcSemaphoreKill(IpcSemaphoreId semId
)
161 semun
.val
= 0; /* unused, but keep compiler quiet */
163 if (semctl(semId
, 0, IPC_RMID
, semun
) < 0)
164 elog(LOG
, "semctl(%d, 0, IPC_RMID, ...) failed: %m", semId
);
167 /* Get the current value (semval) of the semaphore */
169 IpcSemaphoreGetValue(IpcSemaphoreId semId
, int semNum
)
171 union semun dummy
; /* for Solaris */
173 dummy
.val
= 0; /* unused */
175 return semctl(semId
, semNum
, GETVAL
, dummy
);
178 /* Get the PID of the last process to do semop() on the semaphore */
180 IpcSemaphoreGetLastPID(IpcSemaphoreId semId
, int semNum
)
182 union semun dummy
; /* for Solaris */
184 dummy
.val
= 0; /* unused */
186 return semctl(semId
, semNum
, GETPID
, dummy
);
191 * Create a semaphore set with the given number of useful semaphores
192 * (an additional sema is actually allocated to serve as identifier).
193 * Dead Postgres sema sets are recycled if found, but we do not fail
194 * upon collision with non-Postgres sema sets.
196 * The idea here is to detect and re-use keys that may have been assigned
197 * by a crashed postmaster or backend.
199 static IpcSemaphoreId
200 IpcSemaphoreCreate(int numSems
)
202 IpcSemaphoreId semId
;
204 PGSemaphoreData mysema
;
206 /* Loop till we find a free IPC key */
207 for (nextSemaKey
++;; nextSemaKey
++)
211 /* Try to create new semaphore set */
212 semId
= InternalIpcSemaphoreCreate(nextSemaKey
, numSems
+ 1);
214 break; /* successful create */
216 /* See if it looks to be leftover from a dead Postgres process */
217 semId
= semget(nextSemaKey
, numSems
+ 1, 0);
219 continue; /* failed: must be some other app's */
220 if (IpcSemaphoreGetValue(semId
, numSems
) != PGSemaMagic
)
221 continue; /* sema belongs to a non-Postgres app */
224 * If the creator PID is my own PID or does not belong to any extant
225 * process, it's safe to zap it.
227 creatorPID
= IpcSemaphoreGetLastPID(semId
, numSems
);
229 continue; /* oops, GETPID failed */
230 if (creatorPID
!= getpid())
232 if (kill(creatorPID
, 0) == 0 || errno
!= ESRCH
)
233 continue; /* sema belongs to a live process */
237 * The sema set appears to be from a dead Postgres process, or from a
238 * previous cycle of life in this same process. Zap it, if possible.
239 * This probably shouldn't fail, but if it does, assume the sema set
240 * belongs to someone else after all, and continue quietly.
242 semun
.val
= 0; /* unused, but keep compiler quiet */
243 if (semctl(semId
, 0, IPC_RMID
, semun
) < 0)
247 * Now try again to create the sema set.
249 semId
= InternalIpcSemaphoreCreate(nextSemaKey
, numSems
+ 1);
251 break; /* successful create */
254 * Can only get here if some other process managed to create the same
255 * sema key before we did. Let him have that one, loop around to try
261 * OK, we created a new sema set. Mark it as created by this process. We
262 * do this by setting the spare semaphore to PGSemaMagic-1 and then
263 * incrementing it with semop(). That leaves it with value PGSemaMagic
264 * and sempid referencing this process.
266 IpcSemaphoreInitialize(semId
, numSems
, PGSemaMagic
- 1);
267 mysema
.semId
= semId
;
268 mysema
.semNum
= numSems
;
269 PGSemaphoreUnlock(&mysema
);
276 * PGReserveSemaphores --- initialize semaphore support
278 * This is called during postmaster start or shared memory reinitialization.
279 * It should do whatever is needed to be able to support up to maxSemas
280 * subsequent PGSemaphoreCreate calls. Also, if any system resources
281 * are acquired here or in PGSemaphoreCreate, register an on_shmem_exit
282 * callback to release them.
284 * The port number is passed for possible use as a key (for SysV, we use
285 * it to generate the starting semaphore key). In a standalone backend,
286 * zero will be passed.
288 * In the SysV implementation, we acquire semaphore sets on-demand; the
289 * maxSemas parameter is just used to size the array that keeps track of
290 * acquired sets for subsequent releasing.
293 PGReserveSemaphores(int maxSemas
, int port
)
295 maxSemaSets
= (maxSemas
+ SEMAS_PER_SET
- 1) / SEMAS_PER_SET
;
296 mySemaSets
= (IpcSemaphoreId
*)
297 malloc(maxSemaSets
* sizeof(IpcSemaphoreId
));
298 if (mySemaSets
== NULL
)
299 elog(PANIC
, "out of memory");
301 nextSemaKey
= port
* 1000;
302 nextSemaNumber
= SEMAS_PER_SET
; /* force sema set alloc on 1st call */
304 on_shmem_exit(ReleaseSemaphores
, 0);
308 * Release semaphores at shutdown or shmem reinitialization
310 * (called as an on_shmem_exit callback, hence funny argument list)
313 ReleaseSemaphores(int status
, Datum arg
)
317 for (i
= 0; i
< numSemaSets
; i
++)
318 IpcSemaphoreKill(mySemaSets
[i
]);
325 * Initialize a PGSemaphore structure to represent a sema with count 1
328 PGSemaphoreCreate(PGSemaphore sema
)
330 /* Can't do this in a backend, because static state is postmaster's */
331 Assert(!IsUnderPostmaster
);
333 if (nextSemaNumber
>= SEMAS_PER_SET
)
335 /* Time to allocate another semaphore set */
336 if (numSemaSets
>= maxSemaSets
)
337 elog(PANIC
, "too many semaphores created");
338 mySemaSets
[numSemaSets
] = IpcSemaphoreCreate(SEMAS_PER_SET
);
342 /* Assign the next free semaphore in the current set */
343 sema
->semId
= mySemaSets
[numSemaSets
- 1];
344 sema
->semNum
= nextSemaNumber
++;
345 /* Initialize it to count 1 */
346 IpcSemaphoreInitialize(sema
->semId
, sema
->semNum
, 1);
352 * Reset a previously-initialized PGSemaphore to have count 0
355 PGSemaphoreReset(PGSemaphore sema
)
357 IpcSemaphoreInitialize(sema
->semId
, sema
->semNum
, 0);
363 * Lock a semaphore (decrement count), blocking if count would be < 0
366 PGSemaphoreLock(PGSemaphore sema
, bool interruptOK
)
371 sops
.sem_op
= -1; /* decrement */
373 sops
.sem_num
= sema
->semNum
;
376 * Note: if errStatus is -1 and errno == EINTR then it means we returned
377 * from the operation prematurely because we were sent a signal. So we
378 * try and lock the semaphore again.
380 * Each time around the loop, we check for a cancel/die interrupt. On
381 * some platforms, if such an interrupt comes in while we are waiting, it
382 * will cause the semop() call to exit with errno == EINTR, allowing us to
383 * service the interrupt (if not in a critical section already) during the
384 * next loop iteration.
386 * Once we acquire the lock, we do NOT check for an interrupt before
387 * returning. The caller needs to be able to record ownership of the lock
388 * before any interrupt can be accepted.
390 * There is a window of a few instructions between CHECK_FOR_INTERRUPTS
391 * and entering the semop() call. If a cancel/die interrupt occurs in
392 * that window, we would fail to notice it until after we acquire the lock
393 * (or get another interrupt to escape the semop()). We can avoid this
394 * problem by temporarily setting ImmediateInterruptOK to true before we
395 * do CHECK_FOR_INTERRUPTS; then, a die() interrupt in this interval will
396 * execute directly. However, there is a huge pitfall: there is another
397 * window of a few instructions after the semop() before we are able to
398 * reset ImmediateInterruptOK. If an interrupt occurs then, we'll lose
399 * control, which means that the lock has been acquired but our caller did
400 * not get a chance to record the fact. Therefore, we only set
401 * ImmediateInterruptOK if the caller tells us it's OK to do so, ie, the
402 * caller does not need to record acquiring the lock. (This is currently
403 * true for lockmanager locks, since the process that granted us the lock
404 * did all the necessary state updates. It's not true for SysV semaphores
405 * used to implement LW locks or emulate spinlocks --- but the wait time
406 * for such locks should not be very long, anyway.)
408 * On some platforms, signals marked SA_RESTART (which is most, for us)
409 * will not interrupt the semop(); it will just keep waiting. Therefore
410 * it's necessary for cancel/die interrupts to be serviced directly by the
411 * signal handler. On these platforms the behavior is really the same
412 * whether the signal arrives just before the semop() begins, or while it
413 * is waiting. The loop on EINTR is thus important only for other types
418 ImmediateInterruptOK
= interruptOK
;
419 CHECK_FOR_INTERRUPTS();
420 errStatus
= semop(sema
->semId
, &sops
, 1);
421 ImmediateInterruptOK
= false;
422 } while (errStatus
< 0 && errno
== EINTR
);
425 elog(FATAL
, "semop(id=%d) failed: %m", sema
->semId
);
431 * Unlock a semaphore (increment count)
434 PGSemaphoreUnlock(PGSemaphore sema
)
439 sops
.sem_op
= 1; /* increment */
441 sops
.sem_num
= sema
->semNum
;
444 * Note: if errStatus is -1 and errno == EINTR then it means we returned
445 * from the operation prematurely because we were sent a signal. So we
446 * try and unlock the semaphore again. Not clear this can really happen,
447 * but might as well cope.
451 errStatus
= semop(sema
->semId
, &sops
, 1);
452 } while (errStatus
< 0 && errno
== EINTR
);
455 elog(FATAL
, "semop(id=%d) failed: %m", sema
->semId
);
461 * Lock a semaphore only if able to do so without blocking
464 PGSemaphoreTryLock(PGSemaphore sema
)
469 sops
.sem_op
= -1; /* decrement */
470 sops
.sem_flg
= IPC_NOWAIT
; /* but don't block */
471 sops
.sem_num
= sema
->semNum
;
474 * Note: if errStatus is -1 and errno == EINTR then it means we returned
475 * from the operation prematurely because we were sent a signal. So we
476 * try and lock the semaphore again.
480 errStatus
= semop(sema
->semId
, &sops
, 1);
481 } while (errStatus
< 0 && errno
== EINTR
);
485 /* Expect EAGAIN or EWOULDBLOCK (platform-dependent) */
488 return false; /* failed to lock it */
490 #if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
491 if (errno
== EWOULDBLOCK
)
492 return false; /* failed to lock it */
494 /* Otherwise we got trouble */
495 elog(FATAL
, "semop(id=%d) failed: %m", sema
->semId
);