etc/services - sync with NetBSD-8
[minix.git] / external / bsd / libevent / dist / epoll.c
blob571f3e7a064d4c51029ed7ab83e2492a96646fa5
/* $NetBSD: epoll.c,v 1.1.1.1 2013/04/11 16:43:19 christos Exp $ */
/*
 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
 * Copyright 2007-2012 Niels Provos, Nick Mathewson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
28 #include "event2/event-config.h"
29 #include <sys/cdefs.h>
30 __RCSID("$NetBSD: epoll.c,v 1.1.1.1 2013/04/11 16:43:19 christos Exp $");
32 #include <stdint.h>
33 #include <sys/types.h>
34 #include <sys/resource.h>
35 #ifdef _EVENT_HAVE_SYS_TIME_H
36 #include <sys/time.h>
37 #endif
38 #include <sys/queue.h>
39 #include <sys/epoll.h>
40 #include <signal.h>
41 #include <limits.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <unistd.h>
46 #include <errno.h>
47 #ifdef _EVENT_HAVE_FCNTL_H
48 #include <fcntl.h>
49 #endif
51 #include "event-internal.h"
52 #include "evsignal-internal.h"
53 #include "event2/thread.h"
54 #include "evthread-internal.h"
55 #include "log-internal.h"
56 #include "evmap-internal.h"
57 #include "changelist-internal.h"
/* Per-event_base state of the epoll backend.  Allocated by epoll_init()
 * and released by epoll_dealloc(). */
struct epollop {
	/* Result buffer handed to epoll_wait(); grown by epoll_dispatch(). */
	struct epoll_event *events;
	/* Number of entries `events` can hold. */
	int nevents;
	/* Descriptor obtained from epoll_create(). */
	int epfd;
};
65 static void *epoll_init(struct event_base *);
66 static int epoll_dispatch(struct event_base *, struct timeval *);
67 static void epoll_dealloc(struct event_base *);
69 static const struct eventop epollops_changelist = {
70 "epoll (with changelist)",
71 epoll_init,
72 event_changelist_add,
73 event_changelist_del,
74 epoll_dispatch,
75 epoll_dealloc,
76 1, /* need reinit */
77 EV_FEATURE_ET|EV_FEATURE_O1,
78 EVENT_CHANGELIST_FDINFO_SIZE
82 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
83 short old, short events, void *p);
84 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
85 short old, short events, void *p);
87 const struct eventop epollops = {
88 "epoll",
89 epoll_init,
90 epoll_nochangelist_add,
91 epoll_nochangelist_del,
92 epoll_dispatch,
93 epoll_dealloc,
94 1, /* need reinit */
95 EV_FEATURE_ET|EV_FEATURE_O1,
/* Initial size of the epoll_wait() result buffer, and the size past which
 * epoll_dispatch() stops doubling it. */
#define INITIAL_NEVENT 32
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482 (about 35 minutes
 * and 47 seconds).  Let's round that down by 47 seconds, to 35 minutes.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
110 static void *
111 epoll_init(struct event_base *base)
113 int epfd;
114 struct epollop *epollop;
116 /* Initialize the kernel queue. (The size field is ignored since
117 * 2.6.8.) */
118 if ((epfd = epoll_create(32000)) == -1) {
119 if (errno != ENOSYS)
120 event_warn("epoll_create");
121 return (NULL);
124 evutil_make_socket_closeonexec(epfd);
126 if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
127 close(epfd);
128 return (NULL);
131 epollop->epfd = epfd;
133 /* Initialize fields */
134 epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
135 if (epollop->events == NULL) {
136 mm_free(epollop);
137 close(epfd);
138 return (NULL);
140 epollop->nevents = INITIAL_NEVENT;
142 if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
143 ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
144 evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
145 base->evsel = &epollops_changelist;
147 evsig_init(base);
149 return (epollop);
152 static const char *
153 change_to_string(int change)
155 change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
156 if (change == EV_CHANGE_ADD) {
157 return "add";
158 } else if (change == EV_CHANGE_DEL) {
159 return "del";
160 } else if (change == 0) {
161 return "none";
162 } else {
163 return "???";
/*
 * Name an epoll_ctl() operation for log messages.
 * Returns "ADD", "DEL" or "MOD" for the matching EPOLL_CTL_* value and
 * "???" for anything else.
 */
static const char *
epoll_op_to_string(int op)
{
	return op == EPOLL_CTL_ADD?"ADD":
	    op == EPOLL_CTL_DEL?"DEL":
	    op == EPOLL_CTL_MOD?"MOD":
	    "???";
}
176 static int
177 epoll_apply_one_change(struct event_base *base,
178 struct epollop *epollop,
179 const struct event_change *ch)
181 struct epoll_event epev;
182 int op, events = 0;
184 if (1) {
185 /* The logic here is a little tricky. If we had no events set
186 on the fd before, we need to set op="ADD" and set
187 events=the events we want to add. If we had any events set
188 on the fd before, and we want any events to remain on the
189 fd, we need to say op="MOD" and set events=the events we
190 want to remain. But if we want to delete the last event,
191 we say op="DEL" and set events=the remaining events. What
192 fun!
195 /* TODO: Turn this into a switch or a table lookup. */
197 if ((ch->read_change & EV_CHANGE_ADD) ||
198 (ch->write_change & EV_CHANGE_ADD)) {
199 /* If we are adding anything at all, we'll want to do
200 * either an ADD or a MOD. */
201 events = 0;
202 op = EPOLL_CTL_ADD;
203 if (ch->read_change & EV_CHANGE_ADD) {
204 events |= EPOLLIN;
205 } else if (ch->read_change & EV_CHANGE_DEL) {
207 } else if (ch->old_events & EV_READ) {
208 events |= EPOLLIN;
210 if (ch->write_change & EV_CHANGE_ADD) {
211 events |= EPOLLOUT;
212 } else if (ch->write_change & EV_CHANGE_DEL) {
214 } else if (ch->old_events & EV_WRITE) {
215 events |= EPOLLOUT;
217 if ((ch->read_change|ch->write_change) & EV_ET)
218 events |= EPOLLET;
220 if (ch->old_events) {
221 /* If MOD fails, we retry as an ADD, and if
222 * ADD fails we will retry as a MOD. So the
223 * only hard part here is to guess which one
224 * will work. As a heuristic, we'll try
225 * MOD first if we think there were old
226 * events and ADD if we think there were none.
228 * We can be wrong about the MOD if the file
229 * has in fact been closed and re-opened.
231 * We can be wrong about the ADD if the
232 * the fd has been re-created with a dup()
233 * of the same file that it was before.
235 op = EPOLL_CTL_MOD;
237 } else if ((ch->read_change & EV_CHANGE_DEL) ||
238 (ch->write_change & EV_CHANGE_DEL)) {
239 /* If we're deleting anything, we'll want to do a MOD
240 * or a DEL. */
241 op = EPOLL_CTL_DEL;
243 if (ch->read_change & EV_CHANGE_DEL) {
244 if (ch->write_change & EV_CHANGE_DEL) {
245 events = EPOLLIN|EPOLLOUT;
246 } else if (ch->old_events & EV_WRITE) {
247 events = EPOLLOUT;
248 op = EPOLL_CTL_MOD;
249 } else {
250 events = EPOLLIN;
252 } else if (ch->write_change & EV_CHANGE_DEL) {
253 if (ch->old_events & EV_READ) {
254 events = EPOLLIN;
255 op = EPOLL_CTL_MOD;
256 } else {
257 events = EPOLLOUT;
262 if (!events)
263 return 0;
265 memset(&epev, 0, sizeof(epev));
266 epev.data.fd = ch->fd;
267 epev.events = events;
268 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
269 if (op == EPOLL_CTL_MOD && errno == ENOENT) {
270 /* If a MOD operation fails with ENOENT, the
271 * fd was probably closed and re-opened. We
272 * should retry the operation as an ADD.
274 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
275 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
276 (int)epev.events, ch->fd);
277 return -1;
278 } else {
279 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
280 (int)epev.events,
281 ch->fd));
283 } else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
284 /* If an ADD operation fails with EEXIST,
285 * either the operation was redundant (as with a
286 * precautionary add), or we ran into a fun
287 * kernel bug where using dup*() to duplicate the
288 * same file into the same fd gives you the same epitem
289 * rather than a fresh one. For the second case,
290 * we must retry with MOD. */
291 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
292 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
293 (int)epev.events, ch->fd);
294 return -1;
295 } else {
296 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
297 (int)epev.events,
298 ch->fd));
300 } else if (op == EPOLL_CTL_DEL &&
301 (errno == ENOENT || errno == EBADF ||
302 errno == EPERM)) {
303 /* If a delete fails with one of these errors,
304 * that's fine too: we closed the fd before we
305 * got around to calling epoll_dispatch. */
306 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
307 (int)epev.events,
308 ch->fd,
309 strerror(errno)));
310 } else {
311 event_warn("Epoll %s(%d) on fd %d failed. Old events were %d; read change was %d (%s); write change was %d (%s)",
312 epoll_op_to_string(op),
313 (int)epev.events,
314 ch->fd,
315 ch->old_events,
316 ch->read_change,
317 change_to_string(ch->read_change),
318 ch->write_change,
319 change_to_string(ch->write_change));
320 return -1;
322 } else {
323 event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
324 epoll_op_to_string(op),
325 (int)epev.events,
326 (int)ch->fd,
327 ch->old_events,
328 ch->read_change,
329 ch->write_change));
332 return 0;
335 static int
336 epoll_apply_changes(struct event_base *base)
338 struct event_changelist *changelist = &base->changelist;
339 struct epollop *epollop = base->evbase;
340 struct event_change *ch;
342 int r = 0;
343 int i;
345 for (i = 0; i < changelist->n_changes; ++i) {
346 ch = &changelist->changes[i];
347 if (epoll_apply_one_change(base, epollop, ch) < 0)
348 r = -1;
351 return (r);
354 static int
355 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
356 short old, short events, void *p)
358 struct event_change ch;
359 ch.fd = fd;
360 ch.old_events = old;
361 ch.read_change = ch.write_change = 0;
362 if (events & EV_WRITE)
363 ch.write_change = EV_CHANGE_ADD |
364 (events & EV_ET);
365 if (events & EV_READ)
366 ch.read_change = EV_CHANGE_ADD |
367 (events & EV_ET);
369 return epoll_apply_one_change(base, base->evbase, &ch);
372 static int
373 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
374 short old, short events, void *p)
376 struct event_change ch;
377 ch.fd = fd;
378 ch.old_events = old;
379 ch.read_change = ch.write_change = 0;
380 if (events & EV_WRITE)
381 ch.write_change = EV_CHANGE_DEL;
382 if (events & EV_READ)
383 ch.read_change = EV_CHANGE_DEL;
385 return epoll_apply_one_change(base, base->evbase, &ch);
388 static int
389 epoll_dispatch(struct event_base *base, struct timeval *tv)
391 struct epollop *epollop = base->evbase;
392 struct epoll_event *events = epollop->events;
393 int i, res;
394 long timeout = -1;
396 if (tv != NULL) {
397 timeout = evutil_tv_to_msec(tv);
398 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
399 /* Linux kernels can wait forever if the timeout is
400 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
401 timeout = MAX_EPOLL_TIMEOUT_MSEC;
405 epoll_apply_changes(base);
406 event_changelist_remove_all(&base->changelist, base);
408 EVBASE_RELEASE_LOCK(base, th_base_lock);
410 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
412 EVBASE_ACQUIRE_LOCK(base, th_base_lock);
414 if (res == -1) {
415 if (errno != EINTR) {
416 event_warn("epoll_wait");
417 return (-1);
420 return (0);
423 event_debug(("%s: epoll_wait reports %d", __func__, res));
424 EVUTIL_ASSERT(res <= epollop->nevents);
426 for (i = 0; i < res; i++) {
427 int what = events[i].events;
428 short ev = 0;
430 if (what & (EPOLLHUP|EPOLLERR)) {
431 ev = EV_READ | EV_WRITE;
432 } else {
433 if (what & EPOLLIN)
434 ev |= EV_READ;
435 if (what & EPOLLOUT)
436 ev |= EV_WRITE;
439 if (!ev)
440 continue;
442 evmap_io_active(base, events[i].data.fd, ev | EV_ET);
445 if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
446 /* We used all of the event space this time. We should
447 be ready for more events next time. */
448 int new_nevents = epollop->nevents * 2;
449 struct epoll_event *new_events;
451 new_events = mm_realloc(epollop->events,
452 new_nevents * sizeof(struct epoll_event));
453 if (new_events) {
454 epollop->events = new_events;
455 epollop->nevents = new_nevents;
459 return (0);
463 static void
464 epoll_dealloc(struct event_base *base)
466 struct epollop *epollop = base->evbase;
468 evsig_dealloc(base);
469 if (epollop->events)
470 mm_free(epollop->events);
471 if (epollop->epfd >= 0)
472 close(epollop->epfd);
474 memset(epollop, 0, sizeof(struct epollop));
475 mm_free(epollop);