/*	$NetBSD: epoll.c,v 1.1.1.1 2013/04/11 16:43:19 christos Exp $	*/
/*
 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
 * Copyright 2007-2012 Niels Provos, Nick Mathewson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "event2/event-config.h"
#include <sys/cdefs.h>
__RCSID("$NetBSD: epoll.c,v 1.1.1.1 2013/04/11 16:43:19 christos Exp $");

#include <stdint.h>
#include <sys/types.h>
#include <sys/resource.h>
#ifdef _EVENT_HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <sys/queue.h>
#include <sys/epoll.h>
#include <signal.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#ifdef _EVENT_HAVE_FCNTL_H
#include <fcntl.h>
#endif

#include "event-internal.h"
#include "evsignal-internal.h"
#include "event2/thread.h"
#include "evthread-internal.h"
#include "log-internal.h"
#include "evmap-internal.h"
#include "changelist-internal.h"
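
/* Per-event_base state for the epoll backend: the epoll fd itself and the
 * buffer that epoll_wait() fills in on each dispatch. */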
struct epollop {
	struct epoll_event *events;
	int nevents;
	int epfd;
};

static void *epoll_init(struct event_base *);
static int epoll_dispatch(struct event_base *, struct timeval *);
static void epoll_dealloc(struct event_base *);

static const struct eventop epollops_changelist = {
	"epoll (with changelist)",
	epoll_init,
	event_changelist_add,
	event_changelist_del,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1,
	EVENT_CHANGELIST_FDINFO_SIZE
};

static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);
static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);

const struct eventop epollops = {
	"epoll",
	epoll_init,
	epoll_nochangelist_add,
	epoll_nochangelist_del,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1,
	0
};

#define INITIAL_NEVENT 32
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
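
/* Checking the arithmetic behind the constant above: ((1<<31)-1 - 999)/1000
 * is 2147482 msec, roughly 2147 seconds; dropping 47 seconds leaves 2100
 * seconds, which is exactly the 35 minutes encoded as 35*60*1000 msec. */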

static void *
epoll_init(struct event_base *base)
{
	int epfd;
	struct epollop *epollop;

	/* Initialize the kernel queue.  (The size field is ignored since
	 * 2.6.8.) */
	if ((epfd = epoll_create(32000)) == -1) {
		if (errno != ENOSYS)
			event_warn("epoll_create");
		return (NULL);
	}

	evutil_make_socket_closeonexec(epfd);

	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
		close(epfd);
		return (NULL);
	}

	epollop->epfd = epfd;

	/* Initialize fields */
	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
	if (epollop->events == NULL) {
		mm_free(epollop);
		close(epfd);
		return (NULL);
	}
	epollop->nevents = INITIAL_NEVENT;

	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
		evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
		base->evsel = &epollops_changelist;

	evsig_init(base);

	return (epollop);
}
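
/* Return a printable name for an EV_CHANGE_ADD/EV_CHANGE_DEL mask; used only
 * when logging epoll_ctl() results below. */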
static const char *
change_to_string(int change)
{
	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
	if (change == EV_CHANGE_ADD) {
		return "add";
	} else if (change == EV_CHANGE_DEL) {
		return "del";
	} else if (change == 0) {
		return "none";
	} else {
		return "???";
	}
}
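
/* Return the name of an EPOLL_CTL_* opcode, again only for log messages. */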
static const char *
epoll_op_to_string(int op)
{
	return op == EPOLL_CTL_ADD ? "ADD" :
	    op == EPOLL_CTL_DEL ? "DEL" :
	    op == EPOLL_CTL_MOD ? "MOD" :
	    "???";
}
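
/* Translate one queued change on a single fd into one epoll_ctl() call:
 * ADD if the fd had no events before, MOD if some events remain on the fd
 * afterwards, DEL if the last event is being removed. */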
static int
epoll_apply_one_change(struct event_base *base,
    struct epollop *epollop,
    const struct event_change *ch)
{
	struct epoll_event epev;
	int op, events = 0;

	/* The logic here is a little tricky.  If we had no events set
	   on the fd before, we need to set op="ADD" and set
	   events=the events we want to add.  If we had any events set
	   on the fd before, and we want any events to remain on the
	   fd, we need to say op="MOD" and set events=the events we
	   want to remain.  But if we want to delete the last event,
	   we say op="DEL" and set events=the remaining events.  What
	   fun!
	*/

	/* TODO: Turn this into a switch or a table lookup. */

	if ((ch->read_change & EV_CHANGE_ADD) ||
	    (ch->write_change & EV_CHANGE_ADD)) {
		/* If we are adding anything at all, we'll want to do
		 * either an ADD or a MOD. */
		events = 0;
		op = EPOLL_CTL_ADD;
		if (ch->read_change & EV_CHANGE_ADD) {
			events |= EPOLLIN;
		} else if (ch->read_change & EV_CHANGE_DEL) {
			;
		} else if (ch->old_events & EV_READ) {
			events |= EPOLLIN;
		}
		if (ch->write_change & EV_CHANGE_ADD) {
			events |= EPOLLOUT;
		} else if (ch->write_change & EV_CHANGE_DEL) {
			;
		} else if (ch->old_events & EV_WRITE) {
			events |= EPOLLOUT;
		}
		if ((ch->read_change|ch->write_change) & EV_ET)
			events |= EPOLLET;

		if (ch->old_events) {
			/* If MOD fails, we retry as an ADD, and if
			 * ADD fails we will retry as a MOD.  So the
			 * only hard part here is to guess which one
			 * will work.  As a heuristic, we'll try
			 * MOD first if we think there were old
			 * events and ADD if we think there were none.
			 *
			 * We can be wrong about the MOD if the file
			 * has in fact been closed and re-opened.
			 *
			 * We can be wrong about the ADD if the
			 * the fd has been re-created with a dup()
			 * of the same file that it was before.
			 */
			op = EPOLL_CTL_MOD;
		}
	} else if ((ch->read_change & EV_CHANGE_DEL) ||
	    (ch->write_change & EV_CHANGE_DEL)) {
		/* If we're deleting anything, we'll want to do a MOD
		 * or a DEL. */
		op = EPOLL_CTL_DEL;

		if (ch->read_change & EV_CHANGE_DEL) {
			if (ch->write_change & EV_CHANGE_DEL) {
				events = EPOLLIN|EPOLLOUT;
			} else if (ch->old_events & EV_WRITE) {
				events = EPOLLOUT;
				op = EPOLL_CTL_MOD;
			} else {
				events = EPOLLIN;
			}
		} else if (ch->write_change & EV_CHANGE_DEL) {
			if (ch->old_events & EV_READ) {
				events = EPOLLIN;
				op = EPOLL_CTL_MOD;
			} else {
				events = EPOLLOUT;
			}
		}
	}

	if (!events)
		return 0;

	memset(&epev, 0, sizeof(epev));
	epev.data.fd = ch->fd;
	epev.events = events;
	if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
		if (op == EPOLL_CTL_MOD && errno == ENOENT) {
			/* If a MOD operation fails with ENOENT, the
			 * fd was probably closed and re-opened.  We
			 * should retry the operation as an ADD.
			 */
			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
				event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
				    (int)epev.events, ch->fd);
				return -1;
			} else {
				event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
					(int)epev.events,
					ch->fd));
			}
		} else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
			/* If an ADD operation fails with EEXIST,
			 * either the operation was redundant (as with a
			 * precautionary add), or we ran into a fun
			 * kernel bug where using dup*() to duplicate the
			 * same file into the same fd gives you the same epitem
			 * rather than a fresh one.  For the second case,
			 * we must retry with MOD. */
			if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
				event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
				    (int)epev.events, ch->fd);
				return -1;
			} else {
				event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
					(int)epev.events,
					ch->fd));
			}
		} else if (op == EPOLL_CTL_DEL &&
		    (errno == ENOENT || errno == EBADF ||
			errno == EPERM)) {
			/* If a delete fails with one of these errors,
			 * that's fine too: we closed the fd before we
			 * got around to calling epoll_dispatch. */
			event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
				(int)epev.events,
				ch->fd,
				strerror(errno)));
		} else {
			event_warn("Epoll %s(%d) on fd %d failed.  Old events were %d; read change was %d (%s); write change was %d (%s)",
			    epoll_op_to_string(op),
			    (int)epev.events,
			    ch->fd,
			    ch->old_events,
			    ch->read_change,
			    change_to_string(ch->read_change),
			    ch->write_change,
			    change_to_string(ch->write_change));
			return -1;
		}
	} else {
		event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
			epoll_op_to_string(op),
			(int)epev.events,
			(int)ch->fd,
			ch->old_events,
			ch->read_change,
			ch->write_change));
	}

	return 0;
}
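
/* Flush every change queued in base->changelist to the kernel, one
 * epoll_ctl() call per changed fd. */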
static int
epoll_apply_changes(struct event_base *base)
{
	struct event_changelist *changelist = &base->changelist;
	struct epollop *epollop = base->evbase;
	struct event_change *ch;
	int r = 0;
	int i;

	for (i = 0; i < changelist->n_changes; ++i) {
		ch = &changelist->changes[i];
		if (epoll_apply_one_change(base, epollop, ch) < 0)
			r = -1;
	}

	return (r);
}
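
/* Add path used when the changelist is disabled: build a synthetic
 * event_change on the stack and apply it immediately. */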
static int
epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p)
{
	struct event_change ch;
	ch.fd = fd;
	ch.old_events = old;
	ch.read_change = ch.write_change = 0;
	if (events & EV_WRITE)
		ch.write_change = EV_CHANGE_ADD |
		    (events & EV_ET);
	if (events & EV_READ)
		ch.read_change = EV_CHANGE_ADD |
		    (events & EV_ET);

	return epoll_apply_one_change(base, base->evbase, &ch);
}
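
/* Delete path used when the changelist is disabled; mirrors
 * epoll_nochangelist_add() above. */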
static int
epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p)
{
	struct event_change ch;
	ch.fd = fd;
	ch.old_events = old;
	ch.read_change = ch.write_change = 0;
	if (events & EV_WRITE)
		ch.write_change = EV_CHANGE_DEL;
	if (events & EV_READ)
		ch.read_change = EV_CHANGE_DEL;

	return epoll_apply_one_change(base, base->evbase, &ch);
}
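
/* Wait for events: flush pending changes, drop the base lock around
 * epoll_wait(), then mark every reported fd active and grow the event
 * buffer if it was filled completely. */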
static int
epoll_dispatch(struct event_base *base, struct timeval *tv)
{
	struct epollop *epollop = base->evbase;
	struct epoll_event *events = epollop->events;
	int i, res;
	long timeout = -1;

	if (tv != NULL) {
		timeout = evutil_tv_to_msec(tv);
		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
			/* Linux kernels can wait forever if the timeout is
			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
			timeout = MAX_EPOLL_TIMEOUT_MSEC;
		}
	}

	epoll_apply_changes(base);
	event_changelist_remove_all(&base->changelist, base);

	EVBASE_RELEASE_LOCK(base, th_base_lock);

	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);

	EVBASE_ACQUIRE_LOCK(base, th_base_lock);

	if (res == -1) {
		if (errno != EINTR) {
			event_warn("epoll_wait");
			return (-1);
		}

		return (0);
	}

	event_debug(("%s: epoll_wait reports %d", __func__, res));
	EVUTIL_ASSERT(res <= epollop->nevents);

	for (i = 0; i < res; i++) {
		int what = events[i].events;
		short ev = 0;

		if (what & (EPOLLHUP|EPOLLERR)) {
			ev = EV_READ | EV_WRITE;
		} else {
			if (what & EPOLLIN)
				ev |= EV_READ;
			if (what & EPOLLOUT)
				ev |= EV_WRITE;
		}

		if (!ev)
			continue;

		evmap_io_active(base, events[i].data.fd, ev | EV_ET);
	}

	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
		/* We used all of the event space this time.  We should
		   be ready for more events next time. */
		int new_nevents = epollop->nevents * 2;
		struct epoll_event *new_events;

		new_events = mm_realloc(epollop->events,
		    new_nevents * sizeof(struct epoll_event));
		if (new_events) {
			epollop->events = new_events;
			epollop->nevents = new_nevents;
		}
	}

	return (0);
}
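
/* Tear down the backend: free the event buffer, close the epoll fd, and
 * release the epollop itself. */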
static void
epoll_dealloc(struct event_base *base)
{
	struct epollop *epollop = base->evbase;

	evsig_dealloc(base);
	if (epollop->events)
		mm_free(epollop->events);
	if (epollop->epfd >= 0)
		close(epollop->epfd);

	memset(epollop, 0, sizeof(struct epollop));
	mm_free(epollop);
}