/*
 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
 * Copyright 2007-2012 Niels Provos, Nick Mathewson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "event2/event-config.h"

#include <stdint.h>
#include <sys/types.h>
#include <sys/resource.h>
#ifdef _EVENT_HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <sys/queue.h>
#include <sys/epoll.h>
#include <signal.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#ifdef _EVENT_HAVE_FCNTL_H
#include <fcntl.h>
#endif

#include "event-internal.h"
#include "evsignal-internal.h"
#include "event2/thread.h"
#include "evthread-internal.h"
#include "log-internal.h"
#include "evmap-internal.h"
#include "changelist-internal.h"
struct epollop {
	struct epoll_event *events;
	int nevents;
	int epfd;
};
static void *epoll_init(struct event_base *);
static int epoll_dispatch(struct event_base *, struct timeval *);
static void epoll_dealloc(struct event_base *);
static const struct eventop epollops_changelist = {
	"epoll (with changelist)",
	epoll_init,
	event_changelist_add,
	event_changelist_del,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1,
	EVENT_CHANGELIST_FDINFO_SIZE
};
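/*
 * How the two eventop tables differ: with epollops_changelist, adds and
 * deletes are queued in base->changelist and flushed in a single batch
 * from epoll_dispatch(); with the plain epollops table below, each
 * epoll_nochangelist_add/del call builds one struct event_change and
 * applies it to the kernel immediately.
 */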
static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);
static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);
const struct eventop epollops = {
	"epoll",
	epoll_init,
	epoll_nochangelist_add,
	epoll_nochangelist_del,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1,
	0
};
#define INITIAL_NEVENT 32
#define MAX_NEVENT 4096
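/* The event array starts at INITIAL_NEVENT entries and doubles every
 * time epoll_wait() fills it completely, capped at MAX_NEVENT
 * (32 -> 64 -> ... -> 4096); see the resize logic at the end of
 * epoll_dispatch(). */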
/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
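/* Checking the arithmetic above: LONG_MAX can be (1<<31)-1 = 2147483647,
 * so with HZ=1000, (LONG_MAX - 999ULL)/HZ is 2147482 msec.  35 minutes
 * is 2100000 msec, which stays 47482 msec (roughly 47 seconds) below
 * that limit. */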
static void *
epoll_init(struct event_base *base)
{
	int epfd;
	struct epollop *epollop;

	/* Initialize the kernel queue.  (The size field is ignored since
	 * 2.6.8.) */
	if ((epfd = epoll_create(32000)) == -1) {
		if (errno != ENOSYS)
			event_warn("epoll_create");
		return (NULL);
	}

	evutil_make_socket_closeonexec(epfd);

	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
		close(epfd);
		return (NULL);
	}

	epollop->epfd = epfd;

	/* Initialize fields */
	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
	if (epollop->events == NULL) {
		mm_free(epollop);
		close(epfd);
		return (NULL);
	}
	epollop->nevents = INITIAL_NEVENT;

	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
	    evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
		base->evsel = &epollops_changelist;

	evsig_init(base);

	return (epollop);
}
static const char *
change_to_string(int change)
{
	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
	if (change == EV_CHANGE_ADD) {
		return "add";
	} else if (change == EV_CHANGE_DEL) {
		return "del";
	} else if (change == 0) {
		return "none";
	} else {
		return "???";
	}
}
static const char *
epoll_op_to_string(int op)
{
	return op == EPOLL_CTL_ADD?"ADD":
	    op == EPOLL_CTL_DEL?"DEL":
	    op == EPOLL_CTL_MOD?"MOD":
	    "???";
}
static int
epoll_apply_one_change(struct event_base *base,
    struct epollop *epollop,
    const struct event_change *ch)
{
	struct epoll_event epev;
	int op, events = 0;

	/* The logic here is a little tricky.  If we had no events set
	   on the fd before, we need to set op="ADD" and set
	   events=the events we want to add.  If we had any events set
	   on the fd before, and we want any events to remain on the
	   fd, we need to say op="MOD" and set events=the events we
	   want to remain.  But if we want to delete the last event,
	   we say op="DEL" and set events=the remaining events.  What
	   fun!
	*/

	/* TODO: Turn this into a switch or a table lookup. */
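	/* As a quick summary of the cases handled below:
	 *
	 *   any ADD change, no old events    -> op = EPOLL_CTL_ADD
	 *   any ADD change, had old events   -> op = EPOLL_CTL_MOD
	 *   only DEL changes, events remain  -> op = EPOLL_CTL_MOD
	 *   only DEL changes, none remain    -> op = EPOLL_CTL_DEL
	 */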
	if ((ch->read_change & EV_CHANGE_ADD) ||
	    (ch->write_change & EV_CHANGE_ADD)) {
		/* If we are adding anything at all, we'll want to do
		 * either an ADD or a MOD. */
		events = 0;
		op = EPOLL_CTL_ADD;
		if (ch->read_change & EV_CHANGE_ADD) {
			events |= EPOLLIN;
		} else if (ch->read_change & EV_CHANGE_DEL) {
			;
		} else if (ch->old_events & EV_READ) {
			events |= EPOLLIN;
		}
		if (ch->write_change & EV_CHANGE_ADD) {
			events |= EPOLLOUT;
		} else if (ch->write_change & EV_CHANGE_DEL) {
			;
		} else if (ch->old_events & EV_WRITE) {
			events |= EPOLLOUT;
		}
		if ((ch->read_change|ch->write_change) & EV_ET)
			events |= EPOLLET;

		if (ch->old_events) {
			/* If MOD fails, we retry as an ADD, and if
			 * ADD fails we will retry as a MOD.  So the
			 * only hard part here is to guess which one
			 * will work.  As a heuristic, we'll try
			 * MOD first if we think there were old
			 * events and ADD if we think there were none.
			 *
			 * We can be wrong about the MOD if the file
			 * has in fact been closed and re-opened.
			 *
			 * We can be wrong about the ADD if the
			 * fd has been re-created with a dup()
			 * of the same file that it was before.
			 */
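			/* For example, if dup2() re-created the fd as a
			 * reference to the same open file it had before,
			 * the kernel may still hold the old epitem; the ADD
			 * then fails with EEXIST and the retry-as-MOD path
			 * in the error handling below recovers. */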
			op = EPOLL_CTL_MOD;
		}
	} else if ((ch->read_change & EV_CHANGE_DEL) ||
	    (ch->write_change & EV_CHANGE_DEL)) {
		/* If we're deleting anything, we'll want to do a MOD
		 * or a DEL. */
		op = EPOLL_CTL_DEL;

		if (ch->read_change & EV_CHANGE_DEL) {
			if (ch->write_change & EV_CHANGE_DEL) {
				events = EPOLLIN|EPOLLOUT;
			} else if (ch->old_events & EV_WRITE) {
				events = EPOLLOUT;
				op = EPOLL_CTL_MOD;
			} else {
				events = EPOLLIN;
			}
		} else if (ch->write_change & EV_CHANGE_DEL) {
			if (ch->old_events & EV_READ) {
				events = EPOLLIN;
				op = EPOLL_CTL_MOD;
			} else {
				events = EPOLLOUT;
			}
		}
	}

	if (!events)
		return 0;

	memset(&epev, 0, sizeof(epev));
	epev.data.fd = ch->fd;
	epev.events = events;
	if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
		if (op == EPOLL_CTL_MOD && errno == ENOENT) {
			/* If a MOD operation fails with ENOENT, the
			 * fd was probably closed and re-opened.  We
			 * should retry the operation as an ADD.
			 */
			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
				event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
				    (int)epev.events, ch->fd);
				return -1;
			} else {
				event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
					(int)epev.events,
					ch->fd));
			}
		} else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
			/* If an ADD operation fails with EEXIST,
			 * either the operation was redundant (as with a
			 * precautionary add), or we ran into a fun
			 * kernel bug where using dup*() to duplicate the
			 * same file into the same fd gives you the same epitem
			 * rather than a fresh one.  For the second case,
			 * we must retry with MOD. */
			if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
				event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
				    (int)epev.events, ch->fd);
				return -1;
			} else {
				event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
					(int)epev.events,
					ch->fd));
			}
		} else if (op == EPOLL_CTL_DEL &&
		    (errno == ENOENT || errno == EBADF ||
		    errno == EPERM)) {
			/* If a delete fails with one of these errors,
			 * that's fine too: we closed the fd before we
			 * got around to calling epoll_dispatch. */
			event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
				(int)epev.events,
				ch->fd,
				strerror(errno)));
		} else {
			event_warn("Epoll %s(%d) on fd %d failed.  Old events were %d; read change was %d (%s); write change was %d (%s)",
			    epoll_op_to_string(op),
			    (int)epev.events,
			    ch->fd,
			    ch->old_events,
			    ch->read_change,
			    change_to_string(ch->read_change),
			    ch->write_change,
			    change_to_string(ch->write_change));
			return -1;
		}
	} else {
		event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
			epoll_op_to_string(op),
			(int)epev.events,
			(int)ch->fd,
			ch->old_events,
			ch->read_change,
			ch->write_change));
	}

	return 0;
}
static int
epoll_apply_changes(struct event_base *base)
{
	struct event_changelist *changelist = &base->changelist;
	struct epollop *epollop = base->evbase;
	struct event_change *ch;
	int r = 0;
	int i;

	for (i = 0; i < changelist->n_changes; ++i) {
		ch = &changelist->changes[i];
		if (epoll_apply_one_change(base, epollop, ch) < 0)
			r = -1;
	}

	return (r);
}
static int
epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p)
{
	struct event_change ch;
	ch.fd = fd;
	ch.old_events = old;
	ch.read_change = ch.write_change = 0;
	if (events & EV_WRITE)
		ch.write_change = EV_CHANGE_ADD |
		    (events & EV_ET);
	if (events & EV_READ)
		ch.read_change = EV_CHANGE_ADD |
		    (events & EV_ET);

	return epoll_apply_one_change(base, base->evbase, &ch);
}
static int
epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p)
{
	struct event_change ch;
	ch.fd = fd;
	ch.old_events = old;
	ch.read_change = ch.write_change = 0;
	if (events & EV_WRITE)
		ch.write_change = EV_CHANGE_DEL;
	if (events & EV_READ)
		ch.read_change = EV_CHANGE_DEL;

	return epoll_apply_one_change(base, base->evbase, &ch);
}
static int
epoll_dispatch(struct event_base *base, struct timeval *tv)
{
	struct epollop *epollop = base->evbase;
	struct epoll_event *events = epollop->events;
	int i, res;
	long timeout = -1;

	if (tv != NULL) {
		timeout = evutil_tv_to_msec(tv);
		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
			/* Linux kernels can wait forever if the timeout is
			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
			timeout = MAX_EPOLL_TIMEOUT_MSEC;
		}
	}

	epoll_apply_changes(base);
	event_changelist_remove_all(&base->changelist, base);

	/* Release the lock while we block in the kernel, so that other
	 * threads can operate on the base in the meantime. */
	EVBASE_RELEASE_LOCK(base, th_base_lock);

	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);

	EVBASE_ACQUIRE_LOCK(base, th_base_lock);
	if (res == -1) {
		if (errno != EINTR) {
			event_warn("epoll_wait");
			return (-1);
		}
		return (0);
	}

	event_debug(("%s: epoll_wait reports %d", __func__, res));
	EVUTIL_ASSERT(res <= epollop->nevents);
	for (i = 0; i < res; i++) {
		int what = events[i].events;
		short ev = 0;

		if (what & (EPOLLHUP|EPOLLERR)) {
			/* A hangup or error makes the fd both readable and
			 * writable, so callbacks get a chance to notice the
			 * condition. */
			ev = EV_READ | EV_WRITE;
		} else {
			if (what & EPOLLIN)
				ev |= EV_READ;
			if (what & EPOLLOUT)
				ev |= EV_WRITE;
		}

		if (!ev)
			continue;

		evmap_io_active(base, events[i].data.fd, ev | EV_ET);
	}
	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
		/* We used all of the event space this time.  We should
		   be ready for more events next time. */
		int new_nevents = epollop->nevents * 2;
		struct epoll_event *new_events;

		new_events = mm_realloc(epollop->events,
		    new_nevents * sizeof(struct epoll_event));
		if (new_events) {
			epollop->events = new_events;
			epollop->nevents = new_nevents;
		}
	}

	return (0);
}
static void
epoll_dealloc(struct event_base *base)
{
	struct epollop *epollop = base->evbase;

	evsig_dealloc(base);
	if (epollop->events)
		mm_free(epollop->events);
	if (epollop->epfd >= 0)
		close(epollop->epfd);

	memset(epollop, 0, sizeof(struct epollop));
	mm_free(epollop);
}