2 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
3 * Copyright 2007-2012 Niels Provos, Nick Mathewson
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 * derived from this software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "event2/event-config.h"
28 #include "evconfig-private.h"
30 #ifdef EVENT__HAVE_EPOLL
33 #include <sys/types.h>
34 #include <sys/resource.h>
35 #ifdef EVENT__HAVE_SYS_TIME_H
38 #include <sys/queue.h>
39 #include <sys/epoll.h>
47 #ifdef EVENT__HAVE_FCNTL_H
50 #ifdef EVENT__HAVE_SYS_TIMERFD_H
51 #include <sys/timerfd.h>
54 #include "event-internal.h"
55 #include "evsignal-internal.h"
56 #include "event2/thread.h"
57 #include "evthread-internal.h"
58 #include "log-internal.h"
59 #include "evmap-internal.h"
60 #include "changelist-internal.h"
61 #include "time-internal.h"
63 /* Since Linux 2.6.17, epoll is able to report about peer half-closed connection
64 using special EPOLLRDHUP flag on a read event.
66 #if !defined(EPOLLRDHUP)
68 #define EARLY_CLOSE_IF_HAVE_RDHUP 0
70 #define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE
73 #include "epolltable-internal.h"
75 #if defined(EVENT__HAVE_SYS_TIMERFD_H) && \
76 defined(EVENT__HAVE_TIMERFD_CREATE) && \
77 defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \
79 /* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available
80 and working. This means that we can't support it on 2.6.25 (where timerfd
81 was introduced) or 2.6.26, since 2.6.27 introduced those flags.
/* Per-event_base state for the epoll backend. */
struct epollop {
	struct epoll_event *events; /* buffer handed to epoll_wait() */
	int nevents;                /* capacity of the events buffer */
	int epfd;                   /* the epoll instance fd, or -1 */
	/* timerfd used for sub-millisecond timeouts; -1 (or unused)
	 * unless the timerfd path is compiled in and enabled. */
	int timerfd;
};
/* Forward declarations for the eventop backend entry points below. */
static void *epoll_init(struct event_base *);
static int epoll_dispatch(struct event_base *, struct timeval *);
static void epoll_dealloc(struct event_base *);
99 static const struct eventop epollops_changelist
= {
100 "epoll (with changelist)",
102 event_changelist_add_
,
103 event_changelist_del_
,
107 EV_FEATURE_ET
|EV_FEATURE_O1
| EARLY_CLOSE_IF_HAVE_RDHUP
,
108 EVENT_CHANGELIST_FDINFO_SIZE
112 static int epoll_nochangelist_add(struct event_base
*base
, evutil_socket_t fd
,
113 short old
, short events
, void *p
);
114 static int epoll_nochangelist_del(struct event_base
*base
, evutil_socket_t fd
,
115 short old
, short events
, void *p
);
117 const struct eventop epollops
= {
120 epoll_nochangelist_add
,
121 epoll_nochangelist_del
,
125 EV_FEATURE_ET
|EV_FEATURE_O1
|EV_FEATURE_EARLY_CLOSE
,
/* Initial and maximum sizes of the epoll_wait() result buffer. */
#define INITIAL_NEVENT 32
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
141 epoll_init(struct event_base
*base
)
144 struct epollop
*epollop
;
146 #ifdef EVENT__HAVE_EPOLL_CREATE1
147 /* First, try the shiny new epoll_create1 interface, if we have it. */
148 epfd
= epoll_create1(EPOLL_CLOEXEC
);
151 /* Initialize the kernel queue using the old interface. (The
152 size field is ignored since 2.6.8.) */
153 if ((epfd
= epoll_create(32000)) == -1) {
155 event_warn("epoll_create");
158 evutil_make_socket_closeonexec(epfd
);
161 if (!(epollop
= mm_calloc(1, sizeof(struct epollop
)))) {
166 epollop
->epfd
= epfd
;
168 /* Initialize fields */
169 epollop
->events
= mm_calloc(INITIAL_NEVENT
, sizeof(struct epoll_event
));
170 if (epollop
->events
== NULL
) {
175 epollop
->nevents
= INITIAL_NEVENT
;
177 if ((base
->flags
& EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST
) != 0 ||
178 ((base
->flags
& EVENT_BASE_FLAG_IGNORE_ENV
) == 0 &&
179 evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL
)) {
181 base
->evsel
= &epollops_changelist
;
186 The epoll interface ordinarily gives us one-millisecond precision,
187 so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE
188 timer. But when the user has set the new PRECISE_TIMER flag for an
189 event_base, we can try to use timerfd to give them finer granularity.
191 if ((base
->flags
& EVENT_BASE_FLAG_PRECISE_TIMER
) &&
192 base
->monotonic_timer
.monotonic_clock
== CLOCK_MONOTONIC
) {
194 fd
= epollop
->timerfd
= timerfd_create(CLOCK_MONOTONIC
, TFD_NONBLOCK
|TFD_CLOEXEC
);
195 if (epollop
->timerfd
>= 0) {
196 struct epoll_event epev
;
197 memset(&epev
, 0, sizeof(epev
));
198 epev
.data
.fd
= epollop
->timerfd
;
199 epev
.events
= EPOLLIN
;
200 if (epoll_ctl(epollop
->epfd
, EPOLL_CTL_ADD
, fd
, &epev
) < 0) {
201 event_warn("epoll_ctl(timerfd)");
203 epollop
->timerfd
= -1;
206 if (errno
!= EINVAL
&& errno
!= ENOSYS
) {
207 /* These errors probably mean that we were
208 * compiled with timerfd/TFD_* support, but
209 * we're running on a kernel that lacks those.
211 event_warn("timerfd_create");
213 epollop
->timerfd
= -1;
216 epollop
->timerfd
= -1;
226 change_to_string(int change
)
228 change
&= (EV_CHANGE_ADD
|EV_CHANGE_DEL
);
229 if (change
== EV_CHANGE_ADD
) {
231 } else if (change
== EV_CHANGE_DEL
) {
233 } else if (change
== 0) {
/* Render an EPOLL_CTL_* opcode as a short string for log messages. */
static const char *
epoll_op_to_string(int op)
{
	return op == EPOLL_CTL_ADD?"ADD":
	    op == EPOLL_CTL_DEL?"DEL":
	    op == EPOLL_CTL_MOD?"MOD":
	    "???";
}
/* Format-string + argument pack describing one applied change, for
 * event_debug()/event_warn().  Expands to a printf-style argument list. */
#define PRINT_CHANGES(op, events, ch, status)  \
	"Epoll %s(%d) on fd %d " status ". "       \
	"Old events were %d; "                     \
	"read change was %d (%s); "                \
	"write change was %d (%s); "               \
	"close change was %d (%s)",                \
	epoll_op_to_string(op),                    \
	events,                                    \
	ch->fd,                                    \
	ch->old_events,                            \
	ch->read_change,                           \
	change_to_string(ch->read_change),         \
	ch->write_change,                          \
	change_to_string(ch->write_change),        \
	ch->close_change,                          \
	change_to_string(ch->close_change)
267 epoll_apply_one_change(struct event_base
*base
,
268 struct epollop
*epollop
,
269 const struct event_change
*ch
)
271 struct epoll_event epev
;
275 idx
= EPOLL_OP_TABLE_INDEX(ch
);
276 op
= epoll_op_table
[idx
].op
;
277 events
= epoll_op_table
[idx
].events
;
280 EVUTIL_ASSERT(op
== 0);
284 if ((ch
->read_change
|ch
->write_change
) & EV_CHANGE_ET
)
287 memset(&epev
, 0, sizeof(epev
));
288 epev
.data
.fd
= ch
->fd
;
289 epev
.events
= events
;
290 if (epoll_ctl(epollop
->epfd
, op
, ch
->fd
, &epev
) == 0) {
291 event_debug((PRINT_CHANGES(op
, epev
.events
, ch
, "okay")));
297 if (errno
== ENOENT
) {
298 /* If a MOD operation fails with ENOENT, the
299 * fd was probably closed and re-opened. We
300 * should retry the operation as an ADD.
302 if (epoll_ctl(epollop
->epfd
, EPOLL_CTL_ADD
, ch
->fd
, &epev
) == -1) {
303 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
304 (int)epev
.events
, ch
->fd
);
307 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
315 if (errno
== EEXIST
) {
316 /* If an ADD operation fails with EEXIST,
317 * either the operation was redundant (as with a
318 * precautionary add), or we ran into a fun
319 * kernel bug where using dup*() to duplicate the
320 * same file into the same fd gives you the same epitem
321 * rather than a fresh one. For the second case,
322 * we must retry with MOD. */
323 if (epoll_ctl(epollop
->epfd
, EPOLL_CTL_MOD
, ch
->fd
, &epev
) == -1) {
324 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
325 (int)epev
.events
, ch
->fd
);
328 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
336 if (errno
== ENOENT
|| errno
== EBADF
|| errno
== EPERM
) {
337 /* If a delete fails with one of these errors,
338 * that's fine too: we closed the fd before we
339 * got around to calling epoll_dispatch. */
340 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
351 event_warn(PRINT_CHANGES(op
, epev
.events
, ch
, "failed"));
356 epoll_apply_changes(struct event_base
*base
)
358 struct event_changelist
*changelist
= &base
->changelist
;
359 struct epollop
*epollop
= base
->evbase
;
360 struct event_change
*ch
;
365 for (i
= 0; i
< changelist
->n_changes
; ++i
) {
366 ch
= &changelist
->changes
[i
];
367 if (epoll_apply_one_change(base
, epollop
, ch
) < 0)
375 epoll_nochangelist_add(struct event_base
*base
, evutil_socket_t fd
,
376 short old
, short events
, void *p
)
378 struct event_change ch
;
381 ch
.read_change
= ch
.write_change
= ch
.close_change
= 0;
382 if (events
& EV_WRITE
)
383 ch
.write_change
= EV_CHANGE_ADD
|
385 if (events
& EV_READ
)
386 ch
.read_change
= EV_CHANGE_ADD
|
388 if (events
& EV_CLOSED
)
389 ch
.close_change
= EV_CHANGE_ADD
|
392 return epoll_apply_one_change(base
, base
->evbase
, &ch
);
396 epoll_nochangelist_del(struct event_base
*base
, evutil_socket_t fd
,
397 short old
, short events
, void *p
)
399 struct event_change ch
;
402 ch
.read_change
= ch
.write_change
= ch
.close_change
= 0;
403 if (events
& EV_WRITE
)
404 ch
.write_change
= EV_CHANGE_DEL
;
405 if (events
& EV_READ
)
406 ch
.read_change
= EV_CHANGE_DEL
;
407 if (events
& EV_CLOSED
)
408 ch
.close_change
= EV_CHANGE_DEL
;
410 return epoll_apply_one_change(base
, base
->evbase
, &ch
);
414 epoll_dispatch(struct event_base
*base
, struct timeval
*tv
)
416 struct epollop
*epollop
= base
->evbase
;
417 struct epoll_event
*events
= epollop
->events
;
422 if (epollop
->timerfd
>= 0) {
423 struct itimerspec is
;
424 is
.it_interval
.tv_sec
= 0;
425 is
.it_interval
.tv_nsec
= 0;
427 /* No timeout; disarm the timer. */
428 is
.it_value
.tv_sec
= 0;
429 is
.it_value
.tv_nsec
= 0;
431 if (tv
->tv_sec
== 0 && tv
->tv_usec
== 0) {
432 /* we need to exit immediately; timerfd can't
436 is
.it_value
.tv_sec
= tv
->tv_sec
;
437 is
.it_value
.tv_nsec
= tv
->tv_usec
* 1000;
439 /* TODO: we could avoid unnecessary syscalls here by only
440 calling timerfd_settime when the top timeout changes, or
441 when we're called with a different timeval.
443 if (timerfd_settime(epollop
->timerfd
, 0, &is
, NULL
) < 0) {
444 event_warn("timerfd_settime");
449 timeout
= evutil_tv_to_msec_(tv
);
450 if (timeout
< 0 || timeout
> MAX_EPOLL_TIMEOUT_MSEC
) {
451 /* Linux kernels can wait forever if the timeout is
452 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
453 timeout
= MAX_EPOLL_TIMEOUT_MSEC
;
457 epoll_apply_changes(base
);
458 event_changelist_remove_all_(&base
->changelist
, base
);
460 EVBASE_RELEASE_LOCK(base
, th_base_lock
);
462 res
= epoll_wait(epollop
->epfd
, events
, epollop
->nevents
, timeout
);
464 EVBASE_ACQUIRE_LOCK(base
, th_base_lock
);
467 if (errno
!= EINTR
) {
468 event_warn("epoll_wait");
475 event_debug(("%s: epoll_wait reports %d", __func__
, res
));
476 EVUTIL_ASSERT(res
<= epollop
->nevents
);
478 for (i
= 0; i
< res
; i
++) {
479 int what
= events
[i
].events
;
482 if (events
[i
].data
.fd
== epollop
->timerfd
)
486 if (what
& (EPOLLHUP
|EPOLLERR
)) {
487 ev
= EV_READ
| EV_WRITE
;
493 if (what
& EPOLLRDHUP
)
500 evmap_io_active_(base
, events
[i
].data
.fd
, ev
| EV_ET
);
503 if (res
== epollop
->nevents
&& epollop
->nevents
< MAX_NEVENT
) {
504 /* We used all of the event space this time. We should
505 be ready for more events next time. */
506 int new_nevents
= epollop
->nevents
* 2;
507 struct epoll_event
*new_events
;
509 new_events
= mm_realloc(epollop
->events
,
510 new_nevents
* sizeof(struct epoll_event
));
512 epollop
->events
= new_events
;
513 epollop
->nevents
= new_nevents
;
522 epoll_dealloc(struct event_base
*base
)
524 struct epollop
*epollop
= base
->evbase
;
526 evsig_dealloc_(base
);
528 mm_free(epollop
->events
);
529 if (epollop
->epfd
>= 0)
530 close(epollop
->epfd
);
532 if (epollop
->timerfd
>= 0)
533 close(epollop
->timerfd
);
536 memset(epollop
, 0, sizeof(struct epollop
));
540 #endif /* EVENT__HAVE_EPOLL */