/*	$NetBSD: kern_event.c,v 1.68 2009/12/20 09:36:05 dsl Exp $	*/

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.68 2009/12/20 09:36:05 dsl Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/filedesc.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/conf.h>
#include <sys/atomic.h>
static int	kqueue_scan(file_t *, size_t, struct kevent *,
			    const struct timespec *, register_t *,
			    const struct kevent_ops *, struct kevent *,
			    size_t);
static int	kqueue_ioctl(file_t *, u_long, void *);
static int	kqueue_fcntl(file_t *, u_int, void *);
static int	kqueue_poll(file_t *, int);
static int	kqueue_kqfilter(file_t *, struct knote *);
static int	kqueue_stat(file_t *, struct stat *);
static int	kqueue_close(file_t *);
static int	kqueue_register(struct kqueue *, struct kevent *);
static void	kqueue_doclose(struct kqueue *, struct klist *, int);

static void	knote_detach(struct knote *, filedesc_t *fdp, bool);
static void	knote_enqueue(struct knote *);
static void	knote_activate(struct knote *);

static void	filt_kqdetach(struct knote *);
static int	filt_kqueue(struct knote *, long hint);
static int	filt_procattach(struct knote *);
static void	filt_procdetach(struct knote *);
static int	filt_proc(struct knote *, long hint);
static int	filt_fileattach(struct knote *);
static void	filt_timerexpire(void *x);
static int	filt_timerattach(struct knote *);
static void	filt_timerdetach(struct knote *);
static int	filt_timer(struct knote *, long hint);
static const struct fileops kqueueops = {
	.fo_read = (void *)enxio,
	.fo_write = (void *)enxio,
	.fo_ioctl = kqueue_ioctl,
	.fo_fcntl = kqueue_fcntl,
	.fo_poll = kqueue_poll,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_restart = fnullop_restart,
};
static const struct filterops kqread_filtops =
	{ 1, NULL, filt_kqdetach, filt_kqueue };
static const struct filterops proc_filtops =
	{ 0, filt_procattach, filt_procdetach, filt_proc };
static const struct filterops file_filtops =
	{ 1, filt_fileattach, NULL, NULL };
static const struct filterops timer_filtops =
	{ 0, filt_timerattach, filt_timerdetach, filt_timer };

static u_int	kq_ncallouts = 0;
static int	kq_calloutmax = (4 * 1024);

#define	KN_HASHSIZE		64	/* XXX should be tunable */
#define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern const struct filterops sig_filtops;
/*
 * Table for all system-defined filters.
 * These should be listed in the numeric order of the EVFILT_* defines.
 * If filtops is NULL, the filter isn't implemented in NetBSD.
 * End of list is when name is NULL.
 *
 * Note that 'refcnt' is meaningless for built-in filters.
 */
struct kfilter {
	const char	*name;		/* name of filter */
	uint32_t	filter;		/* id of filter */
	unsigned	refcnt;		/* reference count */
	const struct filterops *filtops;/* operations for filter */
	size_t		namelen;	/* length of name string */
};

/* System defined filters */
static struct kfilter sys_kfilters[] = {
	{ "EVFILT_READ",	EVFILT_READ,	0, &file_filtops, 0 },
	{ "EVFILT_WRITE",	EVFILT_WRITE,	0, &file_filtops, 0 },
	{ "EVFILT_AIO",		EVFILT_AIO,	0, NULL, 0 },
	{ "EVFILT_VNODE",	EVFILT_VNODE,	0, &file_filtops, 0 },
	{ "EVFILT_PROC",	EVFILT_PROC,	0, &proc_filtops, 0 },
	{ "EVFILT_SIGNAL",	EVFILT_SIGNAL,	0, &sig_filtops, 0 },
	{ "EVFILT_TIMER",	EVFILT_TIMER,	0, &timer_filtops, 0 },
	{ NULL,			0,		0, NULL, 0 },
};
/* User defined kfilters */
static struct kfilter	*user_kfilters;		/* array */
static int		user_kfilterc;		/* current offset */
static int		user_kfiltermaxc;	/* max size so far */
static size_t		user_kfiltersz;		/* size of allocated memory */

static krwlock_t	kqueue_filter_lock;	/* lock on filter lists */
static kmutex_t		kqueue_misc_lock;	/* miscellaneous */

static kauth_listener_t	kqueue_listener;
static int
kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	struct proc *p;
	int result;

	result = KAUTH_RESULT_DEFER;
	p = arg0;

	if (action != KAUTH_PROCESS_KEVENT_FILTER)
		return result;

	if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
	    ISSET(p->p_flag, PK_SUGID)))
		return result;

	result = KAUTH_RESULT_ALLOW;

	return result;
}
/*
 * Initialize the kqueue subsystem.
 */
void
kqueue_init(void)
{

	rw_init(&kqueue_filter_lock);
	mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);

	kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    kqueue_listener_cb, NULL);
}
/*
 * Find kfilter entry by name, or NULL if not found.
 */
static struct kfilter *
kfilter_byname_sys(const char *name)
{
	int i;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	for (i = 0; sys_kfilters[i].name != NULL; i++) {
		if (strcmp(name, sys_kfilters[i].name) == 0)
			return &sys_kfilters[i];
	}
	return NULL;
}
static struct kfilter *
kfilter_byname_user(const char *name)
{
	int i;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	/* user filter slots have a NULL name if previously deregistered */
	for (i = 0; i < user_kfilterc; i++) {
		if (user_kfilters[i].name != NULL &&
		    strcmp(name, user_kfilters[i].name) == 0)
			return &user_kfilters[i];
	}
	return NULL;
}
static struct kfilter *
kfilter_byname(const char *name)
{
	struct kfilter *kfilter;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	if ((kfilter = kfilter_byname_sys(name)) != NULL)
		return kfilter;

	return kfilter_byname_user(name);
}
/*
 * Find kfilter entry by filter id, or NULL if not found.
 * Assumes entries are indexed in filter id order, for speed.
 */
static struct kfilter *
kfilter_byfilter(uint32_t filter)
{
	struct kfilter *kfilter;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	if (filter < EVFILT_SYSCOUNT)	/* it's a system filter */
		kfilter = &sys_kfilters[filter];
	else if (user_kfilters != NULL &&
	    filter < EVFILT_SYSCOUNT + user_kfilterc)
		/* it's a user filter */
		kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
	else
		return (NULL);		/* out of range */
	KASSERT(kfilter->filter == filter);	/* sanity check! */
	return (kfilter);
}
/*
 * Register a new kfilter. Stores the entry in user_kfilters.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 * If retfilter != NULL, the new filterid is returned in it.
 */
int
kfilter_register(const char *name, const struct filterops *filtops,
    int *retfilter)
{
	struct kfilter *kfilter;
	size_t len;
	int i;

	if (name == NULL || name[0] == '\0' || filtops == NULL)
		return (EINVAL);	/* invalid args */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EEXIST);	/* already exists */
	}
	if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* too many */
	}

	for (i = 0; i < user_kfilterc; i++) {
		kfilter = &user_kfilters[i];
		if (kfilter->name == NULL) {
			/* Previously deregistered slot.  Reuse. */
			goto reuse;
		}
	}

	/* check if need to grow user_kfilters */
	if (user_kfilterc + 1 > user_kfiltermaxc) {
		/* Grow in KFILTER_EXTENT chunks. */
		user_kfiltermaxc += KFILTER_EXTENT;
		len = user_kfiltermaxc * sizeof(*kfilter);
		kfilter = kmem_alloc(len, KM_SLEEP);
		memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
		if (user_kfilters != NULL) {
			memcpy(kfilter, user_kfilters, user_kfiltersz);
			kmem_free(user_kfilters, user_kfiltersz);
		}
		user_kfiltersz = len;
		user_kfilters = kfilter;
	}
	/* Adding new slot */
	kfilter = &user_kfilters[user_kfilterc++];
reuse:
	kfilter->namelen = strlen(name) + 1;
	kfilter->name = kmem_alloc(kfilter->namelen, KM_SLEEP);
	memcpy(__UNCONST(kfilter->name), name, kfilter->namelen);

	kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;

	kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
	memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));

	if (retfilter != NULL)
		*retfilter = kfilter->filter;
	rw_exit(&kqueue_filter_lock);

	return (0);
}
/*
 * Unregister a kfilter previously registered with kfilter_register.
 * This retains the filter id, but clears the name and frees filtops (filter
 * operations), so that the number isn't reused during a boot.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 */
int
kfilter_unregister(const char *name)
{
	struct kfilter *kfilter;

	if (name == NULL || name[0] == '\0')
		return (EINVAL);	/* invalid name */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname_sys(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* can't detach system filters */
	}

	kfilter = kfilter_byname_user(name);
	if (kfilter == NULL) {
		rw_exit(&kqueue_filter_lock);
		return (ENOENT);
	}
	if (kfilter->refcnt != 0) {
		rw_exit(&kqueue_filter_lock);
		return (EBUSY);
	}

	/* Cast away const (but we know it's safe). */
	kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
	kfilter->name = NULL;	/* mark as `not implemented' */

	if (kfilter->filtops != NULL) {
		/* Cast away const (but we know it's safe). */
		kmem_free(__UNCONST(kfilter->filtops),
		    sizeof(*kfilter->filtops));
		kfilter->filtops = NULL;	/* mark as `not implemented' */
	}

	rw_exit(&kqueue_filter_lock);
	return (0);
}
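
/*
 * Illustrative kernel-side sketch (not part of this file): a loadable module
 * could publish its own event filter through kfilter_register() above and
 * tear it down with kfilter_unregister().  The "myfilt_*" names and the
 * "EVFILT_MYDEV" string are hypothetical; the filterops layout matches the
 * initializers used elsewhere in this file ({ f_isfd, f_attach, f_detach,
 * f_event }).
 *
 *	static int	myfilt_attach(struct knote *);
 *	static void	myfilt_detach(struct knote *);
 *	static int	myfilt_event(struct knote *, long);
 *
 *	static const struct filterops myfilt_filtops =
 *		{ 0, myfilt_attach, myfilt_detach, myfilt_event };
 *	static int myfilt_id;
 *
 *	// at module load:
 *	error = kfilter_register("EVFILT_MYDEV", &myfilt_filtops, &myfilt_id);
 *	// at module unload:
 *	error = kfilter_unregister("EVFILT_MYDEV");
 */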
/*
 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
 * descriptors. Calls fileops kqfilter method for given file descriptor.
 */
static int
filt_fileattach(struct knote *kn)
{
	file_t *fp;

	fp = kn->kn_obj;

	return (*fp->f_ops->fo_kqfilter)(fp, kn);
}
/*
 * Filter detach method for EVFILT_READ on kqueue descriptor.
 */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_data;

	mutex_spin_enter(&kq->kq_lock);
	SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
	mutex_spin_exit(&kq->kq_lock);
}
/*
 * Filter event method for EVFILT_READ on kqueue descriptor.
 */
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq;
	int rv;

	kq = ((file_t *)kn->kn_obj)->f_data;

	if (hint != NOTE_SUBMIT)
		mutex_spin_enter(&kq->kq_lock);
	kn->kn_data = kq->kq_count;
	rv = (kn->kn_data > 0);
	if (hint != NOTE_SUBMIT)
		mutex_spin_exit(&kq->kq_lock);

	return rv;
}
/*
 * Filter attach method for EVFILT_PROC.
 */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p, *curp;
	struct lwp *curl;

	curl = curlwp;
	curp = curl->l_proc;

	mutex_enter(proc_lock);
	p = p_find(kn->kn_id, PFIND_LOCKED);
	if (p == NULL) {
		mutex_exit(proc_lock);
		return ESRCH;
	}

	/*
	 * Fail if it's not owned by you, or the last exec gave us
	 * setuid/setgid privs (unless you're root).
	 */
	mutex_enter(p->p_lock);
	mutex_exit(proc_lock);
	if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
	    p, NULL, NULL, NULL) != 0) {
		mutex_exit(p->p_lock);
		return EACCES;
	}

	kn->kn_obj = p;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}
	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
	mutex_exit(p->p_lock);

	return 0;
}
/*
 * Filter detach method for EVFILT_PROC.
 *
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to. So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out. However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process might not exist any more.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	if (kn->kn_status & KN_DETACHED)
		return;

	p = kn->kn_obj;

	mutex_enter(p->p_lock);
	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
	mutex_exit(p->p_lock);
}
/*
 * Filter event method for EVFILT_PROC.
 */
static int
filt_proc(struct knote *kn, long hint)
{
	struct kqueue *kq;
	struct kevent kev;
	u_int event, fflag;
	int error;

	event = (u_int)hint & NOTE_PCTRLMASK;
	kq = kn->kn_kq;
	fflag = 0;

	/* If the user is interested in this event, record it. */
	if (kn->kn_sfflags & event)
		fflag |= kn->kn_sfflags & event;

	if (event == NOTE_EXIT) {
		/*
		 * Process is gone, so flag the event as finished.
		 *
		 * Detach the knote from watched process and mark
		 * it as such. We can't leave this to kqueue_scan(),
		 * since the process might not exist by then. And we
		 * have to do this now, since psignal KNOTE() is called
		 * also for zombies and we might end up reading freed
		 * memory if the kevent would already be picked up
		 * and the knote g/c'ed.
		 */
		filt_procdetach(kn);

		mutex_spin_enter(&kq->kq_lock);
		kn->kn_status |= KN_DETACHED;
		/* Mark as ONESHOT, so that the knote is g/c'ed when read */
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_fflags |= fflag;
		mutex_spin_exit(&kq->kq_lock);

		return 1;
	}

	mutex_spin_enter(&kq->kq_lock);
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		/*
		 * Process forked, and user wants to track the new process,
		 * so attach a new knote to it, and immediately report an
		 * event with the parent's pid.  Register knote with new
		 * process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		mutex_spin_exit(&kq->kq_lock);
		error = kqueue_register(kq, &kev);
		mutex_spin_enter(&kq->kq_lock);
		if (error != 0)
			kn->kn_fflags |= NOTE_TRACKERR;
	} else {
		kn->kn_fflags |= fflag;
	}
	fflag = kn->kn_fflags;
	mutex_spin_exit(&kq->kq_lock);

	return fflag != 0;
}
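
/*
 * Illustrative userland sketch (not part of this file): watching a process
 * with EVFILT_PROC.  NOTE_TRACK makes the code above attach a new knote to
 * each forked child and report it with NOTE_CHILD.  "kq" and "pid" are
 * hypothetical.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, pid, EVFILT_PROC, EV_ADD,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *	// later, a returned event with (fflags & NOTE_EXIT) means pid exited;
 *	// (fflags & NOTE_CHILD) means ev.ident is a new child of ev.data.
 */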
static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	int tticks;

	mutex_enter(&kqueue_misc_lock);
	kn->kn_data++;
	knote_activate(kn);
	if ((kn->kn_flags & EV_ONESHOT) == 0) {
		tticks = mstohz(kn->kn_sdata);
		callout_schedule((callout_t *)kn->kn_hook, tticks);
	}
	mutex_exit(&kqueue_misc_lock);
}
/*
 * data contains amount of time to sleep, in milliseconds
 */
static int
filt_timerattach(struct knote *kn)
{
	callout_t *calloutp;
	struct kqueue *kq;
	int tticks;

	tticks = mstohz(kn->kn_sdata);

	/* if the supplied value is under our resolution, use 1 tick */
	if (tticks == 0) {
		if (kn->kn_sdata == 0)
			return EINVAL;
		tticks = 1;
	}

	if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
	    (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
		atomic_dec_uint(&kq_ncallouts);
		return ENOMEM;
	}
	callout_init(calloutp, CALLOUT_MPSAFE);

	kq = kn->kn_kq;
	mutex_spin_enter(&kq->kq_lock);
	kn->kn_flags |= EV_CLEAR;	/* automatically set */
	kn->kn_hook = calloutp;
	mutex_spin_exit(&kq->kq_lock);

	callout_reset(calloutp, tticks, filt_timerexpire, kn);

	return (0);
}
static void
filt_timerdetach(struct knote *kn)
{
	callout_t *calloutp;

	calloutp = (callout_t *)kn->kn_hook;
	callout_halt(calloutp, NULL);
	callout_destroy(calloutp);
	kmem_free(calloutp, sizeof(*calloutp));
	atomic_dec_uint(&kq_ncallouts);
}
static int
filt_timer(struct knote *kn, long hint)
{
	int rv;

	mutex_enter(&kqueue_misc_lock);
	rv = (kn->kn_data != 0);
	mutex_exit(&kqueue_misc_lock);

	return rv;
}
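
/*
 * Illustrative userland sketch (not part of this file): a periodic timer
 * event.  The "data" field carries the period in milliseconds, as the attach
 * routine above documents.  "kq" is hypothetical.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	// fires every 500 ms
 *	// a oneshot variant: EV_ADD | EV_ONESHOT fires exactly once.
 */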
/*
 * This filter "event" routine simulates seltrue().
 */
int
filt_seltrue(struct knote *kn, long hint)
{

	/*
	 * We don't know how much data can be read/written,
	 * but we know that it *can* be.  This is about as
	 * good as select/poll does as well.
	 */
	kn->kn_data = 0;
	return (1);
}

/*
 * This provides full kqfilter entry for device switch tables, which
 * has same effect as filter using filt_seltrue() as filter method.
 */
static void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

const struct filterops seltrue_filtops =
	{ 1, NULL, filt_seltruedetach, filt_seltrue };
int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (EINVAL);
	}

	/* Nothing more to do */
	return (0);
}
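/*
 * Illustrative driver-side sketch (not part of this file): a character
 * device that is always "ready" can point its kqfilter entry at
 * seltrue_kqfilter above, e.g. in its cdevsw.  "mydev_cdevsw" and the elided
 * fields are hypothetical; only seltrue_kqfilter is real.
 *
 *	const struct cdevsw mydev_cdevsw = {
 *		...
 *		.d_poll = nopoll,
 *		.d_kqfilter = seltrue_kqfilter,
 *		...
 *	};
 */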
/*
 * kqueue(2) system call.
 */
int
sys_kqueue(struct lwp *l, const void *v, register_t *retval)
{
	struct kqueue *kq;
	file_t *fp;
	int fd, error;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return error;

	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
	mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
	cv_init(&kq->kq_cv, "kqueue");
	selinit(&kq->kq_sel);
	TAILQ_INIT(&kq->kq_head);
	fp->f_data = kq;
	*retval = fd;
	kq->kq_fdp = curlwp->l_fd;
	fd_affix(curproc, fp, fd);

	return error;
}
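
/*
 * Illustrative userland sketch (not part of this file): the typical way
 * kqueue(2) above and kevent(2) below are consumed.  "sock" and
 * "handle_read" are hypothetical.
 *
 *	int kq = kqueue();
 *	struct kevent ev;
 *
 *	EV_SET(&ev, sock, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
 *	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent: register");
 *	for (;;) {
 *		struct kevent out;
 *		int n = kevent(kq, NULL, 0, &out, 1, NULL);	// block
 *		if (n > 0 && out.filter == EVFILT_READ)
 *			handle_read(out.ident, out.data);	// bytes ready
 *	}
 */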
/*
 * kevent(2) system call.
 */
static int
kevent_fetch_changes(void *private, const struct kevent *changelist,
    struct kevent *changes, size_t index, int n)
{

	return copyin(changelist + index, changes, n * sizeof(*changes));
}

static int
kevent_put_events(void *private, struct kevent *events,
    struct kevent *eventlist, size_t index, int n)
{

	return copyout(events, eventlist + index, n * sizeof(*events));
}

static const struct kevent_ops kevent_native_ops = {
	.keo_private = NULL,
	.keo_fetch_timeout = copyin,
	.keo_fetch_changes = kevent_fetch_changes,
	.keo_put_events = kevent_put_events,
};
int
sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(size_t) nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(size_t) nevents;
		syscallarg(const struct timespec *) timeout;
	} */

	return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
	    SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
	    SCARG(uap, timeout), &kevent_native_ops);
}
int
kevent1(register_t *retval, int fd,
    const struct kevent *changelist, size_t nchanges,
    struct kevent *eventlist, size_t nevents,
    const struct timespec *timeout,
    const struct kevent_ops *keops)
{
	struct kevent *kevp;
	struct kqueue *kq;
	struct timespec	ts;
	size_t i, n, ichange;
	int nerrors, error;
	struct kevent kevbuf[8];	/* approx 300 bytes on 64-bit */
	file_t *fp;

	/* check that we're dealing with a kq */
	fp = fd_getfile(fd);
	if (fp == NULL)
		return (EBADF);

	if (fp->f_type != DTYPE_KQUEUE) {
		fd_putfile(fd);
		return (EBADF);
	}

	if (timeout != NULL) {
		error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
		if (error)
			goto done;
		timeout = &ts;
	}

	kq = (struct kqueue *)fp->f_data;
	nerrors = 0;
	ichange = 0;

	/* traverse list of events to register */
	while (nchanges > 0) {
		n = MIN(nchanges, __arraycount(kevbuf));
		error = (*keops->keo_fetch_changes)(keops->keo_private,
		    changelist, kevbuf, ichange, n);
		if (error)
			goto done;
		for (i = 0; i < n; i++) {
			kevp = &kevbuf[i];
			kevp->flags &= ~EV_SYSFLAGS;
			/* register each knote */
			error = kqueue_register(kq, kevp);
			if (error) {
				if (nevents == 0)
					goto done;
				kevp->flags = EV_ERROR;
				kevp->data = error;
				error = (*keops->keo_put_events)
				    (keops->keo_private, kevp,
				    eventlist, nerrors, 1);
				if (error)
					goto done;
				nevents--;
				nerrors++;
			}
		}
		nchanges -= n;	/* update the results */
		ichange += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	/* actually scan through the events */
	error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
	    kevbuf, __arraycount(kevbuf));
 done:
	fd_putfile(fd);
	return (error);
}
/*
 * Register a given kevent kev onto the kqueue
 */
static int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
	struct kfilter *kfilter;
	filedesc_t *fdp;
	file_t *fp;
	fdfile_t *ff;
	struct knote *kn, *newkn;
	struct klist *list;
	int error, fd, rv;

	fdp = kq->kq_fdp;
	fp = NULL;
	kn = NULL;
	error = 0;
	fd = 0;

	newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);

	rw_enter(&kqueue_filter_lock, RW_READER);
	kfilter = kfilter_byfilter(kev->filter);
	if (kfilter == NULL || kfilter->filtops == NULL) {
		/* filter not found nor implemented */
		rw_exit(&kqueue_filter_lock);
		kmem_free(newkn, sizeof(*newkn));
		return (EINVAL);
	}

	mutex_enter(&fdp->fd_lock);

	/* search if knote already exists */
	if (kfilter->filtops->f_isfd) {
		/* monitoring a file descriptor */
		fd = kev->ident;
		if ((fp = fd_getfile(fd)) == NULL) {
			mutex_exit(&fdp->fd_lock);
			rw_exit(&kqueue_filter_lock);
			kmem_free(newkn, sizeof(*newkn));
			return EBADF;
		}
		ff = fdp->fd_dt->dt_ff[fd];
		if (fd <= fdp->fd_lastkqfile) {
			SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
				if (kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	} else {
		/*
		 * not monitoring a file descriptor, so
		 * lookup knotes in internal hash table
		 */
		if (fdp->fd_knhashmask != 0) {
			list = &fdp->fd_knhash[
			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link) {
				if (kev->ident == kn->kn_id &&
				    kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			/* create new knote */
			kn = newkn;
			newkn = NULL;
			kn->kn_obj = fp;
			kn->kn_kq = kq;
			kn->kn_fop = kfilter->filtops;
			kn->kn_kfilter = kfilter;
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			if (!kn->kn_fop->f_isfd) {
				/*
				 * If knote is not on an fd, store on
				 * internal hash table.
				 */
				if (fdp->fd_knhashmask == 0) {
					/* XXXAD can block with fd_lock held */
					fdp->fd_knhash = hashinit(KN_HASHSIZE,
					    HASH_LIST, true,
					    &fdp->fd_knhashmask);
				}
				list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
				    fdp->fd_knhashmask)];
			} else {
				/* Otherwise, knote is on an fd. */
				list = (struct klist *)
				    &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
				if ((int)kn->kn_id > fdp->fd_lastkqfile)
					fdp->fd_lastkqfile = kn->kn_id;
			}
			SLIST_INSERT_HEAD(list, kn, kn_link);

			KERNEL_LOCK(1, NULL);			/* XXXSMP */
			error = (*kfilter->filtops->f_attach)(kn);
			KERNEL_UNLOCK_ONE(NULL);		/* XXXSMP */
			if (error != 0) {
				/* knote_detach() drops fdp->fd_lock */
				knote_detach(kn, fdp, false);
				goto done;
			}
			atomic_inc_uint(&kfilter->refcnt);
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filter which have already been triggered.
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}
		KERNEL_LOCK(1, NULL);				/* XXXSMP */
		rv = (*kn->kn_fop->f_event)(kn, 0);
		KERNEL_UNLOCK_ONE(NULL);			/* XXXSMP */
		if (rv)
			knote_activate(kn);
	} else {
		if (kn == NULL) {
			error = ENOENT;
			mutex_exit(&fdp->fd_lock);
			goto done;
		}
		if (kev->flags & EV_DELETE) {
			/* knote_detach() drops fdp->fd_lock */
			knote_detach(kn, fdp, true);
			goto done;
		}
	}

	/* disable knote */
	if ((kev->flags & EV_DISABLE)) {
		mutex_spin_enter(&kq->kq_lock);
		if ((kn->kn_status & KN_DISABLED) == 0)
			kn->kn_status |= KN_DISABLED;
		mutex_spin_exit(&kq->kq_lock);
	}

	/* enable knote */
	if ((kev->flags & EV_ENABLE)) {
		knote_enqueue(kn);
	}
	mutex_exit(&fdp->fd_lock);
 done:
	rw_exit(&kqueue_filter_lock);
	if (newkn != NULL)
		kmem_free(newkn, sizeof(*newkn));
	if (fp != NULL)
		fd_putfile(fd);
	return (error);
}
#if defined(DEBUG)
static void
kq_check(struct kqueue *kq)
{
	const struct knote *kn;
	int count;
	int nmarker;

	KASSERT(mutex_owned(&kq->kq_lock));
	KASSERT(kq->kq_count >= 0);

	count = 0;
	nmarker = 0;
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
			panic("%s: kq=%p kn=%p inconsist 1", __func__, kq, kn);
		}
		if ((kn->kn_status & KN_MARKER) == 0) {
			if (kn->kn_kq != kq) {
				panic("%s: kq=%p kn=%p inconsist 2",
				    __func__, kq, kn);
			}
			if ((kn->kn_status & KN_ACTIVE) == 0) {
				panic("%s: kq=%p kn=%p: not active",
				    __func__, kq, kn);
			}
			count++;
			if (count > kq->kq_count) {
				goto bad;
			}
		} else {
			nmarker++;
			if (nmarker > 10000) {
				panic("%s: kq=%p too many markers: %d != %d, "
				    "nmarker=%d",
				    __func__, kq, kq->kq_count, count, nmarker);
			}
		}
	}
	if (kq->kq_count != count) {
 bad:
		panic("%s: kq=%p inconsist 3: %d != %d, nmarker=%d",
		    __func__, kq, kq->kq_count, count, nmarker);
	}
}
#else /* defined(DEBUG) */
#define	kq_check(a)	/* nothing */
#endif /* defined(DEBUG) */
/*
 * Scan through the list of events on fp (for a maximum of maxevents),
 * returning the results in to ulistp. Timeout is determined by tsp; if
 * NULL, wait indefinitely, if 0 valued, perform a poll, otherwise wait
 * as appropriate.
 */
static int
kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
	    const struct timespec *tsp, register_t *retval,
	    const struct kevent_ops *keops, struct kevent *kevbuf,
	    size_t kevcnt)
{
	struct kqueue	*kq;
	struct kevent	*kevp;
	struct timespec	ats, sleepts;
	struct knote	*kn, *marker;
	size_t		count, nkev, nevents;
	int		timeout, error, rv;
	filedesc_t	*fdp;

	fdp = curlwp->l_fd;
	kq = fp->f_data;
	count = maxevents;
	nkev = nevents = error = 0;

	if (tsp) {				/* timeout supplied */
		ats = *tsp;
		if (inittimeleft(&ats, &sleepts) == -1) {
			*retval = maxevents;
			return EINVAL;
		}
		timeout = tstohz(&ats);
		if (timeout <= 0)
			timeout = -1;		/* do poll */
	} else {
		/* no timeout, wait forever */
		timeout = 0;
	}

	marker = kmem_zalloc(sizeof(*marker), KM_SLEEP);
	marker->kn_status = KN_MARKER;
	mutex_spin_enter(&kq->kq_lock);
 retry:
	kevp = kevbuf;
	if (kq->kq_count == 0) {
		if (timeout >= 0) {
			error = cv_timedwait_sig(&kq->kq_cv,
			    &kq->kq_lock, timeout);
			if (error == 0) {
				if (tsp == NULL || (timeout =
				    gettimeleft(&ats, &sleepts)) > 0)
					goto retry;
			} else {
				/* don't restart after signals... */
				if (error == ERESTART)
					error = EINTR;
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
	} else {
		/* mark end of knote list */
		TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);

		while (count != 0) {
			kn = TAILQ_FIRST(&kq->kq_head);	/* get next knote */
			while ((kn->kn_status & KN_MARKER) != 0) {
				if (kn == marker) {
					/* it's our marker, stop */
					TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
					if (count < maxevents || (tsp != NULL &&
					    (timeout = gettimeleft(&ats,
					    &sleepts)) <= 0))
						goto done;
					goto retry;
				}
				/* someone else's marker. */
				kn = TAILQ_NEXT(kn, kn_tqe);
			}
			kq_check(kq);
			TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
			kq->kq_count--;
			kn->kn_status &= ~KN_QUEUED;
			kq_check(kq);
			if (kn->kn_status & KN_DISABLED) {
				/* don't want disabled events */
				continue;
			}
			if ((kn->kn_flags & EV_ONESHOT) == 0) {
				mutex_spin_exit(&kq->kq_lock);
				KERNEL_LOCK(1, NULL);		/* XXXSMP */
				rv = (*kn->kn_fop->f_event)(kn, 0);
				KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
				mutex_spin_enter(&kq->kq_lock);
				/* Re-poll if note was re-enqueued. */
				if ((kn->kn_status & KN_QUEUED) != 0)
					continue;
				if (rv == 0) {
					/*
					 * non-ONESHOT event that hasn't
					 * triggered again, so de-queue.
					 */
					kn->kn_status &= ~KN_ACTIVE;
					continue;
				}
			}
			/* XXXAD should be got from f_event if !oneshot. */
			*kevp++ = kn->kn_kevent;
			nkev++;
			if (kn->kn_flags & EV_ONESHOT) {
				/* delete ONESHOT events after retrieval */
				mutex_spin_exit(&kq->kq_lock);
				mutex_enter(&fdp->fd_lock);
				knote_detach(kn, fdp, true);
				mutex_spin_enter(&kq->kq_lock);
			} else if (kn->kn_flags & EV_CLEAR) {
				/* clear state after retrieval */
				kn->kn_data = 0;
				kn->kn_fflags = 0;
				kn->kn_status &= ~KN_ACTIVE;
			} else {
				/* add event back on list */
				kq_check(kq);
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
				kq->kq_count++;
				kn->kn_status |= KN_QUEUED;
				kq_check(kq);
			}
			count--;
			if (nkev == kevcnt) {
				/* do copyouts in kevcnt chunks */
				mutex_spin_exit(&kq->kq_lock);
				error = (*keops->keo_put_events)
				    (keops->keo_private,
				    kevbuf, ulistp, nevents, nkev);
				mutex_spin_enter(&kq->kq_lock);
				nevents += nkev;
				nkev = 0;
				kevp = kevbuf;
			}
			/* check for errors and if we are done */
			if (error != 0 || count == 0) {
				/* remove marker */
				TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
				break;
			}
		}
	}
 done:
	mutex_spin_exit(&kq->kq_lock);
	if (marker != NULL)
		kmem_free(marker, sizeof(*marker));
	if (nkev != 0) {
		/* copyout remaining events */
		error = (*keops->keo_put_events)(keops->keo_private,
		    kevbuf, ulistp, nevents, nkev);
	}
	*retval = maxevents - count;

	return error;
}
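
/*
 * Illustrative userland sketch (not part of this file): the three timeout
 * modes accepted by kevent(2) and implemented by kqueue_scan() above.
 * "kq" and "evs" are hypothetical.
 *
 *	struct kevent evs[8];
 *	struct timespec ts = { 0, 0 };
 *
 *	kevent(kq, NULL, 0, evs, 8, NULL);	// NULL tsp: block forever
 *	kevent(kq, NULL, 0, evs, 8, &ts);	// zero timespec: poll
 *	ts.tv_sec = 2;
 *	kevent(kq, NULL, 0, evs, 8, &ts);	// wait at most 2 seconds
 */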
/*
 * fileops ioctl method for a kqueue descriptor.
 *
 * Two ioctls are currently supported. They both use struct kfilter_mapping:
 *	KFILTER_BYNAME		find name for filter, and return result in
 *				name, which is of size len.
 *	KFILTER_BYFILTER	find filter for name. len is ignored.
 */
static int
kqueue_ioctl(file_t *fp, u_long com, void *data)
{
	struct kfilter_mapping	*km;
	const struct kfilter	*kfilter;
	char			*name;
	int			error;

	km = data;
	error = 0;
	name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);

	switch (com) {
	case KFILTER_BYFILTER:	/* convert filter -> name */
		rw_enter(&kqueue_filter_lock, RW_READER);
		kfilter = kfilter_byfilter(km->filter);
		if (kfilter != NULL) {
			strlcpy(name, kfilter->name, KFILTER_MAXNAME);
			rw_exit(&kqueue_filter_lock);
			error = copyoutstr(name, km->name, km->len, NULL);
		} else {
			rw_exit(&kqueue_filter_lock);
			error = ENOENT;
		}
		break;

	case KFILTER_BYNAME:	/* convert name -> filter */
		error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
		if (error)
			break;
		rw_enter(&kqueue_filter_lock, RW_READER);
		kfilter = kfilter_byname(name);
		if (kfilter != NULL)
			km->filter = kfilter->filter;
		else
			error = ENOENT;
		rw_exit(&kqueue_filter_lock);
		break;

	default:
		error = EINVAL;
		break;
	}
	kmem_free(name, KFILTER_MAXNAME);
	return (error);
}
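
/*
 * Illustrative userland sketch (not part of this file): resolving a filter
 * name to its numeric id through the ioctl above.  "kq" is a hypothetical
 * kqueue descriptor.
 *
 *	struct kfilter_mapping km;
 *	char buf[KFILTER_MAXNAME] = "EVFILT_READ";
 *
 *	km.name = buf;
 *	km.len = sizeof(buf);
 *	if (ioctl(kq, KFILTER_BYNAME, &km) == 0)
 *		printf("filter id %u\n", km.filter);
 */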
/*
 * fileops fcntl method for a kqueue descriptor.
 */
static int
kqueue_fcntl(file_t *fp, u_int com, void *data)
{

	return (ENOTTY);
}
/*
 * fileops poll method for a kqueue descriptor.
 * Determine if kqueue has events pending.
 */
static int
kqueue_poll(file_t *fp, int events)
{
	struct kqueue *kq;
	int revents;

	kq = fp->f_data;

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		mutex_spin_enter(&kq->kq_lock);
		if (kq->kq_count != 0) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(curlwp, &kq->kq_sel);
		}
		mutex_spin_exit(&kq->kq_lock);
	}

	return revents;
}
/*
 * fileops stat method for a kqueue descriptor.
 * Returns dummy info, with st_size being number of events pending.
 */
static int
kqueue_stat(file_t *fp, struct stat *st)
{
	struct kqueue *kq;

	kq = fp->f_data;

	memset(st, 0, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;

	return 0;
}
static void
kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
{
	struct knote *kn;
	filedesc_t *fdp;

	fdp = curlwp->l_fd;

	KASSERT(mutex_owned(&fdp->fd_lock));

	for (kn = SLIST_FIRST(list); kn != NULL;) {
		if (kq != kn->kn_kq) {
			kn = SLIST_NEXT(kn, kn_link);
			continue;
		}
		knote_detach(kn, fdp, true);
		mutex_enter(&fdp->fd_lock);
		kn = SLIST_FIRST(list);
	}
}
/*
 * fileops close method for a kqueue descriptor.
 */
static int
kqueue_close(file_t *fp)
{
	struct kqueue *kq;
	filedesc_t *fdp;
	fdfile_t *ff;
	int i;

	kq = fp->f_data;
	fdp = curlwp->l_fd;

	mutex_enter(&fdp->fd_lock);
	for (i = 0; i <= fdp->fd_lastkqfile; i++) {
		if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
			continue;
		kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
	}
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
			kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
		}
	}
	mutex_exit(&fdp->fd_lock);

	KASSERT(kq->kq_count == 0);
	mutex_destroy(&kq->kq_lock);
	cv_destroy(&kq->kq_cv);
	seldestroy(&kq->kq_sel);
	kmem_free(kq, sizeof(*kq));

	return (0);
}
/*
 * struct fileops kqfilter method for a kqueue descriptor.
 * Event triggered when monitored kqueue changes.
 */
static int
kqueue_kqfilter(file_t *fp, struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_data;

	KASSERT(fp == kn->kn_obj);

	if (kn->kn_filter != EVFILT_READ)
		return EINVAL;

	kn->kn_fop = &kqread_filtops;
	mutex_enter(&kq->kq_lock);
	SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext);
	mutex_exit(&kq->kq_lock);

	return 0;
}
/*
 * Walk down a list of knotes, activating them if their event has
 * triggered.  The caller's object lock (e.g. device driver lock)
 * must be held.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_selnext) {
		if ((*kn->kn_fop->f_event)(kn, hint))
			knote_activate(kn);
	}
}
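
/*
 * Illustrative driver-side sketch (not part of this file): drivers normally
 * do not call knote() directly; they keep a struct selinfo and call
 * selnotify(), which passes the hint on to the klist attached to it.
 * "mydev_softc" and "sc" are hypothetical.
 *
 *	struct mydev_softc {
 *		struct selinfo sc_rsel;
 *		...
 *	};
 *
 *	// on data arrival:
 *	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
 */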
/*
 * Remove all knotes referencing a specified fd
 */
void
knote_fdclose(int fd)
{
	struct klist *list;
	struct knote *kn;
	filedesc_t *fdp;

	fdp = curlwp->l_fd;
	list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
	mutex_enter(&fdp->fd_lock);
	while ((kn = SLIST_FIRST(list)) != NULL) {
		knote_detach(kn, fdp, true);
		mutex_enter(&fdp->fd_lock);
	}
	mutex_exit(&fdp->fd_lock);
}
/*
 * Drop knote.  Called with fdp->fd_lock held, and will drop it before
 * returning.
 */
static void
knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
{
	struct klist *list;
	struct kqueue *kq;

	kq = kn->kn_kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);
	KASSERT(mutex_owned(&fdp->fd_lock));

	/* Remove from monitored object. */
	if (dofop) {
		KERNEL_LOCK(1, NULL);		/* XXXSMP */
		(*kn->kn_fop->f_detach)(kn);
		KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
	}

	/* Remove from descriptor table. */
	if (kn->kn_fop->f_isfd)
		list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
	else
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];

	SLIST_REMOVE(list, kn, knote, kn_link);

	/* Remove from kqueue. */
	/* XXXAD should verify not in use by kqueue_scan. */
	mutex_spin_enter(&kq->kq_lock);
	if ((kn->kn_status & KN_QUEUED) != 0) {
		kq_check(kq);
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kq->kq_count--;
		kq_check(kq);
	}
	mutex_spin_exit(&kq->kq_lock);

	mutex_exit(&fdp->fd_lock);
	if (kn->kn_fop->f_isfd)
		fd_putfile(kn->kn_id);
	atomic_dec_uint(&kn->kn_kfilter->refcnt);
	kmem_free(kn, sizeof(*kn));
}
/*
 * Queue new event for knote.
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);

	kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	if ((kn->kn_status & KN_DISABLED) != 0) {
		kn->kn_status &= ~KN_DISABLED;
	}
	if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
		kq_check(kq);
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		kn->kn_status |= KN_QUEUED;
		kq->kq_count++;
		kq_check(kq);
		cv_broadcast(&kq->kq_cv);
		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
	}
	mutex_spin_exit(&kq->kq_lock);
}
/*
 * Queue new event for knote.
 */
static void
knote_activate(struct knote *kn)
{
	struct kqueue *kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);

	kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	kn->kn_status |= KN_ACTIVE;
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
		kq_check(kq);
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		kn->kn_status |= KN_QUEUED;
		kq->kq_count++;
		kq_check(kq);
		cv_broadcast(&kq->kq_cv);
		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
	}
	mutex_spin_exit(&kq->kq_lock);
}