/*	$NetBSD: kern_event.c,v 1.68 2009/12/20 09:36:05 dsl Exp $	*/

/*-
 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.68 2009/12/20 09:36:05 dsl Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/filedesc.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/conf.h>
#include <sys/atomic.h>
static int	kqueue_scan(file_t *, size_t, struct kevent *,
			    const struct timespec *, register_t *,
			    const struct kevent_ops *, struct kevent *,
			    size_t);
static int	kqueue_ioctl(file_t *, u_long, void *);
static int	kqueue_fcntl(file_t *, u_int, void *);
static int	kqueue_poll(file_t *, int);
static int	kqueue_kqfilter(file_t *, struct knote *);
static int	kqueue_stat(file_t *, struct stat *);
static int	kqueue_close(file_t *);
static int	kqueue_register(struct kqueue *, struct kevent *);
static void	kqueue_doclose(struct kqueue *, struct klist *, int);

static void	knote_detach(struct knote *, filedesc_t *fdp, bool);
static void	knote_enqueue(struct knote *);
static void	knote_activate(struct knote *);

static void	filt_kqdetach(struct knote *);
static int	filt_kqueue(struct knote *, long hint);
static int	filt_procattach(struct knote *);
static void	filt_procdetach(struct knote *);
static int	filt_proc(struct knote *, long hint);
static int	filt_fileattach(struct knote *);
static void	filt_timerexpire(void *x);
static int	filt_timerattach(struct knote *);
static void	filt_timerdetach(struct knote *);
static int	filt_timer(struct knote *, long hint);
static const struct fileops kqueueops = {
	.fo_read = (void *)enxio,
	.fo_write = (void *)enxio,
	.fo_ioctl = kqueue_ioctl,
	.fo_fcntl = kqueue_fcntl,
	.fo_poll = kqueue_poll,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_restart = fnullop_restart,
};
static const struct filterops kqread_filtops =
	{ 1, NULL, filt_kqdetach, filt_kqueue };
static const struct filterops proc_filtops =
	{ 0, filt_procattach, filt_procdetach, filt_proc };
static const struct filterops file_filtops =
	{ 1, filt_fileattach, NULL, NULL };
static const struct filterops timer_filtops =
	{ 0, filt_timerattach, filt_timerdetach, filt_timer };

static u_int	kq_ncallouts = 0;
static int	kq_calloutmax = (4 * 1024);

#define	KN_HASHSIZE		64	/* XXX should be tunable */
#define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern const struct filterops sig_filtops;
/*
 * Table for all system-defined filters.
 * These should be listed in the numeric order of the EVFILT_* defines.
 * If filtops is NULL, the filter isn't implemented in NetBSD.
 * End of list is when name is NULL.
 *
 * Note that 'refcnt' is meaningless for built-in filters.
 */
struct kfilter {
	const char	*name;		/* name of filter */
	uint32_t	filter;		/* id of filter */
	unsigned	refcnt;		/* reference count */
	const struct filterops *filtops;/* operations for filter */
	size_t		namelen;	/* length of name string */
};

/* System defined filters */
static struct kfilter sys_kfilters[] = {
	{ "EVFILT_READ",	EVFILT_READ,	0, &file_filtops, 0 },
	{ "EVFILT_WRITE",	EVFILT_WRITE,	0, &file_filtops, 0 },
	{ "EVFILT_AIO",		EVFILT_AIO,	0, NULL, 0 },
	{ "EVFILT_VNODE",	EVFILT_VNODE,	0, &file_filtops, 0 },
	{ "EVFILT_PROC",	EVFILT_PROC,	0, &proc_filtops, 0 },
	{ "EVFILT_SIGNAL",	EVFILT_SIGNAL,	0, &sig_filtops, 0 },
	{ "EVFILT_TIMER",	EVFILT_TIMER,	0, &timer_filtops, 0 },
	{ NULL,			0,		0, NULL, 0 },
};
/* User defined kfilters */
static struct kfilter	*user_kfilters;		/* array */
static int		user_kfilterc;		/* current offset */
static int		user_kfiltermaxc;	/* max size so far */
static size_t		user_kfiltersz;		/* size of allocated memory */

static krwlock_t	kqueue_filter_lock;	/* lock on filter lists */
static kmutex_t		kqueue_misc_lock;	/* miscellaneous */

static kauth_listener_t	kqueue_listener;
static int
kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	struct proc *p;
	int result;

	result = KAUTH_RESULT_DEFER;
	p = arg0;

	if (action != KAUTH_PROCESS_KEVENT_FILTER)
		return result;

	if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
	    ISSET(p->p_flag, PK_SUGID)))
		return result;

	result = KAUTH_RESULT_ALLOW;

	return result;
}
/*
 * Initialize the kqueue subsystem.
 */
void
kqueue_init(void)
{

	rw_init(&kqueue_filter_lock);
	mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);

	kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    kqueue_listener_cb, NULL);
}
/*
 * Find kfilter entry by name, or NULL if not found.
 */
static struct kfilter *
kfilter_byname_sys(const char *name)
{
	int i;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	for (i = 0; sys_kfilters[i].name != NULL; i++) {
		if (strcmp(name, sys_kfilters[i].name) == 0)
			return &sys_kfilters[i];
	}
	return NULL;
}
static struct kfilter *
kfilter_byname_user(const char *name)
{
	int i;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	/* user filter slots have a NULL name if previously deregistered */
	for (i = 0; i < user_kfilterc; i++) {
		if (user_kfilters[i].name != NULL &&
		    strcmp(name, user_kfilters[i].name) == 0)
			return &user_kfilters[i];
	}
	return NULL;
}
static struct kfilter *
kfilter_byname(const char *name)
{
	struct kfilter *kfilter;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	if ((kfilter = kfilter_byname_sys(name)) != NULL)
		return kfilter;

	return kfilter_byname_user(name);
}
/*
 * Find kfilter entry by filter id, or NULL if not found.
 * Assumes entries are indexed in filter id order, for speed.
 */
static struct kfilter *
kfilter_byfilter(uint32_t filter)
{
	struct kfilter *kfilter;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	if (filter < EVFILT_SYSCOUNT)	/* it's a system filter */
		kfilter = &sys_kfilters[filter];
	else if (user_kfilters != NULL &&
	    filter < EVFILT_SYSCOUNT + user_kfilterc)
		/* it's a user filter */
		kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
	else
		return (NULL);		/* out of range */
	KASSERT(kfilter->filter == filter);	/* sanity check! */
	return (kfilter);
}
/*
 * Register a new kfilter. Stores the entry in user_kfilters.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 * If retfilter != NULL, the new filterid is returned in it.
 */
int
kfilter_register(const char *name, const struct filterops *filtops,
    int *retfilter)
{
	struct kfilter *kfilter;
	size_t len;
	int i;

	if (name == NULL || name[0] == '\0' || filtops == NULL)
		return (EINVAL);	/* invalid args */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EEXIST);	/* already exists */
	}
	if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* too many */
	}

	for (i = 0; i < user_kfilterc; i++) {
		kfilter = &user_kfilters[i];
		if (kfilter->name == NULL) {
			/* Previously deregistered slot.  Reuse. */
			goto reuse;
		}
	}

	/* check if need to grow user_kfilters */
	if (user_kfilterc + 1 > user_kfiltermaxc) {
		/* Grow in KFILTER_EXTENT chunks. */
		user_kfiltermaxc += KFILTER_EXTENT;
		len = user_kfiltermaxc * sizeof(*kfilter);
		kfilter = kmem_alloc(len, KM_SLEEP);
		memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
		if (user_kfilters != NULL) {
			memcpy(kfilter, user_kfilters, user_kfiltersz);
			kmem_free(user_kfilters, user_kfiltersz);
		}
		user_kfiltersz = len;
		user_kfilters = kfilter;
	}
	/* Adding new slot */
	kfilter = &user_kfilters[user_kfilterc++];
reuse:
	kfilter->namelen = strlen(name) + 1;
	kfilter->name = kmem_alloc(kfilter->namelen, KM_SLEEP);
	memcpy(__UNCONST(kfilter->name), name, kfilter->namelen);

	kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;

	kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
	memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));

	if (retfilter != NULL)
		*retfilter = kfilter->filter;
	rw_exit(&kqueue_filter_lock);

	return (0);
}
/*
 * Unregister a kfilter previously registered with kfilter_register.
 * This retains the filter id, but clears the name and frees filtops (filter
 * operations), so that the number isn't reused during a boot.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 */
int
kfilter_unregister(const char *name)
{
	struct kfilter *kfilter;

	if (name == NULL || name[0] == '\0')
		return (EINVAL);	/* invalid name */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname_sys(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* can't detach system filters */
	}

	kfilter = kfilter_byname_user(name);
	if (kfilter == NULL) {
		rw_exit(&kqueue_filter_lock);
		return (ENOENT);
	}
	if (kfilter->refcnt != 0) {
		rw_exit(&kqueue_filter_lock);
		return (EBUSY);
	}

	/* Cast away const (but we know it's safe). */
	kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
	kfilter->name = NULL;	/* mark as `not implemented' */

	if (kfilter->filtops != NULL) {
		/* Cast away const (but we know it's safe). */
		kmem_free(__UNCONST(kfilter->filtops),
		    sizeof(*kfilter->filtops));
		kfilter->filtops = NULL;	/* mark as `not implemented' */
	}

	rw_exit(&kqueue_filter_lock);
	return (0);
}
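
/*
 * Illustrative kernel-side sketch (not part of this file): a loadable module
 * could publish its own event filter through kfilter_register() above and
 * tear it down with kfilter_unregister().  The "myfilt_*" names and the
 * "EVFILT_MYDEV" string are hypothetical; the filterops layout matches the
 * initializers used elsewhere in this file ({ f_isfd, f_attach, f_detach,
 * f_event }).
 *
 *	static int	myfilt_attach(struct knote *);
 *	static void	myfilt_detach(struct knote *);
 *	static int	myfilt_event(struct knote *, long);
 *
 *	static const struct filterops myfilt_filtops =
 *		{ 0, myfilt_attach, myfilt_detach, myfilt_event };
 *	static int myfilt_id;
 *
 *	// at module load:
 *	error = kfilter_register("EVFILT_MYDEV", &myfilt_filtops, &myfilt_id);
 *	// at module unload:
 *	error = kfilter_unregister("EVFILT_MYDEV");
 */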
/*
 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
 * descriptors. Calls fileops kqfilter method for given file descriptor.
 */
static int
filt_fileattach(struct knote *kn)
{
	file_t *fp;

	fp = kn->kn_obj;

	return (*fp->f_ops->fo_kqfilter)(fp, kn);
}
/*
 * Filter detach method for EVFILT_READ on kqueue descriptor.
 */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_data;

	mutex_spin_enter(&kq->kq_lock);
	SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
	mutex_spin_exit(&kq->kq_lock);
}
/*
 * Filter event method for EVFILT_READ on kqueue descriptor.
 */
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq;
	int rv;

	kq = ((file_t *)kn->kn_obj)->f_data;

	if (hint != NOTE_SUBMIT)
		mutex_spin_enter(&kq->kq_lock);
	kn->kn_data = kq->kq_count;
	rv = (kn->kn_data > 0);
	if (hint != NOTE_SUBMIT)
		mutex_spin_exit(&kq->kq_lock);

	return rv;
}
/*
 * Filter attach method for EVFILT_PROC.
 */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p, *curp;
	struct lwp *curl;

	curl = curlwp;
	curp = curl->l_proc;

	mutex_enter(proc_lock);
	p = p_find(kn->kn_id, PFIND_LOCKED);
	if (p == NULL) {
		mutex_exit(proc_lock);
		return ESRCH;
	}

	/*
	 * Fail if it's not owned by you, or the last exec gave us
	 * setuid/setgid privs (unless you're root).
	 */
	mutex_enter(p->p_lock);
	mutex_exit(proc_lock);
	if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
	    p, NULL, NULL, NULL) != 0) {
		mutex_exit(p->p_lock);
		return EACCES;
	}

	kn->kn_obj = p;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * internal flag indicating registration done by kernel
	 */
	if (kn->kn_flags & EV_FLAG1) {
		kn->kn_data = kn->kn_sdata;	/* ppid */
		kn->kn_fflags = NOTE_CHILD;
		kn->kn_flags &= ~EV_FLAG1;
	}
	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
	mutex_exit(p->p_lock);

	return 0;
}
/*
 * Filter detach method for EVFILT_PROC.
 *
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to. So when the process
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 * it will be deleted when read out. However, as part of the knote deletion,
 * this routine is called, so a check is needed to avoid actually performing
 * a detach, because the original process might not exist any more.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	if (kn->kn_status & KN_DETACHED)
		return;

	p = kn->kn_obj;

	mutex_enter(p->p_lock);
	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
	mutex_exit(p->p_lock);
}
/*
 * Filter event method for EVFILT_PROC.
 */
static int
filt_proc(struct knote *kn, long hint)
{
	struct kqueue *kq;
	struct kevent kev;
	u_int event, fflag;
	int error;

	event = (u_int)hint & NOTE_PCTRLMASK;
	kq = kn->kn_kq;
	fflag = 0;

	/* If the user is interested in this event, record it. */
	if (kn->kn_sfflags & event)
		fflag |= kn->kn_sfflags & event;

	if (event == NOTE_EXIT) {
		/*
		 * Process is gone, so flag the event as finished.
		 *
		 * Detach the knote from watched process and mark
		 * it as such. We can't leave this to kqueue_scan(),
		 * since the process might not exist by then. And we
		 * have to do this now, since psignal KNOTE() is called
		 * also for zombies and we might end up reading freed
		 * memory if the kevent would already be picked up
		 * and the knote g/c'ed.
		 */
		filt_procdetach(kn);

		mutex_spin_enter(&kq->kq_lock);
		kn->kn_status |= KN_DETACHED;
		/* Mark as ONESHOT, so that the knote is g/c'ed when read */
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_fflags |= fflag;
		mutex_spin_exit(&kq->kq_lock);

		return 1;
	}

	mutex_spin_enter(&kq->kq_lock);
	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
		/*
		 * Process forked, and user wants to track the new process,
		 * so attach a new knote to it, and immediately report an
		 * event with the parent's pid.  Register knote with new
		 * process.
		 */
		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
		kev.filter = kn->kn_filter;
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
		kev.fflags = kn->kn_sfflags;
		kev.data = kn->kn_id;			/* parent */
		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
		mutex_spin_exit(&kq->kq_lock);
		error = kqueue_register(kq, &kev);
		mutex_spin_enter(&kq->kq_lock);
		if (error != 0)
			kn->kn_fflags |= NOTE_TRACKERR;
	} else {
		kn->kn_fflags |= fflag;
	}
	fflag = kn->kn_fflags;
	mutex_spin_exit(&kq->kq_lock);

	return fflag != 0;
}
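
/*
 * Illustrative userland sketch (not part of this file): watching a process
 * with EVFILT_PROC.  NOTE_TRACK makes the code above attach a new knote to
 * each forked child and report it with NOTE_CHILD.  "kq" and "pid" are
 * hypothetical.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, pid, EVFILT_PROC, EV_ADD,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *	// later, a returned event with (fflags & NOTE_EXIT) means pid exited;
 *	// (fflags & NOTE_CHILD) means ev.ident is a new child of ev.data.
 */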
static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	int tticks;

	mutex_enter(&kqueue_misc_lock);
	kn->kn_data++;
	knote_activate(kn);
	if ((kn->kn_flags & EV_ONESHOT) == 0) {
		tticks = mstohz(kn->kn_sdata);
		callout_schedule((callout_t *)kn->kn_hook, tticks);
	}
	mutex_exit(&kqueue_misc_lock);
}
/*
 * data contains amount of time to sleep, in milliseconds
 */
static int
filt_timerattach(struct knote *kn)
{
	callout_t *calloutp;
	struct kqueue *kq;
	int tticks;

	tticks = mstohz(kn->kn_sdata);

	/* if the supplied value is under our resolution, use 1 tick */
	if (tticks == 0) {
		if (kn->kn_sdata == 0)
			return EINVAL;
		tticks = 1;
	}

	if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
	    (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
		atomic_dec_uint(&kq_ncallouts);
		return ENOMEM;
	}
	callout_init(calloutp, CALLOUT_MPSAFE);

	kq = kn->kn_kq;
	mutex_spin_enter(&kq->kq_lock);
	kn->kn_flags |= EV_CLEAR;	/* automatically set */
	kn->kn_hook = calloutp;
	mutex_spin_exit(&kq->kq_lock);

	callout_reset(calloutp, tticks, filt_timerexpire, kn);

	return (0);
}
static void
filt_timerdetach(struct knote *kn)
{
	callout_t *calloutp;

	calloutp = (callout_t *)kn->kn_hook;
	callout_halt(calloutp, NULL);
	callout_destroy(calloutp);
	kmem_free(calloutp, sizeof(*calloutp));
	atomic_dec_uint(&kq_ncallouts);
}
static int
filt_timer(struct knote *kn, long hint)
{
	int rv;

	mutex_enter(&kqueue_misc_lock);
	rv = (kn->kn_data != 0);
	mutex_exit(&kqueue_misc_lock);

	return rv;
}
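
/*
 * Illustrative userland sketch (not part of this file): a periodic timer
 * event.  The "data" field carries the period in milliseconds, as the attach
 * routine above documents.  "kq" is hypothetical.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	// fires every 500 ms
 *	// a oneshot variant: EV_ADD | EV_ONESHOT fires exactly once.
 */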
/*
 * This filter "event" routine simulates seltrue().
 */
int
filt_seltrue(struct knote *kn, long hint)
{

	/*
	 * We don't know how much data can be read/written,
	 * but we know that it *can* be.  This is about as
	 * good as select/poll does as well.
	 */
	kn->kn_data = 0;
	return (1);
}

/*
 * This provides full kqfilter entry for device switch tables, which
 * has same effect as filter using filt_seltrue() as filter method.
 */
static void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

const struct filterops seltrue_filtops =
	{ 1, NULL, filt_seltruedetach, filt_seltrue };
int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (EINVAL);
	}

	/* Nothing more to do */
	return (0);
}
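/*
 * Illustrative driver-side sketch (not part of this file): a character
 * device that is always "ready" can point its kqfilter entry at
 * seltrue_kqfilter above, e.g. in its cdevsw.  "mydev_cdevsw" and the elided
 * fields are hypothetical; only seltrue_kqfilter is real.
 *
 *	const struct cdevsw mydev_cdevsw = {
 *		...
 *		.d_poll = nopoll,
 *		.d_kqfilter = seltrue_kqfilter,
 *		...
 *	};
 */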
/*
 * kqueue(2) system call.
 */
int
sys_kqueue(struct lwp *l, const void *v, register_t *retval)
{
	struct kqueue *kq;
	file_t *fp;
	int fd, error;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return error;

	fp->f_flag = FREAD | FWRITE;
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
	mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
	cv_init(&kq->kq_cv, "kqueue");
	selinit(&kq->kq_sel);
	TAILQ_INIT(&kq->kq_head);
	fp->f_data = kq;
	*retval = fd;
	kq->kq_fdp = curlwp->l_fd;
	fd_affix(curproc, fp, fd);

	return error;
}
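
/*
 * Illustrative userland sketch (not part of this file): the typical way
 * kqueue(2) above and kevent(2) below are consumed.  "sock" and
 * "handle_read" are hypothetical.
 *
 *	int kq = kqueue();
 *	struct kevent ev;
 *
 *	EV_SET(&ev, sock, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
 *	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent: register");
 *	for (;;) {
 *		struct kevent out;
 *		int n = kevent(kq, NULL, 0, &out, 1, NULL);	// block
 *		if (n > 0 && out.filter == EVFILT_READ)
 *			handle_read(out.ident, out.data);	// bytes ready
 *	}
 */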
/*
 * kevent(2) system call.
 */
static int
kevent_fetch_changes(void *private, const struct kevent *changelist,
    struct kevent *changes, size_t index, int n)
{

	return copyin(changelist + index, changes, n * sizeof(*changes));
}

static int
kevent_put_events(void *private, struct kevent *events,
    struct kevent *eventlist, size_t index, int n)
{

	return copyout(events, eventlist + index, n * sizeof(*events));
}

static const struct kevent_ops kevent_native_ops = {
	.keo_private = NULL,
	.keo_fetch_timeout = copyin,
	.keo_fetch_changes = kevent_fetch_changes,
	.keo_put_events = kevent_put_events,
};
int
sys___kevent50(struct lwp *l, const struct sys___kevent50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(size_t) nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(size_t) nevents;
		syscallarg(const struct timespec *) timeout;
	} */

	return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
	    SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
	    SCARG(uap, timeout), &kevent_native_ops);
}
int
kevent1(register_t *retval, int fd,
    const struct kevent *changelist, size_t nchanges,
    struct kevent *eventlist, size_t nevents,
    const struct timespec *timeout,
    const struct kevent_ops *keops)
{
	struct kevent *kevp;
	struct kqueue *kq;
	struct timespec	ts;
	size_t i, n, ichange;
	int nerrors, error;
	struct kevent kevbuf[8];	/* approx 300 bytes on 64-bit */
	file_t *fp;

	/* check that we're dealing with a kq */
	fp = fd_getfile(fd);
	if (fp == NULL)
		return (EBADF);

	if (fp->f_type != DTYPE_KQUEUE) {
		fd_putfile(fd);
		return (EBADF);
	}

	if (timeout != NULL) {
		error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
		if (error)
			goto done;
		timeout = &ts;
	}

	kq = (struct kqueue *)fp->f_data;
	nerrors = 0;
	ichange = 0;

	/* traverse list of events to register */
	while (nchanges > 0) {
		n = MIN(nchanges, __arraycount(kevbuf));
		error = (*keops->keo_fetch_changes)(keops->keo_private,
		    changelist, kevbuf, ichange, n);
		if (error)
			goto done;
		for (i = 0; i < n; i++) {
			kevp = &kevbuf[i];
			kevp->flags &= ~EV_SYSFLAGS;
			/* register each knote */
			error = kqueue_register(kq, kevp);
			if (error) {
				if (nevents == 0)
					goto done;
				kevp->flags = EV_ERROR;
				kevp->data = error;
				error = (*keops->keo_put_events)
				    (keops->keo_private, kevp,
				    eventlist, nerrors, 1);
				if (error)
					goto done;
				nevents--;
				nerrors++;
			}
		}
		nchanges -= n;	/* update the results */
		ichange += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	/* actually scan through the events */
	error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
	    kevbuf, __arraycount(kevbuf));
 done:
	fd_putfile(fd);
	return (error);
}
/*
 * Register a given kevent kev onto the kqueue
 */
static int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
	struct kfilter *kfilter;
	filedesc_t *fdp;
	file_t *fp;
	fdfile_t *ff;
	struct knote *kn, *newkn;
	struct klist *list;
	int error, fd, rv;

	fdp = kq->kq_fdp;
	fp = NULL;
	kn = NULL;
	error = 0;
	fd = 0;

	newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);

	rw_enter(&kqueue_filter_lock, RW_READER);
	kfilter = kfilter_byfilter(kev->filter);
	if (kfilter == NULL || kfilter->filtops == NULL) {
		/* filter not found nor implemented */
		rw_exit(&kqueue_filter_lock);
		kmem_free(newkn, sizeof(*newkn));
		return (EINVAL);
	}

	mutex_enter(&fdp->fd_lock);

	/* search if knote already exists */
	if (kfilter->filtops->f_isfd) {
		/* monitoring a file descriptor */
		fd = kev->ident;
		if ((fp = fd_getfile(fd)) == NULL) {
			mutex_exit(&fdp->fd_lock);
			rw_exit(&kqueue_filter_lock);
			kmem_free(newkn, sizeof(*newkn));
			return EBADF;
		}
		ff = fdp->fd_dt->dt_ff[fd];
		if (fd <= fdp->fd_lastkqfile) {
			SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
				if (kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	} else {
		/*
		 * not monitoring a file descriptor, so
		 * lookup knotes in internal hash table
		 */
		if (fdp->fd_knhashmask != 0) {
			list = &fdp->fd_knhash[
			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link) {
				if (kev->ident == kn->kn_id &&
				    kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kev->flags & EV_ADD) {
		if (kn == NULL) {
			/* create new knote */
			kn = newkn;
			newkn = NULL;
			kn->kn_obj = fp;
			kn->kn_kq = kq;
			kn->kn_fop = kfilter->filtops;
			kn->kn_kfilter = kfilter;
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			if (!kn->kn_fop->f_isfd) {
				/*
				 * If knote is not on an fd, store on
				 * internal hash table.
				 */
				if (fdp->fd_knhashmask == 0) {
					/* XXXAD can block with fd_lock held */
					fdp->fd_knhash = hashinit(KN_HASHSIZE,
					    HASH_LIST, true,
					    &fdp->fd_knhashmask);
				}
				list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
				    fdp->fd_knhashmask)];
			} else {
				/* Otherwise, knote is on an fd. */
				list = (struct klist *)
				    &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
				if ((int)kn->kn_id > fdp->fd_lastkqfile)
					fdp->fd_lastkqfile = kn->kn_id;
			}
			SLIST_INSERT_HEAD(list, kn, kn_link);

			KERNEL_LOCK(1, NULL);			/* XXXSMP */
			error = (*kfilter->filtops->f_attach)(kn);
			KERNEL_UNLOCK_ONE(NULL);		/* XXXSMP */
			if (error != 0) {
				/* knote_detach() drops fdp->fd_lock */
				knote_detach(kn, fdp, false);
				goto done;
			}
			atomic_inc_uint(&kfilter->refcnt);
		} else {
			/*
			 * The user may change some filter values after the
			 * initial EV_ADD, but doing so will not reset any
			 * filter which have already been triggered.
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kn->kn_kevent.udata = kev->udata;
		}
		KERNEL_LOCK(1, NULL);				/* XXXSMP */
		rv = (*kn->kn_fop->f_event)(kn, 0);
		KERNEL_UNLOCK_ONE(NULL);			/* XXXSMP */
		if (rv)
			knote_activate(kn);
	} else {
		if (kn == NULL) {
			error = ENOENT;
			mutex_exit(&fdp->fd_lock);
			goto done;
		}
		if (kev->flags & EV_DELETE) {
			/* knote_detach() drops fdp->fd_lock */
			knote_detach(kn, fdp, true);
			goto done;
		}
	}

	/* disable knote */
	if ((kev->flags & EV_DISABLE)) {
		mutex_spin_enter(&kq->kq_lock);
		if ((kn->kn_status & KN_DISABLED) == 0)
			kn->kn_status |= KN_DISABLED;
		mutex_spin_exit(&kq->kq_lock);
	}

	/* enable knote */
	if ((kev->flags & EV_ENABLE)) {
		knote_enqueue(kn);
	}
	mutex_exit(&fdp->fd_lock);
 done:
	rw_exit(&kqueue_filter_lock);
	if (newkn != NULL)
		kmem_free(newkn, sizeof(*newkn));
	if (fp != NULL)
		fd_putfile(fd);
	return (error);
}
#if defined(DEBUG)
static void
kq_check(struct kqueue *kq)
{
	const struct knote *kn;
	int count;
	int nmarker;

	KASSERT(mutex_owned(&kq->kq_lock));
	KASSERT(kq->kq_count >= 0);

	count = 0;
	nmarker = 0;
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
			panic("%s: kq=%p kn=%p inconsist 1", __func__, kq, kn);
		}
		if ((kn->kn_status & KN_MARKER) == 0) {
			if (kn->kn_kq != kq) {
				panic("%s: kq=%p kn=%p inconsist 2",
				    __func__, kq, kn);
			}
			if ((kn->kn_status & KN_ACTIVE) == 0) {
				panic("%s: kq=%p kn=%p: not active",
				    __func__, kq, kn);
			}
			count++;
			if (count > kq->kq_count) {
				goto bad;
			}
		} else {
			nmarker++;
			if (nmarker > 10000) {
				panic("%s: kq=%p too many markers: %d != %d, "
				    "nmarker=%d",
				    __func__, kq, kq->kq_count, count, nmarker);
			}
		}
	}
	if (kq->kq_count != count) {
 bad:
		panic("%s: kq=%p inconsist 3: %d != %d, nmarker=%d",
		    __func__, kq, kq->kq_count, count, nmarker);
	}
}
#else /* defined(DEBUG) */
#define	kq_check(a)	/* nothing */
#endif /* defined(DEBUG) */
/*
 * Scan through the list of events on fp (for a maximum of maxevents),
 * returning the results in to ulistp. Timeout is determined by tsp; if
 * NULL, wait indefinitely, if 0 valued, perform a poll, otherwise wait
 * as appropriate.
 */
static int
kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
	    const struct timespec *tsp, register_t *retval,
	    const struct kevent_ops *keops, struct kevent *kevbuf,
	    size_t kevcnt)
{
	struct kqueue	*kq;
	struct kevent	*kevp;
	struct timespec	ats, sleepts;
	struct knote	*kn, *marker;
	size_t		count, nkev, nevents;
	int		timeout, error, rv;
	filedesc_t	*fdp;

	fdp = curlwp->l_fd;
	kq = fp->f_data;
	count = maxevents;
	nkev = nevents = error = 0;

	if (tsp) {				/* timeout supplied */
		ats = *tsp;
		if (inittimeleft(&ats, &sleepts) == -1) {
			*retval = maxevents;
			return EINVAL;
		}
		timeout = tstohz(&ats);
		if (timeout <= 0)
			timeout = -1;		/* do poll */
	} else {
		/* no timeout, wait forever */
		timeout = 0;
	}

	marker = kmem_zalloc(sizeof(*marker), KM_SLEEP);
	marker->kn_status = KN_MARKER;
	mutex_spin_enter(&kq->kq_lock);
 retry:
	kevp = kevbuf;
	if (kq->kq_count == 0) {
		if (timeout >= 0) {
			error = cv_timedwait_sig(&kq->kq_cv,
			    &kq->kq_lock, timeout);
			if (error == 0) {
				if (tsp == NULL || (timeout =
				    gettimeleft(&ats, &sleepts)) > 0)
					goto retry;
			} else {
				/* don't restart after signals... */
				if (error == ERESTART)
					error = EINTR;
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
	} else {
		/* mark end of knote list */
		TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);

		while (count != 0) {
			kn = TAILQ_FIRST(&kq->kq_head);	/* get next knote */
			while ((kn->kn_status & KN_MARKER) != 0) {
				if (kn == marker) {
					/* it's our marker, stop */
					TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
					if (count < maxevents || (tsp != NULL &&
					    (timeout = gettimeleft(&ats,
					    &sleepts)) <= 0))
						goto done;
					goto retry;
				}
				/* someone else's marker. */
				kn = TAILQ_NEXT(kn, kn_tqe);
			}
			kq_check(kq);
			TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
			kq->kq_count--;
			kn->kn_status &= ~KN_QUEUED;
			kq_check(kq);
			if (kn->kn_status & KN_DISABLED) {
				/* don't want disabled events */
				continue;
			}
			if ((kn->kn_flags & EV_ONESHOT) == 0) {
				mutex_spin_exit(&kq->kq_lock);
				KERNEL_LOCK(1, NULL);		/* XXXSMP */
				rv = (*kn->kn_fop->f_event)(kn, 0);
				KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
				mutex_spin_enter(&kq->kq_lock);
				/* Re-poll if note was re-enqueued. */
				if ((kn->kn_status & KN_QUEUED) != 0)
					continue;
				if (rv == 0) {
					/*
					 * non-ONESHOT event that hasn't
					 * triggered again, so de-queue.
					 */
					kn->kn_status &= ~KN_ACTIVE;
					continue;
				}
			}
			/* XXXAD should be got from f_event if !oneshot. */
			*kevp++ = kn->kn_kevent;
			nkev++;
			if (kn->kn_flags & EV_ONESHOT) {
				/* delete ONESHOT events after retrieval */
				mutex_spin_exit(&kq->kq_lock);
				mutex_enter(&fdp->fd_lock);
				knote_detach(kn, fdp, true);
				mutex_spin_enter(&kq->kq_lock);
			} else if (kn->kn_flags & EV_CLEAR) {
				/* clear state after retrieval */
				kn->kn_data = 0;
				kn->kn_fflags = 0;
				kn->kn_status &= ~KN_ACTIVE;
			} else {
				/* add event back on list */
				kq_check(kq);
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
				kq->kq_count++;
				kn->kn_status |= KN_QUEUED;
				kq_check(kq);
			}
			count--;
			if (nkev == kevcnt) {
				/* do copyouts in kevcnt chunks */
				mutex_spin_exit(&kq->kq_lock);
				error = (*keops->keo_put_events)
				    (keops->keo_private,
				    kevbuf, ulistp, nevents, nkev);
				mutex_spin_enter(&kq->kq_lock);
				nevents += nkev;
				nkev = 0;
				kevp = kevbuf;
			}
			/* check for errors and if we are done */
			if (error != 0 || count == 0) {
				/* remove marker */
				TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
				break;
			}
		}
	}
 done:
	mutex_spin_exit(&kq->kq_lock);
	if (marker != NULL)
		kmem_free(marker, sizeof(*marker));
	if (nkev != 0) {
		/* copyout remaining events */
		error = (*keops->keo_put_events)(keops->keo_private,
		    kevbuf, ulistp, nevents, nkev);
	}
	*retval = maxevents - count;

	return error;
}
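
/*
 * Illustrative userland sketch (not part of this file): the three timeout
 * modes accepted by kevent(2) and implemented by kqueue_scan() above.
 * "kq" and "evs" are hypothetical.
 *
 *	struct kevent evs[8];
 *	struct timespec ts = { 0, 0 };
 *
 *	kevent(kq, NULL, 0, evs, 8, NULL);	// NULL tsp: block forever
 *	kevent(kq, NULL, 0, evs, 8, &ts);	// zero timespec: poll
 *	ts.tv_sec = 2;
 *	kevent(kq, NULL, 0, evs, 8, &ts);	// wait at most 2 seconds
 */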
/*
 * fileops ioctl method for a kqueue descriptor.
 *
 * Two ioctls are currently supported. They both use struct kfilter_mapping:
 *	KFILTER_BYNAME		find name for filter, and return result in
 *				name, which is of size len.
 *	KFILTER_BYFILTER	find filter for name. len is ignored.
 */
static int
kqueue_ioctl(file_t *fp, u_long com, void *data)
{
	struct kfilter_mapping	*km;
	const struct kfilter	*kfilter;
	char			*name;
	int			error;

	km = data;
	error = 0;
	name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);

	switch (com) {
	case KFILTER_BYFILTER:	/* convert filter -> name */
		rw_enter(&kqueue_filter_lock, RW_READER);
		kfilter = kfilter_byfilter(km->filter);
		if (kfilter != NULL) {
			strlcpy(name, kfilter->name, KFILTER_MAXNAME);
			rw_exit(&kqueue_filter_lock);
			error = copyoutstr(name, km->name, km->len, NULL);
		} else {
			rw_exit(&kqueue_filter_lock);
			error = ENOENT;
		}
		break;

	case KFILTER_BYNAME:	/* convert name -> filter */
		error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
		if (error)
			break;
		rw_enter(&kqueue_filter_lock, RW_READER);
		kfilter = kfilter_byname(name);
		if (kfilter != NULL)
			km->filter = kfilter->filter;
		else
			error = ENOENT;
		rw_exit(&kqueue_filter_lock);
		break;

	default:
		error = EINVAL;
		break;
	}
	kmem_free(name, KFILTER_MAXNAME);
	return (error);
}
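
/*
 * Illustrative userland sketch (not part of this file): resolving a filter
 * name to its numeric id through the ioctl above.  "kq" is a hypothetical
 * kqueue descriptor.
 *
 *	struct kfilter_mapping km;
 *	char buf[KFILTER_MAXNAME] = "EVFILT_READ";
 *
 *	km.name = buf;
 *	km.len = sizeof(buf);
 *	if (ioctl(kq, KFILTER_BYNAME, &km) == 0)
 *		printf("filter id %u\n", km.filter);
 */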
/*
 * fileops fcntl method for a kqueue descriptor.
 */
static int
kqueue_fcntl(file_t *fp, u_int com, void *data)
{

	return (ENOTTY);
}
/*
 * fileops poll method for a kqueue descriptor.
 * Determine if kqueue has events pending.
 */
static int
kqueue_poll(file_t *fp, int events)
{
	struct kqueue *kq;
	int revents;

	kq = fp->f_data;

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		mutex_spin_enter(&kq->kq_lock);
		if (kq->kq_count != 0) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(curlwp, &kq->kq_sel);
		}
		mutex_spin_exit(&kq->kq_lock);
	}

	return revents;
}
/*
 * fileops stat method for a kqueue descriptor.
 * Returns dummy info, with st_size being number of events pending.
 */
static int
kqueue_stat(file_t *fp, struct stat *st)
{
	struct kqueue *kq;

	kq = fp->f_data;

	memset(st, 0, sizeof(*st));
	st->st_size = kq->kq_count;
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO;

	return 0;
}
static void
kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
{
	struct knote *kn;
	filedesc_t *fdp;

	fdp = curlwp->l_fd;

	KASSERT(mutex_owned(&fdp->fd_lock));

	for (kn = SLIST_FIRST(list); kn != NULL;) {
		if (kq != kn->kn_kq) {
			kn = SLIST_NEXT(kn, kn_link);
			continue;
		}
		knote_detach(kn, fdp, true);
		mutex_enter(&fdp->fd_lock);
		kn = SLIST_FIRST(list);
	}
}
/*
 * fileops close method for a kqueue descriptor.
 */
static int
kqueue_close(file_t *fp)
{
	struct kqueue *kq;
	filedesc_t *fdp;
	fdfile_t *ff;
	int i;

	kq = fp->f_data;
	fdp = curlwp->l_fd;

	mutex_enter(&fdp->fd_lock);
	for (i = 0; i <= fdp->fd_lastkqfile; i++) {
		if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
			continue;
		kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
	}
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
			kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
		}
	}
	mutex_exit(&fdp->fd_lock);

	KASSERT(kq->kq_count == 0);
	mutex_destroy(&kq->kq_lock);
	cv_destroy(&kq->kq_cv);
	seldestroy(&kq->kq_sel);
	kmem_free(kq, sizeof(*kq));

	return (0);
}
/*
 * struct fileops kqfilter method for a kqueue descriptor.
 * Event triggered when monitored kqueue changes.
 */
static int
kqueue_kqfilter(file_t *fp, struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_data;

	KASSERT(fp == kn->kn_obj);

	if (kn->kn_filter != EVFILT_READ)
		return EINVAL;

	kn->kn_fop = &kqread_filtops;
	mutex_enter(&kq->kq_lock);
	SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext);
	mutex_exit(&kq->kq_lock);

	return 0;
}
/*
 * Walk down a list of knotes, activating them if their event has
 * triggered.  The caller's object lock (e.g. device driver lock)
 * must be held.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_selnext) {
		if ((*kn->kn_fop->f_event)(kn, hint))
			knote_activate(kn);
	}
}
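
/*
 * Illustrative driver-side sketch (not part of this file): drivers normally
 * do not call knote() directly; they keep a struct selinfo and call
 * selnotify(), which passes the hint on to the klist attached to it.
 * "mydev_softc" and "sc" are hypothetical.
 *
 *	struct mydev_softc {
 *		struct selinfo sc_rsel;
 *		...
 *	};
 *
 *	// on data arrival:
 *	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
 */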
/*
 * Remove all knotes referencing a specified fd
 */
void
knote_fdclose(int fd)
{
	struct klist *list;
	struct knote *kn;
	filedesc_t *fdp;

	fdp = curlwp->l_fd;
	list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
	mutex_enter(&fdp->fd_lock);
	while ((kn = SLIST_FIRST(list)) != NULL) {
		knote_detach(kn, fdp, true);
		mutex_enter(&fdp->fd_lock);
	}
	mutex_exit(&fdp->fd_lock);
}
/*
 * Drop knote.  Called with fdp->fd_lock held, and will drop it before
 * returning.
 */
static void
knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
{
	struct klist *list;
	struct kqueue *kq;

	kq = kn->kn_kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);
	KASSERT(mutex_owned(&fdp->fd_lock));

	/* Remove from monitored object. */
	if (dofop) {
		KERNEL_LOCK(1, NULL);		/* XXXSMP */
		(*kn->kn_fop->f_detach)(kn);
		KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
	}

	/* Remove from descriptor table. */
	if (kn->kn_fop->f_isfd)
		list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
	else
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];

	SLIST_REMOVE(list, kn, knote, kn_link);

	/* Remove from kqueue. */
	/* XXXAD should verify not in use by kqueue_scan. */
	mutex_spin_enter(&kq->kq_lock);
	if ((kn->kn_status & KN_QUEUED) != 0) {
		kq_check(kq);
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kq->kq_count--;
		kq_check(kq);
	}
	mutex_spin_exit(&kq->kq_lock);

	mutex_exit(&fdp->fd_lock);
	if (kn->kn_fop->f_isfd)
		fd_putfile(kn->kn_id);
	atomic_dec_uint(&kn->kn_kfilter->refcnt);
	kmem_free(kn, sizeof(*kn));
}
/*
 * Queue new event for knote.
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);

	kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	if ((kn->kn_status & KN_DISABLED) != 0) {
		kn->kn_status &= ~KN_DISABLED;
	}
	if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
		kq_check(kq);
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		kn->kn_status |= KN_QUEUED;
		kq->kq_count++;
		kq_check(kq);
		cv_broadcast(&kq->kq_cv);
		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
	}
	mutex_spin_exit(&kq->kq_lock);
}
/*
 * Queue new event for knote.
 */
static void
knote_activate(struct knote *kn)
{
	struct kqueue *kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);

	kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	kn->kn_status |= KN_ACTIVE;
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
		kq_check(kq);
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		kn->kn_status |= KN_QUEUED;
		kq->kq_count++;
		kq_check(kq);
		cv_broadcast(&kq->kq_cv);
		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
	}
	mutex_spin_exit(&kq->kq_lock);
}