/* LWIP service - bpfdev.c - Berkeley Packet Filter (/dev/bpf) interface */
/*
 * BPF is a cloning device: opening /dev/bpf returns a new BPF device which is
 * independent from any other opened BPF devices. We assume that each BPF
 * device is used by one single user process, and this implementation therefore
 * does not support multiple concurrent device calls on the same BPF device.
 *
 * Packet buffering basically follows the BSD model: each BPF device that is
 * configured (that is, it has been attached to an interface) has two buffers,
 * each of the configured size: a store buffer, where new packets are stored,
 * and a hold buffer, which is typically full and awaiting retrieval through a
 * read call from userland. The buffers are swapped ("rotated") when the store
 * buffer is filled up and the hold buffer is empty - if the hold buffer is not
 * empty either, additional packets are dropped.
 *
 * These buffers are allocated when the BPF device is attached to an interface.
 * The interface may later disappear, in which case the BPF device is detached
 * from it, allowing any final packets to be read before read requests start
 * returning I/O errors. The buffers are freed only when the device is closed.
 */
#include "lwip.h"
#include "bpfdev.h"

#include <minix/chardriver.h>
#include <net/if.h>
#include <net/bpfdesc.h>
#include <minix/bpf.h>
#include <sys/mman.h>

/*
 * Make sure that our implementation matches the BPF version in the NetBSD
 * headers. If they change the version number, we may have to make changes
 * here accordingly.
 */
#if BPF_MAJOR_VERSION != 1 || BPF_MINOR_VERSION != 1
#error "NetBSD BPF version has changed"
#endif

/* The number of BPF devices. */
#define NR_BPFDEV       16

/* BPF receive buffer size: allowed range and default. */
#define BPF_BUF_MIN     BPF_WORDALIGN(sizeof(struct bpf_hdr))
#define BPF_BUF_DEF     32768
#define BPF_BUF_MAX     262144

/*
 * By opening /dev/bpf, one will obtain a cloned device with a different minor
 * number, which maps to one of the BPF devices.
 */
#define BPFDEV_MINOR            0       /* minor number of /dev/bpf */
#define BPFDEV_BASE_MINOR       1       /* base minor number for BPF devices */

static struct bpfdev {
        struct bpfdev_link bpf_link;    /* structure link, MUST be first */
        TAILQ_ENTRY(bpfdev) bpf_next;   /* next on free or interface list */
        struct ifdev *bpf_ifdev;        /* associated interface, or NULL */
        unsigned int bpf_flags;         /* flags (BPFF_) */
        size_t bpf_size;                /* size of packet buffers */
        char *bpf_sbuf;                 /* store buffer (mmap'd, or NULL) */
        char *bpf_hbuf;                 /* hold buffer (mmap'd, or NULL) */
        size_t bpf_slen;                /* used part of store buffer */
        size_t bpf_hlen;                /* used part of hold buffer */
        struct bpf_insn *bpf_filter;    /* verified BPF filter, or NULL */
        size_t bpf_filterlen;           /* length of filter, for munmap */
        pid_t bpf_pid;                  /* process ID of last using process */
        clock_t bpf_timeout;            /* timeout for read calls (0 = none) */
        struct {                        /* state for pending read request */
                endpoint_t br_endpt;    /* reading endpoint, or NONE */
                cp_grant_id_t br_grant; /* grant for reader's buffer */
                cdev_id_t br_id;        /* read request identifier */
                minix_timer_t br_timer; /* timer for read timeout */
        } bpf_read;
        struct {                        /* state for pending select request */
                endpoint_t bs_endpt;    /* selecting endpoint, or NONE */
                unsigned int bs_selops; /* pending select operations */
        } bpf_select;
        struct {                        /* packet capture statistics */
                uint64_t bs_recv;       /* # of packets run through filter */
                uint64_t bs_drop;       /* # of packets dropped: buffer full */
                uint64_t bs_capt;       /* # of packets accepted by filter */
        } bpf_stat;
} bpf_array[NR_BPFDEV];

#define BPFF_IN_USE     0x01    /* this BPF device object is in use */
#define BPFF_PROMISC    0x02    /* promiscuous mode enabled */
#define BPFF_IMMEDIATE  0x04    /* immediate mode is enabled */
#define BPFF_SEESENT    0x08    /* also process host-sent packets */
#define BPFF_HDRCMPLT   0x10    /* do not fill in link-layer source */
#define BPFF_FEEDBACK   0x20    /* feed back written packet as input */

static TAILQ_HEAD(, bpfdev_link) bpfl_freelist; /* list of free BPF devices */

static struct bpf_stat bpf_stat;

static ssize_t bpfdev_peers(struct rmib_call *, struct rmib_node *,
    struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET NET_BPF subtree. All nodes are dynamically numbered. */
static struct rmib_node net_bpf_table[] = {
        RMIB_INT(RMIB_RO, BPF_BUF_MAX, "maxbufsize",
            "Maximum size for data capture buffer"),    /* TODO: read-write */
        RMIB_STRUCT(RMIB_RO, sizeof(bpf_stat), &bpf_stat, "stats",
            "BPF stats"),
        RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, bpfdev_peers, "peers",
            "BPF peers"),
};

static struct rmib_node net_bpf_node =
    RMIB_NODE(RMIB_RO, net_bpf_table, "bpf", "BPF options");

/*
 * Initialize the BPF module.
 */
void
bpfdev_init(void)
{
        const int mib[] = { CTL_NET, NET_BPF };
        unsigned int slot;
        int r;

        /* Initialize data structures. */
        TAILQ_INIT(&bpfl_freelist);

        for (slot = 0; slot < __arraycount(bpf_array); slot++) {
                bpf_array[slot].bpf_flags = 0;

                TAILQ_INSERT_TAIL(&bpfl_freelist, &bpf_array[slot].bpf_link,
                    bpfl_next);
        }

        memset(&bpf_stat, 0, sizeof(bpf_stat));

        /* Register the "net.bpf" subtree with the MIB service. */
        if ((r = rmib_register(mib, __arraycount(mib), &net_bpf_node)) != OK)
                panic("unable to register net.bpf RMIB tree: %d", r);
}

/*
 * Given a BPF device object, return the corresponding minor number.
 */
static devminor_t
bpfdev_get_minor(struct bpfdev * bpfdev)
{

        assert(bpfdev != NULL);

        return BPFDEV_BASE_MINOR + (devminor_t)(bpfdev - bpf_array);
}

/*
 * Given a minor number, return the corresponding BPF device object, or NULL if
 * the minor number does not identify a BPF device.
 */
static struct bpfdev *
bpfdev_get_by_minor(devminor_t minor)
{

        if (minor < BPFDEV_BASE_MINOR ||
            (unsigned int)minor >= BPFDEV_BASE_MINOR + __arraycount(bpf_array))
                return NULL;

        return &bpf_array[minor - BPFDEV_BASE_MINOR];
}

/*
 * Open a BPF device, returning a cloned device instance.
 */
static int
bpfdev_open(devminor_t minor, int access __unused, endpoint_t user_endpt)
{
        struct bpfdev_link *bpfl;
        struct bpfdev *bpf;

        /* Disallow opening cloned devices through device nodes. */
        if (minor != BPFDEV_MINOR)
                return ENXIO;

        if (TAILQ_EMPTY(&bpfl_freelist))
                return ENOBUFS;

        bpfl = TAILQ_FIRST(&bpfl_freelist);
        TAILQ_REMOVE(&bpfl_freelist, bpfl, bpfl_next);

        bpf = (struct bpfdev *)bpfl;

        memset(bpf, 0, sizeof(*bpf));

        bpf->bpf_flags = BPFF_IN_USE | BPFF_SEESENT;
        bpf->bpf_size = BPF_BUF_DEF;
        bpf->bpf_pid = getnpid(user_endpt);
        bpf->bpf_read.br_endpt = NONE;
        bpf->bpf_select.bs_endpt = NONE;

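        /* Return the newly allocated minor number to VFS as a cloned device. */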
        return CDEV_CLONED | bpfdev_get_minor(bpf);
}

/*
 * Close a BPF device.
 */
static int
bpfdev_close(devminor_t minor)
{
        struct bpfdev *bpf;

        if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
                return EINVAL;

        /*
         * There cannot possibly be a pending read request, so we never need to
         * cancel the read timer from here either.
         */
        assert(bpf->bpf_read.br_endpt == NONE);

        if (bpf->bpf_sbuf != NULL) {
                assert(bpf->bpf_hbuf != NULL);

                if (munmap(bpf->bpf_sbuf, bpf->bpf_size) != 0)
                        panic("munmap failed: %d", -errno);
                if (munmap(bpf->bpf_hbuf, bpf->bpf_size) != 0)
                        panic("munmap failed: %d", -errno);

                bpf->bpf_sbuf = NULL;
                bpf->bpf_hbuf = NULL;
        } else
                assert(bpf->bpf_hbuf == NULL);

        if (bpf->bpf_filter != NULL) {
                assert(bpf->bpf_filterlen > 0);

                if (munmap(bpf->bpf_filter, bpf->bpf_filterlen) != 0)
                        panic("munmap failed: %d", -errno);

                bpf->bpf_filter = NULL;
        }

        /*
         * If the BPF device was attached to an interface, and that interface
         * has not disappeared in the meantime, detach from it now.
         */
        if (bpf->bpf_ifdev != NULL) {
                if (bpf->bpf_flags & BPFF_PROMISC)
                        ifdev_clear_promisc(bpf->bpf_ifdev);

                ifdev_detach_bpf(bpf->bpf_ifdev, &bpf->bpf_link);

                bpf->bpf_ifdev = NULL;
        }

        bpf->bpf_flags = 0;     /* mark as no longer in use */

        TAILQ_INSERT_HEAD(&bpfl_freelist, &bpf->bpf_link, bpfl_next);

        return OK;
}

/*
 * Rotate buffers for the BPF device, by swapping the store buffer and the hold
 * buffer.
 */
static void
bpfdev_rotate(struct bpfdev * bpf)
{
        char *buf;
        size_t len;

        /*
         * When rotating, the store buffer may or may not be empty, but the
         * hold buffer must always be empty.
         */
        assert(bpf->bpf_hlen == 0);

        buf = bpf->bpf_sbuf;
        len = bpf->bpf_slen;
        bpf->bpf_sbuf = bpf->bpf_hbuf;
        bpf->bpf_slen = bpf->bpf_hlen;
        bpf->bpf_hbuf = buf;
        bpf->bpf_hlen = len;
}

/*
 * Test whether any of the given select operations are ready on the BPF device,
 * and return the set of ready operations.
 */
static unsigned int
bpfdev_test_select(struct bpfdev * bpf, unsigned int ops)
{
        unsigned int ready_ops;

        ready_ops = 0;

        /*
         * The BPF device is ready for reading if the hold buffer is not empty
         * (i.e.: the store buffer has been filled up completely and was
         * therefore rotated) or if immediate mode is set and the store buffer
         * is not empty (i.e.: any packet is available at all). In the latter
         * case, the buffers will be rotated during the read. We do not
         * support applying the read timeout to selects and maintaining state
         * between the select and the following read, because even though
         * libpcap claims that this is the right behavior, it is just insane.
         */
        if (ops & CDEV_OP_RD) {
                if (bpf->bpf_ifdev == NULL)
                        ready_ops |= CDEV_OP_RD;
                else if (bpf->bpf_hlen > 0)
                        ready_ops |= CDEV_OP_RD;
                else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
                    bpf->bpf_slen > 0)
                        ready_ops |= CDEV_OP_RD;
        }

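        /* Writes on a BPF device never block, so always report write readiness. */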
        if (ops & CDEV_OP_WR)
                ready_ops |= CDEV_OP_WR;

        return ready_ops;
}

/*
 * There has been a state change on the BPF device. If now possible, resume a
 * pending select query, if any.
 */
static void
bpfdev_resume_select(struct bpfdev * bpf)
{
        unsigned int ops, ready_ops;
        endpoint_t endpt;

        /* First see if there is a pending select request at all. */
        if ((endpt = bpf->bpf_select.bs_endpt) == NONE)
                return;
        ops = bpf->bpf_select.bs_selops;

        assert(ops != 0);

        /* Then see if any of the pending operations are now ready. */
        if ((ready_ops = bpfdev_test_select(bpf, ops)) == 0)
                return;

        /* If so, notify VFS about the ready operations. */
        chardriver_reply_select(bpf->bpf_select.bs_endpt,
            bpfdev_get_minor(bpf), ready_ops);

        /*
         * Forget about the ready operations. If that leaves no pending
         * operations, forget about the select request altogether.
         */
        if ((bpf->bpf_select.bs_selops &= ~ready_ops) == 0)
                bpf->bpf_select.bs_endpt = NONE;
}

/*
 * There has been a state change on the BPF device. If now possible, resume a
 * pending read request, if any. If the call is a result of a timeout,
 * 'is_timeout' is set. In that case, the read request must be resumed with an
 * EAGAIN error if no packets are available, and the running timer must be
 * canceled. Otherwise, the resumption is due to a full buffer or a
 * disappeared interface, and 'is_timeout' is not set. In this case, the read
 * request must be resumed with an I/O error if no packets are available.
 */
static void
bpfdev_resume_read(struct bpfdev * bpf, int is_timeout)
{
        ssize_t r;

        assert(bpf->bpf_read.br_endpt != NONE);

        /*
         * If the hold buffer is still empty, see if the store buffer has
         * any packets to copy out.
         */
        if (bpf->bpf_hlen == 0)
                bpfdev_rotate(bpf);

        /* Return any available packets, or otherwise an error. */
        if (bpf->bpf_hlen > 0) {
                assert(bpf->bpf_hlen <= bpf->bpf_size);

                r = sys_safecopyto(bpf->bpf_read.br_endpt,
                    bpf->bpf_read.br_grant, 0, (vir_bytes)bpf->bpf_hbuf,
                    bpf->bpf_hlen);

                if (r == OK) {
                        r = (ssize_t)bpf->bpf_hlen;

                        bpf->bpf_hlen = 0;

                        assert(bpf->bpf_slen != bpf->bpf_size);

                        /*
                         * Allow readers to get the last packets after the
                         * interface has disappeared, before getting errors.
                         */
                        if (bpf->bpf_ifdev == NULL)
                                bpfdev_rotate(bpf);
                }
        } else
                r = (is_timeout) ? EAGAIN : EIO;

        chardriver_reply_task(bpf->bpf_read.br_endpt, bpf->bpf_read.br_id, r);

        bpf->bpf_read.br_endpt = NONE;

        /* Was there still a timer running? Then cancel it now. */
        if (bpf->bpf_timeout > 0 && !is_timeout)
                cancel_timer(&bpf->bpf_read.br_timer);
}

/*
 * A read timeout has triggered for the BPF device. Wake up the pending read
 * request.
 */
static void
bpfdev_timeout(int arg)
{
        struct bpfdev *bpf;

        assert(arg >= 0 && (unsigned int)arg < __arraycount(bpf_array));

        bpf = &bpf_array[arg];

        assert(bpf->bpf_read.br_endpt != NONE);

        bpfdev_resume_read(bpf, TRUE /*is_timeout*/);
}

/*
 * Read from a BPF device.
 */
static ssize_t
bpfdev_read(devminor_t minor, uint64_t position, endpoint_t endpt,
    cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
{
        struct bpfdev *bpf;
        ssize_t r;
        int suspend;

        if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
                return EINVAL;

        /* Allow only one read call at a time. */
        if (bpf->bpf_read.br_endpt != NONE)
                return EIO;

        /* Has this BPF device been configured at all yet? */
        if (bpf->bpf_sbuf == NULL)
                return EINVAL;

        /*
         * Does the read call size match the entire buffer size? This is a
         * ridiculous requirement but it makes our job quite a bit easier..
         */
        if (size != bpf->bpf_size)
                return EINVAL;

        /*
         * Following standard receive semantics, if the interface is gone,
         * return all the packets that were pending before returning an error.
         * This requires extra buffer rotations after read completion, too.
         */
        if (bpf->bpf_ifdev == NULL && bpf->bpf_hlen == 0)
                return EIO;

        /*
         * If immediate mode is not enabled, we should always suspend the read
         * call if the hold buffer is empty. If immediate mode is enabled, we
         * should only suspend the read call if both buffers are empty, and
         * return data from the hold buffer or otherwise the store buffer,
         * whichever is not empty. A non-blocking call behaves as though
         * immediate mode is enabled, except it will return EAGAIN instead of
         * suspending the read call if both buffers are empty. Thus, we may
         * have to rotate buffers for both immediate mode and non-blocking
         * calls. The latter is necessary for libpcap to behave correctly.
         */
        if ((flags & CDEV_NONBLOCK) || (bpf->bpf_flags & BPFF_IMMEDIATE))
                suspend = (bpf->bpf_hlen == 0 && bpf->bpf_slen == 0);
        else
                suspend = (bpf->bpf_hlen == 0);

        if (suspend) {
                if (flags & CDEV_NONBLOCK)
                        return EAGAIN;

                /* Suspend the read call for later. */
                bpf->bpf_read.br_endpt = endpt;
                bpf->bpf_read.br_grant = grant;
                bpf->bpf_read.br_id = id;

                /* Set a timer if requested. */
                if (bpf->bpf_timeout > 0)
                        set_timer(&bpf->bpf_read.br_timer, bpf->bpf_timeout,
                            bpfdev_timeout, (int)(bpf - bpf_array));

                return EDONTREPLY;
        }

        /* If we get here, either buffer has data; rotate buffers if needed. */
        if (bpf->bpf_hlen == 0)
                bpfdev_rotate(bpf);
        assert(bpf->bpf_hlen > 0);

        if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)bpf->bpf_hbuf,
            bpf->bpf_hlen)) != OK)
                return r;

        r = (ssize_t)bpf->bpf_hlen;

        bpf->bpf_hlen = 0;

        /*
         * If the store buffer is exactly full, rotate it now. Also, if the
         * interface has disappeared, the store buffer will never fill up.
         * Rotate it so that the application will get any remaining data before
         * getting errors about the interface being gone.
         */
        if (bpf->bpf_slen == bpf->bpf_size || bpf->bpf_ifdev == NULL)
                bpfdev_rotate(bpf);

        return r;
}

/*
 * Write to a BPF device.
 */
static ssize_t
bpfdev_write(devminor_t minor, uint64_t position, endpoint_t endpt,
    cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
{
        struct bpfdev *bpf;
        struct pbuf *pbuf, *pptr, *pcopy;
        size_t off;
        err_t err;
        int r;

        if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
                return EINVAL;

        if (bpf->bpf_ifdev == NULL)
                return EINVAL;

        /* VFS skips zero-sized I/O calls right now, but that may change. */
        if (size == 0)
                return 0;       /* nothing to do */

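        /* Limit the packet to the link-layer header plus the interface MTU. */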
        if (size > ifdev_get_hdrlen(bpf->bpf_ifdev) +
            ifdev_get_mtu(bpf->bpf_ifdev))
                return EMSGSIZE;

        if ((pbuf = pchain_alloc(PBUF_LINK, size)) == NULL)
                return ENOMEM;

        /* TODO: turn this into a series of vector copies. */
        off = 0;
        for (pptr = pbuf; pptr != NULL; pptr = pptr->next) {
                if ((r = sys_safecopyfrom(endpt, grant, off,
                    (vir_bytes)pptr->payload, pptr->len)) != OK) {
                        pbuf_free(pbuf);

                        return r;
                }
                off += pptr->len;
        }
        assert(off == size);

        /*
         * In feedback mode, we cannot use the same packet buffers for both
         * output and input, so make a copy. We do this before calling the
         * output function, which may change part of the buffers, because the
         * BSDs take this approach as well.
         */
        if (bpf->bpf_flags & BPFF_FEEDBACK) {
                if ((pcopy = pchain_alloc(PBUF_LINK, size)) == NULL) {
                        pbuf_free(pbuf);

                        return ENOMEM;
                }

                if (pbuf_copy(pcopy, pbuf) != ERR_OK)
                        panic("unexpected pbuf copy failure");
        } else
                pcopy = NULL;

        /* Pass in the packet as output, and free it again. */
        err = ifdev_output(bpf->bpf_ifdev, pbuf, NULL /*netif*/,
            TRUE /*to_bpf*/, !!(bpf->bpf_flags & BPFF_HDRCMPLT));

        pbuf_free(pbuf);

        /* In feedback mode, pass in the copy as input, if output succeeded. */
        if (err == ERR_OK && (bpf->bpf_flags & BPFF_FEEDBACK))
                ifdev_input(bpf->bpf_ifdev, pcopy, NULL /*netif*/,
                    FALSE /*to_bpf*/);
        else if (pcopy != NULL)
                pbuf_free(pcopy);

        return (err == ERR_OK) ? (ssize_t)size : util_convert_err(err);
}

/*
 * Attach a BPF device to a network interface, using the interface name given
 * in an ifreq structure. As side effect, allocate hold and store buffers for
 * the device. These buffers will stay allocated until the device is closed,
 * even though the interface may disappear before that. Return OK if the BPF
 * device was successfully attached to the interface, or a negative error code
 * otherwise.
 */
static int
bpfdev_attach(struct bpfdev * bpf, struct ifreq * ifr)
{
        struct ifdev *ifdev;
        void *sbuf, *hbuf;

        /* Find the interface with the given name. */
        ifr->ifr_name[sizeof(ifr->ifr_name) - 1] = '\0';
        if ((ifdev = ifdev_find_by_name(ifr->ifr_name)) == NULL)
                return ENXIO;

        /*
         * Allocate a store buffer and a hold buffer. Preallocate the memory,
         * or we might get killed later during low-memory conditions.
         */
        if ((sbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED)
                return ENOMEM;

        if ((hbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) {
                (void)munmap(sbuf, bpf->bpf_size);

                return ENOMEM;
        }

        bpf->bpf_ifdev = ifdev;
        bpf->bpf_sbuf = sbuf;
        bpf->bpf_hbuf = hbuf;
        assert(bpf->bpf_slen == 0);
        assert(bpf->bpf_hlen == 0);

        ifdev_attach_bpf(ifdev, &bpf->bpf_link);

        return OK;
}

/*
 * Detach the BPF device from its interface, which is about to disappear.
 */
void
bpfdev_detach(struct bpfdev_link * bpfl)
{
        struct bpfdev *bpf = (struct bpfdev *)bpfl;

        assert(bpf->bpf_flags & BPFF_IN_USE);
        assert(bpf->bpf_ifdev != NULL);

        /*
         * We deliberately leave the buffers allocated here, for two reasons:
         *
         * 1) it lets applications read any last packets in the buffers;
         * 2) it prevents reattaching the BPF device to another interface.
         */
        bpf->bpf_ifdev = NULL;

        /*
         * Resume pending read and select requests, returning any data left,
         * or an error if none.
         */
        if (bpf->bpf_hlen == 0)
                bpfdev_rotate(bpf);

        if (bpf->bpf_read.br_endpt != NONE)
                bpfdev_resume_read(bpf, FALSE /*is_timeout*/);

        bpfdev_resume_select(bpf);
}

/*
 * Flush the given BPF device, resetting its buffer contents and statistics
 * counters.
 */
static void
bpfdev_flush(struct bpfdev * bpf)
{

        bpf->bpf_slen = 0;
        bpf->bpf_hlen = 0;

        bpf->bpf_stat.bs_recv = 0;
        bpf->bpf_stat.bs_drop = 0;
        bpf->bpf_stat.bs_capt = 0;
}

/*
 * Install a filter program on the BPF device. A new filter replaces any old
 * one. A zero-sized filter simply clears a previous filter. On success,
 * perform a flush and return OK. On failure, return a negative error code
 * without making any modifications to the current filter.
 */
static int
bpfdev_setfilter(struct bpfdev * bpf, endpoint_t endpt, cp_grant_id_t grant)
{
        struct bpf_insn *filter;
        unsigned int count;
        size_t len;
        int r;

        if ((r = sys_safecopyfrom(endpt, grant,
            offsetof(struct minix_bpf_program, mbf_len), (vir_bytes)&count,
            sizeof(count))) != OK)
                return r;

        if (count > BPF_MAXINSNS)
                return EINVAL;
        len = count * sizeof(struct bpf_insn);

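        /* Copy in and validate the new filter program, if one is given. */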
        if (len > 0) {
                if ((filter = (struct bpf_insn *)mmap(NULL, len,
                    PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0)) ==
                    MAP_FAILED)
                        return ENOMEM;

                if ((r = sys_safecopyfrom(endpt, grant,
                    offsetof(struct minix_bpf_program, mbf_insns),
                    (vir_bytes)filter, len)) != OK) {
                        (void)munmap(filter, len);

                        return r;
                }

                if (!bpf_validate(filter, count)) {
                        (void)munmap(filter, len);

                        return EINVAL;
                }
        } else
                filter = NULL;

        if (bpf->bpf_filter != NULL)
                (void)munmap(bpf->bpf_filter, bpf->bpf_filterlen);

        bpf->bpf_filter = filter;
        bpf->bpf_filterlen = len;

        bpfdev_flush(bpf);

        return OK;
}

/*
 * Process an I/O control request on the BPF device.
 */
static int
bpfdev_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
    cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id)
{
        struct bpfdev *bpf;
        struct bpf_stat bs;
        struct bpf_version bv;
        struct bpf_dltlist bfl;
        struct timeval tv;
        struct ifreq ifr;
        unsigned int uval;
        int r, val;

        if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
                return EINVAL;

        /*
         * We do not support multiple concurrent requests in this module. That
         * not only means that we forbid a read(2) call on a BPF device object
         * while another read(2) is already pending: we also disallow IOCTL
         * calls while such a read(2) call is in progress. This restriction
         * should never be a problem for user programs, and allows us to rely
         * on the fact that no settings can change between the start and end
         * of any read call. As a side note, pending select(2) queries may be
         * similarly affected, and will also not be fully accurate if any
         * options are changed while pending.
         */
        if (bpf->bpf_read.br_endpt != NONE)
                return EIO;

        bpf->bpf_pid = getnpid(user_endpt);

        /* These are in order of the NetBSD BIOC.. IOCTL numbers. */
        switch (request) {
        case BIOCGBLEN:
                uval = bpf->bpf_size;

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval));

        case BIOCSBLEN:
                if (bpf->bpf_sbuf != NULL)
                        return EINVAL;

                if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval))) != OK)
                        return r;

                if (uval < BPF_BUF_MIN)
                        uval = BPF_BUF_MIN;
                else if (uval > BPF_BUF_MAX)
                        uval = BPF_BUF_MAX;

                /* Is this the right thing to do? It doesn't matter for us. */
                uval = BPF_WORDALIGN(uval);

                if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval))) != OK)
                        return r;

                bpf->bpf_size = uval;

                return OK;

        case MINIX_BIOCSETF:
                return bpfdev_setfilter(bpf, endpt, grant);

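        /*
         * Put the interface in promiscuous mode; the mode is left enabled
         * until the BPF device is closed.
         */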
        case BIOCPROMISC:
                if (bpf->bpf_ifdev == NULL)
                        return EINVAL;

                if (!(bpf->bpf_flags & BPFF_PROMISC)) {
                        if (!ifdev_set_promisc(bpf->bpf_ifdev))
                                return EINVAL;

                        bpf->bpf_flags |= BPFF_PROMISC;
                }

                return OK;

        case BIOCFLUSH:
                bpfdev_flush(bpf);

                return OK;

        case BIOCGDLT:
                if (bpf->bpf_ifdev == NULL)
                        return EINVAL;

                /* TODO: support for type configuration per BPF device. */
                uval = ifdev_get_dlt(bpf->bpf_ifdev);

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval));

        case BIOCGETIF:
                if (bpf->bpf_ifdev == NULL)
                        return EINVAL;

                memset(&ifr, 0, sizeof(ifr));
                strlcpy(ifr.ifr_name, ifdev_get_name(bpf->bpf_ifdev),
                    sizeof(ifr.ifr_name));

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&ifr,
                    sizeof(ifr));

        case BIOCSETIF:
                /*
                 * Test on the presence of a buffer rather than on an interface
                 * since the latter may disappear and thus be reset to NULL, in
                 * which case we do not want to allow rebinding to another.
                 */
                if (bpf->bpf_sbuf != NULL)
                        return EINVAL;

                if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&ifr,
                    sizeof(ifr))) != OK)
                        return r;

                return bpfdev_attach(bpf, &ifr);

        case BIOCGSTATS:
                /*
                 * Why do we not embed a bpf_stat structure directly in the
                 * BPF device structure? Well, bpf_stat has massive padding..
                 */
                memset(&bs, 0, sizeof(bs));
                bs.bs_recv = bpf->bpf_stat.bs_recv;
                bs.bs_drop = bpf->bpf_stat.bs_drop;
                bs.bs_capt = bpf->bpf_stat.bs_capt;

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bs,
                    sizeof(bs));

        case BIOCIMMEDIATE:
                if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval))) != OK)
                        return r;

                if (uval)
                        bpf->bpf_flags |= BPFF_IMMEDIATE;
                else
                        bpf->bpf_flags &= ~BPFF_IMMEDIATE;

                return OK;

        case BIOCVERSION:
                memset(&bv, 0, sizeof(bv));
                bv.bv_major = BPF_MAJOR_VERSION;
                bv.bv_minor = BPF_MINOR_VERSION;

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bv,
                    sizeof(bv));

        case BIOCGHDRCMPLT:
                uval = !!(bpf->bpf_flags & BPFF_HDRCMPLT);

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval));

        case BIOCSHDRCMPLT:
                if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval))) != OK)
                        return r;

                if (uval)
                        bpf->bpf_flags |= BPFF_HDRCMPLT;
                else
                        bpf->bpf_flags &= ~BPFF_HDRCMPLT;

                return OK;

        case BIOCSDLT:
                if (bpf->bpf_ifdev == NULL)
                        return EINVAL;

                if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval))) != OK)
                        return r;

                /* TODO: support for type configuration per BPF device. */
                if (uval != ifdev_get_dlt(bpf->bpf_ifdev))
                        return EINVAL;

                return OK;

        case MINIX_BIOCGDLTLIST:
                if (bpf->bpf_ifdev == NULL)
                        return EINVAL;

                if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&bfl,
                    sizeof(bfl))) != OK)
                        return r;

                if (bfl.bfl_list != NULL) {
                        if (bfl.bfl_len < 1)
                                return ENOMEM;

                        /*
                         * Copy out the 'list', which consists of one entry.
                         * If we were to produce multiple entries, we would
                         * have to check against the MINIX_BPF_MAXDLT limit.
                         */
                        uval = ifdev_get_dlt(bpf->bpf_ifdev);

                        if ((r = sys_safecopyto(endpt, grant,
                            offsetof(struct minix_bpf_dltlist, mbfl_list),
                            (vir_bytes)&uval, sizeof(uval))) != OK)
                                return r;
                }
                bfl.bfl_len = 1;

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bfl,
                    sizeof(bfl));

        case BIOCGSEESENT:
                uval = !!(bpf->bpf_flags & BPFF_SEESENT);

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval));

        case BIOCSSEESENT:
                if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval))) != OK)
                        return r;

                if (uval)
                        bpf->bpf_flags |= BPFF_SEESENT;
                else
                        bpf->bpf_flags &= ~BPFF_SEESENT;

                return OK;

        case BIOCSRTIMEOUT:
                if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&tv,
                    sizeof(tv))) != OK)
                        return r;

                if ((r = util_timeval_to_ticks(&tv, &bpf->bpf_timeout)) != OK)
                        return r;

                return OK;

        case BIOCGRTIMEOUT:
                util_ticks_to_timeval(bpf->bpf_timeout, &tv);

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&tv,
                    sizeof(tv));

        case BIOCGFEEDBACK:
                uval = !!(bpf->bpf_flags & BPFF_FEEDBACK);

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval));

        case BIOCSFEEDBACK:
                if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
                    sizeof(uval))) != OK)
                        return r;

                if (uval)
                        bpf->bpf_flags |= BPFF_FEEDBACK;
                else
                        bpf->bpf_flags &= ~BPFF_FEEDBACK;

                return OK;

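        /* FIONREAD reports how many bytes are currently available for reading. */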
        case FIONREAD:
                val = 0;
                if (bpf->bpf_hlen > 0)
                        val = bpf->bpf_hlen;
                else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
                    bpf->bpf_slen > 0)
                        val = bpf->bpf_slen;
                else
                        val = 0;

                return sys_safecopyto(endpt, grant, 0, (vir_bytes)&val,
                    sizeof(val));

        default:
                return ENOTTY;
        }
}

/*
 * Cancel a previously suspended request on a BPF device. Since only read
 * requests may be suspended (select is handled differently), the cancel
 * request must be for a read request. Note that character devices currently
 * (still) behave slightly differently from socket devices here: while socket
 * drivers are supposed to respond to the original request, character drivers
 * must respond to the original request from the cancel callback.
 */
static int
bpfdev_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id)
{
        struct bpfdev *bpf;

        if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
                return EDONTREPLY;

        /* Is this a cancel request for the currently pending read request? */
        if (bpf->bpf_read.br_endpt != endpt || bpf->bpf_read.br_id != id)
                return EDONTREPLY;

        /* If so, cancel the read request. */
        if (bpf->bpf_timeout > 0)
                cancel_timer(&bpf->bpf_read.br_timer);

        bpf->bpf_read.br_endpt = NONE;

        return EINTR;   /* the return value for the canceled read request */
}

/*
 * Perform a select query on a BPF device.
 */
static int
bpfdev_select(devminor_t minor, unsigned int ops, endpoint_t endpt)
{
        struct bpfdev *bpf;
        unsigned int r, notify;

        if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
                return EINVAL;

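        /*
         * CDEV_NOTIFY is set if the caller wants to be notified later about
         * any requested operations that are not ready right away.
         */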
        notify = (ops & CDEV_NOTIFY);
        ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR);

        r = bpfdev_test_select(bpf, ops);

        /*
         * For the operations that were not immediately ready, if requested,
         * save the select request for later.
         */
        ops &= ~r;

        if (ops != 0 && notify) {
                if (bpf->bpf_select.bs_endpt != NONE) {
                        /* Merge in the operations with any earlier request. */
                        if (bpf->bpf_select.bs_endpt != endpt)
                                return EIO;
                        bpf->bpf_select.bs_selops |= ops;
                } else {
                        bpf->bpf_select.bs_endpt = endpt;
                        bpf->bpf_select.bs_selops = ops;
                }
        }

        return r;
}

/*
 * Process an incoming packet on the interface to which the given BPF device is
 * attached. If the packet passes the filter (if any), store as much as
 * requested of it in the store buffer, rotating buffers if needed and resuming
 * suspended read and select requests as appropriate. This function is also
 * called through bpfdev_output() below.
 */
void
bpfdev_input(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
{
        struct bpfdev *bpf = (struct bpfdev *)bpfl;
        struct timespec ts;
        struct bpf_hdr bh;
        const struct pbuf *pptr;
        size_t caplen, hdrlen, totlen, off, chunk;
        int hfull;

        /*
         * Apparently bs_recv is the counter of packets that were run through
         * the filter, not the number of packets that were or could be received
         * by the user (which is what I got from the manual page.. oh well).
         */
        bpf->bpf_stat.bs_recv++;
        bpf_stat.bs_recv++;

        /*
         * Run the packet through the BPF device's filter to see whether the
         * packet should be stored and if so, how much of it. If no filter is
         * set, all packets will be stored in their entirety.
         */
        caplen = bpf_filter_ext(bpf->bpf_filter, pbuf, (u_char *)pbuf->payload,
            pbuf->tot_len, pbuf->len);

        if (caplen == 0)
                return;         /* no match; ignore packet */

        if (caplen > pbuf->tot_len)
                caplen = pbuf->tot_len;

        /* Truncate packet entries to the full size of the buffers. */
        hdrlen = BPF_WORDALIGN(sizeof(bh));
        totlen = BPF_WORDALIGN(hdrlen + caplen);

        if (totlen > bpf->bpf_size) {
                totlen = bpf->bpf_size;
                caplen = totlen - hdrlen;
        }
        assert(totlen >= hdrlen);

        bpf->bpf_stat.bs_capt++;
        bpf_stat.bs_capt++;

        assert(bpf->bpf_sbuf != NULL);
        if (totlen > bpf->bpf_size - bpf->bpf_slen) {
                /*
                 * If the store buffer is full and the hold buffer is not
                 * empty, we cannot swap the two buffers, and so we must drop
                 * the current packet.
                 */
                if (bpf->bpf_hlen > 0) {
                        bpf->bpf_stat.bs_drop++;
                        bpf_stat.bs_drop++;

                        return;
                }

                /*
                 * Rotate the buffers: the hold buffer will now be "full" and
                 * ready to be read - it may not actually be entirely full, but
                 * we could not fit this packet and we are not going to deliver
                 * packets out of order..
                 */
                bpfdev_rotate(bpf);

                hfull = TRUE;
        } else
                hfull = FALSE;

        /*
         * Retrieve the capture time for the packet. Ideally this would be
         * done only once per accepted packet, but we do not expect many BPF
         * devices to be receiving the same packets often enough to make that
         * worth it.
         */
        clock_time(&ts);

        /*
         * Copy the packet into the store buffer, including a newly generated
         * header. Zero any padding areas, even if strictly not necessary.
         */
        memset(&bh, 0, sizeof(bh));
        bh.bh_tstamp.tv_sec = ts.tv_sec;
        bh.bh_tstamp.tv_usec = ts.tv_nsec / 1000;
        bh.bh_caplen = caplen;
        bh.bh_datalen = pbuf->tot_len;
        bh.bh_hdrlen = hdrlen;

        assert(bpf->bpf_sbuf != NULL);
        off = bpf->bpf_slen;

        memcpy(&bpf->bpf_sbuf[off], &bh, sizeof(bh));
        if (hdrlen > sizeof(bh))
                memset(&bpf->bpf_sbuf[off + sizeof(bh)], 0,
                    hdrlen - sizeof(bh));
        off += hdrlen;

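        /*
         * Copy the captured part of the packet data, which may span multiple
         * buffers of the pbuf chain, into the store buffer.
         */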
        for (pptr = pbuf; pptr != NULL && caplen > 0; pptr = pptr->next) {
                chunk = pptr->len;
                if (chunk > caplen)
                        chunk = caplen;

                memcpy(&bpf->bpf_sbuf[off], pptr->payload, chunk);

                off += chunk;
                caplen -= chunk;
        }

        assert(off <= bpf->bpf_slen + totlen);
        if (bpf->bpf_slen + totlen > off)
                memset(&bpf->bpf_sbuf[off], 0, bpf->bpf_slen + totlen - off);

        bpf->bpf_slen += totlen;

        /*
         * Edge case: if the hold buffer is empty and the store buffer is now
         * exactly full, rotate buffers so that the packets can be read
         * immediately, without waiting for the next packet to cause rotation.
         */
        if (bpf->bpf_hlen == 0 && bpf->bpf_slen == bpf->bpf_size) {
                bpfdev_rotate(bpf);

                hfull = TRUE;
        }

        /*
         * If the hold buffer is now full, or if immediate mode is enabled,
         * then we now have data to deliver to userland. See if we can wake up
         * any read or select call (either but not both here).
         */
        if (hfull || (bpf->bpf_flags & BPFF_IMMEDIATE)) {
                if (bpf->bpf_read.br_endpt != NONE)
                        bpfdev_resume_read(bpf, FALSE /*is_timeout*/);
                else
                        bpfdev_resume_select(bpf);
        }
}

/*
 * Process an outgoing packet on the interface to which the given BPF device is
 * attached. If the BPF device is configured to capture outgoing packets as
 * well, attempt to capture the packet as per bpfdev_input().
 */
void
bpfdev_output(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
{
        struct bpfdev *bpf = (struct bpfdev *)bpfl;

        if (bpf->bpf_flags & BPFF_SEESENT)
                bpfdev_input(bpfl, pbuf);
}

/*
 * Fill the given 'bde' structure with information about BPF device 'bpf'.
 */
static void
bpfdev_get_info(struct bpf_d_ext * bde, const struct bpfdev * bpf)
{

        bde->bde_bufsize = bpf->bpf_size;
        bde->bde_promisc = !!(bpf->bpf_flags & BPFF_PROMISC);
        bde->bde_state = BPF_IDLE;
        bde->bde_immediate = !!(bpf->bpf_flags & BPFF_IMMEDIATE);
        bde->bde_hdrcmplt = !!(bpf->bpf_flags & BPFF_HDRCMPLT);
        bde->bde_seesent = !!(bpf->bpf_flags & BPFF_SEESENT);
        /*
         * NetBSD updates the process ID upon device open, close, ioctl, and
         * poll. From those, only open and ioctl make sense for us. Sadly
         * there is no way to indicate "no known PID" to netstat(1), so we
         * cannot even save just the endpoint and look up the corresponding PID
         * later, since the user process may be gone by then.
         */
        bde->bde_pid = bpf->bpf_pid;
        bde->bde_rcount = bpf->bpf_stat.bs_recv;
        bde->bde_dcount = bpf->bpf_stat.bs_drop;
        bde->bde_ccount = bpf->bpf_stat.bs_capt;
        if (bpf->bpf_ifdev != NULL)
                strlcpy(bde->bde_ifname, ifdev_get_name(bpf->bpf_ifdev),
                    sizeof(bde->bde_ifname));
}

/*
 * Obtain statistics about open BPF devices ("peers"). This node may be
 * accessed by the superuser only. Used by netstat(1).
 */
static ssize_t
bpfdev_peers(struct rmib_call * call, struct rmib_node * node __unused,
    struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{
        struct bpfdev *bpf;
        struct bpf_d_ext bde;
        unsigned int slot;
        ssize_t off;
        int r, size, max;

        if (!(call->call_flags & RMIB_FLAG_AUTH))
                return EPERM;

        if (call->call_namelen != 2)
                return EINVAL;

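        /*
         * The two remaining name components are the size of each record to
         * copy out (0 means the full structure) and the maximum number of
         * records to return (0 means no limit).
         */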
        size = call->call_name[0];
        if (size < 0 || (size_t)size > sizeof(bde))
                return EINVAL;
        if (size == 0)
                size = sizeof(bde);
        max = call->call_name[1];

        off = 0;

        for (slot = 0; slot < __arraycount(bpf_array); slot++) {
                bpf = &bpf_array[slot];

                if (!(bpf->bpf_flags & BPFF_IN_USE))
                        continue;

                if (rmib_inrange(oldp, off)) {
                        memset(&bde, 0, sizeof(bde));

                        bpfdev_get_info(&bde, bpf);

                        if ((r = rmib_copyout(oldp, off, &bde, size)) < 0)
                                return r;
                }

                off += sizeof(bde);
                if (max > 0 && --max == 0)
                        break;
        }

        /* No slack needed: netstat(1) resizes its buffer as needed. */
        return off;
}

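/* The character driver callback table for the BPF devices. */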
static const struct chardriver bpfdev_tab = {
        .cdr_open       = bpfdev_open,
        .cdr_close      = bpfdev_close,
        .cdr_read       = bpfdev_read,
        .cdr_write      = bpfdev_write,
        .cdr_ioctl      = bpfdev_ioctl,
        .cdr_cancel     = bpfdev_cancel,
        .cdr_select     = bpfdev_select
};

/*
 * Process a character driver request. Since the LWIP service offers character
 * devices for BPF only, it must be a request for a BPF device.
 */
void
bpfdev_process(message * m_ptr, int ipc_status)
{

        chardriver_process(&bpfdev_tab, m_ptr, ipc_status);
}