Sync with cat.c from netbsd-8
[minix3.git] / minix / net / lwip / bpfdev.c
blob3e12c8dac1dc554fab0bd15bace2ae67f0bb294f
1 /* LWIP service - bpfdev.c - Berkeley Packet Filter (/dev/bpf) interface */
2 /*
3 * BPF is a cloning device: opening /dev/bpf returns a new BPF device which is
4 * independent from any other opened BPF devices. We assume that each BPF
5 * device is used by one single user process, and this implementation therefore
6 * does not support multiple concurrent device calls on the same BPF device.
8 * Packet buffering basically follows the BSD model: each BPF device that is
9 * configured (that is, it has been attached to an interface) has two buffers,
10 * each of the configured size: a store buffer, where new packets are stored,
11 * and a hold buffer, which is typically full and awaiting retrieval through a
12 * read call from userland. The buffers are swapped ("rotated") when the store
13 * buffer is filled up and the hold buffer is empty - if the hold buffer is not
14 * empty either, additional packets are dropped.
16 * These buffers are allocated when the BPF device is attached to an interface.
17 * The interface may later disappear, in which case the BPF device is detached
18 * from it, allowing any final packets to be read before read requests start
19 * returning I/O errors. The buffers are freed only when the device is closed.
22 #include "lwip.h"
23 #include "bpfdev.h"
25 #include <minix/chardriver.h>
26 #include <net/if.h>
27 #include <net/bpfdesc.h>
28 #include <minix/bpf.h>
29 #include <sys/mman.h>
32 * Make sure that our implementation matches the BPF version in the NetBSD
33 * headers. If they change the version number, we may have to make changes
34 * here accordingly.
36 #if BPF_MAJOR_VERSION != 1 || BPF_MINOR_VERSION != 1
37 #error "NetBSD BPF version has changed"
38 #endif
40 /* The number of BPF devices. */
41 #define NR_BPFDEV 16
43 /* BPF receive buffer size: allowed range and default. */
44 #define BPF_BUF_MIN BPF_WORDALIGN(sizeof(struct bpf_hdr))
45 #define BPF_BUF_DEF 32768
46 #define BPF_BUF_MAX 262144
49 * By opening /dev/bpf, one will obtain a cloned device with a different minor
50 * number, which maps to one of the BPF devices.
52 #define BPFDEV_MINOR 0 /* minor number of /dev/bpf */
53 #define BPFDEV_BASE_MINOR 1 /* base minor number for BPF devices */
55 static struct bpfdev {
56 struct bpfdev_link bpf_link; /* structure link, MUST be first */
57 TAILQ_ENTRY(bpfdev) bpf_next; /* next on free or interface list */
58 struct ifdev *bpf_ifdev; /* associated interface, or NULL */
59 unsigned int bpf_flags; /* flags (BPFF_) */
60 size_t bpf_size; /* size of packet buffers */
61 char *bpf_sbuf; /* store buffer (mmap'd, or NULL) */
62 char *bpf_hbuf; /* hold buffer (mmap'd, or NULL) */
63 size_t bpf_slen; /* used part of store buffer */
64 size_t bpf_hlen; /* used part of hold buffer */
65 struct bpf_insn *bpf_filter; /* verified BPF filter, or NULL */
66 size_t bpf_filterlen; /* length of filter, for munmap */
67 pid_t bpf_pid; /* process ID of last using process */
68 clock_t bpf_timeout; /* timeout for read calls (0 = none) */
69 struct { /* state for pending read request */
70 endpoint_t br_endpt; /* reading endpoint, or NONE */
71 cp_grant_id_t br_grant; /* grant for reader's buffer */
72 cdev_id_t br_id; /* read request identifier */
73 minix_timer_t br_timer; /* timer for read timeout */
74 } bpf_read;
75 struct { /* state for pending select request */
76 endpoint_t bs_endpt; /* selecting endpoint, or NONE */
77 unsigned int bs_selops; /* pending select operations */
78 } bpf_select;
79 struct { /* packet capture statistics */
80 uint64_t bs_recv; /* # of packets run through filter */
81 uint64_t bs_drop; /* # of packets dropped: buffer full */
82 uint64_t bs_capt; /* # of packets accepted by filter */
83 } bpf_stat;
84 } bpf_array[NR_BPFDEV];
86 #define BPFF_IN_USE 0x01 /* this BPF device object is in use */
87 #define BPFF_PROMISC 0x02 /* promiscuous mode enabled */
88 #define BPFF_IMMEDIATE 0x04 /* immediate mode is enabled */
89 #define BPFF_SEESENT 0x08 /* also process host-sent packets */
90 #define BPFF_HDRCMPLT 0x10 /* do not fill in link-layer source */
91 #define BPFF_FEEDBACK 0x20 /* feed back written packet as input */
93 static TAILQ_HEAD(, bpfdev_link) bpfl_freelist; /* list of free BPF devices */
95 static struct bpf_stat bpf_stat;
97 static ssize_t bpfdev_peers(struct rmib_call *, struct rmib_node *,
98 struct rmib_oldp *, struct rmib_newp *);
100 /* The CTL_NET NET_BPF subtree. All nodes are dynamically numbered. */
101 static struct rmib_node net_bpf_table[] = {
102 RMIB_INT(RMIB_RO, BPF_BUF_MAX, "maxbufsize",
103 "Maximum size for data capture buffer"), /* TODO: read-write */
104 RMIB_STRUCT(RMIB_RO, sizeof(bpf_stat), &bpf_stat, "stats",
105 "BPF stats"),
106 RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, bpfdev_peers, "peers",
107 "BPF peers"),
110 static struct rmib_node net_bpf_node =
111 RMIB_NODE(RMIB_RO, net_bpf_table, "bpf", "BPF options");
114 * Initialize the BPF module.
116 void
117 bpfdev_init(void)
119 const int mib[] = { CTL_NET, NET_BPF };
120 unsigned int slot;
121 int r;
123 /* Initialize data structures. */
124 TAILQ_INIT(&bpfl_freelist);
126 for (slot = 0; slot < __arraycount(bpf_array); slot++) {
127 bpf_array[slot].bpf_flags = 0;
129 TAILQ_INSERT_TAIL(&bpfl_freelist, &bpf_array[slot].bpf_link,
130 bpfl_next);
133 memset(&bpf_stat, 0, sizeof(bpf_stat));
135 /* Register the "net.bpf" subtree with the MIB service. */
136 if ((r = rmib_register(mib, __arraycount(mib), &net_bpf_node)) != OK)
137 panic("unable to register net.bpf RMIB tree: %d", r);
141 * Given a BPF device object, return the corresponding minor number.
143 static devminor_t
144 bpfdev_get_minor(struct bpfdev * bpfdev)
147 assert(bpfdev != NULL);
149 return BPFDEV_BASE_MINOR + (devminor_t)(bpfdev - bpf_array);
153 * Given a minor number, return the corresponding BPF device object, or NULL if
154 * the minor number does not identify a BPF device.
156 static struct bpfdev *
157 bpfdev_get_by_minor(devminor_t minor)
160 if (minor < BPFDEV_BASE_MINOR ||
161 (unsigned int)minor >= BPFDEV_BASE_MINOR + __arraycount(bpf_array))
162 return NULL;
164 return &bpf_array[minor - BPFDEV_BASE_MINOR];
168 * Open a BPF device, returning a cloned device instance.
170 static int
171 bpfdev_open(devminor_t minor, int access __unused, endpoint_t user_endpt)
173 struct bpfdev_link *bpfl;
174 struct bpfdev *bpf;
176 /* Disallow opening cloned devices through device nodes. */
177 if (minor != BPFDEV_MINOR)
178 return ENXIO;
180 if (TAILQ_EMPTY(&bpfl_freelist))
181 return ENOBUFS;
183 bpfl = TAILQ_FIRST(&bpfl_freelist);
184 TAILQ_REMOVE(&bpfl_freelist, bpfl, bpfl_next);
186 bpf = (struct bpfdev *)bpfl;
188 memset(bpf, 0, sizeof(*bpf));
190 bpf->bpf_flags = BPFF_IN_USE | BPFF_SEESENT;
191 bpf->bpf_size = BPF_BUF_DEF;
192 bpf->bpf_pid = getnpid(user_endpt);
193 bpf->bpf_read.br_endpt = NONE;
194 bpf->bpf_select.bs_endpt = NONE;
196 return CDEV_CLONED | bpfdev_get_minor(bpf);
200 * Close a BPF device.
202 static int
203 bpfdev_close(devminor_t minor)
205 struct bpfdev *bpf;
207 if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
208 return EINVAL;
211 * There cannot possibly be a pending read request, so we never need to
212 * cancel the read timer from here either.
214 assert(bpf->bpf_read.br_endpt == NONE);
216 if (bpf->bpf_sbuf != NULL) {
217 assert(bpf->bpf_hbuf != NULL);
219 if (munmap(bpf->bpf_sbuf, bpf->bpf_size) != 0)
220 panic("munmap failed: %d", -errno);
221 if (munmap(bpf->bpf_hbuf, bpf->bpf_size) != 0)
222 panic("munmap failed: %d", -errno);
224 bpf->bpf_sbuf = NULL;
225 bpf->bpf_hbuf = NULL;
226 } else
227 assert(bpf->bpf_hbuf == NULL);
229 if (bpf->bpf_filter != NULL) {
230 assert(bpf->bpf_filterlen > 0);
232 if (munmap(bpf->bpf_filter, bpf->bpf_filterlen) != 0)
233 panic("munmap failed: %d", -errno);
235 bpf->bpf_filter = NULL;
239 * If the BPF device was attached to an interface, and that interface
240 * has not disappeared in the meantime, detach from it now.
242 if (bpf->bpf_ifdev != NULL) {
243 if (bpf->bpf_flags & BPFF_PROMISC)
244 ifdev_clear_promisc(bpf->bpf_ifdev);
246 ifdev_detach_bpf(bpf->bpf_ifdev, &bpf->bpf_link);
248 bpf->bpf_ifdev = NULL;
251 bpf->bpf_flags = 0; /* mark as no longer in use */
253 TAILQ_INSERT_HEAD(&bpfl_freelist, &bpf->bpf_link, bpfl_next);
255 return OK;
259 * Rotate buffers for the BPF device, by swapping the store buffer and the hold
260 * buffer.
262 static void
263 bpfdev_rotate(struct bpfdev * bpf)
265 char *buf;
266 size_t len;
269 * When rotating, the store buffer may or may not be empty, but the
270 * hold buffer must always be empty.
272 assert(bpf->bpf_hlen == 0);
274 buf = bpf->bpf_sbuf;
275 len = bpf->bpf_slen;
276 bpf->bpf_sbuf = bpf->bpf_hbuf;
277 bpf->bpf_slen = bpf->bpf_hlen;
278 bpf->bpf_hbuf = buf;
279 bpf->bpf_hlen = len;
283 * Test whether any of the given select operations are ready on the BPF device,
284 * and return the set of ready operations.
286 static unsigned int
287 bpfdev_test_select(struct bpfdev * bpf, unsigned int ops)
289 unsigned int ready_ops;
291 ready_ops = 0;
294 * The BPF device is ready for reading if the hold buffer is not empty
295 * (i.e.: the store buffer has been filled up completely and was
296 * therefore rotated) or if immediate mode is set and the store buffer
297 * is not empty (i.e.: any packet is available at all). In the latter
298 * case, the buffers will be rotated during the read. We do not
299 * support applying the read timeout to selects and maintaining state
300 * between the select and the following read, because despite that
301 * libpcap claims that it is the right behavior, that is just insane.
303 if (ops & CDEV_OP_RD) {
304 if (bpf->bpf_ifdev == NULL)
305 ready_ops |= CDEV_OP_RD;
306 else if (bpf->bpf_hlen > 0)
307 ready_ops |= CDEV_OP_RD;
308 else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
309 bpf->bpf_slen > 0)
310 ready_ops |= CDEV_OP_RD;
313 if (ops & CDEV_OP_WR)
314 ready_ops |= CDEV_OP_WR;
316 return ready_ops;
320 * There has been a state change on the BPF device. If now possible, resume a
321 * pending select query, if any.
323 static void
324 bpfdev_resume_select(struct bpfdev * bpf)
326 unsigned int ops, ready_ops;
327 endpoint_t endpt;
329 /* First see if there is a pending select request at all. */
330 if ((endpt = bpf->bpf_select.bs_endpt) == NONE)
331 return;
332 ops = bpf->bpf_select.bs_selops;
334 assert(ops != 0);
336 /* Then see if any of the pending operations are now ready. */
337 if ((ready_ops = bpfdev_test_select(bpf, ops)) == 0)
338 return;
340 /* If so, notify VFS about the ready operations. */
341 chardriver_reply_select(bpf->bpf_select.bs_endpt,
342 bpfdev_get_minor(bpf), ready_ops);
345 * Forget about the ready operations. If that leaves no pending
346 * operations, forget about the select request altogether.
348 if ((bpf->bpf_select.bs_selops &= ~ready_ops) == 0)
349 bpf->bpf_select.bs_endpt = NONE;
353 * There has been a state change on the BPF device. If now possible, resume a
354 * pending read request, if any. If the call is a result of a timeout,
355 * 'is_timeout' is set. In that case, the read request must be resumed with an
356 * EAGAIN error if no packets are available, and the running timer must be
357 * canceled. Otherwise, the resumption is due to a full buffer or a
358 * disappeared interface, and 'is_timeout' is not set. In this case, the read
359 * request must be resumed with an I/O error if no packets are available.
361 static void
362 bpfdev_resume_read(struct bpfdev * bpf, int is_timeout)
364 ssize_t r;
366 assert(bpf->bpf_read.br_endpt != NONE);
369 * If the hold buffer is still empty, see if the store buffer has
370 * any packets to copy out.
372 if (bpf->bpf_hlen == 0)
373 bpfdev_rotate(bpf);
375 /* Return any available packets, or otherwise an error. */
376 if (bpf->bpf_hlen > 0) {
377 assert(bpf->bpf_hlen <= bpf->bpf_size);
379 r = sys_safecopyto(bpf->bpf_read.br_endpt,
380 bpf->bpf_read.br_grant, 0, (vir_bytes)bpf->bpf_hbuf,
381 bpf->bpf_hlen);
383 if (r == OK) {
384 r = (ssize_t)bpf->bpf_hlen;
386 bpf->bpf_hlen = 0;
388 assert(bpf->bpf_slen != bpf->bpf_size);
391 * Allow readers to get the last packets after the
392 * interface has disappeared, before getting errors.
394 if (bpf->bpf_ifdev == NULL)
395 bpfdev_rotate(bpf);
397 } else
398 r = (is_timeout) ? EAGAIN : EIO;
400 chardriver_reply_task(bpf->bpf_read.br_endpt, bpf->bpf_read.br_id, r);
402 bpf->bpf_read.br_endpt = NONE;
404 /* Was there still a timer running? Then cancel it now. */
405 if (bpf->bpf_timeout > 0 && !is_timeout)
406 cancel_timer(&bpf->bpf_read.br_timer);
410 * A read timeout has triggered for the BPF device. Wake up the pending read
411 * request.
413 static void
414 bpfdev_timeout(int arg)
416 struct bpfdev *bpf;
418 assert(arg >= 0 && (unsigned int)arg < __arraycount(bpf_array));
420 bpf = &bpf_array[arg];
422 assert(bpf->bpf_read.br_endpt != NONE);
424 bpfdev_resume_read(bpf, TRUE /*is_timeout*/);
428 * Read from a BPF device.
430 static ssize_t
431 bpfdev_read(devminor_t minor, uint64_t position, endpoint_t endpt,
432 cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
434 struct bpfdev *bpf;
435 ssize_t r;
436 int suspend;
438 if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
439 return EINVAL;
441 /* Allow only one read call at a time. */
442 if (bpf->bpf_read.br_endpt != NONE)
443 return EIO;
445 /* Has this BPF device been configured at all yet? */
446 if (bpf->bpf_sbuf == NULL)
447 return EINVAL;
450 * Does the read call size match the entire buffer size? This is a
451 * ridiculous requirement but it makes our job quite a bit easier..
453 if (size != bpf->bpf_size)
454 return EINVAL;
457 * Following standard receive semantics, if the interface is gone,
458 * return all the packets that were pending before returning an error.
459 * This requires extra buffer rotations after read completion, too.
461 if (bpf->bpf_ifdev == NULL && bpf->bpf_hlen == 0)
462 return EIO;
465 * If immediate mode is not enabled, we should always suspend the read
466 * call if the hold buffer is empty. If immediate mode is enabled, we
467 * should only suspend the read call if both buffers are empty, and
468 * return data from the hold buffer or otherwise the store buffer,
469 * whichever is not empty. A non-blocking call behaves as though
470 * immediate mode is enabled, except it will return EAGAIN instead of
471 * suspending the read call if both buffers are empty. Thus, we may
472 * have to rotate buffers for both immediate mode and non-blocking
473 * calls. The latter is necessary for libpcap to behave correctly.
475 if ((flags & CDEV_NONBLOCK) || (bpf->bpf_flags & BPFF_IMMEDIATE))
476 suspend = (bpf->bpf_hlen == 0 && bpf->bpf_slen == 0);
477 else
478 suspend = (bpf->bpf_hlen == 0);
480 if (suspend) {
481 if (flags & CDEV_NONBLOCK)
482 return EAGAIN;
484 /* Suspend the read call for later. */
485 bpf->bpf_read.br_endpt = endpt;
486 bpf->bpf_read.br_grant = grant;
487 bpf->bpf_read.br_id = id;
489 /* Set a timer if requested. */
490 if (bpf->bpf_timeout > 0)
491 set_timer(&bpf->bpf_read.br_timer, bpf->bpf_timeout,
492 bpfdev_timeout, (int)(bpf - bpf_array));
494 return EDONTREPLY;
497 /* If we get here, either buffer has data; rotate buffers if needed. */
498 if (bpf->bpf_hlen == 0)
499 bpfdev_rotate(bpf);
500 assert(bpf->bpf_hlen > 0);
502 if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)bpf->bpf_hbuf,
503 bpf->bpf_hlen)) != OK)
504 return r;
506 r = (ssize_t)bpf->bpf_hlen;
508 bpf->bpf_hlen = 0;
511 * If the store buffer is exactly full, rotate it now. Also, if the
512 * interface has disappeared, the store buffer will never fill up.
513 * Rotate it so that the application will get any remaining data before
514 * getting errors about the interface being gone.
516 if (bpf->bpf_slen == bpf->bpf_size || bpf->bpf_ifdev == NULL)
517 bpfdev_rotate(bpf);
519 return r;
523 * Write to a BPF device.
525 static ssize_t
526 bpfdev_write(devminor_t minor, uint64_t position, endpoint_t endpt,
527 cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
529 struct bpfdev *bpf;
530 struct pbuf *pbuf, *pptr, *pcopy;
531 size_t off;
532 err_t err;
533 int r;
535 if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
536 return EINVAL;
538 if (bpf->bpf_ifdev == NULL)
539 return EINVAL;
541 /* VFS skips zero-sized I/O calls right now, but that may change. */
542 if (size == 0)
543 return 0; /* nothing to do */
545 if (size > ifdev_get_hdrlen(bpf->bpf_ifdev) +
546 ifdev_get_mtu(bpf->bpf_ifdev))
547 return EMSGSIZE;
549 if ((pbuf = pchain_alloc(PBUF_LINK, size)) == NULL)
550 return ENOMEM;
552 /* TODO: turn this into a series of vector copies. */
553 off = 0;
554 for (pptr = pbuf; pptr != NULL; pptr = pptr->next) {
555 if ((r = sys_safecopyfrom(endpt, grant, off,
556 (vir_bytes)pptr->payload, pptr->len)) != OK) {
557 pbuf_free(pbuf);
559 return r;
561 off += pptr->len;
563 assert(off == size);
566 * In feedback mode, we cannot use the same packet buffers for both
567 * output and input, so make a copy. We do this before calling the
568 * output function, which may change part of the buffers, because the
569 * BSDs take this approach as well.
571 if (bpf->bpf_flags & BPFF_FEEDBACK) {
572 if ((pcopy = pchain_alloc(PBUF_LINK, size)) == NULL) {
573 pbuf_free(pbuf);
575 return ENOMEM;
578 if (pbuf_copy(pcopy, pbuf) != ERR_OK)
579 panic("unexpected pbuf copy failure");
580 } else
581 pcopy = NULL;
583 /* Pass in the packet as output, and free it again. */
584 err = ifdev_output(bpf->bpf_ifdev, pbuf, NULL /*netif*/,
585 TRUE /*to_bpf*/, !!(bpf->bpf_flags & BPFF_HDRCMPLT));
587 pbuf_free(pbuf);
589 /* In feedback mode, pass in the copy as input, if output succeeded. */
590 if (err == ERR_OK && (bpf->bpf_flags & BPFF_FEEDBACK))
591 ifdev_input(bpf->bpf_ifdev, pcopy, NULL /*netif*/,
592 FALSE /*to_bpf*/);
593 else if (pcopy != NULL)
594 pbuf_free(pcopy);
596 return (err == ERR_OK) ? (ssize_t)size : util_convert_err(err);
600 * Attach a BPF device to a network interface, using the interface name given
601 * in an ifreq structure. As side effect, allocate hold and store buffers for
602 * the device. These buffers will stay allocated until the device is closed,
603 * even though the interface may disappear before that. Return OK if the BPF
604 * device was successfully attached to the interface, or a negative error code
605 * otherwise.
607 static int
608 bpfdev_attach(struct bpfdev * bpf, struct ifreq * ifr)
610 struct ifdev *ifdev;
611 void *sbuf, *hbuf;
613 /* Find the interface with the given name. */
614 ifr->ifr_name[sizeof(ifr->ifr_name) - 1] = '\0';
615 if ((ifdev = ifdev_find_by_name(ifr->ifr_name)) == NULL)
616 return ENXIO;
619 * Allocate a store buffer and a hold buffer. Preallocate the memory,
620 * or we might get killed later during low-memory conditions.
622 if ((sbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
623 MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED)
624 return ENOMEM;
626 if ((hbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
627 MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) {
628 (void)munmap(sbuf, bpf->bpf_size);
630 return ENOMEM;
633 bpf->bpf_ifdev = ifdev;
634 bpf->bpf_sbuf = sbuf;
635 bpf->bpf_hbuf = hbuf;
636 assert(bpf->bpf_slen == 0);
637 assert(bpf->bpf_hlen == 0);
639 ifdev_attach_bpf(ifdev, &bpf->bpf_link);
641 return OK;
645 * Detach the BPF device from its interface, which is about to disappear.
647 void
648 bpfdev_detach(struct bpfdev_link * bpfl)
650 struct bpfdev *bpf = (struct bpfdev *)bpfl;
652 assert(bpf->bpf_flags & BPFF_IN_USE);
653 assert(bpf->bpf_ifdev != NULL);
656 * We deliberately leave the buffers allocated here, for two reasons:
658 * 1) it lets applications to read any last packets in the buffers;
659 * 2) it prevents reattaching the BPF device to another interface.
661 bpf->bpf_ifdev = NULL;
664 * Resume pending read and select requests, returning any data left,
665 * or an error if none.
667 if (bpf->bpf_hlen == 0)
668 bpfdev_rotate(bpf);
670 if (bpf->bpf_read.br_endpt != NONE)
671 bpfdev_resume_read(bpf, FALSE /*is_timeout*/);
673 bpfdev_resume_select(bpf);
677 * Flush the given BPF device, resetting its buffer contents and statistics
678 * counters.
680 static void
681 bpfdev_flush(struct bpfdev * bpf)
684 bpf->bpf_slen = 0;
685 bpf->bpf_hlen = 0;
687 bpf->bpf_stat.bs_recv = 0;
688 bpf->bpf_stat.bs_drop = 0;
689 bpf->bpf_stat.bs_capt = 0;
693 * Install a filter program on the BPF device. A new filter replaces any old
694 * one. A zero-sized filter simply clears a previous filter. On success,
695 * perform a flush and return OK. On failure, return a negative error code
696 * without making any modifications to the current filter.
698 static int
699 bpfdev_setfilter(struct bpfdev * bpf, endpoint_t endpt, cp_grant_id_t grant)
701 struct bpf_insn *filter;
702 unsigned int count;
703 size_t len;
704 int r;
706 if ((r = sys_safecopyfrom(endpt, grant,
707 offsetof(struct minix_bpf_program, mbf_len), (vir_bytes)&count,
708 sizeof(count))) != OK)
709 return r;
711 if (count > BPF_MAXINSNS)
712 return EINVAL;
713 len = count * sizeof(struct bpf_insn);
715 if (len > 0) {
716 if ((filter = (struct bpf_insn *)mmap(NULL, len,
717 PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0)) ==
718 MAP_FAILED)
719 return ENOMEM;
721 if ((r = sys_safecopyfrom(endpt, grant,
722 offsetof(struct minix_bpf_program, mbf_insns),
723 (vir_bytes)filter, len)) != OK) {
724 (void)munmap(filter, len);
726 return r;
729 if (!bpf_validate(filter, count)) {
730 (void)munmap(filter, len);
732 return EINVAL;
734 } else
735 filter = NULL;
737 if (bpf->bpf_filter != NULL)
738 (void)munmap(bpf->bpf_filter, bpf->bpf_filterlen);
740 bpf->bpf_filter = filter;
741 bpf->bpf_filterlen = len;
743 bpfdev_flush(bpf);
745 return OK;
749 * Process an I/O control request on the BPF device.
751 static int
752 bpfdev_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
753 cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id)
755 struct bpfdev *bpf;
756 struct bpf_stat bs;
757 struct bpf_version bv;
758 struct bpf_dltlist bfl;
759 struct timeval tv;
760 struct ifreq ifr;
761 unsigned int uval;
762 int r, val;
764 if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
765 return EINVAL;
768 * We do not support multiple concurrent requests in this module. That
769 * not only means that we forbid a read(2) call on a BPF device object
770 * while another read(2) is already pending: we also disallow IOCTL
771 * IOCTL calls while such a read(2) call is in progress. This
772 * restriction should never be a problem for user programs, and allows
773 * us to rely on the fact that that no settings can change between the
774 * start and end of any read call. As a side note, pending select(2)
775 * queries may be similarly affected, and will also not be fully
776 * accurate if any options are changed while pending.
778 if (bpf->bpf_read.br_endpt != NONE)
779 return EIO;
781 bpf->bpf_pid = getnpid(user_endpt);
783 /* These are in order of the NetBSD BIOC.. IOCTL numbers. */
784 switch (request) {
785 case BIOCGBLEN:
786 uval = bpf->bpf_size;
788 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
789 sizeof(uval));
791 case BIOCSBLEN:
792 if (bpf->bpf_sbuf != NULL)
793 return EINVAL;
795 if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
796 sizeof(uval))) != OK)
797 return r;
799 if (uval < BPF_BUF_MIN)
800 uval = BPF_BUF_MIN;
801 else if (uval > BPF_BUF_MAX)
802 uval = BPF_BUF_MAX;
804 /* Is this the right thing to do? It doesn't matter for us. */
805 uval = BPF_WORDALIGN(uval);
807 if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
808 sizeof(uval))) != OK)
809 return r;
811 bpf->bpf_size = uval;
813 return OK;
815 case MINIX_BIOCSETF:
816 return bpfdev_setfilter(bpf, endpt, grant);
818 case BIOCPROMISC:
819 if (bpf->bpf_ifdev == NULL)
820 return EINVAL;
822 if (!(bpf->bpf_flags & BPFF_PROMISC)) {
823 if (!ifdev_set_promisc(bpf->bpf_ifdev))
824 return EINVAL;
826 bpf->bpf_flags |= BPFF_PROMISC;
829 return OK;
831 case BIOCFLUSH:
832 bpfdev_flush(bpf);
834 return OK;
836 case BIOCGDLT:
837 if (bpf->bpf_ifdev == NULL)
838 return EINVAL;
840 /* TODO: support for type configuration per BPF device. */
841 uval = ifdev_get_dlt(bpf->bpf_ifdev);
843 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
844 sizeof(uval));
846 case BIOCGETIF:
847 if (bpf->bpf_ifdev == NULL)
848 return EINVAL;
850 memset(&ifr, 0, sizeof(ifr));
851 strlcpy(ifr.ifr_name, ifdev_get_name(bpf->bpf_ifdev),
852 sizeof(ifr.ifr_name));
854 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&ifr,
855 sizeof(ifr));
857 case BIOCSETIF:
859 * Test on the presence of a buffer rather than on an interface
860 * since the latter may disappear and thus be reset to NULL, in
861 * which case we do not want to allow rebinding to another.
863 if (bpf->bpf_sbuf != NULL)
864 return EINVAL;
866 if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&ifr,
867 sizeof(ifr))) != OK)
868 return r;
870 return bpfdev_attach(bpf, &ifr);
872 case BIOCGSTATS:
874 * Why do we not embed a bpf_stat structure directly in the
875 * BPF device structure? Well, bpf_stat has massive padding..
877 memset(&bs, 0, sizeof(bs));
878 bs.bs_recv = bpf->bpf_stat.bs_recv;
879 bs.bs_drop = bpf->bpf_stat.bs_drop;
880 bs.bs_capt = bpf->bpf_stat.bs_capt;
882 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bs,
883 sizeof(bs));
885 case BIOCIMMEDIATE:
886 if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
887 sizeof(uval))) != OK)
888 return r;
890 if (uval)
891 bpf->bpf_flags |= BPFF_IMMEDIATE;
892 else
893 bpf->bpf_flags &= ~BPFF_IMMEDIATE;
895 return OK;
897 case BIOCVERSION:
898 memset(&bv, 0, sizeof(bv));
899 bv.bv_major = BPF_MAJOR_VERSION;
900 bv.bv_minor = BPF_MINOR_VERSION;
902 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bv,
903 sizeof(bv));
905 case BIOCGHDRCMPLT:
906 uval = !!(bpf->bpf_flags & BPFF_HDRCMPLT);
908 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
909 sizeof(uval));
911 case BIOCSHDRCMPLT:
912 if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
913 sizeof(uval))) != OK)
914 return r;
916 if (uval)
917 bpf->bpf_flags |= BPFF_HDRCMPLT;
918 else
919 bpf->bpf_flags &= ~BPFF_HDRCMPLT;
921 return OK;
923 case BIOCSDLT:
924 if (bpf->bpf_ifdev == NULL)
925 return EINVAL;
927 if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
928 sizeof(uval))) != OK)
929 return r;
931 /* TODO: support for type configuration per BPF device. */
932 if (uval != ifdev_get_dlt(bpf->bpf_ifdev))
933 return EINVAL;
935 return OK;
937 case MINIX_BIOCGDLTLIST:
938 if (bpf->bpf_ifdev == NULL)
939 return EINVAL;
941 if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&bfl,
942 sizeof(bfl))) != OK)
943 return r;
945 if (bfl.bfl_list != NULL) {
946 if (bfl.bfl_len < 1)
947 return ENOMEM;
950 * Copy out the 'list', which consists of one entry.
951 * If we were to produce multiple entries, we would
952 * have to check against the MINIX_BPF_MAXDLT limit.
954 uval = ifdev_get_dlt(bpf->bpf_ifdev);
956 if ((r = sys_safecopyto(endpt, grant,
957 offsetof(struct minix_bpf_dltlist, mbfl_list),
958 (vir_bytes)&uval, sizeof(uval))) != OK)
959 return r;
961 bfl.bfl_len = 1;
963 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bfl,
964 sizeof(bfl));
966 case BIOCGSEESENT:
967 uval = !!(bpf->bpf_flags & BPFF_SEESENT);
969 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
970 sizeof(uval));
972 case BIOCSSEESENT:
973 if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
974 sizeof(uval))) != OK)
975 return r;
977 if (uval)
978 bpf->bpf_flags |= BPFF_SEESENT;
979 else
980 bpf->bpf_flags &= ~BPFF_SEESENT;
982 return OK;
984 case BIOCSRTIMEOUT:
985 if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&tv,
986 sizeof(tv))) != OK)
987 return r;
989 if ((r = util_timeval_to_ticks(&tv, &bpf->bpf_timeout)) != OK)
990 return r;
992 return OK;
994 case BIOCGRTIMEOUT:
995 util_ticks_to_timeval(bpf->bpf_timeout, &tv);
997 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&tv,
998 sizeof(tv));
1000 case BIOCGFEEDBACK:
1001 uval = !!(bpf->bpf_flags & BPFF_FEEDBACK);
1003 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
1004 sizeof(uval));
1006 case BIOCSFEEDBACK:
1007 if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
1008 sizeof(uval))) != OK)
1009 return r;
1011 if (uval)
1012 bpf->bpf_flags |= BPFF_FEEDBACK;
1013 else
1014 bpf->bpf_flags &= ~BPFF_FEEDBACK;
1016 return OK;
1018 case FIONREAD:
1019 val = 0;
1020 if (bpf->bpf_hlen > 0)
1021 val = bpf->bpf_hlen;
1022 else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
1023 bpf->bpf_slen > 0)
1024 val = bpf->bpf_slen;
1025 else
1026 val = 0;
1028 return sys_safecopyto(endpt, grant, 0, (vir_bytes)&val,
1029 sizeof(val));
1031 default:
1032 return ENOTTY;
1037 * Cancel a previously suspended request on a BPF device. Since only read
1038 * requests may be suspended (select is handled differently), the cancel
1039 * request must be for a read request. Note that character devices currently
1040 * (still) behave slightly differently from socket devices here: while socket
1041 * drivers are supposed to respond to the original request, character drivers
1042 * must respond to the original request from the cancel callback.
1044 static int
1045 bpfdev_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id)
1047 struct bpfdev *bpf;
1049 if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
1050 return EDONTREPLY;
1052 /* Is this a cancel request for the currently pending read request? */
1053 if (bpf->bpf_read.br_endpt != endpt || bpf->bpf_read.br_id != id)
1054 return EDONTREPLY;
1056 /* If so, cancel the read request. */
1057 if (bpf->bpf_timeout > 0)
1058 cancel_timer(&bpf->bpf_read.br_timer);
1060 bpf->bpf_read.br_endpt = NONE;
1062 return EINTR; /* the return value for the canceled read request */
1066 * Perform a select query on a BPF device.
1068 static int
1069 bpfdev_select(devminor_t minor, unsigned int ops, endpoint_t endpt)
1071 struct bpfdev *bpf;
1072 unsigned int r, notify;
1074 if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
1075 return EINVAL;
1077 notify = (ops & CDEV_NOTIFY);
1078 ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR);
1080 r = bpfdev_test_select(bpf, ops);
1083 * For the operations that were not immediately ready, if requested,
1084 * save the select request for later.
1086 ops &= ~r;
1088 if (ops != 0 && notify) {
1089 if (bpf->bpf_select.bs_endpt != NONE) {
1090 /* Merge in the operations with any earlier request. */
1091 if (bpf->bpf_select.bs_endpt != endpt)
1092 return EIO;
1093 bpf->bpf_select.bs_selops |= ops;
1094 } else {
1095 bpf->bpf_select.bs_endpt = endpt;
1096 bpf->bpf_select.bs_selops = ops;
1100 return r;
1104 * Process an incoming packet on the interface to which the given BPF device is
1105 * attached. If the packet passes the filter (if any), store as much as
1106 * requested of it in the store buffer, rotating buffers if needed and resuming
1107 * suspended read and select requests as appropriate. This function is also
1108 * called through bpfdev_output() below.
1110 void
1111 bpfdev_input(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
1113 struct bpfdev *bpf = (struct bpfdev *)bpfl;
1114 struct timespec ts;
1115 struct bpf_hdr bh;
1116 const struct pbuf *pptr;
1117 size_t caplen, hdrlen, totlen, off, chunk;
1118 int hfull;
1121 * Apparently bs_recv is the counter of packets that were run through
1122 * the filter, not the number of packets that were or could be received
1123 * by the user (which is what I got from the manual page.. oh well).
1125 bpf->bpf_stat.bs_recv++;
1126 bpf_stat.bs_recv++;
1129 * Run the packet through the BPF device's filter to see whether the
1130 * packet should be stored and if so, how much of it. If no filter is
1131 * set, all packets will be stored in their entirety.
1133 caplen = bpf_filter_ext(bpf->bpf_filter, pbuf, (u_char *)pbuf->payload,
1134 pbuf->tot_len, pbuf->len);
1136 if (caplen == 0)
1137 return; /* no match; ignore packet */
1139 if (caplen > pbuf->tot_len)
1140 caplen = pbuf->tot_len;
1142 /* Truncate packet entries to the full size of the buffers. */
1143 hdrlen = BPF_WORDALIGN(sizeof(bh));
1144 totlen = BPF_WORDALIGN(hdrlen + caplen);
1146 if (totlen > bpf->bpf_size) {
1147 totlen = bpf->bpf_size;
1148 caplen = totlen - hdrlen;
1150 assert(totlen >= hdrlen);
1152 bpf->bpf_stat.bs_capt++;
1153 bpf_stat.bs_capt++;
1155 assert(bpf->bpf_sbuf != NULL);
1156 if (totlen > bpf->bpf_size - bpf->bpf_slen) {
1158 * If the store buffer is full and the hold buffer is not
1159 * empty, we cannot swap the two buffers, and so we must drop
1160 * the current packet.
1162 if (bpf->bpf_hlen > 0) {
1163 bpf->bpf_stat.bs_drop++;
1164 bpf_stat.bs_drop++;
1166 return;
1170 * Rotate the buffers: the hold buffer will now be "full" and
1171 * ready to be read - it may not actually be entirely full, but
1172 * we could not fit this packet and we are not going to deliver
1173 * packets out of order..
1175 bpfdev_rotate(bpf);
1177 hfull = TRUE;
1178 } else
1179 hfull = FALSE;
1182 * Retrieve the capture time for the packet. Ideally this would be
1183 * done only once per accepted packet, but we do not expect many BPF
1184 * devices to be receiving the same packets often enough to make that
1185 * worth it.
1187 clock_time(&ts);
1190 * Copy the packet into the store buffer, including a newly generated
1191 * header. Zero any padding areas, even if strictly not necessary.
1193 memset(&bh, 0, sizeof(bh));
1194 bh.bh_tstamp.tv_sec = ts.tv_sec;
1195 bh.bh_tstamp.tv_usec = ts.tv_nsec / 1000;
1196 bh.bh_caplen = caplen;
1197 bh.bh_datalen = pbuf->tot_len;
1198 bh.bh_hdrlen = hdrlen;
1200 assert(bpf->bpf_sbuf != NULL);
1201 off = bpf->bpf_slen;
1203 memcpy(&bpf->bpf_sbuf[off], &bh, sizeof(bh));
1204 if (hdrlen > sizeof(bh))
1205 memset(&bpf->bpf_sbuf[off + sizeof(bh)], 0,
1206 hdrlen - sizeof(bh));
1207 off += hdrlen;
1209 for (pptr = pbuf; pptr != NULL && caplen > 0; pptr = pptr->next) {
1210 chunk = pptr->len;
1211 if (chunk > caplen)
1212 chunk = caplen;
1214 memcpy(&bpf->bpf_sbuf[off], pptr->payload, chunk);
1216 off += chunk;
1217 caplen -= chunk;
1220 assert(off <= bpf->bpf_slen + totlen);
1221 if (bpf->bpf_slen + totlen > off)
1222 memset(&bpf->bpf_sbuf[off], 0, bpf->bpf_slen + totlen - off);
1224 bpf->bpf_slen += totlen;
1227 * Edge case: if the hold buffer is empty and the store buffer is now
1228 * exactly full, rotate buffers so that the packets can be read
1229 * immediately, without waiting for the next packet to cause rotation.
1231 if (bpf->bpf_hlen == 0 && bpf->bpf_slen == bpf->bpf_size) {
1232 bpfdev_rotate(bpf);
1234 hfull = TRUE;
1238 * If the hold buffer is now full, or if immediate mode is enabled,
1239 * then we now have data to deliver to userland. See if we can wake up
1240 * any read or select call (either but not both here).
1242 if (hfull || (bpf->bpf_flags & BPFF_IMMEDIATE)) {
1243 if (bpf->bpf_read.br_endpt != NONE)
1244 bpfdev_resume_read(bpf, FALSE /*is_timeout*/);
1245 else
1246 bpfdev_resume_select(bpf);
1251 * Process an outgoing packet on the interface to which the given BPF device is
1252 * attached. If the BPF device is configured to capture outgoing packets as
1253 * well, attempt to capture the packet as per bpfdev_input().
1255 void
1256 bpfdev_output(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
1258 struct bpfdev *bpf = (struct bpfdev *)bpfl;
1260 if (bpf->bpf_flags & BPFF_SEESENT)
1261 bpfdev_input(bpfl, pbuf);
1265 * Fill the given 'bde' structure with information about BPF device 'bpf'.
1267 static void
1268 bpfdev_get_info(struct bpf_d_ext * bde, const struct bpfdev * bpf)
1271 bde->bde_bufsize = bpf->bpf_size;
1272 bde->bde_promisc = !!(bpf->bpf_flags & BPFF_PROMISC);
1273 bde->bde_state = BPF_IDLE;
1274 bde->bde_immediate = !!(bpf->bpf_flags & BPFF_IMMEDIATE);
1275 bde->bde_hdrcmplt = !!(bpf->bpf_flags & BPFF_HDRCMPLT);
1276 bde->bde_seesent = !!(bpf->bpf_flags & BPFF_SEESENT);
1278 * NetBSD updates the process ID upon device open, close, ioctl, and
1279 * poll. From those, only open and ioctl make sense for us. Sadly
1280 * there is no way to indicate "no known PID" to netstat(1), so we
1281 * cannot even save just the endpoint and look up the corresponding PID
1282 * later, since the user process may be gone by then.
1284 bde->bde_pid = bpf->bpf_pid;
1285 bde->bde_rcount = bpf->bpf_stat.bs_recv;
1286 bde->bde_dcount = bpf->bpf_stat.bs_drop;
1287 bde->bde_ccount = bpf->bpf_stat.bs_capt;
1288 if (bpf->bpf_ifdev != NULL)
1289 strlcpy(bde->bde_ifname, ifdev_get_name(bpf->bpf_ifdev),
1290 sizeof(bde->bde_ifname));
1294 * Obtain statistics about open BPF devices ("peers"). This node may be
1295 * accessed by the superuser only. Used by netstat(1).
1297 static ssize_t
1298 bpfdev_peers(struct rmib_call * call, struct rmib_node * node __unused,
1299 struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
1301 struct bpfdev *bpf;
1302 struct bpf_d_ext bde;
1303 unsigned int slot;
1304 ssize_t off;
1305 int r, size, max;
1307 if (!(call->call_flags & RMIB_FLAG_AUTH))
1308 return EPERM;
1310 if (call->call_namelen != 2)
1311 return EINVAL;
1313 size = call->call_name[0];
1314 if (size < 0 || (size_t)size > sizeof(bde))
1315 return EINVAL;
1316 if (size == 0)
1317 size = sizeof(bde);
1318 max = call->call_name[1];
1320 off = 0;
1322 for (slot = 0; slot < __arraycount(bpf_array); slot++) {
1323 bpf = &bpf_array[slot];
1325 if (!(bpf->bpf_flags & BPFF_IN_USE))
1326 continue;
1328 if (rmib_inrange(oldp, off)) {
1329 memset(&bde, 0, sizeof(bde));
1331 bpfdev_get_info(&bde, bpf);
1333 if ((r = rmib_copyout(oldp, off, &bde, size)) < 0)
1334 return r;
1337 off += sizeof(bde);
1338 if (max > 0 && --max == 0)
1339 break;
1342 /* No slack needed: netstat(1) resizes its buffer as needed. */
1343 return off;
1346 static const struct chardriver bpfdev_tab = {
1347 .cdr_open = bpfdev_open,
1348 .cdr_close = bpfdev_close,
1349 .cdr_read = bpfdev_read,
1350 .cdr_write = bpfdev_write,
1351 .cdr_ioctl = bpfdev_ioctl,
1352 .cdr_cancel = bpfdev_cancel,
1353 .cdr_select = bpfdev_select
1357 * Process a character driver request. Since the LWIP service offers character
1358 * devices for BPF only, it must be a request for a BPF device.
1360 void
1361 bpfdev_process(message * m_ptr, int ipc_status)
1364 chardriver_process(&bpfdev_tab, m_ptr, ipc_status);