/* LWIP service - bpfdev.c - Berkeley Packet Filter (/dev/bpf) interface */
/*
 * BPF is a cloning device: opening /dev/bpf returns a new BPF device which is
 * independent of any other opened BPF devices.  We assume that each BPF
 * device is used by a single user process, and this implementation therefore
 * does not support multiple concurrent device calls on the same BPF device.
 *
 * Packet buffering basically follows the BSD model: each BPF device that is
 * configured (that is, it has been attached to an interface) has two buffers,
 * each of the configured size: a store buffer, where new packets are stored,
 * and a hold buffer, which is typically full and awaiting retrieval through a
 * read call from userland.  The buffers are swapped ("rotated") when the store
 * buffer is filled up and the hold buffer is empty - if the hold buffer is not
 * empty either, additional packets are dropped.
 *
 * These buffers are allocated when the BPF device is attached to an interface.
 * The interface may later disappear, in which case the BPF device is detached
 * from it, allowing any final packets to be read before read requests start
 * returning I/O errors.  The buffers are freed only when the device is closed.
 */
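/*
 * Rough sketch of the expected userland flow (illustration only, not part of
 * this driver; error handling omitted, and "lo0" is just an example name):
 *
 *	int fd = open("/dev/bpf", O_RDWR);	// yields a cloned BPF device
 *	u_int blen;
 *	ioctl(fd, BIOCGBLEN, &blen);		// learn the buffer size
 *	struct ifreq ifr;
 *	strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);		// attach to an interface
 *	char *buf = malloc(blen);
 *	ssize_t len = read(fd, buf, blen);	// must pass the full size
 */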
#include <minix/chardriver.h>
#include <net/bpfdesc.h>
#include <minix/bpf.h>
/*
 * Make sure that our implementation matches the BPF version in the NetBSD
 * headers.  If they change the version number, we may have to make changes
 * here as well.
 */
#if BPF_MAJOR_VERSION != 1 || BPF_MINOR_VERSION != 1
#error "NetBSD BPF version has changed"
#endif
/* The number of BPF devices. */

/* BPF receive buffer size: allowed range and default. */
#define BPF_BUF_MIN	BPF_WORDALIGN(sizeof(struct bpf_hdr))
#define BPF_BUF_DEF	32768
#define BPF_BUF_MAX	262144
/*
 * By opening /dev/bpf, one will obtain a cloned device with a different minor
 * number, which maps to one of the BPF devices.
 */
#define BPFDEV_MINOR		0	/* minor number of /dev/bpf */
#define BPFDEV_BASE_MINOR	1	/* base minor number for BPF devices */
static struct bpfdev {
	struct bpfdev_link bpf_link;	/* structure link, MUST be first */
	TAILQ_ENTRY(bpfdev) bpf_next;	/* next on free or interface list */
	struct ifdev *bpf_ifdev;	/* associated interface, or NULL */
	unsigned int bpf_flags;		/* flags (BPFF_) */
	size_t bpf_size;		/* size of packet buffers */
	char *bpf_sbuf;			/* store buffer (mmap'd, or NULL) */
	char *bpf_hbuf;			/* hold buffer (mmap'd, or NULL) */
	size_t bpf_slen;		/* used part of store buffer */
	size_t bpf_hlen;		/* used part of hold buffer */
	struct bpf_insn *bpf_filter;	/* verified BPF filter, or NULL */
	size_t bpf_filterlen;		/* length of filter, for munmap */
	pid_t bpf_pid;			/* process ID of last using process */
	clock_t bpf_timeout;		/* timeout for read calls (0 = none) */
	struct {			/* state for pending read request */
		endpoint_t br_endpt;	/* reading endpoint, or NONE */
		cp_grant_id_t br_grant;	/* grant for reader's buffer */
		cdev_id_t br_id;	/* read request identifier */
		minix_timer_t br_timer;	/* timer for read timeout */
	} bpf_read;
	struct {			/* state for pending select request */
		endpoint_t bs_endpt;	/* selecting endpoint, or NONE */
		unsigned int bs_selops;	/* pending select operations */
	} bpf_select;
	struct {			/* packet capture statistics */
		uint64_t bs_recv;	/* # of packets run through filter */
		uint64_t bs_drop;	/* # of packets dropped: buffer full */
		uint64_t bs_capt;	/* # of packets accepted by filter */
	} bpf_stat;
} bpf_array[NR_BPFDEV];
#define BPFF_IN_USE	0x01	/* this BPF device object is in use */
#define BPFF_PROMISC	0x02	/* promiscuous mode enabled */
#define BPFF_IMMEDIATE	0x04	/* immediate mode is enabled */
#define BPFF_SEESENT	0x08	/* also process host-sent packets */
#define BPFF_HDRCMPLT	0x10	/* do not fill in link-layer source */
#define BPFF_FEEDBACK	0x20	/* feed back written packet as input */
static TAILQ_HEAD(, bpfdev_link) bpfl_freelist;	/* list of free BPF devices */

static struct bpf_stat bpf_stat;

static ssize_t bpfdev_peers(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);
/* The CTL_NET NET_BPF subtree.  All nodes are dynamically numbered. */
static struct rmib_node net_bpf_table[] = {
	RMIB_INT(RMIB_RO, BPF_BUF_MAX, "maxbufsize",
	    "Maximum size for data capture buffer"), /* TODO: read-write */
	RMIB_STRUCT(RMIB_RO, sizeof(bpf_stat), &bpf_stat, "stats",
	    "BPF stats"),
	RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, bpfdev_peers, "peers",
	    "BPF peers"),
};

static struct rmib_node net_bpf_node =
    RMIB_NODE(RMIB_RO, net_bpf_table, "bpf", "BPF options");
/*
 * Initialize the BPF module.
 */
void
bpfdev_init(void)
{
	const int mib[] = { CTL_NET, NET_BPF };
	unsigned int slot;
	int r;

	/* Initialize data structures. */
	TAILQ_INIT(&bpfl_freelist);

	for (slot = 0; slot < __arraycount(bpf_array); slot++) {
		bpf_array[slot].bpf_flags = 0;

		TAILQ_INSERT_TAIL(&bpfl_freelist, &bpf_array[slot].bpf_link,
		    bpfl_next);
	}

	memset(&bpf_stat, 0, sizeof(bpf_stat));

	/* Register the "net.bpf" subtree with the MIB service. */
	if ((r = rmib_register(mib, __arraycount(mib), &net_bpf_node)) != OK)
		panic("unable to register net.bpf RMIB tree: %d", r);
}
/*
 * Given a BPF device object, return the corresponding minor number.
 */
static devminor_t
bpfdev_get_minor(struct bpfdev * bpfdev)
{

	assert(bpfdev != NULL);

	return BPFDEV_BASE_MINOR + (devminor_t)(bpfdev - bpf_array);
}
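/*
 * For example, bpf_array[0] maps to minor number BPFDEV_BASE_MINOR (1), since
 * minor number 0 remains reserved for the /dev/bpf cloning device itself.
 */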
/*
 * Given a minor number, return the corresponding BPF device object, or NULL if
 * the minor number does not identify a BPF device.
 */
static struct bpfdev *
bpfdev_get_by_minor(devminor_t minor)
{

	if (minor < BPFDEV_BASE_MINOR ||
	    (unsigned int)minor >= BPFDEV_BASE_MINOR + __arraycount(bpf_array))
		return NULL;

	return &bpf_array[minor - BPFDEV_BASE_MINOR];
}
/*
 * Open a BPF device, returning a cloned device instance.
 */
static int
bpfdev_open(devminor_t minor, int access __unused, endpoint_t user_endpt)
{
	struct bpfdev_link *bpfl;
	struct bpfdev *bpf;

	/* Disallow opening cloned devices through device nodes. */
	if (minor != BPFDEV_MINOR)
		return ENXIO;

	if (TAILQ_EMPTY(&bpfl_freelist))
		return ENOBUFS;

	bpfl = TAILQ_FIRST(&bpfl_freelist);
	TAILQ_REMOVE(&bpfl_freelist, bpfl, bpfl_next);

	bpf = (struct bpfdev *)bpfl;

	memset(bpf, 0, sizeof(*bpf));

	bpf->bpf_flags = BPFF_IN_USE | BPFF_SEESENT;
	bpf->bpf_size = BPF_BUF_DEF;
	bpf->bpf_pid = getnpid(user_endpt);
	bpf->bpf_read.br_endpt = NONE;
	bpf->bpf_select.bs_endpt = NONE;

	return CDEV_CLONED | bpfdev_get_minor(bpf);
}
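/*
 * Note: returning CDEV_CLONED combined with the new minor number tells VFS
 * that the open resulted in a cloned device instance, so that all further
 * calls on the caller's file descriptor are directed to that per-device minor
 * rather than to /dev/bpf itself.
 */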
/*
 * Close a BPF device.
 */
static int
bpfdev_close(devminor_t minor)
{
	struct bpfdev *bpf;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	/*
	 * There cannot possibly be a pending read request, so we never need to
	 * cancel the read timer from here either.
	 */
	assert(bpf->bpf_read.br_endpt == NONE);

	if (bpf->bpf_sbuf != NULL) {
		assert(bpf->bpf_hbuf != NULL);

		if (munmap(bpf->bpf_sbuf, bpf->bpf_size) != 0)
			panic("munmap failed: %d", -errno);
		if (munmap(bpf->bpf_hbuf, bpf->bpf_size) != 0)
			panic("munmap failed: %d", -errno);

		bpf->bpf_sbuf = NULL;
		bpf->bpf_hbuf = NULL;
	} else
		assert(bpf->bpf_hbuf == NULL);

	if (bpf->bpf_filter != NULL) {
		assert(bpf->bpf_filterlen > 0);

		if (munmap(bpf->bpf_filter, bpf->bpf_filterlen) != 0)
			panic("munmap failed: %d", -errno);

		bpf->bpf_filter = NULL;
	}

	/*
	 * If the BPF device was attached to an interface, and that interface
	 * has not disappeared in the meantime, detach from it now.
	 */
	if (bpf->bpf_ifdev != NULL) {
		if (bpf->bpf_flags & BPFF_PROMISC)
			ifdev_clear_promisc(bpf->bpf_ifdev);

		ifdev_detach_bpf(bpf->bpf_ifdev, &bpf->bpf_link);

		bpf->bpf_ifdev = NULL;
	}

	bpf->bpf_flags = 0;		/* mark as no longer in use */

	TAILQ_INSERT_HEAD(&bpfl_freelist, &bpf->bpf_link, bpfl_next);

	return OK;
}
/*
 * Rotate buffers for the BPF device, by swapping the store buffer and the hold
 * buffer.
 */
static void
bpfdev_rotate(struct bpfdev * bpf)
{
	char *buf;
	size_t len;

	/*
	 * When rotating, the store buffer may or may not be empty, but the
	 * hold buffer must always be empty.
	 */
	assert(bpf->bpf_hlen == 0);

	buf = bpf->bpf_sbuf;
	len = bpf->bpf_slen;
	bpf->bpf_sbuf = bpf->bpf_hbuf;
	bpf->bpf_slen = bpf->bpf_hlen;
	bpf->bpf_hbuf = buf;
	bpf->bpf_hlen = len;
}
/*
 * Test whether any of the given select operations are ready on the BPF device,
 * and return the set of ready operations.
 */
static unsigned int
bpfdev_test_select(struct bpfdev * bpf, unsigned int ops)
{
	unsigned int ready_ops;

	ready_ops = 0;

	/*
	 * The BPF device is ready for reading if the hold buffer is not empty
	 * (i.e.: the store buffer has been filled up completely and was
	 * therefore rotated) or if immediate mode is set and the store buffer
	 * is not empty (i.e.: any packet is available at all).  In the latter
	 * case, the buffers will be rotated during the read.  We do not
	 * support applying the read timeout to selects and maintaining state
	 * between the select and the following read, because even though
	 * libpcap claims that this is the right behavior, it is just insane.
	 */
	if (ops & CDEV_OP_RD) {
		if (bpf->bpf_ifdev == NULL)
			ready_ops |= CDEV_OP_RD;
		else if (bpf->bpf_hlen > 0)
			ready_ops |= CDEV_OP_RD;
		else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
		    bpf->bpf_slen > 0)
			ready_ops |= CDEV_OP_RD;
	}

	if (ops & CDEV_OP_WR)
		ready_ops |= CDEV_OP_WR;

	return ready_ops;
}
/*
 * There has been a state change on the BPF device.  If now possible, resume a
 * pending select query, if any.
 */
static void
bpfdev_resume_select(struct bpfdev * bpf)
{
	unsigned int ops, ready_ops;
	endpoint_t endpt;

	/* First see if there is a pending select request at all. */
	if ((endpt = bpf->bpf_select.bs_endpt) == NONE)
		return;
	ops = bpf->bpf_select.bs_selops;

	/* Then see if any of the pending operations are now ready. */
	if ((ready_ops = bpfdev_test_select(bpf, ops)) == 0)
		return;

	/* If so, notify VFS about the ready operations. */
	chardriver_reply_select(bpf->bpf_select.bs_endpt,
	    bpfdev_get_minor(bpf), ready_ops);

	/*
	 * Forget about the ready operations.  If that leaves no pending
	 * operations, forget about the select request altogether.
	 */
	if ((bpf->bpf_select.bs_selops &= ~ready_ops) == 0)
		bpf->bpf_select.bs_endpt = NONE;
}
/*
 * There has been a state change on the BPF device.  If now possible, resume a
 * pending read request, if any.  If the call is a result of a timeout,
 * 'is_timeout' is set.  In that case, the read request must be resumed with an
 * EAGAIN error if no packets are available, and the running timer must be
 * canceled.  Otherwise, the resumption is due to a full buffer or a
 * disappeared interface, and 'is_timeout' is not set.  In this case, the read
 * request must be resumed with an I/O error if no packets are available.
 */
static void
bpfdev_resume_read(struct bpfdev * bpf, int is_timeout)
{
	ssize_t r;

	assert(bpf->bpf_read.br_endpt != NONE);

	/*
	 * If the hold buffer is still empty, see if the store buffer has
	 * any packets to copy out.
	 */
	if (bpf->bpf_hlen == 0)
		bpfdev_rotate(bpf);

	/* Return any available packets, or otherwise an error. */
	if (bpf->bpf_hlen > 0) {
		assert(bpf->bpf_hlen <= bpf->bpf_size);

		r = sys_safecopyto(bpf->bpf_read.br_endpt,
		    bpf->bpf_read.br_grant, 0, (vir_bytes)bpf->bpf_hbuf,
		    bpf->bpf_hlen);

		if (r == OK) {
			r = (ssize_t)bpf->bpf_hlen;

			bpf->bpf_hlen = 0;

			assert(bpf->bpf_slen != bpf->bpf_size);

			/*
			 * Allow readers to get the last packets after the
			 * interface has disappeared, before getting errors.
			 */
			if (bpf->bpf_ifdev == NULL)
				bpfdev_rotate(bpf);
		}
	} else
		r = (is_timeout) ? EAGAIN : EIO;

	chardriver_reply_task(bpf->bpf_read.br_endpt, bpf->bpf_read.br_id, r);

	bpf->bpf_read.br_endpt = NONE;

	/* Was there still a timer running?  Then cancel it now. */
	if (bpf->bpf_timeout > 0 && !is_timeout)
		cancel_timer(&bpf->bpf_read.br_timer);
}
/*
 * A read timeout has triggered for the BPF device.  Wake up the pending read
 * request.
 */
static void
bpfdev_timeout(int arg)
{
	struct bpfdev *bpf;

	assert(arg >= 0 && (unsigned int)arg < __arraycount(bpf_array));

	bpf = &bpf_array[arg];

	assert(bpf->bpf_read.br_endpt != NONE);

	bpfdev_resume_read(bpf, TRUE /*is_timeout*/);
}
/*
 * Read from a BPF device.
 */
static ssize_t
bpfdev_read(devminor_t minor, uint64_t position, endpoint_t endpt,
	cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
{
	struct bpfdev *bpf;
	ssize_t r;
	int suspend;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	/* Allow only one read call at a time. */
	if (bpf->bpf_read.br_endpt != NONE)
		return EIO;

	/* Has this BPF device been configured at all yet? */
	if (bpf->bpf_sbuf == NULL)
		return EINVAL;

	/*
	 * Does the read call size match the entire buffer size?  This is a
	 * ridiculous requirement but it makes our job quite a bit easier..
	 */
	if (size != bpf->bpf_size)
		return EINVAL;

	/*
	 * Following standard receive semantics, if the interface is gone,
	 * return all the packets that were pending before returning an error.
	 * This requires extra buffer rotations after read completion, too.
	 */
	if (bpf->bpf_ifdev == NULL && bpf->bpf_hlen == 0)
		return EIO;

	/*
	 * If immediate mode is not enabled, we should always suspend the read
	 * call if the hold buffer is empty.  If immediate mode is enabled, we
	 * should only suspend the read call if both buffers are empty, and
	 * return data from the hold buffer or otherwise the store buffer,
	 * whichever is not empty.  A non-blocking call behaves as though
	 * immediate mode is enabled, except it will return EAGAIN instead of
	 * suspending the read call if both buffers are empty.  Thus, we may
	 * have to rotate buffers for both immediate mode and non-blocking
	 * calls.  The latter is necessary for libpcap to behave correctly.
	 */
	if ((flags & CDEV_NONBLOCK) || (bpf->bpf_flags & BPFF_IMMEDIATE))
		suspend = (bpf->bpf_hlen == 0 && bpf->bpf_slen == 0);
	else
		suspend = (bpf->bpf_hlen == 0);

	if (suspend) {
		if (flags & CDEV_NONBLOCK)
			return EAGAIN;

		/* Suspend the read call for later. */
		bpf->bpf_read.br_endpt = endpt;
		bpf->bpf_read.br_grant = grant;
		bpf->bpf_read.br_id = id;

		/* Set a timer if requested. */
		if (bpf->bpf_timeout > 0)
			set_timer(&bpf->bpf_read.br_timer, bpf->bpf_timeout,
			    bpfdev_timeout, (int)(bpf - bpf_array));

		return EDONTREPLY;
	}

	/* If we get here, either buffer has data; rotate buffers if needed. */
	if (bpf->bpf_hlen == 0)
		bpfdev_rotate(bpf);
	assert(bpf->bpf_hlen > 0);

	if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)bpf->bpf_hbuf,
	    bpf->bpf_hlen)) != OK)
		return r;

	r = (ssize_t)bpf->bpf_hlen;

	bpf->bpf_hlen = 0;

	/*
	 * If the store buffer is exactly full, rotate it now.  Also, if the
	 * interface has disappeared, the store buffer will never fill up.
	 * Rotate it so that the application will get any remaining data before
	 * getting errors about the interface being gone.
	 */
	if (bpf->bpf_slen == bpf->bpf_size || bpf->bpf_ifdev == NULL)
		bpfdev_rotate(bpf);

	return r;
}
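/*
 * Sketch of how a consumer is expected to walk the buffer returned by the read
 * call above (illustration only, not part of this driver): each record starts
 * with a struct bpf_hdr, and the next record follows at a word-aligned offset.
 *
 *	char *p = buf;
 *	while (p < buf + len) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)p;
 *		// the packet data is at p + bh->bh_hdrlen, bh->bh_caplen bytes
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 */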
/*
 * Write to a BPF device.
 */
static ssize_t
bpfdev_write(devminor_t minor, uint64_t position, endpoint_t endpt,
	cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
{
	struct bpfdev *bpf;
	struct pbuf *pbuf, *pptr, *pcopy;
	size_t off;
	err_t err;
	int r;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	if (bpf->bpf_ifdev == NULL)
		return EINVAL;

	/* VFS skips zero-sized I/O calls right now, but that may change. */
	if (size == 0)
		return 0;	/* nothing to do */

	if (size > ifdev_get_hdrlen(bpf->bpf_ifdev) +
	    ifdev_get_mtu(bpf->bpf_ifdev))
		return EMSGSIZE;

	if ((pbuf = pchain_alloc(PBUF_LINK, size)) == NULL)
		return ENOMEM;

	/* TODO: turn this into a series of vector copies. */
	off = 0;
	for (pptr = pbuf; pptr != NULL; pptr = pptr->next) {
		if ((r = sys_safecopyfrom(endpt, grant, off,
		    (vir_bytes)pptr->payload, pptr->len)) != OK) {
			pbuf_free(pbuf);

			return r;
		}
		off += pptr->len;
	}

	/*
	 * In feedback mode, we cannot use the same packet buffers for both
	 * output and input, so make a copy.  We do this before calling the
	 * output function, which may change part of the buffers, because the
	 * BSDs take this approach as well.
	 */
	if (bpf->bpf_flags & BPFF_FEEDBACK) {
		if ((pcopy = pchain_alloc(PBUF_LINK, size)) == NULL) {
			pbuf_free(pbuf);

			return ENOMEM;
		}

		if (pbuf_copy(pcopy, pbuf) != ERR_OK)
			panic("unexpected pbuf copy failure");
	} else
		pcopy = NULL;

	/* Pass in the packet as output, and free it again. */
	err = ifdev_output(bpf->bpf_ifdev, pbuf, NULL /*netif*/,
	    TRUE /*to_bpf*/, !!(bpf->bpf_flags & BPFF_HDRCMPLT));

	pbuf_free(pbuf);

	/* In feedback mode, pass in the copy as input, if output succeeded. */
	if (err == ERR_OK && (bpf->bpf_flags & BPFF_FEEDBACK))
		ifdev_input(bpf->bpf_ifdev, pcopy, NULL /*netif*/,
		    TRUE /*to_bpf*/);
	else if (pcopy != NULL)
		pbuf_free(pcopy);

	return (err == ERR_OK) ? (ssize_t)size : util_convert_err(err);
}
/*
 * Attach a BPF device to a network interface, using the interface name given
 * in an ifreq structure.  As side effect, allocate hold and store buffers for
 * the device.  These buffers will stay allocated until the device is closed,
 * even though the interface may disappear before that.  Return OK if the BPF
 * device was successfully attached to the interface, or a negative error code
 * if not.
 */
static int
bpfdev_attach(struct bpfdev * bpf, struct ifreq * ifr)
{
	struct ifdev *ifdev;
	char *sbuf, *hbuf;

	/* Find the interface with the given name. */
	ifr->ifr_name[sizeof(ifr->ifr_name) - 1] = '\0';
	if ((ifdev = ifdev_find_by_name(ifr->ifr_name)) == NULL)
		return ENXIO;

	/*
	 * Allocate a store buffer and a hold buffer.  Preallocate the memory,
	 * or we might get killed later during low-memory conditions.
	 */
	if ((sbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED)
		return ENOMEM;

	if ((hbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) {
		(void)munmap(sbuf, bpf->bpf_size);

		return ENOMEM;
	}

	bpf->bpf_ifdev = ifdev;
	bpf->bpf_sbuf = sbuf;
	bpf->bpf_hbuf = hbuf;
	assert(bpf->bpf_slen == 0);
	assert(bpf->bpf_hlen == 0);

	ifdev_attach_bpf(ifdev, &bpf->bpf_link);

	return OK;
}
/*
 * Detach the BPF device from its interface, which is about to disappear.
 */
void
bpfdev_detach(struct bpfdev_link * bpfl)
{
	struct bpfdev *bpf = (struct bpfdev *)bpfl;

	assert(bpf->bpf_flags & BPFF_IN_USE);
	assert(bpf->bpf_ifdev != NULL);

	/*
	 * We deliberately leave the buffers allocated here, for two reasons:
	 *
	 * 1) it lets applications read any last packets in the buffers;
	 * 2) it prevents reattaching the BPF device to another interface.
	 */
	bpf->bpf_ifdev = NULL;

	/*
	 * Resume pending read and select requests, returning any data left,
	 * or an error if none.
	 */
	if (bpf->bpf_hlen == 0)
		bpfdev_rotate(bpf);

	if (bpf->bpf_read.br_endpt != NONE)
		bpfdev_resume_read(bpf, FALSE /*is_timeout*/);

	bpfdev_resume_select(bpf);
}
/*
 * Flush the given BPF device, resetting its buffer contents and statistics
 * counters.
 */
static void
bpfdev_flush(struct bpfdev * bpf)
{

	bpf->bpf_slen = 0;
	bpf->bpf_hlen = 0;

	bpf->bpf_stat.bs_recv = 0;
	bpf->bpf_stat.bs_drop = 0;
	bpf->bpf_stat.bs_capt = 0;
}
/*
 * Install a filter program on the BPF device.  A new filter replaces any old
 * one.  A zero-sized filter simply clears a previous filter.  On success,
 * perform a flush and return OK.  On failure, return a negative error code
 * without making any modifications to the current filter.
 */
static int
bpfdev_setfilter(struct bpfdev * bpf, endpoint_t endpt, cp_grant_id_t grant)
{
	struct bpf_insn *filter;
	unsigned int count;
	size_t len;
	int r;

	if ((r = sys_safecopyfrom(endpt, grant,
	    offsetof(struct minix_bpf_program, mbf_len), (vir_bytes)&count,
	    sizeof(count))) != OK)
		return r;

	if (count > BPF_MAXINSNS)
		return EINVAL;
	len = count * sizeof(struct bpf_insn);

	if (len > 0) {
		if ((filter = (struct bpf_insn *)mmap(NULL, len,
		    PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0)) ==
		    MAP_FAILED)
			return ENOMEM;

		if ((r = sys_safecopyfrom(endpt, grant,
		    offsetof(struct minix_bpf_program, mbf_insns),
		    (vir_bytes)filter, len)) != OK) {
			(void)munmap(filter, len);

			return r;
		}

		if (!bpf_validate(filter, count)) {
			(void)munmap(filter, len);

			return EINVAL;
		}
	} else
		filter = NULL;

	if (bpf->bpf_filter != NULL)
		(void)munmap(bpf->bpf_filter, bpf->bpf_filterlen);

	bpf->bpf_filter = filter;
	bpf->bpf_filterlen = len;

	bpfdev_flush(bpf);

	return OK;
}
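/*
 * For illustration only (not used by this driver): the simplest valid filter
 * program that accepts every packet in full is a single return instruction,
 *
 *	struct bpf_insn accept_all[] = {
 *		BPF_STMT(BPF_RET | BPF_K, (u_int)-1),
 *	};
 *
 * passed in as a one-instruction minix_bpf_program.  An empty program (zero
 * instructions) instead clears any installed filter, after which all packets
 * are captured in their entirety.
 */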
/*
 * Process an I/O control request on the BPF device.
 */
static int
bpfdev_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
	cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id)
{
	struct bpfdev *bpf;
	struct bpf_stat bs;
	struct bpf_version bv;
	struct bpf_dltlist bfl;
	struct timeval tv;
	struct ifreq ifr;
	unsigned int uval;
	int r, val;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	/*
	 * We do not support multiple concurrent requests in this module.  That
	 * not only means that we forbid a read(2) call on a BPF device object
	 * while another read(2) is already pending: we also disallow IOCTL
	 * calls while such a read(2) call is in progress.  This restriction
	 * should never be a problem for user programs, and allows us to rely
	 * on the fact that no settings can change between the start and end
	 * of any read call.  As a side note, pending select(2) queries may be
	 * similarly affected, and will also not be fully accurate if any
	 * options are changed while pending.
	 */
	if (bpf->bpf_read.br_endpt != NONE)
		return EIO;

	bpf->bpf_pid = getnpid(user_endpt);

	/* These are in order of the NetBSD BIOC.. IOCTL numbers. */
	switch (request) {
	case BIOCGBLEN:
		uval = bpf->bpf_size;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSBLEN:
		if (bpf->bpf_sbuf != NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval < BPF_BUF_MIN)
			uval = BPF_BUF_MIN;
		else if (uval > BPF_BUF_MAX)
			uval = BPF_BUF_MAX;

		/* Is this the right thing to do?  It doesn't matter for us. */
		uval = BPF_WORDALIGN(uval);

		if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		bpf->bpf_size = uval;

		return OK;

	case MINIX_BIOCSETF:
		return bpfdev_setfilter(bpf, endpt, grant);

	case BIOCPROMISC:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		if (!(bpf->bpf_flags & BPFF_PROMISC)) {
			if (!ifdev_set_promisc(bpf->bpf_ifdev))
				return EINVAL;

			bpf->bpf_flags |= BPFF_PROMISC;
		}

		return OK;

	case BIOCFLUSH:
		bpfdev_flush(bpf);

		return OK;

	case BIOCGDLT:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		/* TODO: support for type configuration per BPF device. */
		uval = ifdev_get_dlt(bpf->bpf_ifdev);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCGETIF:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, ifdev_get_name(bpf->bpf_ifdev),
		    sizeof(ifr.ifr_name));

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&ifr,
		    sizeof(ifr));

	case BIOCSETIF:
		/*
		 * Test on the presence of a buffer rather than on an interface
		 * since the latter may disappear and thus be reset to NULL, in
		 * which case we do not want to allow rebinding to another.
		 */
		if (bpf->bpf_sbuf != NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&ifr,
		    sizeof(ifr))) != OK)
			return r;

		return bpfdev_attach(bpf, &ifr);

	case BIOCGSTATS:
		/*
		 * Why do we not embed a bpf_stat structure directly in the
		 * BPF device structure?  Well, bpf_stat has massive padding..
		 */
		memset(&bs, 0, sizeof(bs));
		bs.bs_recv = bpf->bpf_stat.bs_recv;
		bs.bs_drop = bpf->bpf_stat.bs_drop;
		bs.bs_capt = bpf->bpf_stat.bs_capt;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bs,
		    sizeof(bs));

	case BIOCIMMEDIATE:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_IMMEDIATE;
		else
			bpf->bpf_flags &= ~BPFF_IMMEDIATE;

		return OK;

	case BIOCVERSION:
		memset(&bv, 0, sizeof(bv));
		bv.bv_major = BPF_MAJOR_VERSION;
		bv.bv_minor = BPF_MINOR_VERSION;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bv,
		    sizeof(bv));

	case BIOCGHDRCMPLT:
		uval = !!(bpf->bpf_flags & BPFF_HDRCMPLT);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSHDRCMPLT:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_HDRCMPLT;
		else
			bpf->bpf_flags &= ~BPFF_HDRCMPLT;

		return OK;

	case BIOCSDLT:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		/* TODO: support for type configuration per BPF device. */
		if (uval != ifdev_get_dlt(bpf->bpf_ifdev))
			return EINVAL;

		return OK;

	case MINIX_BIOCGDLTLIST:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&bfl,
		    sizeof(bfl))) != OK)
			return r;

		if (bfl.bfl_list != NULL) {
			if (bfl.bfl_len < 1)
				return ENOMEM;

			/*
			 * Copy out the 'list', which consists of one entry.
			 * If we were to produce multiple entries, we would
			 * have to check against the MINIX_BPF_MAXDLT limit.
			 */
			uval = ifdev_get_dlt(bpf->bpf_ifdev);

			if ((r = sys_safecopyto(endpt, grant,
			    offsetof(struct minix_bpf_dltlist, mbfl_list),
			    (vir_bytes)&uval, sizeof(uval))) != OK)
				return r;
		}

		bfl.bfl_len = 1;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bfl,
		    sizeof(bfl));

	case BIOCGSEESENT:
		uval = !!(bpf->bpf_flags & BPFF_SEESENT);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSSEESENT:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_SEESENT;
		else
			bpf->bpf_flags &= ~BPFF_SEESENT;

		return OK;

	case BIOCSRTIMEOUT:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&tv,
		    sizeof(tv))) != OK)
			return r;

		if ((r = util_timeval_to_ticks(&tv, &bpf->bpf_timeout)) != OK)
			return r;

		return OK;

	case BIOCGRTIMEOUT:
		util_ticks_to_timeval(bpf->bpf_timeout, &tv);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&tv,
		    sizeof(tv));

	case BIOCGFEEDBACK:
		uval = !!(bpf->bpf_flags & BPFF_FEEDBACK);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSFEEDBACK:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_FEEDBACK;
		else
			bpf->bpf_flags &= ~BPFF_FEEDBACK;

		return OK;

	case FIONREAD:
		val = 0;
		if (bpf->bpf_hlen > 0)
			val = bpf->bpf_hlen;
		else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
		    bpf->bpf_slen > 0)
			val = bpf->bpf_slen;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&val,
		    sizeof(val));

	default:
		return ENOTTY;
	}
}
/*
 * Cancel a previously suspended request on a BPF device.  Since only read
 * requests may be suspended (select is handled differently), the cancel
 * request must be for a read request.  Note that character devices currently
 * (still) behave slightly differently from socket devices here: while socket
 * drivers are supposed to respond to the original request, character drivers
 * must respond to the original request from the cancel callback.
 */
static int
bpfdev_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id)
{
	struct bpfdev *bpf;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EDONTREPLY;

	/* Is this a cancel request for the currently pending read request? */
	if (bpf->bpf_read.br_endpt != endpt || bpf->bpf_read.br_id != id)
		return EDONTREPLY;

	/* If so, cancel the read request. */
	if (bpf->bpf_timeout > 0)
		cancel_timer(&bpf->bpf_read.br_timer);

	bpf->bpf_read.br_endpt = NONE;

	return EINTR;	/* the return value for the canceled read request */
}
/*
 * Perform a select query on a BPF device.
 */
static int
bpfdev_select(devminor_t minor, unsigned int ops, endpoint_t endpt)
{
	struct bpfdev *bpf;
	unsigned int r, notify;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	notify = (ops & CDEV_NOTIFY);
	ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR);

	r = bpfdev_test_select(bpf, ops);

	/*
	 * For the operations that were not immediately ready, if requested,
	 * save the select request for later.
	 */
	ops &= ~r;

	if (ops != 0 && notify) {
		if (bpf->bpf_select.bs_endpt != NONE) {
			/* Merge in the operations with any earlier request. */
			if (bpf->bpf_select.bs_endpt != endpt)
				return EIO;
			bpf->bpf_select.bs_selops |= ops;
		} else {
			bpf->bpf_select.bs_endpt = endpt;
			bpf->bpf_select.bs_selops = ops;
		}
	}

	return r;
}
/*
 * Process an incoming packet on the interface to which the given BPF device is
 * attached.  If the packet passes the filter (if any), store as much as
 * requested of it in the store buffer, rotating buffers if needed and resuming
 * suspended read and select requests as appropriate.  This function is also
 * called through bpfdev_output() below.
 */
void
bpfdev_input(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
{
	struct bpfdev *bpf = (struct bpfdev *)bpfl;
	struct bpf_hdr bh;
	struct timespec ts;
	const struct pbuf *pptr;
	size_t caplen, hdrlen, totlen, off, chunk;
	int hfull;

	/*
	 * Apparently bs_recv is the counter of packets that were run through
	 * the filter, not the number of packets that were or could be received
	 * by the user (which is what I got from the manual page.. oh well).
	 */
	bpf->bpf_stat.bs_recv++;
	bpf_stat.bs_recv++;

	/*
	 * Run the packet through the BPF device's filter to see whether the
	 * packet should be stored and if so, how much of it.  If no filter is
	 * set, all packets will be stored in their entirety.
	 */
	caplen = bpf_filter_ext(bpf->bpf_filter, pbuf, (u_char *)pbuf->payload,
	    pbuf->tot_len, pbuf->len);

	if (caplen == 0)
		return;	/* no match; ignore packet */

	if (caplen > pbuf->tot_len)
		caplen = pbuf->tot_len;

	/* Truncate packet entries to the full size of the buffers. */
	hdrlen = BPF_WORDALIGN(sizeof(bh));
	totlen = BPF_WORDALIGN(hdrlen + caplen);

	if (totlen > bpf->bpf_size) {
		totlen = bpf->bpf_size;
		caplen = totlen - hdrlen;
	}
	assert(totlen >= hdrlen);

	bpf->bpf_stat.bs_capt++;
	bpf_stat.bs_capt++;

	assert(bpf->bpf_sbuf != NULL);
	if (totlen > bpf->bpf_size - bpf->bpf_slen) {
		/*
		 * If the store buffer is full and the hold buffer is not
		 * empty, we cannot swap the two buffers, and so we must drop
		 * the current packet.
		 */
		if (bpf->bpf_hlen > 0) {
			bpf->bpf_stat.bs_drop++;
			bpf_stat.bs_drop++;

			return;
		}

		/*
		 * Rotate the buffers: the hold buffer will now be "full" and
		 * ready to be read - it may not actually be entirely full, but
		 * we could not fit this packet and we are not going to deliver
		 * packets out of order..
		 */
		bpfdev_rotate(bpf);

		hfull = TRUE;
	} else
		hfull = FALSE;

	/*
	 * Retrieve the capture time for the packet.  Ideally this would be
	 * done only once per accepted packet, but we do not expect many BPF
	 * devices to be receiving the same packets often enough to make that
	 * worthwhile.
	 */
	clock_time(&ts);

	/*
	 * Copy the packet into the store buffer, including a newly generated
	 * header.  Zero any padding areas, even if strictly not necessary.
	 */
	memset(&bh, 0, sizeof(bh));
	bh.bh_tstamp.tv_sec = ts.tv_sec;
	bh.bh_tstamp.tv_usec = ts.tv_nsec / 1000;
	bh.bh_caplen = caplen;
	bh.bh_datalen = pbuf->tot_len;
	bh.bh_hdrlen = hdrlen;

	assert(bpf->bpf_sbuf != NULL);
	off = bpf->bpf_slen;

	memcpy(&bpf->bpf_sbuf[off], &bh, sizeof(bh));
	if (hdrlen > sizeof(bh))
		memset(&bpf->bpf_sbuf[off + sizeof(bh)], 0,
		    hdrlen - sizeof(bh));
	off += hdrlen;

	for (pptr = pbuf; pptr != NULL && caplen > 0; pptr = pptr->next) {
		chunk = pptr->len;
		if (chunk > caplen)
			chunk = caplen;

		memcpy(&bpf->bpf_sbuf[off], pptr->payload, chunk);

		off += chunk;
		caplen -= chunk;
	}

	assert(off <= bpf->bpf_slen + totlen);
	if (bpf->bpf_slen + totlen > off)
		memset(&bpf->bpf_sbuf[off], 0, bpf->bpf_slen + totlen - off);

	bpf->bpf_slen += totlen;

	/*
	 * Edge case: if the hold buffer is empty and the store buffer is now
	 * exactly full, rotate buffers so that the packets can be read
	 * immediately, without waiting for the next packet to cause rotation.
	 */
	if (bpf->bpf_hlen == 0 && bpf->bpf_slen == bpf->bpf_size) {
		bpfdev_rotate(bpf);

		hfull = TRUE;
	}

	/*
	 * If the hold buffer is now full, or if immediate mode is enabled,
	 * then we now have data to deliver to userland.  See if we can wake up
	 * any read or select call (either but not both here).
	 */
	if (hfull || (bpf->bpf_flags & BPFF_IMMEDIATE)) {
		if (bpf->bpf_read.br_endpt != NONE)
			bpfdev_resume_read(bpf, FALSE /*is_timeout*/);
		else
			bpfdev_resume_select(bpf);
	}
}
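/*
 * Record layout note: each captured packet occupies
 * BPF_WORDALIGN(hdrlen + caplen) bytes in the store buffer, where hdrlen is
 * itself BPF_WORDALIGN(sizeof(struct bpf_hdr)).  As a result, every record -
 * and thus every bpf_hdr seen by the reader - starts at a word-aligned offset,
 * which is why the padding areas above are zeroed explicitly.
 */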
/*
 * Process an outgoing packet on the interface to which the given BPF device is
 * attached.  If the BPF device is configured to capture outgoing packets as
 * well, attempt to capture the packet as per bpfdev_input().
 */
void
bpfdev_output(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
{
	struct bpfdev *bpf = (struct bpfdev *)bpfl;

	if (bpf->bpf_flags & BPFF_SEESENT)
		bpfdev_input(bpfl, pbuf);
}
1268 bpfdev_get_info(struct bpf_d_ext
* bde
, const struct bpfdev
* bpf
)
1271 bde
->bde_bufsize
= bpf
->bpf_size
;
1272 bde
->bde_promisc
= !!(bpf
->bpf_flags
& BPFF_PROMISC
);
1273 bde
->bde_state
= BPF_IDLE
;
1274 bde
->bde_immediate
= !!(bpf
->bpf_flags
& BPFF_IMMEDIATE
);
1275 bde
->bde_hdrcmplt
= !!(bpf
->bpf_flags
& BPFF_HDRCMPLT
);
1276 bde
->bde_seesent
= !!(bpf
->bpf_flags
& BPFF_SEESENT
);
1278 * NetBSD updates the process ID upon device open, close, ioctl, and
1279 * poll. From those, only open and ioctl make sense for us. Sadly
1280 * there is no way to indicate "no known PID" to netstat(1), so we
1281 * cannot even save just the endpoint and look up the corresponding PID
1282 * later, since the user process may be gone by then.
1284 bde
->bde_pid
= bpf
->bpf_pid
;
1285 bde
->bde_rcount
= bpf
->bpf_stat
.bs_recv
;
1286 bde
->bde_dcount
= bpf
->bpf_stat
.bs_drop
;
1287 bde
->bde_ccount
= bpf
->bpf_stat
.bs_capt
;
1288 if (bpf
->bpf_ifdev
!= NULL
)
1289 strlcpy(bde
->bde_ifname
, ifdev_get_name(bpf
->bpf_ifdev
),
1290 sizeof(bde
->bde_ifname
));
/*
 * Obtain statistics about open BPF devices ("peers").  This node may be
 * accessed by the superuser only.  Used by netstat(1).
 */
static ssize_t
bpfdev_peers(struct rmib_call * call, struct rmib_node * node __unused,
	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{
	struct bpfdev *bpf;
	struct bpf_d_ext bde;
	unsigned int slot;
	ssize_t r, off;
	int size, max;

	if (!(call->call_flags & RMIB_FLAG_AUTH))
		return EPERM;

	if (call->call_namelen != 2)
		return EINVAL;

	size = call->call_name[0];
	if (size < 0 || (size_t)size > sizeof(bde))
		return EINVAL;

	max = call->call_name[1];

	off = 0;

	for (slot = 0; slot < __arraycount(bpf_array); slot++) {
		bpf = &bpf_array[slot];

		if (!(bpf->bpf_flags & BPFF_IN_USE))
			continue;

		if (rmib_inrange(oldp, off)) {
			memset(&bde, 0, sizeof(bde));

			bpfdev_get_info(&bde, bpf);

			if ((r = rmib_copyout(oldp, off, &bde, size)) < 0)
				return r;
		}

		off += size;

		if (max > 0 && --max == 0)
			break;
	}

	/* No slack needed: netstat(1) resizes its buffer as needed. */
	return off;
}
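/*
 * The two name components of this query thus select the per-entry copy size
 * (at most sizeof(struct bpf_d_ext)) and the maximum number of entries to
 * return, which is how netstat(1) sizes and limits its request.
 */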
static const struct chardriver bpfdev_tab = {
	.cdr_open	= bpfdev_open,
	.cdr_close	= bpfdev_close,
	.cdr_read	= bpfdev_read,
	.cdr_write	= bpfdev_write,
	.cdr_ioctl	= bpfdev_ioctl,
	.cdr_cancel	= bpfdev_cancel,
	.cdr_select	= bpfdev_select
};
/*
 * Process a character driver request.  Since the LWIP service offers character
 * devices for BPF only, it must be a request for a BPF device.
 */
void
bpfdev_process(message * m_ptr, int ipc_status)
{

	chardriver_process(&bpfdev_tab, m_ptr, ipc_status);
}