/* $NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $ */

/*
 * Copyright (c) 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from the Stanford/CMU enet packet filter,
 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
 * Berkeley Laboratory.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)bpf.c	8.4 (Berkeley) 1/9/95
 * static char rcsid[] =
 * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2017 Joyent, Inc.
 */
/*
 * The BPF implements the following access controls for zones attempting
 * to read and write data.  Writing of data requires that the net_rawaccess
 * privilege is held whilst reading data requires either net_rawaccess or
 * net_observability.
 *
 *                              | Shared |  Exclusive |   Global   |
 * -----------------------------+--------+------------+------------+
 * DLT_IPNET in local zone      |  Read  |    Read    |    Read    |
 * -----------------------------+--------+------------+------------+
 * Raw access to local zone NIC |  None  | Read/Write | Read/Write |
 * -----------------------------+--------+------------+------------+
 * Raw access to all NICs       |  None  |    None    | Read/Write |
 * -----------------------------+--------+------------+------------+
 */
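
/*
 * The matrix above is enforced in two places: bpfopen() checks
 * net_rawaccess for FWRITE opens and net_observability (or
 * net_rawaccess) for FREAD opens, while bpf_setdlt() and bpf_attachd()
 * apply the per-zone restrictions when a descriptor is bound to a NIC.
 */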
/*
 * The BPF driver is written as a cloning driver: each call to bpfopen()
 * allocates a new minor number.  This provides BPF with a 1:1 relationship
 * between opens and closes.  There is some amount of "descriptor state"
 * that is kept per open.  Pointers to this data are stored in a hash table
 * (bpf_hash) that is indexed by the minor device number for each open file.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/filio.h>
#include <sys/policy.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <net/bpfdesc.h>
#include <netinet/in.h>
#include <sys/mac_client.h>
#include <sys/mac_impl.h>
#include <sys/time_std_impl.h>
#include <sys/hook_event.h>
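
/*
 * mtod() and M_LEN() below mimic the BSD mbuf accessors on top of the
 * STREAMS mblk_t used by this port: b_rptr points at the first valid
 * byte of a message block and b_wptr one past the last, so code
 * inherited from NetBSD can read message data unchanged.
 */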
#define	mtod(_v, _t)	(_t)((_v)->b_rptr)
#define	M_LEN(_m)	((_m)->b_wptr - (_m)->b_rptr)
/*
 * 4096 is too small for FDDI frames.  8192 is too small for gigabit Ethernet
 * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
 */
#define	BPF_BUFSIZE	(32 * 1024)
typedef void *(*cp_fn_t)(void *, const void *, size_t);

/*
 * The default read buffer size, and limit for BIOCSBLEN.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = (16 * 1024 * 1024);
static mod_hash_t *bpf_hash = NULL;

/*
 * Use a mutex to avoid a race condition between gathering the stats/peers
 * and opening/closing the device.
 */
static kcondvar_t bpf_dlt_waiter;
static kmutex_t bpf_mtx;
static bpf_kstats_t ks_stats;
static bpf_kstats_t bpf_kstats = {
	{ "readWait",		KSTAT_DATA_UINT64 },
	{ "writeOk",		KSTAT_DATA_UINT64 },
	{ "writeError",		KSTAT_DATA_UINT64 },
	{ "receive",		KSTAT_DATA_UINT64 },
	{ "captured",		KSTAT_DATA_UINT64 },
	{ "dropped",		KSTAT_DATA_UINT64 },
};
static kstat_t *bpf_ksp;

/*
 * bpf_list is a list of the BPF descriptors currently open
 */
LIST_HEAD(, bpf_d) bpf_list;
static int	bpf_allocbufs(struct bpf_d *);
static void	bpf_clear_timeout(struct bpf_d *);
static void	bpf_deliver(struct bpf_d *, cp_fn_t,
		    void *, uint_t, uint_t, boolean_t);
static void	bpf_freed(struct bpf_d *);
static int	bpf_ifname(struct bpf_d *d, char *, int);
static void	*bpf_mcpy(void *, const void *, size_t);
static int	bpf_attachd(struct bpf_d *, const char *, int);
static void	bpf_detachd(struct bpf_d *);
static int	bpf_setif(struct bpf_d *, char *, int);
static void	bpf_timed_out(void *);
static inline void
		bpf_wakeup(struct bpf_d *);
static void	catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
		    cp_fn_t, struct timeval *);
static void	reset_d(struct bpf_d *);
static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int	bpf_setdlt(struct bpf_d *, void *);
static void	bpf_dev_add(struct bpf_d *);
static struct bpf_d *bpf_dev_find(minor_t);
static struct bpf_d *bpf_dev_get(minor_t);
static void	bpf_dev_remove(struct bpf_d *);
static int
bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)

	/*
	 * Build a sockaddr based on the data link layer type.
	 * We do this at this level because the ethernet header
	 * is copied directly into the data field of the sockaddr.
	 * In the case of SLIP, there is no header and the packet
	 * is forwarded as is.
	 * Also, we are careful to leave room at the front of the mbuf
	 * for the link level header.
	 */
	hlen = sizeof (struct ether_header);

	align = 4 - (hlen & 3);
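
	/*
	 * (hlen & 3) is the link header length modulo 4, so "align" is
	 * the slack requested from allocb() below in addition to "len",
	 * allowing the payload that follows the link header to sit on a
	 * 4-byte boundary.
	 */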
	len = uio->uio_resid;

	/*
	 * If there aren't enough bytes for a link level header or the
	 * packet length exceeds the interface mtu, return an error.
	 */
	if (len < hlen || len - hlen > mtu)

	m = allocb(len + align, BPRI_MED);

	/* Ensure the data is properly aligned */

	m->b_wptr = m->b_rptr + len;

	error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 */
static int
bpf_attachd(struct bpf_d *d, const char *ifname, int dlt)

	bpf_provider_list_t *bp;

	ASSERT(d->bd_bif == (uintptr_t)NULL);
	ASSERT(d->bd_mcip == (uintptr_t)NULL);

	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
		error = MBPF_OPEN(bpr, ifname, &mh, zone);

		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);

		error = MBPF_GET_DLT(bpr, mh, &nicdlt);

		nicdlt = bpf_dl_to_dlt(nicdlt);
		if (dlt != -1 && dlt != nicdlt) {

		error = MBPF_GET_ZONE(bpr, mh, &niczone);

		DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr,
		    uintptr_t, mh, int, nicdlt, zoneid_t, niczone);

		if (zonematch && niczone != zone) {

		MBPF_CLIENT_CLOSE(bpr, mcip);

		if (mh != (uintptr_t)NULL) {

	if (zonematch && (zone == GLOBAL_ZONEID)) {
		/*
		 * If we failed to do an exact match for the global
		 * zone using the global zoneid, try again in case
		 * the network interface is owned by a local zone.
		 */

	hdrlen = bpf_dl_hdrsize(nicdlt);
	d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;

	(void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip),
	    sizeof (d->bd_ifname));

	(void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid,
	(void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d,
	    &d->bd_promisc_handle, d->bd_promisc_flags);
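
	/*
	 * Recap of the attach path above: walk the registered BPF
	 * providers until one recognises "ifname" in an acceptable zone,
	 * open a MAC client on it, derive the DLT and the bpf header
	 * padding (bd_hdrlen), and finally register for promiscuous
	 * delivery, which is how captured packets reach this descriptor.
	 */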
/*
 * Detach a file from its interface.
 */
static void
bpf_detachd(struct bpf_d *d)

	ASSERT(d->bd_inuse == -1);

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.  There's no need to take any action
	 * here, that is done when MBPF_PROMISC_REMOVE is used;
	 * bd_promisc is just a local flag to stop promiscuous mode
	 * from being set more than once.
	 */

	/*
	 * Take device out of "promiscuous" mode.  Since we were able to
	 * enter "promiscuous" mode, we should be able to turn it off.
	 * Note, this field stores a pointer used to support both
	 * promiscuous and non-promiscuous callbacks for packets.
	 */
	mph = d->bd_promisc_handle;
	d->bd_promisc_handle = 0;

	/*
	 * The lock has to be dropped here because mac_promisc_remove may
	 * need to wait for mac_promisc_dispatch, which has called into
	 * bpf and catchpacket is waiting for bd_lock...
	 * i.e. mac_promisc_remove() needs to be called with none of the
	 * locks held that are part of the bpf_mtap() call path.
	 */
	mutex_exit(&d->bd_lock);

	MBPF_PROMISC_REMOVE(&d->bd_mac, mph);

	MBPF_CLIENT_CLOSE(&d->bd_mac, mch);

	MBPF_CLOSE(&d->bd_mac, mh);

	/*
	 * Because this function is called with bd_lock held, it must
	 * be reacquired before returning.
	 */
	mutex_enter(&d->bd_lock);
	*d->bd_ifname = '\0';
	(void) memset(&d->bd_mac, 0, sizeof (d->bd_mac));
/*
 * bpfilterattach() is called at load time.
 */

	bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
	    mod_hash_null_keydtor);
	if (bpf_hash == NULL)

	(void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));

	bpf_ksp = kstat_create("bpf", 0, "global", "misc",
	    KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
	if (bpf_ksp != NULL) {
		bpf_ksp->ks_data = &ks_stats;
		kstat_install(bpf_ksp);

		mod_hash_destroy_idhash(bpf_hash);

	cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
	mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);

	LIST_INIT(&bpf_list);
/*
 * bpfilterdetach() is called at unload time.
 */

	if (bpf_ksp != NULL) {
		kstat_delete(bpf_ksp);

	mod_hash_destroy_idhash(bpf_hash);

	cv_destroy(&bpf_dlt_waiter);
	mutex_destroy(&bpf_mtx);
/*
 * Open ethernet device.  Clones.
 */
static int
bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)

	/*
	 * The security policy described at the top of this file is
	 * enforced here.
	 */
	if ((flag & FWRITE) != 0) {
		if (secpolicy_net_rawaccess(cred) != 0)

	if ((flag & FREAD) != 0) {
		if ((secpolicy_net_observability(cred) != 0) &&
		    (secpolicy_net_rawaccess(cred) != 0))

	if ((flag & (FWRITE|FREAD)) == 0)

	/*
	 * A structure is allocated per open file in BPF to store settings
	 * such as buffer capture size and to provide private buffers.
	 */
	d = kmem_zalloc(sizeof (*d), KM_SLEEP);
	d->bd_bufsize = bpf_bufsize;
	d->bd_zone = crgetzoneid(cred);
	d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS |
	    MAC_PROMISC_FLAGS_NO_COPY;
	mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);

	mutex_enter(&bpf_mtx);
	/*
	 * Find an unused minor number.  Obviously this is an O(n) algorithm
	 * and doesn't scale particularly well, so if there are large numbers
	 * of open file descriptors happening in real use, this design may
	 * need to be revisited.
	 */
	for (dmin = 0; dmin < L_MAXMIN; dmin++)
		if (bpf_dev_find(dmin) == NULL)
	if (dmin == L_MAXMIN) {
		mutex_exit(&bpf_mtx);
		kmem_free(d, sizeof (*d));

	LIST_INSERT_HEAD(&bpf_list, d, bd_list);

	mutex_exit(&bpf_mtx);

	*devp = makedevice(getmajor(*devp), dmin);
/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 *
 * Because we only allow a device to be opened once, there is always a
 * 1 to 1 relationship between opens and closes supporting this function.
 */
static int
bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)

	struct bpf_d *d = bpf_dev_get(getminor(dev));

	mutex_enter(&d->bd_lock);

	while (d->bd_inuse != 0) {
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			mutex_exit(&d->bd_lock);

	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	d->bd_state = BPF_IDLE;

	mutex_exit(&d->bd_lock);

	mutex_enter(&bpf_mtx);
	LIST_REMOVE(d, bd_list);
	mutex_exit(&bpf_mtx);

	mutex_enter(&d->bd_lock);
	mutex_destroy(&d->bd_lock);
	cv_destroy(&d->bd_wait);

	kmem_free(d, sizeof (*d));
/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 */
#define	ROTATE_BUFFERS(d) \
	(d)->bd_hbuf = (d)->bd_sbuf; \
	(d)->bd_hlen = (d)->bd_slen; \
	(d)->bd_sbuf = (d)->bd_fbuf; \
	(d)->bd_slen = 0; \
	(d)->bd_fbuf = 0;
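
/*
 * ROTATE_BUFFERS() is only safe with bd_lock held and with a free
 * buffer available (bd_fbuf != 0); catchpacket() below checks bd_fbuf
 * and drops the packet when rotating is impossible.
 */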
/*
 * bpfread - read next chunk of packets from buffers
 */
static int
bpfread(dev_t dev, struct uio *uio, cred_t *cred)

	struct bpf_d *d = bpf_dev_get(getminor(dev));

	if ((d->bd_fmode & FREAD) == 0)

	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize)

	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	timed_out = (d->bd_state == BPF_TIMED_OUT);
	d->bd_state = BPF_IDLE;

	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == 0) {
		if (d->bd_nonblock) {
			if (d->bd_slen == 0) {
				mutex_exit(&d->bd_lock);
				return (EWOULDBLOCK);

		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
			/*
			 * A packet(s) either arrived since the previous
			 * read or arrived while we were asleep.
			 * Rotate the buffers and return what's here.
			 */

		ks_stats.kp_read_wait.value.ui64++;
		delay = ddi_get_lbolt() + d->bd_rtout;
		error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);

			mutex_exit(&d->bd_lock);

		/*
		 * On a timeout, return what's in the buffer,
		 * which may be nothing.  If there is something
		 * in the store buffer, we can rotate the buffers.
		 */

			/*
			 * We filled up the buffer in between
			 * getting the timeout and arriving
			 * here, so we don't need to rotate.
			 */

		if (d->bd_slen == 0) {
			mutex_exit(&d->bd_lock);

	/*
	 * At this point, we know we have something in the hold slot.
	 */
	mutex_exit(&d->bd_lock);

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);

	mutex_enter(&d->bd_lock);
	d->bd_fbuf = d->bd_hbuf;

	mutex_exit(&d->bd_lock);
/*
 * If there are processes sleeping on this descriptor, wake them up.
 * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
 * so there is no code here grabbing it.
 */
static inline void
bpf_wakeup(struct bpf_d *d)

	cv_signal(&d->bd_wait);

static void
bpf_timed_out(void *arg)

	struct bpf_d *d = arg;

	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING) {
		d->bd_state = BPF_TIMED_OUT;

		cv_signal(&d->bd_wait);

	mutex_exit(&d->bd_lock);
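
/*
 * Read-timeout state machine, for reference: bpfchpoll() moves an idle
 * descriptor to BPF_WAITING when it starts the timeout(9F) callout,
 * bpf_timed_out() above moves BPF_WAITING to BPF_TIMED_OUT and signals
 * any sleeper, and bpfread()/bpfioctl() return the descriptor to
 * BPF_IDLE, cancelling the callout via bpf_clear_timeout().
 */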
static int
bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)

	struct bpf_d *d = bpf_dev_get(getminor(dev));

	if ((d->bd_fmode & FWRITE) == 0)

	mutex_enter(&d->bd_lock);
	if (d->bd_bif == 0 || d->bd_mcip == 0) {
		mutex_exit(&d->bd_lock);

	if (uio->uio_resid == 0) {
		mutex_exit(&d->bd_lock);

	while (d->bd_inuse < 0) {
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			mutex_exit(&d->bd_lock);

	mutex_exit(&d->bd_lock);

	MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu);

	if (dlt == DLT_IPNET) {

	error = bpf_movein(uio, dlt, mtu, &m);

	DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt,
	    uint_t, mtu, mblk_t *, m);

	if (M_LEN(m) > mtu) {

	error = MBPF_TX(&d->bd_mac, mch, m);
	/*
	 * The "tx" action here is required to consume the mblk_t.
	 */

	if (error == 0)
		ks_stats.kp_write_ok.value.ui64++;
	else
		ks_stats.kp_write_error.value.ui64++;

	mutex_enter(&d->bd_lock);

	if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
		cv_signal(&d->bd_wait);
	mutex_exit(&d->bd_lock);

	/*
	 * The driver frees the mbuf.
	 */
/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.  Should be called at splnet.
 */
static void
reset_d(struct bpf_d *d)

	/* Free the hold buffer. */
	d->bd_fbuf = d->bd_hbuf;
/*
 * FIONREAD		Check for read packet available.
 * BIOCGBLEN		Get buffer len [for read()].
 * BIOCSETF		Set ethernet read filter.
 * BIOCFLUSH		Flush read packet buffer.
 * BIOCPROMISC		Put interface into promiscuous mode.
 * BIOCGDLT		Get link layer type.
 * BIOCGETIF		Get interface name.
 * BIOCSETIF		Set interface.
 * BIOCSRTIMEOUT	Set read timeout.
 * BIOCGRTIMEOUT	Get read timeout.
 * BIOCGSTATS		Get packet stats.
 * BIOCIMMEDIATE	Set immediate mode.
 * BIOCVERSION		Get filter language version.
 * BIOCGHDRCMPLT	Get "header already complete" flag.
 * BIOCSHDRCMPLT	Set "header already complete" flag.
 */
static int
bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)

	struct bpf_d *d = bpf_dev_get(getminor(dev));
	struct bpf_program prog;
	struct lifreq lifreq;

	/*
	 * Refresh the PID associated with this bpf file.
	 */
	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	d->bd_state = BPF_IDLE;
	mutex_exit(&d->bd_lock);

	/*
	 * Check for read packet available.
	 */
		mutex_enter(&d->bd_lock);

		mutex_exit(&d->bd_lock);

	/*
	 * Get buffer len [for read()].
	 */
		error = copyout(&d->bd_bufsize, (void *)addr,
		    sizeof (d->bd_bufsize));

		if (copyin((void *)addr, &size, sizeof (size)) != 0) {

		mutex_enter(&d->bd_lock);
		if (d->bd_bif != 0) {

		if (size > bpf_maxbufsize)
			size = bpf_maxbufsize;
		else if (size < BPF_MINBUFSIZE)
			size = BPF_MINBUFSIZE;

		d->bd_bufsize = size;

		mutex_exit(&d->bd_lock);

		error = copyout(&size, (void *)addr, sizeof (size));

	/*
	 * Set link layer read filter.
	 */
		if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {

		error = bpf_setf(d, &prog);

	/*
	 * Flush read packet buffer.
	 */
		mutex_enter(&d->bd_lock);

		mutex_exit(&d->bd_lock);

	/*
	 * Put interface into promiscuous mode.
	 * This is a one-way ioctl, it is not used to turn promiscuous
	 * mode off.
	 */
		if (d->bd_bif == 0) {
			/*
			 * No interface attached yet.
			 */

		mutex_enter(&d->bd_lock);
		if (d->bd_promisc == 0) {

			if (d->bd_promisc_handle) {

				mph = d->bd_promisc_handle;
				d->bd_promisc_handle = 0;

				mutex_exit(&d->bd_lock);
				MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
				mutex_enter(&d->bd_lock);

			d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
			error = MBPF_PROMISC_ADD(&d->bd_mac,
			    d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
			    &d->bd_promisc_handle, d->bd_promisc_flags);

		mutex_exit(&d->bd_lock);

	/*
	 * Get device parameters.
	 */
		error = copyout(&d->bd_dlt, (void *)addr,
		    sizeof (d->bd_dlt));

	/*
	 * Get a list of supported device parameters.
	 */
		if (d->bd_bif == 0) {

		struct bpf_dltlist list;

		if (copyin((void *)addr, &list, sizeof (list)) != 0) {

		error = bpf_getdltlist(d, &list);
		    copyout(&list, (void *)addr, sizeof (list)) != 0)

	/*
	 * Set device parameters.
	 */
		error = bpf_setdlt(d, (void *)addr);

	/*
	 * Get interface name.
	 */
		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {

		error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
		    copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {

		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {

		error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));

	/*
	 * Get interface name.
	 */
		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {

		error = bpf_ifname(d, lifreq.lifr_name,
		    sizeof (lifreq.lifr_name));
		    copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {

		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {

		error = bpf_setif(d, lifreq.lifr_name,
		    sizeof (lifreq.lifr_name));

#ifdef _SYSCALL32_IMPL
	case BIOCSRTIMEOUT32:
		struct timeval32 tv;

		if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {

		/* Convert the timeout in microseconds to ticks */
		d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
		    tv.tv_usec);
		if ((d->bd_rtout == 0) && (tv.tv_usec != 0))

	case BIOCGRTIMEOUT32:
		struct timeval32 tv;

		ticks = drv_hztousec(d->bd_rtout);
		tv.tv_sec = ticks / 1000000;
		tv.tv_usec = ticks - (tv.tv_sec * 1000000);
		error = copyout(&tv, (void *)addr, sizeof (tv));

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST32:
		if (d->bd_bif == 0) {

		struct bpf_dltlist32 lst32;
		struct bpf_dltlist list;

		if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {

		list.bfl_len = lst32.bfl_len;
		list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
		error = bpf_getdltlist(d, &list);

		lst32.bfl_len = list.bfl_len;

		if (copyout(&lst32, (void *)addr,
		    sizeof (lst32)) != 0)

	/*
	 * Set link layer read filter.
	 */
		struct bpf_program32 prog32;

		if (ddi_copyin((void *)addr, &prog32, sizeof (prog32), mode)) {

		prog.bf_len = prog32.bf_len;
		prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
		error = bpf_setf(d, &prog);

		if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {

		/* Convert the timeout in microseconds to ticks */
		d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
		    tv.tv_usec);
		if ((d->bd_rtout == 0) && (tv.tv_usec != 0))

		ticks = drv_hztousec(d->bd_rtout);
		tv.tv_sec = ticks / 1000000;
		tv.tv_usec = ticks - (tv.tv_sec * 1000000);
		if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)

		bs.bs_recv = d->bd_rcount;
		bs.bs_drop = d->bd_dcount;
		bs.bs_capt = d->bd_ccount;
		if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)

	/*
	 * Set immediate mode.
	 */
		if (copyin((void *)addr, &d->bd_immediate,
		    sizeof (d->bd_immediate)) != 0)

		struct bpf_version bv;

		bv.bv_major = BPF_MAJOR_VERSION;
		bv.bv_minor = BPF_MINOR_VERSION;
		if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)

	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
		if (copyout(&d->bd_hdrcmplt, (void *)addr,
		    sizeof (d->bd_hdrcmplt)) != 0)

	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
		if (copyin((void *)addr, &d->bd_hdrcmplt,
		    sizeof (d->bd_hdrcmplt)) != 0)

	/*
	 * Get "see sent packets" flag
	 */
		if (copyout(&d->bd_seesent, (void *)addr,
		    sizeof (d->bd_seesent)) != 0)

	/*
	 * Set "see sent packets" flag
	 */
		if (copyin((void *)addr, &d->bd_seesent,
		    sizeof (d->bd_seesent)) != 0)

	case FIONBIO:		/* Non-blocking I/O */
		if (copyin((void *)addr, &d->bd_nonblock,
		    sizeof (d->bd_nonblock)) != 0)
/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  If the new filter is "empty" (has a 0 size), then
 * the result is to just remove and free the existing filter.
 * Returns EINVAL for bogus requests.
 */
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp)

	struct bpf_insn *fcode, *old;

	if (fp->bf_insns == 0) {
		if (fp->bf_len != 0)

		mutex_enter(&d->bd_lock);

		oldsize = d->bd_filter_size;

		d->bd_filter_size = 0;

		mutex_exit(&d->bd_lock);

		kmem_free(old, oldsize);

	if (flen > BPF_MAXINSNS)

	size = flen * sizeof (*fp->bf_insns);
	fcode = kmem_alloc(size, KM_SLEEP);
	if (copyin(fp->bf_insns, fcode, size) != 0)

	if (bpf_validate(fcode, (int)flen)) {
		mutex_enter(&d->bd_lock);

		oldsize = d->bd_filter_size;
		d->bd_filter = fcode;
		d->bd_filter_size = size;

		mutex_exit(&d->bd_lock);

		kmem_free(old, oldsize);

	kmem_free(fcode, size);
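
	/*
	 * The pattern above is validate-then-swap: the new program is
	 * copied in and checked by bpf_validate() before it replaces
	 * bd_filter under bd_lock, and the old program is freed only
	 * after bd_lock has been dropped.
	 */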
/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifname.
 * Return an errno or 0.
 */
static int
bpf_setif(struct bpf_d *d, char *ifname, int namesize)

	/*
	 * Make sure the provided name has a unit number, and default
	 * it to '0' if not specified.
	 * XXX This is ugly ... do this differently?
	 */

	cp[namesize - 1] = '\0';	/* sanity */

	if (*cp >= '0' && *cp <= '9')

	/* Make sure to leave room for the '\0'. */
	for (i = 0; i < (namesize - 1); ++i) {
		if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
		    (ifname[i] >= 'A' && ifname[i] <= 'Z'))

	/*
	 * Make sure that only one call to this function happens at a time
	 * and that we're not interleaving a read/write
	 */
	mutex_enter(&d->bd_lock);
	while (d->bd_inuse != 0) {
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			mutex_exit(&d->bd_lock);

	mutex_exit(&d->bd_lock);

	if (d->bd_sbuf == 0)
		error = bpf_allocbufs(d);

	mutex_enter(&d->bd_lock);

	/*
	 * Detach if attached to something else.
	 */

	error = bpf_attachd(d, ifname, -1);

	if (d->bd_waiting != 0)
		cv_signal(&d->bd_wait);
	mutex_exit(&d->bd_lock);

	mutex_enter(&d->bd_lock);

	if (d->bd_waiting != 0)
		cv_signal(&d->bd_wait);
	mutex_exit(&d->bd_lock);

	/*
	 * Try to tickle the mac layer into attaching the device...
	 */
	return (bpf_provider_tickle(ifname, d->bd_zone));
/*
 * Copy the interface name to the ifreq.
 */
static int
bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)

	mutex_enter(&d->bd_lock);
	if (d->bd_bif == (uintptr_t)NULL) {
		mutex_exit(&d->bd_lock);

	(void) strlcpy(buffer, d->bd_ifname, bufsize);
	mutex_exit(&d->bd_lock);
static int
bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)

	struct bpf_d *d = bpf_dev_get(getminor(dev));

	/*
	 * Until this driver is modified to issue proper pollwakeup() calls on
	 * its pollhead, edge-triggered polling is not allowed.
	 */
	if (events & POLLET)

	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * An imitation of the FIONREAD ioctl code.
		 */
		mutex_enter(&d->bd_lock);
		if (d->bd_hlen != 0 ||
		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
			*reventsp |= events & (POLLIN | POLLRDNORM);

		/*
		 * Until the bpf driver has been updated to include
		 * adequate pollwakeup() logic, no pollhead will be
		 * emitted here, preventing the resource from being
		 * cached by poll()/devpoll/epoll.
		 */

		/* Start the read timeout if necessary */
		if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
			bpf_clear_timeout(d);
			/*
			 * Only allow the timeout to be set once.
			 */
			if (d->bd_callout == 0)
				d->bd_callout = timeout(bpf_timed_out,

			d->bd_state = BPF_WAITING;

		mutex_exit(&d->bd_lock);
/*
 * Copy data from an mblk_t chain into a buffer.  This works for ipnet
 * because the dl_ipnetinfo_t is placed in an mblk_t that leads the
 * packet data.
 */
static void *
bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)

	count = (uint_t)min(M_LEN(m), len);
	(void) memcpy(dst, mtod(m, const void *), count);
/*
 * Dispatch a packet to all the listeners on interface bp.
 *
 * marg		pointer to the packet, either a data buffer or an mbuf chain
 * buflen	buffer length, if marg is a data buffer
 * cpfn		a function that can copy marg into the listener's buffer
 * pktlen	length of the packet
 * issent	boolean indicating whether the packet was sent or received
 */
static void
bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen,
    uint_t buflen, boolean_t issent)

	if (!d->bd_seesent && issent)

	/*
	 * Accuracy of the packet counters in BPF is vital so it
	 * is important to protect even the outer ones.
	 */
	mutex_enter(&d->bd_lock);
	slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
	DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif,
	    struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen);

	ks_stats.kp_receive.value.ui64++;

	catchpacket(d, marg, pktlen, slen, cpfn, &tv);

	mutex_exit(&d->bd_lock);
/*
 * Incoming linkage from device drivers.
 */
void
bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent)

	struct bpf_d *d = arg;
	uint_t pktlen, buflen;

	pktlen = msgdsize(m);

	if (pktlen == M_LEN(m)) {
		cpfn = (cp_fn_t)memcpy;
		marg = mtod(m, void *);
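
		/*
		 * pktlen == M_LEN(m) means the whole packet sits in the
		 * first mblk, so a flat memcpy from the contiguous buffer
		 * suffices; chained messages are instead copied with
		 * bpf_mcpy() above, which walks the b_cont chain.
		 */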
	bpf_deliver(d, cpfn, marg, pktlen, buflen, issent);
/*
 * Incoming linkage from ipnet.
 * In ipnet, there is only one event, NH_OBSERVE, that delivers packets
 * from all network interfaces.  Thus the tap function needs to apply a
 * filter using the interface index/id to imitate snoop'ing on just the
 * specified interface.
 */
void
bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length)

	hook_pkt_observe_t *hdr;
	struct bpf_d *d = arg;

	hdr = (hook_pkt_observe_t *)m->b_rptr;
	if (ntohl(hdr->hpo_ifindex) != d->bd_linkid)

	bpf_deliver(d, bpf_mcpy, m, length, 0, issent);
/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer.  Wake up a listener when the buffer fills.
 * "copy" is the routine called to do the actual data
 * transfer.  memcpy is passed in to copy contiguous chunks, while
 * bpf_mcpy is passed in to copy mbuf chains.  In the latter case,
 * pkt is really an mbuf.
 */
static void
catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
    cp_fn_t cpfn, struct timeval *tv)

	int hdrlen = d->bd_hdrlen;

	ks_stats.kp_capture.value.ui64++;

	/*
	 * Figure out how many bytes to move.  If the packet is
	 * greater or equal to the snapshot length, transfer that
	 * much.  Otherwise, transfer the whole packet (unless
	 * we hit the buffer size limit).
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;

	/*
	 * Round up the end of the previous packet to the next longword.
	 */
	curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize) {
		/*
		 * This packet will overflow the storage buffer.
		 * Rotate the buffers if we can, then wakeup any
		 * sleepers.
		 */
		if (d->bd_fbuf == 0) {
			/*
			 * We haven't completed the previous read yet,
			 * so drop the packet.
			 */

			ks_stats.kp_dropped.value.ui64++;

	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
		/*
		 * Immediate mode is set, or the read timeout has
		 * already expired during a select call.  A packet
		 * arrived, so the reader should be woken up.
		 */

	/*
	 * Append the bpf header to the existing buffer before we add
	 * on the actual packet data.
	 */
	hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
	hp->bh_tstamp.tv_sec = tv->tv_sec;
	hp->bh_tstamp.tv_usec = tv->tv_usec;
	hp->bh_datalen = pktlen;
	hp->bh_hdrlen = (uint16_t)hdrlen;

	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	(*cpfn)((uchar_t *)hp + hdrlen, pkt,
	    (hp->bh_caplen = totlen - hdrlen));
	d->bd_slen = curlen + totlen;

	/*
	 * Call bpf_wakeup after bd_slen has been updated.
	 */
/*
 * Initialize all nonzero fields of a descriptor.
 */
static int
bpf_allocbufs(struct bpf_d *d)

	d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);

	d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);

		kmem_free(d->bd_fbuf, d->bd_bufsize);
/*
 * Free buffers currently in use by a descriptor.
 */
static void
bpf_freed(struct bpf_d *d)

	/*
	 * At this point the descriptor has been detached from its
	 * interface and it hasn't yet been marked free.
	 */
	if (d->bd_sbuf != 0) {
		kmem_free(d->bd_sbuf, d->bd_bufsize);
		if (d->bd_hbuf != 0)
			kmem_free(d->bd_hbuf, d->bd_bufsize);
		if (d->bd_fbuf != 0)
			kmem_free(d->bd_fbuf, d->bd_bufsize);

	kmem_free(d->bd_filter, d->bd_filter_size);
/*
 * Get a list of the available data link types for the interface.
 */
static int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp)

	bpf_provider_list_t *bp;
	bpf_provider_t *bpr;

	mutex_enter(&d->bd_lock);
	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
		error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone);

		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);

		error = MBPF_GET_ZONE(bpr, mh, &zoneid);

		if (d->bd_zone != GLOBAL_ZONEID &&
		    d->bd_zone != zoneid)

		error = MBPF_GET_DLT(bpr, mh, &nicdlt);

		nicdlt = bpf_dl_to_dlt(nicdlt);
		if (listp->bfl_list != NULL) {
			if (n >= listp->bfl_len) {
				MBPF_CLIENT_CLOSE(bpr, mcip);
				MBPF_CLOSE(bpr, mh);

			/*
			 * Bumping of bd_inuse ensures the structure does not
			 * disappear while the copyout runs and allows the for
			 * loop to be continued.
			 */

			mutex_exit(&d->bd_lock);
			if (copyout(&nicdlt,
			    listp->bfl_list + n, sizeof (uint_t)) != 0)

			mutex_enter(&d->bd_lock);

		MBPF_CLIENT_CLOSE(bpr, mcip);

		MBPF_CLOSE(bpr, mh);

	mutex_exit(&d->bd_lock);

	/*
	 * It is quite possible that one or more providers to BPF may not
	 * know about a link name whilst others do.  In that case, so long
	 * as we have one success, do not declare an error unless it was
	 * an EFAULT as this indicates a problem that needs to be reported.
	 */
	if ((error != EFAULT) && (n > 0))
/*
 * Set the data link type of a BPF instance.
 */
static int
bpf_setdlt(struct bpf_d *d, void *addr)

	char ifname[LIFNAMSIZ+1];

	if (copyin(addr, &dlt, sizeof (dlt)) != 0)

	mutex_enter(&d->bd_lock);

	if (d->bd_bif == 0) {			/* Interface not set */
		mutex_exit(&d->bd_lock);

	if (d->bd_dlt == dlt) {	/* NULL-op */
		mutex_exit(&d->bd_lock);

	error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone);

		mutex_exit(&d->bd_lock);

	/*
	 * See the matrix at the top of the file for the permissions table
	 * enforced by this driver.
	 */
	if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) &&
	    (niczone != d->bd_zone)) {
		mutex_exit(&d->bd_lock);

	(void) strlcpy(ifname, d->bd_ifname, sizeof (ifname));

	error = bpf_attachd(d, ifname, dlt);

	mutex_exit(&d->bd_lock);
/*
 * bpf_clear_timeout is called with the bd_lock mutex held, providing it
 * with the necessary protection to retrieve and modify bd_callout but it
 * does not hold the lock for its entire duration... see below...
 */
static void
bpf_clear_timeout(struct bpf_d *d)

	timeout_id_t tid = d->bd_callout;

	/*
	 * If the timeout has fired and is blocked on bd_lock, calling
	 * untimeout() with bd_lock held could deadlock: untimeout()
	 * waits for bpf_timed_out to finish, which it never would.
	 */
	mutex_exit(&d->bd_lock);
	(void) untimeout(tid);
	mutex_enter(&d->bd_lock);
/*
 * As a cloning device driver, BPF needs to keep track of which device
 * numbers are in use and which ones are not.  A hash table, indexed by
 * the minor device number, is used to store the pointers to the
 * individual descriptors that are allocated in bpfopen().
 * The functions below present the interface for that hash table to
 * the rest of the driver.
 */
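
/*
 * A note on the two lookup flavours below: bpf_dev_find() may be asked
 * about a minor number that is not in use (bpfopen() probes it to find
 * a free minor), while bpf_dev_get() is only called for minors known to
 * exist, so its callers rely on a non-NULL result.
 */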
static struct bpf_d *
bpf_dev_find(minor_t minor)

	struct bpf_d *d = NULL;

	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
	    (mod_hash_val_t *)&d);

static void
bpf_dev_add(struct bpf_d *d)

	(void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,

static void
bpf_dev_remove(struct bpf_d *d)

	(void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
	    (mod_hash_val_t *)&stor);

/*
 * bpf_dev_get should only ever be called for a minor number that exists,
 * thus there should always be a pointer in the hash table that corresponds
 * to it.
 */
static struct bpf_d *
bpf_dev_get(minor_t minor)

	struct bpf_d *d = NULL;

	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
	    (mod_hash_val_t *)&d);