2 * SPDX-License-Identifier: BSD-3-Clause
4 * Copyright (c) 1982, 1986, 1988, 1993
5 * The Regents of the University of California. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 #include <sys/cdefs.h>
34 #include "opt_inet6.h"
37 #include <sys/param.h>
38 #include <sys/eventhandler.h>
39 #include <sys/kernel.h>
41 #include <sys/malloc.h>
43 #include <sys/module.h>
44 #include <sys/kernel.h>
47 #include <sys/domain.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/sysctl.h>
55 #include <net/if_var.h>
56 #include <net/if_private.h>
57 #include <net/netisr.h>
59 #include <netinet/in.h>
60 #include <netinet/in_pcb.h>
61 #include <netinet/in_systm.h>
62 #include <netinet/in_var.h>
63 #include <netinet/ip.h>
64 #include <netinet/ip_var.h>
65 #include <netinet/ip_divert.h>
67 #include <netinet/ip6.h>
68 #include <netinet6/ip6_var.h>
70 #if defined(SCTP) || defined(SCTP_SUPPORT)
71 #include <netinet/sctp_crc32.h>
74 #include <security/mac/mac_framework.h>
80 * Allocate enough space to hold a full IP packet
82 #define DIVSNDQ (65536 + 100) /* initial value of div_sendspace */
83 #define DIVRCVQ (65536 + 100) /* initial value of div_recvspace */
86 * Usually a system has very few divert ports. Previous implementation
/*
 * Divert PCB hash table geometry.
 *
 * DIVHASH() maps a port value to a bucket index and DCBHASH() does the same
 * for the port stored in a control block.  Macro arguments are parenthesized
 * so that an expression argument such as DIVHASH(base + off) hashes the whole
 * value instead of only its last operand.
 */
#define	DIVHASHSIZE	(1 << 3)	/* 8 entries, one cache line. */
#define	DIVHASH(port)	((port) % DIVHASHSIZE)
#define	DCBHASH(dcb)	((dcb)->dcb_port % DIVHASHSIZE)
94 * Divert sockets work in conjunction with ipfw or other packet filters,
95 * see the divert(4) manpage for features.
96 * Packets are selected by the packet filter and tagged with an
97 * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by
98 * the packet filter) and information on the matching filter rule for
99 * subsequent reinjection. The divert_port is used to put the packet
100 * on the corresponding divert socket, while the rule number is passed
101 * up (at least partially) as the sin_port in the struct sockaddr.
103 * Packets written to the divert socket carry in sin_addr a
104 * destination address, and in sin_port the number of the filter rule
105 * after which to continue processing.
106 * If the destination address is INADDR_ANY, the packet is treated as
107 * outgoing and sent to ip_output(); otherwise it is treated as
108 * incoming and sent to ip_input().
109 * Further, sin_zero carries some information on the interface,
110 * which can be used in the reinject -- see comments in the code.
112 * On reinjection, processing in ip_input() and ip_output()
113 * will be exactly the same as for the original packet, except that
114 * packet filter processing will start at the rule number after the one
115 * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0
116 * will apply the entire ruleset to the packet).
118 static SYSCTL_NODE(_net_inet
, OID_AUTO
, divert
, CTLFLAG_RW
| CTLFLAG_MPSAFE
, 0,
/*
 * Define the divstat counter set (struct divstat) through the
 * VNET_PCPUSTAT_* macros, register its SYSINIT/SYSUNINIT hooks and publish
 * it as the "stats" OID under _net_inet_divert.
 */
121 VNET_PCPUSTAT_DEFINE_STATIC(struct divstat
, divstat
);
122 VNET_PCPUSTAT_SYSINIT(divstat
);
124 VNET_PCPUSTAT_SYSUNINIT(divstat
);
126 SYSCTL_VNET_PCPUSTAT(_net_inet_divert
, OID_AUTO
, stats
, struct divstat
,
127 divstat
, "divert(4) socket statistics");
/*
 * Add 1 to the divstat counter whose field name is "div_" token-pasted with
 * the macro argument, e.g. DIVSTAT_INC(diverted) updates div_diverted.
 */
128 #define DIVSTAT_INC(name) \
129 VNET_PCPUSTAT_ADD(struct divstat, divstat, div_ ## name, 1)
/*
 * Send and receive buffer sizes that div_attach() hands to soreserve() for
 * each new divert socket; initialised from DIVSNDQ and DIVRCVQ.
 */
131 static u_long div_sendspace
= DIVSNDQ
; /* XXX sysctl ? */
132 static u_long div_recvspace
= DIVRCVQ
; /* XXX sysctl ? */
/*
 * Reinjection helpers called from div_send().
 *
 * div_output_inbound() schedules the mbuf for local processing through the
 * IPv4/IPv6 netisr queue and additionally takes the caller-supplied
 * sockaddr_in; div_output_outbound() sends the mbuf to the wire via
 * ip[6]_output().  Both return 0 on success or an errno value on failure.
 */
static int	div_output_inbound(int family, struct socket *so, struct mbuf *m,
		    struct sockaddr_in *sin);
static int	div_output_outbound(int family, struct socket *so,
		    struct mbuf *m);
140 SLIST_ENTRY(divcb
) dcb_next
;
142 #define DCB_UNBOUND ((intptr_t)-1)
144 struct socket
*dcb_socket
;
147 struct epoch_context dcb_epochctx
;
/* Head type for one hash bucket: a singly-linked list of struct divcb. */
150 SLIST_HEAD(divhashhead
, divcb
);
/* Table of DIVHASHSIZE bucket heads, indexed with DIVHASH()/DCBHASH(). */
152 VNET_DEFINE_STATIC(struct divhashhead
, divhash
[DIVHASHSIZE
]) = {};
153 #define V_divhash VNET(divhash)
/* Control block count; reported as xig_count by div_pcblist(). */
154 VNET_DEFINE_STATIC(uint64_t, dcb_count
) = 0;
155 #define V_dcb_count VNET(dcb_count)
/*
 * Generation counter; pre-incremented in div_attach() to stamp each new
 * control block and reported as xig_gen by div_pcblist().
 */
156 VNET_DEFINE_STATIC(uint64_t, dcb_gencnt
) = 0;
157 #define V_dcb_gencnt VNET(dcb_gencnt)
/*
 * File-wide mutex, initialised through MTX_SYSINIT as an MTX_DEF mutex and
 * taken/released with DIVERT_LOCK()/DIVERT_UNLOCK().
 * NOTE(review): going by its description string it guards the divert pcb
 * lists; the lock/unlock call sites are not visible here -- confirm.
 */
159 static struct mtx divert_mtx
;
160 MTX_SYSINIT(divert
, &divert_mtx
, "divert(4) socket pcb lists", MTX_DEF
);
161 #define DIVERT_LOCK() mtx_lock(&divert_mtx)
162 #define DIVERT_UNLOCK() mtx_unlock(&divert_mtx)
165 * Divert a packet by passing it up to the divert socket bound to the port recorded in the packet's ipfw or pf divert tag.
168 divert_packet(struct mbuf
*m
, bool incoming
)
172 struct sockaddr_in divsrc
;
178 mtag
= m_tag_locate(m
, MTAG_IPFW_RULE
, 0, NULL
);
180 cookie
= ((struct ipfw_rule_ref
*)(mtag
+1))->rulenum
;
181 nport
= htons((uint16_t)
182 (((struct ipfw_rule_ref
*)(mtag
+1))->info
));
183 } else if ((mtag
= m_tag_locate(m
, MTAG_PF_DIVERT
, 0, NULL
)) != NULL
) {
184 cookie
= ((struct pf_divert_mtag
*)(mtag
+1))->idir
;
185 nport
= htons(((struct pf_divert_mtag
*)(mtag
+1))->port
);
191 if (m
->m_len
< sizeof(struct ip
) &&
192 (m
= m_pullup(m
, sizeof(struct ip
))) == NULL
)
195 /* Delayed checksums are currently not compatible with divert. */
196 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
198 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
200 #if defined(SCTP) || defined(SCTP_SUPPORT)
201 if (m
->m_pkthdr
.csum_flags
& CSUM_SCTP
) {
204 ip
= mtod(m
, struct ip
*);
205 sctp_delayed_cksum(m
, (uint32_t)(ip
->ip_hl
<< 2));
206 m
->m_pkthdr
.csum_flags
&= ~CSUM_SCTP
;
211 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA_IPV6
) {
212 in6_delayed_cksum(m
, m
->m_pkthdr
.len
-
213 sizeof(struct ip6_hdr
), sizeof(struct ip6_hdr
));
214 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA_IPV6
;
216 #if defined(SCTP) || defined(SCTP_SUPPORT)
217 if (m
->m_pkthdr
.csum_flags
& CSUM_SCTP_IPV6
) {
218 sctp_delayed_cksum(m
, sizeof(struct ip6_hdr
));
219 m
->m_pkthdr
.csum_flags
&= ~CSUM_SCTP_IPV6
;
223 bzero(&divsrc
, sizeof(divsrc
));
224 divsrc
.sin_len
= sizeof(divsrc
);
225 divsrc
.sin_family
= AF_INET
;
226 /* record matching rule, in host format */
227 divsrc
.sin_port
= cookie
;
229 * Record receive interface address, if any.
230 * But only for incoming packets.
239 /* Find IP address for receive interface */
240 ifp
= m
->m_pkthdr
.rcvif
;
241 CK_STAILQ_FOREACH(ifa
, &ifp
->if_addrhead
, ifa_link
) {
242 if (ifa
->ifa_addr
->sa_family
!= AF_INET
)
245 ((struct sockaddr_in
*) ifa
->ifa_addr
)->sin_addr
;
250 * Record the incoming interface name whenever we have one.
252 if (m
->m_pkthdr
.rcvif
) {
254 * Hide the actual interface name in there in the
255 * sin_zero array. XXX This needs to be moved to a
256 * different sockaddr type for divert, e.g.
257 * sockaddr_div with multiple fields like
258 * sockaddr_dl. Presently we have only 7 bytes
259 * but that will do for now as most interfaces
260 * are 4 or less + 2 or less bytes for unit.
261 * There is probably a faster way of doing this,
262 * possibly taking it from the sockaddr_dl on the iface.
263 * This solves the problem of a P2P link and a LAN interface
264 * having the same address, which can result in the wrong
265 * interface being assigned to the packet when fed back
266 * into the divert socket. Theoretically if the daemon saves
267 * and re-uses the sockaddr_in as suggested in the man pages,
268 * this iface name will come along for the ride.
269 * (see div_output for the other half of this.)
271 strlcpy(divsrc
.sin_zero
, m
->m_pkthdr
.rcvif
->if_xname
,
272 sizeof(divsrc
.sin_zero
));
275 /* Put packet on socket queue, if any */
276 SLIST_FOREACH(dcb
, &V_divhash
[DIVHASH(nport
)], dcb_next
)
277 if (dcb
->dcb_port
== nport
)
281 struct socket
*sa
= dcb
->dcb_socket
;
283 SOCKBUF_LOCK(&sa
->so_rcv
);
284 if (sbappendaddr_locked(&sa
->so_rcv
,
285 (struct sockaddr
*)&divsrc
, m
, NULL
) == 0) {
286 soroverflow_locked(sa
);
289 sorwakeup_locked(sa
);
290 DIVSTAT_INC(diverted
);
299 * Deliver packet back into the IP processing machinery.
301 * If no address specified, or address is 0.0.0.0, send to ip_output();
302 * otherwise, send to ip_input() and mark as having been received on
303 * the interface with that address.
306 div_send(struct socket
*so
, int flags
, struct mbuf
*m
, struct sockaddr
*nam
,
307 struct mbuf
*control
, struct thread
*td
)
309 struct epoch_tracker et
;
310 struct sockaddr_in
*sin
= (struct sockaddr_in
*)nam
;
313 struct ipfw_rule_ref
*dt
;
314 struct pf_divert_mtag
*pfdt
;
320 /* Packet must have a header (but that's about it) */
321 if (m
->m_len
< sizeof (struct ip
) &&
322 (m
= m_pullup(m
, sizeof (struct ip
))) == NULL
) {
328 if (sin
->sin_family
!= AF_INET
) {
330 return (EAFNOSUPPORT
);
332 if (sin
->sin_len
!= sizeof(*sin
)) {
339 * An mbuf may not have come from userland, but we pretend
342 m
->m_pkthdr
.rcvif
= NULL
;
344 M_SETFIB(m
, so
->so_fibnum
);
346 mtag
= m_tag_locate(m
, MTAG_IPFW_RULE
, 0, NULL
);
348 /* this should be normal */
349 mtag
= m_tag_alloc(MTAG_IPFW_RULE
, 0,
350 sizeof(struct ipfw_rule_ref
), M_NOWAIT
| M_ZERO
);
355 m_tag_prepend(m
, mtag
);
357 dt
= (struct ipfw_rule_ref
*)(mtag
+1);
359 /* Loopback avoidance and state recovery */
363 /* set the starting point. We provide a non-zero slot,
364 * but a non_matching chain_id to skip that info and use
365 * the rulenum/rule_id.
367 dt
->slot
= 1; /* dummy, chain_id is invalid */
369 dt
->rulenum
= sin
->sin_port
+1; /* host format ? */
371 /* XXX: broken for IPv6 */
373 * Find receive interface with the given name, stuffed
374 * (if it exists) in the sin_zero[] field.
375 * The name is user supplied data so don't trust its size
376 * or that it is zero terminated.
378 for (i
= 0; i
< sizeof(sin
->sin_zero
) && sin
->sin_zero
[i
]; i
++)
380 if ( i
> 0 && i
< sizeof(sin
->sin_zero
))
381 m
->m_pkthdr
.rcvif
= ifunit(sin
->sin_zero
);
384 ip
= mtod(m
, struct ip
*);
392 case IPV6_VERSION
>> 4:
398 return (EAFNOSUPPORT
);
401 mtag
= m_tag_locate(m
, MTAG_PF_DIVERT
, 0, NULL
);
403 /* this should be normal */
404 mtag
= m_tag_alloc(MTAG_PF_DIVERT
, 0,
405 sizeof(struct pf_divert_mtag
), M_NOWAIT
| M_ZERO
);
410 m_tag_prepend(m
, mtag
);
412 pfdt
= (struct pf_divert_mtag
*)(mtag
+1);
414 pfdt
->idir
= sin
->sin_port
;
416 /* Reinject packet into the system as incoming or outgoing */
418 if (!sin
|| sin
->sin_addr
.s_addr
== 0) {
419 dt
->info
|= IPFW_IS_DIVERT
| IPFW_INFO_OUT
;
420 pfdt
->ndir
= PF_DIVERT_MTAG_DIR_OUT
;
421 error
= div_output_outbound(family
, so
, m
);
423 dt
->info
|= IPFW_IS_DIVERT
| IPFW_INFO_IN
;
424 pfdt
->ndir
= PF_DIVERT_MTAG_DIR_IN
;
425 error
= div_output_inbound(family
, so
, m
, sin
);
433 * Sends mbuf @m to the wire via ip[6]_output().
435 * Returns 0 on success or an errno value on failure. @m is always consumed.
438 div_output_outbound(int family
, struct socket
*so
, struct mbuf
*m
)
446 struct ip
*const ip
= mtod(m
, struct ip
*);
448 /* Don't allow packet length sizes that will crash. */
449 if (((u_short
)ntohs(ip
->ip_len
) > m
->m_pkthdr
.len
)) {
459 struct ip6_hdr
*const ip6
= mtod(m
, struct ip6_hdr
*);
461 /* Don't allow packet length sizes that will crash */
462 if (((u_short
)ntohs(ip6
->ip6_plen
) > m
->m_pkthdr
.len
)) {
472 mac_socket_create_mbuf(so
, m
);
479 error
= ip_output(m
, NULL
, NULL
,
480 ((so
->so_options
& SO_DONTROUTE
) ? IP_ROUTETOIF
: 0)
481 | IP_ALLOWBROADCAST
| IP_RAWOUTPUT
, NULL
, NULL
);
486 error
= ip6_output(m
, NULL
, NULL
, 0, NULL
, NULL
, NULL
);
491 DIVSTAT_INC(outbound
);
497 * Schedules mbuf @m for local processing via IPv4/IPv6 netisr queue.
499 * Returns 0 on success or an errno value on failure. @m is always consumed.
502 div_output_inbound(int family
, struct socket
*so
, struct mbuf
*m
,
503 struct sockaddr_in
*sin
)
507 if (m
->m_pkthdr
.rcvif
== NULL
) {
509 * No luck with the name, check by IP address.
510 * Clear the port and the ifname to make sure
511 * there are no distractions for ifa_ifwithaddr.
514 /* XXX: broken for IPv6 */
515 bzero(sin
->sin_zero
, sizeof(sin
->sin_zero
));
517 ifa
= ifa_ifwithaddr((struct sockaddr
*) sin
);
520 return (EADDRNOTAVAIL
);
522 m
->m_pkthdr
.rcvif
= ifa
->ifa_ifp
;
525 mac_socket_create_mbuf(so
, m
);
527 /* Send packet to input processing via netisr */
534 ip
= mtod(m
, struct ip
*);
536 * Restore M_BCAST flag when destination address is
537 * broadcast. It is expected by ip_tryforward().
539 if (IN_MULTICAST(ntohl(ip
->ip_dst
.s_addr
)))
540 m
->m_flags
|= M_MCAST
;
541 else if (in_broadcast(ip
->ip_dst
, m
->m_pkthdr
.rcvif
))
542 m
->m_flags
|= M_BCAST
;
543 netisr_queue_src(NETISR_IP
, (uintptr_t)so
, m
);
544 DIVSTAT_INC(inbound
);
550 netisr_queue_src(NETISR_IPV6
, (uintptr_t)so
, m
);
551 DIVSTAT_INC(inbound
);
563 div_attach(struct socket
*so
, int proto
, struct thread
*td
)
569 error
= priv_check(td
, PRIV_NETINET_DIVERT
);
573 error
= soreserve(so
, div_sendspace
, div_recvspace
);
576 dcb
= malloc(sizeof(*dcb
), M_PCB
, M_WAITOK
);
577 dcb
->dcb_bound
= DCB_UNBOUND
;
578 dcb
->dcb_socket
= so
;
581 dcb
->dcb_gencnt
= ++V_dcb_gencnt
;
589 div_free(epoch_context_t ctx
)
591 struct divcb
*dcb
= __containerof(ctx
, struct divcb
, dcb_epochctx
);
597 div_detach(struct socket
*so
)
599 struct divcb
*dcb
= so
->so_pcb
;
603 if (dcb
->dcb_bound
!= DCB_UNBOUND
)
604 SLIST_REMOVE(&V_divhash
[DCBHASH(dcb
)], dcb
, divcb
, dcb_next
);
608 NET_EPOCH_CALL(div_free
, &dcb
->dcb_epochctx
);
612 div_bind(struct socket
*so
, struct sockaddr
*nam
, struct thread
*td
)
617 if (nam
->sa_family
!= AF_INET
)
619 if (nam
->sa_len
!= sizeof(struct sockaddr_in
))
621 port
= ((struct sockaddr_in
*)nam
)->sin_port
;
623 SLIST_FOREACH(dcb
, &V_divhash
[DIVHASH(port
)], dcb_next
)
624 if (dcb
->dcb_port
== port
) {
629 if (dcb
->dcb_bound
!= DCB_UNBOUND
)
630 SLIST_REMOVE(&V_divhash
[DCBHASH(dcb
)], dcb
, divcb
, dcb_next
);
631 dcb
->dcb_port
= port
;
632 SLIST_INSERT_HEAD(&V_divhash
[DIVHASH(port
)], dcb
, dcb_next
);
639 div_pcblist(SYSCTL_HANDLER_ARGS
)
645 if (req
->newptr
!= 0)
648 if (req
->oldptr
== 0) {
652 n
+= imax(n
/ 8, 10);
653 req
->oldidx
= 2 * (sizeof xig
) + n
* sizeof(struct xinpcb
);
657 if ((error
= sysctl_wire_old_buffer(req
, 0)) != 0)
660 bzero(&xig
, sizeof(xig
));
661 xig
.xig_len
= sizeof xig
;
662 xig
.xig_count
= V_dcb_count
;
663 xig
.xig_gen
= V_dcb_gencnt
;
664 xig
.xig_sogen
= so_gencnt
;
665 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
670 for (int i
= 0; i
< DIVHASHSIZE
; i
++)
671 SLIST_FOREACH(dcb
, &V_divhash
[i
], dcb_next
) {
672 if (dcb
->dcb_gencnt
<= xig
.xig_gen
) {
675 bzero(&xi
, sizeof(xi
));
676 xi
.xi_len
= sizeof(struct xinpcb
);
677 sotoxsocket(dcb
->dcb_socket
, &xi
.xi_socket
);
678 xi
.inp_gencnt
= dcb
->dcb_gencnt
;
679 xi
.inp_vflag
= INP_IPV4
; /* XXX: netstat(1) */
680 xi
.inp_inc
.inc_ie
.ie_lport
= dcb
->dcb_port
;
681 error
= SYSCTL_OUT(req
, &xi
, sizeof xi
);
688 * Give the user an updated idea of our state.
689 * If the generation differs from what we told
690 * her before, she knows that something happened
691 * while we were processing this request, and it
692 * might be necessary to retry.
694 xig
.xig_gen
= V_dcb_gencnt
;
695 xig
.xig_sogen
= so_gencnt
;
696 xig
.xig_count
= V_dcb_count
;
697 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
/*
 * Read-only opaque sysctl "pcblist" under _net_inet_divert, served by the
 * div_pcblist() handler and formatted as "S,xinpcb".
 */
704 SYSCTL_PROC(_net_inet_divert
, OID_AUTO
, pcblist
,
705 CTLTYPE_OPAQUE
| CTLFLAG_RD
| CTLFLAG_MPSAFE
, NULL
, 0, div_pcblist
,
706 "S,xinpcb", "List of active divert sockets");
708 static struct protosw div_protosw
= {
710 .pr_flags
= PR_ATOMIC
|PR_ADDR
,
711 .pr_attach
= div_attach
,
713 .pr_detach
= div_detach
,
717 static struct domain divertdomain
= {
718 .dom_family
= PF_DIVERT
,
719 .dom_name
= "divert",
721 .dom_protosw
= { &div_protosw
},
725 div_modevent(module_t mod
, int type
, void *unused
)
731 domain_add(&divertdomain
);
732 ip_divert_ptr
= divert_packet
;
736 * IPDIVERT may normally not be unloaded because of the
737 * potential race conditions. Tell kldunload we can't be
738 * unloaded unless the unload is forced.
746 * Module ipdivert can only be unloaded if no sockets are
747 * connected. Maybe this can be changed later to forcefully
748 * disconnect any open sockets.
750 * XXXRW: Note that there is a slight race here, as a new
751 * socket open request could be spinning on the lock and then
752 * we destroy the lock.
754 * XXXGL: One more reason this code is incorrect is that it
755 * checks only the current vnet.
758 if (V_dcb_count
!= 0) {
764 ip_divert_ptr
= NULL
;
765 domain_remove(&divertdomain
);
774 static moduledata_t ipdivertmod
= {
/*
 * Register the ipdivert module, described by ipdivertmod, at
 * SI_SUB_PROTO_FIREWALL / SI_ORDER_ANY and declare it as version 1.
 */
780 DECLARE_MODULE(ipdivert
, ipdivertmod
, SI_SUB_PROTO_FIREWALL
, SI_ORDER_ANY
);
781 MODULE_VERSION(ipdivert
, 1);