etc/services - sync with NetBSD-8
[minix.git] / minix / net / lwip / rawsock.c
blobd00df01e4479ae8bb6b6c1fea59f8f194100a042
/* LWIP service - rawsock.c - RAW sockets */
/*
 * For IPv6 sockets, this module attempts to implement a part of RFC 3542, but
 * currently not more than what is supported by lwIP and/or what is expected by
 * a handful of standard utilities (dhcpcd, ping6, traceroute6..).
 *
 * For general understanding, be aware that IPv4 raw sockets always receive
 * packets including the IP header, and may be used to send packets including
 * the IP header if IP_HDRINCL is set, while IPv6 raw sockets always send and
 * receive actual payloads only, using ancillary (control) data to set and
 * retrieve per-packet IP header fields.
 *
 * For packet headers we follow general BSD semantics.  For example, some IPv4
 * header fields are swapped both when sending and when receiving.  Also, like
 * on NetBSD, IPPROTO_RAW is not a special value in any way.
 */
18 #include "lwip.h"
19 #include "ifaddr.h"
20 #include "pktsock.h"
22 #include "lwip/raw.h"
23 #include "lwip/inet_chksum.h"
25 #include <net/route.h>
26 #include <netinet/icmp6.h>
27 #include <netinet/ip.h>
28 #include <netinet/in_pcb.h>
/* The number of RAW sockets.  Inherited from the lwIP configuration. */
#define NR_RAWSOCK	MEMP_NUM_RAW_PCB

/*
 * Outgoing packets are not getting buffered, so the send buffer size simply
 * determines the maximum size for sent packets.  The send buffer maximum is
 * therefore limited to the maximum size of a single packet (64K-1 bytes),
 * which is already enforced by lwIP's 16-bit length parameter to pbuf_alloc().
 *
 * The actual transmission may enforce a lower limit, though.  The full packet
 * size must not exceed the same 64K-1 limit, and that includes any headers
 * that still have to be prepended to the given packet.  The size of those
 * headers depends on the socket type (IPv4/IPv6) and the IP_HDRINCL setting.
 *
 * The default is equal to the maximum here, because if a (by definition,
 * privileged) application wishes to send large raw packets, it probably has a
 * good reason, and we do not want to get in its way.
 */
#define RAW_MAX_PAYLOAD	(UINT16_MAX)

/* Send buffer limits: minimum, default, and maximum size. */
#define RAW_SNDBUF_MIN	1
#define RAW_SNDBUF_DEF	RAW_MAX_PAYLOAD
#define RAW_SNDBUF_MAX	RAW_MAX_PAYLOAD

/* Receive buffer limits: minimum, default, and maximum size. */
#define RAW_RCVBUF_MIN	MEMPOOL_BUFSIZE
#define RAW_RCVBUF_DEF	32768
#define RAW_RCVBUF_MAX	65536
57 static struct rawsock {
58 struct pktsock raw_pktsock; /* packet socket object */
59 struct raw_pcb *raw_pcb; /* lwIP RAW control block */
60 TAILQ_ENTRY(rawsock) raw_next; /* next in active/free list */
61 struct icmp6_filter raw_icmp6filter; /* ICMPv6 type filter */
62 } raw_array[NR_RAWSOCK];
64 static TAILQ_HEAD(, rawsock) raw_freelist; /* list of free RAW sockets */
65 static TAILQ_HEAD(, rawsock) raw_activelist; /* list, in-use RAW sockets */
67 static const struct sockevent_ops rawsock_ops;
69 #define rawsock_get_sock(raw) (ipsock_get_sock(rawsock_get_ipsock(raw)))
70 #define rawsock_get_ipsock(raw) (pktsock_get_ipsock(&(raw)->raw_pktsock))
71 #define rawsock_is_ipv6(raw) (ipsock_is_ipv6(rawsock_get_ipsock(raw)))
72 #define rawsock_is_v6only(raw) (ipsock_is_v6only(rawsock_get_ipsock(raw)))
73 #define rawsock_is_conn(raw) \
74 (raw_flags((raw)->raw_pcb) & RAW_FLAGS_CONNECTED)
75 #define rawsock_is_hdrincl(raw) \
76 (raw_flags((raw)->raw_pcb) & RAW_FLAGS_HDRINCL)
78 static ssize_t rawsock_pcblist(struct rmib_call *, struct rmib_node *,
79 struct rmib_oldp *, struct rmib_newp *);
81 /* The CTL_NET {PF_INET,PF_INET6} IPPROTO_RAW subtree. */
82 /* All dynamically numbered; the sendspace/recvspace entries are ours. */
83 static struct rmib_node net_inet_raw_table[] = {
84 RMIB_INT(RMIB_RO, RAW_SNDBUF_DEF, "sendspace",
85 "Default RAW send buffer size"),
86 RMIB_INT(RMIB_RO, RAW_RCVBUF_DEF, "recvspace",
87 "Default RAW receive buffer size"),
88 RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, rawsock_pcblist, "pcblist",
89 "RAW IP protocol control block list"),
92 static struct rmib_node net_inet_raw_node =
93 RMIB_NODE(RMIB_RO, net_inet_raw_table, "raw", "RAW IPv4 settings");
94 static struct rmib_node net_inet6_raw6_node =
95 RMIB_NODE(RMIB_RO, net_inet_raw_table, "raw6", "RAW IPv6 settings");
98 * Initialize the raw sockets module.
100 void
101 rawsock_init(void)
103 unsigned int slot;
105 /* Initialize the list of free RAW sockets. */
106 TAILQ_INIT(&raw_freelist);
108 for (slot = 0; slot < __arraycount(raw_array); slot++)
109 TAILQ_INSERT_TAIL(&raw_freelist, &raw_array[slot], raw_next);
111 /* Initialize the list of active RAW sockets. */
112 TAILQ_INIT(&raw_activelist);
114 /* Register the net.inet.raw and net.inet6.raw6 RMIB subtrees. */
115 mibtree_register_inet(PF_INET, IPPROTO_RAW, &net_inet_raw_node);
116 mibtree_register_inet(PF_INET6, IPPROTO_RAW, &net_inet6_raw6_node);
120 * Check whether the given arrived IPv6 packet is fit to be received on the
121 * given raw socket.
123 static int
124 rawsock_check_v6(struct rawsock * raw, struct pbuf * pbuf)
126 uint8_t type;
128 assert(rawsock_is_ipv6(raw));
131 * For ICMPv6 packets, test against the configured type filter.
133 if (raw->raw_pcb->protocol == IPPROTO_ICMPV6) {
134 if (pbuf->len < offsetof(struct icmp6_hdr, icmp6_dataun))
135 return FALSE;
137 memcpy(&type, &((struct icmp6_hdr *)pbuf->payload)->icmp6_type,
138 sizeof(type));
140 if (!ICMP6_FILTER_WILLPASS((int)type, &raw->raw_icmp6filter))
141 return FALSE;
145 * For ICMPv6 packets, or if IPV6_CHECKSUM is enabled, we have to
146 * verify the checksum of the packet before passing it to the user.
147 * This is costly, but it needs to be done and lwIP is not doing it for
148 * us (as of writing, anyway), even though it maintains the offset..
150 if (raw->raw_pcb->chksum_reqd &&
151 (pbuf->tot_len < raw->raw_pcb->chksum_offset + sizeof(uint16_t) ||
152 ip6_chksum_pseudo(pbuf, raw->raw_pcb->protocol, pbuf->tot_len,
153 ip6_current_src_addr(), ip6_current_dest_addr()) != 0)) {
154 return FALSE;
157 /* No reason to filter out this packet. */
158 return TRUE;
162 * Adjust the given arrived IPv4 packet by changing the length and offset
163 * fields to host-byte order, as is done by the BSDs. This effectively mirrors
164 * the swapping part of the preparation done on IPv4 packets being sent if the
165 * IP_HDRINCL socket option is enabled.
167 static void
168 rawsock_adjust_v4(struct pbuf * pbuf)
170 struct ip_hdr *iphdr;
172 if (pbuf->len < sizeof(struct ip_hdr))
173 return;
175 iphdr = (struct ip_hdr *)pbuf->payload;
178 * W. Richard Stevens also mentions ip_id, but at least on NetBSD that
179 * field seems to be swapped neither when sending nor when receiving..
181 IPH_LEN(iphdr) = htons(IPH_LEN(iphdr));
182 IPH_OFFSET(iphdr) = htons(IPH_OFFSET(iphdr));
186 * A packet has arrived on a raw socket. Since the same packet may have to be
187 * delivered to multiple raw sockets, we always return 0 (= not consumed) from
188 * this function. As such, we must make a copy of the given packet if we want
189 * to keep it, and never free it.
191 static uint8_t
192 rawsock_input(void * arg, struct raw_pcb * pcb __unused, struct pbuf * psrc,
193 const ip_addr_t * srcaddr)
195 struct rawsock *raw = (struct rawsock *)arg;
196 struct pbuf *pbuf;
197 int off, hdrlen;
199 assert(raw->raw_pcb == pcb);
202 * If adding this packet would cause the receive buffer to go beyond
203 * the current limit, drop the new packet. This is just an estimation,
204 * because the copy we are about to make may not take the exact same
205 * amount of memory, due to the fact that 1) the pbuf we're given has
206 * an unknown set of headers in front of it, and 2) we need to store
207 * extra information in our copy. The return value of this call, if
208 * not -1, is the number of bytes we need to reserve to store that
209 * extra information.
211 if ((hdrlen = pktsock_test_input(&raw->raw_pktsock, psrc)) < 0)
212 return 0;
215 * Raw IPv6 sockets receive only the actual packet data, whereas raw
216 * IPv4 sockets receive the IP header as well.
218 if (ip_current_is_v6()) {
219 off = ip_current_header_tot_len();
221 util_pbuf_header(psrc, -off);
223 if (!rawsock_check_v6(raw, psrc)) {
224 util_pbuf_header(psrc, off);
226 return 0;
228 } else {
230 * For IPv6 sockets, drop the packet if it was sent as an IPv4
231 * packet and checksumming is enabled (this includes ICMPv6).
232 * Otherwise, the packet would bypass the above checks that we
233 * perform on IPv6 packets. Applications that want to use a
234 * dual-stack protocol with checksumming will have to do the
235 * checksum verification part themselves. Presumably the two
236 * different pseudoheaders would result in different checksums
237 * anyhow, so it would be useless to try to support that.
239 * Beyond that, for IPv4 packets on IPv6 sockets, hide the IPv4
240 * header.
242 if (rawsock_is_ipv6(raw)) {
243 if (raw->raw_pcb->chksum_reqd)
244 return 0;
246 off = IP_HLEN;
248 util_pbuf_header(psrc, -off);
249 } else
250 off = 0;
254 * We need to make a copy of the incoming packet. If we eat the one
255 * given to us, this will 1) stop any other raw sockets from getting
256 * the same packet, 2) allow a single raw socket to discard all TCP/UDP
257 * traffic, and 3) present us with a problem on how to store ancillary
258 * data. Raw sockets are not that performance critical so the extra
259 * copy -even when not always necessary- is not that big of a deal.
261 if ((pbuf = pchain_alloc(PBUF_RAW, hdrlen + psrc->tot_len)) == NULL) {
262 if (off > 0)
263 util_pbuf_header(psrc, off);
265 return 0;
268 util_pbuf_header(pbuf, -hdrlen);
270 if (pbuf_copy(pbuf, psrc) != ERR_OK)
271 panic("unexpected pbuf copy failure");
273 pbuf->flags |= psrc->flags & (PBUF_FLAG_LLMCAST | PBUF_FLAG_LLBCAST);
275 if (off > 0)
276 util_pbuf_header(psrc, off);
278 if (!rawsock_is_ipv6(raw))
279 rawsock_adjust_v4(pbuf);
281 pktsock_input(&raw->raw_pktsock, pbuf, srcaddr, 0);
283 return 0;
287 * Create a raw socket.
289 sockid_t
290 rawsock_socket(int domain, int protocol, struct sock ** sockp,
291 const struct sockevent_ops ** ops)
293 struct rawsock *raw;
294 unsigned int flags;
295 uint8_t ip_type;
297 if (protocol < 0 || protocol > UINT8_MAX)
298 return EPROTONOSUPPORT;
300 if (TAILQ_EMPTY(&raw_freelist))
301 return ENOBUFS;
303 raw = TAILQ_FIRST(&raw_freelist);
306 * Initialize the structure. Do not memset it to zero, as it is still
307 * part of the linked free list. Initialization may still fail.
310 ip_type = pktsock_socket(&raw->raw_pktsock, domain, RAW_SNDBUF_DEF,
311 RAW_RCVBUF_DEF, sockp);
313 /* We should have enough PCBs so this call should not fail.. */
314 if ((raw->raw_pcb = raw_new_ip_type(ip_type, protocol)) == NULL)
315 return ENOBUFS;
316 raw_recv(raw->raw_pcb, rawsock_input, (void *)raw);
318 /* By default, the multicast TTL is 1 and looping is enabled. */
319 raw_set_multicast_ttl(raw->raw_pcb, 1);
321 flags = raw_flags(raw->raw_pcb);
322 raw_setflags(raw->raw_pcb, flags | RAW_FLAGS_MULTICAST_LOOP);
325 * For ICMPv6, checksum generation and verification is mandatory and
326 * type filtering of incoming packets is supported (RFC 3542). For all
327 * other IPv6 protocols, checksumming may be turned on by the user.
329 if (rawsock_is_ipv6(raw) && protocol == IPPROTO_ICMPV6) {
330 raw->raw_pcb->chksum_reqd = 1;
331 raw->raw_pcb->chksum_offset =
332 offsetof(struct icmp6_hdr, icmp6_cksum);
334 ICMP6_FILTER_SETPASSALL(&raw->raw_icmp6filter);
335 } else
336 raw->raw_pcb->chksum_reqd = 0;
338 TAILQ_REMOVE(&raw_freelist, raw, raw_next);
340 TAILQ_INSERT_TAIL(&raw_activelist, raw, raw_next);
342 *ops = &rawsock_ops;
343 return SOCKID_RAW | (sockid_t)(raw - raw_array);
347 * Bind a raw socket to a local address.
349 static int
350 rawsock_bind(struct sock * sock, const struct sockaddr * addr,
351 socklen_t addr_len, endpoint_t user_endpt)
353 struct rawsock *raw = (struct rawsock *)sock;
354 ip_addr_t ipaddr;
355 err_t err;
356 int r;
359 * Raw sockets may be rebound even if that is not too useful. However,
360 * we do not allow (re)binding when the socket is connected, so as to
361 * eliminate any problems with source and destination type mismatches:
362 * such mismatches are detected at connect time, and rebinding would
363 * avoid those, possibly triggering lwIP asserts as a result.
365 if (rawsock_is_conn(raw))
366 return EINVAL;
368 if ((r = ipsock_get_src_addr(rawsock_get_ipsock(raw), addr, addr_len,
369 user_endpt, &raw->raw_pcb->local_ip, 0 /*local_port*/,
370 TRUE /*allow_mcast*/, &ipaddr, NULL /*portp*/)) != OK)
371 return r;
373 err = raw_bind(raw->raw_pcb, &ipaddr);
375 return util_convert_err(err);
379 * Connect a raw socket to a remote address.
381 static int
382 rawsock_connect(struct sock * sock, const struct sockaddr * addr,
383 socklen_t addr_len, endpoint_t user_endpt __unused)
385 struct rawsock *raw = (struct rawsock *)sock;
386 const ip_addr_t *src_addr;
387 ip_addr_t dst_addr;
388 struct ifdev *ifdev;
389 uint32_t ifindex, ifindex2;
390 err_t err;
391 int r;
394 * One may "unconnect" socket by providing an address with family
395 * AF_UNSPEC.
397 if (addr_is_unspec(addr, addr_len)) {
398 raw_disconnect(raw->raw_pcb);
400 return OK;
403 if ((r = ipsock_get_dst_addr(rawsock_get_ipsock(raw), addr, addr_len,
404 &raw->raw_pcb->local_ip, &dst_addr, NULL /*dst_port*/)) != OK)
405 return r;
408 * Bind explicitly to a source address if the PCB is not bound to one
409 * yet. This is expected in the BSD socket API, but lwIP does not do
410 * it for us.
412 if (ip_addr_isany(&raw->raw_pcb->local_ip)) {
413 /* Help the multicast case a bit, if possible. */
414 ifdev = NULL;
415 if (ip_addr_ismulticast(&dst_addr)) {
416 ifindex = pktsock_get_ifindex(&raw->raw_pktsock);
417 ifindex2 = raw_get_multicast_netif_index(raw->raw_pcb);
418 if (ifindex == 0)
419 ifindex = ifindex2;
421 if (ifindex != 0) {
422 ifdev = ifdev_get_by_index(ifindex);
424 if (ifdev == NULL)
425 return ENXIO;
429 src_addr = ifaddr_select(&dst_addr, ifdev, NULL /*ifdevp*/);
431 if (src_addr == NULL)
432 return EHOSTUNREACH;
434 err = raw_bind(raw->raw_pcb, src_addr);
436 if (err != ERR_OK)
437 return util_convert_err(err);
441 * Connecting a raw socket serves two main purposes: 1) the socket uses
442 * the address as destination when sending, and 2) the socket receives
443 * packets from only the connected address.
445 err = raw_connect(raw->raw_pcb, &dst_addr);
447 if (err != ERR_OK)
448 return util_convert_err(err);
450 return OK;
454 * Perform preliminary checks on a send request.
456 static int
457 rawsock_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused,
458 const struct sockaddr * addr, socklen_t addr_len __unused,
459 endpoint_t user_endpt __unused, int flags)
461 struct rawsock *raw = (struct rawsock *)sock;
463 if ((flags & ~MSG_DONTROUTE) != 0)
464 return EOPNOTSUPP;
466 if (!rawsock_is_conn(raw) && addr == NULL)
467 return EDESTADDRREQ;
470 * This is only one part of the length check. The rest is done from
471 * rawsock_send(), once we have more information.
473 if (len > ipsock_get_sndbuf(rawsock_get_ipsock(raw)))
474 return EMSGSIZE;
476 return OK;
480 * Swap IP-level options between the RAW PCB and the packet options structure,
481 * for all options that have their flag set in the packet options structure.
482 * This function is called twice when sending a packet. The result is that the
483 * flagged options are overridden for only the packet being sent.
485 static void
486 rawsock_swap_opt(struct rawsock * raw, struct pktopt * pkto)
488 uint8_t tos, ttl, mcast_ttl;
490 if (pkto->pkto_flags & PKTOF_TOS) {
491 tos = raw->raw_pcb->tos;
492 raw->raw_pcb->tos = pkto->pkto_tos;
493 pkto->pkto_tos = tos;
496 if (pkto->pkto_flags & PKTOF_TTL) {
497 ttl = raw->raw_pcb->ttl;
498 mcast_ttl = raw_get_multicast_ttl(raw->raw_pcb);
499 raw->raw_pcb->ttl = pkto->pkto_ttl;
500 raw_set_multicast_ttl(raw->raw_pcb, pkto->pkto_ttl);
501 pkto->pkto_ttl = ttl;
502 pkto->pkto_mcast_ttl = mcast_ttl;
507 * We are about to send the given packet that already includes an IPv4 header,
508 * because the IP_HDRINCL option is enabled on a raw IPv4 socket. Prepare the
509 * IPv4 header for sending, by modifying a few fields in it, as expected by
510 * userland.
512 static int
513 rawsock_prepare_hdrincl(struct rawsock * raw, struct pbuf * pbuf,
514 const ip_addr_t * src_addr)
516 struct ip_hdr *iphdr;
517 size_t hlen;
520 * lwIP obtains the destination address from the IP packet header in
521 * this case, so make sure the packet has a full-sized header.
523 if (pbuf->len < sizeof(struct ip_hdr))
524 return EINVAL;
526 iphdr = (struct ip_hdr *)pbuf->payload;
529 * Fill in the source address if it is not set, and do the byte
530 * swapping and checksum computation common for the BSDs, without which
531 * ping(8) and traceroute(8) do not work properly. We consider this a
532 * convenience feature, so malformed packets are simply sent as is.
533 * TODO: deal with type punning..
535 hlen = (size_t)IPH_HL(iphdr) << 2;
537 if (pbuf->len >= hlen) {
538 /* Fill in the source address if it is blank. */
539 if (iphdr->src.addr == PP_HTONL(INADDR_ANY)) {
540 assert(IP_IS_V4(src_addr));
542 iphdr->src.addr = ip_addr_get_ip4_u32(src_addr);
545 IPH_LEN(iphdr) = htons(IPH_LEN(iphdr));
546 IPH_OFFSET(iphdr) = htons(IPH_OFFSET(iphdr));
547 IPH_CHKSUM(iphdr) = 0;
549 IPH_CHKSUM(iphdr) = inet_chksum(iphdr, hlen);
552 return OK;
556 * Send a packet on a raw socket.
558 static int
559 rawsock_send(struct sock * sock, const struct sockdriver_data * data,
560 size_t len, size_t * off, const struct sockdriver_data * ctl __unused,
561 socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
562 const struct sockaddr * addr, socklen_t addr_len,
563 endpoint_t user_endpt __unused, int flags, size_t min __unused)
565 struct rawsock *raw = (struct rawsock *)sock;
566 struct pktopt pktopt;
567 struct pbuf *pbuf;
568 struct ifdev *ifdev;
569 struct netif *netif;
570 const ip_addr_t *dst_addrp, *src_addrp;
571 ip_addr_t src_addr, dst_addr; /* for storage only; not always used! */
572 size_t hdrlen;
573 uint32_t ifindex;
574 err_t err;
575 int r;
577 /* Copy in and parse any packet options. */
578 pktopt.pkto_flags = 0;
580 if ((r = pktsock_get_ctl(&raw->raw_pktsock, ctl, ctl_len,
581 &pktopt)) != OK)
582 return r;
585 * For a more in-depth explanation of what is going on here, see the
586 * udpsock module, which has largely the same code but with more
587 * elaborate comments.
591 * Start by checking whether the source address and/or the outgoing
592 * interface are overridden using sticky and/or ancillary options.
594 if ((r = pktsock_get_pktinfo(&raw->raw_pktsock, &pktopt, &ifdev,
595 &src_addr)) != OK)
596 return r;
598 if (ifdev != NULL && !ip_addr_isany(&src_addr)) {
599 /* This is guaranteed to be a proper local unicast address. */
600 src_addrp = &src_addr;
601 } else {
602 src_addrp = &raw->raw_pcb->local_ip;
605 * If the socket is bound to a multicast address, use the
606 * unspecified ('any') address as source address instead. A
607 * real source address will then be selected further below.
609 if (ip_addr_ismulticast(src_addrp))
610 src_addrp = IP46_ADDR_ANY(IP_GET_TYPE(src_addrp));
614 * Determine the destination address to use. If the socket is
615 * connected, always ignore any address provided in the send call.
617 if (!rawsock_is_conn(raw)) {
618 assert(addr != NULL); /* already checked in pre_send */
620 if ((r = ipsock_get_dst_addr(rawsock_get_ipsock(raw), addr,
621 addr_len, src_addrp, &dst_addr, NULL /*dst_port*/)) != OK)
622 return r;
624 dst_addrp = &dst_addr;
625 } else
626 dst_addrp = &raw->raw_pcb->remote_ip;
629 * If the destination is a multicast address, select the outgoing
630 * interface based on the multicast interface index, if one is set.
631 * This must however *not* override an interface index already
632 * specified using IPV6_PKTINFO, as per RFC 3542 Sec. 6.7.
634 if (ifdev == NULL && ip_addr_ismulticast(dst_addrp)) {
635 ifindex = raw_get_multicast_netif_index(raw->raw_pcb);
637 if (ifindex != NETIF_NO_INDEX)
638 ifdev = ifdev_get_by_index(ifindex); /* (may fail) */
642 * If an interface has been determined already now, the send operation
643 * will bypass routing. In that case, we must perform our own checks
644 * on address zone violations, because those will not be made anywhere
645 * else. Subsequent steps below will never introduce violations.
647 if (ifdev != NULL && IP_IS_V6(dst_addrp)) {
648 if (ifaddr_is_zone_mismatch(ip_2_ip6(dst_addrp), ifdev))
649 return EHOSTUNREACH;
651 if (IP_IS_V6(src_addrp) &&
652 ifaddr_is_zone_mismatch(ip_2_ip6(src_addrp), ifdev))
653 return EHOSTUNREACH;
657 * If we do not yet have an interface at this point, perform a route
658 * lookup to determine the outgoing interface, unless MSG_DONTROUTE is
659 * set.
661 if (ifdev == NULL) {
662 if (!(flags & MSG_DONTROUTE)) {
664 * ip_route() should never be called with an
665 * IPADDR_TYPE_ANY type address. This is a lwIP-
666 * internal requirement; while we override both routing
667 * functions, we do not deviate from it.
669 if (IP_IS_ANY_TYPE_VAL(*src_addrp))
670 src_addrp =
671 IP46_ADDR_ANY(IP_GET_TYPE(dst_addrp));
673 /* Perform the route lookup. */
674 if ((netif = ip_route(src_addrp, dst_addrp)) == NULL)
675 return EHOSTUNREACH;
677 ifdev = netif_get_ifdev(netif);
678 } else {
679 if ((ifdev = ifaddr_map_by_subnet(dst_addrp)) == NULL)
680 return EHOSTUNREACH;
685 * At this point we have an outgoing interface. If we do not have a
686 * source address yet, pick one now. As a sidenote, if the destination
687 * address is scoped but has no zone, we could also fill in the zone
688 * now. We let lwIP handle that instead, though.
690 assert(ifdev != NULL);
692 if (ip_addr_isany(src_addrp)) {
693 src_addrp = ifaddr_select(dst_addrp, ifdev, NULL /*ifdevp*/);
695 if (src_addrp == NULL)
696 return EHOSTUNREACH;
700 * Now that we know the full conditions of what we are about to send,
701 * check whether the packet size leaves enough room for lwIP to prepend
702 * headers. If so, allocate a chain of pbufs for the packet.
704 assert(len <= RAW_MAX_PAYLOAD);
706 if (rawsock_is_hdrincl(raw))
707 hdrlen = 0;
708 else if (IP_IS_V6(dst_addrp))
709 hdrlen = IP6_HLEN;
710 else
711 hdrlen = IP_HLEN;
713 if (hdrlen + len > RAW_MAX_PAYLOAD)
714 return EMSGSIZE;
716 if ((pbuf = pchain_alloc(PBUF_IP, len)) == NULL)
717 return ENOBUFS;
719 /* Copy in the packet data. */
720 if ((r = pktsock_get_data(&raw->raw_pktsock, data, len, pbuf)) != OK) {
721 pbuf_free(pbuf);
723 return r;
727 * If the user has turned on IPV6_CHECKSUM, ensure that the packet is
728 * not only large enough to have the checksum stored at the configured
729 * place, but also that the checksum fits within the first pbuf: if we
730 * do not test this here, an assert will trigger in lwIP later. Also
731 * zero out the checksum field first, because lwIP does not do that.
733 if (raw->raw_pcb->chksum_reqd) {
734 if (pbuf->len < raw->raw_pcb->chksum_offset +
735 sizeof(uint16_t)) {
736 pbuf_free(pbuf);
738 return EINVAL;
741 memset((char *)pbuf->payload + raw->raw_pcb->chksum_offset, 0,
742 sizeof(uint16_t));
746 * For sockets where an IPv4 header is already included in the packet,
747 * we need to alter a few header fields to be compatible with BSD.
749 if (rawsock_is_hdrincl(raw) &&
750 (r = rawsock_prepare_hdrincl(raw, pbuf, src_addrp)) != OK) {
751 pbuf_free(pbuf);
753 return r;
756 /* Set broadcast/multicast flags for accounting purposes. */
757 if (ip_addr_ismulticast(dst_addrp))
758 pbuf->flags |= PBUF_FLAG_LLMCAST;
759 else if (ip_addr_isbroadcast(dst_addrp, ifdev_get_netif(ifdev)))
760 pbuf->flags |= PBUF_FLAG_LLBCAST;
762 /* Send the packet. */
763 rawsock_swap_opt(raw, &pktopt);
765 assert(!ip_addr_isany(src_addrp));
766 assert(!ip_addr_ismulticast(src_addrp));
768 err = raw_sendto_if_src(raw->raw_pcb, pbuf, dst_addrp,
769 ifdev_get_netif(ifdev), src_addrp);
771 rawsock_swap_opt(raw, &pktopt);
773 /* Free the pbuf again. */
774 pbuf_free(pbuf);
777 * On success, make sure to return the size of the sent packet as well.
778 * As an aside: ctl_off need not be updated, as it is not returned.
780 if ((r = util_convert_err(err)) == OK)
781 *off = len;
782 return r;
786 * Update the set of flag-type socket options on a raw socket.
788 static void
789 rawsock_setsockmask(struct sock * sock, unsigned int mask)
791 struct rawsock *raw = (struct rawsock *)sock;
794 * FIXME: raw sockets are not supposed to have a broardcast check, so
795 * perhaps just remove this and instead always set SOF_BROADCAST?
797 if (mask & SO_BROADCAST)
798 ip_set_option(raw->raw_pcb, SOF_BROADCAST);
799 else
800 ip_reset_option(raw->raw_pcb, SOF_BROADCAST);
804 * Prepare a helper structure for IP-level option processing.
806 static void
807 rawsock_get_ipopts(struct rawsock * raw, struct ipopts * ipopts)
810 ipopts->local_ip = &raw->raw_pcb->local_ip;
811 ipopts->remote_ip = &raw->raw_pcb->remote_ip;
812 ipopts->tos = &raw->raw_pcb->tos;
813 ipopts->ttl = &raw->raw_pcb->ttl;
814 ipopts->sndmin = RAW_SNDBUF_MIN;
815 ipopts->sndmax = RAW_SNDBUF_MAX;
816 ipopts->rcvmin = RAW_RCVBUF_MIN;
817 ipopts->rcvmax = RAW_RCVBUF_MAX;
821 * Set socket options on a raw socket.
823 static int
824 rawsock_setsockopt(struct sock * sock, int level, int name,
825 const struct sockdriver_data * data, socklen_t len)
827 struct rawsock *raw = (struct rawsock *)sock;
828 struct ipopts ipopts;
829 struct icmp6_filter filter;
830 ip_addr_t ipaddr;
831 struct in_addr in_addr;
832 struct ifdev *ifdev;
833 unsigned int flags;
834 uint32_t ifindex;
835 uint8_t byte;
836 int r, val;
839 * Unfortunately, we have to duplicate most of the multicast options
840 * rather than sharing them with udpsock at the pktsock level. The
841 * reason is that each of the PCBs have their own multicast abstraction
842 * functions and so we cannot merge the rest. Same for getsockopt.
845 switch (level) {
846 case IPPROTO_IP:
847 if (rawsock_is_ipv6(raw))
848 break;
850 switch (name) {
851 case IP_HDRINCL:
852 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
853 len)) != OK)
854 return r;
856 if (val) {
857 raw_setflags(raw->raw_pcb,
858 raw_flags(raw->raw_pcb) |
859 RAW_FLAGS_HDRINCL);
860 } else {
861 raw_setflags(raw->raw_pcb,
862 raw_flags(raw->raw_pcb) &
863 ~RAW_FLAGS_HDRINCL);
866 return OK;
868 case IP_MULTICAST_IF:
869 pktsock_set_mcaware(&raw->raw_pktsock);
871 if ((r = sockdriver_copyin_opt(data, &in_addr,
872 sizeof(in_addr), len)) != OK)
873 return r;
875 ip_addr_set_ip4_u32(&ipaddr, in_addr.s_addr);
877 if ((ifdev = ifaddr_map_by_addr(&ipaddr)) == NULL)
878 return EADDRNOTAVAIL;
880 raw_set_multicast_netif_index(raw->raw_pcb,
881 ifdev_get_index(ifdev));
883 return OK;
885 case IP_MULTICAST_LOOP:
886 pktsock_set_mcaware(&raw->raw_pktsock);
888 if ((r = sockdriver_copyin_opt(data, &byte,
889 sizeof(byte), len)) != OK)
890 return r;
892 flags = raw_flags(raw->raw_pcb);
894 if (byte)
895 flags |= RAW_FLAGS_MULTICAST_LOOP;
896 else
897 flags &= ~RAW_FLAGS_MULTICAST_LOOP;
899 raw_setflags(raw->raw_pcb, flags);
901 return OK;
903 case IP_MULTICAST_TTL:
904 pktsock_set_mcaware(&raw->raw_pktsock);
906 if ((r = sockdriver_copyin_opt(data, &byte,
907 sizeof(byte), len)) != OK)
908 return r;
910 raw_set_multicast_ttl(raw->raw_pcb, byte);
912 return OK;
915 break;
917 case IPPROTO_IPV6:
918 if (!rawsock_is_ipv6(raw))
919 break;
921 switch (name) {
922 case IPV6_CHECKSUM:
923 /* ICMPv6 checksums are always computed. */
924 if (raw->raw_pcb->protocol == IPPROTO_ICMPV6)
925 return EINVAL;
927 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
928 len)) != OK)
929 return r;
931 if (val == -1) {
932 raw->raw_pcb->chksum_reqd = 0;
934 return OK;
935 } else if (val >= 0 && !(val & 1)) {
936 raw->raw_pcb->chksum_reqd = 1;
937 raw->raw_pcb->chksum_offset = val;
939 return OK;
940 } else
941 return EINVAL;
943 case IPV6_MULTICAST_IF:
944 pktsock_set_mcaware(&raw->raw_pktsock);
946 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
947 len)) != OK)
948 return r;
950 if (val != 0) {
951 ifindex = (uint32_t)val;
953 ifdev = ifdev_get_by_index(ifindex);
955 if (ifdev == NULL)
956 return ENXIO;
957 } else
958 ifindex = NETIF_NO_INDEX;
960 raw_set_multicast_netif_index(raw->raw_pcb, ifindex);
962 return OK;
964 case IPV6_MULTICAST_LOOP:
965 pktsock_set_mcaware(&raw->raw_pktsock);
967 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
968 len)) != OK)
969 return r;
971 if (val < 0 || val > 1)
972 return EINVAL;
974 flags = raw_flags(raw->raw_pcb);
976 if (val)
977 flags |= RAW_FLAGS_MULTICAST_LOOP;
978 else
979 flags &= ~RAW_FLAGS_MULTICAST_LOOP;
982 * lwIP's IPv6 functionality does not actually check
983 * this flag at all yet. We set it in the hope that
984 * one day this will magically start working.
986 raw_setflags(raw->raw_pcb, flags);
988 return OK;
990 case IPV6_MULTICAST_HOPS:
991 pktsock_set_mcaware(&raw->raw_pktsock);
993 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
994 len)) != OK)
995 return r;
997 if (val < -1 || val > UINT8_MAX)
998 return EINVAL;
1000 if (val == -1)
1001 val = 1;
1003 raw_set_multicast_ttl(raw->raw_pcb, val);
1005 return OK;
1008 break;
1010 case IPPROTO_ICMPV6:
1011 if (!rawsock_is_ipv6(raw) ||
1012 raw->raw_pcb->protocol != IPPROTO_ICMPV6)
1013 break;
1015 switch (name) {
1016 case ICMP6_FILTER:
1017 /* Who comes up with these stupid exceptions? */
1018 if (len == 0) {
1019 ICMP6_FILTER_SETPASSALL(&raw->raw_icmp6filter);
1021 return OK;
1024 if ((r = sockdriver_copyin_opt(data, &filter,
1025 sizeof(filter), len)) != OK)
1026 return r;
1029 * As always, never copy in the data into the actual
1030 * destination, as any copy may run into a copy fault
1031 * halfway through, potentially leaving the destination
1032 * in a half-updated and thus corrupted state.
1034 memcpy(&raw->raw_icmp6filter, &filter, sizeof(filter));
1036 return OK;
1040 rawsock_get_ipopts(raw, &ipopts);
1042 return pktsock_setsockopt(&raw->raw_pktsock, level, name, data, len,
1043 &ipopts);
1047 * Retrieve socket options on a raw socket.
1049 static int
1050 rawsock_getsockopt(struct sock * sock, int level, int name,
1051 const struct sockdriver_data * data, socklen_t * len)
1053 struct rawsock *raw = (struct rawsock *)sock;
1054 struct ipopts ipopts;
1055 const ip4_addr_t *ip4addr;
1056 struct in_addr in_addr;
1057 struct ifdev *ifdev;
1058 unsigned int flags;
1059 uint32_t ifindex;
1060 uint8_t byte;
1061 int val;
1063 switch (level) {
1064 case IPPROTO_IP:
1065 if (rawsock_is_ipv6(raw))
1066 break;
1068 switch (name) {
1069 case IP_HDRINCL:
1070 val = !!rawsock_is_hdrincl(raw);
1072 return sockdriver_copyout_opt(data, &val, sizeof(val),
1073 len);
1075 case IP_MULTICAST_IF:
1076 ifindex = raw_get_multicast_netif_index(raw->raw_pcb);
1079 * Map back from the interface index to the IPv4
1080 * address assigned to the corresponding interface.
1081 * Should this not work out, return the 'any' address.
1083 if (ifindex != NETIF_NO_INDEX &&
1084 (ifdev = ifdev_get_by_index(ifindex)) != NULL) {
1085 ip4addr =
1086 netif_ip4_addr(ifdev_get_netif(ifdev));
1088 in_addr.s_addr = ip4_addr_get_u32(ip4addr);
1089 } else
1090 in_addr.s_addr = PP_HTONL(INADDR_ANY);
1092 return sockdriver_copyout_opt(data, &in_addr,
1093 sizeof(in_addr), len);
1095 case IP_MULTICAST_LOOP:
1096 flags = raw_flags(raw->raw_pcb);
1098 byte = !!(flags & RAW_FLAGS_MULTICAST_LOOP);
1100 return sockdriver_copyout_opt(data, &byte,
1101 sizeof(byte), len);
1103 case IP_MULTICAST_TTL:
1104 byte = raw_get_multicast_ttl(raw->raw_pcb);
1106 return sockdriver_copyout_opt(data, &byte,
1107 sizeof(byte), len);
1110 break;
1112 case IPPROTO_IPV6:
1113 if (!rawsock_is_ipv6(raw))
1114 break;
1116 switch (name) {
1117 case IPV6_CHECKSUM:
1118 if (raw->raw_pcb->chksum_reqd)
1119 val = raw->raw_pcb->chksum_offset;
1120 else
1121 val = -1;
1123 return sockdriver_copyout_opt(data, &val, sizeof(val),
1124 len);
1126 case IPV6_MULTICAST_IF:
1127 ifindex = raw_get_multicast_netif_index(raw->raw_pcb);
1129 val = (int)ifindex;
1131 return sockdriver_copyout_opt(data, &val, sizeof(val),
1132 len);
1134 case IPV6_MULTICAST_LOOP:
1135 flags = raw_flags(raw->raw_pcb);
1137 val = !!(flags & RAW_FLAGS_MULTICAST_LOOP);
1139 return sockdriver_copyout_opt(data, &val, sizeof(val),
1140 len);
1142 case IPV6_MULTICAST_HOPS:
1143 val = raw_get_multicast_ttl(raw->raw_pcb);
1145 return sockdriver_copyout_opt(data, &val, sizeof(val),
1146 len);
1149 break;
1151 case IPPROTO_ICMPV6:
1152 if (!rawsock_is_ipv6(raw) ||
1153 raw->raw_pcb->protocol != IPPROTO_ICMPV6)
1154 break;
1156 switch (name) {
1157 case ICMP6_FILTER:
1158 return sockdriver_copyout_opt(data,
1159 &raw->raw_icmp6filter,
1160 sizeof(raw->raw_icmp6filter), len);
1163 break;
1166 rawsock_get_ipopts(raw, &ipopts);
1168 return pktsock_getsockopt(&raw->raw_pktsock, level, name, data, len,
1169 &ipopts);
1173 * Retrieve the local socket address of a raw socket.
1175 static int
1176 rawsock_getsockname(struct sock * sock, struct sockaddr * addr,
1177 socklen_t * addr_len)
1179 struct rawsock *raw = (struct rawsock *)sock;
1181 ipsock_put_addr(rawsock_get_ipsock(raw), addr, addr_len,
1182 &raw->raw_pcb->local_ip, 0 /*port*/);
1184 return OK;
1188 * Retrieve the remote socket address of a raw socket.
1190 static int
1191 rawsock_getpeername(struct sock * sock, struct sockaddr * addr,
1192 socklen_t * addr_len)
1194 struct rawsock *raw = (struct rawsock *)sock;
1196 if (!rawsock_is_conn(raw))
1197 return ENOTCONN;
1199 ipsock_put_addr(rawsock_get_ipsock(raw), addr, addr_len,
1200 &raw->raw_pcb->remote_ip, 0 /*port*/);
1202 return OK;
1206 * Shut down a raw socket for reading and/or writing.
1208 static int
1209 rawsock_shutdown(struct sock * sock, unsigned int mask)
1211 struct rawsock *raw = (struct rawsock *)sock;
1213 if (mask & SFL_SHUT_RD)
1214 raw_recv(raw->raw_pcb, NULL, NULL);
1216 pktsock_shutdown(&raw->raw_pktsock, mask);
1218 return OK;
1222 * Close a raw socket.
1224 static int
1225 rawsock_close(struct sock * sock, int force __unused)
1227 struct rawsock *raw = (struct rawsock *)sock;
1229 raw_recv(raw->raw_pcb, NULL, NULL);
1231 raw_remove(raw->raw_pcb);
1232 raw->raw_pcb = NULL;
1234 pktsock_close(&raw->raw_pktsock);
1236 return OK;
1240 * Free up a closed raw socket.
1242 static void
1243 rawsock_free(struct sock * sock)
1245 struct rawsock *raw = (struct rawsock *)sock;
1247 assert(raw->raw_pcb == NULL);
1249 TAILQ_REMOVE(&raw_activelist, raw, raw_next);
1251 TAILQ_INSERT_HEAD(&raw_freelist, raw, raw_next);
1255 * Fill the given kinfo_pcb sysctl(7) structure with information about the RAW
1256 * PCB identified by the given pointer.
1258 static void
1259 rawsock_get_info(struct kinfo_pcb * ki, const void * ptr)
1261 const struct raw_pcb *pcb = (const struct raw_pcb *)ptr;
1262 struct rawsock *raw;
1264 /* We iterate our own list so we can't find "strange" PCBs. */
1265 raw = (struct rawsock *)pcb->recv_arg;
1266 assert(raw >= raw_array &&
1267 raw < &raw_array[__arraycount(raw_array)]);
1269 ki->ki_type = SOCK_RAW;
1270 ki->ki_protocol = pcb->protocol;
1272 ipsock_get_info(ki, &pcb->local_ip, 0 /*local_port*/,
1273 &raw->raw_pcb->remote_ip, 0 /*remote_port*/);
1275 /* TODO: change this so that sockstat(1) may work one day. */
1276 ki->ki_sockaddr = (uint64_t)(uintptr_t)rawsock_get_sock(raw);
1278 ki->ki_rcvq = pktsock_get_recvlen(&raw->raw_pktsock);
1280 if (rawsock_is_hdrincl(raw))
1281 ki->ki_pflags |= INP_HDRINCL;
1285 * Given either NULL or a previously returned RAW PCB pointer, return the first
1286 * or next RAW PCB pointer, or NULL if there are no more. lwIP does not expose
1287 * 'raw_pcbs', but other modules in this service may also use RAW PCBs (which
1288 * should then stay hidden), so we iterate through our own list instead.
1290 static const void *
1291 rawsock_enum(const void * last)
1293 const struct raw_pcb *pcb;
1294 struct rawsock *raw;
1296 if (last != NULL) {
1297 pcb = (const struct raw_pcb *)last;
1299 raw = (struct rawsock *)pcb->recv_arg;
1300 assert(raw >= raw_array &&
1301 raw < &raw_array[__arraycount(raw_array)]);
1303 raw = TAILQ_NEXT(raw, raw_next);
1304 } else
1305 raw = TAILQ_FIRST(&raw_activelist);
1307 if (raw != NULL)
1308 return raw->raw_pcb;
1309 else
1310 return NULL;
1314 * Obtain the list of RAW protocol control blocks, for sysctl(7).
1316 static ssize_t
1317 rawsock_pcblist(struct rmib_call * call, struct rmib_node * node,
1318 struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
1321 return util_pcblist(call, oldp, rawsock_enum, rawsock_get_info);
1324 static const struct sockevent_ops rawsock_ops = {
1325 .sop_bind = rawsock_bind,
1326 .sop_connect = rawsock_connect,
1327 .sop_pre_send = rawsock_pre_send,
1328 .sop_send = rawsock_send,
1329 .sop_pre_recv = pktsock_pre_recv,
1330 .sop_recv = pktsock_recv,
1331 .sop_test_recv = pktsock_test_recv,
1332 .sop_ioctl = ifconf_ioctl,
1333 .sop_setsockmask = rawsock_setsockmask,
1334 .sop_setsockopt = rawsock_setsockopt,
1335 .sop_getsockopt = rawsock_getsockopt,
1336 .sop_getsockname = rawsock_getsockname,
1337 .sop_getpeername = rawsock_getpeername,
1338 .sop_shutdown = rawsock_shutdown,
1339 .sop_close = rawsock_close,
1340 .sop_free = rawsock_free