Sync with cat.c from netbsd-8
[minix3.git] / minix / net / lwip / pktsock.c
blob5ddb9b55a9987bfeadc27266411d848e80807f75
1 /* LWIP service - pktsock.c - packet code shared between UDP and RAW */
3 #include "lwip.h"
4 #include "pktsock.h"
5 #include "ifaddr.h"
/*
 * Size of the shared control (ancillary) data buffer.  According to RFC 3542
 * such a buffer ought to be much larger (10KB or more), but none of the
 * ancillary options supported here need anywhere near that much space.
 */
#define PKTSOCK_CTLBUF_SIZE	256

/* Single scratch buffer in which control data is parsed and generated. */
static char pktsock_ctlbuf[PKTSOCK_CTLBUF_SIZE];
/*
 * Ancillary information stored with each received packet.
 *
 * A single generic structure holding full ip_addr_t source and destination
 * addresses is deliberately not used.  For UDP packets this information is
 * written over the received (ethernet and IP) headers, and a structure with
 * IPv6-sized addresses would not fit in the header space of an IPv4 packet.
 * The information is therefore split in two: an address structure that is
 * specific to IPv4 or IPv6, and this generic header on top of it, which also
 * records which of the two address structures lies underneath.  Together,
 * the two must fit in the IP header.  For IPv6 the margin is so small that
 * packed addresses are needed; for IPv4, regular addresses are used for
 * simplicity.
 */
struct pkthdr {
	uint16_t port;		/* UDP only: source port number */
	uint8_t dstif;		/* index of interface the packet arrived on */
	uint8_t addrif;		/* index of interface that accepted it */
	uint8_t tos;		/* IP header TOS (IPv4) or TC (IPv6) value */
	uint8_t ttl;		/* IP header TTL (IPv4) or HL (IPv6) value */
	uint8_t flags;		/* PKTHF_ packet flags */
	uint8_t _unused;	/* the one byte that is still free */
};
/* Bits for the 'flags' field of struct pkthdr. */
#define PKTHF_IPV6	(1 << 0)	/* the packet has an IPv6 header */
#define PKTHF_MCAST	(1 << 1)	/* destination is a multicast address */
#define PKTHF_BCAST	(1 << 2)	/* destination is a broadcast address */
42 struct pktaddr4 {
43 ip4_addr_t srcaddr;
44 ip4_addr_t dstaddr;
47 struct pktaddr6 {
48 ip6_addr_p_t srcaddr;
49 ip6_addr_p_t dstaddr;
53 * Create a packet socket. Relay parameters and return values to and from the
54 * IP module's socket creation function. This function must not allocate any
55 * resources in any form, as socket creation may still fail later, in which
56 * case no destruction function is called.
58 int
59 pktsock_socket(struct pktsock * pkt, int domain, size_t sndbuf, size_t rcvbuf,
60 struct sock ** sockp)
63 pkt->pkt_rcvhead = NULL;
64 pkt->pkt_rcvtailp = &pkt->pkt_rcvhead;
65 pkt->pkt_rcvlen = 0;
67 mcast_reset(&pkt->pkt_mcast);
69 memset(&pkt->pkt_srcaddr, 0, sizeof(pkt->pkt_srcaddr));
70 pkt->pkt_ifindex = 0;
73 * Any PKTF_ type flags should be initialized on the socket only after
74 * the following call, as this call will clear the flags field. For
75 * now, no PKTF_ flags need to be set by default, though.
77 return ipsock_socket(&pkt->pkt_ipsock, domain, sndbuf, rcvbuf, sockp);
81 * Return TRUE if the given packet can and should be received on the given
82 * socket, or FALSE if there is a reason not to receive the packet.
84 static int
85 pktsock_may_recv(struct pktsock * pkt, struct pbuf * pbuf)
89 * By policy, multicast packets should not be received on sockets of
90 * which the owning application is not multicast aware.
92 if (ip_addr_ismulticast(ip_current_dest_addr()) &&
93 !(ipsock_get_flag(&pkt->pkt_ipsock, PKTF_MCAWARE)))
94 return FALSE;
97 * Due to fragment reassembly, we might end up with packets that take
98 * up more buffer space than their byte size, even after rounding up
99 * the latter. The user probably does not want packets to get dropped
100 * for that reason, e.g. when they set a 64K limit and the packet ends
101 * up being estimated as 65K and dropped. So, we test against
102 * 'pbuf->tot_len' rather than the rounded-up packet size. However,
103 * 'pkt->pkt_rcvlen' itself is increased by the rounded-up packet size
104 * when enqueuing the packet, so that we still count the memory
105 * consumption (generally) conservatively, which is what we want.
107 return (pkt->pkt_rcvlen + pbuf->tot_len <=
108 ipsock_get_rcvbuf(&pkt->pkt_ipsock));
112 * Check whether the given packet can and should be received on the given
113 * socket. If so, return the amount of space for ancillary information that
114 * will be necessary for the packet. If not, return a negative value.
117 pktsock_test_input(struct pktsock * pkt, struct pbuf * pbuf)
121 * This check will be done again in pktsock_input(), but this function
122 * is called for raw packets only (not for UDP packets) and, if this
123 * (cheap) check fails, we can avoid a (rather expensive) packet copy.
125 if (!pktsock_may_recv(pkt, pbuf))
126 return -1;
128 if (ip_current_is_v6())
129 return (int)(sizeof(struct pktaddr6) + sizeof(struct pkthdr));
130 else
131 return (int)(sizeof(struct pktaddr4) + sizeof(struct pkthdr));
135 * A packet has arrived on a packet socket. We own the given packet buffer,
136 * and so we must free it if we do not want to keep it.
138 void
139 pktsock_input(struct pktsock * pkt, struct pbuf * pbuf,
140 const ip_addr_t * srcaddr, uint16_t port)
142 struct pktaddr4 pktaddr4;
143 struct pktaddr6 pktaddr6;
144 struct pkthdr pkthdr;
145 void *pktaddr;
146 struct ifdev *ifdev;
147 size_t pktaddrlen;
150 * We are going to mess with the packet's header and contents, so we
151 * must be the exclusive owner of the packet. For UDP packets, lwIP
152 * must have made a copy for us in case of non-exclusive delivery
153 * (e.g., multicast packets). For raw packets, we have made a copy of
154 * the packet ourselves just before the call to this function.
156 if (pbuf->ref != 1)
157 panic("input packet has multiple references!");
159 /* If the packet should not be received on this socket, drop it. */
160 if (!pktsock_may_recv(pkt, pbuf)) {
161 pbuf_free(pbuf);
163 return;
167 * Enqueue the packet. Overwrite the leading IP header with packet
168 * information that is used at the time of receipt by userland. The
169 * data structures are such that the information always fits in what
170 * was the IP header. The reference count check earlier ensures that
171 * we never overwrite part of a packet that is still in use elsewhere.
173 if (ip_current_is_v6()) {
174 assert(IP_IS_V6(srcaddr));
175 assert(ip6_current_dest_addr() != NULL);
177 ip6_addr_copy_to_packed(pktaddr6.srcaddr, *ip_2_ip6(srcaddr));
178 ip6_addr_copy_to_packed(pktaddr6.dstaddr,
179 *ip6_current_dest_addr());
180 pktaddr = &pktaddr6;
181 pktaddrlen = sizeof(pktaddr6);
183 assert(pktaddrlen + sizeof(pkthdr) <= IP6_HLEN);
185 pkthdr.tos = IP6H_TC(ip6_current_header());
186 pkthdr.ttl = IP6H_HOPLIM(ip6_current_header());
187 pkthdr.flags = PKTHF_IPV6;
188 } else {
189 assert(IP_IS_V4(srcaddr));
190 assert(ip4_current_dest_addr() != NULL);
192 memcpy(&pktaddr4.srcaddr, ip_2_ip4(srcaddr),
193 sizeof(pktaddr4.srcaddr));
194 memcpy(&pktaddr4.dstaddr, ip4_current_dest_addr(),
195 sizeof(pktaddr4.srcaddr));
196 pktaddr = &pktaddr4;
197 pktaddrlen = sizeof(pktaddr4);
199 assert(pktaddrlen + sizeof(pkthdr) <= IP_HLEN);
201 pkthdr.tos = IPH_TOS(ip4_current_header());
202 pkthdr.ttl = IPH_TTL(ip4_current_header());
203 pkthdr.flags = 0;
207 * Save both the interface on which the packet was received (for
208 * PKTINFO) and the interface that owns the destination address of the
209 * packet (for the source address's zone ID).
211 assert(ip_current_input_netif() != NULL);
212 ifdev = netif_get_ifdev(ip_current_input_netif());
213 pkthdr.dstif = (uint16_t)ifdev_get_index(ifdev);
215 assert(ip_current_netif() != NULL);
216 ifdev = netif_get_ifdev(ip_current_netif());
217 pkthdr.addrif = (uint16_t)ifdev_get_index(ifdev);
219 if ((pbuf->flags & PBUF_FLAG_LLMCAST) ||
220 ip_addr_ismulticast(ip_current_dest_addr()))
221 pkthdr.flags |= PKTHF_MCAST;
222 else if ((pbuf->flags & PBUF_FLAG_LLBCAST) ||
223 ip_addr_isbroadcast(ip_current_dest_addr(), ip_current_netif()))
224 pkthdr.flags |= PKTHF_BCAST;
226 pkthdr.port = port;
228 util_pbuf_header(pbuf, sizeof(pkthdr));
230 memcpy(pbuf->payload, &pkthdr, sizeof(pkthdr));
232 util_pbuf_header(pbuf, pktaddrlen);
234 memcpy(pbuf->payload, pktaddr, pktaddrlen);
236 util_pbuf_header(pbuf, -(int)(sizeof(pkthdr) + pktaddrlen));
238 *pkt->pkt_rcvtailp = pbuf;
239 pkt->pkt_rcvtailp = pchain_end(pbuf);
240 pkt->pkt_rcvlen += pchain_size(pbuf);
242 sockevent_raise(ipsock_get_sock(&pkt->pkt_ipsock), SEV_RECV);
246 * Obtain interface and source address information for an outgoing packet. In
247 * particular, parse any IPV6_PKTINFO options provided as either sticky options
248 * on the socket 'pkt' or as ancillary options in the packet options 'pkto'.
249 * On success, return OK, with 'ifdevp' set to either the outgoing interface to
250 * use for the packet, or NULL if no outgoing interface was specified using
251 * either of the aforementioned options. If, and only if, 'ifdevp' is set to
252 * an actual interface (i.e., not NULL), then 'src_addrp' is filled with either
253 * a locally owned, validated, unicast address to use as source of the packet,
254 * or the unspecified ('any') address if no source address was specified using
255 * the options. On failure, return a negative error code.
258 pktsock_get_pktinfo(struct pktsock * pkt, struct pktopt * pkto,
259 struct ifdev ** ifdevp, ip_addr_t * src_addrp)
261 struct ifdev *ifdev, *ifdev2;
262 ip_addr_t ipaddr;
263 uint32_t ifindex;
264 int r;
266 /* We support only IPV6_PKTINFO. IP_PKTINFO is not supported. */
267 if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) {
268 *ifdevp = NULL;
269 return OK;
273 * TODO: we are spending a lot of effort on initializing and copying
274 * stuff around, even just to find out whether there is anything to do
275 * at all here. See if this can be optimized.
277 ip_addr_set_zero_ip6(&ipaddr);
280 * Ancillary data takes precedence over sticky options. We treat the
281 * source address and interface index fields as separate, overriding
282 * each earlier value only if non-zero. TODO: is that correct?
284 if (pkto->pkto_flags & PKTOF_PKTINFO) {
285 memcpy(ip_2_ip6(&ipaddr)->addr, &pkto->pkto_srcaddr.addr,
286 sizeof(ip_2_ip6(&ipaddr)->addr));
287 ifindex = pkto->pkto_ifindex;
288 } else
289 ifindex = 0;
291 if (ip6_addr_isany(ip_2_ip6(&ipaddr)))
292 memcpy(ip_2_ip6(&ipaddr)->addr, &pkt->pkt_srcaddr.addr,
293 sizeof(ip_2_ip6(&ipaddr)->addr));
294 if (ifindex == 0)
295 ifindex = pkt->pkt_ifindex;
297 /* If both fields are blank, there is nothing more to do. */
298 if (ip6_addr_isany(ip_2_ip6(&ipaddr)) && ifindex == 0) {
299 *ifdevp = NULL;
300 return OK;
303 /* If an interface index is specified, it must be valid. */
304 ifdev = NULL;
306 if (ifindex != 0 && (ifdev = ifdev_get_by_index(ifindex)) == NULL)
307 return ENXIO;
310 * Use the interface index to set a zone on the source address, if the
311 * source address has a scope.
313 if (ip6_addr_has_scope(ip_2_ip6(&ipaddr), IP6_UNKNOWN)) {
314 if (ifindex == 0)
315 return EADDRNOTAVAIL;
317 ip6_addr_set_zone(ip_2_ip6(&ipaddr), ifindex);
321 * We need to validate the given address just as thoroughly as an
322 * address given through bind(). If we don't, we could allow forged
323 * source addresses etcetera. To be sure: this call may change the
324 * address to an IPv4 type address if needed.
326 if ((r = ipsock_check_src_addr(pktsock_get_ipsock(pkt), &ipaddr,
327 FALSE /*allow_mcast*/, &ifdev2)) != OK)
328 return r;
330 if (ifdev2 != NULL) {
331 if (ifdev == NULL)
332 ifdev = ifdev2;
333 else if (ifdev != ifdev2)
334 return EADDRNOTAVAIL;
335 } else {
337 * There should be no cases where the (non-multicast) address
338 * successfully parsed, is not unspecified, and yet did not map
339 * to an interface. Eliminate the possibility anyway by
340 * throwing an error for this case. As a result, we are left
341 * with one of two cases:
343 * 1) ifdevp is not NULL, and src_addrp is unspecified;
344 * 2) ifdevp is not NULL, and src_addrp is a locally assigned
345 * (unicast) address.
347 * This is why we need not fill src_addrp when ifdevp is NULL.
349 if (!ip_addr_isany(&ipaddr))
350 return EADDRNOTAVAIL;
353 *ifdevp = ifdev;
354 if (ifdev != NULL)
355 *src_addrp = ipaddr;
356 return OK;
360 * Parse a chunk of user-provided control data, on an IPv4 socket provided as
361 * 'pkt'. The control chunk is given as 'cmsg', and the length of the data
362 * following the control header (possibly zero) is given as 'len'. On success,
363 * return OK, with any parsed options merged into the set of packet options
364 * 'pkto'. On failure, return a negative error code.
366 static int
367 pktsock_parse_ctl_v4(struct pktsock * pkt __unused, struct cmsghdr * cmsg,
368 socklen_t len, struct pktopt * pkto)
370 uint8_t byte;
371 int val;
373 if (cmsg->cmsg_level != IPPROTO_IP)
374 return EAFNOSUPPORT;
376 switch (cmsg->cmsg_type) {
377 case IP_TOS:
379 * Some userland code (bind's libisc in particular) supplies
380 * a single byte instead of a full integer for this option.
381 * We go out of our way to accept that format, too.
383 if (len != sizeof(val) && len != sizeof(byte))
384 return EINVAL;
386 if (len == sizeof(byte)) {
387 memcpy(&byte, CMSG_DATA(cmsg), sizeof(byte));
388 val = (int)byte;
389 } else
390 memcpy(&val, CMSG_DATA(cmsg), sizeof(val));
392 if (val < 0 || val > UINT8_MAX)
393 return EINVAL;
395 pkto->pkto_flags |= PKTOF_TOS;
396 pkto->pkto_tos = (uint8_t)val;
398 return OK;
400 case IP_TTL:
401 if (len != sizeof(val))
402 return EINVAL;
404 memcpy(&val, CMSG_DATA(cmsg), sizeof(val));
406 if (val < 0 || val > UINT8_MAX)
407 return EINVAL;
409 pkto->pkto_flags |= PKTOF_TTL;
410 pkto->pkto_ttl = (uint8_t)val;
412 return OK;
415 * Implementing IP_PKTINFO might be a bit harder than its IPV6_PKTINFO
416 * sibling, because it would require the use of zone IDs (interface
417 * indices) for IPv4, which is not supported yet.
421 return EINVAL;
425 * Parse a chunk of user-provided control data, on an IPv6 socket provided as
426 * 'pkt'. The control chunk is given as 'cmsg', and the length of the data
427 * following the control header (possibly zero) is given as 'len'. On success,
428 * return OK, with any parsed options merged into the set of packet options
429 * 'pkto'. On failure, return a negative error code.
431 static int
432 pktsock_parse_ctl_v6(struct pktsock * pkt, struct cmsghdr * cmsg,
433 socklen_t len, struct pktopt * pkto)
435 struct in6_pktinfo ipi6;
436 int val;
438 if (cmsg->cmsg_level != IPPROTO_IPV6)
439 return EAFNOSUPPORT;
441 switch (cmsg->cmsg_type) {
442 case IPV6_TCLASS:
443 if (len != sizeof(val))
444 return EINVAL;
446 memcpy(&val, CMSG_DATA(cmsg), sizeof(val));
448 if (val < -1 || val > UINT8_MAX)
449 return EINVAL;
451 if (val == -1)
452 val = 0;
454 pkto->pkto_flags |= PKTOF_TOS;
455 pkto->pkto_tos = (uint8_t)val;
457 return OK;
459 case IPV6_HOPLIMIT:
460 if (len != sizeof(val))
461 return EINVAL;
463 memcpy(&val, CMSG_DATA(cmsg), sizeof(val));
465 if (val < -1 || val > UINT8_MAX)
466 return EINVAL;
468 if (val == -1)
469 val = IP_DEFAULT_TTL;
471 pkto->pkto_flags |= PKTOF_TTL;
472 pkto->pkto_ttl = (uint8_t)val;
474 return OK;
476 case IPV6_PKTINFO:
477 if (len != sizeof(ipi6))
478 return EINVAL;
480 memcpy(&ipi6, CMSG_DATA(cmsg), sizeof(ipi6));
482 pkto->pkto_flags |= PKTOF_PKTINFO;
483 memcpy(&pkto->pkto_srcaddr.addr, &ipi6.ipi6_addr,
484 sizeof(pkto->pkto_srcaddr.addr));
485 pkto->pkto_ifindex = ipi6.ipi6_ifindex;
487 return OK;
489 case IPV6_USE_MIN_MTU:
490 if (len != sizeof(int))
491 return EINVAL;
493 memcpy(&val, CMSG_DATA(cmsg), sizeof(val));
495 if (val < -1 || val > 1)
496 return EINVAL;
498 /* TODO: not supported by lwIP, but needed by applications. */
499 return OK;
502 return EINVAL;
506 * Copy in and parse control data, as part of sending a packet on socket 'pkt'.
507 * The control data is accessible through 'ctl', with a user-provided length of
508 * 'ctl_len'. On success, return OK, with any parsed packet options stored in
509 * 'pkto'. On failure, return a negative error code.
512 pktsock_get_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl,
513 socklen_t ctl_len, struct pktopt * pkto)
515 struct msghdr msghdr;
516 struct cmsghdr *cmsg;
517 socklen_t left, len;
518 int r;
520 /* The default: no packet options are being overridden. */
521 assert(pkto->pkto_flags == 0);
523 /* If no control length is given, we are done here. */
524 if (ctl_len == 0)
525 return OK;
528 * For now, we put a rather aggressive limit on the size of the control
529 * data. We copy in and parse the whole thing in a single buffer.
531 if (ctl_len > sizeof(pktsock_ctlbuf)) {
532 printf("LWIP: too much control data given (%u bytes)\n",
533 ctl_len);
535 return ENOBUFS;
538 if ((r = sockdriver_copyin(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK)
539 return r;
541 memset(&msghdr, 0, sizeof(msghdr));
542 msghdr.msg_control = pktsock_ctlbuf;
543 msghdr.msg_controllen = ctl_len;
545 for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
546 cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
547 /* Check for bogus lengths. */
548 assert((socklen_t)((char *)cmsg - pktsock_ctlbuf) <= ctl_len);
549 left = ctl_len - (socklen_t)((char *)cmsg - pktsock_ctlbuf);
550 assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */
552 if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) {
553 printf("LWIP: malformed control data rejected\n");
555 return EINVAL;
558 len = cmsg->cmsg_len - CMSG_LEN(0);
560 if (ipsock_is_ipv6(&pkt->pkt_ipsock))
561 r = pktsock_parse_ctl_v6(pkt, cmsg, len, pkto);
562 else
563 r = pktsock_parse_ctl_v4(pkt, cmsg, len, pkto);
565 if (r != OK)
566 return r;
569 return OK;
573 * Copy in the packet data from the calling user process, and store it in the
574 * buffer 'pbuf' that must already have been allocated with the appropriate
575 * size.
578 pktsock_get_data(struct pktsock * pkt, const struct sockdriver_data * data,
579 size_t len, struct pbuf * pbuf)
583 return util_copy_data(data, len, 0, pbuf, 0, TRUE /*copy_in*/);
587 * Dequeue and free the head of the receive queue of a packet socket.
589 static void
590 pktsock_dequeue(struct pktsock * pkt)
592 struct pbuf *pbuf, **pnext;
593 size_t size;
595 pbuf = pkt->pkt_rcvhead;
596 assert(pbuf != NULL);
598 pnext = pchain_end(pbuf);
599 size = pchain_size(pbuf);
601 if ((pkt->pkt_rcvhead = *pnext) == NULL)
602 pkt->pkt_rcvtailp = &pkt->pkt_rcvhead;
604 assert(pkt->pkt_rcvlen >= size);
605 pkt->pkt_rcvlen -= size;
607 *pnext = NULL;
608 pbuf_free(pbuf);
612 * Perform preliminary checks on a receive request.
615 pktsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
616 int flags)
620 * We accept the same flags across all socket types in LWIP, and then
621 * simply ignore the ones we do not support for packet sockets.
623 if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
624 return EOPNOTSUPP;
626 return OK;
630 * Add a chunk of control data to the global control buffer, starting from
631 * offset 'off'. The chunk has the given level and type, and its data is given
632 * in the buffer 'ptr' with size 'len'. Return the (padded) size of the chunk
633 * that was generated as a result.
635 static size_t
636 pktsock_add_ctl(int level, int type, void * ptr, socklen_t len, size_t off)
638 struct cmsghdr cmsg;
639 size_t size;
641 size = CMSG_SPACE(len);
644 * The global control buffer must be large enough to store one chunk
645 * of each of the supported options. If this panic triggers, increase
646 * PKTSOCK_CTLBUF_SIZE by as much as needed.
648 if (off + size > sizeof(pktsock_ctlbuf))
649 panic("control buffer too small, increase "
650 "PKTSOCK_CTLBUF_SIZE");
652 memset(&cmsg, 0, sizeof(cmsg));
653 cmsg.cmsg_len = CMSG_LEN(len);
654 cmsg.cmsg_level = level;
655 cmsg.cmsg_type = type;
658 * Clear any padding space. This can be optimized, but in any case we
659 * must be careful not to copy out any bytes that have not been
660 * initialized at all.
662 memset(&pktsock_ctlbuf[off], 0, size);
664 memcpy(&pktsock_ctlbuf[off], &cmsg, sizeof(cmsg));
665 memcpy(CMSG_DATA((struct cmsghdr *)&pktsock_ctlbuf[off]), ptr, len);
667 return size;
671 * Generate and copy out control data, as part of delivering a packet from
672 * socket 'pkt' to userland. The control data buffer is given as 'ctl', with
673 * a user-given length of 'ctl_len' bytes. The packet's header information is
674 * provided as 'pkthdr', and its source and destination addresses as 'pktaddr',
675 * which maybe a pktaddr4 or pktaddr6 structure depending on the value of the
676 * PKTHF_IPV6 flag in the 'flags' field in 'pkthdr'. Note that we support
677 * dual-stack sockets, and as such it is possible that the socket is of domain
678 * AF_INET6 while the received packet is an IPv4 packet. On success, return
679 * the size of the control data copied out (possibly zero). If more control
680 * data were generated than copied out, also merge the MSG_CTRUNC flag into
681 * 'rflags'. On failure, return a negative error code.
683 static int
684 pktsock_put_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl,
685 socklen_t ctl_len, struct pkthdr * pkthdr, void * pktaddr,
686 int * rflags)
688 struct pktaddr6 *pktaddr6;
689 struct pktaddr4 *pktaddr4;
690 struct in_pktinfo ipi;
691 struct in6_pktinfo ipi6;
692 ip_addr_t ipaddr;
693 unsigned int flags;
694 uint8_t byte;
695 size_t off;
696 int r, val;
698 flags = ipsock_get_flags(&pkt->pkt_ipsock);
700 if (!(flags & (PKTF_RECVINFO | PKTF_RECVTOS | PKTF_RECVTTL)))
701 return 0;
704 * Important: all generated control chunks must fit in the global
705 * control buffer together. When adding more options here, ensure that
706 * the control buffer remains large enough to receive all options at
707 * once. See also the panic in pktsock_add_ctl().
709 off = 0;
712 * IPv6 sockets may receive IPv4 packets. The ancillary data is in the
713 * format corresponding to the socket, which means we may have to
714 * convert any IPv4 addresses from the packet to IPv4-mapped IPv6
715 * addresses for the ancillary data, just like the source address.
717 if (ipsock_is_ipv6(&pkt->pkt_ipsock)) {
718 if (flags & PKTF_RECVTTL) {
719 val = pkthdr->ttl;
721 off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_HOPLIMIT,
722 &val, sizeof(val), off);
725 if (flags & PKTF_RECVTOS) {
726 val = pkthdr->tos;
728 off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_TCLASS, &val,
729 sizeof(val), off);
732 if (flags & PKTF_RECVINFO) {
733 memset(&ipi6, 0, sizeof(ipi6));
735 if (pkthdr->flags & PKTHF_IPV6) {
736 pktaddr6 = (struct pktaddr6 *)pktaddr;
737 memcpy(&ipi6.ipi6_addr, &pktaddr6->dstaddr,
738 sizeof(ipi6.ipi6_addr));
739 } else {
740 pktaddr4 = (struct pktaddr4 *)pktaddr;
742 addr_make_v4mapped_v6(&ipaddr,
743 &pktaddr4->dstaddr);
745 memcpy(&ipi6.ipi6_addr,
746 ip_2_ip6(&ipaddr)->addr,
747 sizeof(ipi6.ipi6_addr));
749 ipi6.ipi6_ifindex = pkthdr->dstif;
751 off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_PKTINFO,
752 &ipi6, sizeof(ipi6), off);
754 } else {
755 if (flags & PKTF_RECVTTL) {
756 byte = pkthdr->ttl;
758 off += pktsock_add_ctl(IPPROTO_IP, IP_TTL, &byte,
759 sizeof(byte), off);
762 if (flags & PKTF_RECVINFO) {
763 assert(!(pkthdr->flags & PKTHF_IPV6));
764 pktaddr4 = (struct pktaddr4 *)pktaddr;
766 memset(&ipi, 0, sizeof(ipi));
767 memcpy(&ipi.ipi_addr, &pktaddr4->dstaddr,
768 sizeof(ipi.ipi_addr));
769 ipi.ipi_ifindex = pkthdr->dstif;
771 off += pktsock_add_ctl(IPPROTO_IP, IP_PKTINFO, &ipi,
772 sizeof(ipi), off);
776 assert(off > 0);
778 if (ctl_len >= off)
779 ctl_len = off;
780 else
781 *rflags |= MSG_CTRUNC;
783 if (ctl_len > 0 &&
784 (r = sockdriver_copyout(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK)
785 return r;
787 return ctl_len;
791 * Receive data on a packet socket.
794 pktsock_recv(struct sock * sock, const struct sockdriver_data * data,
795 size_t len, size_t * off, const struct sockdriver_data * ctl,
796 socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr,
797 socklen_t * addr_len, endpoint_t user_endpt __unused, int flags,
798 size_t min __unused, int * rflags)
800 struct pktsock *pkt = (struct pktsock *)sock;
801 struct pktaddr4 pktaddr4;
802 struct pktaddr6 pktaddr6;
803 struct pkthdr pkthdr;
804 void *pktaddr;
805 struct pbuf *pbuf;
806 ip_addr_t srcaddr;
807 int r;
809 if ((pbuf = pkt->pkt_rcvhead) == NULL)
810 return SUSPEND;
813 * Get the ancillary data for the packet. The format of the ancillary
814 * data depends on the received packet type, which may be different
815 * from the socket type.
817 util_pbuf_header(pbuf, sizeof(pkthdr));
819 memcpy(&pkthdr, pbuf->payload, sizeof(pkthdr));
821 if (pkthdr.flags & PKTHF_IPV6) {
822 util_pbuf_header(pbuf, sizeof(pktaddr6));
824 memcpy(&pktaddr6, pbuf->payload, sizeof(pktaddr6));
825 pktaddr = &pktaddr6;
827 ip_addr_copy_from_ip6_packed(srcaddr, pktaddr6.srcaddr);
828 if (ip6_addr_has_scope(ip_2_ip6(&srcaddr), IP6_UNICAST))
829 ip6_addr_set_zone(ip_2_ip6(&srcaddr), pkthdr.addrif);
831 util_pbuf_header(pbuf,
832 -(int)(sizeof(pkthdr) + sizeof(pktaddr6)));
833 } else {
834 util_pbuf_header(pbuf, sizeof(pktaddr4));
836 memcpy(&pktaddr4, pbuf->payload, sizeof(pktaddr4));
837 pktaddr = &pktaddr4;
839 ip_addr_copy_from_ip4(srcaddr, pktaddr4.srcaddr);
841 util_pbuf_header(pbuf,
842 -(int)(sizeof(pkthdr) + sizeof(pktaddr4)));
845 /* Copy out the packet data to the calling user process. */
846 if (len >= pbuf->tot_len)
847 len = pbuf->tot_len;
848 else
849 *rflags |= MSG_TRUNC;
851 r = util_copy_data(data, len, 0, pbuf, 0, FALSE /*copy_in*/);
853 if (r != OK)
854 return r;
856 /* Generate and copy out ancillary (control) data, if requested. */
857 if ((r = pktsock_put_ctl(pkt, ctl, ctl_len, &pkthdr, pktaddr,
858 rflags)) < 0)
859 return r;
861 /* Store the source IP address. */
862 ipsock_put_addr(&pkt->pkt_ipsock, addr, addr_len, &srcaddr,
863 pkthdr.port);
865 /* Set multicast or broadcast message flag, if applicable. */
866 if (pkthdr.flags & PKTHF_MCAST)
867 *rflags |= MSG_MCAST;
868 else if (pkthdr.flags & PKTHF_BCAST)
869 *rflags |= MSG_BCAST;
871 /* Discard the packet now, unless we were instructed to peek only. */
872 if (!(flags & MSG_PEEK))
873 pktsock_dequeue(pkt);
875 /* Return the received part of the packet length. */
876 *off = len;
877 *ctl_off = r;
878 return OK;
882 * Test whether data can be received on a packet socket, and if so, how many
883 * bytes of data.
886 pktsock_test_recv(struct sock * sock, size_t min __unused, size_t * size)
888 struct pktsock *pkt = (struct pktsock *)sock;
890 if (pkt->pkt_rcvhead == NULL)
891 return SUSPEND;
893 if (size != NULL)
894 *size = pkt->pkt_rcvhead->tot_len;
895 return OK;
899 * The caller has performed a multicast operation on the given socket. Thus,
900 * the caller is multicast aware. Remember this, because that means the socket
901 * may also receive traffic to multicast destinations.
903 void
904 pktsock_set_mcaware(struct pktsock * pkt)
907 ipsock_set_flag(&pkt->pkt_ipsock, PKTF_MCAWARE);
911 * Set socket options on a packet socket.
914 pktsock_setsockopt(struct pktsock * pkt, int level, int name,
915 const struct sockdriver_data * data, socklen_t len,
916 struct ipopts * ipopts)
918 struct ip_mreq imr;
919 struct ipv6_mreq ipv6mr;
920 struct in6_pktinfo ipi6;
921 ip_addr_t ipaddr, ifaddr;
922 struct ifdev *ifdev;
923 unsigned int flag;
924 uint32_t ifindex;
925 int r, val, has_scope;
927 switch (level) {
928 case IPPROTO_IP:
929 if (ipsock_is_ipv6(&pkt->pkt_ipsock))
930 break;
932 switch (name) {
933 case IP_ADD_MEMBERSHIP:
934 case IP_DROP_MEMBERSHIP:
935 pktsock_set_mcaware(pkt);
937 if ((r = sockdriver_copyin_opt(data, &imr, sizeof(imr),
938 len)) != OK)
939 return r;
941 ip_addr_set_ip4_u32(&ipaddr, imr.imr_multiaddr.s_addr);
942 ip_addr_set_ip4_u32(&ifaddr, imr.imr_interface.s_addr);
944 if (!ip_addr_isany(&ifaddr)) {
945 ifdev = ifaddr_map_by_addr(&ifaddr);
947 if (ifdev == NULL)
948 return EADDRNOTAVAIL;
949 } else
950 ifdev = NULL;
952 if (name == IP_ADD_MEMBERSHIP)
953 r = mcast_join(&pkt->pkt_mcast, &ipaddr,
954 ifdev);
955 else
956 r = mcast_leave(&pkt->pkt_mcast, &ipaddr,
957 ifdev);
959 return r;
961 case IP_RECVTTL:
962 case IP_RECVPKTINFO:
963 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
964 len)) != OK)
965 return r;
967 switch (name) {
968 case IP_RECVTTL: flag = PKTF_RECVTTL; break;
969 case IP_RECVPKTINFO: flag = PKTF_RECVINFO; break;
970 default: flag = 0; assert(0); break;
973 if (val)
974 ipsock_set_flag(&pkt->pkt_ipsock, flag);
975 else
976 ipsock_clear_flag(&pkt->pkt_ipsock, flag);
978 return OK;
981 break;
983 case IPPROTO_IPV6:
984 if (!ipsock_is_ipv6(&pkt->pkt_ipsock))
985 break;
987 switch (name) {
988 case IPV6_JOIN_GROUP:
989 case IPV6_LEAVE_GROUP:
990 pktsock_set_mcaware(pkt);
992 if ((r = sockdriver_copyin_opt(data, &ipv6mr,
993 sizeof(ipv6mr), len)) != OK)
994 return r;
996 ip_addr_set_zero_ip6(&ipaddr);
997 memcpy(ip_2_ip6(&ipaddr)->addr,
998 &ipv6mr.ipv6mr_multiaddr,
999 sizeof(ip_2_ip6(&ipaddr)->addr));
1002 * We currently do not support joining IPv4 multicast
1003 * groups on IPv6 sockets. The reason for this is that
1004 * this would require decisions on what to do if the
1005 * socket is set to V6ONLY later, as well as various
1006 * additional exceptions for a case that hopefully
1007 * doesn't occur in practice anyway.
1009 if (ip6_addr_isipv4mappedipv6(ip_2_ip6(&ipaddr)))
1010 return EADDRNOTAVAIL;
1012 has_scope = ip6_addr_has_scope(ip_2_ip6(&ipaddr),
1013 IP6_UNKNOWN);
1015 if ((ifindex = ipv6mr.ipv6mr_interface) != 0) {
1016 ifdev = ifdev_get_by_index(ifindex);
1018 if (ifdev == NULL)
1019 return ENXIO;
1021 if (has_scope)
1022 ip6_addr_set_zone(ip_2_ip6(&ipaddr),
1023 ifindex);
1024 } else {
1025 if (has_scope)
1026 return EADDRNOTAVAIL;
1028 ifdev = NULL;
1031 if (name == IPV6_JOIN_GROUP)
1032 r = mcast_join(&pkt->pkt_mcast, &ipaddr,
1033 ifdev);
1034 else
1035 r = mcast_leave(&pkt->pkt_mcast, &ipaddr,
1036 ifdev);
1038 return r;
1040 case IPV6_USE_MIN_MTU:
1041 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
1042 len)) != OK)
1043 return r;
1045 if (val < -1 || val > 1)
1046 return EINVAL;
1049 * lwIP does not support path MTU discovery, so do
1050 * nothing. TODO: see if this is actually good enough.
1052 return OK;
1054 case IPV6_PKTINFO:
1055 if ((r = sockdriver_copyin_opt(data, &ipi6,
1056 sizeof(ipi6), len)) != OK)
1057 return r;
1060 * Simply copy in what is given. The values will be
1061 * parsed only once a packet is sent, in
1062 * pktsock_get_pktinfo(). Otherwise, if we perform
1063 * checks here, they may be outdated by the time the
1064 * values are actually used.
1066 memcpy(&pkt->pkt_srcaddr.addr, &ipi6.ipi6_addr,
1067 sizeof(pkt->pkt_srcaddr.addr));
1068 pkt->pkt_ifindex = ipi6.ipi6_ifindex;
1070 return OK;
1072 case IPV6_RECVPKTINFO:
1073 case IPV6_RECVHOPLIMIT:
1074 case IPV6_RECVTCLASS:
1075 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
1076 len)) != OK)
1077 return r;
1079 switch (name) {
1080 case IPV6_RECVPKTINFO: flag = PKTF_RECVINFO; break;
1081 case IPV6_RECVHOPLIMIT: flag = PKTF_RECVTTL; break;
1082 case IPV6_RECVTCLASS: flag = PKTF_RECVTOS; break;
1083 default: flag = 0; assert(0); break;
1086 if (val)
1087 ipsock_set_flag(&pkt->pkt_ipsock, flag);
1088 else
1089 ipsock_clear_flag(&pkt->pkt_ipsock, flag);
1091 return OK;
1094 break;
1097 return ipsock_setsockopt(&pkt->pkt_ipsock, level, name, data, len,
1098 ipopts);
1102 * Retrieve socket options on a packet socket.
1105 pktsock_getsockopt(struct pktsock * pkt, int level, int name,
1106 const struct sockdriver_data * data, socklen_t * len,
1107 struct ipopts * ipopts)
1109 struct in6_pktinfo ipi6;
1110 unsigned int flag;
1111 int val;
1113 switch (level) {
1114 case IPPROTO_IP:
1115 if (ipsock_is_ipv6(&pkt->pkt_ipsock))
1116 break;
1118 switch (name) {
1119 case IP_RECVTTL:
1120 case IP_RECVPKTINFO:
1121 switch (name) {
1122 case IP_RECVTTL: flag = PKTF_RECVTTL; break;
1123 case IP_RECVPKTINFO: flag = PKTF_RECVINFO; break;
1124 default: flag = 0; assert(0); break;
1127 val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag));
1129 return sockdriver_copyout_opt(data, &val, sizeof(val),
1130 len);
1133 break;
1135 case IPPROTO_IPV6:
1136 if (!ipsock_is_ipv6(&pkt->pkt_ipsock))
1137 break;
1139 switch (name) {
1140 case IPV6_USE_MIN_MTU:
1142 * TODO: sort out exactly what lwIP actually supports
1143 * in the way of path MTU discovery. Value 1 means
1144 * that path MTU discovery is disabled and packets are
1145 * sent at the minimum MTU (RFC 3542).
1147 val = 1;
1149 return sockdriver_copyout_opt(data, &val, sizeof(val),
1150 len);
1152 case IPV6_PKTINFO:
1153 memset(&ipi6, 0, sizeof(ipi6));
1156 * Simply copy out whatever was given before. These
1157 * fields are initialized to zero on socket creation.
1159 memcpy(&ipi6.ipi6_addr, &pkt->pkt_srcaddr.addr,
1160 sizeof(ipi6.ipi6_addr));
1161 ipi6.ipi6_ifindex = pkt->pkt_ifindex;
1163 return sockdriver_copyout_opt(data, &ipi6,
1164 sizeof(ipi6), len);
1166 case IPV6_RECVPKTINFO:
1167 case IPV6_RECVHOPLIMIT:
1168 case IPV6_RECVTCLASS:
1169 switch (name) {
1170 case IPV6_RECVPKTINFO: flag = PKTF_RECVINFO; break;
1171 case IPV6_RECVHOPLIMIT: flag = PKTF_RECVTTL; break;
1172 case IPV6_RECVTCLASS: flag = PKTF_RECVTOS; break;
1173 default: flag = 0; assert(0); break;
1176 val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag));
1178 return sockdriver_copyout_opt(data, &val, sizeof(val),
1179 len);
1182 break;
1185 return ipsock_getsockopt(&pkt->pkt_ipsock, level, name, data, len,
1186 ipopts);
1190 * Drain the receive queue of a packet socket.
1192 static void
1193 pktsock_drain(struct pktsock * pkt)
1196 while (pkt->pkt_rcvhead != NULL)
1197 pktsock_dequeue(pkt);
1199 assert(pkt->pkt_rcvlen == 0);
1200 assert(pkt->pkt_rcvtailp == &pkt->pkt_rcvhead);
1204 * Shut down a packet socket for reading and/or writing.
1206 void
1207 pktsock_shutdown(struct pktsock * pkt, unsigned int mask)
1210 if (mask & SFL_SHUT_RD)
1211 pktsock_drain(pkt);
1215 * Close a packet socket.
1217 void
1218 pktsock_close(struct pktsock * pkt)
1221 pktsock_drain(pkt);
1223 mcast_leave_all(&pkt->pkt_mcast);
1227 * Return the rounded-up number of bytes in the packet socket's receive queue,
1228 * for sysctl(7). NetBSD returns the used portion of each buffer, but that
1229 * would be quite some extra effort for us (TODO).
1231 size_t
1232 pktsock_get_recvlen(struct pktsock * pkt)
1235 return pkt->pkt_rcvlen;