net/ipv6/ip6_output.c (Linux 5.1.15)

/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

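/* Final transmit step for locally generated and forwarded packets:
 * loop back multicast copies where required, honour lightweight-tunnel
 * redirects, then resolve the route's nexthop to a neighbour entry and
 * hand the skb to neigh_output() for layer-2 transmission.
 */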
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

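/* Run the cgroup BPF egress hook, re-route packets that picked up an
 * xfrm policy after SNAT, and fragment anything that no longer fits
 * the path MTU before handing off to ip6_finish_output2().
 */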
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

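/* Output entry point installed as dst->output for IPv6 routes: tag the
 * skb, drop it if IPv6 is administratively disabled on the device, and
 * traverse NF_INET_POST_ROUTING, skipping the hook for packets already
 * re-routed (IP6SKB_REROUTED).
 */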
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				  ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

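/* Deliver a Router Alert packet to every raw socket registered for the
 * matching alert value (cloning the skb for all but the last match).
 * Returns 1 if a socket consumed the skb, 0 if the caller still owns it.
 */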
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

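/* Decide what to do with a packet whose destination we proxy:
 * 1 means hand it to local input (unicast neighbour discovery),
 * -1 means link failure was signalled (link-local destination),
 * 0 means keep forwarding it.
 */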
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

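/* The forwarding path proper: validate the packet (hop limit, source
 * address class, MTU), emit redirects and ICMPv6 errors where required,
 * decrement hop_limit only after the skb has been made writable, and
 * pass the result through the NF_INET_FORWARD hook.
 */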
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any warranty that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

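/* Fragment @skb to fit the path MTU. The fast path reuses an existing
 * frag_list when its geometry already matches (each piece a multiple of
 * eight bytes with enough headroom for the fragment header); otherwise
 * the slow path copies the payload into freshly allocated fragments.
 */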
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

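/* Validate a dst cached on the socket against the flow about to be sent.
 * Returns the dst if it is still usable, otherwise releases it and
 * returns NULL so the caller performs a fresh route lookup.
 */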
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif

	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

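/* Recompute the effective MTU and the 8-byte-aligned fragment limit
 * once the first corked fragment exists: only the first fragment has
 * to reserve the dst's header_len (e.g. extra tunnel headroom).
 */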
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

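/* Core of the corked-send machinery: append @length bytes obtained via
 * @getfrag to @queue, growing the tail skb or allocating new ones so
 * that each queued skb maps onto one final fragment. Enforces RFC 7112
 * (entire header chain in the first fragment), selects
 * CHECKSUM_PARTIAL when checksum offload is safe, and supports
 * MSG_ZEROCOPY pages.
 */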
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

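/* Typical datagram usage (sketch only, abridged from what callers such
 * as udpv6_sendmsg() do; route lookup, error handling and locking
 * omitted, and "corking" stands in for the caller's cork test):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, sizeof(struct udphdr),
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corking)
 *		err = ip6_push_pending_frames(sk);
 */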
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

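/* Collapse the queued per-fragment skbs into one skb chained via
 * frag_list, push extension headers and the IPv6 header, fill in flow
 * label, hop limit and addresses from the cork, then release the cork
 * state. The result is ready for ip6_send_skb().
 */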
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}